# _regex_core.py -- part of the "regex" Python package (agriconnect/regex),
# version 2019.6.8.

#
# Secret Labs' Regular Expression Engine core module
#
# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
#
# This version of the SRE library can be redistributed under CNRI's
# Python 1.6 license.  For any other use, please contact Secret Labs
# AB (info@pythonware.com).
#
# Portions of this engine have been developed in cooperation with
# CNRI.  Hewlett-Packard provided funding for 1.6 integration and
# other compatibility work.
#
# 2010-01-16 mrab Python front-end re-written and extended

import string
import sys
import unicodedata
from collections import defaultdict

import regex._regex as _regex

# The public names exported by a star-import of this module: the short and
# long spellings of every flag, plus the exception class and the scanner.
__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
  "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
  "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
  "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
  "Scanner"]

# The regex exception.
class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern, or None
        pos: The position in the pattern where compilation failed, or None
        lineno: The 1-based line number where compilation failed, or None if
          pattern or pos is None
        colno: The 1-based column number where compilation failed, or None if
          pattern or pos is None
    """

    def __init__(self, message, pattern=None, pos=None):
        self.msg = message
        self.pattern = pattern
        self.pos = pos
        # Always define these so that handlers can read them unconditionally
        # (the stdlib re.error behaves the same way).
        self.lineno = self.colno = None

        if pattern is not None and pos is not None:
            # The pattern may be either text or bytes; search for the
            # matching kind of newline.
            newline = '\n' if isinstance(pattern, str) else b'\n'

            # Lines are counted from 1.  The column is the distance from the
            # preceding newline (rfind gives -1 on the first line, which
            # makes the column 1-based there too).
            self.lineno = pattern.count(newline, 0, pos) + 1
            self.colno = pos - pattern.rfind(newline, 0, pos)

            message = "{} at position {}".format(message, pos)

            # Line/column details are only worth showing for multi-line
            # patterns.
            if newline in pattern:
                message += " (line {}, column {})".format(self.lineno,
                  self.colno)

        Exception.__init__(self, message)

class _UnscopedFlagSet(Exception):
    """Raised when a positional flag has been turned on in the old behaviour."""

class ParseError(Exception):
    """Raised when parsing fails and we want to try something else."""

class _FirstSetError(Exception):
    """Raised when there isn't a valid first set."""

# Flags, listed in bit order.  Every flag occupies exactly one bit and has
# both a short and a long spelling.
T = TEMPLATE = 1 << 0       # Template (present because re module has it).
I = IGNORECASE = 1 << 1     # Ignore case.
L = LOCALE = 1 << 2         # Assume current 8-bit locale.
M = MULTILINE = 1 << 3      # Make anchors look for newline.
S = DOTALL = 1 << 4         # Make dot match newline.
U = UNICODE = 1 << 5        # Assume Unicode locale.
X = VERBOSE = 1 << 6        # Ignore whitespace and comments.
A = ASCII = 1 << 7          # Assume ASCII locale.
V1 = VERSION1 = 1 << 8      # New enhanced behaviour.
D = DEBUG = 1 << 9          # Print parsed pattern.
R = REVERSE = 1 << 10       # Search backwards.
W = WORD = 1 << 11          # Default Unicode word breaks.
B = BESTMATCH = 1 << 12     # Best fuzzy match.
V0 = VERSION0 = 1 << 13     # Old legacy behaviour.
F = FULLCASE = 1 << 14      # Unicode full case-folding.
E = ENHANCEMATCH = 1 << 15  # Attempt to improve the fit after finding the
                            # first fuzzy match.
P = POSIX = 1 << 16         # POSIX-style matching (leftmost longest).

DEFAULT_VERSION = VERSION1

_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

# The default flags for the various versions.
DEFAULT_FLAGS = {
    VERSION0: 0,
    VERSION1: FULLCASE,
}

# The masks for the flags.  GLOBAL_FLAGS collects the encoding, version and
# matching-mode flags; SCOPED_FLAGS collects the remaining pattern flags.
GLOBAL_FLAGS = (
    _ALL_ENCODINGS
    | _ALL_VERSIONS
    | BESTMATCH
    | DEBUG
    | ENHANCEMATCH
    | POSIX
    | REVERSE
)
SCOPED_FLAGS = (
    FULLCASE
    | IGNORECASE
    | MULTILINE
    | DOTALL
    | WORD
    | VERBOSE
)

# Character classes used when scanning a pattern.
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA.union(DIGITS)
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# The empty string is a member as well as the listed characters.
SPECIAL_CHARS = frozenset([*"()|?*+{^$.[\\#", ""])
NAMED_CHAR_PART = ALNUM.union(" -")
PROPERTY_NAME_PART = ALNUM.union(" &_-.")
# The two-character set operators.
SET_OPS = ("||", "~~", "&&", "--")

# The width of the code words inside the regex engine, as reported by the
# engine itself.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = 8 * BYTES_PER_CODE

# The repeat count which represents infinity: the largest value that fits in
# one code word (all bits set).
UNLIMITED = 2 ** BITS_PER_CODE - 1

# The regular expression flags, keyed by the name used for each one.
REGEX_FLAGS = {
    "a": ASCII,
    "b": BESTMATCH,
    "e": ENHANCEMATCH,
    "f": FULLCASE,
    "i": IGNORECASE,
    "L": LOCALE,
    "m": MULTILINE,
    "p": POSIX,
    "r": REVERSE,
    "s": DOTALL,
    "u": UNICODE,
    "V0": VERSION0,
    "V1": VERSION1,
    "w": WORD,
    "x": VERBOSE,
}

# The case flags.
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE
CASE_FLAGS = FULLCASE | IGNORECASE

FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

# Maps each combination of the case flags to the combination actually used:
# FULLCASE without IGNORECASE maps to no case flags at all.
CASE_FLAGS_COMBINATIONS = {
    NOCASE: NOCASE,
    FULLCASE: NOCASE,
    IGNORECASE: IGNORECASE,
    FULLIGNORECASE: FULLIGNORECASE,
}

# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {
    "x": 2,
    "u": 4,
    "U": 8,
}

# The names of the opcodes, one per line.  A name's position in this list is
# the numeric value it is given below.
# NOTE(review): presumably this numbering has to agree with the one used by
# the _regex extension module -- confirm before reordering or inserting.
OPCODES = """
FAILURE
SUCCESS
ANY
ANY_ALL
ANY_ALL_REV
ANY_REV
ANY_U
ANY_U_REV
ATOMIC
BOUNDARY
BRANCH
CALL_REF
CHARACTER
CHARACTER_IGN
CHARACTER_IGN_REV
CHARACTER_REV
CONDITIONAL
DEFAULT_BOUNDARY
DEFAULT_END_OF_WORD
DEFAULT_START_OF_WORD
END
END_OF_LINE
END_OF_LINE_U
END_OF_STRING
END_OF_STRING_LINE
END_OF_STRING_LINE_U
END_OF_WORD
FUZZY
GRAPHEME_BOUNDARY
GREEDY_REPEAT
GROUP
GROUP_CALL
GROUP_EXISTS
KEEP
LAZY_REPEAT
LOOKAROUND
NEXT
PROPERTY
PROPERTY_IGN
PROPERTY_IGN_REV
PROPERTY_REV
PRUNE
RANGE
RANGE_IGN
RANGE_IGN_REV
RANGE_REV
REF_GROUP
REF_GROUP_FLD
REF_GROUP_FLD_REV
REF_GROUP_IGN
REF_GROUP_IGN_REV
REF_GROUP_REV
SEARCH_ANCHOR
SET_DIFF
SET_DIFF_IGN
SET_DIFF_IGN_REV
SET_DIFF_REV
SET_INTER
SET_INTER_IGN
SET_INTER_IGN_REV
SET_INTER_REV
SET_SYM_DIFF
SET_SYM_DIFF_IGN
SET_SYM_DIFF_IGN_REV
SET_SYM_DIFF_REV
SET_UNION
SET_UNION_IGN
SET_UNION_IGN_REV
SET_UNION_REV
SKIP
START_OF_LINE
START_OF_LINE_U
START_OF_STRING
START_OF_WORD
STRING
STRING_FLD
STRING_FLD_REV
STRING_IGN
STRING_IGN_REV
STRING_REV
STRING_SET
STRING_SET_FLD
STRING_SET_FLD_REV
STRING_SET_IGN
STRING_SET_IGN_REV
STRING_SET_REV
"""

# Define the opcodes in a namespace.
class Namespace:
    """Bare attribute holder; it has no behaviour of its own."""

# OP.<NAME> is the index of <NAME> in OPCODES, counting from zero.
OP = Namespace()
for i, op in enumerate(OPCODES.split()):
    vars(OP)[op] = i

def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
    """Evict a random share of the cache and resynchronise its side tables.

    Args:
        cache_dict: The cache dictionary to modify.  Its keys are 6-tuples
            (pattern, pattern_type, flags, args, default_version, locale).
        args_dict: The dictionary of named list args used by patterns.  It is
            emptied and refilled from the keys left in cache_dict.
        locale_sensitive: Dictionary keyed by (pattern_type, pattern).  Only
            the entries belonging to patterns still in cache_dict are kept.
        max_length: Maximum # of entries in cache_dict before it is shrunk.
        divisor: Cache will shrink to max_length - 1/divisor*max_length items.
    """
    keys_snapshot = tuple(cache_dict.keys())
    excess = len(keys_snapshot) - max_length
    if excess < 0:
        # Already under the limit.  This is not the normal case, but another
        # thread may have shrunk the cache in the meantime.
        return

    eviction_count = max_length // divisor + excess

    # Imported here rather than at module level to avoid a circular
    # dependency.
    import random
    if not hasattr(random, 'sample'):
        # random is only partially initialised while the import cycle
        #  re->random->warnings->tokenize->string->re
        # is being resolved; leave the cache alone for now.
        return

    # Victims are picked at random rather than with cache_dict.popitem(),
    # which could keep penalising the same regular expression because of its
    # internal hash value.  Random choice spreads the cache misses around.
    for victim in random.sample(keys_snapshot, eviction_count):
        # A key may already have been removed by another thread.
        cache_dict.pop(victim, None)

    # Rebuild the arguments and locale-sensitivity dictionaries from the keys
    # that survived.
    args_dict.clear()
    still_sensitive = {}
    for key in tuple(cache_dict):
        pattern, pattern_type, flags, args, default_version, locale = key
        args_dict[pattern, pattern_type, flags, default_version, locale] = args

        lookup = (pattern_type, pattern)
        try:
            value = locale_sensitive[lookup]
        except KeyError:
            continue
        still_sensitive[lookup] = value

    locale_sensitive.clear()
    locale_sensitive.update(still_sensitive)

def _fold_case(info, string):
    """Folds the case of a string.

    The folding is done by the engine using info.flags.  When those flags
    name no encoding (none of ASCII, LOCALE or UNICODE), info.guess_encoding
    is added to them first.
    """
    flags = info.flags
    if not flags & _ALL_ENCODINGS:
        flags = flags | info.guess_encoding

    return _regex.fold_case(flags, string)

def is_cased_i(info, char):
    """Checks whether a character is cased, using the flags held by info.

    A character counts as cased when the engine reports more than one case
    for it.
    """
    all_cases = _regex.get_all_cases(info.flags, char)
    return len(all_cases) > 1

def is_cased_f(flags, char):
    """Checks whether a character is cased, using the given flags.

    A character counts as cased when the engine reports more than one case
    for it.
    """
    all_cases = _regex.get_all_cases(flags, char)
    return len(all_cases) > 1

def _compile_firstset(info, fs):
    """Compiles the firstset for the pattern.

    Returns the compiled form of the checked firstset, or an empty list when
    the check yields nothing usable.
    """
    reverse = bool(info.flags & REVERSE)
    checked = _check_firstset(info, reverse, fs)

    return checked.compile(reverse) if checked else []

def _check_firstset(info, reverse, fs):
    """Checks the firstset for the pattern.

    Returns a single optimised set built from the members of fs, or None when
    no firstset is built: fs is empty, fs contains None, fs contains a
    negative Character, or the members' combined case flags amount to full
    case-insensitive matching.
    """
    if not fs:
        return None
    if None in fs:
        return None

    # Gather the members with their case flags stripped, remembering the
    # union of the case flags that were removed.
    combined_case = NOCASE
    stripped = set()
    for item in fs:
        if isinstance(item, Character) and not item.positive:
            return None

        combined_case |= item.case_flags
        stripped.add(item.with_flags(case_flags=NOCASE))

    # For simplicity, no firstset is built when full case-folding is in
    # effect.
    if combined_case == (FULLCASE | IGNORECASE):
        return None

    # Build the firstset: one zero-width union carrying the combined case
    # flags (minus FULLCASE), then optimise it.
    union = SetUnion(info, list(stripped),
      case_flags=combined_case & ~FULLCASE, zerowidth=True)

    return union.optimise(info, reverse, in_set=True)
# NOTE: this listing of _regex_core.py (agriconnect/regex, version 2019.6.8)
# breaks off here; the remainder of the module is not included.