cpython/Lib/sre_compile.py

#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

import _sre
import sre_parse
from sre_constants import *
from _sre import MAXREPEAT

# The Python-side constants and the C engine must carry the same magic
# number; anything else means the two halves do not belong together.
assert _sre.MAGIC == MAGIC, "SRE module mismatch"

# Upper bound used when emitting width values into the code list; it
# depends on the engine's CODESIZE (presumably the code word size in bytes).
MAXCODE = 65535 if _sre.CODESIZE == 2 else 0xFFFFFFFF

# Opcode families that _compile() dispatches on with a membership test.
_LITERAL_CODES = {LITERAL, NOT_LITERAL}
_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}

# Groups of lowercase code points that share one uppercase form.
_equivalences = (
    # U+0069 LATIN SMALL LETTER I, U+0131 LATIN SMALL LETTER DOTLESS I
    (0x69, 0x131),
    # U+0073 LATIN SMALL LETTER S, U+017F LATIN SMALL LETTER LONG S
    (0x73, 0x17f),
    # U+00B5 MICRO SIGN, U+03BC GREEK SMALL LETTER MU
    (0xb5, 0x3bc),
    # U+0345 COMBINING GREEK YPOGEGRAMMENI, U+03B9 GREEK SMALL LETTER IOTA,
    # U+1FBE GREEK PROSGEGRAMMENI
    (0x345, 0x3b9, 0x1fbe),
    # U+0390 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS,
    # U+1FD3 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
    (0x390, 0x1fd3),
    # U+03B0 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS,
    # U+1FE3 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
    (0x3b0, 0x1fe3),
    # U+03B2 GREEK SMALL LETTER BETA, U+03D0 GREEK BETA SYMBOL
    (0x3b2, 0x3d0),
    # U+03B5 GREEK SMALL LETTER EPSILON, U+03F5 GREEK LUNATE EPSILON SYMBOL
    (0x3b5, 0x3f5),
    # U+03B8 GREEK SMALL LETTER THETA, U+03D1 GREEK THETA SYMBOL
    (0x3b8, 0x3d1),
    # U+03BA GREEK SMALL LETTER KAPPA, U+03F0 GREEK KAPPA SYMBOL
    (0x3ba, 0x3f0),
    # U+03C0 GREEK SMALL LETTER PI, U+03D6 GREEK PI SYMBOL
    (0x3c0, 0x3d6),
    # U+03C1 GREEK SMALL LETTER RHO, U+03F1 GREEK RHO SYMBOL
    (0x3c1, 0x3f1),
    # U+03C2 GREEK SMALL LETTER FINAL SIGMA, U+03C3 GREEK SMALL LETTER SIGMA
    (0x3c2, 0x3c3),
    # U+03C6 GREEK SMALL LETTER PHI, U+03D5 GREEK PHI SYMBOL
    (0x3c6, 0x3d5),
    # U+1E61 LATIN SMALL LETTER S WITH DOT ABOVE,
    # U+1E9B LATIN SMALL LETTER LONG S WITH DOT ABOVE
    (0x1e61, 0x1e9b),
    # U+FB05 LATIN SMALL LIGATURE LONG S T, U+FB06 LATIN SMALL LIGATURE ST
    (0xfb05, 0xfb06),
)

# For every code point listed above: the other members of its group, in
# the order they appear in the group.
_ignorecase_fixes = {
    code: tuple(other for other in group if other != code)
    for group in _equivalences
    for code in group
}

def _compile(code, pattern, flags):
    """Append the engine code for a (sub)pattern to *code*.

    code    -- list of integer code words; it is extended in place.
    pattern -- iterable of (op, av) pairs (e.g. the data of a parsed
               pattern, as passed by _code()).
    flags   -- bitmask of SRE_FLAG_* values that selects opcode variants.

    The function recurses for nested items (repeat bodies, groups,
    assertions, branches, conditionals).  Variable-length constructs are
    emitted as "opcode, skip, body": a placeholder 0 is appended for the
    skip word and overwritten once the body length is known.

    Raises error for a repeat when SRE_FLAG_TEMPLATE is set and for a
    look-behind whose minimum and maximum widths differ; raises
    ValueError for an operator that none of the branches handles.
    """
    # internal: compile a (sub)pattern
    # local aliases for names used repeatedly inside the loop
    emit = code.append
    _len = len
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    # The equivalence table is consulted only for case-insensitive
    # Unicode matching without the locale flag.
    if (flags & SRE_FLAG_IGNORECASE and
            not (flags & SRE_FLAG_LOCALE) and
            flags & SRE_FLAG_UNICODE):
        fixes = _ignorecase_fixes
    else:
        fixes = None
    for op, av in pattern:
        if op in LITERAL_CODES:
            if flags & SRE_FLAG_IGNORECASE:
                lo = _sre.getlower(av, flags)
                if fixes and lo in fixes:
                    # The lowered character has equivalents in the table:
                    # emit a small IN_IGNORE set holding it and all of
                    # them, terminated by FAILURE.
                    emit(OPCODES[IN_IGNORE])
                    skip = _len(code); emit(0)
                    if op is NOT_LITERAL:
                        emit(OPCODES[NEGATE])
                    for k in (lo,) + fixes[lo]:
                        emit(OPCODES[LITERAL])
                        emit(k)
                    emit(OPCODES[FAILURE])
                    # back-patch the skip word with the set length
                    code[skip] = _len(code) - skip
                else:
                    # plain case-insensitive literal: the *_IGNORE opcode
                    # followed by the lowered character
                    emit(OPCODES[OP_IGNORE[op]])
                    emit(lo)
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                # set members are lowered by _compile_charset via fixup
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = None
            skip = _len(code); emit(0)
            _compile_charset(av, flags, code, fixup, fixes)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])
            else:
                emit(OPCODES[ANY])
        elif op in REPEATING_CODES:
            # av is (min, max, body)
            if flags & SRE_FLAG_TEMPLATE:
                raise error("internal: unsupported template operator")
            elif _simple(av) and op is not REPEAT:
                # body accepted by _simple(): use the *_ONE opcodes, with
                # the body terminated by SUCCESS inside the skipped block
                if op is MAX_REPEAT:
                    emit(OPCODES[REPEAT_ONE])
                else:
                    emit(OPCODES[MIN_REPEAT_ONE])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            else:
                # general form: REPEAT skip min max body, followed by
                # MAX_UNTIL or MIN_UNTIL (emitted after the skip is patched)
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            # av is (group, body); a false group value emits no marks
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(OPCODES[op])
        elif op in ASSERT_CODES:
            # av is (direction, body)
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error("look-behind requires fixed-width pattern")
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(OPCODES[op])
            # remap the position code according to the active flags;
            # codes without an entry in a table are left unchanged
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            emit(OPCODES[op])
            # positions of the JUMP operands, patched once the end of the
            # whole branch is known
            tail = []
            tailappend = tail.append
            # note: the loop variable deliberately reuses the name av
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tailappend(_len(code)); emit(0)
                # skip word: distance to the next alternative
                code[skip] = _len(code) - skip
            emit(0) # end of branch
            # note: the loop variable reuses the name tail
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            # the operand is the group number minus one
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            # av is (group, yes-body, no-body); no-body may be false
            emit(OPCODES[op])
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(OPCODES[JUMP])
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise ValueError("unsupported operand type", op)

def _compile_charset(charset, flags, code, fixup=None, fixes=None):
    """Append the code for a character set to *code*.

    The set is first passed through _optimize_charset(); every resulting
    (op, av) item is emitted as its opcode followed by its operand(s),
    and the whole sequence is closed with a FAILURE opcode.

    Raises error for a set item whose operator is not handled here.
    """
    append = code.append
    items = _optimize_charset(charset, fixup, fixes,
                              flags & SRE_FLAG_UNICODE)
    for op, av in items:
        append(OPCODES[op])
        if op is NEGATE:
            # NEGATE carries no operand
            continue
        if op is LITERAL:
            append(av)
        elif op is RANGE:
            append(av[0])
            append(av[1])
        elif op is CHARSET or op is BIGCHARSET:
            # operand is already a sequence of code words
            code.extend(av)
        elif op is CATEGORY:
            # pick the locale/unicode variant of the category if requested
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            append(CHCODES[av])
        else:
            raise error("internal: unsupported set operator")
    append(OPCODES[FAILURE])

def _optimize_charset(charset, fixup, fixes, isunicode):
    """Return an equivalent, usually more compact, list of set items.

    charset   -- list of (op, av) set items.
    fixup     -- callable mapping a character code to its case-folded
                 code, or a false value when case is not folded.
    fixes     -- mapping from a code to a tuple of extra codes to include
                 with it, or a false value.
    isunicode -- true when Unicode case rules are in effect.

    LITERAL and RANGE items are collected in a byte map indexed by
    character code (256 entries, grown to 65536 on demand); NEGATE items
    are kept at the front of the result and everything else is appended
    unchanged at the end.  The map is then rendered as LITERAL/RANGE
    items (at most two runs), as one CHARSET bitmap, or as a BIGCHARSET.
    When no fixup is given and the literal/range form is not shorter than
    the input, the original *charset* object is returned.
    """
    # internal: optimize character set
    out = []
    tail = []
    charmap = bytearray(256)
    for op, av in charset:
        # The loop body runs a second time for the same item after the
        # map has been enlarged (see the IndexError handler).
        while True:
            try:
                if op is LITERAL:
                    if fixup:
                        i = fixup(av)
                        charmap[i] = 1
                        if fixes and i in fixes:
                            for k in fixes[i]:
                                charmap[k] = 1
                    else:
                        charmap[av] = 1
                elif op is RANGE:
                    r = range(av[0], av[1]+1)
                    if fixup:
                        r = map(fixup, r)
                    if fixup and fixes:
                        for i in r:
                            charmap[i] = 1
                            if i in fixes:
                                for k in fixes[i]:
                                    charmap[k] = 1
                    else:
                        for i in r:
                            charmap[i] = 1
                elif op is NEGATE:
                    out.append((op, av))
                else:
                    tail.append((op, av))
            except IndexError:
                # a code did not fit into the current map
                if len(charmap) == 256:
                    # character set contains non-UCS1 character codes
                    charmap += b'\0' * 0xff00
                    continue
                # character set contains non-BMP character codes
                if fixup and isunicode and op is RANGE:
                    lo, hi = av
                    ranges = [av]
                    # There are only two ranges of cased astral characters:
                    # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi).
                    _fixup_range(max(0x10000, lo), min(0x11fff, hi),
                                 ranges, fixup)
                    for lo, hi in ranges:
                        if lo == hi:
                            tail.append((LITERAL, hi))
                        else:
                            tail.append((RANGE, (lo, hi)))
                else:
                    tail.append((op, av))
            break

    # compress character map
    # runs collects (start, end) spans of consecutive set entries, end
    # exclusive; it is set to None as soon as a third run is found.
    runs = []
    q = 0
    while True:
        p = charmap.find(1, q)
        if p < 0:
            break
        if len(runs) >= 2:
            runs = None
            break
        q = charmap.find(0, p)
        if q < 0:
            runs.append((p, len(charmap)))
            break
        runs.append((p, q))
    if runs is not None:
        # use literal/range
        for p, q in runs:
            if q - p == 1:
                out.append((LITERAL, p))
            else:
                out.append((RANGE, (p, q - 1)))
        out += tail
        # if the case was changed or new representation is more compact
        if fixup or len(out) < len(charset):
            return out
        # else original character set is good enough
        return charset

    # use bitmap
    if len(charmap) == 256:
        data = _mk_bitmap(charmap)
        out.append((CHARSET, data))
        out += tail
        return out

    # To represent a big charset, first a bitmap of all characters in the
    # set is constructed. Then, this bitmap is sliced into chunks of 256
    # characters, duplicate chunks are eliminated, and each chunk is
    # given a number. In the compiled expression, the charset is
    # represented by a 32-bit word sequence, consisting of one word for
    # the number of different chunks, a sequence of 256 bytes (64 words)
    # of chunk numbers indexed by their original chunk position, and a
    # sequence of 256-bit chunks (8 words each).

    # Compression is normally good: in a typical charset, large ranges of
    # Unicode will be either completely excluded (e.g. if only cyrillic
    # letters are to be matched), or completely included (e.g. if large
    # subranges of Kanji match). These ranges will be represented by
    # chunks of all one-bits or all zero-bits.

    # Matching can be also done efficiently: the more significant byte of
    # the Unicode character is an index into the chunk number, and the
    # less significant byte is a bit index in the chunk (just like the
    # CHARSET matching).

    charmap = bytes(charmap) # should be hashable
    # comps maps a distinct 256-byte chunk to the number assigned to it
    comps = {}
    mapping = bytearray(256)
    block = 0
    data = bytearray()
    for i in range(0, 65536, 256):
        chunk = charmap[i: i + 256]
        if chunk in comps:
            mapping[i // 256] = comps[chunk]
        else:
            mapping[i // 256] = comps[chunk] = block
            block += 1
            data += chunk
    data = _mk_bitmap(data)
    # prepend the chunk count and the chunk-number table
    data[0:0] = [block] + _bytes_to_codes(mapping)
    out.append((BIGCHARSET, data))
    out += tail
    return out

def _fixup_range(lo, hi, ranges, fixup):
    """Add the fixup() image of every code in lo..hi to *ranges* in place.

    lo, hi -- inclusive bounds of the codes to map through *fixup*.
    ranges -- list of inclusive (start, end) tuples; it is scanned from
              the front, so it is assumed to be sorted in ascending order.
    fixup  -- callable mapping a code to its case-folded code.

    A mapped code that lies inside an existing range is ignored, one that
    is directly adjacent to a range extends that range, and any other
    code is stored as a new single-element range at its sorted position.
    Ranges that become adjacent to each other are not merged.
    Returns None.
    """
    for i in map(fixup, range(lo, hi+1)):
        for k, (start, end) in enumerate(ranges):
            if i < start:
                # The original tested the undefined name "l" here, which
                # raised NameError; the mapped code "i" is what is meant.
                if i == start - 1:
                    ranges[k] = (i, end)
                else:
                    ranges.insert(k, (i, i))
                break
            elif i > end:
                if i == end + 1:
                    ranges[k] = (start, i)
                    break
                # otherwise keep looking at the following ranges
            else:
                # already covered by this range
                break
        else:
            # larger than everything present (or the list is empty)
            ranges.append((i, i))

# Number of bits in one engine code word.
_CODEBITS = _sre.CODESIZE * 8
# Translation table: byte 0 -> ASCII '0', every other byte -> ASCII '1'.
_BITS_TRANS = b'0' + b'1' * 255
def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
    """Pack a byte-per-entry map into a list of integers.

    Each byte of *bits* contributes one bit (0 for a zero byte, 1 for any
    other value).  The bits are grouped _CODEBITS at a time; the first
    byte of *bits* ends up in the least significant bit of the first
    integer.  Callers in this file pass maps whose length is a multiple
    of 256.
    """
    # Reversing the digit string puts the first entry at the end, i.e.
    # into the lowest bit of the last-taken slice.
    digits = bits.translate(_BITS_TRANS)[::-1]
    words = []
    for end in range(len(digits), 0, -_CODEBITS):
        start = end - _CODEBITS
        words.append(_int(digits[start: end], 2))
    return words

def _bytes_to_codes(b):
    """Return the buffer *b* reinterpreted as a list of unsigned ints.

    The bytes are viewed with the native 'I' format; the assertions
    check that this format has the engine's code size and that the view
    covers *b* completely.
    """
    words = memoryview(b).cast('I')
    assert words.itemsize == _sre.CODESIZE
    assert len(words) * words.itemsize == len(b)
    codes = words.tolist()
    return codes

def _simple(av):
    """Tell whether the repeat body av[2] qualifies as "simple".

    That is the case when its minimum and maximum widths are both 1 and
    its first item is not a SUBPATTERN.
    """
    body = av[2]
    shortest, longest = body.getwidth()
    if not (shortest == longest == 1):
        return False
    first_op = body[0][0]
    return first_op != SUBPATTERN

def _generate_overlap_table(prefix):
    """
    Build the self-overlap table of *prefix*.

    The result has one entry per element of the prefix.  Entry i is the
    length k of the longest proper prefix of prefix[0:i+1] that is also a
    suffix of it:
    - 0 means nothing ending at index i overlaps the start of the prefix;
    - 0 < k <= i means prefix[i-k+1:i+1] equals prefix[0:k].
    """
    size = len(prefix)
    overlaps = [0] * size
    for pos in range(1, size):
        current = prefix[pos]
        # Start from the overlap reached at the previous position and
        # fall back to shorter overlaps until one can be extended.
        k = overlaps[pos - 1]
        while k and prefix[k] != current:
            k = overlaps[k - 1]
        if prefix[k] == current:
            k += 1
        overlaps[pos] = k
    return overlaps

def _compile_info(code, pattern, flags):
    """Append an INFO block describing *pattern* to *code*.

    The block consists of the INFO opcode, a skip word, a flag mask, the
    minimum and maximum pattern width, and then either a literal prefix
    (length, skip count, characters, overlap table) or a compiled
    character set for the first character.  Nothing is emitted when the
    minimum width of the pattern is 0.  Prefix and charset detection is
    only attempted when SRE_FLAG_IGNORECASE is not set.
    """
    shortest, longest = pattern.getwidth()
    if shortest == 0:
        return # not worth it

    def first_literals(alternatives):
        # Return the (LITERAL, value) item that starts each alternative,
        # or None if an alternative is empty or starts with anything else.
        picked = []
        for branch in alternatives:
            if not branch:
                return None
            head_op, head_av = branch[0]
            if head_op is not LITERAL:
                return None
            picked.append((head_op, head_av))
        return picked

    literal_prefix = []
    skipped = 0
    charset = []
    if not (flags & SRE_FLAG_IGNORECASE):
        # Collect leading literals; a group holding exactly one literal
        # also counts, but does not advance the skip count.
        for op, av in pattern.data:
            if op is LITERAL:
                if len(literal_prefix) == skipped:
                    skipped += 1
            elif op is SUBPATTERN and len(av[1]) == 1:
                op, av = av[1][0]
                if op is not LITERAL:
                    break
            else:
                break
            literal_prefix.append(av)
        # Without a literal prefix, try to describe the first character
        # as a character set instead.
        if not literal_prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charset.append((op, av))
                elif op is BRANCH:
                    candidates = first_literals(av[1])
                    if candidates is not None:
                        charset = candidates
            elif op is BRANCH:
                candidates = first_literals(av[1])
                if candidates is not None:
                    charset = candidates
            elif op is IN:
                charset = av

    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    patch_at = len(code)
    emit(0)
    # flag mask
    if literal_prefix:
        mask = SRE_INFO_PREFIX
        if len(literal_prefix) == skipped == len(pattern.data):
            # the whole pattern is the literal prefix
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = SRE_INFO_CHARSET
    else:
        mask = 0
    emit(mask)
    # pattern width, clamped to what fits into a code word
    if shortest < MAXCODE:
        emit(shortest)
    else:
        emit(MAXCODE)
        literal_prefix = literal_prefix[:MAXCODE]
    emit(longest if longest < MAXCODE else 0)
    if literal_prefix:
        emit(len(literal_prefix)) # length
        emit(skipped) # skip
        code.extend(literal_prefix)
        code.extend(_generate_overlap_table(literal_prefix))
    elif charset:
        _compile_charset(charset, flags, code)
    # back-patch the skip word with the size of the info block
    code[patch_at] = len(code) - patch_at

def isstring(obj):
    """Return True when *obj* is a str or a bytes instance."""
    return isinstance(obj, str) or isinstance(obj, bytes)

def _code(p, flags):
    """Return the complete code list for the parsed pattern *p*.

    The flags stored on the pattern are combined with *flags*; the list
    holds the info block (if any), the compiled pattern body and a
    closing SUCCESS opcode.
    """
    combined = p.pattern.flags | flags
    program = []
    _compile_info(program, p, combined)   # optional INFO block first
    _compile(program, p.data, combined)   # then the pattern itself
    program.append(OPCODES[SUCCESS])
    return program

def compile(p, flags=0):
    """Convert a pattern into a compiled pattern object.

    *p* is either a pattern string (str or bytes), which is parsed with
    sre_parse.parse(), or an already parsed pattern.  Raises
    AssertionError when the pattern has more than 100 groups.
    """
    # internal: convert pattern list to internal format
    source = None
    if isstring(p):
        source = p
        p = sre_parse.parse(p, flags)

    code = _code(p, flags)

    state = p.pattern

    # XXX: <fl> get rid of this limitation!
    if state.groups > 100:
        raise AssertionError(
            "sorry, but this version only supports 100 named groups"
            )

    # name -> index as collected by the parser, and the reverse table
    groupindex = state.groupdict
    indexgroup = [None] * state.groups
    for name, index in groupindex.items():
        indexgroup[index] = name

    return _sre.compile(
        source, flags | state.flags, code,
        state.groups - 1,
        groupindex, indexgroup
        )
-												Added Fredrik Lundh's sre module and its supporting cast.

NOTE: THIS IS VERY ROUGH ALPHA CODE!

											
										
										
											2000-03-31 10:58:54 -04:00
+								#
 								# Secret Labs' Regular Expression Engine
 								#
 								# convert template to internal format
 								#
-												SRE fixes for 2.1 alpha:

-- added some more docstrings
-- fixed typo in scanner class (#125531)
-- the multiline flag (?m) shouldn't affect the \Z operator (#127259)
-- fixed non-greedy backtracking bug (#123769, #127259)
-- added sre.DEBUG flag (currently dumps the parsed pattern structure)
-- fixed a couple of glitches in groupdict (the #126587 memory leak
   had already been fixed by AMK)

											
										
										
											2001-01-14 11:06:11 -04:00
+								# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
-												Added Fredrik Lundh's sre module and its supporting cast.

NOTE: THIS IS VERY ROUGH ALPHA CODE!

											
										
										
											2000-03-31 10:58:54 -04:00
+								#
-												SRE 0.9.8: passes the entire test suite

-- reverted REPEAT operator to use "repeat context" strategy
   (from 0.8.X), but done right this time.
-- got rid of backtracking stack; use nested SRE_MATCH calls
   instead (should probably put it back again in 0.9.9 ;-)
-- properly reset state in scanner mode
-- don't use aggressive inlining by default

											
										
										
											2000-08-01 15:20:07 -03:00
+								# See the sre.py file for information on usage and redistribution.
-												Added Fredrik Lundh's sre module and its supporting cast.

NOTE: THIS IS VERY ROUGH ALPHA CODE!

											
										
										
											2000-03-31 10:58:54 -04:00
+								#
-												Added docstrings by Neal Norwitz.  This closes SF bug #450980.

											
										
										
											2001-09-04 16:10:20 -03:00
+								"""Internal support module for sre"""
-												Issue #20976: pyflakes: Remove unused imports

											
										
										
											2014-03-20 05:16:38 -03:00
+								import _sre
-												Merged revisions 62194,62197-62198,62204-62205,62214,62219-62221,62227,62229-62231,62233-62235,62237-62239 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk

........
  r62194 | jeffrey.yasskin | 2008-04-07 01:04:28 +0200 (Mon, 07 Apr 2008) | 7 lines

  Add enough debugging information to diagnose failures where the
  HandlerBException is ignored, and fix one such problem, where it was thrown
  during the __del__ method of the previous Popen object.

  We may want to find a better way of printing verbose information so it's not
  spammy when the test passes.
........
  r62197 | mark.hammond | 2008-04-07 03:53:39 +0200 (Mon, 07 Apr 2008) | 2 lines

  Issue #2513: enable 64bit cross compilation on windows.
........
  r62198 | mark.hammond | 2008-04-07 03:59:40 +0200 (Mon, 07 Apr 2008) | 2 lines

  correct heading underline for new "Cross-compiling on Windows" section
........
  r62204 | gregory.p.smith | 2008-04-07 08:33:21 +0200 (Mon, 07 Apr 2008) | 4 lines

  Use the new PyFile_IncUseCount & PyFile_DecUseCount calls appropriately
  within the standard library.  These modules use PyFile_AsFile and later
  release the GIL while operating on the previously returned FILE*.
........
  r62205 | mark.summerfield | 2008-04-07 09:39:23 +0200 (Mon, 07 Apr 2008) | 4 lines

  changed "2500 components" to "several thousand" since the number keeps
  growing :-)
........
  r62214 | georg.brandl | 2008-04-07 20:51:59 +0200 (Mon, 07 Apr 2008) | 2 lines

  #2525: update timezone info examples in the docs.
........
  r62219 | andrew.kuchling | 2008-04-08 01:57:07 +0200 (Tue, 08 Apr 2008) | 1 line

  Write PEP 3127 section; add items
........
  r62220 | andrew.kuchling | 2008-04-08 01:57:21 +0200 (Tue, 08 Apr 2008) | 1 line

  Typo fix
........
  r62221 | andrew.kuchling | 2008-04-08 03:33:10 +0200 (Tue, 08 Apr 2008) | 1 line

  Typographical fix: 32bit -> 32-bit, 64bit -> 64-bit
........
  r62227 | andrew.kuchling | 2008-04-08 23:22:53 +0200 (Tue, 08 Apr 2008) | 1 line

  Add items
........
  r62229 | amaury.forgeotdarc | 2008-04-08 23:27:42 +0200 (Tue, 08 Apr 2008) | 7 lines

  Issue2564: Prevent a hang in "import test.autotest", which runs the entire test
  suite as a side-effect of importing the module.

  - in test_capi, a thread tried to import other modules
  - re.compile() imported sre_parse again on every call.
........
  r62230 | amaury.forgeotdarc | 2008-04-08 23:51:57 +0200 (Tue, 08 Apr 2008) | 2 lines

  Prevent an error when inspect.isabstract() is called with something else than a new-style class.
........
  r62231 | amaury.forgeotdarc | 2008-04-09 00:07:05 +0200 (Wed, 09 Apr 2008) | 8 lines

  Issue 2408: remove the _types module
  It was only used as a helper in types.py to access types (GetSetDescriptorType and MemberDescriptorType),
  when they can easily be obtained with python code.
  These expressions even work with Jython.

  I don't know what the future of the types module is; (cf. discussion in http://bugs.python.org/issue1605 )
  at least this change makes it simpler.
........
  r62233 | amaury.forgeotdarc | 2008-04-09 01:10:07 +0200 (Wed, 09 Apr 2008) | 2 lines

  Add a NEWS entry for previous checkin
........
  r62234 | trent.nelson | 2008-04-09 01:47:30 +0200 (Wed, 09 Apr 2008) | 37 lines

  - Issue #2550: The approach used by client/server code for obtaining ports
    to listen on in network-oriented tests has been refined in an effort to
    facilitate running multiple instances of the entire regression test suite
    in parallel without issue.  test_support.bind_port() has been fixed such
    that it will always return a unique port -- which wasn't always the case
    with the previous implementation, especially if socket options had been
    set that affected address reuse (i.e. SO_REUSEADDR, SO_REUSEPORT).  The
    new implementation of bind_port() will actually raise an exception if it
    is passed an AF_INET/SOCK_STREAM socket with either the SO_REUSEADDR or
    SO_REUSEPORT socket option set.  Furthermore, if available, bind_port()
    will set the SO_EXCLUSIVEADDRUSE option on the socket it's been passed.
    This currently only applies to Windows.  This option prevents any other
    sockets from binding to the host/port we've bound to, thus removing the
    possibility of the 'non-deterministic' behaviour, as Microsoft puts it,
    that occurs when a second SOCK_STREAM socket binds and accepts to a
    host/port that's already been bound by another socket.  The optional
    preferred port parameter to bind_port() has been removed.  Under no
    circumstances should tests be hard coding ports!

    test_support.find_unused_port() has also been introduced, which will pass
    a temporary socket object to bind_port() in order to obtain an unused port.
    The temporary socket object is then closed and deleted, and the port is
    returned.  This method should only be used for obtaining an unused port
    in order to pass to an external program (i.e. the -accept [port] argument
    to openssl's s_server mode) or as a parameter to a server-oriented class
    that doesn't give you direct access to the underlying socket used.

    Finally, test_support.HOST has been introduced, which should be used for
    the host argument of any relevant socket calls (i.e. bind and connect).

    The following tests were updated to follow the new conventions:
      test_socket, test_smtplib, test_asyncore, test_ssl, test_httplib,
      test_poplib, test_ftplib, test_telnetlib, test_socketserver,
      test_asynchat and test_socket_ssl.

    It is now possible for multiple instances of the regression test suite to
    run in parallel without issue.
........
  r62235 | gregory.p.smith | 2008-04-09 02:25:17 +0200 (Wed, 09 Apr 2008) | 3 lines

  Fix zlib crash from zlib.decompressobj().flush(val) when val was not positive.
  It tried to allocate negative or zero memory.  That fails.
........
  r62237 | trent.nelson | 2008-04-09 02:34:53 +0200 (Wed, 09 Apr 2008) | 1 line

  Fix typo with regards to self.PORT shadowing class variables with the same name.
........
  r62238 | andrew.kuchling | 2008-04-09 03:08:32 +0200 (Wed, 09 Apr 2008) | 1 line

  Add items
........
  r62239 | jerry.seutter | 2008-04-09 07:07:58 +0200 (Wed, 09 Apr 2008) | 1 line

  Changed test so it no longer runs as a side effect of importing.
........

											
										
										
											2008-04-09 05:37:03 -03:00
+								import sre_parse
-												Added Fredrik Lundh's sre module and its supporting cast.

NOTE: THIS IS VERY ROUGH ALPHA CODE!

											
										
										
											2000-03-31 10:58:54 -04:00
+								from sre_constants import *
-												Issue #13169: The maximal repetition number in a regular expression has been
increased from 65534 to 2147483647 (on 32-bit platform) or 4294967294 (on
64-bit).

											
										
										
											2013-02-16 10:47:47 -04:00
+								from _sre import MAXREPEAT
-												Added Fredrik Lundh's sre module and its supporting cast.

NOTE: THIS IS VERY ROUGH ALPHA CODE!

											
										
										
											2000-03-31 10:58:54 -04:00
-												added "magic" number to the _sre module, to avoid weird errors caused
by compiler/engine mismatches

											
										
										
											2001-01-15 08:46:09 -04:00
+								assert _sre.MAGIC == MAGIC, "SRE module mismatch"
-												Fully support 32-bit codes. Enable BIGCHARSET in UCS-4 builds.

											
										
										
											2003-04-19 09:56:08 -03:00
+								if _sre.CODESIZE == 2:
 								    MAXCODE = 65535
 								else:
-												Rip out 'long' and 'L'-suffixed integer literals.
(Rough first cut.)

											
										
										
											2007-01-15 12:59:06 -04:00
+								    MAXCODE = 0xFFFFFFFF
-												-- use charset bitmaps where appropriate.  this gives a 5-10%
   speedup for some tests, including the python tokenizer.

-- added support for an optional charset anchor to the engine
   (currently unused by the code generator).

-- removed workaround for array module bug.

											
										
										
											2000-07-02 09:00:07 -03:00
-												Complete the previous effort to factor out constant expressions
and improve the speed of the if/elif/else blocks.

											
										
										
											2005-02-28 15:27:52 -04:00
+								_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
 								_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
 								_SUCCESS_CODES = set([SUCCESS, FAILURE])
 								_ASSERT_CODES = set([ASSERT, ASSERT_NOT])
-												Issue #12728: Different Unicode characters having the same uppercase but
different lowercase are now matched in case-insensitive regular expressions.

											
										
										
											2014-11-10 06:37:16 -04:00
+								# Sets of lowercase characters which have the same uppercase.
 								_equivalences = (
 								    # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
 								    (0x69, 0x131), # iı
 								    # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
 								    (0x73, 0x17f), # sſ
 								    # MICRO SIGN, GREEK SMALL LETTER MU
 								    (0xb5, 0x3bc), # µμ
 								    # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
 								    (0x345, 0x3b9, 0x1fbe), # \u0345ιι
 								    # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
 								    (0x390, 0x1fd3), # ΐΐ
 								    # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
 								    (0x3b0, 0x1fe3), # ΰΰ
 								    # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
 								    (0x3b2, 0x3d0), # βϐ
 								    # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
 								    (0x3b5, 0x3f5), # εϵ
 								    # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
 								    (0x3b8, 0x3d1), # θϑ
 								    # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
 								    (0x3ba, 0x3f0), # κϰ
 								    # GREEK SMALL LETTER PI, GREEK PI SYMBOL
 								    (0x3c0, 0x3d6), # πϖ
 								    # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
 								    (0x3c1, 0x3f1), # ρϱ
 								    # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
 								    (0x3c2, 0x3c3), # ςσ
 								    # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
 								    (0x3c6, 0x3d5), # φϕ
 								    # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
 								    (0x1e61, 0x1e9b), # ṡẛ
 								    # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
 								    (0xfb05, 0xfb06), # ﬅﬆ
 								)
 								# Maps the lowercase code to lowercase codes which have the same uppercase.
 								_ignorecase_fixes = {i: tuple(j for j in t if i != j)
 								                     for t in _equivalences for i in t}
-												towards 1.6b1

											
										
										
											2000-06-29 05:58:44 -03:00
+								def _compile(code, pattern, flags):
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
+								    # internal: compile a (sub)pattern
-												- fixed split
  (test_sre still complains about split, but that's caused by
  the group reset bug, not split itself)

- added more mark slots
  (should be dynamically allocated, but 100 is better than 32.
  and checking for the upper limit is better than overwriting
  the memory ;-)

- internal: renamed the cursor helper class

- internal: removed some bloat from sre_compile

											
										
										
											2000-06-29 13:57:40 -03:00
+								    emit = code.append
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								    _len = len
-												Complete the previous effort to factor out constant expressions
and improve the speed of the if/elif/else blocks.

											
										
										
											2005-02-28 15:27:52 -04:00
+								    LITERAL_CODES = _LITERAL_CODES
 								    REPEATING_CODES = _REPEATING_CODES
 								    SUCCESS_CODES = _SUCCESS_CODES
 								    ASSERT_CODES = _ASSERT_CODES
-												Issue #12728: Different Unicode characters having the same uppercase but
different lowercase are now matched in case-insensitive regular expressions.

											
										
										
											2014-11-10 06:37:16 -04:00
+								    if (flags & SRE_FLAG_IGNORECASE and
 								            not (flags & SRE_FLAG_LOCALE) and
 								            flags & SRE_FLAG_UNICODE):
 								        fixes = _ignorecase_fixes
 								    else:
 								        fixes = None
-												Added Fredrik Lundh's sre module and its supporting cast.

NOTE: THIS IS VERY ROUGH ALPHA CODE!

											
										
										
											2000-03-31 10:58:54 -04:00
+								    for op, av in pattern:
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								        if op in LITERAL_CODES:
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								            if flags & SRE_FLAG_IGNORECASE:
-												Issue #12728: Different Unicode characters having the same uppercase but
different lowercase are now matched in case-insensitive regular expressions.

											
										
										
											2014-11-10 06:37:16 -04:00
+								                lo = _sre.getlower(av, flags)
 								                if fixes and lo in fixes:
 								                    emit(OPCODES[IN_IGNORE])
 								                    skip = _len(code); emit(0)
 								                    if op is NOT_LITERAL:
 								                        emit(OPCODES[NEGATE])
 								                    for k in (lo,) + fixes[lo]:
 								                        emit(OPCODES[LITERAL])
 								                        emit(k)
 								                    emit(OPCODES[FAILURE])
 								                    code[skip] = _len(code) - skip
 								                else:
 								                    emit(OPCODES[OP_IGNORE[op]])
 								                    emit(lo)
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								            else:
 								                emit(OPCODES[op])
-												from the really-stupid-bug department: uppercase literals should match
uppercase strings also when the IGNORECASE flag is set (bug #128899)

(also added test cases for recently fixed bugs to the regression suite
-- or in other words, check in re_tests.py too...)

											
										
										
											2001-01-15 14:28:14 -04:00
+								                emit(av)
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								        elif op is IN:
 								            if flags & SRE_FLAG_IGNORECASE:
 								                emit(OPCODES[OP_IGNORE[op]])
 								                def fixup(literal, flags=flags):
-												the mad patcher strikes again:

-- added pickling support (only works if sre is imported)

-- fixed wordsize problems in engine
   (instead of casting literals down to the character size,
   cast characters up to the literal size, i.e. the code
   word size.  this prevents false hits when you're matching
   a unicode pattern against an 8-bit string.  unfortunately,
   this broke another test, but I think the test should be
   changed in this case; more on that on python-dev)

-- added sre.purge function
   (unofficial, clears the cache)

											
										
										
											2000-06-30 10:55:15 -03:00
+								                    return _sre.getlower(literal, flags)
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								            else:
 								                emit(OPCODES[op])
-												Issue #17381: Fixed handling of case-insensitive ranges in regular expressions.

											
										
										
											2014-10-31 07:37:50 -03:00
+								                fixup = None
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								            skip = _len(code); emit(0)
-												Issue #12728: Different Unicode characters having the same uppercase but
different lowercase are now matched in case-insensitive regular expressions.

											
										
										
											2014-11-10 06:37:16 -04:00
+								            _compile_charset(av, flags, code, fixup, fixes)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								            code[skip] = _len(code) - skip
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								        elif op is ANY:
 								            if flags & SRE_FLAG_DOTALL:
-												final 0.9.8 updates:

-- added REPEAT_ONE operator
-- added ANY_ALL operator (used to represent "(?s).")

											
										
										
											2000-08-01 19:47:49 -03:00
+								                emit(OPCODES[ANY_ALL])
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								            else:
-												final 0.9.8 updates:

-- added REPEAT_ONE operator
-- added ANY_ALL operator (used to represent "(?s).")

											
										
										
											2000-08-01 19:47:49 -03:00
+								                emit(OPCODES[ANY])
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								        elif op in REPEATING_CODES:
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								            if flags & SRE_FLAG_TEMPLATE:
-												Raise statement normalization in Lib/.

											
										
										
											2007-08-29 22:19:48 -03:00
+								                raise error("internal: unsupported template operator")
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								            elif _simple(av) and op is not REPEAT:
 								                if op is MAX_REPEAT:
-												SF patch #720991 by Gary Herron:
A small fix for bug #545855 and Greg Chapman's
addition of op code SRE_OP_MIN_REPEAT_ONE for
eliminating recursion on simple uses of pattern '*?' on a
long string.

											
										
										
											2003-04-14 14:59:34 -03:00
+								                    emit(OPCODES[REPEAT_ONE])
 								                else:
 								                    emit(OPCODES[MIN_REPEAT_ONE])
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                skip = _len(code); emit(0)
-												final 0.9.8 updates:

-- added REPEAT_ONE operator
-- added ANY_ALL operator (used to represent "(?s).")

											
										
										
											2000-08-01 19:47:49 -03:00
+								                emit(av[0])
 								                emit(av[1])
 								                _compile(code, av[2], flags)
 								                emit(OPCODES[SUCCESS])
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                code[skip] = _len(code) - skip
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								            else:
-												final 0.9.8 updates:

-- added REPEAT_ONE operator
-- added ANY_ALL operator (used to represent "(?s).")

											
										
										
											2000-08-01 19:47:49 -03:00
+								                emit(OPCODES[REPEAT])
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                skip = _len(code); emit(0)
-												final 0.9.8 updates:

-- added REPEAT_ONE operator
-- added ANY_ALL operator (used to represent "(?s).")

											
										
										
											2000-08-01 19:47:49 -03:00
+								                emit(av[0])
 								                emit(av[1])
 								                _compile(code, av[2], flags)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                code[skip] = _len(code) - skip
 								                if op is MAX_REPEAT:
-												final 0.9.8 updates:

-- added REPEAT_ONE operator
-- added ANY_ALL operator (used to represent "(?s).")

											
										
										
											2000-08-01 19:47:49 -03:00
+								                    emit(OPCODES[MAX_UNTIL])
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								                else:
-												final 0.9.8 updates:

-- added REPEAT_ONE operator
-- added ANY_ALL operator (used to represent "(?s).")

											
										
										
											2000-08-01 19:47:49 -03:00
+								                    emit(OPCODES[MIN_UNTIL])
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								        elif op is SUBPATTERN:
-												SRE 0.9.8: passes the entire test suite

-- reverted REPEAT operator to use "repeat context" strategy
   (from 0.8.X), but done right this time.
-- got rid of backtracking stack; use nested SRE_MATCH calls
   instead (should probably put it back again in 0.9.9 ;-)
-- properly reset state in scanner mode
-- don't use aggressive inlining by default

											
										
										
											2000-08-01 15:20:07 -03:00
+								            if av[0]:
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								                emit(OPCODES[MARK])
-												SRE 0.9.8: passes the entire test suite

-- reverted REPEAT operator to use "repeat context" strategy
   (from 0.8.X), but done right this time.
-- got rid of backtracking stack; use nested SRE_MATCH calls
   instead (should probably put it back again in 0.9.9 ;-)
-- properly reset state in scanner mode
-- don't use aggressive inlining by default

											
										
										
											2000-08-01 15:20:07 -03:00
+								                emit((av[0]-1)*2)
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								            # _compile_info(code, av[1], flags)
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								            _compile(code, av[1], flags)
-												SRE 0.9.8: passes the entire test suite

-- reverted REPEAT operator to use "repeat context" strategy
   (from 0.8.X), but done right this time.
-- got rid of backtracking stack; use nested SRE_MATCH calls
   instead (should probably put it back again in 0.9.9 ;-)
-- properly reset state in scanner mode
-- don't use aggressive inlining by default

											
										
										
											2000-08-01 15:20:07 -03:00
+								            if av[0]:
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								                emit(OPCODES[MARK])
-												SRE 0.9.8: passes the entire test suite

-- reverted REPEAT operator to use "repeat context" strategy
   (from 0.8.X), but done right this time.
-- got rid of backtracking stack; use nested SRE_MATCH calls
   instead (should probably put it back again in 0.9.9 ;-)
-- properly reset state in scanner mode
-- don't use aggressive inlining by default

											
										
										
											2000-08-01 15:20:07 -03:00
+								                emit((av[0]-1)*2+1)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								        elif op in SUCCESS_CODES:
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								            emit(OPCODES[op])
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								        elif op in ASSERT_CODES:
-												- added lookbehind support (?<=pattern), (?<!pattern).
  the pattern must have a fixed width.

- got rid of array-module dependencies; the match pro-
  gram is now stored inside the pattern object, rather
  than in an extra string buffer.

- cleaned up a variety of potential leaks, API abuses,
  and other minor issues in the engine module.

- use mal's new isalnum macro, rather than my own work-
  around.

- untabified test_sre.py.  seems like I removed a couple
  of trailing spaces in the process...

											
										
										
											2000-07-03 15:44:21 -03:00
+								            emit(OPCODES[op])
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								            skip = _len(code); emit(0)
-												- added lookbehind support (?<=pattern), (?<!pattern).
  the pattern must have a fixed width.

- got rid of array-module dependencies; the match pro-
  gram is now stored inside the pattern object, rather
  than in an extra string buffer.

- cleaned up a variety of potential leaks, API abuses,
  and other minor issues in the engine module.

- use mal's new isalnum macro, rather than my own work-
  around.

- untabified test_sre.py.  seems like I removed a couple
  of trailing spaces in the process...

											
										
										
											2000-07-03 15:44:21 -03:00
+								            if av[0] >= 0:
 								                emit(0) # look ahead
 								            else:
 								                lo, hi = av[1].getwidth()
 								                if lo != hi:
-												Raise statement normalization in Lib/.

											
										
										
											2007-08-29 22:19:48 -03:00
+								                    raise error("look-behind requires fixed-width pattern")
-												- added lookbehind support (?<=pattern), (?<!pattern).
  the pattern must have a fixed width.

- got rid of array-module dependencies; the match pro-
  gram is now stored inside the pattern object, rather
  than in an extra string buffer.

- cleaned up a variety of potential leaks, API abuses,
  and other minor issues in the engine module.

- use mal's new isalnum macro, rather than my own work-
  around.

- untabified test_sre.py.  seems like I removed a couple
  of trailing spaces in the process...

											
										
										
											2000-07-03 15:44:21 -03:00
+								                emit(lo) # look behind
 								            _compile(code, av[1], flags)
 								            emit(OPCODES[SUCCESS])
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								            code[skip] = _len(code) - skip
-												- added lookbehind support (?<=pattern), (?<!pattern).
  the pattern must have a fixed width.

- got rid of array-module dependencies; the match pro-
  gram is now stored inside the pattern object, rather
  than in an extra string buffer.

- cleaned up a variety of potential leaks, API abuses,
  and other minor issues in the engine module.

- use mal's new isalnum macro, rather than my own work-
  around.

- untabified test_sre.py.  seems like I removed a couple
  of trailing spaces in the process...

											
										
										
											2000-07-03 15:44:21 -03:00
+								        elif op is CALL:
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								            emit(OPCODES[op])
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								            skip = _len(code); emit(0)
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								            _compile(code, av, flags)
 								            emit(OPCODES[SUCCESS])
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								            code[skip] = _len(code) - skip
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								        elif op is AT:
 								            emit(OPCODES[op])
 								            if flags & SRE_FLAG_MULTILINE:
-												sre 2.1b2 update:

- take locale into account for word boundary anchors (#410271)
- restored 2.0's *? behaviour (#233283, #408936 and others)
- speed up re.sub/re.subn

											
										
										
											2001-03-22 11:50:10 -04:00
+								                av = AT_MULTILINE.get(av, av)
 								            if flags & SRE_FLAG_LOCALE:
 								                av = AT_LOCALE.get(av, av)
 								            elif flags & SRE_FLAG_UNICODE:
 								                av = AT_UNICODE.get(av, av)
 								            emit(ATCODES[av])
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								        elif op is BRANCH:
-												SRE 0.9.8: passes the entire test suite

-- reverted REPEAT operator to use "repeat context" strategy
   (from 0.8.X), but done right this time.
-- got rid of backtracking stack; use nested SRE_MATCH calls
   instead (should probably put it back again in 0.9.9 ;-)
-- properly reset state in scanner mode
-- don't use aggressive inlining by default

											
										
										
											2000-08-01 15:20:07 -03:00
+								            emit(OPCODES[op])
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								            tail = []
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								            tailappend = tail.append
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								            for av in av[1]:
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                skip = _len(code); emit(0)
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								                # _compile_info(code, av, flags)
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								                _compile(code, av, flags)
 								                emit(OPCODES[JUMP])
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                tailappend(_len(code)); emit(0)
 								                code[skip] = _len(code) - skip
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								            emit(0) # end of branch
 								            for tail in tail:
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                code[tail] = _len(code) - tail
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								        elif op is CATEGORY:
 								            emit(OPCODES[op])
 								            if flags & SRE_FLAG_LOCALE:
-												sre 2.1b2 update:

- take locale into account for word boundary anchors (#410271)
- restored 2.0's *? behaviour (#233283, #408936 and others)
- speed up re.sub/re.subn

											
										
										
											2001-03-22 11:50:10 -04:00
+								                av = CH_LOCALE[av]
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								            elif flags & SRE_FLAG_UNICODE:
-												sre 2.1b2 update:

- take locale into account for word boundary anchors (#410271)
- restored 2.0's *? behaviour (#233283, #408936 and others)
- speed up re.sub/re.subn

											
										
										
											2001-03-22 11:50:10 -04:00
+								                av = CH_UNICODE[av]
 								            emit(CHCODES[av])
-												- fixed grouping error bug

- changed "group" operator to "groupref"

											
										
										
											2000-07-03 18:31:48 -03:00
+								        elif op is GROUPREF:
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								            if flags & SRE_FLAG_IGNORECASE:
 								                emit(OPCODES[OP_IGNORE[op]])
 								            else:
 								                emit(OPCODES[op])
 								            emit(av-1)
-												Implemented non-recursive SRE matching.

											
										
										
											2003-10-17 19:13:16 -03:00
+								        elif op is GROUPREF_EXISTS:
 								            emit(OPCODES[op])
-												[Bug #1177831] Fix generation of code for GROUPREF_EXISTS.  Thanks to Andre Malo for the fix.

											
										
										
											2005-06-02 10:35:52 -03:00
+								            emit(av[0]-1)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								            skipyes = _len(code); emit(0)
-												Implemented non-recursive SRE matching.

											
										
										
											2003-10-17 19:13:16 -03:00
+								            _compile(code, av[1], flags)
 								            if av[2]:
 								                emit(OPCODES[JUMP])
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                skipno = _len(code); emit(0)
 								                code[skipyes] = _len(code) - skipyes + 1
-												Implemented non-recursive SRE matching.

											
										
										
											2003-10-17 19:13:16 -03:00
+								                _compile(code, av[2], flags)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                code[skipno] = _len(code) - skipno
-												Implemented non-recursive SRE matching.

											
										
										
											2003-10-17 19:13:16 -03:00
+								            else:
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                code[skipyes] = _len(code) - skipyes + 1
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								        else:
-												Raise statement normalization in Lib/.

											
										
										
											2007-08-29 22:19:48 -03:00
+								            raise ValueError("unsupported operand type", op)
-												Added Fredrik Lundh's sre module and its supporting cast.

NOTE: THIS IS VERY ROUGH ALPHA CODE!

											
										
										
											2000-03-31 10:58:54 -04:00
-												Issue #12728: Different Unicode characters having the same uppercase but
different lowercase are now matched in case-insensitive regular expressions.

											
										
										
											2014-11-10 06:37:16 -04:00
def _compile_charset(charset, flags, code, fixup=None, fixes=None):
    """Append the compiled subprogram for *charset* to the list *code*.

    The set is first passed through _optimize_charset(); every resulting
    (op, av) item is written as its opcode followed by its operands, and
    the subprogram is closed with a FAILURE opcode.  Raises error for an
    operator that is not a known set operator.
    """
    emit = code.append
    unicode_flag = flags & SRE_FLAG_UNICODE
    for op, av in _optimize_charset(charset, fixup, fixes, unicode_flag):
        emit(OPCODES[op])
        if op is NEGATE:
            # NEGATE has no operand
            continue
        if op is LITERAL:
            emit(av)
        elif op is RANGE:
            emit(av[0])
            emit(av[1])
        elif op is CHARSET or op is BIGCHARSET:
            # av is already a list of code words
            code.extend(av)
        elif op is CATEGORY:
            # pick the locale/unicode variant of the category if requested
            if flags & SRE_FLAG_LOCALE:
                category = CH_LOCALE[av]
            elif unicode_flag:
                category = CH_UNICODE[av]
            else:
                category = av
            emit(CHCODES[category])
        else:
            raise error("internal: unsupported set operator")
    emit(OPCODES[FAILURE])
-												Issue #12728: Different Unicode characters having the same uppercase but
different lowercase are now matched in case-insensitive regular expressions.

											
										
										
											2014-11-10 06:37:16 -04:00
def _optimize_charset(charset, fixup, fixes, isunicode):
    """Return a more compact list of set items for *charset* (internal).

    LITERAL and RANGE items are folded into a byte map with one flag per
    character code (after applying *fixup*, and the extra codes listed in
    *fixes*, when given).  The map is then rendered as at most two
    LITERAL/RANGE items, as a CHARSET bitmap (256-entry map), or as a
    BIGCHARSET (65536-entry map).  NEGATE items are kept in front; items
    that cannot be folded into the map are appended at the end.  The
    original *charset* object is returned when no fixup was applied and
    the literal/range form is not shorter than the input.
    """
    out = []
    tail = []
    charmap = bytearray(256)
    for op, av in charset:
        while True:
            try:
                if op is LITERAL or op is RANGE:
                    if op is LITERAL:
                        codes = (av,)
                    else:
                        codes = range(av[0], av[1] + 1)
                    if fixup:
                        codes = map(fixup, codes)
                    with_fixes = fixup and fixes
                    for code in codes:
                        charmap[code] = 1
                        if with_fixes and code in fixes:
                            for extra in fixes[code]:
                                charmap[extra] = 1
                elif op is NEGATE:
                    out.append((op, av))
                else:
                    tail.append((op, av))
            except IndexError:
                if len(charmap) == 256:
                    # character set contains non-UCS1 character codes:
                    # grow the map to 65536 entries and redo this item
                    charmap += b'\0' * 0xff00
                    continue
                # character set contains non-BMP character codes
                if fixup and isunicode and op is RANGE:
                    first, last = av
                    ranges = [av]
                    # There are only two ranges of cased astral characters:
                    # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi).
                    _fixup_range(max(0x10000, first), min(0x11fff, last),
                                 ranges, fixup)
                    for first, last in ranges:
                        if first == last:
                            tail.append((LITERAL, last))
                        else:
                            tail.append((RANGE, (first, last)))
                else:
                    tail.append((op, av))
            break

    # compress character map: collect up to two runs of set flags as
    # half-open (start, stop) pairs; a third run sets runs to None
    runs = []
    stop = 0
    while True:
        start = charmap.find(1, stop)
        if start < 0:
            break
        if len(runs) >= 2:
            runs = None
            break
        stop = charmap.find(0, start)
        if stop < 0:
            runs.append((start, len(charmap)))
            break
        runs.append((start, stop))

    if runs is not None:
        # use literal/range
        for start, stop in runs:
            if stop - start == 1:
                out.append((LITERAL, start))
            else:
                out.append((RANGE, (start, stop - 1)))
        out += tail
        # keep the new form if the case was changed or it is more compact,
        # otherwise the original character set is good enough
        if fixup or len(out) < len(charset):
            return out
        return charset

    # use bitmap
    if len(charmap) == 256:
        out.append((CHARSET, _mk_bitmap(charmap)))
        out += tail
        return out

    # To represent a big charset, first a bitmap of all characters in the
    # set is constructed. Then, this bitmap is sliced into chunks of 256
    # characters, duplicate chunks are eliminated, and each chunk is
    # given a number. In the compiled expression, the charset is
    # represented by a 32-bit word sequence, consisting of one word for
    # the number of different chunks, a sequence of 256 bytes (64 words)
    # of chunk numbers indexed by their original chunk position, and a
    # sequence of 256-bit chunks (8 words each).
    #
    # Compression is normally good: in a typical charset, large ranges of
    # Unicode will be either completely excluded (e.g. if only cyrillic
    # letters are to be matched), or completely included (e.g. if large
    # subranges of Kanji match). These ranges will be represented by
    # chunks of all one-bits or all zero-bits.
    #
    # Matching can be also done efficiently: the more significant byte of
    # the Unicode character is an index into the chunk number, and the
    # less significant byte is a bit index in the chunk (just like the
    # CHARSET matching).
    frozen = bytes(charmap)  # slices must be hashable to serve as dict keys
    comps = {}
    mapping = bytearray(256)
    block = 0
    data = bytearray()
    for index, offset in enumerate(range(0, 65536, 256)):
        chunk = frozen[offset: offset + 256]
        if chunk not in comps:
            # first occurrence of this chunk: give it the next number
            comps[chunk] = block
            block += 1
            data += chunk
        mapping[index] = comps[chunk]
    data = _mk_bitmap(data)
    data[0:0] = [block] + _bytes_to_codes(mapping)
    out.append((BIGCHARSET, data))
    out += tail
    return out
-												Issue #17381: Fixed handling of case-insensitive ranges in regular expressions.

											
										
										
											2014-10-31 07:37:50 -03:00
def _fixup_range(lo, hi, ranges, fixup):
    """Fold the fixed-up codes of lo..hi into the list *ranges* in place.

    Each code in the inclusive span lo..hi is passed through *fixup*; the
    result is then placed into *ranges*, a list of inclusive
    (start, end) tuples that is scanned front to back (presumably kept in
    ascending order -- verify against the caller):

    - a code directly below or above a range extends that range by one;
    - a code below a range (but not adjacent) is inserted before it as a
      new single-code range;
    - a code already inside a range changes nothing;
    - a code beyond every range is appended as a new single-code range.

    Returns None.
    """
    for i in map(fixup, range(lo, hi + 1)):
        # Distinct names for the scanned range: the parameters lo/hi are
        # not rebound by the inner loop.
        for k, (start, end) in enumerate(ranges):
            if i < start:
                # The original compared an undefined name here, which
                # raised NameError; the fixed-up code i is what is meant.
                if i == start - 1:
                    ranges[k] = (i, end)
                else:
                    ranges.insert(k, (i, i))
                break
            elif i > end:
                if i == end + 1:
                    ranges[k] = (start, i)
                    break
                # otherwise keep scanning the following ranges
            else:
                # start <= i <= end: already covered
                break
        else:
            ranges.append((i, i))
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
# Number of bits in one code word of the compiled pattern.
_CODEBITS = _sre.CODESIZE * 8
# Translation table: byte value 0 -> b'0', any other byte value -> b'1'.
_BITS_TRANS = b'0' + b'1' * 255


def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
    """Pack a sequence of per-character flags into a list of code words.

    *bits* is a bytes-like object with one byte per flag (zero is clear,
    anything else is set).  Flags are grouped _CODEBITS at a time, in
    input order, and within each word the earliest flag of the group is
    the least significant bit.
    """
    # Render the flags as ASCII binary digits, reversed so that a slice
    # taken from the end reads most-significant digit first.
    digits = bits.translate(_BITS_TRANS)[::-1]
    words = []
    for end in range(len(digits), 0, -_CODEBITS):
        words.append(_int(digits[end - _CODEBITS: end], 2))
    return words
 								def _bytes_to_codes(b):
 								    # Convert block indices to word array
-												Got rid of the array module dependency in the re module.
The re module could be used during building before array is built.

											
										
										
											2014-11-10 07:24:47 -04:00
+								    a = memoryview(b).cast('I')
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
+								    assert a.itemsize == _sre.CODESIZE
 								    assert len(a) * a.itemsize == len(b)
 								    return a.tolist()
-												added martin's BIGCHARSET patch to SRE 2.1.1.  martin reports 2x
speedups for certain unicode character ranges.

											
										
										
											2001-07-02 13:58:38 -03:00
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
def _simple(av):
    """Tell whether the repeat argument *av* is a "simple" operator.

    The repeated item av[2] qualifies when its getwidth() reports a
    minimum and maximum width of exactly 1 and its first element is not
    a SUBPATTERN.
    """
    body = av[2]
    lo, hi = body.getwidth()
    if not (lo == hi == 1):
        return False
    return body[0][0] != SUBPATTERN
-												Issue #19387: explain and test the sre overlap table

											
										
										
											2013-10-25 16:36:10 -03:00
def _generate_overlap_table(prefix):
    """
    Build the self-overlap table of the given prefix.

    The result is a list as long as the prefix.  For every index i:
    - table[i] == 0 means prefix[i:] can't overlap prefix[0:...]
    - table[i] == k with 0 < k <= i means prefix[i-k+1:i+1] overlaps
      with prefix[0:k]
    """
    size = len(prefix)
    table = [0] * size
    for pos in range(1, size):
        current = prefix[pos]
        # Start from the overlap reached at the previous position and
        # fall back to shorter overlaps until one can be extended.
        cand = table[pos - 1]
        while cand and prefix[cand] != current:
            cand = table[cand - 1]
        if prefix[cand] == current:
            table[pos] = cand + 1
        # otherwise no overlap ends here and the entry stays 0
    return table
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
+								def _compile_info(code, pattern, flags):
 								    # internal: compile an info block.  in the current version,
-												-- use charset bitmaps where appropriate.  this gives a 5-10%
   speedup for some tests, including the python tokenizer.

-- added support for an optional charset anchor to the engine
   (currently unused by the code generator).

-- removed workaround for array module bug.

											
										
										
											2000-07-02 09:00:07 -03:00
+								    # this contains min/max pattern width, and an optional literal
 								    # prefix or a character map
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
+								    lo, hi = pattern.getwidth()
 								    if lo == 0:
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								        return # not worth it
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
+								    # look for a literal prefix
 								    prefix = []
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								    prefixappend = prefix.append
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								    prefix_skip = 0
-												-- use charset bitmaps where appropriate.  this gives a 5-10%
   speedup for some tests, including the python tokenizer.

-- added support for an optional charset anchor to the engine
   (currently unused by the code generator).

-- removed workaround for array module bug.

											
										
										
											2000-07-02 09:00:07 -03:00
+								    charset = [] # not used
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								    charsetappend = charset.append
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
+								    if not (flags & SRE_FLAG_IGNORECASE):
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								        # look for literal prefix
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								        for op, av in pattern.data:
 								            if op is LITERAL:
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								                if len(prefix) == prefix_skip:
 								                    prefix_skip = prefix_skip + 1
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                prefixappend(av)
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								            elif op is SUBPATTERN and len(av[1]) == 1:
 								                op, av = av[1][0]
 								                if op is LITERAL:
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                    prefixappend(av)
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								                else:
 								                    break
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								            else:
 								                break
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								        # if no prefix, look for charset prefix
 								        if not prefix and pattern.data:
 								            op, av = pattern.data[0]
 								            if op is SUBPATTERN and av[1]:
 								                op, av = av[1][0]
 								                if op is LITERAL:
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                    charsetappend((op, av))
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								                elif op is BRANCH:
 								                    c = []
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                    cappend = c.append
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								                    for p in av[1]:
 								                        if not p:
 								                            break
 								                        op, av = p[0]
 								                        if op is LITERAL:
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                            cappend((op, av))
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								                        else:
 								                            break
 								                    else:
 								                        charset = c
 								            elif op is BRANCH:
 								                c = []
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                cappend = c.append
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								                for p in av[1]:
 								                    if not p:
 								                        break
 								                    op, av = p[0]
 								                    if op is LITERAL:
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                        cappend((op, av))
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								                    else:
 								                        break
 								                else:
 								                    charset = c
 								            elif op is IN:
 								                charset = av
 								##     if prefix:
 								##         print "*** PREFIX", prefix, prefix_skip
 								##     if charset:
 								##         print "*** CHARSET", charset
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
+								    # add an info block
 								    emit = code.append
 								    emit(OPCODES[INFO])
 								    skip = len(code); emit(0)
 								    # literal flag
 								    mask = 0
-												-- use charset bitmaps where appropriate.  this gives a 5-10%
   speedup for some tests, including the python tokenizer.

-- added support for an optional charset anchor to the engine
   (currently unused by the code generator).

-- removed workaround for array module bug.

											
										
										
											2000-07-02 09:00:07 -03:00
+								    if prefix:
 								        mask = SRE_INFO_PREFIX
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								        if len(prefix) == prefix_skip == len(pattern.data):
-												-- use charset bitmaps where appropriate.  this gives a 5-10%
   speedup for some tests, including the python tokenizer.

-- added support for an optional charset anchor to the engine
   (currently unused by the code generator).

-- removed workaround for array module bug.

											
										
										
											2000-07-02 09:00:07 -03:00
+								            mask = mask + SRE_INFO_LITERAL
 								    elif charset:
 								        mask = mask + SRE_INFO_CHARSET
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
+								    emit(mask)
 								    # pattern length
-												-- use charset bitmaps where appropriate.  this gives a 5-10%
   speedup for some tests, including the python tokenizer.

-- added support for an optional charset anchor to the engine
   (currently unused by the code generator).

-- removed workaround for array module bug.

											
										
										
											2000-07-02 09:00:07 -03:00
+								    if lo < MAXCODE:
 								        emit(lo)
 								    else:
 								        emit(MAXCODE)
 								        prefix = prefix[:MAXCODE]
 								    if hi < MAXCODE:
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								        emit(hi)
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
+								    else:
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								        emit(0)
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
+								    # add literal prefix
 								    if prefix:
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								        emit(len(prefix)) # length
 								        emit(prefix_skip) # skip
 								        code.extend(prefix)
 								        # generate overlap table
-												Issue #19387: explain and test the sre overlap table

											
										
										
											2013-10-25 16:36:10 -03:00
+								        code.extend(_generate_overlap_table(prefix))
-												-- use charset bitmaps where appropriate.  this gives a 5-10%
   speedup for some tests, including the python tokenizer.

-- added support for an optional charset anchor to the engine
   (currently unused by the code generator).

-- removed workaround for array module bug.

											
										
										
											2000-07-02 09:00:07 -03:00
+								    elif charset:
-												Fix from SF patch #633359 by Greg Chapman for SF bug #610299:
    The problem is in sre_compile.py: the call to
    _compile_charset near the end of _compile_info forgets to
    pass in the flags, so that the info charset is not compiled
    with re.U. (The info charset is used when searching to find
    the first character at which a match could start; it is not
    generated for patterns beginning with a repeat like '\w{1}'.)

											
										
										
											2003-02-23 21:18:35 -04:00
+								        _compile_charset(charset, flags, code)
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
+								    code[skip] = len(code) - skip
-												Addendum to #764548: restore 2.1 compatibility.

											
										
										
											2003-07-02 18:37:16 -03:00
+								def isstring(obj):
-												Fix 're' to work on bytes. It could do with a few more tests, though.

											
										
										
											2008-03-18 17:19:54 -03:00
+								    return isinstance(obj, (str, bytes))
-												Addendum to #764548: restore 2.1 compatibility.

											
										
										
											2003-07-02 18:37:16 -03:00
-												-- fixed width calculations for alternations
-- fixed literal check in branch operator
   (this broke test_tokenize, as reported by Mark Favas)
-- added REPEAT_ONE operator (still not enabled, though)
-- added some debugging stuff (maxlevel)

											
										
										
											2000-08-01 18:05:41 -03:00
+								def _code(p, flags):
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
-												Fredrik Lundh: here's the 96.6% version of SRE

											
										
										
											2000-06-01 14:39:12 -03:00
+								    flags = p.pattern.flags | flags
-												- fixed split
  (test_sre still complains about split, but that's caused by
  the group reset bug, not split itself)

- added more mark slots
  (should be dynamically allocated, but 100 is better than 32.
  and checking for the upper limit is better than overwriting
  the memory ;-)

- internal: renamed the cursor helper class

- internal: removed some bloat from sre_compile

											
										
										
											2000-06-29 13:57:40 -03:00
+								    code = []
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
 								    # compile info block
 								    _compile_info(code, p, flags)
 								    # compile the pattern
-												Fredrik Lundh: here's the 96.6% version of SRE

											
										
										
											2000-06-01 14:39:12 -03:00
+								    _compile(code, p.data, flags)
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
-												Fredrik Lundh: here's the 96.6% version of SRE

											
										
										
											2000-06-01 14:39:12 -03:00
+								    code.append(OPCODES[SUCCESS])
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
-												SRE 0.9.8: passes the entire test suite

-- reverted REPEAT operator to use "repeat context" strategy
   (from 0.8.X), but done right this time.
-- got rid of backtracking stack; use nested SRE_MATCH calls
   instead (should probably put it back again in 0.9.9 ;-)
-- properly reset state in scanner mode
-- don't use aggressive inlining by default

											
										
										
											2000-08-01 15:20:07 -03:00
+								    return code
 								def compile(p, flags=0):
 								    # internal: convert pattern list to internal format
-												Addendum to #764548: restore 2.1 compatibility.

											
										
										
											2003-07-02 18:37:16 -03:00
+								    if isstring(p):
-												SRE 0.9.8: passes the entire test suite

-- reverted REPEAT operator to use "repeat context" strategy
   (from 0.8.X), but done right this time.
-- got rid of backtracking stack; use nested SRE_MATCH calls
   instead (should probably put it back again in 0.9.9 ;-)
-- properly reset state in scanner mode
-- don't use aggressive inlining by default

											
										
										
											2000-08-01 15:20:07 -03:00
+								        pattern = p
 								        p = sre_parse.parse(p, flags)
 								    else:
 								        pattern = None
-												-- fixed width calculations for alternations
-- fixed literal check in branch operator
   (this broke test_tokenize, as reported by Mark Favas)
-- added REPEAT_ONE operator (still not enabled, though)
-- added some debugging stuff (maxlevel)

											
										
										
											2000-08-01 18:05:41 -03:00
+								    code = _code(p, flags)
-												SRE 0.9.8: passes the entire test suite

-- reverted REPEAT operator to use "repeat context" strategy
   (from 0.8.X), but done right this time.
-- got rid of backtracking stack; use nested SRE_MATCH calls
   instead (should probably put it back again in 0.9.9 ;-)
-- properly reset state in scanner mode
-- don't use aggressive inlining by default

											
										
										
											2000-08-01 15:20:07 -03:00
-												-- SRE 0.9.6 sync.  this includes:

 + added "regs" attribute
 + fixed "pos" and "endpos" attributes
 + reset "lastindex" and "lastgroup" in scanner methods
 + removed (?P#id) syntax; the "lastindex" and "lastgroup"
   attributes are now always set
 + removed string module dependencies in sre_parse
 + better debugging support in sre_parse
 + various tweaks to build under 1.5.2

											
										
										
											2000-07-23 18:46:17 -03:00
+								    # print code
-												SRE fixes for 2.1 alpha:

-- added some more docstrings
-- fixed typo in scanner class (#125531)
-- the multiline flag (?m) shouldn't affect the \Z operator (#127259)
-- fixed non-greedy backtracking bug (#123769, #127259)
-- added sre.DEBUG flag (currently dumps the parsed pattern structure)
-- fixed a couple of glitches in groupdict (the #126587 memory leak
   had already been fixed by AMK)

											
										
										
											2001-01-14 11:06:11 -04:00
+								    # XXX: <fl> get rid of this limitation!
-												make sure to check for this limit even if we're running with -O

											
										
										
											2004-10-15 03:15:08 -03:00
+								    if p.pattern.groups > 100:
 								        raise AssertionError(
 								            "sorry, but this version only supports 100 named groups"
 								            )
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
-												- experimental: added two new attributes to the match object:
  "lastgroup" is the name of the last matched capturing group,
  "lastindex" is the index of the same group.  if no group was
  matched, both attributes are set to None.

  the (?P#) feature will be removed in the next release.

											
										
										
											2000-07-02 19:25:39 -03:00
+								    # map in either direction
 								    groupindex = p.pattern.groupdict
 								    indexgroup = [None] * p.pattern.groups
 								    for k, i in groupindex.items():
 								        indexgroup[i] = k
-												Fredrik Lundh: here's the 96.6% version of SRE

											
										
										
											2000-06-01 14:39:12 -03:00
+								    return _sre.compile(
-												Merged revisions 59666-59679 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk

........
  r59666 | christian.heimes | 2008-01-02 19:28:32 +0100 (Wed, 02 Jan 2008) | 1 line

  Made vs9to8 Unix compatible
........
  r59669 | guido.van.rossum | 2008-01-02 20:00:46 +0100 (Wed, 02 Jan 2008) | 2 lines

  Patch #1696.  Don't attempt to close None in dry-run mode.
........
  r59671 | jeffrey.yasskin | 2008-01-03 03:21:52 +0100 (Thu, 03 Jan 2008) | 6 lines

  Backport PEP 3141 from the py3k branch to the trunk. This includes r50877 (just
  the complex_pow part), r56649, r56652, r56715, r57296, r57302, r57359, r57361,
  r57372, r57738, r57739, r58017, r58039, r58040, and r59390, and new
  documentation. The only significant difference is that round(x) returns a float
  to preserve backward-compatibility. See http://bugs.python.org/issue1689.
........
  r59672 | christian.heimes | 2008-01-03 16:41:30 +0100 (Thu, 03 Jan 2008) | 1 line

  Issue #1726: Remove Python/atof.c from PCBuild/pythoncore.vcproj
........
  r59675 | guido.van.rossum | 2008-01-03 20:12:44 +0100 (Thu, 03 Jan 2008) | 4 lines

  Issue #1700, reported by Nguyen Quan Son, fix by Fredrik Lundh:
  Regular Expression inline flags not handled correctly for some unicode
  characters.  (Forward port from 2.5.2.)
........
  r59676 | christian.heimes | 2008-01-03 21:23:15 +0100 (Thu, 03 Jan 2008) | 1 line

  Added math.isinf() and math.isnan()
........
  r59677 | christian.heimes | 2008-01-03 22:14:48 +0100 (Thu, 03 Jan 2008) | 1 line

  Some build bots don't compile mathmodule. There is an issue with the long definition of pi and euler
........
  r59678 | christian.heimes | 2008-01-03 23:16:32 +0100 (Thu, 03 Jan 2008) | 2 lines

  Modified PyImport_Import and PyImport_ImportModule to always use absolute imports by calling __import__ with an explicit level of 0
  Added a new API function PyImport_ImportModuleNoBlock. It solves the problem with dead locks when mixing threads and imports
........
  r59679 | christian.heimes | 2008-01-03 23:32:26 +0100 (Thu, 03 Jan 2008) | 1 line

  Added copysign(x, y) function to the math module
........

											
										
										
											2008-01-03 19:01:04 -04:00
+								        pattern, flags | p.pattern.flags, code,
-												- added lookbehind support (?<=pattern), (?<!pattern).
  the pattern must have a fixed width.

- got rid of array-module dependencies; the match pro-
  gram is now stored inside the pattern object, rather
  than in an extra string buffer.

- cleaned up a variety of potential leaks, API abuses,
  and other minor issues in the engine module.

- use mal's new isalnum macro, rather than my own work-
  around.

- untabified test_sre.py.  seems like I removed a couple
  of trailing spaces in the process...

											
										
										
											2000-07-03 15:44:21 -03:00
+								        p.pattern.groups-1,
 								        groupindex, indexgroup
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								        )