cpython/Lib/sre_compile.py

#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

import _sre
import sre_parse
from sre_constants import *

# Refuse to load when the _sre extension reports a different MAGIC value
# than the MAGIC name in scope here (presumably supplied by the
# sre_constants star import above).
assert _sre.MAGIC == MAGIC, "SRE module mismatch"

# Opcode groups.  _compile() dispatches on membership in the first four
# sets; _simple() uses _UNIT_CODES to recognise single-unit repeat bodies.
_LITERAL_CODES = {LITERAL, NOT_LITERAL}
_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
_UNIT_CODES = _LITERAL_CODES | {ANY, IN}

# Sets of lowercase characters which have the same uppercase.
# Each inner tuple is one equivalence class of code points; the table is
# only consumed below, to build _ignorecase_fixes.
_equivalences = (
    # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
    (0x69, 0x131), # iı
    # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
    (0x73, 0x17f), # sſ
    # MICRO SIGN, GREEK SMALL LETTER MU
    (0xb5, 0x3bc), # µμ
    # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
    (0x345, 0x3b9, 0x1fbe), # \u0345ιι
    # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
    (0x390, 0x1fd3), # ΐΐ
    # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
    (0x3b0, 0x1fe3), # ΰΰ
    # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
    (0x3b2, 0x3d0), # βϐ
    # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
    (0x3b5, 0x3f5), # εϵ
    # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
    (0x3b8, 0x3d1), # θϑ
    # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
    (0x3ba, 0x3f0), # κϰ
    # GREEK SMALL LETTER PI, GREEK PI SYMBOL
    (0x3c0, 0x3d6), # πϖ
    # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
    (0x3c1, 0x3f1), # ρϱ
    # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
    (0x3c2, 0x3c3), # ςσ
    # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
    (0x3c6, 0x3d5), # φϕ
    # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
    (0x1e61, 0x1e9b), # ṡẛ
    # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
    (0xfb05, 0xfb06), # ﬅﬆ
)

# Maps the lowercase code to lowercase codes which have the same uppercase.
# Every member of an equivalence class becomes a key whose value is the
# tuple of the *other* members of that class, e.g. 0x69 -> (0x131,) and
# 0x345 -> (0x3b9, 0x1fbe).
_ignorecase_fixes = {i: tuple(j for j in t if i != j)
                     for t in _equivalences for i in t}

def _combine_flags(flags, add_flags, del_flags,
                   TYPE_FLAGS=sre_parse.TYPE_FLAGS):
    # Apply a group's flag changes to the enclosing flags.  When the
    # additions contain any bit of TYPE_FLAGS, every TYPE_FLAGS bit already
    # present is dropped first; then the additions are set and the
    # removals cleared.
    base = flags & ~TYPE_FLAGS if add_flags & TYPE_FLAGS else flags
    return (base | add_flags) & ~del_flags

def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern
    #
    # Appends the opcodes for every (op, av) item of `pattern` to the list
    # `code`, in place; nothing is returned.  `flags` is the SRE_FLAG_* bit
    # mask in effect for this (sub)pattern.  Nested patterns are compiled
    # by recursive calls.
    #
    # Raises `error` for an operator this function does not handle, for a
    # repeat operator when SRE_FLAG_TEMPLATE is set, and for a look-behind
    # whose body is not fixed-width.
    #
    # Several operators are followed by a "skip" word: a placeholder 0 is
    # emitted first, its index remembered, and the slot is overwritten with
    # a relative distance once the operator's body has been emitted.
    emit = code.append
    _len = len
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    # The case helpers stay None unless IGNORECASE is set without LOCALE.
    # `fixes` is only assigned in the Unicode (non-ASCII) case, which is why
    # "not fixes" is used further down as the test for ASCII matching.
    iscased = None
    tolower = None
    fixes = None
    if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
        if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
            iscased = _sre.unicode_iscased
            tolower = _sre.unicode_tolower
            fixes = _ignorecase_fixes
        else:
            iscased = _sre.ascii_iscased
            tolower = _sre.ascii_tolower
    for op, av in pattern:
        if op in LITERAL_CODES:
            if not flags & SRE_FLAG_IGNORECASE:
                # case-sensitive: emit the literal unchanged
                emit(op)
                emit(av)
            elif flags & SRE_FLAG_LOCALE:
                # locale-dependent case folding is left to the matcher
                emit(OP_LOCALE_IGNORE[op])
                emit(av)
            elif not iscased(av):
                # uncased character: the plain opcode is sufficient
                emit(op)
                emit(av)
            else:
                lo = tolower(av)
                if not fixes:  # ascii
                    emit(OP_IGNORE[op])
                    emit(lo)
                elif lo not in fixes:
                    emit(OP_UNICODE_IGNORE[op])
                    emit(lo)
                else:
                    # lo has extra equivalents in _ignorecase_fixes: emit a
                    # small set containing lo and all of them (negated for
                    # NOT_LITERAL) instead of a single literal.
                    emit(IN_UNI_IGNORE)
                    skip = _len(code); emit(0)
                    if op is NOT_LITERAL:
                        emit(NEGATE)
                    for k in (lo,) + fixes[lo]:
                        emit(LITERAL)
                        emit(k)
                    emit(FAILURE)
                    code[skip] = _len(code) - skip
        elif op is IN:
            # character set: optimize it first, then pick the IN variant
            # that matches the case-handling mode
            charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
            if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
                emit(IN_LOC_IGNORE)
            elif not hascased:
                emit(IN)
            elif not fixes:  # ascii
                emit(IN_IGNORE)
            else:
                emit(IN_UNI_IGNORE)
            skip = _len(code); emit(0)
            _compile_charset(charset, flags, code)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(ANY_ALL)
            else:
                emit(ANY)
        elif op in REPEATING_CODES:
            # av is indexed as (av[0], av[1], av[2]); av[2] is the repeated
            # subpattern, av[0]/av[1] are presumably the min and max counts.
            if flags & SRE_FLAG_TEMPLATE:
                raise error("internal: unsupported template operator %r" % (op,))
            if _simple(av[2]):
                # single-unit body: use the *_ONE opcodes, body ends in SUCCESS
                if op is MAX_REPEAT:
                    emit(REPEAT_ONE)
                else:
                    emit(MIN_REPEAT_ONE)
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(SUCCESS)
                code[skip] = _len(code) - skip
            else:
                # general body: REPEAT ... followed by MAX_UNTIL / MIN_UNTIL;
                # the skip is patched before the UNTIL opcode is emitted
                emit(REPEAT)
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(MAX_UNTIL)
                else:
                    emit(MIN_UNTIL)
        elif op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            # a truthy group number gets a pair of MARK opcodes around its
            # body, numbered (group-1)*2 and (group-1)*2+1
            if group:
                emit(MARK)
                emit((group-1)*2)
            # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags))
            _compile(code, p, _combine_flags(flags, add_flags, del_flags))
            if group:
                emit(MARK)
                emit((group-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(op)
        elif op in ASSERT_CODES:
            emit(op)
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error("look-behind requires fixed-width pattern")
                # NOTE(review): lo is emitted without a bound check against
                # MAXCODE -- confirm oversized widths are rejected elsewhere.
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(op)
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(op)
            # remap the position code for multiline / locale / unicode
            # modes; codes missing from a table are left unchanged
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                av = AT_UNICODE.get(av, av)
            emit(av)
        elif op is BRANCH:
            emit(op)
            # `tail` collects the index of each alternative's JUMP operand
            tail = []
            tailappend = tail.append
            # note: the loop variable deliberately reuses the name `av`
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(JUMP)
                tailappend(_len(code)); emit(0)
                # skip from this alternative's header to the next header
                code[skip] = _len(code) - skip
            emit(FAILURE) # end of branch
            # patch every JUMP to land just past the closing FAILURE
            # (the loop variable reuses the name `tail`)
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(op)
            # translate the category code for locale / unicode modes
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                av = CH_UNICODE[av]
            emit(av)
        elif op is GROUPREF:
            if not flags & SRE_FLAG_IGNORECASE:
                emit(op)
            elif flags & SRE_FLAG_LOCALE:
                emit(GROUPREF_LOC_IGNORE)
            elif not fixes:  # ascii
                emit(GROUPREF_IGNORE)
            else:
                emit(GROUPREF_UNI_IGNORE)
            # the emitted group index is the group number minus one
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            # av is (group number, yes-pattern, optional no-pattern)
            emit(op)
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                # yes-branch ends with a JUMP over the no-branch
                emit(JUMP)
                skipno = _len(code); emit(0)
                # the extra +1 presumably makes the skip relative to the
                # preceding group word -- confirm against the matcher
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise error("internal: unsupported operand type %r" % (op,))

def _compile_charset(charset, flags, code):
    # Emit the member tests of a character set into `code`, one entry per
    # (op, av) item, and close the subprogram with FAILURE.  Raises `error`
    # for a set operator that is not handled here.
    for op, av in charset:
        # the opcode itself is always written first
        code.append(op)
        if op is NEGATE:
            # no operand
            continue
        if op is LITERAL:
            code.append(av)
        elif op is RANGE or op is RANGE_UNI_IGNORE:
            # two operands: both ends of the range
            code.append(av[0])
            code.append(av[1])
        elif op is CHARSET or op is BIGCHARSET:
            # av is already a sequence of code words
            code.extend(av)
        elif op is CATEGORY:
            # translate the category code for locale / unicode modes
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
                av = CH_UNICODE[av]
            code.append(av)
        else:
            raise error("internal: unsupported set operator %r" % (op,))
    code.append(FAILURE)

def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
    # internal: optimize character set
    #
    # `charset` is a list of (op, av) set items.  Returns a 2-tuple
    # (items, hascased): `items` is either a rewritten list or the original
    # `charset` object, and `hascased` is True when case-insensitive
    # handling found a cased character.
    #
    # Optional arguments (all used together by _compile for IGNORECASE):
    #   iscased -- predicate telling whether a code point is cased
    #   fixup   -- lowercasing function applied to every literal / range item
    #   fixes   -- mapping from a lowercase code point to its extra
    #              case-equivalent code points
    out = []    # NEGATE items, followed later by the compressed map
    tail = []   # items that are not folded into the map
    # one flag byte per code point; starts at 256 entries and is grown to
    # 65536 on demand (see the IndexError handler below)
    charmap = bytearray(256)
    hascased = False
    for op, av in charset:
        # The inner loop exists only so that the current item can be
        # retried (via `continue`) after the map has been enlarged.
        while True:
            try:
                if op is LITERAL:
                    if fixup:
                        lo = fixup(av)
                        charmap[lo] = 1
                        if fixes and lo in fixes:
                            for k in fixes[lo]:
                                charmap[k] = 1
                        if not hascased and iscased(av):
                            hascased = True
                    else:
                        charmap[av] = 1
                elif op is RANGE:
                    r = range(av[0], av[1]+1)
                    if fixup:
                        if fixes:
                            for i in map(fixup, r):
                                charmap[i] = 1
                                if i in fixes:
                                    for k in fixes[i]:
                                        charmap[k] = 1
                        else:
                            for i in map(fixup, r):
                                charmap[i] = 1
                        if not hascased:
                            hascased = any(map(iscased, r))
                    else:
                        for i in r:
                            charmap[i] = 1
                elif op is NEGATE:
                    out.append((op, av))
                else:
                    tail.append((op, av))
            except IndexError:
                # a code point did not fit into the current map
                if len(charmap) == 256:
                    # character set contains non-UCS1 character codes
                    charmap += b'\0' * 0xff00
                    continue
                # Character set contains non-BMP character codes.
                if fixup:
                    hascased = True
                    # There are only two ranges of cased non-BMP characters:
                    # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
                    # and for both ranges RANGE_UNI_IGNORE works.
                    if op is RANGE:
                        op = RANGE_UNI_IGNORE
                # keep the item as-is (outside the map)
                tail.append((op, av))
            break

    # compress character map
    # Collect the [p, q) runs of set flags; as soon as a third run is found
    # `runs` becomes None, meaning a bitmap representation is used instead.
    runs = []
    q = 0
    while True:
        p = charmap.find(1, q)
        if p < 0:
            break
        if len(runs) >= 2:
            runs = None
            break
        q = charmap.find(0, p)
        if q < 0:
            # the run extends to the end of the map
            runs.append((p, len(charmap)))
            break
        runs.append((p, q))
    if runs is not None:
        # use literal/range
        for p, q in runs:
            if q - p == 1:
                out.append((LITERAL, p))
            else:
                out.append((RANGE, (p, q - 1)))
        out += tail
        # if the case was changed or new representation is more compact
        if hascased or len(out) < len(charset):
            return out, hascased
        # else original character set is good enough
        return charset, hascased

    # use bitmap
    if len(charmap) == 256:
        # the map was never enlarged: a single 256-bit CHARSET is enough
        data = _mk_bitmap(charmap)
        out.append((CHARSET, data))
        out += tail
        return out, hascased

    # To represent a big charset, first a bitmap of all characters in the
    # set is constructed. Then, this bitmap is sliced into chunks of 256
    # characters, duplicate chunks are eliminated, and each chunk is
    # given a number. In the compiled expression, the charset is
    # represented by a 32-bit word sequence, consisting of one word for
    # the number of different chunks, a sequence of 256 bytes (64 words)
    # of chunk numbers indexed by their original chunk position, and a
    # sequence of 256-bit chunks (8 words each).

    # Compression is normally good: in a typical charset, large ranges of
    # Unicode will be either completely excluded (e.g. if only cyrillic
    # letters are to be matched), or completely included (e.g. if large
    # subranges of Kanji match). These ranges will be represented by
    # chunks of all one-bits or all zero-bits.

    # Matching can be also done efficiently: the more significant byte of
    # the Unicode character is an index into the chunk number, and the
    # less significant byte is a bit index in the chunk (just like the
    # CHARSET matching).

    charmap = bytes(charmap) # should be hashable
    comps = {}                  # chunk contents -> assigned block number
    mapping = bytearray(256)    # chunk position -> block number
    block = 0                   # number of distinct chunks seen so far
    data = bytearray()          # concatenation of the distinct chunks
    for i in range(0, 65536, 256):
        chunk = charmap[i: i + 256]
        if chunk in comps:
            mapping[i // 256] = comps[chunk]
        else:
            mapping[i // 256] = comps[chunk] = block
            block += 1
            data += chunk
    data = _mk_bitmap(data)
    # prepend the chunk count and the packed chunk-number table
    data[0:0] = [block] + _bytes_to_codes(mapping)
    out.append((BIGCHARSET, data))
    out += tail
    return out, hascased

_CODEBITS = _sre.CODESIZE * 8
MAXCODE = (1 << _CODEBITS) - 1
_BITS_TRANS = b'0' + b'1' * 255
def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
    # Pack a byte-per-flag map into a list of code words.  Byte k of `bits`
    # (zero or non-zero) becomes bit (k % _CODEBITS) of word (k // _CODEBITS).
    #
    # Every byte is first turned into the ASCII digit '0' or '1'; reversing
    # the digit string puts the lowest position last, so each slice read
    # from the end parses directly as a base-2 number.
    digits = bits.translate(_BITS_TRANS)[::-1]
    words = []
    stop = len(digits)
    while stop > 0:
        words.append(_int(digits[stop - _CODEBITS: stop], 2))
        stop -= _CODEBITS
    return words

def _bytes_to_codes(b):
    # Reinterpret a byte buffer as a list of native unsigned ints, one per
    # code word.  The assertions check that the native 'I' item size equals
    # the engine's code size and that no bytes are left over.
    view = memoryview(b)
    words = view.cast('I')
    assert words.itemsize == _sre.CODESIZE
    assert len(words) * words.itemsize == len(b)
    return words.tolist()

def _simple(p):
    # Tell whether this subpattern is a "simple" operator: exactly one item
    # that is a unit code, possibly wrapped in groups whose first field is
    # None (each wrapper must itself contain exactly one item).
    while len(p) == 1:
        op, av = p[0]
        if op is not SUBPATTERN:
            return op in _UNIT_CODES
        if av[0] is not None:
            return False
        # descend into the wrapped subpattern
        p = av[-1]
    return False

def _generate_overlap_table(prefix):
    """
    Build the self-overlap table of a prefix.

    The result has one entry per element of the prefix.  Entry i is the
    length k of the longest proper overlap ending at i, i.e. the largest
    k <= i such that prefix[i-k+1:i+1] == prefix[0:k]; it is 0 when no
    part of the prefix ending at i matches its beginning.
    """
    size = len(prefix)
    overlap = [0] * size
    for pos in range(1, size):
        current = prefix[pos]
        # start from the overlap of the previous position and fall back to
        # shorter candidates until one can be extended or none is left
        candidate = overlap[pos - 1]
        while candidate and current != prefix[candidate]:
            candidate = overlap[candidate - 1]
        if current == prefix[candidate]:
            overlap[pos] = candidate + 1
    return overlap

def _get_iscased(flags):
    # Pick the "is this character cased?" predicate for the given flags:
    # None without IGNORECASE, the Unicode predicate for UNICODE without
    # ASCII, and the ASCII predicate in every other case.
    if flags & SRE_FLAG_IGNORECASE:
        if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
            return _sre.unicode_iscased
        return _sre.ascii_iscased
    return None

def _get_literal_prefix(pattern, flags):
    # Collect the run of literal characters the pattern must start with.
    #
    # Returns (prefix, prefix_skip, got_all):
    #   prefix      -- list of the leading literal code points
    #   prefix_skip -- None, or an offset into `prefix` recorded at the
    #                  first numbered group (or inherited from a nested one)
    #   got_all     -- True only when every item of the pattern was consumed
    literals = []
    skip_at = None
    iscased = _get_iscased(flags)
    for op, av in pattern.data:
        if op is LITERAL:
            # a cased literal ends the prefix when matching ignores case
            if iscased and iscased(av):
                return literals, skip_at, False
            literals.append(av)
            continue
        if op is not SUBPATTERN:
            return literals, skip_at, False

        group, add_flags, del_flags, sub = av
        sub_flags = _combine_flags(flags, add_flags, del_flags)
        if sub_flags & SRE_FLAG_IGNORECASE and sub_flags & SRE_FLAG_LOCALE:
            return literals, skip_at, False
        sub_literals, sub_skip, sub_all = _get_literal_prefix(sub, sub_flags)
        # only the first opportunity to set the skip offset is used
        if skip_at is None:
            if group is not None:
                skip_at = len(literals)
            elif sub_skip is not None:
                skip_at = len(literals) + sub_skip
        literals.extend(sub_literals)
        if not sub_all:
            return literals, skip_at, False
    return literals, skip_at, True

def _get_charset_prefix(pattern, flags):
    # Describe the first character of the pattern as a list of charset
    # items, or return None when that is not possible.  Leading groups are
    # unwrapped first (updating the flags as they go); the first real item
    # must then be a literal, a branch of alternatives that each start with
    # a literal, or a character set.  Cased characters are rejected when
    # matching ignores case.
    node = pattern
    while True:
        if not node.data:
            return None
        op, av = node.data[0]
        if op is not SUBPATTERN:
            break
        _group, add_flags, del_flags, node = av
        flags = _combine_flags(flags, add_flags, del_flags)
        if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
            return None

    iscased = _get_iscased(flags)

    if op is LITERAL:
        if iscased and iscased(av):
            return None
        return [(op, av)]

    if op is BRANCH:
        members = []
        for alternative in av[1]:
            if not alternative:
                return None
            first_op, first_av = alternative[0]
            if first_op is not LITERAL or (iscased and iscased(first_av)):
                return None
            members.append((first_op, first_av))
        return members

    if op is IN:
        if iscased:
            for item_op, item_av in av:
                if item_op is LITERAL:
                    if iscased(item_av):
                        return None
                elif item_op is RANGE:
                    # ranges reaching past 0xffff are not scanned
                    if item_av[1] > 0xffff:
                        return None
                    if any(map(iscased, range(item_av[0], item_av[1]+1))):
                        return None
        return av

    return None

def _compile_info(code, pattern, flags):
    # internal: append an INFO block to `code`.  The block records the
    # minimum and maximum pattern width and, when one can be found, either
    # a literal prefix (with its overlap table) or a character set
    # describing the first character.
    lo, hi = pattern.getwidth()
    if hi > MAXCODE:
        hi = MAXCODE
    if lo == 0:
        # pattern can match the empty string: emit the fixed minimal block
        code.extend([INFO, 4, 0, lo, hi])
        return

    literal_prefix = []
    skip_to = 0
    leading_set = []
    # no prefix search when case is ignored using locale rules
    if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
        literal_prefix, skip_to, whole = _get_literal_prefix(pattern, flags)
        # the charset prefix is only a fallback for a missing literal prefix
        if not literal_prefix:
            leading_set = _get_charset_prefix(pattern, flags)

    # flag word describing which kind of prefix follows
    if literal_prefix:
        mask = SRE_INFO_PREFIX
        if skip_to is None and whole:
            mask |= SRE_INFO_LITERAL
    elif leading_set:
        mask = SRE_INFO_CHARSET
    else:
        mask = 0

    code.append(INFO)
    size_at = len(code)
    code.append(0)  # block size, patched on the last line
    code.append(mask)
    # minimum width, clamped to what a code word can hold
    if lo < MAXCODE:
        code.append(lo)
    else:
        code.append(MAXCODE)
        literal_prefix = literal_prefix[:MAXCODE]
    code.append(min(hi, MAXCODE))

    if literal_prefix:
        count = len(literal_prefix)
        code.append(count)  # length
        code.append(count if skip_to is None else skip_to)  # skip
        code.extend(literal_prefix)
        code.extend(_generate_overlap_table(literal_prefix))
    elif leading_set:
        leading_set, hascased = _optimize_charset(leading_set)
        assert not hascased
        _compile_charset(leading_set, flags, code)
    code[size_at] = len(code) - size_at

def isstring(obj):
    # True for text and bytes objects (including their subclasses).
    return isinstance(obj, str) or isinstance(obj, bytes)

def _code(p, flags):
    # Build the complete code list for a parsed pattern: the info block,
    # then the compiled pattern body, then the final SUCCESS opcode.  The
    # flags stored on the parser state are merged into `flags` first.
    merged_flags = p.state.flags | flags
    program = []
    _compile_info(program, p, merged_flags)
    _compile(program, p.data, merged_flags)
    program.append(SUCCESS)
    return program

def _hex_code(code):
    # Render code words as a bracketed list of zero-padded hex literals;
    # the field width is two hex digits per byte of a code word plus the
    # "0x" marker.
    width = _sre.CODESIZE*2+2
    rendered = ['%#0*x' % (width, word) for word in code]
    return '[%s]' % ', '.join(rendered)

def dis(code):
    # Print a human-readable disassembly of a compiled code list to
    # standard output (via print); nothing is returned.  Raises ValueError
    # for an opcode that has no branch below.
    import sys

    # offsets that an already printed instruction jumps to; such offsets
    # are marked with ':' instead of '.' when they are printed
    labels = set()
    # current nesting depth, controls the indentation of the output
    level = 0
    # width of the offset column: enough digits for the largest index
    offset_width = len(str(len(code) - 1))

    def dis_(start, end):
        # Disassemble code[start:end] one nesting level deeper.
        def print_(*args, to=None):
            # Print one instruction line prefixed with the offset `start`;
            # `to`, when given, is recorded as a label and appended.
            if to is not None:
                labels.add(to)
                args += ('(to %d)' % (to,),)
            print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
                  end='  '*(level-1))
            print(*args)

        def print_2(*args):
            # Print a continuation line: indentation only, no offset.
            print(end=' '*(offset_width + 2*level))
            print(*args)

        nonlocal level
        level += 1
        i = start
        while i < end:
            # `start` is rebound to the offset of the current instruction
            # so that print_ shows the right offset
            start = i
            op = code[i]
            i += 1
            op = OPCODES[op]
            if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
                      MAX_UNTIL, MIN_UNTIL, NEGATE):
                # no operands
                print_(op)
            elif op in (LITERAL, NOT_LITERAL,
                        LITERAL_IGNORE, NOT_LITERAL_IGNORE,
                        LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,
                        LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
                # one operand: a character code
                arg = code[i]
                i += 1
                print_(op, '%#02x (%r)' % (arg, chr(arg)))
            elif op is AT:
                arg = code[i]
                i += 1
                arg = str(ATCODES[arg])
                assert arg[:3] == 'AT_'
                print_(op, arg[3:])
            elif op is CATEGORY:
                arg = code[i]
                i += 1
                arg = str(CHCODES[arg])
                assert arg[:9] == 'CATEGORY_'
                print_(op, arg[9:])
            elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
                # skip word followed by the set items, shown nested
                skip = code[i]
                print_(op, skip, to=i+skip)
                dis_(i+1, i+skip)
                i += skip
            elif op in (RANGE, RANGE_UNI_IGNORE):
                lo, hi = code[i: i+2]
                i += 2
                print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
            elif op is CHARSET:
                # 256 bits of bitmap
                print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
                i += 256//_CODEBITS
            elif op is BIGCHARSET:
                # chunk count, then 256 bytes of chunk numbers packed into
                # code words, then `arg` chunks of 256 bits each
                arg = code[i]
                i += 1
                mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
                                        for x in code[i: i + 256//_sre.CODESIZE]))
                print_(op, arg, mapping)
                i += 256//_sre.CODESIZE
                level += 1
                for j in range(arg):
                    print_2(_hex_code(code[i: i + 256//_CODEBITS]))
                    i += 256//_CODEBITS
                level -= 1
            elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE,
                        GROUPREF_LOC_IGNORE):
                # one plain numeric operand
                arg = code[i]
                i += 1
                print_(op, arg)
            elif op is JUMP:
                skip = code[i]
                print_(op, skip, to=i+skip)
                i += 1
            elif op is BRANCH:
                # sequence of alternatives, each introduced by a skip word;
                # a zero skip word ends the sequence
                skip = code[i]
                print_(op, skip, to=i+skip)
                while skip:
                    dis_(i+1, i+skip)
                    i += skip
                    start = i
                    skip = code[i]
                    if skip:
                        print_('branch', skip, to=i+skip)
                    else:
                        print_(FAILURE)
                i += 1
            elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE):
                skip, min, max = code[i: i+3]
                if max == MAXREPEAT:
                    max = 'MAXREPEAT'
                print_(op, skip, min, max, to=i+skip)
                dis_(i+3, i+skip)
                i += skip
            elif op is GROUPREF_EXISTS:
                # only the header is consumed here; the branches that
                # follow are disassembled by the enclosing loop
                arg, skip = code[i: i+2]
                print_(op, arg, skip, to=i+skip)
                i += 2
            elif op in (ASSERT, ASSERT_NOT):
                skip, arg = code[i: i+2]
                print_(op, skip, arg, to=i+skip)
                dis_(i+2, i+skip)
                i += skip
            elif op is INFO:
                skip, flags, min, max = code[i: i+4]
                if max == MAXREPEAT:
                    max = 'MAXREPEAT'
                print_(op, skip, bin(flags), min, max, to=i+skip)
                # `start` now tracks the read position inside the block
                start = i+4
                if flags & SRE_INFO_PREFIX:
                    # prefix length, prefix skip, the prefix itself and
                    # an overlap table of the same length
                    prefix_len, prefix_skip = code[i+4: i+6]
                    print_2('  prefix_skip', prefix_skip)
                    start = i + 6
                    prefix = code[start: start+prefix_len]
                    print_2('  prefix',
                            '[%s]' % ', '.join('%#02x' % x for x in prefix),
                            '(%r)' % ''.join(map(chr, prefix)))
                    start += prefix_len
                    print_2('  overlap', code[start: start+prefix_len])
                    start += prefix_len
                if flags & SRE_INFO_CHARSET:
                    # the rest of the block is a charset subprogram
                    level += 1
                    print_2('in')
                    dis_(start, i+skip)
                    level -= 1
                i += skip
            else:
                raise ValueError(op)

        level -= 1

    dis_(0, len(code))


def compile(p, flags=0):
    # internal: turn a pattern (source string or parsed pattern list)
    # into a compiled pattern object

    # Only string input is kept as the pattern's source text; anything
    # else is taken to be parsed already and has no source to record.
    source = None
    if isstring(p):
        source, p = p, sre_parse.parse(p, flags)

    bytecode = _code(p, flags)

    if flags & SRE_FLAG_DEBUG:
        # blank separator line, then a disassembly of the generated code
        print()
        dis(bytecode)

    # groupdict maps name -> group number; build the reverse table
    # (group number -> name, None for unnamed groups) as well
    state = p.state
    name_to_index = state.groupdict
    index_to_name = [None] * state.groups
    for name, number in name_to_index.items():
        index_to_name[number] = name

    return _sre.compile(
        source,
        flags | state.flags,
        bytecode,
        state.groups - 1,
        name_to_index,
        tuple(index_to_name),
    )
-												Added Fredrik Lundh's sre module and its supporting cast.

NOTE: THIS IS VERY ROUGH ALPHA CODE!

											
										
										
											2000-03-31 10:58:54 -04:00
+								#
 								# Secret Labs' Regular Expression Engine
 								#
 								# convert template to internal format
 								#
-												SRE fixes for 2.1 alpha:

-- added some more docstrings
-- fixed typo in scanner class (#125531)
-- the multiline flag (?m) shouldn't affect the \Z operator (#127259)
-- fixed non-greedy backtracking bug (#123769, #127259)
-- added sre.DEBUG flag (currently dumps the parsed pattern structure)
-- fixed a couple of glitches in groupdict (the #126587 memory leak
   had already been fixed by AMK)

											
										
										
											2001-01-14 11:06:11 -04:00
+								# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
-												Added Fredrik Lundh's sre module and its supporting cast.

NOTE: THIS IS VERY ROUGH ALPHA CODE!

											
										
										
											2000-03-31 10:58:54 -04:00
+								#
-												SRE 0.9.8: passes the entire test suite

-- reverted REPEAT operator to use "repeat context" strategy
   (from 0.8.X), but done right this time.
-- got rid of backtracking stack; use nested SRE_MATCH calls
   instead (should probably put it back again in 0.9.9 ;-)
-- properly reset state in scanner mode
-- don't use aggressive inlining by default

											
										
										
											2000-08-01 15:20:07 -03:00
+								# See the sre.py file for information on usage and redistribution.
-												Added Fredrik Lundh's sre module and its supporting cast.

NOTE: THIS IS VERY ROUGH ALPHA CODE!

											
										
										
											2000-03-31 10:58:54 -04:00
+								#
-												Added docstrings by Neal Norwitz.  This closes SF bug #450980.

											
										
										
											2001-09-04 16:10:20 -03:00
+								"""Internal support module for sre"""
-												Issue #20976: pyflakes: Remove unused imports

											
										
										
											2014-03-20 05:16:38 -03:00
+								import _sre
-												Merged revisions 62194,62197-62198,62204-62205,62214,62219-62221,62227,62229-62231,62233-62235,62237-62239 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk

........
  r62194 | jeffrey.yasskin | 2008-04-07 01:04:28 +0200 (Mon, 07 Apr 2008) | 7 lines

  Add enough debugging information to diagnose failures where the
  HandlerBException is ignored, and fix one such problem, where it was thrown
  during the __del__ method of the previous Popen object.

  We may want to find a better way of printing verbose information so it's not
  spammy when the test passes.
........
  r62197 | mark.hammond | 2008-04-07 03:53:39 +0200 (Mon, 07 Apr 2008) | 2 lines

  Issue #2513: enable 64bit cross compilation on windows.
........
  r62198 | mark.hammond | 2008-04-07 03:59:40 +0200 (Mon, 07 Apr 2008) | 2 lines

  correct heading underline for new "Cross-compiling on Windows" section
........
  r62204 | gregory.p.smith | 2008-04-07 08:33:21 +0200 (Mon, 07 Apr 2008) | 4 lines

  Use the new PyFile_IncUseCount & PyFile_DecUseCount calls appropriately
  within the standard library.  These modules use PyFile_AsFile and later
  release the GIL while operating on the previously returned FILE*.
........
  r62205 | mark.summerfield | 2008-04-07 09:39:23 +0200 (Mon, 07 Apr 2008) | 4 lines

  changed "2500 components" to "several thousand" since the number keeps
  growing :-)
........
  r62214 | georg.brandl | 2008-04-07 20:51:59 +0200 (Mon, 07 Apr 2008) | 2 lines

  #2525: update timezone info examples in the docs.
........
  r62219 | andrew.kuchling | 2008-04-08 01:57:07 +0200 (Tue, 08 Apr 2008) | 1 line

  Write PEP 3127 section; add items
........
  r62220 | andrew.kuchling | 2008-04-08 01:57:21 +0200 (Tue, 08 Apr 2008) | 1 line

  Typo fix
........
  r62221 | andrew.kuchling | 2008-04-08 03:33:10 +0200 (Tue, 08 Apr 2008) | 1 line

  Typographical fix: 32bit -> 32-bit, 64bit -> 64-bit
........
  r62227 | andrew.kuchling | 2008-04-08 23:22:53 +0200 (Tue, 08 Apr 2008) | 1 line

  Add items
........
  r62229 | amaury.forgeotdarc | 2008-04-08 23:27:42 +0200 (Tue, 08 Apr 2008) | 7 lines

  Issue2564: Prevent a hang in "import test.autotest", which runs the entire test
  suite as a side-effect of importing the module.

  - in test_capi, a thread tried to import other modules
  - re.compile() imported sre_parse again on every call.
........
  r62230 | amaury.forgeotdarc | 2008-04-08 23:51:57 +0200 (Tue, 08 Apr 2008) | 2 lines

  Prevent an error when inspect.isabstract() is called with something other than a new-style class.
........
  r62231 | amaury.forgeotdarc | 2008-04-09 00:07:05 +0200 (Wed, 09 Apr 2008) | 8 lines

  Issue 2408: remove the _types module
  It was only used as a helper in types.py to access types (GetSetDescriptorType and MemberDescriptorType),
  when they can easily be obtained with python code.
  These expressions even work with Jython.

  I don't know what the future of the types module is; (cf. discussion in http://bugs.python.org/issue1605 )
  at least this change makes it simpler.
........
  r62233 | amaury.forgeotdarc | 2008-04-09 01:10:07 +0200 (Wed, 09 Apr 2008) | 2 lines

  Add a NEWS entry for previous checkin
........
  r62234 | trent.nelson | 2008-04-09 01:47:30 +0200 (Wed, 09 Apr 2008) | 37 lines

  - Issue #2550: The approach used by client/server code for obtaining ports
    to listen on in network-oriented tests has been refined in an effort to
    facilitate running multiple instances of the entire regression test suite
    in parallel without issue.  test_support.bind_port() has been fixed such
    that it will always return a unique port -- which wasn't always the case
    with the previous implementation, especially if socket options had been
    set that affected address reuse (i.e. SO_REUSEADDR, SO_REUSEPORT).  The
    new implementation of bind_port() will actually raise an exception if it
    is passed an AF_INET/SOCK_STREAM socket with either the SO_REUSEADDR or
    SO_REUSEPORT socket option set.  Furthermore, if available, bind_port()
    will set the SO_EXCLUSIVEADDRUSE option on the socket it's been passed.
    This currently only applies to Windows.  This option prevents any other
    sockets from binding to the host/port we've bound to, thus removing the
    possibility of the 'non-deterministic' behaviour, as Microsoft puts it,
    that occurs when a second SOCK_STREAM socket binds and accepts to a
    host/port that's already been bound by another socket.  The optional
    preferred port parameter to bind_port() has been removed.  Under no
    circumstances should tests be hard coding ports!

    test_support.find_unused_port() has also been introduced, which will pass
    a temporary socket object to bind_port() in order to obtain an unused port.
    The temporary socket object is then closed and deleted, and the port is
    returned.  This method should only be used for obtaining an unused port
    in order to pass to an external program (i.e. the -accept [port] argument
    to openssl's s_server mode) or as a parameter to a server-oriented class
    that doesn't give you direct access to the underlying socket used.

    Finally, test_support.HOST has been introduced, which should be used for
    the host argument of any relevant socket calls (i.e. bind and connect).

    The following tests were updated to follow the new conventions:
      test_socket, test_smtplib, test_asyncore, test_ssl, test_httplib,
      test_poplib, test_ftplib, test_telnetlib, test_socketserver,
      test_asynchat and test_socket_ssl.

    It is now possible for multiple instances of the regression test suite to
    run in parallel without issue.
........
  r62235 | gregory.p.smith | 2008-04-09 02:25:17 +0200 (Wed, 09 Apr 2008) | 3 lines

  Fix zlib crash from zlib.decompressobj().flush(val) when val was not positive.
  It tried to allocate negative or zero memory.  That fails.
........
  r62237 | trent.nelson | 2008-04-09 02:34:53 +0200 (Wed, 09 Apr 2008) | 1 line

  Fix typo with regards to self.PORT shadowing class variables with the same name.
........
  r62238 | andrew.kuchling | 2008-04-09 03:08:32 +0200 (Wed, 09 Apr 2008) | 1 line

  Add items
........
  r62239 | jerry.seutter | 2008-04-09 07:07:58 +0200 (Wed, 09 Apr 2008) | 1 line

  Changed test so it no longer runs as a side effect of importing.
........

											
										
										
											2008-04-09 05:37:03 -03:00
+								import sre_parse
-												Added Fredrik Lundh's sre module and its supporting cast.

NOTE: THIS IS VERY ROUGH ALPHA CODE!

											
										
										
											2000-03-31 10:58:54 -04:00
+								from sre_constants import *
-												added "magic" number to the _sre module, to avoid weird errors caused
by compiler/engine mismatches

											
										
										
											2001-01-15 08:46:09 -04:00
+								assert _sre.MAGIC == MAGIC, "SRE module mismatch"
-												Issue #22823: Use set literals instead of creating a set from a list

											
										
										
											2014-11-09 19:56:33 -04:00
+								_LITERAL_CODES = {LITERAL, NOT_LITERAL}
 								_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
 								_SUCCESS_CODES = {SUCCESS, FAILURE}
 								_ASSERT_CODES = {ASSERT, ASSERT_NOT}
-												bpo-30340: Enhanced regular expressions optimization. (#1542)

This increased the performance of matching some patterns up to 25 times.

											
										
										
											2017-05-14 02:32:33 -03:00
+								_UNIT_CODES = _LITERAL_CODES | {ANY, IN}
-												Complete the previous effort to factor out constant expressions
and improve the speed of the if/elif/else blocks.

											
										
										
											2005-02-28 15:27:52 -04:00
-												Issue #12728: Different Unicode characters having the same uppercase but
different lowercase are now matched in case-insensitive regular expressions.

											
										
										
											2014-11-10 06:37:16 -04:00
+								# Sets of lowercase characters which have the same uppercase.
 								_equivalences = (
 								    # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
 								    (0x69, 0x131), # iı
 								    # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
 								    (0x73, 0x17f), # sſ
 								    # MICRO SIGN, GREEK SMALL LETTER MU
 								    (0xb5, 0x3bc), # µμ
 								    # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
 								    (0x345, 0x3b9, 0x1fbe), # \u0345ιι
 								    # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
 								    (0x390, 0x1fd3), # ΐΐ
 								    # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
 								    (0x3b0, 0x1fe3), # ΰΰ
 								    # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
 								    (0x3b2, 0x3d0), # βϐ
 								    # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
 								    (0x3b5, 0x3f5), # εϵ
 								    # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
 								    (0x3b8, 0x3d1), # θϑ
 								    # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
 								    (0x3ba, 0x3f0), # κϰ
 								    # GREEK SMALL LETTER PI, GREEK PI SYMBOL
 								    (0x3c0, 0x3d6), # πϖ
 								    # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
 								    (0x3c1, 0x3f1), # ρϱ
 								    # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
 								    (0x3c2, 0x3c3), # ςσ
 								    # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
 								    (0x3c6, 0x3d5), # φϕ
 								    # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
 								    (0x1e61, 0x1e9b), # ṡẛ
 								    # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
 								    (0xfb05, 0xfb06), # ﬅﬆ
 								)
 								# Maps the lowercase code to lowercase codes which have the same uppercase.
 								_ignorecase_fixes = {i: tuple(j for j in t if i != j)
 								                     for t in _equivalences for i in t}
-												bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)


											
										
										
											2017-10-24 17:31:42 -03:00
+								def _combine_flags(flags, add_flags, del_flags,
 								                   TYPE_FLAGS=sre_parse.TYPE_FLAGS):
 								    if add_flags & TYPE_FLAGS:
 								        flags &= ~TYPE_FLAGS
 								    return (flags | add_flags) & ~del_flags
-												towards 1.6b1

											
										
										
											2000-06-29 05:58:44 -03:00
+								def _compile(code, pattern, flags):
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
+								    # internal: compile a (sub)pattern
-												- fixed split
  (test_sre still complains about split, but that's caused by
  the group reset bug, not split itself)

- added more mark slots
  (should be dynamically allocated, but 100 is better than 32.
  and checking for the upper limit is better than overwriting
  the memory ;-)

- internal: renamed the cursor helper class

- internal: removed some bloat from sre_compile

											
										
										
											2000-06-29 13:57:40 -03:00
+								    emit = code.append
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								    _len = len
-												Complete the previous effort to factor out constant expressions
and improve the speed of the if/elif/else blocks.

											
										
										
											2005-02-28 15:27:52 -04:00
+								    LITERAL_CODES = _LITERAL_CODES
 								    REPEATING_CODES = _REPEATING_CODES
 								    SUCCESS_CODES = _SUCCESS_CODES
 								    ASSERT_CODES = _ASSERT_CODES
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								    iscased = None
-												bpo-30277: Replace _sre.getlower() with _sre.ascii_tolower() and _sre.unicode_tolower(). (#1468)


											
										
										
											2017-05-05 04:42:46 -03:00
+								    tolower = None
 								    fixes = None
 								    if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
 								        if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								            iscased = _sre.unicode_iscased
-												bpo-30277: Replace _sre.getlower() with _sre.ascii_tolower() and _sre.unicode_tolower(). (#1468)


											
										
										
											2017-05-05 04:42:46 -03:00
+								            tolower = _sre.unicode_tolower
 								            fixes = _ignorecase_fixes
 								        else:
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								            iscased = _sre.ascii_iscased
-												bpo-30277: Replace _sre.getlower() with _sre.ascii_tolower() and _sre.unicode_tolower(). (#1468)


											
										
										
											2017-05-05 04:42:46 -03:00
+								            tolower = _sre.ascii_tolower
-												Added Fredrik Lundh's sre module and its supporting cast.

NOTE: THIS IS VERY ROUGH ALPHA CODE!

											
										
										
											2000-03-31 10:58:54 -04:00
+								    for op, av in pattern:
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								        if op in LITERAL_CODES:
-												bpo-30215: Make re.compile() locale agnostic. (#1361)

Compiled regular expression objects with the re.LOCALE flag no longer
depend on the locale at compile time.  Only the locale at matching
time affects the result of matching.

											
										
										
											2017-05-05 02:53:40 -03:00
+								            if not flags & SRE_FLAG_IGNORECASE:
 								                emit(op)
 								                emit(av)
 								            elif flags & SRE_FLAG_LOCALE:
-												bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)


											
										
										
											2017-10-24 17:31:42 -03:00
+								                emit(OP_LOCALE_IGNORE[op])
-												bpo-30215: Make re.compile() locale agnostic. (#1361)

Compiled regular expression objects with the re.LOCALE flag no longer
depend on the locale at compile time.  Only the locale at matching
time affects the result of matching.

											
										
										
											2017-05-05 02:53:40 -03:00
+								                emit(av)
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								            elif not iscased(av):
 								                emit(op)
 								                emit(av)
-												bpo-30215: Make re.compile() locale agnostic. (#1361)

Compiled regular expression objects with the re.LOCALE flag no longer
depend on the locale at compile time.  Only the locale at matching
time affects the result of matching.

											
										
										
											2017-05-05 02:53:40 -03:00
+								            else:
-												bpo-30277: Replace _sre.getlower() with _sre.ascii_tolower() and _sre.unicode_tolower(). (#1468)


											
										
										
											2017-05-05 04:42:46 -03:00
+								                lo = tolower(av)
-												bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)


											
										
										
											2017-10-24 17:31:42 -03:00
+								                if not fixes:  # ascii
 								                    emit(OP_IGNORE[op])
 								                    emit(lo)
 								                elif lo not in fixes:
 								                    emit(OP_UNICODE_IGNORE[op])
 								                    emit(lo)
 								                else:
 								                    emit(IN_UNI_IGNORE)
-												Issue #12728: Different Unicode characters having the same uppercase but
different lowercase are now matched in case-insensitive regular expressions.

											
										
										
											2014-11-10 06:37:16 -04:00
+								                    skip = _len(code); emit(0)
 								                    if op is NOT_LITERAL:
-												Issue #12728: Different Unicode characters having the same uppercase but
different lowercase are now matched in case-insensitive regular expressions.

											
										
										
											2014-11-10 06:43:14 -04:00
+								                        emit(NEGATE)
-												Issue #12728: Different Unicode characters having the same uppercase but
different lowercase are now matched in case-insensitive regular expressions.

											
										
										
											2014-11-10 06:37:16 -04:00
+								                    for k in (lo,) + fixes[lo]:
-												Issue #12728: Different Unicode characters having the same uppercase but
different lowercase are now matched in case-insensitive regular expressions.

											
										
										
											2014-11-10 06:43:14 -04:00
+								                        emit(LITERAL)
-												Issue #12728: Different Unicode characters having the same uppercase but
different lowercase are now matched in case-insensitive regular expressions.

											
										
										
											2014-11-10 06:37:16 -04:00
+								                        emit(k)
-												Issue #12728: Different Unicode characters having the same uppercase but
different lowercase are now matched in case-insensitive regular expressions.

											
										
										
											2014-11-10 06:43:14 -04:00
+								                    emit(FAILURE)
-												Issue #12728: Different Unicode characters having the same uppercase but
different lowercase are now matched in case-insensitive regular expressions.

											
										
										
											2014-11-10 06:37:16 -04:00
+								                    code[skip] = _len(code) - skip
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								        elif op is IN:
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								            charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
 								            if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
-												bpo-30215: Make re.compile() locale agnostic. (#1361)

Compiled regular expression objects with the re.LOCALE flag no longer
depend on the locale at compile time.  Only the locale at matching
time affects the result of matching.

											
										
										
											2017-05-05 02:53:40 -03:00
+								                emit(IN_LOC_IGNORE)
-												bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)


											
										
										
											2017-10-24 17:31:42 -03:00
+								            elif not hascased:
 								                emit(IN)
 								            elif not fixes:  # ascii
-												bpo-30215: Make re.compile() locale agnostic. (#1361)

Compiled regular expression objects with the re.LOCALE flag no longer
depend on the locale at compile time.  Only the locale at matching
time affects the result of matching.

											
										
										
											2017-05-05 02:53:40 -03:00
+								                emit(IN_IGNORE)
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								            else:
-												bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)


											
										
										
											2017-10-24 17:31:42 -03:00
+								                emit(IN_UNI_IGNORE)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								            skip = _len(code); emit(0)
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								            _compile_charset(charset, flags, code)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								            code[skip] = _len(code) - skip
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								        elif op is ANY:
 								            if flags & SRE_FLAG_DOTALL:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								                emit(ANY_ALL)
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								            else:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								                emit(ANY)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								        elif op in REPEATING_CODES:
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								            if flags & SRE_FLAG_TEMPLATE:
-												Issue #22364: Improved some re error messages using regex for hints.

											
										
										
											2015-03-25 16:03:47 -03:00
+								                raise error("internal: unsupported template operator %r" % (op,))
-												bpo-30340: Enhanced regular expressions optimization. (#1542)

This increased the performance of matching some patterns up to 25 times.

											
										
										
											2017-05-14 02:32:33 -03:00
+								            if _simple(av[2]):
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                if op is MAX_REPEAT:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								                    emit(REPEAT_ONE)
-												SF patch #720991 by Gary Herron:
A small fix for bug #545855 and Greg Chapman's
addition of op code SRE_OP_MIN_REPEAT_ONE for
eliminating recursion on simple uses of pattern '*?' on a
long string.

											
										
										
											2003-04-14 14:59:34 -03:00
+								                else:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								                    emit(MIN_REPEAT_ONE)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                skip = _len(code); emit(0)
-												final 0.9.8 updates:

-- added REPEAT_ONE operator
-- added ANY_ALL operator (used to represent "(?s).")

											
										
										
											2000-08-01 19:47:49 -03:00
+								                emit(av[0])
 								                emit(av[1])
 								                _compile(code, av[2], flags)
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								                emit(SUCCESS)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                code[skip] = _len(code) - skip
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								            else:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								                emit(REPEAT)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                skip = _len(code); emit(0)
-												final 0.9.8 updates:

-- added REPEAT_ONE operator
-- added ANY_ALL operator (used to represent "(?s).")

											
										
										
											2000-08-01 19:47:49 -03:00
+								                emit(av[0])
 								                emit(av[1])
 								                _compile(code, av[2], flags)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                code[skip] = _len(code) - skip
 								                if op is MAX_REPEAT:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								                    emit(MAX_UNTIL)
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								                else:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								                    emit(MIN_UNTIL)
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								        elif op is SUBPATTERN:
-												Issue #433028: Added support of modifier spans in regular expressions.

											
										
										
											2016-09-09 18:57:55 -03:00
+								            group, add_flags, del_flags, p = av
 								            if group:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								                emit(MARK)
-												Issue #433028: Added support of modifier spans in regular expressions.

											
										
										
											2016-09-09 18:57:55 -03:00
+								                emit((group-1)*2)
-												bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)


											
										
										
											2017-10-24 17:31:42 -03:00
+								            # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags))
 								            _compile(code, p, _combine_flags(flags, add_flags, del_flags))
-												Issue #433028: Added support of modifier spans in regular expressions.

											
										
										
											2016-09-09 18:57:55 -03:00
+								            if group:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								                emit(MARK)
-												Issue #433028: Added support of modifier spans in regular expressions.

											
										
										
											2016-09-09 18:57:55 -03:00
+								                emit((group-1)*2+1)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								        elif op in SUCCESS_CODES:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								            emit(op)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								        elif op in ASSERT_CODES:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								            emit(op)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								            skip = _len(code); emit(0)
-												- added lookbehind support (?<=pattern), (?<!pattern).
  the pattern must have a fixed width.

- got rid of array-module dependencies; the match pro-
  gram is now stored inside the pattern object, rather
  than in an extra string buffer.

- cleaned up various potential leaks, API abuses,
  and other minor issues in the engine module.

- use mal's new isalnum macro, rather than my own work-
  around.

- untabified test_sre.py.  seems like I removed a couple
  of trailing spaces in the process...

											
										
										
											2000-07-03 15:44:21 -03:00
+								            if av[0] >= 0:
 								                emit(0) # look ahead
 								            else:
 								                lo, hi = av[1].getwidth()
 								                if lo != hi:
-												Raise statement normalization in Lib/.

											
										
										
											2007-08-29 22:19:48 -03:00
+								                    raise error("look-behind requires fixed-width pattern")
-												- added lookbehind support (?<=pattern), (?<!pattern).
  the pattern must have a fixed width.

- got rid of array-module dependencies; the match pro-
  gram is now stored inside the pattern object, rather
  than in an extra string buffer.

- cleaned up various potential leaks, API abuses,
  and other minor issues in the engine module.

- use mal's new isalnum macro, rather than my own work-
  around.

- untabified test_sre.py.  seems like I removed a couple
  of trailing spaces in the process...

											
										
										
											2000-07-03 15:44:21 -03:00
+								                emit(lo) # look behind
 								            _compile(code, av[1], flags)
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								            emit(SUCCESS)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								            code[skip] = _len(code) - skip
-												- added lookbehind support (?<=pattern), (?<!pattern).
  the pattern must have a fixed width.

- got rid of array-module dependencies; the match pro-
  gram is now stored inside the pattern object, rather
  than in an extra string buffer.

- cleaned up various potential leaks, API abuses,
  and other minor issues in the engine module.

- use mal's new isalnum macro, rather than my own work-
  around.

- untabified test_sre.py.  seems like I removed a couple
  of trailing spaces in the process...

											
										
										
											2000-07-03 15:44:21 -03:00
+								        elif op is CALL:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								            emit(op)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								            skip = _len(code); emit(0)
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								            _compile(code, av, flags)
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								            emit(SUCCESS)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								            code[skip] = _len(code) - skip
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								        elif op is AT:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								            emit(op)
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								            if flags & SRE_FLAG_MULTILINE:
-												sre 2.1b2 update:

- take locale into account for word boundary anchors (#410271)
- restored 2.0's *? behaviour (#233283, #408936 and others)
- speed up re.sub/re.subn

											
										
										
											2001-03-22 11:50:10 -04:00
+								                av = AT_MULTILINE.get(av, av)
 								            if flags & SRE_FLAG_LOCALE:
 								                av = AT_LOCALE.get(av, av)
-												Issue #433028: Added support of modifier spans in regular expressions.

											
										
										
											2016-09-09 18:57:55 -03:00
+								            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
-												sre 2.1b2 update:

- take locale into account for word boundary anchors (#410271)
- restored 2.0's *? behaviour (#233283, #408936 and others)
- speed up re.sub/re.subn

											
										
										
											2001-03-22 11:50:10 -04:00
+								                av = AT_UNICODE.get(av, av)
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								            emit(av)
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								        elif op is BRANCH:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								            emit(op)
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								            tail = []
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								            tailappend = tail.append
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								            for av in av[1]:
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                skip = _len(code); emit(0)
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								                # _compile_info(code, av, flags)
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								                _compile(code, av, flags)
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								                emit(JUMP)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                tailappend(_len(code)); emit(0)
 								                code[skip] = _len(code) - skip
-												Minor code clean up and improvements in the re module.

											
										
										
											2014-11-11 15:13:28 -04:00
+								            emit(FAILURE) # end of branch
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								            for tail in tail:
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                code[tail] = _len(code) - tail
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								        elif op is CATEGORY:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								            emit(op)
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								            if flags & SRE_FLAG_LOCALE:
-												sre 2.1b2 update:

- take locale into account for word boundary anchors (#410271)
- restored 2.0's *? behaviour (#233283, #408936 and others)
- speed up re.sub/re.subn

											
										
										
											2001-03-22 11:50:10 -04:00
+								                av = CH_LOCALE[av]
-												Issue #433028: Added support of modifier spans in regular expressions.

											
										
										
											2016-09-09 18:57:55 -03:00
+								            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
-												sre 2.1b2 update:

- take locale into account for word boundary anchors (#410271)
- restored 2.0's *? behaviour (#233283, #408936 and others)
- speed up re.sub/re.subn

											
										
										
											2001-03-22 11:50:10 -04:00
+								                av = CH_UNICODE[av]
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								            emit(av)
-												- fixed grouping error bug

- changed "group" operator to "groupref"

											
										
										
											2000-07-03 18:31:48 -03:00
+								        elif op is GROUPREF:
-												bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)


											
										
										
											2017-10-24 17:31:42 -03:00
+								            if not flags & SRE_FLAG_IGNORECASE:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								                emit(op)
-												bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)


											
										
										
											2017-10-24 17:31:42 -03:00
+								            elif flags & SRE_FLAG_LOCALE:
 								                emit(GROUPREF_LOC_IGNORE)
 								            elif not fixes:  # ascii
 								                emit(GROUPREF_IGNORE)
 								            else:
 								                emit(GROUPREF_UNI_IGNORE)
-												- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py

											
										
										
											2000-06-30 07:41:31 -03:00
+								            emit(av-1)
-												Implemented non-recursive SRE matching.

											
										
										
											2003-10-17 19:13:16 -03:00
+								        elif op is GROUPREF_EXISTS:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								            emit(op)
-												[Bug #1177831] Fix generation of code for GROUPREF_EXISTS.  Thanks to Andre Malo for the fix.

											
										
										
											2005-06-02 10:35:52 -03:00
+								            emit(av[0]-1)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								            skipyes = _len(code); emit(0)
-												Implemented non-recursive SRE matching.

											
										
										
											2003-10-17 19:13:16 -03:00
+								            _compile(code, av[1], flags)
 								            if av[2]:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								                emit(JUMP)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                skipno = _len(code); emit(0)
 								                code[skipyes] = _len(code) - skipyes + 1
-												Implemented non-recursive SRE matching.

											
										
										
											2003-10-17 19:13:16 -03:00
+								                _compile(code, av[2], flags)
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                code[skipno] = _len(code) - skipno
-												Implemented non-recursive SRE matching.

											
										
										
											2003-10-17 19:13:16 -03:00
+								            else:
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								                code[skipyes] = _len(code) - skipyes + 1
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								        else:
-												Issue #22364: Improved some re error messages using regex for hints.

											
										
										
											2015-03-25 16:03:47 -03:00
+								            raise error("internal: unsupported operand type %r" % (op,))
-												Added Fredrik Lundh's sre module and its supporting cast.

NOTE: THIS IS VERY ROUGH ALPHA CODE!

											
										
										
											2000-03-31 10:58:54 -04:00
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								def _compile_charset(charset, flags, code):
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								    # compile charset subprogram
 								    emit = code.append
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								    for op, av in charset:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								        emit(op)
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								        if op is NEGATE:
 								            pass
 								        elif op is LITERAL:
-												Issue #17381: Fixed handling of case-insensitive ranges in regular expressions.
Added new opcode RANGE_IGNORE.

											
										
										
											2014-10-31 07:36:56 -03:00
+								            emit(av)
-												bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)


											
										
										
											2017-10-24 17:31:42 -03:00
+								        elif op is RANGE or op is RANGE_UNI_IGNORE:
-												Issue #17381: Fixed handling of case-insensitive ranges in regular expressions.
Added new opcode RANGE_IGNORE.

											
										
										
											2014-10-31 07:36:56 -03:00
+								            emit(av[0])
 								            emit(av[1])
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								        elif op is CHARSET:
 								            code.extend(av)
-												added martin's BIGCHARSET patch to SRE 2.1.1.  martin reports 2x
speedups for certain unicode character ranges.

											
										
										
											2001-07-02 13:58:38 -03:00
+								        elif op is BIGCHARSET:
 								            code.extend(av)
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								        elif op is CATEGORY:
 								            if flags & SRE_FLAG_LOCALE:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								                emit(CH_LOCALE[av])
-												Issue #433028: Added support of modifier spans in regular expressions.

											
										
										
											2016-09-09 18:57:55 -03:00
+								            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								                emit(CH_UNICODE[av])
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								            else:
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								                emit(av)
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								        else:
-												Issue #22364: Improved some re error messages using regex for hints.

											
										
										
											2015-03-25 16:03:47 -03:00
+								            raise error("internal: unsupported set operator %r" % (op,))
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								    emit(FAILURE)
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								    # internal: optimize character set
 								    out = []
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
+								    tail = []
 								    charmap = bytearray(256)
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								    hascased = False
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
+								    for op, av in charset:
 								        while True:
 								            try:
 								                if op is LITERAL:
-												Issue #17381: Fixed handling of case-insensitive ranges in regular expressions.
Added new opcode RANGE_IGNORE.

											
										
										
											2014-10-31 07:36:56 -03:00
+								                    if fixup:
-												Issue #12728: Different Unicode characters having the same uppercase but
different lowercase are now matched in case-insensitive regular expressions.

											
										
										
											2014-11-10 06:43:14 -04:00
+								                        lo = fixup(av)
 								                        charmap[lo] = 1
 								                        if fixes and lo in fixes:
 								                            for k in fixes[lo]:
-												Issue #12728: Different Unicode characters having the same uppercase but
different lowercase are now matched in case-insensitive regular expressions.

											
										
										
											2014-11-10 06:37:16 -04:00
+								                                charmap[k] = 1
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								                        if not hascased and iscased(av):
 								                            hascased = True
-												Issue #12728: Different Unicode characters having the same uppercase but
different lowercase are now matched in case-insensitive regular expressions.

											
										
										
											2014-11-10 06:37:16 -04:00
+								                    else:
 								                        charmap[av] = 1
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
+								                elif op is RANGE:
-												Issue #17381: Fixed handling of case-insensitive ranges in regular expressions.
Added new opcode RANGE_IGNORE.

											
										
										
											2014-10-31 07:36:56 -03:00
+								                    r = range(av[0], av[1]+1)
 								                    if fixup:
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								                        if fixes:
 								                            for i in map(fixup, r):
 								                                charmap[i] = 1
 								                                if i in fixes:
 								                                    for k in fixes[i]:
 								                                        charmap[k] = 1
 								                        else:
 								                            for i in map(fixup, r):
 								                                charmap[i] = 1
 								                        if not hascased:
 								                            hascased = any(map(iscased, r))
-												Issue #12728: Different Unicode characters having the same uppercase but
different lowercase are now matched in case-insensitive regular expressions.

											
										
										
											2014-11-10 06:37:16 -04:00
+								                    else:
 								                        for i in r:
 								                            charmap[i] = 1
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
+								                elif op is NEGATE:
 								                    out.append((op, av))
 								                else:
 								                    tail.append((op, av))
 								            except IndexError:
 								                if len(charmap) == 256:
 								                    # character set contains non-UCS1 character codes
 								                    charmap += b'\0' * 0xff00
 								                    continue
-												Issue #17381: Fixed handling of case-insensitive ranges in regular expressions.
Added new opcode RANGE_IGNORE.

											
										
										
											2014-10-31 07:36:56 -03:00
+								                # Character set contains non-BMP character codes.
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								                if fixup:
 								                    hascased = True
 								                    # There are only two ranges of cased non-BMP characters:
 								                    # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
-												bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)


											
										
										
											2017-10-24 17:31:42 -03:00
+								                    # and for both ranges RANGE_UNI_IGNORE works.
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								                    if op is RANGE:
-												bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)


											
										
										
											2017-10-24 17:31:42 -03:00
+								                        op = RANGE_UNI_IGNORE
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
+								                tail.append((op, av))
 								            break
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								    # compress character map
 								    runs = []
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
+								    q = 0
 								    while True:
 								        p = charmap.find(1, q)
 								        if p < 0:
 								            break
 								        if len(runs) >= 2:
 								            runs = None
 								            break
 								        q = charmap.find(0, p)
 								        if q < 0:
 								            runs.append((p, len(charmap)))
 								            break
 								        runs.append((p, q))
 								    if runs is not None:
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								        # use literal/range
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
+								        for p, q in runs:
 								            if q - p == 1:
 								                out.append((LITERAL, p))
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								            else:
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
+								                out.append((RANGE, (p, q - 1)))
 								        out += tail
-												Issue #17381: Fixed handling of case-insensitive ranges in regular expressions.
Added new opcode RANGE_IGNORE.

											
										
										
											2014-10-31 07:36:56 -03:00
+								        # if the case was changed or new representation is more compact
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								        if hascased or len(out) < len(charset):
 								            return out, hascased
-												Issue #17381: Fixed handling of case-insensitive ranges in regular expressions.
Added new opcode RANGE_IGNORE.

											
										
										
											2014-10-31 07:36:56 -03:00
+								        # else original character set is good enough
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								        return charset, hascased
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
 								    # use bitmap
 								    if len(charmap) == 256:
-												added martin's BIGCHARSET patch to SRE 2.1.1.  martin reports 2x
speedups for certain unicode character ranges.

											
										
										
											2001-07-02 13:58:38 -03:00
+								        data = _mk_bitmap(charmap)
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
+								        out.append((CHARSET, data))
 								        out += tail
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								        return out, hascased
-												added martin's BIGCHARSET patch to SRE 2.1.1.  martin reports 2x
speedups for certain unicode character ranges.

											
										
										
											2001-07-02 13:58:38 -03:00
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
+								    # To represent a big charset, first a bitmap of all characters in the
 								    # set is constructed. Then, this bitmap is sliced into chunks of 256
 								    # characters, duplicate chunks are eliminated, and each chunk is
 								    # given a number. In the compiled expression, the charset is
 								    # represented by a 32-bit word sequence, consisting of one word for
 								    # the number of different chunks, a sequence of 256 bytes (64 words)
 								    # of chunk numbers indexed by their original chunk position, and a
 								    # sequence of 256-bit chunks (8 words each).
-												added martin's BIGCHARSET patch to SRE 2.1.1.  martin reports 2x
speedups for certain unicode character ranges.

											
										
										
											2001-07-02 13:58:38 -03:00
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
+								    # Compression is normally good: in a typical charset, large ranges of
 								    # Unicode will be either completely excluded (e.g. if only cyrillic
 								    # letters are to be matched), or completely included (e.g. if large
 								    # subranges of Kanji match). These ranges will be represented by
 								    # chunks of all one-bits or all zero-bits.
-												added martin's BIGCHARSET patch to SRE 2.1.1.  martin reports 2x
speedups for certain unicode character ranges.

											
										
										
											2001-07-02 13:58:38 -03:00
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
+								    # Matching can be also done efficiently: the more significant byte of
 								    # the Unicode character is an index into the chunk number, and the
 								    # less significant byte is a bit index in the chunk (just like the
 								    # CHARSET matching).
-												Fully support 32-bit codes. Enable BIGCHARSET in UCS-4 builds.

											
										
										
											2003-04-19 09:56:08 -03:00
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
+								    charmap = bytes(charmap) # should be hashable
-												added martin's BIGCHARSET patch to SRE 2.1.1.  martin reports 2x
speedups for certain unicode character ranges.

											
										
										
											2001-07-02 13:58:38 -03:00
+								    comps = {}
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
+								    mapping = bytearray(256)
-												added martin's BIGCHARSET patch to SRE 2.1.1.  martin reports 2x
speedups for certain unicode character ranges.

											
										
										
											2001-07-02 13:58:38 -03:00
+								    block = 0
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
+								    data = bytearray()
 								    for i in range(0, 65536, 256):
 								        chunk = charmap[i: i + 256]
 								        if chunk in comps:
 								            mapping[i // 256] = comps[chunk]
 								        else:
 								            mapping[i // 256] = comps[chunk] = block
 								            block += 1
 								            data += chunk
 								    data = _mk_bitmap(data)
 								    data[0:0] = [block] + _bytes_to_codes(mapping)
 								    out.append((BIGCHARSET, data))
 								    out += tail
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								    return out, hascased
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
 								_CODEBITS = _sre.CODESIZE * 8
-												Minor code clean up and improvements in the re module.

											
										
										
											2014-11-11 15:13:28 -04:00
+								MAXCODE = (1 << _CODEBITS) - 1
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
+								_BITS_TRANS = b'0' + b'1' * 255
 								def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
 								    s = bits.translate(_BITS_TRANS)[::-1]
 								    return [_int(s[i - _CODEBITS: i], 2)
 								            for i in range(len(s), 0, -_CODEBITS)]
 								def _bytes_to_codes(b):
 								    # Convert block indices to word array
-												Got rid of the array module dependency in the re module.
The re module could be used during building before array is built.

											
										
										
											2014-11-10 07:24:47 -04:00
+								    a = memoryview(b).cast('I')
-												Issue #19329: Optimized compiling charsets in regular expressions.

											
										
										
											2013-10-27 03:20:29 -03:00
+								    assert a.itemsize == _sre.CODESIZE
 								    assert len(a) * a.itemsize == len(b)
 								    return a.tolist()
-												added martin's BIGCHARSET patch to SRE 2.1.1.  martin reports 2x
speedups for certain unicode character ranges.

											
										
										
											2001-07-02 13:58:38 -03:00
-												bpo-30340: Enhanced regular expressions optimization. (#1542)

This increased the performance of matching some patterns up to 25 times.

											
										
										
											2017-05-14 02:32:33 -03:00
+								def _simple(p):
 								    # check if this subpattern is a "simple" operator
 								    if len(p) != 1:
 								        return False
 								    op, av = p[0]
 								    if op is SUBPATTERN:
 								        return av[0] is None and _simple(av[-1])
 								    return op in _UNIT_CODES
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
-												Issue #19387: explain and test the sre overlap table

											
										
										
											2013-10-25 16:36:10 -03:00
+								def _generate_overlap_table(prefix):
 								    """
 								    Generate an overlap table for the following prefix.
 								    An overlap table is a table of the same size as the prefix which
 								    informs about the potential self-overlap for each index in the prefix:
 								    - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
 								    - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
 								      prefix[0:k]
 								    """
 								    table = [0] * len(prefix)
 								    for i in range(1, len(prefix)):
 								        idx = table[i - 1]
 								        while prefix[i] != prefix[idx]:
 								            if idx == 0:
 								                table[i] = 0
 								                break
 								            idx = table[idx - 1]
 								        else:
 								            table[i] = idx + 1
 								    return table
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
def _get_iscased(flags):
    """Return the "is this character cased" predicate selected by *flags*.

    Returns None when SRE_FLAG_IGNORECASE is not set.  Otherwise returns
    _sre.unicode_iscased when SRE_FLAG_UNICODE is set and SRE_FLAG_ASCII is
    not, and _sre.ascii_iscased in every other case.
    """
    if not flags & SRE_FLAG_IGNORECASE:
        return None
    elif flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
        return _sre.unicode_iscased
    else:
        return _sre.ascii_iscased
 								def _get_literal_prefix(pattern, flags):
-												Issue #24426: Fast searching optimization in regular expressions now works
for patterns that starts with capturing groups.  Fast searching optimization
now can't be disabled at compile time.

											
										
										
											2015-06-21 08:06:55 -03:00
+								    # look for literal prefix
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
+								    prefix = []
-												Simple optimizations:
* pre-build a single identity function for the fixup function
* pre-build membership tests in dictionaries instead of in-line tuples
* assign len() to a local variable
* assign append() methods to a local variable
* use xrange() instead of range()
* replace "x<<1" with "x+x"

											
										
										
											2004-03-26 07:16:55 -04:00
+								    prefixappend = prefix.append
-												Issue #24426: Fast searching optimization in regular expressions now works
for patterns that starts with capturing groups.  Fast searching optimization
now can't be disabled at compile time.

											
										
										
											2015-06-21 08:06:55 -03:00
+								    prefix_skip = None
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								    iscased = _get_iscased(flags)
-												Issue #24426: Fast searching optimization in regular expressions now works
for patterns that starts with capturing groups.  Fast searching optimization
now can't be disabled at compile time.

											
										
										
											2015-06-21 08:06:55 -03:00
+								    for op, av in pattern.data:
 								        if op is LITERAL:
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								            if iscased and iscased(av):
 								                break
-												Issue #24426: Fast searching optimization in regular expressions now works
for patterns that starts with capturing groups.  Fast searching optimization
now can't be disabled at compile time.

											
										
										
											2015-06-21 08:06:55 -03:00
+								            prefixappend(av)
 								        elif op is SUBPATTERN:
-												Issue #433028: Added support of modifier spans in regular expressions.

											
										
										
											2016-09-09 18:57:55 -03:00
+								            group, add_flags, del_flags, p = av
-												bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)


											
										
										
											2017-10-24 17:31:42 -03:00
+								            flags1 = _combine_flags(flags, add_flags, del_flags)
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								            if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
-												Issue #433028: Added support of modifier spans in regular expressions.

											
										
										
											2016-09-09 18:57:55 -03:00
+								                break
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								            prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
-												Issue #24426: Fast searching optimization in regular expressions now works
for patterns that starts with capturing groups.  Fast searching optimization
now can't be disabled at compile time.

											
										
										
											2015-06-21 08:06:55 -03:00
+								            if prefix_skip is None:
-												Issue #433028: Added support of modifier spans in regular expressions.

											
										
										
											2016-09-09 18:57:55 -03:00
+								                if group is not None:
-												Issue #24426: Fast searching optimization in regular expressions now works
for patterns that starts with capturing groups.  Fast searching optimization
now can't be disabled at compile time.

											
										
										
											2015-06-21 08:06:55 -03:00
+								                    prefix_skip = len(prefix)
 								                elif prefix_skip1 is not None:
 								                    prefix_skip = len(prefix) + prefix_skip1
 								            prefix.extend(prefix1)
 								            if not got_all:
 								                break
 								        else:
 								            break
-												Issue #433028: Added support of modifier spans in regular expressions.

											
										
										
											2016-09-09 18:57:55 -03:00
+								    else:
 								        return prefix, prefix_skip, True
 								    return prefix, prefix_skip, False
-												Issue #24426: Fast searching optimization in regular expressions now works
for patterns that starts with capturing groups.  Fast searching optimization
now can't be disabled at compile time.

											
										
										
											2015-06-21 08:06:55 -03:00
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								def _get_charset_prefix(pattern, flags):
 								    while True:
 								        if not pattern.data:
 								            return None
-												Issue #24426: Fast searching optimization in regular expressions now works
for patterns that starts with capturing groups.  Fast searching optimization
now can't be disabled at compile time.

											
										
										
											2015-06-21 08:06:55 -03:00
+								        op, av = pattern.data[0]
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								        if op is not SUBPATTERN:
 								            break
 								        group, add_flags, del_flags, pattern = av
-												bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)


											
										
										
											2017-10-24 17:31:42 -03:00
+								        flags = _combine_flags(flags, add_flags, del_flags)
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								        if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
 								            return None
 								    iscased = _get_iscased(flags)
 								    if op is LITERAL:
 								        if iscased and iscased(av):
 								            return None
 								        return [(op, av)]
 								    elif op is BRANCH:
 								        charset = []
 								        charsetappend = charset.append
 								        for p in av[1]:
 								            if not p:
 								                return None
 								            op, av = p[0]
 								            if op is LITERAL and not (iscased and iscased(av)):
 								                charsetappend((op, av))
-												Issue #24426: Fast searching optimization in regular expressions now works
for patterns that starts with capturing groups.  Fast searching optimization
now can't be disabled at compile time.

											
										
										
											2015-06-21 08:06:55 -03:00
+								            else:
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								                return None
 								        return charset
 								    elif op is IN:
 								        charset = av
 								        if iscased:
 								            for op, av in charset:
 								                if op is LITERAL:
 								                    if iscased(av):
 								                        return None
 								                elif op is RANGE:
 								                    if av[1] > 0xffff:
 								                        return None
 								                    if any(map(iscased, range(av[0], av[1]+1))):
 								                        return None
 								        return charset
 								    return None
-												Issue #24426: Fast searching optimization in regular expressions now works
for patterns that starts with capturing groups.  Fast searching optimization
now can't be disabled at compile time.

											
										
										
											2015-06-21 08:06:55 -03:00
 								def _compile_info(code, pattern, flags):
 								    # internal: compile an info block.  in the current version,
 								    # this contains min/max pattern width, and an optional literal
 								    # prefix or a character map
 								    lo, hi = pattern.getwidth()
 								    if hi > MAXCODE:
 								        hi = MAXCODE
 								    if lo == 0:
 								        code.extend([INFO, 4, 0, lo, hi])
 								        return
 								    # look for a literal prefix
 								    prefix = []
 								    prefix_skip = 0
 								    charset = [] # not used
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								    if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
-												Issue #24426: Fast searching optimization in regular expressions now works
for patterns that starts with capturing groups.  Fast searching optimization
now can't be disabled at compile time.

											
										
										
											2015-06-21 08:06:55 -03:00
+								        # look for literal prefix
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								        prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
-												Issue #24426: Fast searching optimization in regular expressions now works
for patterns that starts with capturing groups.  Fast searching optimization
now can't be disabled at compile time.

											
										
										
											2015-06-21 08:06:55 -03:00
+								        # if no prefix, look for charset prefix
 								        if not prefix:
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								            charset = _get_charset_prefix(pattern, flags)
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								##     if prefix:
-												Minor code clean up and improvements in the re module.

											
										
										
											2014-11-11 15:13:28 -04:00
+								##         print("*** PREFIX", prefix, prefix_skip)
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								##     if charset:
-												Minor code clean up and improvements in the re module.

											
										
										
											2014-11-11 15:13:28 -04:00
+								##         print("*** CHARSET", charset)
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
+								    # add an info block
 								    emit = code.append
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								    emit(INFO)
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
+								    skip = len(code); emit(0)
 								    # literal flag
 								    mask = 0
-												-- use charset bitmaps where appropriate.  this gives a 5-10%
   speedup for some tests, including the python tokenizer.

-- added support for an optional charset anchor to the engine
   (currently unused by the code generator).

-- removed workaround for array module bug.

											
										
										
											2000-07-02 09:00:07 -03:00
+								    if prefix:
 								        mask = SRE_INFO_PREFIX
-												Issue #24426: Fast searching optimization in regular expressions now works
for patterns that starts with capturing groups.  Fast searching optimization
now can't be disabled at compile time.

											
										
										
											2015-06-21 08:06:55 -03:00
+								        if prefix_skip is None and got_all:
-												Minor code clean up and improvements in the re module.

											
										
										
											2014-11-11 15:13:28 -04:00
+								            mask = mask | SRE_INFO_LITERAL
-												-- use charset bitmaps where appropriate.  this gives a 5-10%
   speedup for some tests, including the python tokenizer.

-- added support for an optional charset anchor to the engine
   (currently unused by the code generator).

-- removed workaround for array module bug.

											
										
										
											2000-07-02 09:00:07 -03:00
+								    elif charset:
-												Minor code clean up and improvements in the re module.

											
										
										
											2014-11-11 15:13:28 -04:00
+								        mask = mask | SRE_INFO_CHARSET
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
+								    emit(mask)
 								    # pattern length
-												-- use charset bitmaps where appropriate.  this gives a 5-10%
   speedup for some tests, including the python tokenizer.

-- added support for an optional charset anchor to the engine
   (currently unused by the code generator).

-- removed workaround for array module bug.

											
										
										
											2000-07-02 09:00:07 -03:00
+								    if lo < MAXCODE:
 								        emit(lo)
 								    else:
 								        emit(MAXCODE)
 								        prefix = prefix[:MAXCODE]
-												Issue #22818: Splitting on a pattern that could match an empty string now
raises a warning.  Patterns that can only match empty strings are now
rejected.

											
										
										
											2015-02-03 05:04:19 -04:00
+								    emit(min(hi, MAXCODE))
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
+								    # add literal prefix
 								    if prefix:
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								        emit(len(prefix)) # length
-												Issue #24426: Fast searching optimization in regular expressions now works
for patterns that starts with capturing groups.  Fast searching optimization
now can't be disabled at compile time.

											
										
										
											2015-06-21 08:06:55 -03:00
+								        if prefix_skip is None:
 								            prefix_skip =  len(prefix)
-												-- reset marks if repeat_one tail doesn't match
   (this should fix Sjoerd's xmllib problem)
-- added skip field to INFO header
-- changed compiler to generate charset INFO header
-- changed trace messages to support post-mortem analysis

											
										
										
											2000-08-07 17:59:04 -03:00
+								        emit(prefix_skip) # skip
 								        code.extend(prefix)
 								        # generate overlap table
-												Issue #19387: explain and test the sre overlap table

											
										
										
											2013-10-25 16:36:10 -03:00
+								        code.extend(_generate_overlap_table(prefix))
-												-- use charset bitmaps where appropriate.  this gives a 5-10%
   speedup for some tests, including the python tokenizer.

-- added support for an optional charset anchor to the engine
   (currently unused by the code generator).

-- removed workaround for array module bug.

											
										
										
											2000-07-02 09:00:07 -03:00
+								    elif charset:
-												bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.

											
										
										
											2017-05-09 17:37:14 -03:00
+								        charset, hascased = _optimize_charset(charset)
 								        assert not hascased
-												Fix from SF patch #633359 by Greg Chapman for SF bug #610299:
    The problem is in sre_compile.py: the call to
    _compile_charset near the end of _compile_info forgets to
    pass in the flags, so that the info charset is not compiled
    with re.U. (The info charset is used when searching to find
    the first character at which a match could start; it is not
    generated for patterns beginning with a repeat like '\w{1}'.)

											
										
										
											2003-02-23 21:18:35 -04:00
+								        _compile_charset(charset, flags, code)
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
+								    code[skip] = len(code) - skip
-												Addendum to #764548: restore 2.1 compatibility.

											
										
										
											2003-07-02 18:37:16 -03:00
+								def isstring(obj):
-												Fix 're' to work on bytes. It could do with a few more tests, though.

											
										
										
											2008-03-18 17:19:54 -03:00
+								    return isinstance(obj, (str, bytes))
-												Addendum to #764548: restore 2.1 compatibility.

											
										
										
											2003-07-02 18:37:16 -03:00
-												-- fixed width calculations for alternations
-- fixed literal check in branch operator
   (this broke test_tokenize, as reported by Mark Favas)
-- added REPEAT_ONE operator (still not enabled, though)
-- added some debugging stuff (maxlevel)

											
										
										
											2000-08-01 18:05:41 -03:00
+								def _code(p, flags):
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
-												bpo-34681: Rename class Pattern in sre_parse to State. (GH-9310)

Also rename corresponding attributes, parameters and variables.
											
										
										
											2018-09-18 03:16:26 -03:00
+								    flags = p.state.flags | flags
-												- fixed split
  (test_sre still complains about split, but that's caused by
  the group reset bug, not split itself)

- added more mark slots
  (should be dynamically allocated, but 100 is better than 32.
  and checking for the upper limit is better than overwriting
  the memory ;-)

- internal: renamed the cursor helper class

- internal: removed some bloat from sre_compile

											
										
										
											2000-06-29 13:57:40 -03:00
+								    code = []
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
 								    # compile info block
 								    _compile_info(code, p, flags)
 								    # compile the pattern
-												Fredrik Lundh: here's the 96.6% version of SRE

											
										
										
											2000-06-01 14:39:12 -03:00
+								    _compile(code, p.data, flags)
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
-												Issue #22434: Constants in sre_constants are now named constants (enum-like).

											
										
										
											2014-11-09 14:48:36 -04:00
+								    code.append(SUCCESS)
-												still trying to figure out how to fix the remaining
group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)

											
										
										
											2000-06-29 20:33:12 -03:00
-												SRE 0.9.8: passes the entire test suite

-- reverted REPEAT operator to use "repeat context" strategy
   (from 0.8.X), but done right this time.
-- got rid of backtracking stack; use nested SRE_MATCH calls
   instead (should probably put it back again in 0.9.9 ;-)
-- properly reset state in scanner mode
-- don't use aggressive inlining by default

											
										
										
											2000-08-01 15:20:07 -03:00
+								    return code
-												bpo-30299: Display a bytecode when compile a regex in debug mode. (#1491)

`re.compile(..., re.DEBUG)` now displays the compiled bytecode in
human readable form.

											
										
										
											2017-05-14 03:05:13 -03:00
+								def _hex_code(code):
 								    return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code)
 								def dis(code):
 								    import sys
 								    labels = set()
 								    level = 0
 								    offset_width = len(str(len(code) - 1))
 								    def dis_(start, end):
 								        def print_(*args, to=None):
 								            if to is not None:
 								                labels.add(to)
 								                args += ('(to %d)' % (to,),)
 								            print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
 								                  end='  '*(level-1))
 								            print(*args)
 								        def print_2(*args):
 								            print(end=' '*(offset_width + 2*level))
 								            print(*args)
 								        nonlocal level
 								        level += 1
 								        i = start
 								        while i < end:
 								            start = i
 								            op = code[i]
 								            i += 1
 								            op = OPCODES[op]
 								            if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
 								                      MAX_UNTIL, MIN_UNTIL, NEGATE):
 								                print_(op)
 								            elif op in (LITERAL, NOT_LITERAL,
 								                        LITERAL_IGNORE, NOT_LITERAL_IGNORE,
-												bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)


											
										
										
											2017-10-24 17:31:42 -03:00
+								                        LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,
-												bpo-30299: Display a bytecode when compile a regex in debug mode. (#1491)

`re.compile(..., re.DEBUG)` now displays the compiled bytecode in
human readable form.

											
										
										
											2017-05-14 03:05:13 -03:00
+								                        LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
 								                arg = code[i]
 								                i += 1
 								                print_(op, '%#02x (%r)' % (arg, chr(arg)))
 								            elif op is AT:
 								                arg = code[i]
 								                i += 1
 								                arg = str(ATCODES[arg])
 								                assert arg[:3] == 'AT_'
 								                print_(op, arg[3:])
 								            elif op is CATEGORY:
 								                arg = code[i]
 								                i += 1
 								                arg = str(CHCODES[arg])
 								                assert arg[:9] == 'CATEGORY_'
 								                print_(op, arg[9:])
-												bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)


											
										
										
											2017-10-24 17:31:42 -03:00
+								            elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
-												bpo-30299: Display a bytecode when compile a regex in debug mode. (#1491)

`re.compile(..., re.DEBUG)` now displays the compiled bytecode in
human readable form.

											
										
										
											2017-05-14 03:05:13 -03:00
+								                skip = code[i]
 								                print_(op, skip, to=i+skip)
 								                dis_(i+1, i+skip)
 								                i += skip
-												bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)


											
										
										
											2017-10-24 17:31:42 -03:00
+								            elif op in (RANGE, RANGE_UNI_IGNORE):
-												bpo-30299: Display a bytecode when compile a regex in debug mode. (#1491)

`re.compile(..., re.DEBUG)` now displays the compiled bytecode in
human readable form.

											
										
										
											2017-05-14 03:05:13 -03:00
+								                lo, hi = code[i: i+2]
 								                i += 2
 								                print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
 								            elif op is CHARSET:
 								                print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
 								                i += 256//_CODEBITS
 								            elif op is BIGCHARSET:
 								                arg = code[i]
 								                i += 1
 								                mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
 								                                        for x in code[i: i + 256//_sre.CODESIZE]))
 								                print_(op, arg, mapping)
 								                i += 256//_sre.CODESIZE
 								                level += 1
 								                for j in range(arg):
 								                    print_2(_hex_code(code[i: i + 256//_CODEBITS]))
 								                    i += 256//_CODEBITS
 								                level -= 1
-												bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)


											
										
										
											2017-10-24 17:31:42 -03:00
+								            elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE,
 								                        GROUPREF_LOC_IGNORE):
-												bpo-30299: Display a bytecode when compile a regex in debug mode. (#1491)

`re.compile(..., re.DEBUG)` now displays the compiled bytecode in
human readable form.

											
										
										
											2017-05-14 03:05:13 -03:00
+								                arg = code[i]
 								                i += 1
 								                print_(op, arg)
 								            elif op is JUMP:
 								                skip = code[i]
 								                print_(op, skip, to=i+skip)
 								                i += 1
 								            elif op is BRANCH:
 								                skip = code[i]
 								                print_(op, skip, to=i+skip)
 								                while skip:
 								                    dis_(i+1, i+skip)
 								                    i += skip
 								                    start = i
 								                    skip = code[i]
 								                    if skip:
 								                        print_('branch', skip, to=i+skip)
 								                    else:
 								                        print_(FAILURE)
 								                i += 1
 								            elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE):
 								                skip, min, max = code[i: i+3]
 								                if max == MAXREPEAT:
 								                    max = 'MAXREPEAT'
 								                print_(op, skip, min, max, to=i+skip)
 								                dis_(i+3, i+skip)
 								                i += skip
 								            elif op is GROUPREF_EXISTS:
 								                arg, skip = code[i: i+2]
 								                print_(op, arg, skip, to=i+skip)
 								                i += 2
 								            elif op in (ASSERT, ASSERT_NOT):
 								                skip, arg = code[i: i+2]
 								                print_(op, skip, arg, to=i+skip)
 								                dis_(i+2, i+skip)
 								                i += skip
 								            elif op is INFO:
 								                skip, flags, min, max = code[i: i+4]
 								                if max == MAXREPEAT:
 								                    max = 'MAXREPEAT'
 								                print_(op, skip, bin(flags), min, max, to=i+skip)
 								                start = i+4
 								                if flags & SRE_INFO_PREFIX:
 								                    prefix_len, prefix_skip = code[i+4: i+6]
 								                    print_2('  prefix_skip', prefix_skip)
 								                    start = i + 6
 								                    prefix = code[start: start+prefix_len]
 								                    print_2('  prefix',
 								                            '[%s]' % ', '.join('%#02x' % x for x in prefix),
 								                            '(%r)' % ''.join(map(chr, prefix)))
 								                    start += prefix_len
 								                    print_2('  overlap', code[start: start+prefix_len])
 								                    start += prefix_len
 								                if flags & SRE_INFO_CHARSET:
 								                    level += 1
 								                    print_2('in')
 								                    dis_(start, i+skip)
 								                    level -= 1
 								                i += skip
 								            else:
 								                raise ValueError(op)
 								        level -= 1
 								    dis_(0, len(code))
-												SRE 0.9.8: passes the entire test suite

-- reverted REPEAT operator to use "repeat context" strategy
   (from 0.8.X), but done right this time.
-- got rid of backtracking stack; use nested SRE_MATCH calls
   instead (should probably put it back again in 0.9.9 ;-)
-- properly reset state in scanner mode
-- don't use aggressive inlining by default

											
										
										
											2000-08-01 15:20:07 -03:00
+								def compile(p, flags=0):
 								    # internal: convert pattern list to internal format
-												Addendum to #764548: restore 2.1 compatibility.

											
										
										
											2003-07-02 18:37:16 -03:00
+								    if isstring(p):
-												SRE 0.9.8: passes the entire test suite

-- reverted REPEAT operator to use "repeat context" strategy
   (from 0.8.X), but done right this time.
-- got rid of backtracking stack; use nested SRE_MATCH calls
   instead (should probably put it back again in 0.9.9 ;-)
-- properly reset state in scanner mode
-- don't use aggressive inlining by default

											
										
										
											2000-08-01 15:20:07 -03:00
+								        pattern = p
 								        p = sre_parse.parse(p, flags)
 								    else:
 								        pattern = None
-												-- fixed width calculations for alternations
-- fixed literal check in branch operator
   (this broke test_tokenize, as reported by Mark Favas)
-- added REPEAT_ONE operator (still not enabled, though)
-- added some debugging stuff (maxlevel)

											
										
										
											2000-08-01 18:05:41 -03:00
+								    code = _code(p, flags)
-												SRE 0.9.8: passes the entire test suite

-- reverted REPEAT operator to use "repeat context" strategy
   (from 0.8.X), but done right this time.
-- got rid of backtracking stack; use nested SRE_MATCH calls
   instead (should probably put it back again in 0.9.9 ;-)
-- properly reset state in scanner mode
-- don't use aggressive inlining by default

											
										
										
											2000-08-01 15:20:07 -03:00
-												bpo-30299: Display a bytecode when compile a regex in debug mode. (#1491)

`re.compile(..., re.DEBUG)` now displays the compiled bytecode in
human readable form.

											
										
										
											2017-05-14 03:05:13 -03:00
+								    if flags & SRE_FLAG_DEBUG:
 								        print()
 								        dis(code)
-												-- SRE 0.9.6 sync.  this includes:

 + added "regs" attribute
 + fixed "pos" and "endpos" attributes
 + reset "lastindex" and "lastgroup" in scanner methods
 + removed (?P#id) syntax; the "lastindex" and "lastgroup"
   attributes are now always set
 + removed string module dependencies in sre_parse
 + better debugging support in sre_parse
 + various tweaks to build under 1.5.2

											
										
										
											2000-07-23 18:46:17 -03:00
-												- experimental: added two new attributes to the match object:
  "lastgroup" is the name of the last matched capturing group,
  "lastindex" is the index of the same group.  if no group was
  matched, both attributes are set to None.

  the (?P#) feature will be removed in the next release.

											
										
										
											2000-07-02 19:25:39 -03:00
+								    # map in either direction
-												bpo-34681: Rename class Pattern in sre_parse to State. (GH-9310)

Also rename corresponding attributes, parameters and variables.
											
										
										
											2018-09-18 03:16:26 -03:00
+								    groupindex = p.state.groupdict
 								    indexgroup = [None] * p.state.groups
-												- experimental: added two new attributes to the match object:
  "lastgroup" is the name of the last matched capturing group,
  "lastindex" is the index of the same group.  if no group was
  matched, both attributes are set to None.

  the (?P#) feature will be removed in the next release.

											
										
										
											2000-07-02 19:25:39 -03:00
+								    for k, i in groupindex.items():
 								        indexgroup[i] = k
-												Fredrik Lundh: here's the 96.6% version of SRE

											
										
										
											2000-06-01 14:39:12 -03:00
+								    return _sre.compile(
-												bpo-34681: Rename class Pattern in sre_parse to State. (GH-9310)

Also rename corresponding attributes, parameters and variables.
											
										
										
											2018-09-18 03:16:26 -03:00
+								        pattern, flags | p.state.flags, code,
 								        p.state.groups-1,
-												Issue #28765: _sre.compile() now checks the type of groupindex and indexgroup

groupindex must be a dictionary and indexgroup must be a tuple.

Previously, indexgroup was a list. Use a tuple to reduce the memory usage.

											
										
										
											2016-11-22 18:04:39 -04:00
+								        groupindex, tuple(indexgroup)
-												- pedantic: make sure "python -t" doesn't complain...

											
										
										
											2000-06-30 04:50:59 -03:00
+								        )