diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 1b3e9f82e34..da422a9da57 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -13,7 +13,6 @@ import _sre import sre_parse from sre_constants import * -from _sre import MAXREPEAT assert _sre.MAGIC == MAGIC, "SRE module mismatch" @@ -38,65 +37,65 @@ def _compile(code, pattern, flags): for op, av in pattern: if op in LITERAL_CODES: if flags & SRE_FLAG_IGNORECASE: - emit(OPCODES[OP_IGNORE[op]]) + emit(OP_IGNORE[op]) emit(_sre.getlower(av, flags)) else: - emit(OPCODES[op]) + emit(op) emit(av) elif op is IN: if flags & SRE_FLAG_IGNORECASE: - emit(OPCODES[OP_IGNORE[op]]) + emit(OP_IGNORE[op]) def fixup(literal, flags=flags): return _sre.getlower(literal, flags) else: - emit(OPCODES[op]) + emit(op) fixup = None skip = _len(code); emit(0) _compile_charset(av, flags, code, fixup) code[skip] = _len(code) - skip elif op is ANY: if flags & SRE_FLAG_DOTALL: - emit(OPCODES[ANY_ALL]) + emit(ANY_ALL) else: - emit(OPCODES[ANY]) + emit(ANY) elif op in REPEATING_CODES: if flags & SRE_FLAG_TEMPLATE: raise error("internal: unsupported template operator") elif _simple(av) and op is not REPEAT: if op is MAX_REPEAT: - emit(OPCODES[REPEAT_ONE]) + emit(REPEAT_ONE) else: - emit(OPCODES[MIN_REPEAT_ONE]) + emit(MIN_REPEAT_ONE) skip = _len(code); emit(0) emit(av[0]) emit(av[1]) _compile(code, av[2], flags) - emit(OPCODES[SUCCESS]) + emit(SUCCESS) code[skip] = _len(code) - skip else: - emit(OPCODES[REPEAT]) + emit(REPEAT) skip = _len(code); emit(0) emit(av[0]) emit(av[1]) _compile(code, av[2], flags) code[skip] = _len(code) - skip if op is MAX_REPEAT: - emit(OPCODES[MAX_UNTIL]) + emit(MAX_UNTIL) else: - emit(OPCODES[MIN_UNTIL]) + emit(MIN_UNTIL) elif op is SUBPATTERN: if av[0]: - emit(OPCODES[MARK]) + emit(MARK) emit((av[0]-1)*2) # _compile_info(code, av[1], flags) _compile(code, av[1], flags) if av[0]: - emit(OPCODES[MARK]) + emit(MARK) emit((av[0]-1)*2+1) elif op in SUCCESS_CODES: - emit(OPCODES[op]) + emit(op) elif op in ASSERT_CODES: - emit(OPCODES[op]) + emit(op) skip = _len(code); emit(0) if av[0] >= 0: emit(0) # look ahead @@ -106,57 +105,57 @@ def _compile(code, pattern, flags): raise error("look-behind requires fixed-width pattern") emit(lo) # look behind _compile(code, av[1], flags) - emit(OPCODES[SUCCESS]) + emit(SUCCESS) code[skip] = _len(code) - skip elif op is CALL: - emit(OPCODES[op]) + emit(op) skip = _len(code); emit(0) _compile(code, av, flags) - emit(OPCODES[SUCCESS]) + emit(SUCCESS) code[skip] = _len(code) - skip elif op is AT: - emit(OPCODES[op]) + emit(op) if flags & SRE_FLAG_MULTILINE: av = AT_MULTILINE.get(av, av) if flags & SRE_FLAG_LOCALE: av = AT_LOCALE.get(av, av) elif flags & SRE_FLAG_UNICODE: av = AT_UNICODE.get(av, av) - emit(ATCODES[av]) + emit(av) elif op is BRANCH: - emit(OPCODES[op]) + emit(op) tail = [] tailappend = tail.append for av in av[1]: skip = _len(code); emit(0) # _compile_info(code, av, flags) _compile(code, av, flags) - emit(OPCODES[JUMP]) + emit(JUMP) tailappend(_len(code)); emit(0) code[skip] = _len(code) - skip emit(0) # end of branch for tail in tail: code[tail] = _len(code) - tail elif op is CATEGORY: - emit(OPCODES[op]) + emit(op) if flags & SRE_FLAG_LOCALE: av = CH_LOCALE[av] elif flags & SRE_FLAG_UNICODE: av = CH_UNICODE[av] - emit(CHCODES[av]) + emit(av) elif op is GROUPREF: if flags & SRE_FLAG_IGNORECASE: - emit(OPCODES[OP_IGNORE[op]]) + emit(OP_IGNORE[op]) else: - emit(OPCODES[op]) + emit(op) emit(av-1) elif op is GROUPREF_EXISTS: - emit(OPCODES[op]) + emit(op) emit(av[0]-1) skipyes = _len(code); emit(0) _compile(code, av[1], flags) if av[2]: - emit(OPCODES[JUMP]) + emit(JUMP) skipno = _len(code); emit(0) code[skipyes] = _len(code) - skipyes + 1 _compile(code, av[2], flags) @@ -170,7 +169,7 @@ def _compile_charset(charset, flags, code, fixup=None): # compile charset subprogram emit = code.append for op, av in _optimize_charset(charset, fixup): - emit(OPCODES[op]) + emit(op) if op is NEGATE: pass elif op is LITERAL: @@ -184,14 +183,14 @@ def _compile_charset(charset, flags, code, fixup=None): code.extend(av) elif op is CATEGORY: if flags & SRE_FLAG_LOCALE: - emit(CHCODES[CH_LOCALE[av]]) + emit(CH_LOCALE[av]) elif flags & SRE_FLAG_UNICODE: - emit(CHCODES[CH_UNICODE[av]]) + emit(CH_UNICODE[av]) else: - emit(CHCODES[av]) + emit(av) else: raise error("internal: unsupported set operator") - emit(OPCODES[FAILURE]) + emit(FAILURE) def _optimize_charset(charset, fixup): # internal: optimize character set @@ -414,7 +413,7 @@ def _compile_info(code, pattern, flags): ## print "*** CHARSET", charset # add an info block emit = code.append - emit(OPCODES[INFO]) + emit(INFO) skip = len(code); emit(0) # literal flag mask = 0 @@ -460,7 +459,7 @@ def _code(p, flags): # compile the pattern _compile(code, p.data, flags) - code.append(OPCODES[SUCCESS]) + code.append(SUCCESS) return code @@ -475,7 +474,7 @@ def compile(p, flags=0): code = _code(p, flags) - # print code + # print(code) # map in either direction groupindex = p.pattern.groupdict diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index 8296ecdb702..7480bf3ae04 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -23,138 +23,81 @@ from _sre import MAXREPEAT, MAXGROUPS class error(Exception): pass + +class _NamedIntConstant(int): + def __new__(cls, value, name): + self = super(_NamedIntConstant, cls).__new__(cls, value) + self.name = name + return self + + def __str__(self): + return self.name + + __repr__ = __str__ + +MAXREPEAT = _NamedIntConstant(MAXREPEAT, 'MAXREPEAT') + +def _makecodes(names): + names = names.strip().split() + items = [_NamedIntConstant(i, name) for i, name in enumerate(names)] + globals().update({item.name: item for item in items}) + return items + # operators +# failure=0 success=1 (just because it looks better that way :-) +OPCODES = _makecodes(""" + FAILURE SUCCESS -FAILURE = "failure" -SUCCESS = "success" + ANY ANY_ALL + ASSERT ASSERT_NOT + AT + BRANCH + CALL + CATEGORY + CHARSET BIGCHARSET + GROUPREF GROUPREF_EXISTS GROUPREF_IGNORE + IN IN_IGNORE + INFO + JUMP + LITERAL LITERAL_IGNORE + MARK + MAX_UNTIL + MIN_UNTIL + NOT_LITERAL NOT_LITERAL_IGNORE + NEGATE + RANGE + REPEAT + REPEAT_ONE + SUBPATTERN + MIN_REPEAT_ONE + RANGE_IGNORE -ANY = "any" -ANY_ALL = "any_all" -ASSERT = "assert" -ASSERT_NOT = "assert_not" -AT = "at" -BIGCHARSET = "bigcharset" -BRANCH = "branch" -CALL = "call" -CATEGORY = "category" -CHARSET = "charset" -GROUPREF = "groupref" -GROUPREF_IGNORE = "groupref_ignore" -GROUPREF_EXISTS = "groupref_exists" -IN = "in" -IN_IGNORE = "in_ignore" -INFO = "info" -JUMP = "jump" -LITERAL = "literal" -LITERAL_IGNORE = "literal_ignore" -MARK = "mark" -MAX_REPEAT = "max_repeat" -MAX_UNTIL = "max_until" -MIN_REPEAT = "min_repeat" -MIN_UNTIL = "min_until" -NEGATE = "negate" -NOT_LITERAL = "not_literal" -NOT_LITERAL_IGNORE = "not_literal_ignore" -RANGE = "range" -RANGE_IGNORE = "range_ignore" -REPEAT = "repeat" -REPEAT_ONE = "repeat_one" -SUBPATTERN = "subpattern" -MIN_REPEAT_ONE = "min_repeat_one" + MIN_REPEAT MAX_REPEAT +""") +del OPCODES[-2:] # remove MIN_REPEAT and MAX_REPEAT # positions -AT_BEGINNING = "at_beginning" -AT_BEGINNING_LINE = "at_beginning_line" -AT_BEGINNING_STRING = "at_beginning_string" -AT_BOUNDARY = "at_boundary" -AT_NON_BOUNDARY = "at_non_boundary" -AT_END = "at_end" -AT_END_LINE = "at_end_line" -AT_END_STRING = "at_end_string" -AT_LOC_BOUNDARY = "at_loc_boundary" -AT_LOC_NON_BOUNDARY = "at_loc_non_boundary" -AT_UNI_BOUNDARY = "at_uni_boundary" -AT_UNI_NON_BOUNDARY = "at_uni_non_boundary" +ATCODES = _makecodes(""" + AT_BEGINNING AT_BEGINNING_LINE AT_BEGINNING_STRING + AT_BOUNDARY AT_NON_BOUNDARY + AT_END AT_END_LINE AT_END_STRING + AT_LOC_BOUNDARY AT_LOC_NON_BOUNDARY + AT_UNI_BOUNDARY AT_UNI_NON_BOUNDARY +""") # categories -CATEGORY_DIGIT = "category_digit" -CATEGORY_NOT_DIGIT = "category_not_digit" -CATEGORY_SPACE = "category_space" -CATEGORY_NOT_SPACE = "category_not_space" -CATEGORY_WORD = "category_word" -CATEGORY_NOT_WORD = "category_not_word" -CATEGORY_LINEBREAK = "category_linebreak" -CATEGORY_NOT_LINEBREAK = "category_not_linebreak" -CATEGORY_LOC_WORD = "category_loc_word" -CATEGORY_LOC_NOT_WORD = "category_loc_not_word" -CATEGORY_UNI_DIGIT = "category_uni_digit" -CATEGORY_UNI_NOT_DIGIT = "category_uni_not_digit" -CATEGORY_UNI_SPACE = "category_uni_space" -CATEGORY_UNI_NOT_SPACE = "category_uni_not_space" -CATEGORY_UNI_WORD = "category_uni_word" -CATEGORY_UNI_NOT_WORD = "category_uni_not_word" -CATEGORY_UNI_LINEBREAK = "category_uni_linebreak" -CATEGORY_UNI_NOT_LINEBREAK = "category_uni_not_linebreak" +CHCODES = _makecodes(""" + CATEGORY_DIGIT CATEGORY_NOT_DIGIT + CATEGORY_SPACE CATEGORY_NOT_SPACE + CATEGORY_WORD CATEGORY_NOT_WORD + CATEGORY_LINEBREAK CATEGORY_NOT_LINEBREAK + CATEGORY_LOC_WORD CATEGORY_LOC_NOT_WORD + CATEGORY_UNI_DIGIT CATEGORY_UNI_NOT_DIGIT + CATEGORY_UNI_SPACE CATEGORY_UNI_NOT_SPACE + CATEGORY_UNI_WORD CATEGORY_UNI_NOT_WORD + CATEGORY_UNI_LINEBREAK CATEGORY_UNI_NOT_LINEBREAK +""") -OPCODES = [ - - # failure=0 success=1 (just because it looks better that way :-) - FAILURE, SUCCESS, - - ANY, ANY_ALL, - ASSERT, ASSERT_NOT, - AT, - BRANCH, - CALL, - CATEGORY, - CHARSET, BIGCHARSET, - GROUPREF, GROUPREF_EXISTS, GROUPREF_IGNORE, - IN, IN_IGNORE, - INFO, - JUMP, - LITERAL, LITERAL_IGNORE, - MARK, - MAX_UNTIL, - MIN_UNTIL, - NOT_LITERAL, NOT_LITERAL_IGNORE, - NEGATE, - RANGE, - REPEAT, - REPEAT_ONE, - SUBPATTERN, - MIN_REPEAT_ONE, - RANGE_IGNORE, - -] - -ATCODES = [ - AT_BEGINNING, AT_BEGINNING_LINE, AT_BEGINNING_STRING, AT_BOUNDARY, - AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING, - AT_LOC_BOUNDARY, AT_LOC_NON_BOUNDARY, AT_UNI_BOUNDARY, - AT_UNI_NON_BOUNDARY -] - -CHCODES = [ - CATEGORY_DIGIT, CATEGORY_NOT_DIGIT, CATEGORY_SPACE, - CATEGORY_NOT_SPACE, CATEGORY_WORD, CATEGORY_NOT_WORD, - CATEGORY_LINEBREAK, CATEGORY_NOT_LINEBREAK, CATEGORY_LOC_WORD, - CATEGORY_LOC_NOT_WORD, CATEGORY_UNI_DIGIT, CATEGORY_UNI_NOT_DIGIT, - CATEGORY_UNI_SPACE, CATEGORY_UNI_NOT_SPACE, CATEGORY_UNI_WORD, - CATEGORY_UNI_NOT_WORD, CATEGORY_UNI_LINEBREAK, - CATEGORY_UNI_NOT_LINEBREAK -] - -def makedict(list): - d = {} - i = 0 - for item in list: - d[item] = i - i = i + 1 - return d - -OPCODES = makedict(OPCODES) -ATCODES = makedict(ATCODES) -CHCODES = makedict(CHCODES) # replacement operations for "ignore case" mode OP_IGNORE = { @@ -220,9 +163,9 @@ SRE_INFO_CHARSET = 4 # pattern starts with character from given set if __name__ == "__main__": def dump(f, d, prefix): - items = sorted(d.items(), key=lambda a: a[1]) - for k, v in items: - f.write("#define %s_%s %s\n" % (prefix, k.upper(), v)) + items = sorted(d) + for item in items: + f.write("#define %s_%s %d\n" % (prefix, item, item)) f = open("sre_constants.h", "w") f.write("""\ /* diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index edc3ff143ac..768ed792ee1 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -13,7 +13,6 @@ # XXX: show string offset and offending character for all errors from sre_constants import * -from _sre import MAXREPEAT SPECIAL_CHARS = ".\\[{()*+?^$|" REPEAT_CHARS = "*+?{" @@ -103,24 +102,24 @@ class SubPattern: nl = True seqtypes = (tuple, list) for op, av in self.data: - print(level*" " + op, end='') + print(level*" " + str(op), end='') if op == IN: # member sublanguage print() for op, a in av: - print((level+1)*" " + op, a) + print((level+1)*" " + str(op), a) elif op == BRANCH: print() for i, a in enumerate(av[1]): if i: - print(level*" " + "or") + print(level*" " + "OR") a.dump(level+1) elif op == GROUPREF_EXISTS: condgroup, item_yes, item_no = av print('', condgroup) item_yes.dump(level+1) if item_no: - print(level*" " + "else") + print(level*" " + "ELSE") item_no.dump(level+1) elif isinstance(av, seqtypes): nl = False diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 4f7e1629e56..8baf3b4f59a 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1285,22 +1285,22 @@ class ReTests(unittest.TestCase): with captured_stdout() as out: re.compile(pat, re.DEBUG) dump = '''\ -subpattern 1 - literal 46 -subpattern None - branch - in - literal 99 - literal 104 - or - literal 112 - literal 121 -subpattern None - groupref_exists 1 - at at_end - else - literal 58 - literal 32 +SUBPATTERN 1 + LITERAL 46 +SUBPATTERN None + BRANCH + IN + LITERAL 99 + LITERAL 104 + OR + LITERAL 112 + LITERAL 121 +SUBPATTERN None + GROUPREF_EXISTS 1 + AT AT_END + ELSE + LITERAL 58 + LITERAL 32 ''' self.assertEqual(out.getvalue(), dump) # Debug output is output again even a second time (bypassing