Issue #22434: Constants in sre_constants are now named constants (enum-like).

This commit is contained in:
Serhiy Storchaka 2014-11-09 20:48:36 +02:00
parent bf764a1912
commit c7f7d3897e
4 changed files with 127 additions and 186 deletions

View File

@ -13,7 +13,6 @@
import _sre import _sre
import sre_parse import sre_parse
from sre_constants import * from sre_constants import *
from _sre import MAXREPEAT
assert _sre.MAGIC == MAGIC, "SRE module mismatch" assert _sre.MAGIC == MAGIC, "SRE module mismatch"
@ -38,65 +37,65 @@ def _compile(code, pattern, flags):
for op, av in pattern: for op, av in pattern:
if op in LITERAL_CODES: if op in LITERAL_CODES:
if flags & SRE_FLAG_IGNORECASE: if flags & SRE_FLAG_IGNORECASE:
emit(OPCODES[OP_IGNORE[op]]) emit(OP_IGNORE[op])
emit(_sre.getlower(av, flags)) emit(_sre.getlower(av, flags))
else: else:
emit(OPCODES[op]) emit(op)
emit(av) emit(av)
elif op is IN: elif op is IN:
if flags & SRE_FLAG_IGNORECASE: if flags & SRE_FLAG_IGNORECASE:
emit(OPCODES[OP_IGNORE[op]]) emit(OP_IGNORE[op])
def fixup(literal, flags=flags): def fixup(literal, flags=flags):
return _sre.getlower(literal, flags) return _sre.getlower(literal, flags)
else: else:
emit(OPCODES[op]) emit(op)
fixup = None fixup = None
skip = _len(code); emit(0) skip = _len(code); emit(0)
_compile_charset(av, flags, code, fixup) _compile_charset(av, flags, code, fixup)
code[skip] = _len(code) - skip code[skip] = _len(code) - skip
elif op is ANY: elif op is ANY:
if flags & SRE_FLAG_DOTALL: if flags & SRE_FLAG_DOTALL:
emit(OPCODES[ANY_ALL]) emit(ANY_ALL)
else: else:
emit(OPCODES[ANY]) emit(ANY)
elif op in REPEATING_CODES: elif op in REPEATING_CODES:
if flags & SRE_FLAG_TEMPLATE: if flags & SRE_FLAG_TEMPLATE:
raise error("internal: unsupported template operator") raise error("internal: unsupported template operator")
elif _simple(av) and op is not REPEAT: elif _simple(av) and op is not REPEAT:
if op is MAX_REPEAT: if op is MAX_REPEAT:
emit(OPCODES[REPEAT_ONE]) emit(REPEAT_ONE)
else: else:
emit(OPCODES[MIN_REPEAT_ONE]) emit(MIN_REPEAT_ONE)
skip = _len(code); emit(0) skip = _len(code); emit(0)
emit(av[0]) emit(av[0])
emit(av[1]) emit(av[1])
_compile(code, av[2], flags) _compile(code, av[2], flags)
emit(OPCODES[SUCCESS]) emit(SUCCESS)
code[skip] = _len(code) - skip code[skip] = _len(code) - skip
else: else:
emit(OPCODES[REPEAT]) emit(REPEAT)
skip = _len(code); emit(0) skip = _len(code); emit(0)
emit(av[0]) emit(av[0])
emit(av[1]) emit(av[1])
_compile(code, av[2], flags) _compile(code, av[2], flags)
code[skip] = _len(code) - skip code[skip] = _len(code) - skip
if op is MAX_REPEAT: if op is MAX_REPEAT:
emit(OPCODES[MAX_UNTIL]) emit(MAX_UNTIL)
else: else:
emit(OPCODES[MIN_UNTIL]) emit(MIN_UNTIL)
elif op is SUBPATTERN: elif op is SUBPATTERN:
if av[0]: if av[0]:
emit(OPCODES[MARK]) emit(MARK)
emit((av[0]-1)*2) emit((av[0]-1)*2)
# _compile_info(code, av[1], flags) # _compile_info(code, av[1], flags)
_compile(code, av[1], flags) _compile(code, av[1], flags)
if av[0]: if av[0]:
emit(OPCODES[MARK]) emit(MARK)
emit((av[0]-1)*2+1) emit((av[0]-1)*2+1)
elif op in SUCCESS_CODES: elif op in SUCCESS_CODES:
emit(OPCODES[op]) emit(op)
elif op in ASSERT_CODES: elif op in ASSERT_CODES:
emit(OPCODES[op]) emit(op)
skip = _len(code); emit(0) skip = _len(code); emit(0)
if av[0] >= 0: if av[0] >= 0:
emit(0) # look ahead emit(0) # look ahead
@ -106,57 +105,57 @@ def _compile(code, pattern, flags):
raise error("look-behind requires fixed-width pattern") raise error("look-behind requires fixed-width pattern")
emit(lo) # look behind emit(lo) # look behind
_compile(code, av[1], flags) _compile(code, av[1], flags)
emit(OPCODES[SUCCESS]) emit(SUCCESS)
code[skip] = _len(code) - skip code[skip] = _len(code) - skip
elif op is CALL: elif op is CALL:
emit(OPCODES[op]) emit(op)
skip = _len(code); emit(0) skip = _len(code); emit(0)
_compile(code, av, flags) _compile(code, av, flags)
emit(OPCODES[SUCCESS]) emit(SUCCESS)
code[skip] = _len(code) - skip code[skip] = _len(code) - skip
elif op is AT: elif op is AT:
emit(OPCODES[op]) emit(op)
if flags & SRE_FLAG_MULTILINE: if flags & SRE_FLAG_MULTILINE:
av = AT_MULTILINE.get(av, av) av = AT_MULTILINE.get(av, av)
if flags & SRE_FLAG_LOCALE: if flags & SRE_FLAG_LOCALE:
av = AT_LOCALE.get(av, av) av = AT_LOCALE.get(av, av)
elif flags & SRE_FLAG_UNICODE: elif flags & SRE_FLAG_UNICODE:
av = AT_UNICODE.get(av, av) av = AT_UNICODE.get(av, av)
emit(ATCODES[av]) emit(av)
elif op is BRANCH: elif op is BRANCH:
emit(OPCODES[op]) emit(op)
tail = [] tail = []
tailappend = tail.append tailappend = tail.append
for av in av[1]: for av in av[1]:
skip = _len(code); emit(0) skip = _len(code); emit(0)
# _compile_info(code, av, flags) # _compile_info(code, av, flags)
_compile(code, av, flags) _compile(code, av, flags)
emit(OPCODES[JUMP]) emit(JUMP)
tailappend(_len(code)); emit(0) tailappend(_len(code)); emit(0)
code[skip] = _len(code) - skip code[skip] = _len(code) - skip
emit(0) # end of branch emit(0) # end of branch
for tail in tail: for tail in tail:
code[tail] = _len(code) - tail code[tail] = _len(code) - tail
elif op is CATEGORY: elif op is CATEGORY:
emit(OPCODES[op]) emit(op)
if flags & SRE_FLAG_LOCALE: if flags & SRE_FLAG_LOCALE:
av = CH_LOCALE[av] av = CH_LOCALE[av]
elif flags & SRE_FLAG_UNICODE: elif flags & SRE_FLAG_UNICODE:
av = CH_UNICODE[av] av = CH_UNICODE[av]
emit(CHCODES[av]) emit(av)
elif op is GROUPREF: elif op is GROUPREF:
if flags & SRE_FLAG_IGNORECASE: if flags & SRE_FLAG_IGNORECASE:
emit(OPCODES[OP_IGNORE[op]]) emit(OP_IGNORE[op])
else: else:
emit(OPCODES[op]) emit(op)
emit(av-1) emit(av-1)
elif op is GROUPREF_EXISTS: elif op is GROUPREF_EXISTS:
emit(OPCODES[op]) emit(op)
emit(av[0]-1) emit(av[0]-1)
skipyes = _len(code); emit(0) skipyes = _len(code); emit(0)
_compile(code, av[1], flags) _compile(code, av[1], flags)
if av[2]: if av[2]:
emit(OPCODES[JUMP]) emit(JUMP)
skipno = _len(code); emit(0) skipno = _len(code); emit(0)
code[skipyes] = _len(code) - skipyes + 1 code[skipyes] = _len(code) - skipyes + 1
_compile(code, av[2], flags) _compile(code, av[2], flags)
@ -170,7 +169,7 @@ def _compile_charset(charset, flags, code, fixup=None):
# compile charset subprogram # compile charset subprogram
emit = code.append emit = code.append
for op, av in _optimize_charset(charset, fixup): for op, av in _optimize_charset(charset, fixup):
emit(OPCODES[op]) emit(op)
if op is NEGATE: if op is NEGATE:
pass pass
elif op is LITERAL: elif op is LITERAL:
@ -184,14 +183,14 @@ def _compile_charset(charset, flags, code, fixup=None):
code.extend(av) code.extend(av)
elif op is CATEGORY: elif op is CATEGORY:
if flags & SRE_FLAG_LOCALE: if flags & SRE_FLAG_LOCALE:
emit(CHCODES[CH_LOCALE[av]]) emit(CH_LOCALE[av])
elif flags & SRE_FLAG_UNICODE: elif flags & SRE_FLAG_UNICODE:
emit(CHCODES[CH_UNICODE[av]]) emit(CH_UNICODE[av])
else: else:
emit(CHCODES[av]) emit(av)
else: else:
raise error("internal: unsupported set operator") raise error("internal: unsupported set operator")
emit(OPCODES[FAILURE]) emit(FAILURE)
def _optimize_charset(charset, fixup): def _optimize_charset(charset, fixup):
# internal: optimize character set # internal: optimize character set
@ -414,7 +413,7 @@ def _compile_info(code, pattern, flags):
## print "*** CHARSET", charset ## print "*** CHARSET", charset
# add an info block # add an info block
emit = code.append emit = code.append
emit(OPCODES[INFO]) emit(INFO)
skip = len(code); emit(0) skip = len(code); emit(0)
# literal flag # literal flag
mask = 0 mask = 0
@ -460,7 +459,7 @@ def _code(p, flags):
# compile the pattern # compile the pattern
_compile(code, p.data, flags) _compile(code, p.data, flags)
code.append(OPCODES[SUCCESS]) code.append(SUCCESS)
return code return code
@ -475,7 +474,7 @@ def compile(p, flags=0):
code = _code(p, flags) code = _code(p, flags)
# print code # print(code)
# map in either direction # map in either direction
groupindex = p.pattern.groupdict groupindex = p.pattern.groupdict

View File

@ -23,138 +23,81 @@ from _sre import MAXREPEAT, MAXGROUPS
class error(Exception): class error(Exception):
pass pass
class _NamedIntConstant(int):
def __new__(cls, value, name):
self = super(_NamedIntConstant, cls).__new__(cls, value)
self.name = name
return self
def __str__(self):
return self.name
__repr__ = __str__
MAXREPEAT = _NamedIntConstant(MAXREPEAT, 'MAXREPEAT')
def _makecodes(names):
    """Build numbered named constants from a whitespace-separated string.

    Each word in *names* becomes a ``_NamedIntConstant`` whose value is the
    word's position (starting at 0). Every constant is also published in
    this module's globals under its own name. Returns the list of
    constants in order.
    """
    codes = []
    for index, word in enumerate(names.strip().split()):
        codes.append(_NamedIntConstant(index, word))
    namespace = globals()
    for code in codes:
        namespace[code.name] = code
    return codes
# operators # operators
# failure=0 success=1 (just because it looks better that way :-)
OPCODES = _makecodes("""
FAILURE SUCCESS
FAILURE = "failure" ANY ANY_ALL
SUCCESS = "success" ASSERT ASSERT_NOT
AT
BRANCH
CALL
CATEGORY
CHARSET BIGCHARSET
GROUPREF GROUPREF_EXISTS GROUPREF_IGNORE
IN IN_IGNORE
INFO
JUMP
LITERAL LITERAL_IGNORE
MARK
MAX_UNTIL
MIN_UNTIL
NOT_LITERAL NOT_LITERAL_IGNORE
NEGATE
RANGE
REPEAT
REPEAT_ONE
SUBPATTERN
MIN_REPEAT_ONE
RANGE_IGNORE
ANY = "any" MIN_REPEAT MAX_REPEAT
ANY_ALL = "any_all" """)
ASSERT = "assert" del OPCODES[-2:] # remove MIN_REPEAT and MAX_REPEAT
ASSERT_NOT = "assert_not"
AT = "at"
BIGCHARSET = "bigcharset"
BRANCH = "branch"
CALL = "call"
CATEGORY = "category"
CHARSET = "charset"
GROUPREF = "groupref"
GROUPREF_IGNORE = "groupref_ignore"
GROUPREF_EXISTS = "groupref_exists"
IN = "in"
IN_IGNORE = "in_ignore"
INFO = "info"
JUMP = "jump"
LITERAL = "literal"
LITERAL_IGNORE = "literal_ignore"
MARK = "mark"
MAX_REPEAT = "max_repeat"
MAX_UNTIL = "max_until"
MIN_REPEAT = "min_repeat"
MIN_UNTIL = "min_until"
NEGATE = "negate"
NOT_LITERAL = "not_literal"
NOT_LITERAL_IGNORE = "not_literal_ignore"
RANGE = "range"
RANGE_IGNORE = "range_ignore"
REPEAT = "repeat"
REPEAT_ONE = "repeat_one"
SUBPATTERN = "subpattern"
MIN_REPEAT_ONE = "min_repeat_one"
# positions # positions
AT_BEGINNING = "at_beginning" ATCODES = _makecodes("""
AT_BEGINNING_LINE = "at_beginning_line" AT_BEGINNING AT_BEGINNING_LINE AT_BEGINNING_STRING
AT_BEGINNING_STRING = "at_beginning_string" AT_BOUNDARY AT_NON_BOUNDARY
AT_BOUNDARY = "at_boundary" AT_END AT_END_LINE AT_END_STRING
AT_NON_BOUNDARY = "at_non_boundary" AT_LOC_BOUNDARY AT_LOC_NON_BOUNDARY
AT_END = "at_end" AT_UNI_BOUNDARY AT_UNI_NON_BOUNDARY
AT_END_LINE = "at_end_line" """)
AT_END_STRING = "at_end_string"
AT_LOC_BOUNDARY = "at_loc_boundary"
AT_LOC_NON_BOUNDARY = "at_loc_non_boundary"
AT_UNI_BOUNDARY = "at_uni_boundary"
AT_UNI_NON_BOUNDARY = "at_uni_non_boundary"
# categories # categories
CATEGORY_DIGIT = "category_digit" CHCODES = _makecodes("""
CATEGORY_NOT_DIGIT = "category_not_digit" CATEGORY_DIGIT CATEGORY_NOT_DIGIT
CATEGORY_SPACE = "category_space" CATEGORY_SPACE CATEGORY_NOT_SPACE
CATEGORY_NOT_SPACE = "category_not_space" CATEGORY_WORD CATEGORY_NOT_WORD
CATEGORY_WORD = "category_word" CATEGORY_LINEBREAK CATEGORY_NOT_LINEBREAK
CATEGORY_NOT_WORD = "category_not_word" CATEGORY_LOC_WORD CATEGORY_LOC_NOT_WORD
CATEGORY_LINEBREAK = "category_linebreak" CATEGORY_UNI_DIGIT CATEGORY_UNI_NOT_DIGIT
CATEGORY_NOT_LINEBREAK = "category_not_linebreak" CATEGORY_UNI_SPACE CATEGORY_UNI_NOT_SPACE
CATEGORY_LOC_WORD = "category_loc_word" CATEGORY_UNI_WORD CATEGORY_UNI_NOT_WORD
CATEGORY_LOC_NOT_WORD = "category_loc_not_word" CATEGORY_UNI_LINEBREAK CATEGORY_UNI_NOT_LINEBREAK
CATEGORY_UNI_DIGIT = "category_uni_digit" """)
CATEGORY_UNI_NOT_DIGIT = "category_uni_not_digit"
CATEGORY_UNI_SPACE = "category_uni_space"
CATEGORY_UNI_NOT_SPACE = "category_uni_not_space"
CATEGORY_UNI_WORD = "category_uni_word"
CATEGORY_UNI_NOT_WORD = "category_uni_not_word"
CATEGORY_UNI_LINEBREAK = "category_uni_linebreak"
CATEGORY_UNI_NOT_LINEBREAK = "category_uni_not_linebreak"
OPCODES = [
# failure=0 success=1 (just because it looks better that way :-)
FAILURE, SUCCESS,
ANY, ANY_ALL,
ASSERT, ASSERT_NOT,
AT,
BRANCH,
CALL,
CATEGORY,
CHARSET, BIGCHARSET,
GROUPREF, GROUPREF_EXISTS, GROUPREF_IGNORE,
IN, IN_IGNORE,
INFO,
JUMP,
LITERAL, LITERAL_IGNORE,
MARK,
MAX_UNTIL,
MIN_UNTIL,
NOT_LITERAL, NOT_LITERAL_IGNORE,
NEGATE,
RANGE,
REPEAT,
REPEAT_ONE,
SUBPATTERN,
MIN_REPEAT_ONE,
RANGE_IGNORE,
]
ATCODES = [
AT_BEGINNING, AT_BEGINNING_LINE, AT_BEGINNING_STRING, AT_BOUNDARY,
AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING,
AT_LOC_BOUNDARY, AT_LOC_NON_BOUNDARY, AT_UNI_BOUNDARY,
AT_UNI_NON_BOUNDARY
]
CHCODES = [
CATEGORY_DIGIT, CATEGORY_NOT_DIGIT, CATEGORY_SPACE,
CATEGORY_NOT_SPACE, CATEGORY_WORD, CATEGORY_NOT_WORD,
CATEGORY_LINEBREAK, CATEGORY_NOT_LINEBREAK, CATEGORY_LOC_WORD,
CATEGORY_LOC_NOT_WORD, CATEGORY_UNI_DIGIT, CATEGORY_UNI_NOT_DIGIT,
CATEGORY_UNI_SPACE, CATEGORY_UNI_NOT_SPACE, CATEGORY_UNI_WORD,
CATEGORY_UNI_NOT_WORD, CATEGORY_UNI_LINEBREAK,
CATEGORY_UNI_NOT_LINEBREAK
]
def makedict(list):
    """Map each item of *list* to its position in the sequence.

    Returns a dict of ``item -> index`` with indices starting at 0. If an
    item occurs more than once, the index of its last occurrence is kept.
    """
    # The parameter name shadows the builtin; it is kept unchanged so that
    # any caller passing it by keyword continues to work.
    return {item: index for index, item in enumerate(list)}
OPCODES = makedict(OPCODES)
ATCODES = makedict(ATCODES)
CHCODES = makedict(CHCODES)
# replacement operations for "ignore case" mode # replacement operations for "ignore case" mode
OP_IGNORE = { OP_IGNORE = {
@ -220,9 +163,9 @@ SRE_INFO_CHARSET = 4 # pattern starts with character from given set
if __name__ == "__main__": if __name__ == "__main__":
def dump(f, d, prefix): def dump(f, d, prefix):
items = sorted(d.items(), key=lambda a: a[1]) items = sorted(d)
for k, v in items: for item in items:
f.write("#define %s_%s %s\n" % (prefix, k.upper(), v)) f.write("#define %s_%s %d\n" % (prefix, item, item))
f = open("sre_constants.h", "w") f = open("sre_constants.h", "w")
f.write("""\ f.write("""\
/* /*

View File

@ -13,7 +13,6 @@
# XXX: show string offset and offending character for all errors # XXX: show string offset and offending character for all errors
from sre_constants import * from sre_constants import *
from _sre import MAXREPEAT
SPECIAL_CHARS = ".\\[{()*+?^$|" SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{" REPEAT_CHARS = "*+?{"
@ -103,24 +102,24 @@ class SubPattern:
nl = True nl = True
seqtypes = (tuple, list) seqtypes = (tuple, list)
for op, av in self.data: for op, av in self.data:
print(level*" " + op, end='') print(level*" " + str(op), end='')
if op == IN: if op == IN:
# member sublanguage # member sublanguage
print() print()
for op, a in av: for op, a in av:
print((level+1)*" " + op, a) print((level+1)*" " + str(op), a)
elif op == BRANCH: elif op == BRANCH:
print() print()
for i, a in enumerate(av[1]): for i, a in enumerate(av[1]):
if i: if i:
print(level*" " + "or") print(level*" " + "OR")
a.dump(level+1) a.dump(level+1)
elif op == GROUPREF_EXISTS: elif op == GROUPREF_EXISTS:
condgroup, item_yes, item_no = av condgroup, item_yes, item_no = av
print('', condgroup) print('', condgroup)
item_yes.dump(level+1) item_yes.dump(level+1)
if item_no: if item_no:
print(level*" " + "else") print(level*" " + "ELSE")
item_no.dump(level+1) item_no.dump(level+1)
elif isinstance(av, seqtypes): elif isinstance(av, seqtypes):
nl = False nl = False

View File

@ -1285,22 +1285,22 @@ class ReTests(unittest.TestCase):
with captured_stdout() as out: with captured_stdout() as out:
re.compile(pat, re.DEBUG) re.compile(pat, re.DEBUG)
dump = '''\ dump = '''\
subpattern 1 SUBPATTERN 1
literal 46 LITERAL 46
subpattern None SUBPATTERN None
branch BRANCH
in IN
literal 99 LITERAL 99
literal 104 LITERAL 104
or OR
literal 112 LITERAL 112
literal 121 LITERAL 121
subpattern None SUBPATTERN None
groupref_exists 1 GROUPREF_EXISTS 1
at at_end AT AT_END
else ELSE
literal 58 LITERAL 58
literal 32 LITERAL 32
''' '''
self.assertEqual(out.getvalue(), dump) self.assertEqual(out.getvalue(), dump)
# Debug output is output again even a second time (bypassing # Debug output is output again even a second time (bypassing