bpo-30299: Display the bytecode when compiling a regex in debug mode. (#1491)

`re.compile(..., re.DEBUG)` now displays the compiled bytecode in
human readable form.
This commit is contained in:
Serhiy Storchaka 2017-05-14 09:05:13 +03:00 committed by GitHub
parent 821a9d146b
commit 4ab6abfca4
3 changed files with 177 additions and 1 deletions

View File

@ -595,6 +595,150 @@ def _code(p, flags):
return code
def _hex_code(code):
return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code)
def dis(code):
    """Print a human-readable disassembly of the compiled pattern *code*.

    *code* is a sequence of integer code words (it is indexed, sliced and
    measured with ``len()``).  Nothing is returned; every line is written
    with ``print()``.

    Each instruction line starts with the offset of its opcode, followed by
    ':' if a previously printed instruction named that offset as its target
    and by '.' otherwise.  Instructions that carry a skip operand end with
    ``(to N)``, where N is the operand's own offset plus the skip.

    Raises ValueError if an opcode is not handled by any branch below.
    """
    import sys

    # Offsets that an already printed instruction referenced as a target.
    labels = set()
    # Current nesting depth; drives the indentation of the printed lines.
    level = 0
    # Width of the offset column: digits needed for the largest index.
    offset_width = len(str(len(code) - 1))

    def dis_(start, end):
        # Disassemble code[start:end] one nesting level deeper than the
        # caller.
        def print_(*args, to=None):
            # Print one instruction line.  The offset shown is the current
            # value of `start` in the enclosing dis_ call, which the main
            # loop rebinds before decoding each opcode.
            if to is not None:
                # Record the target so that its own line gets the ':' marker.
                labels.add(to)
                args += ('(to %d)' % (to,),)
            print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
                  end=' '*(level-1))
            print(*args)

        def print_2(*args):
            # Print a continuation line that has no offset column of its
            # own; the padding depends on offset_width and the current level.
            print(end=' '*(offset_width + 2*level))
            print(*args)

        nonlocal level
        level += 1
        i = start
        while i < end:
            # From here on `start` is the offset of the opcode being decoded
            # (this is what print_ displays); `i` is the read cursor.
            start = i
            op = code[i]
            i += 1
            op = OPCODES[op]
            if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
                      MAX_UNTIL, MIN_UNTIL, NEGATE):
                # Opcodes without operands.
                print_(op)
            elif op in (LITERAL, NOT_LITERAL,
                        LITERAL_IGNORE, NOT_LITERAL_IGNORE,
                        LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
                # One operand, shown both in hex and as the character
                # chr() gives for it.
                arg = code[i]
                i += 1
                print_(op, '%#02x (%r)' % (arg, chr(arg)))
            elif op is AT:
                # One operand indexing ATCODES; shown without the 'AT_'
                # name prefix.
                arg = code[i]
                i += 1
                arg = str(ATCODES[arg])
                assert arg[:3] == 'AT_'
                print_(op, arg[3:])
            elif op is CATEGORY:
                # One operand indexing CHCODES; shown without the
                # 'CATEGORY_' name prefix.
                arg = code[i]
                i += 1
                arg = str(CHCODES[arg])
                assert arg[:9] == 'CATEGORY_'
                print_(op, arg[9:])
            elif op in (IN, IN_IGNORE, IN_LOC_IGNORE):
                # The skip is relative to the operand's own offset.  The
                # words between the operand and the target are disassembled
                # as a nested block.
                skip = code[i]
                print_(op, skip, to=i+skip)
                dis_(i+1, i+skip)
                i += skip
            elif op in (RANGE, RANGE_IGNORE):
                # Two operands: the low and high bounds.
                lo, hi = code[i: i+2]
                i += 2
                print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
            elif op is CHARSET:
                # 256//_CODEBITS code words follow and are dumped in hex.
                print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
                i += 256//_CODEBITS
            elif op is BIGCHARSET:
                # The first operand is the number of hex-dumped chunks that
                # follow the mapping.
                arg = code[i]
                i += 1
                # The next 256//_sre.CODESIZE code words are split back into
                # their individual bytes (native byte order) and shown as a
                # list of ints.
                mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
                                        for x in code[i: i + 256//_sre.CODESIZE]))
                print_(op, arg, mapping)
                i += 256//_sre.CODESIZE
                level += 1
                for j in range(arg):
                    # One chunk of 256//_CODEBITS code words per line.
                    print_2(_hex_code(code[i: i + 256//_CODEBITS]))
                    i += 256//_CODEBITS
                level -= 1
            elif op in (MARK, GROUPREF, GROUPREF_IGNORE):
                # One plain integer operand.
                arg = code[i]
                i += 1
                print_(op, arg)
            elif op is JUMP:
                # Only the operand is consumed; whatever follows is decoded
                # by later iterations of this loop.
                skip = code[i]
                print_(op, skip, to=i+skip)
                i += 1
            elif op is BRANCH:
                # A chain of alternatives: each skip word points at the skip
                # word of the next alternative, and a zero skip ends the
                # chain.
                skip = code[i]
                print_(op, skip, to=i+skip)
                while skip:
                    dis_(i+1, i+skip)
                    i += skip
                    # Make print_ show the offset of the next skip word.
                    start = i
                    skip = code[i]
                    if skip:
                        print_('branch', skip, to=i+skip)
                    else:
                        # The terminating zero word is displayed as FAILURE.
                        print_(FAILURE)
                # Step over the terminating zero word.
                i += 1
            elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE):
                # Three operands; the words after them, up to the target,
                # are disassembled as a nested block.
                skip, min, max = code[i: i+3]
                if max == MAXREPEAT:
                    # Show the sentinel by name instead of as a number.
                    max = 'MAXREPEAT'
                print_(op, skip, min, max, to=i+skip)
                dis_(i+3, i+skip)
                i += skip
            elif op is GROUPREF_EXISTS:
                # Two operands are consumed; the code that follows is
                # decoded by later iterations of this loop.
                arg, skip = code[i: i+2]
                print_(op, arg, skip, to=i+skip)
                i += 2
            elif op in (ASSERT, ASSERT_NOT):
                # Two operands; the words after them, up to the target, are
                # disassembled as a nested block.
                skip, arg = code[i: i+2]
                print_(op, skip, arg, to=i+skip)
                dis_(i+2, i+skip)
                i += skip
            elif op is INFO:
                # Four fixed operands, optionally followed by prefix data
                # and/or a charset, depending on the bits set in flags.
                skip, flags, min, max = code[i: i+4]
                if max == MAXREPEAT:
                    max = 'MAXREPEAT'
                print_(op, skip, bin(flags), min, max, to=i+skip)
                # The INFO line is already printed, so `start` is reused
                # below as a cursor over the optional data.
                start = i+4
                if flags & SRE_INFO_PREFIX:
                    prefix_len, prefix_skip = code[i+4: i+6]
                    print_2(' prefix_skip', prefix_skip)
                    start = i + 6
                    prefix = code[start: start+prefix_len]
                    print_2(' prefix',
                            '[%s]' % ', '.join('%#02x' % x for x in prefix),
                            '(%r)' % ''.join(map(chr, prefix)))
                    start += prefix_len
                    # The overlap table has the same length as the prefix.
                    print_2(' overlap', code[start: start+prefix_len])
                    start += prefix_len
                if flags & SRE_INFO_CHARSET:
                    # The rest of the INFO block is disassembled as a nested
                    # block under an 'in' heading.
                    level += 1
                    print_2('in')
                    dis_(start, i+skip)
                    level -= 1
                i += skip
            else:
                raise ValueError(op)

        level -= 1

    dis_(0, len(code))
def compile(p, flags=0):
# internal: convert pattern list to internal format
@ -606,7 +750,9 @@ def compile(p, flags=0):
code = _code(p, flags)
# print(code)
if flags & SRE_FLAG_DEBUG:
print()
dis(code)
# map in either direction
groupindex = p.pattern.groupdict

View File

@ -1688,10 +1688,12 @@ class ReTests(unittest.TestCase):
self.assertEqual(m.group(1), "")
self.assertEqual(m.group(2), "y")
@cpython_only
def test_debug_flag(self):
pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
with captured_stdout() as out:
re.compile(pat, re.DEBUG)
self.maxDiff = None
dump = '''\
SUBPATTERN 1 0 0
LITERAL 46
@ -1707,6 +1709,31 @@ GROUPREF_EXISTS 1
ELSE
LITERAL 58
LITERAL 32
0. INFO 8 0b1 2 5 (to 9)
prefix_skip 0
prefix [0x2e] ('.')
overlap [0]
9: MARK 0
11. LITERAL 0x2e ('.')
13. MARK 1
15. BRANCH 10 (to 26)
17. IN 6 (to 24)
19. LITERAL 0x63 ('c')
21. LITERAL 0x68 ('h')
23. FAILURE
24: JUMP 9 (to 34)
26: branch 7 (to 33)
27. LITERAL 0x70 ('p')
29. LITERAL 0x79 ('y')
31. JUMP 2 (to 34)
33: FAILURE
34: GROUPREF_EXISTS 0 6 (to 41)
37. AT END
39. JUMP 5 (to 45)
41: LITERAL 0x3a (':')
43. LITERAL 0x20 (' ')
45: SUCCESS
'''
self.assertEqual(out.getvalue(), dump)
# Debug output is output again even a second time (bypassing

View File

@ -323,6 +323,9 @@ Extension Modules
Library
-------
- bpo-30299: Compiling a regular expression in debug mode on CPython now displays
the compiled bytecode in human-readable form.
- bpo-30048: Fixed ``Task.cancel()`` can be ignored when the task is
running coroutine and the coroutine returned without any more ``await``.