From 4ab6abfca4d6e444cca04821b24701cde6993f4e Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka
Date: Sun, 14 May 2017 09:05:13 +0300
Subject: [PATCH] bpo-30299: Display a bytecode when compile a regex in debug mode. (#1491)

`re.compile(..., re.DEBUG)` now displays the compiled bytecode in human readable form.
---
 Lib/sre_compile.py  | 148 +++++++++++++++++++++++++++++++++++++++++++-
 Lib/test/test_re.py |  27 ++++++++
 Misc/NEWS           |   3 +
 3 files changed, 177 insertions(+), 1 deletion(-)

diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index aeb89bcc7b4..144620c6d1b 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -595,6 +595,150 @@ def _code(p, flags):
 
     return code
 
+def _hex_code(code):
+    return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code)
+
+def dis(code):
+    import sys
+
+    labels = set()
+    level = 0
+    offset_width = len(str(len(code) - 1))
+
+    def dis_(start, end):
+        def print_(*args, to=None):
+            if to is not None:
+                labels.add(to)
+                args += ('(to %d)' % (to,),)
+            print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
+                  end='  '*(level-1))
+            print(*args)
+
+        def print_2(*args):
+            print(end=' '*(offset_width + 2*level))
+            print(*args)
+
+        nonlocal level
+        level += 1
+        i = start
+        while i < end:
+            start = i
+            op = code[i]
+            i += 1
+            op = OPCODES[op]
+            if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
+                      MAX_UNTIL, MIN_UNTIL, NEGATE):
+                print_(op)
+            elif op in (LITERAL, NOT_LITERAL,
+                        LITERAL_IGNORE, NOT_LITERAL_IGNORE,
+                        LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
+                arg = code[i]
+                i += 1
+                print_(op, '%#02x (%r)' % (arg, chr(arg)))
+            elif op is AT:
+                arg = code[i]
+                i += 1
+                arg = str(ATCODES[arg])
+                assert arg[:3] == 'AT_'
+                print_(op, arg[3:])
+            elif op is CATEGORY:
+                arg = code[i]
+                i += 1
+                arg = str(CHCODES[arg])
+                assert arg[:9] == 'CATEGORY_'
+                print_(op, arg[9:])
+            elif op in (IN, IN_IGNORE, IN_LOC_IGNORE):
+                skip = code[i]
+                print_(op, skip, to=i+skip)
+                dis_(i+1, i+skip)
+                i += skip
+            elif op in (RANGE, RANGE_IGNORE):
+                lo, hi = code[i: i+2]
+                i += 2
+                print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
+            elif op is CHARSET:
+                print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
+                i += 256//_CODEBITS
+            elif op is BIGCHARSET:
+                arg = code[i]
+                i += 1
+                mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
+                                        for x in code[i: i + 256//_sre.CODESIZE]))
+                print_(op, arg, mapping)
+                i += 256//_sre.CODESIZE
+                level += 1
+                for j in range(arg):
+                    print_2(_hex_code(code[i: i + 256//_CODEBITS]))
+                    i += 256//_CODEBITS
+                level -= 1
+            elif op in (MARK, GROUPREF, GROUPREF_IGNORE):
+                arg = code[i]
+                i += 1
+                print_(op, arg)
+            elif op is JUMP:
+                skip = code[i]
+                print_(op, skip, to=i+skip)
+                i += 1
+            elif op is BRANCH:
+                skip = code[i]
+                print_(op, skip, to=i+skip)
+                while skip:
+                    dis_(i+1, i+skip)
+                    i += skip
+                    start = i
+                    skip = code[i]
+                    if skip:
+                        print_('branch', skip, to=i+skip)
+                    else:
+                        print_(FAILURE)
+                i += 1
+            elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE):
+                skip, min, max = code[i: i+3]
+                if max == MAXREPEAT:
+                    max = 'MAXREPEAT'
+                print_(op, skip, min, max, to=i+skip)
+                dis_(i+3, i+skip)
+                i += skip
+            elif op is GROUPREF_EXISTS:
+                arg, skip = code[i: i+2]
+                print_(op, arg, skip, to=i+skip)
+                i += 2
+            elif op in (ASSERT, ASSERT_NOT):
+                skip, arg = code[i: i+2]
+                print_(op, skip, arg, to=i+skip)
+                dis_(i+2, i+skip)
+                i += skip
+            elif op is INFO:
+                skip, flags, min, max = code[i: i+4]
+                if max == MAXREPEAT:
+                    max = 'MAXREPEAT'
+                print_(op, skip, bin(flags), min, max, to=i+skip)
+                start = i+4
+                if flags & SRE_INFO_PREFIX:
+                    prefix_len, prefix_skip = code[i+4: i+6]
+                    print_2('  prefix_skip', prefix_skip)
+                    start = i + 6
+                    prefix = code[start: start+prefix_len]
+                    print_2('  prefix',
+                            '[%s]' % ', '.join('%#02x' % x for x in prefix),
+                            '(%r)' % ''.join(map(chr, prefix)))
+                    start += prefix_len
+                    print_2('  overlap', code[start: start+prefix_len])
+                    start += prefix_len
+                if flags & SRE_INFO_CHARSET:
+                    level += 1
+                    print_2('in')
+                    dis_(start, i+skip)
+                    level -= 1
+                i += skip
+            else:
+                raise ValueError(op)
+
+        level -= 1
+
+    dis_(0, len(code))
+
+
 def compile(p, flags=0):
     # internal: convert pattern list to internal format
 
@@ -606,7 +750,9 @@ def compile(p, flags=0):
 
     code = _code(p, flags)
 
-    # print(code)
+    if flags & SRE_FLAG_DEBUG:
+        print()
+        dis(code)
 
     # map in either direction
     groupindex = p.pattern.groupdict
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 5d36b54680d..1bb26540547 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -1688,10 +1688,12 @@ class ReTests(unittest.TestCase):
         self.assertEqual(m.group(1), "")
         self.assertEqual(m.group(2), "y")
 
+    @cpython_only
     def test_debug_flag(self):
         pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
         with captured_stdout() as out:
             re.compile(pat, re.DEBUG)
+        self.maxDiff = None
         dump = '''\
 SUBPATTERN 1 0 0
   LITERAL 46
@@ -1707,6 +1709,31 @@ GROUPREF_EXISTS 1
 ELSE
   LITERAL 58
   LITERAL 32
+
+ 0. INFO 8 0b1 2 5 (to 9)
+      prefix_skip 0
+      prefix [0x2e] ('.')
+      overlap [0]
+ 9: MARK 0
+11. LITERAL 0x2e ('.')
+13. MARK 1
+15. BRANCH 10 (to 26)
+17.   IN 6 (to 24)
+19.     LITERAL 0x63 ('c')
+21.     LITERAL 0x68 ('h')
+23.     FAILURE
+24:   JUMP 9 (to 34)
+26: branch 7 (to 33)
+27.   LITERAL 0x70 ('p')
+29.   LITERAL 0x79 ('y')
+31.   JUMP 2 (to 34)
+33: FAILURE
+34: GROUPREF_EXISTS 0 6 (to 41)
+37. AT END
+39. JUMP 5 (to 45)
+41: LITERAL 0x3a (':')
+43. LITERAL 0x20 (' ')
+45: SUCCESS
 '''
         self.assertEqual(out.getvalue(), dump)
         # Debug output is output again even a second time (bypassing
diff --git a/Misc/NEWS b/Misc/NEWS
index e6b4ced1a26..bf19b25c48e 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -323,6 +323,9 @@ Extension Modules
 Library
 -------
 
+- bpo-30299: Compiling regular expression in debug mode on CPython now displays
+  the compiled bytecode in human readable form.
+
 - bpo-30048: Fixed ``Task.cancel()`` can be ignored when the task is
   running coroutine and the coroutine returned without any more ``await``.
 