From 76467ba6d6cac910523373efe967339f96b10c85 Mon Sep 17 00:00:00 2001
From: Jeremy Hylton
Date: Wed, 23 Aug 2006 21:14:03 +0000
Subject: [PATCH] Bug fixes large and small for tokenize.

Small: Always generate an NL or NEWLINE token following a COMMENT
token.  The old code did not generate an NL token if the comment was
on a line by itself.

Large: The output of untokenize() will now match the input exactly if
it is passed the full token sequence.  The old, crufty output is
still generated if a limited input sequence is provided, where
limited means that it does not include position information for
tokens.

Remaining bug: There is no CONTINUATION token (\) so there is no way
for untokenize() to handle such code.

Also, expanded the number of doctests in hopes of eventually removing
the old-style tests that compare against a golden file.

Bug fix candidate for Python 2.5.1.  (Sigh.)
---
 Lib/test/output/test_tokenize |  75 +++++++++++++++--------
 Lib/test/test_tokenize.py     |  74 +++++++++++++++++++----
 Lib/tokenize.py               | 110 ++++++++++++++++++++++++----------
 3 files changed, 193 insertions(+), 66 deletions(-)

diff --git a/Lib/test/output/test_tokenize b/Lib/test/output/test_tokenize
index b78a223475d..1d1c6a8c551 100644
--- a/Lib/test/output/test_tokenize
+++ b/Lib/test/output/test_tokenize
@@ -1,15 +1,23 @@
 test_tokenize
-1,0-1,35: COMMENT "# Tests for the 'tokenize' module.\n"
-2,0-2,43: COMMENT '# Large bits stolen from test_grammar.py. \n'
+1,0-1,34: COMMENT "# Tests for the 'tokenize' module."
+1,34-1,35: NL '\n'
+2,0-2,42: COMMENT '# Large bits stolen from test_grammar.py. '
+2,42-2,43: NL '\n'
 3,0-3,1: NL '\n'
-4,0-4,11: COMMENT '# Comments\n'
+4,0-4,10: COMMENT '# Comments'
+4,10-4,11: NL '\n'
 5,0-5,3: STRING '"#"'
 5,3-5,4: NEWLINE '\n'
-6,0-6,3: COMMENT "#'\n"
-7,0-7,3: COMMENT '#"\n'
-8,0-8,3: COMMENT '#\\\n'
-9,7-9,9: COMMENT '#\n'
-10,4-10,10: COMMENT '# abc\n'
+6,0-6,2: COMMENT "#'"
+6,2-6,3: NL '\n'
+7,0-7,2: COMMENT '#"'
+7,2-7,3: NL '\n'
+8,0-8,2: COMMENT '#\\'
+8,2-8,3: NL '\n'
+9,7-9,8: COMMENT '#'
+9,8-9,9: NL '\n'
+10,4-10,9: COMMENT '# abc'
+10,9-10,10: NL '\n'
 11,0-12,4: STRING "'''#\n#'''"
 12,4-12,5: NEWLINE '\n'
 13,0-13,1: NL '\n'
@@ -19,7 +27,8 @@ test_tokenize
 14,7-14,8: COMMENT '#'
 14,8-14,9: NEWLINE '\n'
 15,0-15,1: NL '\n'
-16,0-16,25: COMMENT '# Balancing continuation\n'
+16,0-16,24: COMMENT '# Balancing continuation'
+16,24-16,25: NL '\n'
 17,0-17,1: NL '\n'
 18,0-18,1: NAME 'a'
 18,2-18,3: OP '='
@@ -92,7 +101,8 @@ test_tokenize
 29,2-29,3: OP ')'
 29,3-29,4: NEWLINE '\n'
 30,0-30,1: NL '\n'
-31,0-31,37: COMMENT '# Backslash means line continuation:\n'
+31,0-31,36: COMMENT '# Backslash means line continuation:'
+31,36-31,37: NL '\n'
 32,0-32,1: NAME 'x'
 32,2-32,3: OP '='
 32,4-32,5: NUMBER '1'
@@ -100,13 +110,15 @@ test_tokenize
 33,2-33,3: NUMBER '1'
 33,3-33,4: NEWLINE '\n'
 34,0-34,1: NL '\n'
-35,0-35,55: COMMENT '# Backslash does not means continuation in comments :\\\n'
+35,0-35,54: COMMENT '# Backslash does not means continuation in comments :\\'
+35,54-35,55: NL '\n'
 36,0-36,1: NAME 'x'
 36,2-36,3: OP '='
 36,4-36,5: NUMBER '0'
 36,5-36,6: NEWLINE '\n'
 37,0-37,1: NL '\n'
-38,0-38,20: COMMENT '# Ordinary integers\n'
+38,0-38,19: COMMENT '# Ordinary integers'
+38,19-38,20: NL '\n'
 39,0-39,4: NUMBER '0xff'
 39,5-39,7: OP '<>'
 39,8-39,11: NUMBER '255'
@@ -137,7 +149,8 @@ test_tokenize
 44,15-44,16: NUMBER '1'
 44,16-44,17: NEWLINE '\n'
 45,0-45,1: NL '\n'
-46,0-46,16: COMMENT '# Long integers\n'
+46,0-46,15: COMMENT '# Long integers'
+46,15-46,16: NL '\n'
 47,0-47,1: NAME 'x'
 47,2-47,3: OP '='
 47,4-47,6: NUMBER '0L'
@@ -171,7 +184,8 @@ test_tokenize
 54,4-54,35: NUMBER '123456789012345678901234567890l'
 54,35-54,36: NEWLINE '\n'
 55,0-55,1: NL '\n'
-56,0-56,25: COMMENT '# Floating-point numbers\n'
+56,0-56,24: COMMENT '# Floating-point numbers'
+56,24-56,25: NL '\n'
 57,0-57,1: NAME 'x'
 57,2-57,3: OP '='
 57,4-57,8: NUMBER '3.14'
@@ -184,7 +198,8 @@ test_tokenize
 59,2-59,3: OP '='
 59,4-59,9: NUMBER '0.314'
 59,9-59,10: NEWLINE '\n'
-60,0-60,18: COMMENT '# XXX x = 000.314\n'
+60,0-60,17: COMMENT '# XXX x = 000.314'
+60,17-60,18: NL '\n'
 61,0-61,1: NAME 'x'
 61,2-61,3: OP '='
 61,4-61,8: NUMBER '.314'
@@ -218,7 +233,8 @@ test_tokenize
 68,4-68,9: NUMBER '3.1e4'
 68,9-68,10: NEWLINE '\n'
 69,0-69,1: NL '\n'
-70,0-70,18: COMMENT '# String literals\n'
+70,0-70,17: COMMENT '# String literals'
+70,17-70,18: NL '\n'
 71,0-71,1: NAME 'x'
 71,2-71,3: OP '='
 71,4-71,6: STRING "''"
@@ -366,7 +382,8 @@ test_tokenize
 125,6-126,3: STRING "uR'''spam\n'''"
 126,3-126,4: NEWLINE '\n'
 127,0-127,1: NL '\n'
-128,0-128,14: COMMENT '# Indentation\n'
+128,0-128,13: COMMENT '# Indentation'
+128,13-128,14: NL '\n'
 129,0-129,2: NAME 'if'
 129,3-129,4: NUMBER '1'
 129,4-129,5: OP ':'
@@ -438,7 +455,8 @@ test_tokenize
 142,14-142,15: NUMBER '2'
 142,15-142,16: NEWLINE '\n'
 143,0-143,1: NL '\n'
-144,0-144,12: COMMENT '# Operators\n'
+144,0-144,11: COMMENT '# Operators'
+144,11-144,12: NL '\n'
 145,0-145,1: NL '\n'
 146,0-146,0: DEDENT ''
 146,0-146,0: DEDENT ''
@@ -500,7 +518,8 @@ test_tokenize
 149,27-149,28: OP ')'
 149,28-149,29: NEWLINE '\n'
 150,0-150,1: NL '\n'
-151,0-151,13: COMMENT '# comparison\n'
+151,0-151,12: COMMENT '# comparison'
+151,12-151,13: NL '\n'
 152,0-152,2: NAME 'if'
 152,3-152,4: NUMBER '1'
 152,5-152,6: OP '<'
@@ -531,7 +550,8 @@ test_tokenize
 152,67-152,71: NAME 'pass'
 152,71-152,72: NEWLINE '\n'
 153,0-153,1: NL '\n'
-154,0-154,9: COMMENT '# binary\n'
+154,0-154,8: COMMENT '# binary'
+154,8-154,9: NL '\n'
 155,0-155,1: NAME 'x'
 155,2-155,3: OP '='
 155,4-155,5: NUMBER '1'
@@ -551,7 +571,8 @@ test_tokenize
 157,8-157,9: NUMBER '1'
 157,9-157,10: NEWLINE '\n'
 158,0-158,1: NL '\n'
-159,0-159,8: COMMENT '# shift\n'
+159,0-159,7: COMMENT '# shift'
+159,7-159,8: NL '\n'
 160,0-160,1: NAME 'x'
 160,2-160,3: OP '='
 160,4-160,5: NUMBER '1'
@@ -561,7 +582,8 @@ test_tokenize
 160,14-160,15: NUMBER '1'
 160,15-160,16: NEWLINE '\n'
 161,0-161,1: NL '\n'
-162,0-162,11: COMMENT '# additive\n'
+162,0-162,10: COMMENT '# additive'
+162,10-162,11: NL '\n'
 163,0-163,1: NAME 'x'
 163,2-163,3: OP '='
 163,4-163,5: NUMBER '1'
@@ -575,7 +597,8 @@ test_tokenize
 163,20-163,21: NUMBER '1'
 163,21-163,22: NEWLINE '\n'
 164,0-164,1: NL '\n'
-165,0-165,17: COMMENT '# multiplicative\n'
+165,0-165,16: COMMENT '# multiplicative'
+165,16-165,17: NL '\n'
 166,0-166,1: NAME 'x'
 166,2-166,3: OP '='
 166,4-166,5: NUMBER '1'
@@ -587,7 +610,8 @@ test_tokenize
 166,16-166,17: NUMBER '1'
 166,17-166,18: NEWLINE '\n'
 167,0-167,1: NL '\n'
-168,0-168,8: COMMENT '# unary\n'
+168,0-168,7: COMMENT '# unary'
+168,7-168,8: NL '\n'
 169,0-169,1: NAME 'x'
 169,2-169,3: OP '='
 169,4-169,5: OP '~'
@@ -625,7 +649,8 @@ test_tokenize
 170,24-170,25: NUMBER '1'
 170,25-170,26: NEWLINE '\n'
 171,0-171,1: NL '\n'
-172,0-172,11: COMMENT '# selector\n'
+172,0-172,10: COMMENT '# selector'
+172,10-172,11: NL '\n'
 173,0-173,6: NAME 'import'
 173,7-173,10: NAME 'sys'
 173,10-173,11: OP ','
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index a0f61d7cf6a..86f1b9b404f 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -9,20 +9,73 @@ code, print out a table with the tokens. The ENDMARKER is omitted for
 brevity.
 
 >>> dump_tokens("1 + 1")
-NUMBER     '1'        (1, 0) (1, 1)
-OP         '+'        (1, 2) (1, 3)
-NUMBER     '1'        (1, 4) (1, 5)
+NUMBER     '1'           (1, 0) (1, 1)
+OP         '+'           (1, 2) (1, 3)
+NUMBER     '1'           (1, 4) (1, 5)
+
+A comment generates a token here, unlike in the parser module.  The
+comment token is followed by an NL or a NEWLINE token, depending on
+whether the line contains the completion of a statement.
+
+>>> dump_tokens("if False:\\n"
+...             "    # NL\\n"
+...             "    True = False # NEWLINE\\n")
+NAME       'if'          (1, 0) (1, 2)
+NAME       'False'       (1, 3) (1, 8)
+OP         ':'           (1, 8) (1, 9)
+NEWLINE    '\\n'          (1, 9) (1, 10)
+COMMENT    '# NL'        (2, 4) (2, 8)
+NL         '\\n'          (2, 8) (2, 9)
+INDENT     '    '        (3, 0) (3, 4)
+NAME       'True'        (3, 4) (3, 8)
+OP         '='           (3, 9) (3, 10)
+NAME       'False'       (3, 11) (3, 16)
+COMMENT    '# NEWLINE'   (3, 17) (3, 26)
+NEWLINE    '\\n'          (3, 26) (3, 27)
+DEDENT     ''            (4, 0) (4, 0)
+
 There will be a bunch more tests of specific source patterns.
 
 The tokenize module also defines an untokenize function that should
-regenerate the original program text from the tokens.  (It doesn't
-work very well at the moment.)
+regenerate the original program text from the tokens.
+
+There are some standard formatting practices that are easy to get right.
 
 >>> roundtrip("if x == 1:\\n"
 ...           "    print x\\n")
-if x ==1 :
-    print x
+if x == 1:
+    print x
+
+Some people use different formatting conventions, which makes
+untokenize a little trickier.  Note that this test involves trailing
+whitespace after the colon.  You can't see it, but it's there!
+
+>>> roundtrip("if x == 1 : \\n"
+...           "    print x\\n")
+if x == 1 : 
+    print x
+
+Comments need to go in the right place.
+
+>>> roundtrip("if x == 1:\\n"
+...           "    # A comment by itself.\\n"
+...           "    print x # Comment here, too.\\n"
+...           "    # Another comment.\\n"
+...           "after_if = True\\n")
+if x == 1:
+    # A comment by itself.
+    print x # Comment here, too.
+    # Another comment.
+after_if = True
+
+>>> roundtrip("if (x # The comments need to go in the right place\\n"
+...           "    == 1):\\n"
+...           "    print 'x == 1'\\n")
+if (x # The comments need to go in the right place
+    == 1):
+    print 'x == 1'
 
 """

 import os, glob, random
@@ -30,7 +83,7 @@ from cStringIO import StringIO
 from test.test_support import (verbose, findfile, is_resource_enabled,
                                TestFailed)
 from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
-                      ENDMARKER, NUMBER, NAME, OP, STRING)
+                      ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)
 
 # Test roundtrip for `untokenize`.  `f` is a file path.  The source code in f
 # is tokenized, converted back to source code via tokenize.untokenize(),
@@ -61,11 +114,12 @@ def dump_tokens(s):
         if type == ENDMARKER:
             break
         type = tok_name[type]
-        print "%(type)-10.10s %(token)-10.10r %(start)s %(end)s" % locals()
+        print "%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals()
 
 def roundtrip(s):
     f = StringIO(s)
-    print untokenize(generate_tokens(f.readline)),
+    source = untokenize(generate_tokens(f.readline))
+    print source,
 
 # This is an example from the docs, set up as a doctest.
 def decistmt(s):
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index a9be4cfe03e..ec9f63c6b67 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -159,14 +159,76 @@ def tokenize_loop(readline, tokeneater):
     for token_info in generate_tokens(readline):
         tokeneater(*token_info)
 
+class Untokenizer:
+
+    def __init__(self):
+        self.tokens = []
+        self.prev_row = 1
+        self.prev_col = 0
+
+    def add_whitespace(self, start):
+        # Emit newlines and spaces until the output catches up with the
+        # start position of the next token.
+        row, col = start
+        while row > self.prev_row:
+            self.tokens.append("\n")
+            self.prev_row += 1
+        col_offset = col - self.prev_col
+        if col_offset:
+            self.tokens.append(" " * col_offset)
+
+    def untokenize(self, iterable):
+        it = iter(iterable)
+        for t in it:
+            if len(t) == 2:
+                self.compat(t, it)
+                break
+            tok_type, token, start, end, line = t
+            self.add_whitespace(start)
+            self.tokens.append(token)
+            self.prev_row, self.prev_col = end
+            if tok_type in (NEWLINE, NL):
+                self.prev_row += 1
+                self.prev_col = 0
+        return "".join(self.tokens)
+
+    def compat(self, token, iterable):
+        # Old-style output for two-element tokens, which carry no
+        # position information.
+        startline = False
+        indents = []
+        toks_append = self.tokens.append
+        toknum, tokval = token
+        if toknum in (NAME, NUMBER):
+            tokval += ' '
+        if toknum in (NEWLINE, NL):
+            startline = True
+        # Emit the first token too; `iterable` continues after it.
+        toks_append(tokval)
+        for tok in iterable:
+            toknum, tokval = tok[:2]
+
+            if toknum in (NAME, NUMBER):
+                tokval += ' '
+
+            if toknum == INDENT:
+                indents.append(tokval)
+                continue
+            elif toknum == DEDENT:
+                indents.pop()
+                continue
+            elif toknum in (NEWLINE, NL):
+                startline = True
+            elif startline and indents:
+                toks_append(indents[-1])
+                startline = False
+            toks_append(tokval)
+
 def untokenize(iterable):
     """Transform tokens back into Python source code.
 
     Each element returned by the iterable must be a token sequence
-    with at least two elements, a token number and token value.
+    with at least two elements, a token number and token value.  If
+    only these two elements are provided, the resulting output is poor.
 
-    Round-trip invariant:
+    Round-trip invariant for full input:
+        Untokenized source will match input source exactly
+
+    Round-trip invariant for limited input:
         # Output text will tokenize back to the input
         t1 = [tok[:2] for tok in generate_tokens(f.readline)]
         newcode = untokenize(t1)
@@ -174,31 +236,8 @@ def untokenize(iterable):
         t2 = [tok[:2] for tok in generate_tokens(readline)]
         assert t1 == t2
     """
-
-    startline = False
-    indents = []
-    toks = []
-    toks_append = toks.append
-    for tok in iterable:
-        toknum, tokval = tok[:2]
-
-        if toknum in (NAME, NUMBER):
-            tokval += ' '
-
-        if toknum == INDENT:
-            indents.append(tokval)
-            continue
-        elif toknum == DEDENT:
-            indents.pop()
-            continue
-        elif toknum in (NEWLINE, COMMENT, NL):
-            startline = True
-        elif startline and indents:
-            toks_append(indents[-1])
-            startline = False
-        toks_append(tokval)
-    return ''.join(toks)
-
+    ut = Untokenizer()
+    return ut.untokenize(iterable)
 
 def generate_tokens(readline):
     """
@@ -237,7 +276,7 @@ def generate_tokens(readline):
             if endmatch:
                 pos = end = endmatch.end(0)
                 yield (STRING, contstr + line[:end],
-                       strstart, (lnum, end), contline + line)
+                           strstart, (lnum, end), contline + line)
                 contstr, needcont = '', 0
                 contline = None
             elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
@@ -263,7 +302,15 @@ def generate_tokens(readline):
             if pos == max: break
 
             if line[pos] in '#\r\n':           # skip comments or blank lines
-                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
+                if line[pos] == '#':
+                    comment_token = line[pos:].rstrip('\r\n')
+                    nl_pos = pos + len(comment_token)
+                    yield (COMMENT, comment_token,
+                           (lnum, pos), (lnum, pos + len(comment_token)), line)
+                    yield (NL, line[nl_pos:],
+                           (lnum, nl_pos), (lnum, len(line)), line)
+                else:
+                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                            (lnum, pos), (lnum, len(line)), line)
                 continue
@@ -294,9 +341,10 @@ def generate_tokens(readline):
                    (initial == '.' and token != '.'):      # ordinary number
                     yield (NUMBER, token, spos, epos, line)
                 elif initial in '\r\n':
-                    yield (parenlev > 0 and NL or NEWLINE,
-                           token, spos, epos, line)
+                    yield (NL if parenlev > 0 else NEWLINE,
+                           token, spos, epos, line)
                 elif initial == '#':
+                    assert not token.endswith("\n")
                     yield (COMMENT, token, spos, epos, line)
                 elif token in triple_quoted:
                     endprog = endprogs[token]
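
To make the two untokenize() modes concrete, here is a small usage
sketch.  It is illustrative only, not part of the patch; it assumes
Python 2.5 with this change applied, and the sample source string is
made up:

    from cStringIO import StringIO
    from tokenize import generate_tokens, untokenize

    source = "if x == 1 :\n    print x  # odd spacing, kept verbatim\n"

    # Full five-tuples (with position information): the output now
    # matches the input exactly, including the space before the colon
    # and the comment, which is followed by its own NEWLINE token.
    full = list(generate_tokens(StringIO(source).readline))
    assert untokenize(full) == source

    # Two-tuples (no position information): the old, approximate output
    # is generated, but it still tokenizes back to the same
    # (type, string) pairs.
    limited = [tok[:2] for tok in full]
    approx = untokenize(limited)
    again = [tok[:2] for tok in generate_tokens(StringIO(approx).readline)]
    assert limited == again

The limited mode cannot reconstruct spacing that the two-element
tokens never carried, which is why only the weaker, retokenization
invariant can hold for it.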