Bug fixes large and small for tokenize.

Small: Always generate an NL or NEWLINE token following
       a COMMENT token.  The old code did not generate an NL token if
       the comment was on a line by itself.
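
       For instance (a minimal sketch against the Python 2 tokenize API;
       the sample source string is arbitrary):

           from cStringIO import StringIO
           from tokenize import generate_tokens, tok_name

           for tok in generate_tokens(StringIO("# a standalone comment\n").readline):
               print tok_name[tok[0]], repr(tok[1])
           # With this fix: COMMENT '# a standalone comment', then NL '\n',
           # then ENDMARKER ''.  Previously the trailing '\n' was swallowed
           # into the COMMENT token and no NL was emitted.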

Large: The output of untokenize() will now match the
       input exactly if it is passed the full token sequence.  The
       old, crufty output is still generated if a limited input
       sequence is provided, where limited means that it does not
       include position information for tokens.
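
       A sketch of both paths (Python 2 API; the sample source is arbitrary):

           from cStringIO import StringIO
           from tokenize import generate_tokens, untokenize

           source = "if x == 1 :\n    print x\n"
           full = list(generate_tokens(StringIO(source).readline))

           # Full 5-tuples, with position information: exact round trip.
           assert untokenize(full) == source

           # Limited 2-tuples, no positions: the old approximate output,
           # something like 'if x ==1 :\n    print x \n'.
           print repr(untokenize([tok[:2] for tok in full]))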

Remaining bug: There is no CONTINUATION token (\), so there is no way
for untokenize() to reproduce code that uses backslash line continuation.
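
For example (a sketch; the backslash never appears in any token, so
untokenize() has nothing to emit for it):

    from cStringIO import StringIO
    from tokenize import generate_tokens, untokenize, tok_name

    source = "x = 1 + \\\n    1\n"
    toks = list(generate_tokens(StringIO(source).readline))
    print [tok_name[t[0]] for t in toks]   # no token covers the backslash
    print repr(untokenize(toks))           # does not match the input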

Also, expanded the number of doctests in hopes of eventually removing
the old-style tests that compare against a golden file.

Bug fix candidate for Python 2.5.1. (Sigh.)
Jeremy Hylton 2006-08-23 21:14:03 +00:00
parent 20362a820b
commit 76467ba6d6
3 changed files with 193 additions and 66 deletions

Lib/test/output/test_tokenize

@@ -1,15 +1,23 @@
test_tokenize
1,0-1,35: COMMENT "# Tests for the 'tokenize' module.\n"
2,0-2,43: COMMENT '# Large bits stolen from test_grammar.py. \n'
1,0-1,34: COMMENT "# Tests for the 'tokenize' module."
1,34-1,35: NL '\n'
2,0-2,42: COMMENT '# Large bits stolen from test_grammar.py. '
2,42-2,43: NL '\n'
3,0-3,1: NL '\n'
4,0-4,11: COMMENT '# Comments\n'
4,0-4,10: COMMENT '# Comments'
4,10-4,11: NL '\n'
5,0-5,3: STRING '"#"'
5,3-5,4: NEWLINE '\n'
6,0-6,3: COMMENT "#'\n"
7,0-7,3: COMMENT '#"\n'
8,0-8,3: COMMENT '#\\\n'
9,7-9,9: COMMENT '#\n'
10,4-10,10: COMMENT '# abc\n'
6,0-6,2: COMMENT "#'"
6,2-6,3: NL '\n'
7,0-7,2: COMMENT '#"'
7,2-7,3: NL '\n'
8,0-8,2: COMMENT '#\\'
8,2-8,3: NL '\n'
9,7-9,8: COMMENT '#'
9,8-9,9: NL '\n'
10,4-10,9: COMMENT '# abc'
10,9-10,10: NL '\n'
11,0-12,4: STRING "'''#\n#'''"
12,4-12,5: NEWLINE '\n'
13,0-13,1: NL '\n'
@@ -19,7 +27,8 @@ test_tokenize
14,7-14,8: COMMENT '#'
14,8-14,9: NEWLINE '\n'
15,0-15,1: NL '\n'
16,0-16,25: COMMENT '# Balancing continuation\n'
16,0-16,24: COMMENT '# Balancing continuation'
16,24-16,25: NL '\n'
17,0-17,1: NL '\n'
18,0-18,1: NAME 'a'
18,2-18,3: OP '='
@@ -92,7 +101,8 @@ test_tokenize
29,2-29,3: OP ')'
29,3-29,4: NEWLINE '\n'
30,0-30,1: NL '\n'
31,0-31,37: COMMENT '# Backslash means line continuation:\n'
31,0-31,36: COMMENT '# Backslash means line continuation:'
31,36-31,37: NL '\n'
32,0-32,1: NAME 'x'
32,2-32,3: OP '='
32,4-32,5: NUMBER '1'
@@ -100,13 +110,15 @@ test_tokenize
33,2-33,3: NUMBER '1'
33,3-33,4: NEWLINE '\n'
34,0-34,1: NL '\n'
35,0-35,55: COMMENT '# Backslash does not means continuation in comments :\\\n'
35,0-35,54: COMMENT '# Backslash does not means continuation in comments :\\'
35,54-35,55: NL '\n'
36,0-36,1: NAME 'x'
36,2-36,3: OP '='
36,4-36,5: NUMBER '0'
36,5-36,6: NEWLINE '\n'
37,0-37,1: NL '\n'
38,0-38,20: COMMENT '# Ordinary integers\n'
38,0-38,19: COMMENT '# Ordinary integers'
38,19-38,20: NL '\n'
39,0-39,4: NUMBER '0xff'
39,5-39,7: OP '<>'
39,8-39,11: NUMBER '255'
@@ -137,7 +149,8 @@ test_tokenize
44,15-44,16: NUMBER '1'
44,16-44,17: NEWLINE '\n'
45,0-45,1: NL '\n'
46,0-46,16: COMMENT '# Long integers\n'
46,0-46,15: COMMENT '# Long integers'
46,15-46,16: NL '\n'
47,0-47,1: NAME 'x'
47,2-47,3: OP '='
47,4-47,6: NUMBER '0L'
@@ -171,7 +184,8 @@ test_tokenize
54,4-54,35: NUMBER '123456789012345678901234567890l'
54,35-54,36: NEWLINE '\n'
55,0-55,1: NL '\n'
56,0-56,25: COMMENT '# Floating-point numbers\n'
56,0-56,24: COMMENT '# Floating-point numbers'
56,24-56,25: NL '\n'
57,0-57,1: NAME 'x'
57,2-57,3: OP '='
57,4-57,8: NUMBER '3.14'
@@ -184,7 +198,8 @@ test_tokenize
59,2-59,3: OP '='
59,4-59,9: NUMBER '0.314'
59,9-59,10: NEWLINE '\n'
60,0-60,18: COMMENT '# XXX x = 000.314\n'
60,0-60,17: COMMENT '# XXX x = 000.314'
60,17-60,18: NL '\n'
61,0-61,1: NAME 'x'
61,2-61,3: OP '='
61,4-61,8: NUMBER '.314'
@@ -218,7 +233,8 @@ test_tokenize
68,4-68,9: NUMBER '3.1e4'
68,9-68,10: NEWLINE '\n'
69,0-69,1: NL '\n'
70,0-70,18: COMMENT '# String literals\n'
70,0-70,17: COMMENT '# String literals'
70,17-70,18: NL '\n'
71,0-71,1: NAME 'x'
71,2-71,3: OP '='
71,4-71,6: STRING "''"
@@ -366,7 +382,8 @@ test_tokenize
125,6-126,3: STRING "uR'''spam\n'''"
126,3-126,4: NEWLINE '\n'
127,0-127,1: NL '\n'
128,0-128,14: COMMENT '# Indentation\n'
128,0-128,13: COMMENT '# Indentation'
128,13-128,14: NL '\n'
129,0-129,2: NAME 'if'
129,3-129,4: NUMBER '1'
129,4-129,5: OP ':'
@@ -438,7 +455,8 @@ test_tokenize
142,14-142,15: NUMBER '2'
142,15-142,16: NEWLINE '\n'
143,0-143,1: NL '\n'
144,0-144,12: COMMENT '# Operators\n'
144,0-144,11: COMMENT '# Operators'
144,11-144,12: NL '\n'
145,0-145,1: NL '\n'
146,0-146,0: DEDENT ''
146,0-146,0: DEDENT ''
@@ -500,7 +518,8 @@ test_tokenize
149,27-149,28: OP ')'
149,28-149,29: NEWLINE '\n'
150,0-150,1: NL '\n'
151,0-151,13: COMMENT '# comparison\n'
151,0-151,12: COMMENT '# comparison'
151,12-151,13: NL '\n'
152,0-152,2: NAME 'if'
152,3-152,4: NUMBER '1'
152,5-152,6: OP '<'
@@ -531,7 +550,8 @@ test_tokenize
152,67-152,71: NAME 'pass'
152,71-152,72: NEWLINE '\n'
153,0-153,1: NL '\n'
154,0-154,9: COMMENT '# binary\n'
154,0-154,8: COMMENT '# binary'
154,8-154,9: NL '\n'
155,0-155,1: NAME 'x'
155,2-155,3: OP '='
155,4-155,5: NUMBER '1'
@@ -551,7 +571,8 @@ test_tokenize
157,8-157,9: NUMBER '1'
157,9-157,10: NEWLINE '\n'
158,0-158,1: NL '\n'
159,0-159,8: COMMENT '# shift\n'
159,0-159,7: COMMENT '# shift'
159,7-159,8: NL '\n'
160,0-160,1: NAME 'x'
160,2-160,3: OP '='
160,4-160,5: NUMBER '1'
@@ -561,7 +582,8 @@ test_tokenize
160,14-160,15: NUMBER '1'
160,15-160,16: NEWLINE '\n'
161,0-161,1: NL '\n'
162,0-162,11: COMMENT '# additive\n'
162,0-162,10: COMMENT '# additive'
162,10-162,11: NL '\n'
163,0-163,1: NAME 'x'
163,2-163,3: OP '='
163,4-163,5: NUMBER '1'
@@ -575,7 +597,8 @@ test_tokenize
163,20-163,21: NUMBER '1'
163,21-163,22: NEWLINE '\n'
164,0-164,1: NL '\n'
165,0-165,17: COMMENT '# multiplicative\n'
165,0-165,16: COMMENT '# multiplicative'
165,16-165,17: NL '\n'
166,0-166,1: NAME 'x'
166,2-166,3: OP '='
166,4-166,5: NUMBER '1'
@@ -587,7 +610,8 @@ test_tokenize
166,16-166,17: NUMBER '1'
166,17-166,18: NEWLINE '\n'
167,0-167,1: NL '\n'
168,0-168,8: COMMENT '# unary\n'
168,0-168,7: COMMENT '# unary'
168,7-168,8: NL '\n'
169,0-169,1: NAME 'x'
169,2-169,3: OP '='
169,4-169,5: OP '~'
@@ -625,7 +649,8 @@ test_tokenize
170,24-170,25: NUMBER '1'
170,25-170,26: NEWLINE '\n'
171,0-171,1: NL '\n'
172,0-172,11: COMMENT '# selector\n'
172,0-172,10: COMMENT '# selector'
172,10-172,11: NL '\n'
173,0-173,6: NAME 'import'
173,7-173,10: NAME 'sys'
173,10-173,11: OP ','
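
The expected-output lines above are in tokenize's "srow,scol-erow,ecol: TYPE repr(token)"
table format. A rough sketch of how such lines can be produced (print_token here is an
illustrative helper mirroring the module's default token printer):

    from cStringIO import StringIO
    import tokenize

    def print_token(ttype, token, start, end, line):
        # One "srow,scol-erow,ecol: TYPE 'token'" line per token.
        (srow, scol), (erow, ecol) = start, end
        print "%d,%d-%d,%d:\t%s\t%r" % (srow, scol, erow, ecol,
                                        tokenize.tok_name[ttype], token)

    # With this fix, a bare comment line produces COMMENT and NL rows:
    tokenize.tokenize(StringIO("# Comments\n").readline, print_token)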

Lib/test/test_tokenize.py

@@ -9,20 +9,73 @@ code, print out a table with the tokens. The ENDMARKER is omitted for
brevity.
>>> dump_tokens("1 + 1")
NUMBER '1' (1, 0) (1, 1)
OP '+' (1, 2) (1, 3)
NUMBER '1' (1, 4) (1, 5)
NUMBER '1' (1, 0) (1, 1)
OP '+' (1, 2) (1, 3)
NUMBER '1' (1, 4) (1, 5)
A comment generates a token here, unlike in the parser module. The
comment token is followed by an NL or a NEWLINE token, depending on
whether the line contains the completion of a statement.
>>> dump_tokens("if False:\\n"
... " # NL\\n"
... " True = False # NEWLINE\\n")
NAME 'if' (1, 0) (1, 2)
NAME 'False' (1, 3) (1, 8)
OP ':' (1, 8) (1, 9)
NEWLINE '\\n' (1, 9) (1, 10)
COMMENT '# NL' (2, 4) (2, 8)
NL '\\n' (2, 8) (2, 9)
INDENT ' ' (3, 0) (3, 4)
NAME 'True' (3, 4) (3, 8)
OP '=' (3, 9) (3, 10)
NAME 'False' (3, 11) (3, 16)
COMMENT '# NEWLINE' (3, 17) (3, 26)
NEWLINE '\\n' (3, 26) (3, 27)
DEDENT '' (4, 0) (4, 0)
There will be a bunch more tests of specific source patterns.
The tokenize module also defines an untokenize function that should
regenerate the original program text from the tokens. (It doesn't
work very well at the moment.)
regenerate the original program text from the tokens.
There are some standard formatting practices that are easy to get right.
>>> roundtrip("if x == 1:\\n"
... " print x\\n")
if x ==1 :
print x
if x == 1:
print x
Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon. You can't see it, but it's there!
>>> roundtrip("if x == 1 : \\n"
... " print x\\n")
if x == 1 :
print x
Comments need to go in the right place.
>>> roundtrip("if x == 1:\\n"
... " # A comment by itself.\\n"
... " print x # Comment here, too.\\n"
... " # Another comment.\\n"
... "after_if = True\\n")
if x == 1:
# A comment by itself.
print x # Comment here, too.
# Another comment.
after_if = True
>>> roundtrip("if (x # The comments need to go in the right place\\n"
... " == 1):\\n"
... " print 'x == 1'\\n")
if (x # The comments need to go in the right place
== 1):
print 'x == 1'
"""
import os, glob, random
@@ -30,7 +83,7 @@ from cStringIO import StringIO
from test.test_support import (verbose, findfile, is_resource_enabled,
TestFailed)
from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
ENDMARKER, NUMBER, NAME, OP, STRING)
ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)
# Test roundtrip for `untokenize`. `f` is a file path. The source code in f
# is tokenized, converted back to source code via tokenize.untokenize(),
@@ -61,11 +114,12 @@ def dump_tokens(s):
if type == ENDMARKER:
break
type = tok_name[type]
print "%(type)-10.10s %(token)-10.10r %(start)s %(end)s" % locals()
print "%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals()
def roundtrip(s):
f = StringIO(s)
print untokenize(generate_tokens(f.readline)),
source = untokenize(generate_tokens(f.readline))
print source,
# This is an example from the docs, set up as a doctest.
def decistmt(s):

Lib/tokenize.py

@@ -159,14 +159,76 @@ def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        while row > self.prev_row:
            self.tokens.append("\n")
            self.prev_row += 1
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum in (NAME, NUMBER):
                tokval += ' '
            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant:
    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
@@ -174,31 +236,8 @@ def untokenize(iterable):
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    startline = False
    indents = []
    toks = []
    toks_append = toks.append
    for tok in iterable:
        toknum, tokval = tok[:2]
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum == INDENT:
            indents.append(tokval)
            continue
        elif toknum == DEDENT:
            indents.pop()
            continue
        elif toknum in (NEWLINE, COMMENT, NL):
            startline = True
        elif startline and indents:
            toks_append(indents[-1])
            startline = False
        toks_append(tokval)
    return ''.join(toks)
    ut = Untokenizer()
    return ut.untokenize(iterable)
def generate_tokens(readline):
"""
@@ -237,7 +276,7 @@ def generate_tokens(readline):
if endmatch:
pos = end = endmatch.end(0)
yield (STRING, contstr + line[:end],
strstart, (lnum, end), contline + line)
strstart, (lnum, end), contline + line)
contstr, needcont = '', 0
contline = None
elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
@@ -263,7 +302,15 @@ def generate_tokens(readline):
if pos == max: break
if line[pos] in '#\r\n': # skip comments or blank lines
yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
if line[pos] == '#':
comment_token = line[pos:].rstrip('\r\n')
nl_pos = pos + len(comment_token)
yield (COMMENT, comment_token,
(lnum, pos), (lnum, pos + len(comment_token)), line)
yield (NL, line[nl_pos:],
(lnum, nl_pos), (lnum, len(line)), line)
else:
yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
(lnum, pos), (lnum, len(line)), line)
continue
@@ -294,9 +341,10 @@ def generate_tokens(readline):
(initial == '.' and token != '.'): # ordinary number
yield (NUMBER, token, spos, epos, line)
elif initial in '\r\n':
yield (parenlev > 0 and NL or NEWLINE,
token, spos, epos, line)
yield (NL if parenlev > 0 else NEWLINE,
token, spos, epos, line)
elif initial == '#':
assert not token.endswith("\n")
yield (COMMENT, token, spos, epos, line)
elif token in triple_quoted:
endprog = endprogs[token]