Bug fixes large and small for tokenize.

Small: Always generate an NL or NEWLINE token following
       a COMMENT token.  The old code did not generate an NL token if
       the comment was on a line by itself.
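
       For instance (a minimal sketch against the Python 2 tokenize API;
       the sample source string is arbitrary):

           from cStringIO import StringIO
           from tokenize import generate_tokens, tok_name

           for tok in generate_tokens(StringIO("# a standalone comment\n").readline):
               print tok_name[tok[0]], repr(tok[1])
           # With this fix: COMMENT '# a standalone comment', then NL '\n',
           # then ENDMARKER ''.  Previously the trailing '\n' was swallowed
           # into the COMMENT token and no NL was emitted.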

Large: The output of untokenize() will now match the
       input exactly if it is passed the full token sequence.  The
       old, crufty output is still generated if a limited input
       sequence is provided, where limited means that it does not
       include position information for tokens.
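
       A sketch of both paths (Python 2 API; the sample source is arbitrary):

           from cStringIO import StringIO
           from tokenize import generate_tokens, untokenize

           source = "if x == 1 :\n    print x\n"
           full = list(generate_tokens(StringIO(source).readline))

           # Full 5-tuples, with position information: exact round trip.
           assert untokenize(full) == source

           # Limited 2-tuples, no positions: the old approximate output,
           # something like 'if x ==1 :\n    print x \n'.
           print repr(untokenize([tok[:2] for tok in full]))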

Remaining bug: There is no CONTINUATION token (\), so there is no way
for untokenize() to reproduce code that uses backslash line continuation.
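
For example (a sketch; the backslash never appears in any token, so
untokenize() has nothing to emit for it):

    from cStringIO import StringIO
    from tokenize import generate_tokens, untokenize, tok_name

    source = "x = 1 + \\\n    1\n"
    toks = list(generate_tokens(StringIO(source).readline))
    print [tok_name[t[0]] for t in toks]   # no token covers the backslash
    print repr(untokenize(toks))           # does not match the input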

Also, expanded the number of doctests in hopes of eventually removing
the old-style tests that compare against a golden file.

Bug fix candidate for Python 2.5.1. (Sigh.)
Jeremy Hylton 2006-08-23 21:14:03 +00:00
parent 20362a820b
commit 76467ba6d6
3 changed files with 193 additions and 66 deletions

Lib/test/output/test_tokenize

@@ -1,15 +1,23 @@
test_tokenize
1,0-1,35: COMMENT "# Tests for the 'tokenize' module.\n"
2,0-2,43: COMMENT '# Large bits stolen from test_grammar.py. \n'
1,0-1,34: COMMENT "# Tests for the 'tokenize' module."
1,34-1,35: NL '\n'
2,0-2,42: COMMENT '# Large bits stolen from test_grammar.py. '
2,42-2,43: NL '\n'
3,0-3,1: NL '\n'
4,0-4,11: COMMENT '# Comments\n'
4,0-4,10: COMMENT '# Comments'
4,10-4,11: NL '\n'
5,0-5,3: STRING '"#"'
5,3-5,4: NEWLINE '\n'
6,0-6,3: COMMENT "#'\n"
7,0-7,3: COMMENT '#"\n'
8,0-8,3: COMMENT '#\\\n'
9,7-9,9: COMMENT '#\n'
10,4-10,10: COMMENT '# abc\n'
6,0-6,2: COMMENT "#'"
6,2-6,3: NL '\n'
7,0-7,2: COMMENT '#"'
7,2-7,3: NL '\n'
8,0-8,2: COMMENT '#\\'
8,2-8,3: NL '\n'
9,7-9,8: COMMENT '#'
9,8-9,9: NL '\n'
10,4-10,9: COMMENT '# abc'
10,9-10,10: NL '\n'
11,0-12,4: STRING "'''#\n#'''"
12,4-12,5: NEWLINE '\n'
13,0-13,1: NL '\n'
@@ -19,7 +27,8 @@ test_tokenize
14,7-14,8: COMMENT '#'
14,8-14,9: NEWLINE '\n'
15,0-15,1: NL '\n'
16,0-16,25: COMMENT '# Balancing continuation\n'
16,0-16,24: COMMENT '# Balancing continuation'
16,24-16,25: NL '\n'
17,0-17,1: NL '\n'
18,0-18,1: NAME 'a'
18,2-18,3: OP '='
@@ -92,7 +101,8 @@ test_tokenize
29,2-29,3: OP ')'
29,3-29,4: NEWLINE '\n'
30,0-30,1: NL '\n'
31,0-31,37: COMMENT '# Backslash means line continuation:\n'
31,0-31,36: COMMENT '# Backslash means line continuation:'
31,36-31,37: NL '\n'
32,0-32,1: NAME 'x'
32,2-32,3: OP '='
32,4-32,5: NUMBER '1'
@@ -100,13 +110,15 @@ test_tokenize
33,2-33,3: NUMBER '1'
33,3-33,4: NEWLINE '\n'
34,0-34,1: NL '\n'
35,0-35,55: COMMENT '# Backslash does not means continuation in comments :\\\n'
35,0-35,54: COMMENT '# Backslash does not means continuation in comments :\\'
35,54-35,55: NL '\n'
36,0-36,1: NAME 'x'
36,2-36,3: OP '='
36,4-36,5: NUMBER '0'
36,5-36,6: NEWLINE '\n'
37,0-37,1: NL '\n'
38,0-38,20: COMMENT '# Ordinary integers\n'
38,0-38,19: COMMENT '# Ordinary integers'
38,19-38,20: NL '\n'
39,0-39,4: NUMBER '0xff'
39,5-39,7: OP '<>'
39,8-39,11: NUMBER '255'
@@ -137,7 +149,8 @@ test_tokenize
44,15-44,16: NUMBER '1'
44,16-44,17: NEWLINE '\n'
45,0-45,1: NL '\n'
46,0-46,16: COMMENT '# Long integers\n'
46,0-46,15: COMMENT '# Long integers'
46,15-46,16: NL '\n'
47,0-47,1: NAME 'x'
47,2-47,3: OP '='
47,4-47,6: NUMBER '0L'
@@ -171,7 +184,8 @@ test_tokenize
54,4-54,35: NUMBER '123456789012345678901234567890l'
54,35-54,36: NEWLINE '\n'
55,0-55,1: NL '\n'
56,0-56,25: COMMENT '# Floating-point numbers\n'
56,0-56,24: COMMENT '# Floating-point numbers'
56,24-56,25: NL '\n'
57,0-57,1: NAME 'x'
57,2-57,3: OP '='
57,4-57,8: NUMBER '3.14'
@@ -184,7 +198,8 @@ test_tokenize
59,2-59,3: OP '='
59,4-59,9: NUMBER '0.314'
59,9-59,10: NEWLINE '\n'
60,0-60,18: COMMENT '# XXX x = 000.314\n'
60,0-60,17: COMMENT '# XXX x = 000.314'
60,17-60,18: NL '\n'
61,0-61,1: NAME 'x'
61,2-61,3: OP '='
61,4-61,8: NUMBER '.314'
@@ -218,7 +233,8 @@ test_tokenize
68,4-68,9: NUMBER '3.1e4'
68,9-68,10: NEWLINE '\n'
69,0-69,1: NL '\n'
70,0-70,18: COMMENT '# String literals\n'
70,0-70,17: COMMENT '# String literals'
70,17-70,18: NL '\n'
71,0-71,1: NAME 'x'
71,2-71,3: OP '='
71,4-71,6: STRING "''"
@@ -366,7 +382,8 @@ test_tokenize
125,6-126,3: STRING "uR'''spam\n'''"
126,3-126,4: NEWLINE '\n'
127,0-127,1: NL '\n'
128,0-128,14: COMMENT '# Indentation\n'
128,0-128,13: COMMENT '# Indentation'
128,13-128,14: NL '\n'
129,0-129,2: NAME 'if'
129,3-129,4: NUMBER '1'
129,4-129,5: OP ':'
@@ -438,7 +455,8 @@ test_tokenize
142,14-142,15: NUMBER '2'
142,15-142,16: NEWLINE '\n'
143,0-143,1: NL '\n'
144,0-144,12: COMMENT '# Operators\n'
144,0-144,11: COMMENT '# Operators'
144,11-144,12: NL '\n'
145,0-145,1: NL '\n'
146,0-146,0: DEDENT ''
146,0-146,0: DEDENT ''
@@ -500,7 +518,8 @@ test_tokenize
149,27-149,28: OP ')'
149,28-149,29: NEWLINE '\n'
150,0-150,1: NL '\n'
151,0-151,13: COMMENT '# comparison\n'
151,0-151,12: COMMENT '# comparison'
151,12-151,13: NL '\n'
152,0-152,2: NAME 'if'
152,3-152,4: NUMBER '1'
152,5-152,6: OP '<'
@@ -531,7 +550,8 @@ test_tokenize
152,67-152,71: NAME 'pass'
152,71-152,72: NEWLINE '\n'
153,0-153,1: NL '\n'
154,0-154,9: COMMENT '# binary\n'
154,0-154,8: COMMENT '# binary'
154,8-154,9: NL '\n'
155,0-155,1: NAME 'x'
155,2-155,3: OP '='
155,4-155,5: NUMBER '1'
@@ -551,7 +571,8 @@ test_tokenize
157,8-157,9: NUMBER '1'
157,9-157,10: NEWLINE '\n'
158,0-158,1: NL '\n'
159,0-159,8: COMMENT '# shift\n'
159,0-159,7: COMMENT '# shift'
159,7-159,8: NL '\n'
160,0-160,1: NAME 'x'
160,2-160,3: OP '='
160,4-160,5: NUMBER '1'
@@ -561,7 +582,8 @@ test_tokenize
160,14-160,15: NUMBER '1'
160,15-160,16: NEWLINE '\n'
161,0-161,1: NL '\n'
162,0-162,11: COMMENT '# additive\n'
162,0-162,10: COMMENT '# additive'
162,10-162,11: NL '\n'
163,0-163,1: NAME 'x'
163,2-163,3: OP '='
163,4-163,5: NUMBER '1'
@@ -575,7 +597,8 @@ test_tokenize
163,20-163,21: NUMBER '1'
163,21-163,22: NEWLINE '\n'
164,0-164,1: NL '\n'
165,0-165,17: COMMENT '# multiplicative\n'
165,0-165,16: COMMENT '# multiplicative'
165,16-165,17: NL '\n'
166,0-166,1: NAME 'x'
166,2-166,3: OP '='
166,4-166,5: NUMBER '1'
@@ -587,7 +610,8 @@ test_tokenize
166,16-166,17: NUMBER '1'
166,17-166,18: NEWLINE '\n'
167,0-167,1: NL '\n'
168,0-168,8: COMMENT '# unary\n'
168,0-168,7: COMMENT '# unary'
168,7-168,8: NL '\n'
169,0-169,1: NAME 'x'
169,2-169,3: OP '='
169,4-169,5: OP '~'
@@ -625,7 +649,8 @@ test_tokenize
170,24-170,25: NUMBER '1'
170,25-170,26: NEWLINE '\n'
171,0-171,1: NL '\n'
172,0-172,11: COMMENT '# selector\n'
172,0-172,10: COMMENT '# selector'
172,10-172,11: NL '\n'
173,0-173,6: NAME 'import'
173,7-173,10: NAME 'sys'
173,10-173,11: OP ','
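
The expected-output lines above are in tokenize's "srow,scol-erow,ecol: TYPE repr(token)"
table format. A rough sketch of how such lines can be produced (print_token here is an
illustrative helper mirroring the module's default token printer):

    from cStringIO import StringIO
    import tokenize

    def print_token(ttype, token, start, end, line):
        # One "srow,scol-erow,ecol: TYPE 'token'" line per token.
        (srow, scol), (erow, ecol) = start, end
        print "%d,%d-%d,%d:\t%s\t%r" % (srow, scol, erow, ecol,
                                        tokenize.tok_name[ttype], token)

    # With this fix, a bare comment line produces COMMENT and NL rows:
    tokenize.tokenize(StringIO("# Comments\n").readline, print_token)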

Lib/test/test_tokenize.py

@@ -9,20 +9,73 @@ code, print out a table with the tokens. The ENDMARKER is omitted for
brevity.
>>> dump_tokens("1 + 1")
NUMBER '1' (1, 0) (1, 1)
OP '+' (1, 2) (1, 3)
NUMBER '1' (1, 4) (1, 5)
NUMBER '1' (1, 0) (1, 1)
OP '+' (1, 2) (1, 3)
NUMBER '1' (1, 4) (1, 5)
A comment generates a token here, unlike in the parser module. The
comment token is followed by an NL or a NEWLINE token, depending on
whether the line contains the completion of a statement.
>>> dump_tokens("if False:\\n"
... " # NL\\n"
... " True = False # NEWLINE\\n")
NAME 'if' (1, 0) (1, 2)
NAME 'False' (1, 3) (1, 8)
OP ':' (1, 8) (1, 9)
NEWLINE '\\n' (1, 9) (1, 10)
COMMENT '# NL' (2, 4) (2, 8)
NL '\\n' (2, 8) (2, 9)
INDENT ' ' (3, 0) (3, 4)
NAME 'True' (3, 4) (3, 8)
OP '=' (3, 9) (3, 10)
NAME 'False' (3, 11) (3, 16)
COMMENT '# NEWLINE' (3, 17) (3, 26)
NEWLINE '\\n' (3, 26) (3, 27)
DEDENT '' (4, 0) (4, 0)
There will be a bunch more tests of specific source patterns.
The tokenize module also defines an untokenize function that should
regenerate the original program text from the tokens. (It doesn't
work very well at the moment.)
regenerate the original program text from the tokens.
There are some standard formatting practices that are easy to get right.
>>> roundtrip("if x == 1:\\n"
... " print x\\n")
if x ==1 :
print x
if x == 1:
print x
Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon. You can't see it, but it's there!
>>> roundtrip("if x == 1 : \\n"
... " print x\\n")
if x == 1 :
print x
Comments need to go in the right place.
>>> roundtrip("if x == 1:\\n"
... " # A comment by itself.\\n"
... " print x # Comment here, too.\\n"
... " # Another comment.\\n"
... "after_if = True\\n")
if x == 1:
# A comment by itself.
print x # Comment here, too.
# Another comment.
after_if = True
>>> roundtrip("if (x # The comments need to go in the right place\\n"
... " == 1):\\n"
... " print 'x == 1'\\n")
if (x # The comments need to go in the right place
== 1):
print 'x == 1'
"""
import os, glob, random
@@ -30,7 +83,7 @@ from cStringIO import StringIO
from test.test_support import (verbose, findfile, is_resource_enabled,
TestFailed)
from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
ENDMARKER, NUMBER, NAME, OP, STRING)
ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)
# Test roundtrip for `untokenize`. `f` is a file path. The source code in f
# is tokenized, converted back to source code via tokenize.untokenize(),
@@ -61,11 +114,12 @@ def dump_tokens(s):
if type == ENDMARKER:
break
type = tok_name[type]
print "%(type)-10.10s %(token)-10.10r %(start)s %(end)s" % locals()
print "%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals()
def roundtrip(s):
f = StringIO(s)
print untokenize(generate_tokens(f.readline)),
source = untokenize(generate_tokens(f.readline))
print source,
# This is an example from the docs, set up as a doctest.
def decistmt(s):

Lib/tokenize.py

@@ -159,14 +159,76 @@ def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        while row > self.prev_row:
            self.tokens.append("\n")
            self.prev_row += 1
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum in (NAME, NUMBER):
                tokval += ' '
            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant:
    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
@@ -174,31 +236,8 @@ def untokenize(iterable):
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    startline = False
    indents = []
    toks = []
    toks_append = toks.append
    for tok in iterable:
        toknum, tokval = tok[:2]
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum == INDENT:
            indents.append(tokval)
            continue
        elif toknum == DEDENT:
            indents.pop()
            continue
        elif toknum in (NEWLINE, COMMENT, NL):
            startline = True
        elif startline and indents:
            toks_append(indents[-1])
            startline = False
        toks_append(tokval)
    return ''.join(toks)
    ut = Untokenizer()
    return ut.untokenize(iterable)
def generate_tokens(readline):
"""
@@ -237,7 +276,7 @@ def generate_tokens(readline):
if endmatch:
pos = end = endmatch.end(0)
yield (STRING, contstr + line[:end],
strstart, (lnum, end), contline + line)
strstart, (lnum, end), contline + line)
contstr, needcont = '', 0
contline = None
elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
@@ -263,7 +302,15 @@ def generate_tokens(readline):
if pos == max: break
if line[pos] in '#\r\n': # skip comments or blank lines
yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
if line[pos] == '#':
comment_token = line[pos:].rstrip('\r\n')
nl_pos = pos + len(comment_token)
yield (COMMENT, comment_token,
(lnum, pos), (lnum, pos + len(comment_token)), line)
yield (NL, line[nl_pos:],
(lnum, nl_pos), (lnum, len(line)), line)
else:
yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
(lnum, pos), (lnum, len(line)), line)
continue
@@ -294,9 +341,10 @@ def generate_tokens(readline):
(initial == '.' and token != '.'): # ordinary number
yield (NUMBER, token, spos, epos, line)
elif initial in '\r\n':
yield (parenlev > 0 and NL or NEWLINE,
token, spos, epos, line)
yield (NL if parenlev > 0 else NEWLINE,
token, spos, epos, line)
elif initial == '#':
assert not token.endswith("\n")
yield (COMMENT, token, spos, epos, line)
elif token in triple_quoted:
endprog = endprogs[token]