Bug fixes large and small for tokenize.
Small: Always generate a NL or NEWLINE token following a COMMENT token. The old code did not generate an NL token if the comment was on a line by itself. Large: The output of untokenize() will now match the input exactly if it is passed the full token sequence. The old, crufty output is still generated if a limited input sequence is provided, where limited means that it does not include position information for tokens. Remaining bug: There is no CONTINUATION token (\) so there is no way for untokenize() to handle such code. Also, expanded the number of doctests in hopes of eventually removing the old-style tests that compare against a golden file. Bug fix candidate for Python 2.5.1. (Sigh.)
This commit is contained in:
parent
20362a820b
commit
76467ba6d6
|
@ -1,15 +1,23 @@
|
||||||
test_tokenize
|
test_tokenize
|
||||||
1,0-1,35: COMMENT "# Tests for the 'tokenize' module.\n"
|
1,0-1,34: COMMENT "# Tests for the 'tokenize' module."
|
||||||
2,0-2,43: COMMENT '# Large bits stolen from test_grammar.py. \n'
|
1,34-1,35: NL '\n'
|
||||||
|
2,0-2,42: COMMENT '# Large bits stolen from test_grammar.py. '
|
||||||
|
2,42-2,43: NL '\n'
|
||||||
3,0-3,1: NL '\n'
|
3,0-3,1: NL '\n'
|
||||||
4,0-4,11: COMMENT '# Comments\n'
|
4,0-4,10: COMMENT '# Comments'
|
||||||
|
4,10-4,11: NL '\n'
|
||||||
5,0-5,3: STRING '"#"'
|
5,0-5,3: STRING '"#"'
|
||||||
5,3-5,4: NEWLINE '\n'
|
5,3-5,4: NEWLINE '\n'
|
||||||
6,0-6,3: COMMENT "#'\n"
|
6,0-6,2: COMMENT "#'"
|
||||||
7,0-7,3: COMMENT '#"\n'
|
6,2-6,3: NL '\n'
|
||||||
8,0-8,3: COMMENT '#\\\n'
|
7,0-7,2: COMMENT '#"'
|
||||||
9,7-9,9: COMMENT '#\n'
|
7,2-7,3: NL '\n'
|
||||||
10,4-10,10: COMMENT '# abc\n'
|
8,0-8,2: COMMENT '#\\'
|
||||||
|
8,2-8,3: NL '\n'
|
||||||
|
9,7-9,8: COMMENT '#'
|
||||||
|
9,8-9,9: NL '\n'
|
||||||
|
10,4-10,9: COMMENT '# abc'
|
||||||
|
10,9-10,10: NL '\n'
|
||||||
11,0-12,4: STRING "'''#\n#'''"
|
11,0-12,4: STRING "'''#\n#'''"
|
||||||
12,4-12,5: NEWLINE '\n'
|
12,4-12,5: NEWLINE '\n'
|
||||||
13,0-13,1: NL '\n'
|
13,0-13,1: NL '\n'
|
||||||
|
@ -19,7 +27,8 @@ test_tokenize
|
||||||
14,7-14,8: COMMENT '#'
|
14,7-14,8: COMMENT '#'
|
||||||
14,8-14,9: NEWLINE '\n'
|
14,8-14,9: NEWLINE '\n'
|
||||||
15,0-15,1: NL '\n'
|
15,0-15,1: NL '\n'
|
||||||
16,0-16,25: COMMENT '# Balancing continuation\n'
|
16,0-16,24: COMMENT '# Balancing continuation'
|
||||||
|
16,24-16,25: NL '\n'
|
||||||
17,0-17,1: NL '\n'
|
17,0-17,1: NL '\n'
|
||||||
18,0-18,1: NAME 'a'
|
18,0-18,1: NAME 'a'
|
||||||
18,2-18,3: OP '='
|
18,2-18,3: OP '='
|
||||||
|
@ -92,7 +101,8 @@ test_tokenize
|
||||||
29,2-29,3: OP ')'
|
29,2-29,3: OP ')'
|
||||||
29,3-29,4: NEWLINE '\n'
|
29,3-29,4: NEWLINE '\n'
|
||||||
30,0-30,1: NL '\n'
|
30,0-30,1: NL '\n'
|
||||||
31,0-31,37: COMMENT '# Backslash means line continuation:\n'
|
31,0-31,36: COMMENT '# Backslash means line continuation:'
|
||||||
|
31,36-31,37: NL '\n'
|
||||||
32,0-32,1: NAME 'x'
|
32,0-32,1: NAME 'x'
|
||||||
32,2-32,3: OP '='
|
32,2-32,3: OP '='
|
||||||
32,4-32,5: NUMBER '1'
|
32,4-32,5: NUMBER '1'
|
||||||
|
@ -100,13 +110,15 @@ test_tokenize
|
||||||
33,2-33,3: NUMBER '1'
|
33,2-33,3: NUMBER '1'
|
||||||
33,3-33,4: NEWLINE '\n'
|
33,3-33,4: NEWLINE '\n'
|
||||||
34,0-34,1: NL '\n'
|
34,0-34,1: NL '\n'
|
||||||
35,0-35,55: COMMENT '# Backslash does not means continuation in comments :\\\n'
|
35,0-35,54: COMMENT '# Backslash does not means continuation in comments :\\'
|
||||||
|
35,54-35,55: NL '\n'
|
||||||
36,0-36,1: NAME 'x'
|
36,0-36,1: NAME 'x'
|
||||||
36,2-36,3: OP '='
|
36,2-36,3: OP '='
|
||||||
36,4-36,5: NUMBER '0'
|
36,4-36,5: NUMBER '0'
|
||||||
36,5-36,6: NEWLINE '\n'
|
36,5-36,6: NEWLINE '\n'
|
||||||
37,0-37,1: NL '\n'
|
37,0-37,1: NL '\n'
|
||||||
38,0-38,20: COMMENT '# Ordinary integers\n'
|
38,0-38,19: COMMENT '# Ordinary integers'
|
||||||
|
38,19-38,20: NL '\n'
|
||||||
39,0-39,4: NUMBER '0xff'
|
39,0-39,4: NUMBER '0xff'
|
||||||
39,5-39,7: OP '<>'
|
39,5-39,7: OP '<>'
|
||||||
39,8-39,11: NUMBER '255'
|
39,8-39,11: NUMBER '255'
|
||||||
|
@ -137,7 +149,8 @@ test_tokenize
|
||||||
44,15-44,16: NUMBER '1'
|
44,15-44,16: NUMBER '1'
|
||||||
44,16-44,17: NEWLINE '\n'
|
44,16-44,17: NEWLINE '\n'
|
||||||
45,0-45,1: NL '\n'
|
45,0-45,1: NL '\n'
|
||||||
46,0-46,16: COMMENT '# Long integers\n'
|
46,0-46,15: COMMENT '# Long integers'
|
||||||
|
46,15-46,16: NL '\n'
|
||||||
47,0-47,1: NAME 'x'
|
47,0-47,1: NAME 'x'
|
||||||
47,2-47,3: OP '='
|
47,2-47,3: OP '='
|
||||||
47,4-47,6: NUMBER '0L'
|
47,4-47,6: NUMBER '0L'
|
||||||
|
@ -171,7 +184,8 @@ test_tokenize
|
||||||
54,4-54,35: NUMBER '123456789012345678901234567890l'
|
54,4-54,35: NUMBER '123456789012345678901234567890l'
|
||||||
54,35-54,36: NEWLINE '\n'
|
54,35-54,36: NEWLINE '\n'
|
||||||
55,0-55,1: NL '\n'
|
55,0-55,1: NL '\n'
|
||||||
56,0-56,25: COMMENT '# Floating-point numbers\n'
|
56,0-56,24: COMMENT '# Floating-point numbers'
|
||||||
|
56,24-56,25: NL '\n'
|
||||||
57,0-57,1: NAME 'x'
|
57,0-57,1: NAME 'x'
|
||||||
57,2-57,3: OP '='
|
57,2-57,3: OP '='
|
||||||
57,4-57,8: NUMBER '3.14'
|
57,4-57,8: NUMBER '3.14'
|
||||||
|
@ -184,7 +198,8 @@ test_tokenize
|
||||||
59,2-59,3: OP '='
|
59,2-59,3: OP '='
|
||||||
59,4-59,9: NUMBER '0.314'
|
59,4-59,9: NUMBER '0.314'
|
||||||
59,9-59,10: NEWLINE '\n'
|
59,9-59,10: NEWLINE '\n'
|
||||||
60,0-60,18: COMMENT '# XXX x = 000.314\n'
|
60,0-60,17: COMMENT '# XXX x = 000.314'
|
||||||
|
60,17-60,18: NL '\n'
|
||||||
61,0-61,1: NAME 'x'
|
61,0-61,1: NAME 'x'
|
||||||
61,2-61,3: OP '='
|
61,2-61,3: OP '='
|
||||||
61,4-61,8: NUMBER '.314'
|
61,4-61,8: NUMBER '.314'
|
||||||
|
@ -218,7 +233,8 @@ test_tokenize
|
||||||
68,4-68,9: NUMBER '3.1e4'
|
68,4-68,9: NUMBER '3.1e4'
|
||||||
68,9-68,10: NEWLINE '\n'
|
68,9-68,10: NEWLINE '\n'
|
||||||
69,0-69,1: NL '\n'
|
69,0-69,1: NL '\n'
|
||||||
70,0-70,18: COMMENT '# String literals\n'
|
70,0-70,17: COMMENT '# String literals'
|
||||||
|
70,17-70,18: NL '\n'
|
||||||
71,0-71,1: NAME 'x'
|
71,0-71,1: NAME 'x'
|
||||||
71,2-71,3: OP '='
|
71,2-71,3: OP '='
|
||||||
71,4-71,6: STRING "''"
|
71,4-71,6: STRING "''"
|
||||||
|
@ -366,7 +382,8 @@ test_tokenize
|
||||||
125,6-126,3: STRING "uR'''spam\n'''"
|
125,6-126,3: STRING "uR'''spam\n'''"
|
||||||
126,3-126,4: NEWLINE '\n'
|
126,3-126,4: NEWLINE '\n'
|
||||||
127,0-127,1: NL '\n'
|
127,0-127,1: NL '\n'
|
||||||
128,0-128,14: COMMENT '# Indentation\n'
|
128,0-128,13: COMMENT '# Indentation'
|
||||||
|
128,13-128,14: NL '\n'
|
||||||
129,0-129,2: NAME 'if'
|
129,0-129,2: NAME 'if'
|
||||||
129,3-129,4: NUMBER '1'
|
129,3-129,4: NUMBER '1'
|
||||||
129,4-129,5: OP ':'
|
129,4-129,5: OP ':'
|
||||||
|
@ -438,7 +455,8 @@ test_tokenize
|
||||||
142,14-142,15: NUMBER '2'
|
142,14-142,15: NUMBER '2'
|
||||||
142,15-142,16: NEWLINE '\n'
|
142,15-142,16: NEWLINE '\n'
|
||||||
143,0-143,1: NL '\n'
|
143,0-143,1: NL '\n'
|
||||||
144,0-144,12: COMMENT '# Operators\n'
|
144,0-144,11: COMMENT '# Operators'
|
||||||
|
144,11-144,12: NL '\n'
|
||||||
145,0-145,1: NL '\n'
|
145,0-145,1: NL '\n'
|
||||||
146,0-146,0: DEDENT ''
|
146,0-146,0: DEDENT ''
|
||||||
146,0-146,0: DEDENT ''
|
146,0-146,0: DEDENT ''
|
||||||
|
@ -500,7 +518,8 @@ test_tokenize
|
||||||
149,27-149,28: OP ')'
|
149,27-149,28: OP ')'
|
||||||
149,28-149,29: NEWLINE '\n'
|
149,28-149,29: NEWLINE '\n'
|
||||||
150,0-150,1: NL '\n'
|
150,0-150,1: NL '\n'
|
||||||
151,0-151,13: COMMENT '# comparison\n'
|
151,0-151,12: COMMENT '# comparison'
|
||||||
|
151,12-151,13: NL '\n'
|
||||||
152,0-152,2: NAME 'if'
|
152,0-152,2: NAME 'if'
|
||||||
152,3-152,4: NUMBER '1'
|
152,3-152,4: NUMBER '1'
|
||||||
152,5-152,6: OP '<'
|
152,5-152,6: OP '<'
|
||||||
|
@ -531,7 +550,8 @@ test_tokenize
|
||||||
152,67-152,71: NAME 'pass'
|
152,67-152,71: NAME 'pass'
|
||||||
152,71-152,72: NEWLINE '\n'
|
152,71-152,72: NEWLINE '\n'
|
||||||
153,0-153,1: NL '\n'
|
153,0-153,1: NL '\n'
|
||||||
154,0-154,9: COMMENT '# binary\n'
|
154,0-154,8: COMMENT '# binary'
|
||||||
|
154,8-154,9: NL '\n'
|
||||||
155,0-155,1: NAME 'x'
|
155,0-155,1: NAME 'x'
|
||||||
155,2-155,3: OP '='
|
155,2-155,3: OP '='
|
||||||
155,4-155,5: NUMBER '1'
|
155,4-155,5: NUMBER '1'
|
||||||
|
@ -551,7 +571,8 @@ test_tokenize
|
||||||
157,8-157,9: NUMBER '1'
|
157,8-157,9: NUMBER '1'
|
||||||
157,9-157,10: NEWLINE '\n'
|
157,9-157,10: NEWLINE '\n'
|
||||||
158,0-158,1: NL '\n'
|
158,0-158,1: NL '\n'
|
||||||
159,0-159,8: COMMENT '# shift\n'
|
159,0-159,7: COMMENT '# shift'
|
||||||
|
159,7-159,8: NL '\n'
|
||||||
160,0-160,1: NAME 'x'
|
160,0-160,1: NAME 'x'
|
||||||
160,2-160,3: OP '='
|
160,2-160,3: OP '='
|
||||||
160,4-160,5: NUMBER '1'
|
160,4-160,5: NUMBER '1'
|
||||||
|
@ -561,7 +582,8 @@ test_tokenize
|
||||||
160,14-160,15: NUMBER '1'
|
160,14-160,15: NUMBER '1'
|
||||||
160,15-160,16: NEWLINE '\n'
|
160,15-160,16: NEWLINE '\n'
|
||||||
161,0-161,1: NL '\n'
|
161,0-161,1: NL '\n'
|
||||||
162,0-162,11: COMMENT '# additive\n'
|
162,0-162,10: COMMENT '# additive'
|
||||||
|
162,10-162,11: NL '\n'
|
||||||
163,0-163,1: NAME 'x'
|
163,0-163,1: NAME 'x'
|
||||||
163,2-163,3: OP '='
|
163,2-163,3: OP '='
|
||||||
163,4-163,5: NUMBER '1'
|
163,4-163,5: NUMBER '1'
|
||||||
|
@ -575,7 +597,8 @@ test_tokenize
|
||||||
163,20-163,21: NUMBER '1'
|
163,20-163,21: NUMBER '1'
|
||||||
163,21-163,22: NEWLINE '\n'
|
163,21-163,22: NEWLINE '\n'
|
||||||
164,0-164,1: NL '\n'
|
164,0-164,1: NL '\n'
|
||||||
165,0-165,17: COMMENT '# multiplicative\n'
|
165,0-165,16: COMMENT '# multiplicative'
|
||||||
|
165,16-165,17: NL '\n'
|
||||||
166,0-166,1: NAME 'x'
|
166,0-166,1: NAME 'x'
|
||||||
166,2-166,3: OP '='
|
166,2-166,3: OP '='
|
||||||
166,4-166,5: NUMBER '1'
|
166,4-166,5: NUMBER '1'
|
||||||
|
@ -587,7 +610,8 @@ test_tokenize
|
||||||
166,16-166,17: NUMBER '1'
|
166,16-166,17: NUMBER '1'
|
||||||
166,17-166,18: NEWLINE '\n'
|
166,17-166,18: NEWLINE '\n'
|
||||||
167,0-167,1: NL '\n'
|
167,0-167,1: NL '\n'
|
||||||
168,0-168,8: COMMENT '# unary\n'
|
168,0-168,7: COMMENT '# unary'
|
||||||
|
168,7-168,8: NL '\n'
|
||||||
169,0-169,1: NAME 'x'
|
169,0-169,1: NAME 'x'
|
||||||
169,2-169,3: OP '='
|
169,2-169,3: OP '='
|
||||||
169,4-169,5: OP '~'
|
169,4-169,5: OP '~'
|
||||||
|
@ -625,7 +649,8 @@ test_tokenize
|
||||||
170,24-170,25: NUMBER '1'
|
170,24-170,25: NUMBER '1'
|
||||||
170,25-170,26: NEWLINE '\n'
|
170,25-170,26: NEWLINE '\n'
|
||||||
171,0-171,1: NL '\n'
|
171,0-171,1: NL '\n'
|
||||||
172,0-172,11: COMMENT '# selector\n'
|
172,0-172,10: COMMENT '# selector'
|
||||||
|
172,10-172,11: NL '\n'
|
||||||
173,0-173,6: NAME 'import'
|
173,0-173,6: NAME 'import'
|
||||||
173,7-173,10: NAME 'sys'
|
173,7-173,10: NAME 'sys'
|
||||||
173,10-173,11: OP ','
|
173,10-173,11: OP ','
|
||||||
|
|
|
@ -13,16 +13,69 @@ NUMBER '1' (1, 0) (1, 1)
|
||||||
OP '+' (1, 2) (1, 3)
|
OP '+' (1, 2) (1, 3)
|
||||||
NUMBER '1' (1, 4) (1, 5)
|
NUMBER '1' (1, 4) (1, 5)
|
||||||
|
|
||||||
|
A comment generates a token here, unlike in the parser module. The
|
||||||
|
comment token is followed by an NL or a NEWLINE token, depending on
|
||||||
|
whether the line contains the completion of a statement.
|
||||||
|
|
||||||
|
>>> dump_tokens("if False:\\n"
|
||||||
|
... " # NL\\n"
|
||||||
|
... " True = False # NEWLINE\\n")
|
||||||
|
NAME 'if' (1, 0) (1, 2)
|
||||||
|
NAME 'False' (1, 3) (1, 8)
|
||||||
|
OP ':' (1, 8) (1, 9)
|
||||||
|
NEWLINE '\\n' (1, 9) (1, 10)
|
||||||
|
COMMENT '# NL' (2, 4) (2, 8)
|
||||||
|
NL '\\n' (2, 8) (2, 9)
|
||||||
|
INDENT ' ' (3, 0) (3, 4)
|
||||||
|
NAME 'True' (3, 4) (3, 8)
|
||||||
|
OP '=' (3, 9) (3, 10)
|
||||||
|
NAME 'False' (3, 11) (3, 16)
|
||||||
|
COMMENT '# NEWLINE' (3, 17) (3, 26)
|
||||||
|
NEWLINE '\\n' (3, 26) (3, 27)
|
||||||
|
DEDENT '' (4, 0) (4, 0)
|
||||||
|
|
||||||
|
|
||||||
There will be a bunch more tests of specific source patterns.
|
There will be a bunch more tests of specific source patterns.
|
||||||
|
|
||||||
The tokenize module also defines an untokenize function that should
|
The tokenize module also defines an untokenize function that should
|
||||||
regenerate the original program text from the tokens. (It doesn't
|
regenerate the original program text from the tokens.
|
||||||
work very well at the moment.)
|
|
||||||
|
There are some standard formatting practices that are easy to get right.
|
||||||
|
|
||||||
>>> roundtrip("if x == 1:\\n"
|
>>> roundtrip("if x == 1:\\n"
|
||||||
... " print x\\n")
|
... " print x\\n")
|
||||||
if x == 1:
|
if x == 1:
|
||||||
print x
|
print x
|
||||||
|
|
||||||
|
Some people use different formatting conventions, which makes
|
||||||
|
untokenize a little trickier. Note that this test involves trailing
|
||||||
|
whitespace after the colon. You can't see it, but it's there!
|
||||||
|
|
||||||
|
>>> roundtrip("if x == 1 : \\n"
|
||||||
|
... " print x\\n")
|
||||||
|
if x == 1 :
|
||||||
|
print x
|
||||||
|
|
||||||
|
Comments need to go in the right place.
|
||||||
|
|
||||||
|
>>> roundtrip("if x == 1:\\n"
|
||||||
|
... " # A comment by itself.\\n"
|
||||||
|
... " print x # Comment here, too.\\n"
|
||||||
|
... " # Another comment.\\n"
|
||||||
|
... "after_if = True\\n")
|
||||||
|
if x == 1:
|
||||||
|
# A comment by itself.
|
||||||
|
print x # Comment here, too.
|
||||||
|
# Another comment.
|
||||||
|
after_if = True
|
||||||
|
|
||||||
|
>>> roundtrip("if (x # The comments need to go in the right place\\n"
|
||||||
|
... " == 1):\\n"
|
||||||
|
... " print 'x == 1'\\n")
|
||||||
|
if (x # The comments need to go in the right place
|
||||||
|
== 1):
|
||||||
|
print 'x == 1'
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os, glob, random
|
import os, glob, random
|
||||||
|
@ -30,7 +83,7 @@ from cStringIO import StringIO
|
||||||
from test.test_support import (verbose, findfile, is_resource_enabled,
|
from test.test_support import (verbose, findfile, is_resource_enabled,
|
||||||
TestFailed)
|
TestFailed)
|
||||||
from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
|
from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
|
||||||
ENDMARKER, NUMBER, NAME, OP, STRING)
|
ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)
|
||||||
|
|
||||||
# Test roundtrip for `untokenize`. `f` is a file path. The source code in f
|
# Test roundtrip for `untokenize`. `f` is a file path. The source code in f
|
||||||
# is tokenized, converted back to source code via tokenize.untokenize(),
|
# is tokenized, converted back to source code via tokenize.untokenize(),
|
||||||
|
@ -61,11 +114,12 @@ def dump_tokens(s):
|
||||||
if type == ENDMARKER:
|
if type == ENDMARKER:
|
||||||
break
|
break
|
||||||
type = tok_name[type]
|
type = tok_name[type]
|
||||||
print "%(type)-10.10s %(token)-10.10r %(start)s %(end)s" % locals()
|
print "%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals()
|
||||||
|
|
||||||
def roundtrip(s):
|
def roundtrip(s):
|
||||||
f = StringIO(s)
|
f = StringIO(s)
|
||||||
print untokenize(generate_tokens(f.readline)),
|
source = untokenize(generate_tokens(f.readline))
|
||||||
|
print source,
|
||||||
|
|
||||||
# This is an example from the docs, set up as a doctest.
|
# This is an example from the docs, set up as a doctest.
|
||||||
def decistmt(s):
|
def decistmt(s):
|
||||||
|
|
|
@ -159,26 +159,46 @@ def tokenize_loop(readline, tokeneater):
|
||||||
for token_info in generate_tokens(readline):
|
for token_info in generate_tokens(readline):
|
||||||
tokeneater(*token_info)
|
tokeneater(*token_info)
|
||||||
|
|
||||||
|
class Untokenizer:
|
||||||
|
|
||||||
def untokenize(iterable):
|
def __init__(self):
|
||||||
"""Transform tokens back into Python source code.
|
self.tokens = []
|
||||||
|
self.prev_row = 1
|
||||||
|
self.prev_col = 0
|
||||||
|
|
||||||
Each element returned by the iterable must be a token sequence
|
def add_whitespace(self, start):
|
||||||
with at least two elements, a token number and token value.
|
row, col = start
|
||||||
|
while row > self.prev_row:
|
||||||
|
print row, "<", self.prev_row
|
||||||
|
self.tokens.append("\n")
|
||||||
|
self.prev_row += 1
|
||||||
|
col_offset = col - self.prev_col
|
||||||
|
if col_offset:
|
||||||
|
self.tokens.append(" " * col_offset)
|
||||||
|
|
||||||
Round-trip invariant:
|
def untokenize(self, iterable):
|
||||||
# Output text will tokenize the back to the input
|
for t in iterable:
|
||||||
t1 = [tok[:2] for tok in generate_tokens(f.readline)]
|
if len(t) == 2:
|
||||||
newcode = untokenize(t1)
|
self.compat(t, iterable)
|
||||||
readline = iter(newcode.splitlines(1)).next
|
break
|
||||||
t2 = [tok[:2] for tokin generate_tokens(readline)]
|
tok_type, token, start, end, line = t
|
||||||
assert t1 == t2
|
self.add_whitespace(start)
|
||||||
"""
|
self.tokens.append(token)
|
||||||
|
self.prev_row, self.prev_col = end
|
||||||
|
if tok_type in (NEWLINE, NL):
|
||||||
|
self.prev_row += 1
|
||||||
|
self.prev_col = 0
|
||||||
|
return "".join(self.tokens)
|
||||||
|
|
||||||
|
def compat(self, token, iterable):
|
||||||
startline = False
|
startline = False
|
||||||
indents = []
|
indents = []
|
||||||
toks = []
|
toks_append = self.tokens.append
|
||||||
toks_append = toks.append
|
toknum, tokval = token
|
||||||
|
if toknum in (NAME, NUMBER):
|
||||||
|
tokval += ' '
|
||||||
|
if toknum in (NEWLINE, NL):
|
||||||
|
startline = True
|
||||||
for tok in iterable:
|
for tok in iterable:
|
||||||
toknum, tokval = tok[:2]
|
toknum, tokval = tok[:2]
|
||||||
|
|
||||||
|
@ -191,14 +211,33 @@ def untokenize(iterable):
|
||||||
elif toknum == DEDENT:
|
elif toknum == DEDENT:
|
||||||
indents.pop()
|
indents.pop()
|
||||||
continue
|
continue
|
||||||
elif toknum in (NEWLINE, COMMENT, NL):
|
elif toknum in (NEWLINE, NL):
|
||||||
startline = True
|
startline = True
|
||||||
elif startline and indents:
|
elif startline and indents:
|
||||||
toks_append(indents[-1])
|
toks_append(indents[-1])
|
||||||
startline = False
|
startline = False
|
||||||
toks_append(tokval)
|
toks_append(tokval)
|
||||||
return ''.join(toks)
|
|
||||||
|
|
||||||
|
def untokenize(iterable):
|
||||||
|
"""Transform tokens back into Python source code.
|
||||||
|
|
||||||
|
Each element returned by the iterable must be a token sequence
|
||||||
|
with at least two elements, a token number and token value. If
|
||||||
|
only those two elements are passed, the resulting output is poor.
|
||||||
|
|
||||||
|
Round-trip invariant for full input:
|
||||||
|
Untokenized source will match input source exactly
|
||||||
|
|
||||||
|
Round-trip invariant for limited input:
|
||||||
|
# Output text will tokenize back to the input
|
||||||
|
t1 = [tok[:2] for tok in generate_tokens(f.readline)]
|
||||||
|
newcode = untokenize(t1)
|
||||||
|
readline = iter(newcode.splitlines(1)).next
|
||||||
|
t2 = [tok[:2] for tok in generate_tokens(readline)]
|
||||||
|
assert t1 == t2
|
||||||
|
"""
|
||||||
|
ut = Untokenizer()
|
||||||
|
return ut.untokenize(iterable)
|
||||||
|
|
||||||
def generate_tokens(readline):
|
def generate_tokens(readline):
|
||||||
"""
|
"""
|
||||||
|
@ -263,6 +302,14 @@ def generate_tokens(readline):
|
||||||
if pos == max: break
|
if pos == max: break
|
||||||
|
|
||||||
if line[pos] in '#\r\n': # skip comments or blank lines
|
if line[pos] in '#\r\n': # skip comments or blank lines
|
||||||
|
if line[pos] == '#':
|
||||||
|
comment_token = line[pos:].rstrip('\r\n')
|
||||||
|
nl_pos = pos + len(comment_token)
|
||||||
|
yield (COMMENT, comment_token,
|
||||||
|
(lnum, pos), (lnum, pos + len(comment_token)), line)
|
||||||
|
yield (NL, line[nl_pos:],
|
||||||
|
(lnum, nl_pos), (lnum, len(line)), line)
|
||||||
|
else:
|
||||||
yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
|
yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
|
||||||
(lnum, pos), (lnum, len(line)), line)
|
(lnum, pos), (lnum, len(line)), line)
|
||||||
continue
|
continue
|
||||||
|
@ -294,9 +341,10 @@ def generate_tokens(readline):
|
||||||
(initial == '.' and token != '.'): # ordinary number
|
(initial == '.' and token != '.'): # ordinary number
|
||||||
yield (NUMBER, token, spos, epos, line)
|
yield (NUMBER, token, spos, epos, line)
|
||||||
elif initial in '\r\n':
|
elif initial in '\r\n':
|
||||||
yield (parenlev > 0 and NL or NEWLINE,
|
yield (NL if parenlev > 0 else NEWLINE,
|
||||||
token, spos, epos, line)
|
token, spos, epos, line)
|
||||||
elif initial == '#':
|
elif initial == '#':
|
||||||
|
assert not token.endswith("\n")
|
||||||
yield (COMMENT, token, spos, epos, line)
|
yield (COMMENT, token, spos, epos, line)
|
||||||
elif token in triple_quoted:
|
elif token in triple_quoted:
|
||||||
endprog = endprogs[token]
|
endprog = endprogs[token]
|
||||||
|
|
Loading…
Reference in New Issue