Add untokenize() function to allow full round-trip tokenization.

Should significantly enhance the utility of the module by supporting
the creation of tools that modify the token stream and write back the
modified result.
This commit is contained in:
Raymond Hettinger 2005-06-10 11:05:19 +00:00
parent bf7255fffb
commit 68c0453418
5 changed files with 182 additions and 7 deletions
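A rough sketch of the kind of tool described above (illustrative only; the rewrite() helper and the '.new' output path are invented for this example and are not part of the changed files):

    from tokenize import generate_tokens, untokenize

    def rewrite(path):
        # Read the source and reduce each token to its (type, string) pair.
        f = open(path)
        try:
            toks = [tok[:2] for tok in generate_tokens(f.readline)]
        finally:
            f.close()
        # ... transform the token stream here ...
        # Regenerate source text and write the result back out.
        out = open(path + '.new', 'w')
        try:
            out.write(untokenize(toks))
        finally:
            out.close()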

View File

@@ -45,6 +45,9 @@ An older entry point is retained for backward compatibility:
provides the same interface as the \method{readline()} method of
built-in file objects (see section~\ref{bltin-file-objects}). Each
call to the function should return one line of input as a string.
Alternately, \var{readline} may be a callable object that signals
completion by raising \exception{StopIteration}.
\versionchanged[Added StopIteration support]{2.5}
The second parameter, \var{tokeneater}, must also be a callable
object. It is called once for each token, with five arguments,
@@ -65,3 +68,52 @@ passed to the \var{tokeneater} function by \function{tokenize()}:
are generated when a logical line of code is continued over multiple
physical lines.
\end{datadesc}
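As a quick sketch of the interface described above (an illustrative session, not taken from the module's documentation), a \var{tokeneater} that prints each token's type and string can be driven like this:
\begin{verbatim}
>>> import token, tokenize
>>> from StringIO import StringIO
>>> def show(toktype, tokstring, start, end, line):
...     print token.tok_name[toktype], repr(tokstring)
...
>>> tokenize.tokenize(StringIO('x = 1\n').readline, show)
NAME 'x'
OP '='
NUMBER '1'
NEWLINE '\n'
ENDMARKER ''
\end{verbatim}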
Another function is provided to reverse the tokenization process.
This is useful for creating tools that tokenize a script, modify
the token stream, and write back the modified script.
\begin{funcdesc}{untokenize}{iterable}
Converts tokens back into Python source code. The \var{iterable}
must return sequences with at least two elements, the token type and
the token string. Any additional sequence elements are ignored.
The reconstructed script is returned as a single string. The
result is guaranteed to tokenize back to match the input, so the
conversion is lossless and round-trips are assured. The guarantee
applies only to the token type and token string; the spacing between
tokens (column positions) may change.
\versionadded{2.5}
\end{funcdesc}
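For instance, because \function{untokenize()} appends a space after each
\code{NAME} token, round-tripping a short assignment may shift the spacing
while leaving the (type, string) pairs intact (illustrative session):
\begin{verbatim}
>>> from StringIO import StringIO
>>> from tokenize import generate_tokens, untokenize
>>> pairs = [tok[:2] for tok in generate_tokens(StringIO('x=1\n').readline)]
>>> untokenize(pairs)
'x =1\n'
>>> [t[:2] for t in generate_tokens(StringIO(untokenize(pairs)).readline)] == pairs
True
\end{verbatim}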
Example of a script re-writer that transforms float literals into
Decimal objects:
\begin{verbatim}
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print +21.3e-5*-.1234/81.7'
    >>> decistmt(s)
    "print +Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')"

    >>> exec(s)
    -3.21716034272e-007
    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = generate_tokens(StringIO(s).readline)   # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result)
\end{verbatim}

View File

@@ -91,7 +91,9 @@ resources to test. Currently only the following are defined:
compiler - Test the compiler package by compiling all the source
in the standard library and test suite. This takes
a long time.
a long time. Enabling this resource also allows
test_tokenize to verify round-trip lexing on every
file in the test library.
subprocess Run all tests for the subprocess module.

View File

@@ -1,12 +1,82 @@
from test.test_support import verbose, findfile
import tokenize, os, sys
from test.test_support import verbose, findfile, is_resource_enabled
import os, glob, random
from tokenize import (tokenize, generate_tokens, untokenize,
                      NUMBER, NAME, OP, STRING)
if verbose:
    print 'starting...'
f = file(findfile('tokenize_tests' + os.extsep + 'txt'))
tokenize.tokenize(f.readline)
tokenize(f.readline)
f.close()
###### Test roundtrip for untokenize ##########################
def test_roundtrip(f):
    ## print 'Testing:', f
    f = file(f)
    try:
        fulltok = list(generate_tokens(f.readline))
    finally:
        f.close()

    t1 = [tok[:2] for tok in fulltok]
    newtext = untokenize(t1)
    readline = iter(newtext.splitlines(1)).next
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    assert t1 == t2
f = findfile('tokenize_tests' + os.extsep + 'txt')
test_roundtrip(f)
testdir = os.path.dirname(f) or os.curdir
testfiles = glob.glob(testdir + os.sep + 'test*.py')
if not is_resource_enabled('compiler'):
    testfiles = random.sample(testfiles, 10)

for f in testfiles:
    test_roundtrip(f)
###### Test example in the docs ###############################
from decimal import Decimal
from cStringIO import StringIO
def decistmt(s):
"""Substitute Decimals for floats in a string of statements.
>>> from decimal import Decimal
>>> s = 'print +21.3e-5*-.1234/81.7'
>>> decistmt(s)
"print +Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')"
>>> exec(s)
-3.21716034272e-007
>>> exec(decistmt(s))
-3.217160342717258261933904529E-7
"""
result = []
g = generate_tokens(StringIO(s).readline) # tokenize the string
for toknum, tokval, _, _, _ in g:
if toknum == NUMBER and '.' in tokval: # replace NUMBER tokens
result.extend([
(NAME, 'Decimal'),
(OP, '('),
(STRING, repr(tokval)),
(OP, ')')
])
else:
result.append((toknum, tokval))
return untokenize(result)
import doctest
doctest.testmod()
if verbose:
    print 'finished'

View File

@@ -31,7 +31,7 @@ from token import *
import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
"generate_tokens", "NL"]
"generate_tokens", "NL", "untokenize"]
del x
del token
@@ -159,12 +159,55 @@ def tokenize_loop(readline, tokeneater):
for token_info in generate_tokens(readline):
tokeneater(*token_info)
def untokenize(iterable):
"""Transform tokens back into Python source code.
Each element returned by the iterable must be a token sequence
with at least two elements, a token number and token value.
Round-trip invariant:
# Output text will tokenize the back to the input
t1 = [tok[:2] for tok in generate_tokens(f.readline)]
newcode = untokenize(t1)
readline = iter(newcode.splitlines(1)).next
t2 = [tok[:2] for tokin generate_tokens(readline)]
assert t1 == t2
"""
    startline = False
    indents = []
    toks = []
    toks_append = toks.append
    for tok in iterable:
        toknum, tokval = tok[:2]

        if toknum == NAME:
            tokval += ' '

        if toknum == INDENT:
            indents.append(tokval)
            continue
        elif toknum == DEDENT:
            indents.pop()
            continue
        elif toknum in (NEWLINE, COMMENT, NL):
            startline = True
        elif startline and indents:
            toks_append(indents[-1])
            startline = False
        toks_append(tokval)
    return ''.join(toks)
def generate_tokens(readline):
"""
The generate_tokens() generator requires one argment, readline, which
must be a callable object which provides the same interface as the
readline() method of built-in file objects. Each call to the function
should return one line of input as a string.
should return one line of input as a string. Alternately, readline
can be a callable function terminating with StopIteration:
readline = open(myfile).next # Example of alternate readline
The generator produces 5-tuples with these members: the token type; the
token string; a 2-tuple (srow, scol) of ints specifying the row and
@@ -180,7 +223,10 @@ def generate_tokens(readline):
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)
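A minimal illustration of the change above (a sketch, not part of the patch): any callable that signals end of input by raising StopIteration, such as an iterator's next method, can now drive generate_tokens() directly.

    from tokenize import generate_tokens

    source = 'x = 1\ny = 2\n'
    readline = iter(source.splitlines(True)).next   # raises StopIteration at EOF
    for toknum, tokval, start, end, line in generate_tokens(readline):
        print toknum, repr(tokval)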

View File

@ -141,6 +141,11 @@ Extension Modules
Library
-------
- The tokenize module has a new untokenize() function to support a full
  round-trip from lexed tokens back to Python source code.  In addition,
  the generate_tokens() function now accepts a callable argument that
  terminates by raising StopIteration.
- Bug #1196315: fix weakref.WeakValueDictionary constructor.
- Bug #1213894: os.path.realpath didn't resolve symlinks that were the first