Add untokenize() function to allow full round-trip tokenization.
Should significantly enhance the utility of the module by supporting the creation of tools that modify the token stream and write back the modified result.
parent bf7255fffb
commit 68c0453418
@ -45,6 +45,9 @@ An older entry point is retained for backward compatibility:
provides the same interface as the \method{readline()} method of
built-in file objects (see section~\ref{bltin-file-objects}). Each
call to the function should return one line of input as a string.
Alternately, \var{readline} may be a callable object that signals
completion by raising \exception{StopIteration}.
\versionchanged[Added StopIteration support]{2.5}

The second parameter, \var{tokeneater}, must also be a callable
object. It is called once for each token, with five arguments,

@ -65,3 +68,52 @@ passed to the \var{tokeneater} function by \function{tokenize()}:
are generated when a logical line of code is continued over multiple
physical lines.
\end{datadesc}

Another function is provided to reverse the tokenization process.
This is useful for creating tools that tokenize a script, modify
the token stream, and write back the modified script.

\begin{funcdesc}{untokenize}{iterable}
Converts tokens back into Python source code. The \var{iterable}
must return sequences with at least two elements, the token type and
the token string. Any additional sequence elements are ignored.

The reconstructed script is returned as a single string. The
result is guaranteed to tokenize back to match the input, so that
the conversion is lossless and round-trips are assured. The
guarantee applies only to the token type and token string, as
the spacing between tokens (column positions) may change.
\versionadded{2.5}
\end{funcdesc}
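
For instance, the guarantee can be checked directly; the following is a
minimal sketch along the lines of the round-trip check added to
test_tokenize in this commit (the sample source string is arbitrary):

\begin{verbatim}
from cStringIO import StringIO
from tokenize import generate_tokens, untokenize

source = "x = 3.7 + 1\n"
t1 = [tok[:2] for tok in generate_tokens(StringIO(source).readline)]
newcode = untokenize(t1)            # spacing may differ from the input
readline = iter(newcode.splitlines(1)).next
t2 = [tok[:2] for tok in generate_tokens(readline)]
assert t1 == t2                     # token types and strings are identical
\end{verbatim}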

Example of a script re-writer that transforms float literals into
Decimal objects:
\begin{verbatim}
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print +21.3e-5*-.1234/81.7'
    >>> decistmt(s)
    "print +Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')"

    >>> exec(s)
    -3.21716034272e-007
    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7

    """
    result = []
    g = generate_tokens(StringIO(s).readline)   # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result)
\end{verbatim}

@ -91,7 +91,9 @@ resources to test. Currently only the following are defined:

    compiler -  Test the compiler package by compiling all the source
                in the standard library and test suite.  This takes
                a long time.
                a long time.  Enabling this resource also allows
                test_tokenize to verify round-trip lexing on every
                file in the test library.

    subprocess  Run all tests for the subprocess module.
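
For example, the exhaustive round-trip pass can be requested by enabling the
resource when running the test; a sketch of the usual regrtest invocation
(the path may differ in your checkout):

    python Lib/test/regrtest.py -u compiler test_tokenize
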
@ -1,12 +1,82 @@
from test.test_support import verbose, findfile
import tokenize, os, sys
from test.test_support import verbose, findfile, is_resource_enabled
import os, glob, random
from tokenize import (tokenize, generate_tokens, untokenize,
                      NUMBER, NAME, OP, STRING)

if verbose:
    print 'starting...'

f = file(findfile('tokenize_tests' + os.extsep + 'txt'))
tokenize.tokenize(f.readline)
tokenize(f.readline)
f.close()


###### Test roundtrip for untokenize ##########################

def test_roundtrip(f):
    ## print 'Testing:', f
    f = file(f)
    try:
        fulltok = list(generate_tokens(f.readline))
    finally:
        f.close()

    t1 = [tok[:2] for tok in fulltok]
    newtext = untokenize(t1)
    readline = iter(newtext.splitlines(1)).next
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    assert t1 == t2


f = findfile('tokenize_tests' + os.extsep + 'txt')
test_roundtrip(f)

testdir = os.path.dirname(f) or os.curdir
testfiles = glob.glob(testdir + os.sep + 'test*.py')
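# The full pass over every test file takes a long time, so without the
# 'compiler' resource only a random sample of the files is round-tripped.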
if not is_resource_enabled('compiler'):
    testfiles = random.sample(testfiles, 10)

for f in testfiles:
    test_roundtrip(f)


###### Test example in the docs ###############################

from decimal import Decimal
from cStringIO import StringIO

def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print +21.3e-5*-.1234/81.7'
    >>> decistmt(s)
    "print +Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')"

    >>> exec(s)
    -3.21716034272e-007
    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7

    """
    result = []
    g = generate_tokens(StringIO(s).readline)   # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result)

import doctest
doctest.testmod()

if verbose:
    print 'finished'

@ -31,7 +31,7 @@ from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL"]
           "generate_tokens", "NL", "untokenize"]
del x
del token

@ -159,12 +159,55 @@ def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)


def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.

    Round-trip invariant:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """

    startline = False
    indents = []
    toks = []
    toks_append = toks.append
    for tok in iterable:
        toknum, tokval = tok[:2]

        if toknum == NAME:
            tokval += ' '

        if toknum == INDENT:
            indents.append(tokval)
            continue
        elif toknum == DEDENT:
            indents.pop()
            continue
        elif toknum in (NEWLINE, COMMENT, NL):
            startline = True
        elif startline and indents:
            toks_append(indents[-1])
            startline = False
        toks_append(tokval)
    return ''.join(toks)
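
# Note on spacing: only NAME tokens receive a trailing space above, so column
# positions in the output may differ from the input while the (type, string)
# pairs still round-trip.  For example (illustrative values only):
#
#     untokenize([(NAME, 'print'), (NUMBER, '1'), (OP, '+'),
#                 (NUMBER, '2'), (NEWLINE, '\n')])
#
# returns 'print 1+2\n'.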


def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.
    should return one line of input as a string.  Alternately, readline
    can be a callable object that terminates by raising StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and

@ -180,7 +223,10 @@ def generate_tokens(readline):
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)
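
With this change, readline may simply be a bound iterator method that raises
StopIteration at end of input; a minimal sketch (the lines list here is
hypothetical):

    from tokenize import generate_tokens

    lines = ['x = 1\n', 'y = x + 2\n']
    for tok in generate_tokens(iter(lines).next):  # .next raises StopIteration at EOF
        print tok[:2]
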
@ -141,6 +141,11 @@ Extension Modules
Library
-------

- The tokenize module has a new untokenize() function to support a full
  roundtrip from lexed tokens back to Python source code.  In addition,
  the generate_tokens() function now accepts a callable argument that
  terminates by raising StopIteration.

- Bug #1196315: fix weakref.WeakValueDictionary constructor.

- Bug #1213894: os.path.realpath didn't resolve symlinks that were the first