From 68c04534182f2c09783b6506701a8bc25c98b4a9 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger
Date: Fri, 10 Jun 2005 11:05:19 +0000
Subject: [PATCH] Add untokenize() function to allow full round-trip
 tokenization.  Should significantly enhance the utility of the module by
 supporting the creation of tools that modify the token stream and write
 back the modified result.

---
 Doc/lib/libtokenize.tex   | 52 +++++++++++++++++++++++++++
 Lib/test/regrtest.py      |  4 ++-
 Lib/test/test_tokenize.py | 76 +++++++++++++++++++++++++++++++++++++--
 Lib/tokenize.py           | 52 +++++++++++++++++++++++++--
 Misc/NEWS                 |  5 +++
 5 files changed, 182 insertions(+), 7 deletions(-)

diff --git a/Doc/lib/libtokenize.tex b/Doc/lib/libtokenize.tex
index 6cd93484214..dc5f8c19954 100644
--- a/Doc/lib/libtokenize.tex
+++ b/Doc/lib/libtokenize.tex
@@ -45,6 +45,9 @@ An older entry point is retained for backward compatibility:
   provides the same interface as the \method{readline()} method of
   built-in file objects (see section~\ref{bltin-file-objects}).  Each
   call to the function should return one line of input as a string.
+  Alternatively, \var{readline} may be a callable object that signals
+  completion by raising \exception{StopIteration}.
+  \versionchanged[Added StopIteration support]{2.5}
 
   The second parameter, \var{tokeneater}, must also be a callable
   object.  It is called once for each token, with five arguments,
@@ -65,3 +68,52 @@ passed to the \var{tokeneater} function by \function{tokenize()}:
   are generated when a logical line of code is continued over
   multiple physical lines.
 \end{datadesc}
+
+Another function is provided to reverse the tokenization process.
+This is useful for creating tools that tokenize a script, modify
+the token stream, and write back the modified script.
+
+\begin{funcdesc}{untokenize}{iterable}
+  Converts tokens back into Python source code.  The \var{iterable}
+  must return sequences with at least two elements, the token type and
+  the token string.  Any additional sequence elements are ignored.
+
+  The reconstructed script is returned as a single string.  The
+  result is guaranteed to tokenize back to match the input, so the
+  conversion is lossless and round-trips are assured.  The
+  guarantee applies only to the token type and token string, as the
+  spacing between tokens (column positions) may change.
+  \versionadded{2.5}
+\end{funcdesc}
+
+Example of a script re-writer that transforms float literals into
+Decimal objects:
+\begin{verbatim}
+def decistmt(s):
+    """Substitute Decimals for floats in a string of statements.
+
+    >>> from decimal import Decimal
+    >>> s = 'print +21.3e-5*-.1234/81.7'
+    >>> decistmt(s)
+    "print +Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')"
+
+    >>> exec(s)
+    -3.21716034272e-007
+    >>> exec(decistmt(s))
+    -3.217160342717258261933904529E-7
+
+    """
+    result = []
+    g = generate_tokens(StringIO(s).readline)   # tokenize the string
+    for toknum, tokval, _, _, _ in g:
+        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
+            result.extend([
+                (NAME, 'Decimal'),
+                (OP, '('),
+                (STRING, repr(tokval)),
+                (OP, ')')
+            ])
+        else:
+            result.append((toknum, tokval))
+    return untokenize(result)
+\end{verbatim}
diff --git a/Lib/test/regrtest.py b/Lib/test/regrtest.py
index 6160b3d07d5..85e784b33fd 100755
--- a/Lib/test/regrtest.py
+++ b/Lib/test/regrtest.py
@@ -91,7 +91,9 @@ resources to test.  Currently only the following are defined:
 
     compiler -  Test the compiler package by compiling all the
                 source in the standard library and test suite.  This takes
-                a long time.
+                a long time.  Enabling this resource also allows
+                test_tokenize to verify round-trip lexing on every
+                file in the test library.
 
     subprocess  Run all tests for the subprocess module.
 
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index d21740468f1..2ce435f585d 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1,12 +1,82 @@
-from test.test_support import verbose, findfile
-import tokenize, os, sys
+from test.test_support import verbose, findfile, is_resource_enabled
+import os, glob, random
+from tokenize import (tokenize, generate_tokens, untokenize,
+                      NUMBER, NAME, OP, STRING)
 
 if verbose:
     print 'starting...'
 
 f = file(findfile('tokenize_tests' + os.extsep + 'txt'))
-tokenize.tokenize(f.readline)
+tokenize(f.readline)
 f.close()
+
+
+###### Test roundtrip for untokenize ##########################
+
+def test_roundtrip(f):
+    ## print 'Testing:', f
+    f = file(f)
+    try:
+        fulltok = list(generate_tokens(f.readline))
+    finally:
+        f.close()
+
+    t1 = [tok[:2] for tok in fulltok]
+    newtext = untokenize(t1)
+    readline = iter(newtext.splitlines(1)).next
+    t2 = [tok[:2] for tok in generate_tokens(readline)]
+    assert t1 == t2
+
+
+f = findfile('tokenize_tests' + os.extsep + 'txt')
+test_roundtrip(f)
+
+testdir = os.path.dirname(f) or os.curdir
+testfiles = glob.glob(testdir + os.sep + 'test*.py')
+if not is_resource_enabled('compiler'):
+    testfiles = random.sample(testfiles, 10)
+
+for f in testfiles:
+    test_roundtrip(f)
+
+
+
+###### Test example in the docs ###############################
+
+from decimal import Decimal
+from cStringIO import StringIO
+
+def decistmt(s):
+    """Substitute Decimals for floats in a string of statements.
+
+    >>> from decimal import Decimal
+    >>> s = 'print +21.3e-5*-.1234/81.7'
+    >>> decistmt(s)
+    "print +Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')"
+
+    >>> exec(s)
+    -3.21716034272e-007
+    >>> exec(decistmt(s))
+    -3.217160342717258261933904529E-7
+
+    """
+    result = []
+    g = generate_tokens(StringIO(s).readline)   # tokenize the string
+    for toknum, tokval, _, _, _ in g:
+        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
+            result.extend([
+                (NAME, 'Decimal'),
+                (OP, '('),
+                (STRING, repr(tokval)),
+                (OP, ')')
+            ])
+        else:
+            result.append((toknum, tokval))
+    return untokenize(result)
+
+import doctest
+doctest.testmod()
+
 
 if verbose:
     print 'finished'
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 9087e84ca0e..b29da6b7ada 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -31,7 +31,7 @@ from token import *
 import token
 __all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
-           "generate_tokens", "NL"]
+           "generate_tokens", "NL", "untokenize"]
 del x
 del token
@@ -159,12 +159,55 @@ def tokenize_loop(readline, tokeneater):
     for token_info in generate_tokens(readline):
         tokeneater(*token_info)
 
+
+def untokenize(iterable):
+    """Transform tokens back into Python source code.
+
+    Each element returned by the iterable must be a token sequence
+    with at least two elements, a token number and token value.
+
+    Round-trip invariant:
+        # Output text will tokenize back to the input
+        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
+        newcode = untokenize(t1)
+        readline = iter(newcode.splitlines(1)).next
+        t2 = [tok[:2] for tok in generate_tokens(readline)]
+        assert t1 == t2
+    """
+
+    startline = False
+    indents = []
+    toks = []
+    toks_append = toks.append
+    for tok in iterable:
+        toknum, tokval = tok[:2]
+
+        if toknum == NAME:
+            tokval += ' '
+
+        if toknum == INDENT:
+            indents.append(tokval)
+            continue
+        elif toknum == DEDENT:
+            indents.pop()
+            continue
+        elif toknum in (NEWLINE, COMMENT, NL):
+            startline = True
+        elif startline and indents:
+            toks_append(indents[-1])
+            startline = False
+        toks_append(tokval)
+    return ''.join(toks)
+
+
 def generate_tokens(readline):
     """
     The generate_tokens() generator requires one argument, readline, which
     must be a callable object which provides the same interface as the
     readline() method of built-in file objects.  Each call to the function
-    should return one line of input as a string.
+    should return one line of input as a string.  Alternatively, readline
+    may be a callable object that signals completion by raising StopIteration:
+        readline = open(myfile).next    # Example of alternate readline
 
     The generator produces 5-tuples with these members: the token type; the
     token string; a 2-tuple (srow, scol) of ints specifying the row and
@@ -180,7 +223,10 @@ def generate_tokens(readline):
     indents = [0]
 
     while 1:                                   # loop over lines in stream
-        line = readline()
+        try:
+            line = readline()
+        except StopIteration:
+            line = ''
         lnum = lnum + 1
         pos, max = 0, len(line)
diff --git a/Misc/NEWS b/Misc/NEWS
index f15e87390b0..4763598d3ae 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -141,6 +141,11 @@ Extension Modules
 Library
 -------
 
+- The tokenize module has a new untokenize() function to support a full
+  round trip from lexed tokens back to Python source code.  In addition,
+  the generate_tokens() function now accepts a readline callable that
+  signals the end of input by raising StopIteration.
+
 - Bug #1196315: fix weakref.WeakValueDictionary constructor.
 
 - Bug #1213894: os.path.realpath didn't resolve symlinks that were the first
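
A minimal usage sketch of the new API (not part of the patch itself): it
exercises the untokenize() round trip and the StopIteration-terminated
readline described above.  The sample source string is made up, and
Python 2.5-era syntax is assumed.

    from tokenize import generate_tokens, untokenize

    source = "x = 1 + 2\nif x:\n    print x\n"

    # Any callable that raises StopIteration when exhausted may now serve
    # as readline; the .next method of a line iterator is the simplest case.
    readline = iter(source.splitlines(True)).next
    t1 = [tok[:2] for tok in generate_tokens(readline)]

    # untokenize() needs only the (type, string) pairs.  Spacing in the
    # rebuilt text may differ, but re-tokenizing it yields the same pairs.
    rebuilt = untokenize(t1)
    t2 = [tok[:2] for tok in generate_tokens(iter(rebuilt.splitlines(True)).next)]
    assert t1 == t2

Note that untokenize() appends a space after every NAME token, so the rebuilt
text is equivalent to, rather than byte-identical with, the original source.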