Issue #2134: Add support for tokenize.TokenInfo.exact_type.
This commit is contained in:
parent
3f67ec1afd
commit
00c7f85298
|
@ -15,6 +15,11 @@ implemented in Python. The scanner in this module returns comments as tokens
|
||||||
as well, making it useful for implementing "pretty-printers," including
|
as well, making it useful for implementing "pretty-printers," including
|
||||||
colorizers for on-screen displays.
|
colorizers for on-screen displays.
|
||||||
|
|
||||||
|
To simplify token stream handling, all :ref:`operators` and :ref:`delimiters`
|
||||||
|
tokens are returned using the generic :data:`token.OP` token type. The exact
|
||||||
|
type can be determined by checking the ``exact_type`` property on the
|
||||||
|
:term:`named tuple` returned from :func:`tokenize.tokenize`.
|
||||||
|
|
||||||
Tokenizing Input
|
Tokenizing Input
|
||||||
----------------
|
----------------
|
||||||
|
|
||||||
|
@ -36,9 +41,17 @@ The primary entry point is a :term:`generator`:
|
||||||
returned as a :term:`named tuple` with the field names:
|
returned as a :term:`named tuple` with the field names:
|
||||||
``type string start end line``.
|
``type string start end line``.
|
||||||
|
|
||||||
|
The returned :term:`named tuple` has an additional property named
|
||||||
|
``exact_type`` that contains the exact operator type for
|
||||||
|
:data:`token.OP` tokens. For all other token types ``exact_type``
|
||||||
|
equals the named tuple ``type`` field.
|
||||||
|
|
||||||
.. versionchanged:: 3.1
|
.. versionchanged:: 3.1
|
||||||
Added support for named tuples.
|
Added support for named tuples.
|
||||||
|
|
||||||
|
.. versionchanged:: 3.3
|
||||||
|
Added support for ``exact_type``.
|
||||||
|
|
||||||
:func:`tokenize` determines the source encoding of the file by looking for a
|
:func:`tokenize` determines the source encoding of the file by looking for a
|
||||||
UTF-8 BOM or encoding cookie, according to :pep:`263`.
|
UTF-8 BOM or encoding cookie, according to :pep:`263`.
|
||||||
|
|
||||||
|
@ -131,7 +144,19 @@ It is as simple as:
|
||||||
|
|
||||||
.. code-block:: sh
|
.. code-block:: sh
|
||||||
|
|
||||||
python -m tokenize [filename.py]
|
python -m tokenize [-e] [filename.py]
|
||||||
|
|
||||||
|
The following options are accepted:
|
||||||
|
|
||||||
|
.. program:: tokenize
|
||||||
|
|
||||||
|
.. cmdoption:: -h, --help
|
||||||
|
|
||||||
|
show this help message and exit
|
||||||
|
|
||||||
|
.. cmdoption:: -e, --exact
|
||||||
|
|
||||||
|
display token names using the exact type
|
||||||
|
|
||||||
If :file:`filename.py` is specified its contents are tokenized to stdout.
|
If :file:`filename.py` is specified its contents are tokenized to stdout.
|
||||||
Otherwise, tokenization is performed on stdin.
|
Otherwise, tokenization is performed on stdin.
|
||||||
|
@ -215,3 +240,29 @@ the name of the token, and the final column is the value of the token (if any)
|
||||||
4,10-4,11: OP ')'
|
4,10-4,11: OP ')'
|
||||||
4,11-4,12: NEWLINE '\n'
|
4,11-4,12: NEWLINE '\n'
|
||||||
5,0-5,0: ENDMARKER ''
|
5,0-5,0: ENDMARKER ''
|
||||||
|
|
||||||
|
The exact token type names can be displayed using the ``-e`` option:
|
||||||
|
|
||||||
|
.. code-block:: sh
|
||||||
|
|
||||||
|
$ python -m tokenize -e hello.py
|
||||||
|
0,0-0,0: ENCODING 'utf-8'
|
||||||
|
1,0-1,3: NAME 'def'
|
||||||
|
1,4-1,13: NAME 'say_hello'
|
||||||
|
1,13-1,14: LPAR '('
|
||||||
|
1,14-1,15: RPAR ')'
|
||||||
|
1,15-1,16: COLON ':'
|
||||||
|
1,16-1,17: NEWLINE '\n'
|
||||||
|
2,0-2,4: INDENT ' '
|
||||||
|
2,4-2,9: NAME 'print'
|
||||||
|
2,9-2,10: LPAR '('
|
||||||
|
2,10-2,25: STRING '"Hello, World!"'
|
||||||
|
2,25-2,26: RPAR ')'
|
||||||
|
2,26-2,27: NEWLINE '\n'
|
||||||
|
3,0-3,1: NL '\n'
|
||||||
|
4,0-4,0: DEDENT ''
|
||||||
|
4,0-4,9: NAME 'say_hello'
|
||||||
|
4,9-4,10: LPAR '('
|
||||||
|
4,10-4,11: RPAR ')'
|
||||||
|
4,11-4,12: NEWLINE '\n'
|
||||||
|
5,0-5,0: ENDMARKER ''
|
||||||
|
|
|
@ -567,11 +567,12 @@ Non-ascii identifiers
|
||||||
|
|
||||||
from test import support
|
from test import support
|
||||||
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
|
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
|
||||||
STRING, ENDMARKER, tok_name, detect_encoding,
|
STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
|
||||||
open as tokenize_open)
|
open as tokenize_open)
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from unittest import TestCase
|
from unittest import TestCase
|
||||||
import os, sys, glob
|
import os, sys, glob
|
||||||
|
import token
|
||||||
|
|
||||||
def dump_tokens(s):
|
def dump_tokens(s):
|
||||||
"""Print out the tokens in s in a table format.
|
"""Print out the tokens in s in a table format.
|
||||||
|
@ -922,6 +923,78 @@ class TestTokenize(TestCase):
|
||||||
|
|
||||||
self.assertTrue(encoding_used, encoding)
|
self.assertTrue(encoding_used, encoding)
|
||||||
|
|
||||||
|
def assertExactTypeEqual(self, opstr, *optypes):
|
||||||
|
tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
|
||||||
|
num_optypes = len(optypes)
|
||||||
|
self.assertEqual(len(tokens), 2 + num_optypes)
|
||||||
|
self.assertEqual(token.tok_name[tokens[0].exact_type],
|
||||||
|
token.tok_name[ENCODING])
|
||||||
|
for i in range(num_optypes):
|
||||||
|
self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
|
||||||
|
token.tok_name[optypes[i]])
|
||||||
|
self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
|
||||||
|
token.tok_name[token.ENDMARKER])
|
||||||
|
|
||||||
|
def test_exact_type(self):
|
||||||
|
self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
|
||||||
|
self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
|
||||||
|
self.assertExactTypeEqual(':', token.COLON)
|
||||||
|
self.assertExactTypeEqual(',', token.COMMA)
|
||||||
|
self.assertExactTypeEqual(';', token.SEMI)
|
||||||
|
self.assertExactTypeEqual('+', token.PLUS)
|
||||||
|
self.assertExactTypeEqual('-', token.MINUS)
|
||||||
|
self.assertExactTypeEqual('*', token.STAR)
|
||||||
|
self.assertExactTypeEqual('/', token.SLASH)
|
||||||
|
self.assertExactTypeEqual('|', token.VBAR)
|
||||||
|
self.assertExactTypeEqual('&', token.AMPER)
|
||||||
|
self.assertExactTypeEqual('<', token.LESS)
|
||||||
|
self.assertExactTypeEqual('>', token.GREATER)
|
||||||
|
self.assertExactTypeEqual('=', token.EQUAL)
|
||||||
|
self.assertExactTypeEqual('.', token.DOT)
|
||||||
|
self.assertExactTypeEqual('%', token.PERCENT)
|
||||||
|
self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
|
||||||
|
self.assertExactTypeEqual('==', token.EQEQUAL)
|
||||||
|
self.assertExactTypeEqual('!=', token.NOTEQUAL)
|
||||||
|
self.assertExactTypeEqual('<=', token.LESSEQUAL)
|
||||||
|
self.assertExactTypeEqual('>=', token.GREATEREQUAL)
|
||||||
|
self.assertExactTypeEqual('~', token.TILDE)
|
||||||
|
self.assertExactTypeEqual('^', token.CIRCUMFLEX)
|
||||||
|
self.assertExactTypeEqual('<<', token.LEFTSHIFT)
|
||||||
|
self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
|
||||||
|
self.assertExactTypeEqual('**', token.DOUBLESTAR)
|
||||||
|
self.assertExactTypeEqual('+=', token.PLUSEQUAL)
|
||||||
|
self.assertExactTypeEqual('-=', token.MINEQUAL)
|
||||||
|
self.assertExactTypeEqual('*=', token.STAREQUAL)
|
||||||
|
self.assertExactTypeEqual('/=', token.SLASHEQUAL)
|
||||||
|
self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
|
||||||
|
self.assertExactTypeEqual('&=', token.AMPEREQUAL)
|
||||||
|
self.assertExactTypeEqual('|=', token.VBAREQUAL)
|
||||||
|
self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
|
||||||
|
self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
|
||||||
|
self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
|
||||||
|
self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
|
||||||
|
self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
|
||||||
|
self.assertExactTypeEqual('//', token.DOUBLESLASH)
|
||||||
|
self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
|
||||||
|
self.assertExactTypeEqual('@', token.AT)
|
||||||
|
|
||||||
|
self.assertExactTypeEqual('a**2+b**2==c**2',
|
||||||
|
NAME, token.DOUBLESTAR, NUMBER,
|
||||||
|
token.PLUS,
|
||||||
|
NAME, token.DOUBLESTAR, NUMBER,
|
||||||
|
token.EQEQUAL,
|
||||||
|
NAME, token.DOUBLESTAR, NUMBER)
|
||||||
|
self.assertExactTypeEqual('{1, 2, 3}',
|
||||||
|
token.LBRACE,
|
||||||
|
token.NUMBER, token.COMMA,
|
||||||
|
token.NUMBER, token.COMMA,
|
||||||
|
token.NUMBER,
|
||||||
|
token.RBRACE)
|
||||||
|
self.assertExactTypeEqual('^(x & 0x1)',
|
||||||
|
token.CIRCUMFLEX,
|
||||||
|
token.LPAR,
|
||||||
|
token.NAME, token.AMPER, token.NUMBER,
|
||||||
|
token.RPAR)
|
||||||
|
|
||||||
__test__ = {"doctests" : doctests, 'decistmt': decistmt}
|
__test__ = {"doctests" : doctests, 'decistmt': decistmt}
|
||||||
|
|
||||||
|
|
|
@ -45,6 +45,51 @@ tok_name[NL] = 'NL'
|
||||||
ENCODING = N_TOKENS + 2
|
ENCODING = N_TOKENS + 2
|
||||||
tok_name[ENCODING] = 'ENCODING'
|
tok_name[ENCODING] = 'ENCODING'
|
||||||
N_TOKENS += 3
|
N_TOKENS += 3
|
||||||
|
EXACT_TOKEN_TYPES = {
|
||||||
|
'(': LPAR,
|
||||||
|
')': RPAR,
|
||||||
|
'[': LSQB,
|
||||||
|
']': RSQB,
|
||||||
|
':': COLON,
|
||||||
|
',': COMMA,
|
||||||
|
';': SEMI,
|
||||||
|
'+': PLUS,
|
||||||
|
'-': MINUS,
|
||||||
|
'*': STAR,
|
||||||
|
'/': SLASH,
|
||||||
|
'|': VBAR,
|
||||||
|
'&': AMPER,
|
||||||
|
'<': LESS,
|
||||||
|
'>': GREATER,
|
||||||
|
'=': EQUAL,
|
||||||
|
'.': DOT,
|
||||||
|
'%': PERCENT,
|
||||||
|
'{': LBRACE,
|
||||||
|
'}': RBRACE,
|
||||||
|
'==': EQEQUAL,
|
||||||
|
'!=': NOTEQUAL,
|
||||||
|
'<=': LESSEQUAL,
|
||||||
|
'>=': GREATEREQUAL,
|
||||||
|
'~': TILDE,
|
||||||
|
'^': CIRCUMFLEX,
|
||||||
|
'<<': LEFTSHIFT,
|
||||||
|
'>>': RIGHTSHIFT,
|
||||||
|
'**': DOUBLESTAR,
|
||||||
|
'+=': PLUSEQUAL,
|
||||||
|
'-=': MINEQUAL,
|
||||||
|
'*=': STAREQUAL,
|
||||||
|
'/=': SLASHEQUAL,
|
||||||
|
'%=': PERCENTEQUAL,
|
||||||
|
'&=': AMPEREQUAL,
|
||||||
|
'|=': VBAREQUAL,
|
||||||
|
'^=': CIRCUMFLEXEQUAL,
|
||||||
|
'<<=': LEFTSHIFTEQUAL,
|
||||||
|
'>>=': RIGHTSHIFTEQUAL,
|
||||||
|
'**=': DOUBLESTAREQUAL,
|
||||||
|
'//': DOUBLESLASH,
|
||||||
|
'//=': DOUBLESLASHEQUAL,
|
||||||
|
'@': AT
|
||||||
|
}
|
||||||
|
|
||||||
class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
|
class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
@ -52,6 +97,13 @@ class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line'
|
||||||
return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
|
return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
|
||||||
self._replace(type=annotated_type))
|
self._replace(type=annotated_type))
|
||||||
|
|
||||||
|
@property
|
||||||
|
def exact_type(self):
|
||||||
|
if self.type == OP and self.string in EXACT_TOKEN_TYPES:
|
||||||
|
return EXACT_TOKEN_TYPES[self.string]
|
||||||
|
else:
|
||||||
|
return self.type
|
||||||
|
|
||||||
def group(*choices): return '(' + '|'.join(choices) + ')'
|
def group(*choices): return '(' + '|'.join(choices) + ')'
|
||||||
def any(*choices): return group(*choices) + '*'
|
def any(*choices): return group(*choices) + '*'
|
||||||
def maybe(*choices): return group(*choices) + '?'
|
def maybe(*choices): return group(*choices) + '?'
|
||||||
|
@ -549,6 +601,8 @@ def main():
|
||||||
parser.add_argument(dest='filename', nargs='?',
|
parser.add_argument(dest='filename', nargs='?',
|
||||||
metavar='filename.py',
|
metavar='filename.py',
|
||||||
help='the file to tokenize; defaults to stdin')
|
help='the file to tokenize; defaults to stdin')
|
||||||
|
parser.add_argument('-e', '--exact', dest='exact', action='store_true',
|
||||||
|
help='display token names using the exact type')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -563,9 +617,12 @@ def main():
|
||||||
|
|
||||||
# Output the tokenization
|
# Output the tokenization
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
|
token_type = token.type
|
||||||
|
if args.exact:
|
||||||
|
token_type = token.exact_type
|
||||||
token_range = "%d,%d-%d,%d:" % (token.start + token.end)
|
token_range = "%d,%d-%d,%d:" % (token.start + token.end)
|
||||||
print("%-20s%-15s%-15r" %
|
print("%-20s%-15s%-15r" %
|
||||||
(token_range, tok_name[token.type], token.string))
|
(token_range, tok_name[token_type], token.string))
|
||||||
except IndentationError as err:
|
except IndentationError as err:
|
||||||
line, column = err.args[1][1:3]
|
line, column = err.args[1][1:3]
|
||||||
error(err.args[0], filename, (line, column))
|
error(err.args[0], filename, (line, column))
|
||||||
|
|
|
@ -450,6 +450,9 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #2134: A new attribute that specifies the exact type of token.OP
|
||||||
|
tokens has been added to tokenize.TokenInfo.
|
||||||
|
|
||||||
- Issue #13722: Avoid silencing ImportErrors when initializing the codecs
|
- Issue #13722: Avoid silencing ImportErrors when initializing the codecs
|
||||||
registry.
|
registry.
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue