Issue #2134: Add support for tokenize.TokenInfo.exact_type.
This commit is contained in:
parent
3f67ec1afd
commit
00c7f85298
|
@ -15,6 +15,11 @@ implemented in Python. The scanner in this module returns comments as tokens
|
||||||
as well, making it useful for implementing "pretty-printers," including
|
as well, making it useful for implementing "pretty-printers," including
|
||||||
colorizers for on-screen displays.
|
colorizers for on-screen displays.
|
||||||
|
|
||||||
|
To simplify token stream handling, all :ref:`operators` and :ref:`delimiters`
|
||||||
|
tokens are returned using the generic :data:`token.OP` token type. The exact
|
||||||
|
type can be determined by checking the ``exact_type`` property on the
|
||||||
|
:term:`named tuple` returned from :func:`tokenize.tokenize`.
|
||||||
|
|
||||||
Tokenizing Input
|
Tokenizing Input
|
||||||
----------------
|
----------------
|
||||||
|
|
||||||
|
@ -36,9 +41,17 @@ The primary entry point is a :term:`generator`:
|
||||||
returned as a :term:`named tuple` with the field names:
|
returned as a :term:`named tuple` with the field names:
|
||||||
``type string start end line``.
|
``type string start end line``.
|
||||||
|
|
||||||
|
The returned :term:`named tuple` has an additional property named
|
||||||
|
``exact_type`` that contains the exact operator type for
|
||||||
|
:data:`token.OP` tokens. For all other token types ``exact_type``
|
||||||
|
equals the named tuple ``type`` field.
|
||||||
|
|
||||||
.. versionchanged:: 3.1
|
.. versionchanged:: 3.1
|
||||||
Added support for named tuples.
|
Added support for named tuples.
|
||||||
|
|
||||||
|
.. versionchanged:: 3.3
|
||||||
|
Added support for ``exact_type``.
|
||||||
|
|
||||||
:func:`tokenize` determines the source encoding of the file by looking for a
|
:func:`tokenize` determines the source encoding of the file by looking for a
|
||||||
UTF-8 BOM or encoding cookie, according to :pep:`263`.
|
UTF-8 BOM or encoding cookie, according to :pep:`263`.
|
||||||
|
|
||||||
|
@ -131,7 +144,19 @@ It is as simple as:
|
||||||
|
|
||||||
.. code-block:: sh
|
.. code-block:: sh
|
||||||
|
|
||||||
python -m tokenize [filename.py]
|
python -m tokenize [-e] [filename.py]
|
||||||
|
|
||||||
|
The following options are accepted:
|
||||||
|
|
||||||
|
.. program:: tokenize
|
||||||
|
|
||||||
|
.. cmdoption:: -h, --help
|
||||||
|
|
||||||
|
show this help message and exit
|
||||||
|
|
||||||
|
.. cmdoption:: -e, --exact
|
||||||
|
|
||||||
|
display token names using the exact type
|
||||||
|
|
||||||
If :file:`filename.py` is specified its contents are tokenized to stdout.
|
If :file:`filename.py` is specified its contents are tokenized to stdout.
|
||||||
Otherwise, tokenization is performed on stdin.
|
Otherwise, tokenization is performed on stdin.
|
||||||
|
@ -215,3 +240,29 @@ the name of the token, and the final column is the value of the token (if any)
|
||||||
4,10-4,11: OP ')'
|
4,10-4,11: OP ')'
|
||||||
4,11-4,12: NEWLINE '\n'
|
4,11-4,12: NEWLINE '\n'
|
||||||
5,0-5,0: ENDMARKER ''
|
5,0-5,0: ENDMARKER ''
|
||||||
|
|
||||||
|
The exact token type names can be displayed using the ``-e`` option:
|
||||||
|
|
||||||
|
.. code-block:: sh
|
||||||
|
|
||||||
|
$ python -m tokenize -e hello.py
|
||||||
|
0,0-0,0: ENCODING 'utf-8'
|
||||||
|
1,0-1,3: NAME 'def'
|
||||||
|
1,4-1,13: NAME 'say_hello'
|
||||||
|
1,13-1,14: LPAR '('
|
||||||
|
1,14-1,15: RPAR ')'
|
||||||
|
1,15-1,16: COLON ':'
|
||||||
|
1,16-1,17: NEWLINE '\n'
|
||||||
|
2,0-2,4: INDENT ' '
|
||||||
|
2,4-2,9: NAME 'print'
|
||||||
|
2,9-2,10: LPAR '('
|
||||||
|
2,10-2,25: STRING '"Hello, World!"'
|
||||||
|
2,25-2,26: RPAR ')'
|
||||||
|
2,26-2,27: NEWLINE '\n'
|
||||||
|
3,0-3,1: NL '\n'
|
||||||
|
4,0-4,0: DEDENT ''
|
||||||
|
4,0-4,9: NAME 'say_hello'
|
||||||
|
4,9-4,10: LPAR '('
|
||||||
|
4,10-4,11: RPAR ')'
|
||||||
|
4,11-4,12: NEWLINE '\n'
|
||||||
|
5,0-5,0: ENDMARKER ''
|
||||||
|
|
|
@ -567,11 +567,12 @@ Non-ascii identifiers
|
||||||
|
|
||||||
from test import support
|
from test import support
|
||||||
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
|
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
|
||||||
STRING, ENDMARKER, tok_name, detect_encoding,
|
STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
|
||||||
open as tokenize_open)
|
open as tokenize_open)
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from unittest import TestCase
|
from unittest import TestCase
|
||||||
import os, sys, glob
|
import os, sys, glob
|
||||||
|
import token
|
||||||
|
|
||||||
def dump_tokens(s):
|
def dump_tokens(s):
|
||||||
"""Print out the tokens in s in a table format.
|
"""Print out the tokens in s in a table format.
|
||||||
|
@ -922,6 +923,78 @@ class TestTokenize(TestCase):
|
||||||
|
|
||||||
self.assertTrue(encoding_used, encoding)
|
self.assertTrue(encoding_used, encoding)
|
||||||
|
|
||||||
|
def assertExactTypeEqual(self, opstr, *optypes):
|
||||||
|
tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
|
||||||
|
num_optypes = len(optypes)
|
||||||
|
self.assertEqual(len(tokens), 2 + num_optypes)
|
||||||
|
self.assertEqual(token.tok_name[tokens[0].exact_type],
|
||||||
|
token.tok_name[ENCODING])
|
||||||
|
for i in range(num_optypes):
|
||||||
|
self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
|
||||||
|
token.tok_name[optypes[i]])
|
||||||
|
self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
|
||||||
|
token.tok_name[token.ENDMARKER])
|
||||||
|
|
||||||
|
def test_exact_type(self):
|
||||||
|
self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
|
||||||
|
self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
|
||||||
|
self.assertExactTypeEqual(':', token.COLON)
|
||||||
|
self.assertExactTypeEqual(',', token.COMMA)
|
||||||
|
self.assertExactTypeEqual(';', token.SEMI)
|
||||||
|
self.assertExactTypeEqual('+', token.PLUS)
|
||||||
|
self.assertExactTypeEqual('-', token.MINUS)
|
||||||
|
self.assertExactTypeEqual('*', token.STAR)
|
||||||
|
self.assertExactTypeEqual('/', token.SLASH)
|
||||||
|
self.assertExactTypeEqual('|', token.VBAR)
|
||||||
|
self.assertExactTypeEqual('&', token.AMPER)
|
||||||
|
self.assertExactTypeEqual('<', token.LESS)
|
||||||
|
self.assertExactTypeEqual('>', token.GREATER)
|
||||||
|
self.assertExactTypeEqual('=', token.EQUAL)
|
||||||
|
self.assertExactTypeEqual('.', token.DOT)
|
||||||
|
self.assertExactTypeEqual('%', token.PERCENT)
|
||||||
|
self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
|
||||||
|
self.assertExactTypeEqual('==', token.EQEQUAL)
|
||||||
|
self.assertExactTypeEqual('!=', token.NOTEQUAL)
|
||||||
|
self.assertExactTypeEqual('<=', token.LESSEQUAL)
|
||||||
|
self.assertExactTypeEqual('>=', token.GREATEREQUAL)
|
||||||
|
self.assertExactTypeEqual('~', token.TILDE)
|
||||||
|
self.assertExactTypeEqual('^', token.CIRCUMFLEX)
|
||||||
|
self.assertExactTypeEqual('<<', token.LEFTSHIFT)
|
||||||
|
self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
|
||||||
|
self.assertExactTypeEqual('**', token.DOUBLESTAR)
|
||||||
|
self.assertExactTypeEqual('+=', token.PLUSEQUAL)
|
||||||
|
self.assertExactTypeEqual('-=', token.MINEQUAL)
|
||||||
|
self.assertExactTypeEqual('*=', token.STAREQUAL)
|
||||||
|
self.assertExactTypeEqual('/=', token.SLASHEQUAL)
|
||||||
|
self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
|
||||||
|
self.assertExactTypeEqual('&=', token.AMPEREQUAL)
|
||||||
|
self.assertExactTypeEqual('|=', token.VBAREQUAL)
|
||||||
|
self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
|
||||||
|
self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
|
||||||
|
self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
|
||||||
|
self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
|
||||||
|
self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
|
||||||
|
self.assertExactTypeEqual('//', token.DOUBLESLASH)
|
||||||
|
self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
|
||||||
|
self.assertExactTypeEqual('@', token.AT)
|
||||||
|
|
||||||
|
self.assertExactTypeEqual('a**2+b**2==c**2',
|
||||||
|
NAME, token.DOUBLESTAR, NUMBER,
|
||||||
|
token.PLUS,
|
||||||
|
NAME, token.DOUBLESTAR, NUMBER,
|
||||||
|
token.EQEQUAL,
|
||||||
|
NAME, token.DOUBLESTAR, NUMBER)
|
||||||
|
self.assertExactTypeEqual('{1, 2, 3}',
|
||||||
|
token.LBRACE,
|
||||||
|
token.NUMBER, token.COMMA,
|
||||||
|
token.NUMBER, token.COMMA,
|
||||||
|
token.NUMBER,
|
||||||
|
token.RBRACE)
|
||||||
|
self.assertExactTypeEqual('^(x & 0x1)',
|
||||||
|
token.CIRCUMFLEX,
|
||||||
|
token.LPAR,
|
||||||
|
token.NAME, token.AMPER, token.NUMBER,
|
||||||
|
token.RPAR)
|
||||||
|
|
||||||
__test__ = {"doctests" : doctests, 'decistmt': decistmt}
|
__test__ = {"doctests" : doctests, 'decistmt': decistmt}
|
||||||
|
|
||||||
|
|
|
@ -45,6 +45,51 @@ tok_name[NL] = 'NL'
|
||||||
ENCODING = N_TOKENS + 2
|
ENCODING = N_TOKENS + 2
|
||||||
tok_name[ENCODING] = 'ENCODING'
|
tok_name[ENCODING] = 'ENCODING'
|
||||||
N_TOKENS += 3
|
N_TOKENS += 3
|
||||||
|
EXACT_TOKEN_TYPES = {
|
||||||
|
'(': LPAR,
|
||||||
|
')': RPAR,
|
||||||
|
'[': LSQB,
|
||||||
|
']': RSQB,
|
||||||
|
':': COLON,
|
||||||
|
',': COMMA,
|
||||||
|
';': SEMI,
|
||||||
|
'+': PLUS,
|
||||||
|
'-': MINUS,
|
||||||
|
'*': STAR,
|
||||||
|
'/': SLASH,
|
||||||
|
'|': VBAR,
|
||||||
|
'&': AMPER,
|
||||||
|
'<': LESS,
|
||||||
|
'>': GREATER,
|
||||||
|
'=': EQUAL,
|
||||||
|
'.': DOT,
|
||||||
|
'%': PERCENT,
|
||||||
|
'{': LBRACE,
|
||||||
|
'}': RBRACE,
|
||||||
|
'==': EQEQUAL,
|
||||||
|
'!=': NOTEQUAL,
|
||||||
|
'<=': LESSEQUAL,
|
||||||
|
'>=': GREATEREQUAL,
|
||||||
|
'~': TILDE,
|
||||||
|
'^': CIRCUMFLEX,
|
||||||
|
'<<': LEFTSHIFT,
|
||||||
|
'>>': RIGHTSHIFT,
|
||||||
|
'**': DOUBLESTAR,
|
||||||
|
'+=': PLUSEQUAL,
|
||||||
|
'-=': MINEQUAL,
|
||||||
|
'*=': STAREQUAL,
|
||||||
|
'/=': SLASHEQUAL,
|
||||||
|
'%=': PERCENTEQUAL,
|
||||||
|
'&=': AMPEREQUAL,
|
||||||
|
'|=': VBAREQUAL,
|
||||||
|
'^=': CIRCUMFLEXEQUAL,
|
||||||
|
'<<=': LEFTSHIFTEQUAL,
|
||||||
|
'>>=': RIGHTSHIFTEQUAL,
|
||||||
|
'**=': DOUBLESTAREQUAL,
|
||||||
|
'//': DOUBLESLASH,
|
||||||
|
'//=': DOUBLESLASHEQUAL,
|
||||||
|
'@': AT
|
||||||
|
}
|
||||||
|
|
||||||
class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
|
class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
@ -52,6 +97,13 @@ class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line'
|
||||||
return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
|
return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
|
||||||
self._replace(type=annotated_type))
|
self._replace(type=annotated_type))
|
||||||
|
|
||||||
|
@property
|
||||||
|
def exact_type(self):
|
||||||
|
if self.type == OP and self.string in EXACT_TOKEN_TYPES:
|
||||||
|
return EXACT_TOKEN_TYPES[self.string]
|
||||||
|
else:
|
||||||
|
return self.type
|
||||||
|
|
||||||
def group(*choices): return '(' + '|'.join(choices) + ')'
|
def group(*choices): return '(' + '|'.join(choices) + ')'
|
||||||
def any(*choices): return group(*choices) + '*'
|
def any(*choices): return group(*choices) + '*'
|
||||||
def maybe(*choices): return group(*choices) + '?'
|
def maybe(*choices): return group(*choices) + '?'
|
||||||
|
@ -549,6 +601,8 @@ def main():
|
||||||
parser.add_argument(dest='filename', nargs='?',
|
parser.add_argument(dest='filename', nargs='?',
|
||||||
metavar='filename.py',
|
metavar='filename.py',
|
||||||
help='the file to tokenize; defaults to stdin')
|
help='the file to tokenize; defaults to stdin')
|
||||||
|
parser.add_argument('-e', '--exact', dest='exact', action='store_true',
|
||||||
|
help='display token names using the exact type')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -563,9 +617,12 @@ def main():
|
||||||
|
|
||||||
# Output the tokenization
|
# Output the tokenization
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
|
token_type = token.type
|
||||||
|
if args.exact:
|
||||||
|
token_type = token.exact_type
|
||||||
token_range = "%d,%d-%d,%d:" % (token.start + token.end)
|
token_range = "%d,%d-%d,%d:" % (token.start + token.end)
|
||||||
print("%-20s%-15s%-15r" %
|
print("%-20s%-15s%-15r" %
|
||||||
(token_range, tok_name[token.type], token.string))
|
(token_range, tok_name[token_type], token.string))
|
||||||
except IndentationError as err:
|
except IndentationError as err:
|
||||||
line, column = err.args[1][1:3]
|
line, column = err.args[1][1:3]
|
||||||
error(err.args[0], filename, (line, column))
|
error(err.args[0], filename, (line, column))
|
||||||
|
|
|
@ -450,6 +450,9 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #2134: A new attribute that specifies the exact type of token.OP
|
||||||
|
tokens has been added to tokenize.TokenInfo.
|
||||||
|
|
||||||
- Issue #13722: Avoid silencing ImportErrors when initializing the codecs
|
- Issue #13722: Avoid silencing ImportErrors when initializing the codecs
|
||||||
registry.
|
registry.
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue