Issue #5857: tokenize.tokenize() now returns named tuples.

Raymond Hettinger 2009-04-29 00:34:27 +00:00
parent c1edc2d632
commit a48db39992
2 changed files with 28 additions and 20 deletions

Doc/library/tokenize.rst

@@ -27,7 +27,12 @@ The primary entry point is a :term:`generator`:
    column where the token begins in the source; a 2-tuple ``(erow, ecol)`` of
    ints specifying the row and column where the token ends in the source; and
    the line on which the token was found. The line passed (the last tuple item)
-   is the *logical* line; continuation lines are included.
+   is the *logical* line; continuation lines are included. The 5 tuple is
+   returned as a :term:`named tuple` with the field names:
+   ``type string start end line``.
+
+   .. versionchanged:: 3.1
+      Added support for named tuples.
 
    :func:`tokenize` determines the source encoding of the file by looking for a
    UTF-8 BOM or encoding cookie, according to :pep:`263`.
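By way of illustration (not part of this commit), the new named fields can be read
directly off each token while the result still behaves as a plain 5-tuple. A minimal
sketch, assuming a hypothetical ``example.py`` opened in binary mode so
:func:`tokenize` can detect the encoding itself::

    import tokenize

    with open('example.py', 'rb') as f:
        for tok in tokenize.tokenize(f.readline):
            # Attribute access via the named tuple fields; tok can also be
            # indexed or unpacked exactly like the old tuple result.
            print(tok.type, tok.string, tok.start, tok.end)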

Lib/tokenize.py

@@ -24,6 +24,7 @@ __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
                'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
                'Michael Foord')
+import collections
 import re, string, sys
 from token import *
 from codecs import lookup, BOM_UTF8
@@ -31,7 +32,7 @@ cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
 import token
 __all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
-           "detect_encoding", "NL", "untokenize", "ENCODING"]
+           "detect_encoding", "NL", "untokenize", "ENCODING", "TokenInfo"]
 del token
 
 COMMENT = N_TOKENS
@@ -42,6 +43,8 @@ ENCODING = N_TOKENS + 2
 tok_name[ENCODING] = 'ENCODING'
 N_TOKENS += 3
 
+TokenInfo = collections.namedtuple('TokenInfo', 'type string start end line')
+
 def group(*choices): return '(' + '|'.join(choices) + ')'
 def any(*choices): return group(*choices) + '*'
 def maybe(*choices): return group(*choices) + '?'
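Because ``TokenInfo`` is built with ``collections.namedtuple``, it is still a real
tuple, so existing callers that index or unpack the 5-tuple keep working unchanged.
A minimal sketch of that backward compatibility (not part of the commit; the token
type ``1`` and the sample values are purely illustrative), using the same definition
as above::

    import collections

    TokenInfo = collections.namedtuple('TokenInfo', 'type string start end line')

    tok = TokenInfo(1, 'spam', (1, 0), (1, 4), 'spam = 1\n')
    # Tuple-style unpacking still works exactly as before...
    toktype, string, start, end, line = tok
    # ...while the new attribute access is available as well.
    assert string == tok.string and start == tok.start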
@@ -346,7 +349,7 @@ def _tokenize(readline, encoding):
     indents = [0]
 
     if encoding is not None:
-        yield (ENCODING, encoding, (0, 0), (0, 0), '')
+        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
     while True:                                # loop over lines in stream
         try:
             line = readline()
@@ -364,12 +367,12 @@ def _tokenize(readline, encoding):
             endmatch = endprog.match(line)
             if endmatch:
                 pos = end = endmatch.end(0)
-                yield (STRING, contstr + line[:end],
+                yield TokenInfo(STRING, contstr + line[:end],
                        strstart, (lnum, end), contline + line)
                 contstr, needcont = '', 0
                 contline = None
             elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
-                yield (ERRORTOKEN, contstr + line,
+                yield TokenInfo(ERRORTOKEN, contstr + line,
                        strstart, (lnum, len(line)), contline)
                 contstr = ''
                 contline = None
@@ -394,25 +397,25 @@ def _tokenize(readline, encoding):
                 if line[pos] == '#':
                     comment_token = line[pos:].rstrip('\r\n')
                     nl_pos = pos + len(comment_token)
-                    yield (COMMENT, comment_token,
+                    yield TokenInfo(COMMENT, comment_token,
                            (lnum, pos), (lnum, pos + len(comment_token)), line)
-                    yield (NL, line[nl_pos:],
+                    yield TokenInfo(NL, line[nl_pos:],
                            (lnum, nl_pos), (lnum, len(line)), line)
                 else:
-                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
+                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                            (lnum, pos), (lnum, len(line)), line)
                 continue
 
             if column > indents[-1]:           # count indents or dedents
                 indents.append(column)
-                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
+                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
             while column < indents[-1]:
                 if column not in indents:
                     raise IndentationError(
                         "unindent does not match any outer indentation level",
                         ("<tokenize>", lnum, pos, line))
                 indents = indents[:-1]
-                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
+                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
 
         else:                                  # continued statement
             if not line:
@@ -428,20 +431,20 @@ def _tokenize(readline, encoding):
                 if (initial in numchars or                 # ordinary number
                     (initial == '.' and token != '.' and token != '...')):
-                    yield (NUMBER, token, spos, epos, line)
+                    yield TokenInfo(NUMBER, token, spos, epos, line)
                 elif initial in '\r\n':
-                    yield (NL if parenlev > 0 else NEWLINE,
+                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                            token, spos, epos, line)
                 elif initial == '#':
                     assert not token.endswith("\n")
-                    yield (COMMENT, token, spos, epos, line)
+                    yield TokenInfo(COMMENT, token, spos, epos, line)
                 elif token in triple_quoted:
                     endprog = endprogs[token]
                     endmatch = endprog.match(line, pos)
                     if endmatch:                           # all on one line
                         pos = endmatch.end(0)
                         token = line[start:pos]
-                        yield (STRING, token, spos, (lnum, pos), line)
+                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                     else:
                         strstart = (lnum, start)           # multiple lines
                         contstr = line[start:]
@@ -458,23 +461,23 @@ def _tokenize(readline, encoding):
                         contline = line
                         break
                     else:                                  # ordinary string
-                        yield (STRING, token, spos, epos, line)
+                        yield TokenInfo(STRING, token, spos, epos, line)
                 elif initial in namechars:                 # ordinary name
-                    yield (NAME, token, spos, epos, line)
+                    yield TokenInfo(NAME, token, spos, epos, line)
                 elif initial == '\\':                      # continued stmt
                     continued = 1
                 else:
                     if initial in '([{': parenlev = parenlev + 1
                     elif initial in ')]}': parenlev = parenlev - 1
-                    yield (OP, token, spos, epos, line)
+                    yield TokenInfo(OP, token, spos, epos, line)
             else:
-                yield (ERRORTOKEN, line[pos],
+                yield TokenInfo(ERRORTOKEN, line[pos],
                        (lnum, pos), (lnum, pos+1), line)
                 pos = pos + 1
 
     for indent in indents[1:]:                 # pop remaining indent levels
-        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
-    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
+        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
+    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
 
 # An undocumented, backwards compatible, API for all the places in the standard