Ka-Ping's much improved version of March 26, 1997:
# Ignore now accepts \f as whitespace. Operator now includes '**'.
# Ignore and Special now accept \n or \r\n at the end of a line.
# Imagnumber is new. Expfloat is corrected to reject '0e4'.
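The regex changes named above are easiest to see in isolation. The sketch below is not part of the commit: it transcribes the new Expfloat and Imagnumber patterns from the old regex module's emacs-style syntax (where alternation is written \( ... \| ... \) and a literal backslash is written [\]) into the modern re module, purely to demonstrate the behavior the message describes.

import re

def group(*choices):
    # same helper as in the diff, but in re syntax: (a|b) instead of \(a\|b\)
    return '(' + '|'.join(choices) + ')'

Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + group(Exponent) + '?'
Expfloat = r'[1-9][0-9]*' + Exponent           # no leading zero: rejects '0e4'
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'0[jJ]', r'[1-9][0-9]*[jJ]', Floatnumber + r'[jJ]')

def full_match(pattern, text):
    return re.match(pattern + r'$', text) is not None

assert full_match(Expfloat, '1e4')
assert not full_match(Expfloat, '0e4')         # corrected, as the message says
assert full_match(Imagnumber, '3j')            # Imagnumber is new
assert full_match(Imagnumber, '1.5e-3J')

The module itself compiles these patterns with the regex module under regex.set_syntax(0), as the diff below shows.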
parent 24dacb38c5
commit 1aec32363f
168  Lib/tokenize.py
@@ -1,158 +1,182 @@
"""Tokenization help for Python programs.
|
||||
|
||||
This module compiles a regular expression that recognizes Python
|
||||
tokens in individual lines of text. The regular expression handles
|
||||
everything except indentation, continuations, and triple-quoted
|
||||
strings. The function 'tokenize.tokenize()' takes care of these
|
||||
things for streams of text. It accepts a readline-like function which
|
||||
is called repeatedly to come up with the next input line (or "" for
|
||||
EOF), and a "token-eater" function which is called for each token
|
||||
found, passing its type, a string containing the token, the line
|
||||
number, the line, and the starting and ending positions of the token
|
||||
within the line. It is designed to match the working of the Python
|
||||
tokenizer exactly.
|
||||
This module exports a function called 'tokenize()' that breaks a stream of
|
||||
text into Python tokens. It accepts a readline-like method which is called
|
||||
repeatedly to get the next line of input (or "" for EOF) and a "token-eater"
|
||||
function which is called once for each token found. The latter function is
|
||||
passed the token type, a string containing the token, the starting and
|
||||
ending (row, column) coordinates of the token, and the original line. It is
|
||||
designed to match the working of the Python tokenizer exactly, except that
|
||||
it produces COMMENT tokens for comments and gives type OP for all operators.
|
||||
|
||||
"""
|
||||
For compatibility with the older 'tokenize' module, this also compiles a
|
||||
regular expression into 'tokenprog' that matches Python tokens in individual
|
||||
lines of text, leaving the token in 'tokenprog.group(3)', but does not
|
||||
handle indentation, continuations, or multi-line strings."""
|
||||
|
||||
__version__ = "Ka-Ping Yee, 4 March 1997, updated by GvR, 10 March 1997"
|
||||
__version__ = "Ka-Ping Yee, 26 March 1997"
|
||||
|
||||
 import string, regex
 from token import *

 COMMENT = N_TOKENS
 tok_name[COMMENT] = 'COMMENT'

+# Changes from 1.3:
+# Ignore now accepts \f as whitespace. Operator now includes '**'.
+# Ignore and Special now accept \n or \r\n at the end of a line.
+# Imagnumber is new. Expfloat is corrected to reject '0e4'.
+# Note: to get a quoted backslash in a regex, it must be enclosed in brackets.

 def group(*choices): return '\(' + string.join(choices, '\|') + '\)'

-Ignore = '[ \f\t]*\([\]\r?\n[ \t]*\)*\(#.*\)?'
+Whitespace = '[ \f\t]*'
+Comment = '\(#[^\r\n]*\)'
+Ignore = Whitespace + group('[\]\r?\n' + Whitespace)+'*' + Comment+'?'
 Name = '[a-zA-Z_][a-zA-Z0-9_]*'

-ImagZero = '0[jJ]' # This is not caught by any of the following
 Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
 Octnumber = '0[0-7]*[lL]?'
-Decnumber = '[1-9][0-9]*[lLjJ]?'
-Intnumber = group(ImagZero, Hexnumber, Octnumber, Decnumber)
+Decnumber = '[1-9][0-9]*[lL]?'
+Intnumber = group(Hexnumber, Octnumber, Decnumber)
 Exponent = '[eE][-+]?[0-9]+'
 Pointfloat = group('[0-9]+\.[0-9]*', '\.[0-9]+') + group(Exponent) + '?'
-Expfloat = '[0-9]+' + Exponent
-Floatnumber = group(Pointfloat, Expfloat) + "[jJ]?"
-Number = group(Floatnumber, Intnumber)
+Expfloat = '[1-9][0-9]*' + Exponent
+Floatnumber = group(Pointfloat, Expfloat)
+Imagnumber = group('0[jJ]', '[1-9][0-9]*[jJ]', Floatnumber + '[jJ]')
+Number = group(Imagnumber, Floatnumber, Intnumber)

-Single = group('^\'', '[^\]\'')
+Single = group("^'", "[^\]'")
 Double = group('^"', '[^\]"')
-Tsingle = group('^\'\'\'', '[^\]\'\'\'')
-Tdouble = group('^"""', '[^\]"""')
-Triple = group('\'\'\'', '"""')
-String = group('\'' + group('[\].', '[^\'\]')+ '*' + group('\'', '[\]\n'),
-               '"' + group('[\].', '[^"\]') + '*' + group('"', '[\]\n'))
+Single3 = group("^'''", "[^\]'''")
+Double3 = group('^"""', '[^\]"""')
+Triple = group("'''", '"""')
+String = group("'" + group('[\].', "[^\n'\]") + "*'",
+               '"' + group('[\].', '[^\n"\]') + '*"')

 Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
                  '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
 Bracket = '[][(){}]'
-Special = group('[\]?\r?\n', '[:;.,`\f]')
+Special = group('\r?\n', '[:;.,`]')
 Funny = group(Operator, Bracket, Special)

-PlainToken = group(Name, Number, Triple, String, Funny)
+PlainToken = group(Name, Number, String, Funny)
 Token = Ignore + PlainToken

+ContStr = group("'" + group('[\].', "[^\n'\]")+'*' + group("'", '[\]\r?\n'),
+                '"' + group('[\].', '[^\n"\]')+'*' + group('"', '[\]\r?\n'))
+PseudoExtras = group('[\]\r?\n', Comment, Triple)
+PseudoToken = Whitespace + group(PseudoExtras, Name, Number, ContStr, Funny)

 try:
-    save_syntax = regex.set_syntax(0) # use default syntax
+    saved_syntax = regex.set_syntax(0) # use default syntax
     tokenprog = regex.compile(Token)
+    pseudoprog = regex.compile(PseudoToken)
     endprogs = { '\'': regex.compile(Single), '"': regex.compile(Double),
-                 '\'\'\'': regex.compile(Tsingle), '"""': regex.compile(Tdouble) }
+                 '\'\'\'': regex.compile(Single3), '"""': regex.compile(Double3) }
 finally:
-    regex.set_syntax(save_syntax) # restore original syntax
+    regex.set_syntax(saved_syntax) # restore original syntax

 tabsize = 8
 TokenError = 'TokenError'
-def printtoken(type, string, linenum, line, start, end): # for testing
-    print `linenum` + ':', tok_name[type], repr(string)
+def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
+    print "%d,%d-%d,%d:\t%s\t%s" % \
+        (srow, scol, erow, ecol, tok_name[type], repr(token))

-def tokenize(readline, tokeneater = printtoken):
-    linenum = parenlev = continued = 0
+def tokenize(readline, tokeneater=printtoken):
+    lnum = parenlev = continued = 0
     namechars, numchars = string.letters + '_', string.digits
     contstr = ''
     indents = [0]

     while 1: # loop over lines in stream
         line = readline()
-        linenum = linenum + 1
-        if line[-2:] == '\r\n': line = line[:-2] + '\n'
+        lnum = lnum + 1
         pos, max = 0, len(line)

         if contstr: # continued string
             if not line: raise TokenError, "EOF within multi-line string"
             if contstr[-2:] == '\\\n': contstr = contstr[:-2] + '\n'
             if endprog.search(line) >= 0:
                 pos = end = endprog.regs[0][1]
-                tokeneater(STRING, contstr + line[:end], linenum, line, 0, 0)
+                tokeneater(STRING, contstr + line[:end],
+                           strstart, (lnum, end), line)
                 contstr = ''
             else:
                 contstr = contstr + line
                 continue

-        elif parenlev == 0 and not continued: # this is a new statement
+        elif parenlev == 0 and not continued: # new statement
             if not line: break
             column = 0
-            while 1: # measure leading whitespace
+            while pos < max: # measure leading whitespace
                 if line[pos] == ' ': column = column + 1
-                elif line[pos] == '\t': column = (column/tabsize + 1) * tabsize
+                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                 elif line[pos] == '\f': column = 0
                 else: break
                 pos = pos + 1
-            if line[pos] in '#\n': continue # skip comments or blank lines
+            if pos == max: break

+            if line[pos] in '#\r\n': # skip comments or blank lines
+                tokeneater((NEWLINE, COMMENT)[line[pos] == '#'], line[pos:],
+                           (lnum, pos), (lnum, len(line)), line)
+                continue

             if column > indents[-1]: # count indents or dedents
                 indents.append(column)
-                tokeneater(INDENT, '\t', linenum, line, 0, 0)
+                tokeneater(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
             while column < indents[-1]:
                 indents = indents[:-1]
-                tokeneater(DEDENT, '\t', linenum, line, 0, 0)
+                tokeneater(DEDENT, line[:pos], (lnum, 0), (lnum, pos), line)

         else: # continued statement
             if not line: raise TokenError, "EOF within multi-line statement"
             continued = 0

         while pos < max:
-            if tokenprog.match(line, pos) > 0: # scan for tokens
-                start, end = tokenprog.regs[3]
-                token = line[start:end]
+            if pseudoprog.match(line, pos) > 0: # scan for tokens
+                start, end = pseudoprog.regs[1]
+                spos, epos = (lnum, start), (lnum, end)
+                token, initial = line[start:end], line[start]
                 pos = end

-                if token[0] in namechars: # ordinary name
-                    tokeneater(NAME, token, linenum, line, start, end)
-                elif token[0] in numchars: # ordinary number
-                    tokeneater(NUMBER, token, linenum, line, start, end)

+                if initial in namechars: # ordinary name
+                    tokeneater(NAME, token, spos, epos, line)
+                elif initial in numchars: # ordinary number
+                    tokeneater(NUMBER, token, spos, epos, line)
+                elif initial in '\r\n':
+                    tokeneater(NEWLINE, token, spos, epos, line)
+                elif initial == '#':
+                    tokeneater(COMMENT, token, spos, epos, line)
+                elif initial == '\\': # continued stmt
+                    continued = 1
                 elif token in ('\'\'\'', '"""'): # triple-quoted
                     endprog = endprogs[token]
                     if endprog.search(line, pos) >= 0: # all on one line
                         pos = endprog.regs[0][1]
-                        token = line[start:pos]
-                        tokeneater(STRING, token, linenum, line, start, pos)
+                        token = line[start:pos]
+                        tokeneater(STRING, token, spos, (lnum, pos), line)
                     else:
-                        contstr = line[start:] # multiple lines
+                        strstart = (lnum, start) # multiple lines
+                        contstr = line[start:]
                         break
-                elif token[0] in '\'"':
+                elif initial in '\'"':
                     if token[-1] == '\n': # continued string
-                        endprog, contstr = endprogs[token[0]], line[start:]
+                        strstart = (lnum, start)
+                        endprog, contstr = endprogs[initial], line[start:]
                         break
                     else: # ordinary string
-                        tokeneater(STRING, token, linenum, line, start, end)

-                elif token[0] == '\n':
-                    tokeneater(NEWLINE, token, linenum, line, start, end)
-                elif token[0] == '\\': # continued stmt
-                    continued = 1

+                        tokeneater(STRING, token, spos, epos, line)
                 else:
-                    if token[0] in '([{': parenlev = parenlev + 1
-                    if token[0] in ')]}': parenlev = parenlev - 1
-                    tokeneater(OP, token, linenum, line, start, end)
+                    if initial in '([{': parenlev = parenlev + 1
+                    elif initial in ')]}': parenlev = parenlev - 1
+                    tokeneater(OP, token, spos, epos, line)
             else:
-                tokeneater(ERRORTOKEN, line[pos], linenum, line, pos, pos + 1)
+                tokeneater(ERRORTOKEN, line[pos], spos, (lnum, pos+1), line)
                 pos = pos + 1

     for indent in indents[1:]: # pop remaining indent levels
-        tokeneater(DEDENT, '\t', linenum, line, 0, 0)
+        tokeneater(DEDENT, '', (lnum, 0), (lnum, 0), '')

 if __name__ == '__main__': # testing
     import sys
-    file = open(sys.argv[-1])
-    tokenize(file.readline)
+    tokenize(open(sys.argv[-1]).readline)
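As the updated docstring and printtoken() show, the token-eater now receives (row, column) start and end coordinates plus the original line, instead of a line number and in-line offsets. Below is a minimal usage sketch, not part of the commit, written in the same era's Python (tuple parameters and print statements, as in printtoken above); 'example.py' is only a stand-in file name.

import tokenize

def eater(type, token, (srow, scol), (erow, ecol), line):
    # Each token arrives with its type, its text, and its (row, column) span.
    print tokenize.tok_name[type], repr(token), (srow, scol), (erow, ecol)

tokenize.tokenize(open('example.py').readline, eater)

If no token-eater is passed, printtoken() is used by default, which is what the __main__ test at the bottom of the file relies on.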