Ka-Ping's version.
This commit is contained in:
parent
19700b6a98
commit
fc6f5339a9
173
Lib/tokenize.py
173
Lib/tokenize.py
|
@ -1,63 +1,150 @@
|
||||||
# This module compiles a regular expression that recognizes Python tokens.
|
"""tokenize.py (Ka-Ping Yee, 4 March 1997)
|
||||||
# It is designed to match the working of the Python tokenizer exactly.
|
|
||||||
# It takes care of everything except indentation;
|
|
||||||
# note that un-escaped newlines are tokens, too.
|
|
||||||
# tokenprog.regs[3] gives the location of the token without whitespace
|
|
||||||
# It also defines various subexpressions, but doesn't compile them.
|
|
||||||
# See the function test() below for an example of how to use.
|
|
||||||
|
|
||||||
import regex
|
This module compiles a regular expression that recognizes Python tokens
|
||||||
|
in individual lines of text. The regular expression handles everything
|
||||||
|
except indentation, continuations, and triple-quoted strings. The function
|
||||||
|
'tokenize.tokenize()' takes care of these things for streams of text. It
|
||||||
|
accepts a file-like object and a function, uses the readline() method to
|
||||||
|
scan the file, and calls the function once for each token found,
|
||||||
|
passing its type, a string containing the token, the line number, the line,
|
||||||
|
and the starting and ending positions of the token within the line.
|
||||||
|
It is designed to match the working of the Python tokenizer exactly."""
|
||||||
|
|
||||||
# Note: to get a quoted backslash in a regexp, it must be quadrupled.
|
import string, regex
|
||||||
|
from token import *
|
||||||
|
|
||||||
Ignore = '[ \t]*\(\\\\\n[ \t]*\)*\(#.*\)?'
|
def group(*choices):
    # Combine the given sub-patterns into a single parenthesized
    # alternation: the choices are joined with '\|' and the result is
    # wrapped in '\(' ... '\)'.
    alternatives = string.join(choices, '\|')
    return '\(' + alternatives + '\)'
|
||||||
|
|
||||||
|
Ignore = '[ \f\t]*\([\]\r?\n[ \t]*\)*\(#.*\)?'
|
||||||
Name = '[a-zA-Z_][a-zA-Z0-9_]*'
|
Name = '[a-zA-Z_][a-zA-Z0-9_]*'
|
||||||
|
|
||||||
Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
|
Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
|
||||||
Octnumber = '0[0-7]*[lL]?'
|
Octnumber = '0[0-7]*[lL]?'
|
||||||
Decnumber = '[1-9][0-9]*[lL]?'
|
Decnumber = '[1-9][0-9]*[lL]?'
|
||||||
Intnumber = Hexnumber + '\|' + Octnumber + '\|' + Decnumber
|
Intnumber = group(Hexnumber, Octnumber, Decnumber)
|
||||||
Exponent = '[eE][-+]?[0-9]+'
|
Exponent = '[eE][-+]?[0-9]+'
|
||||||
Pointfloat = '\([0-9]+\.[0-9]*\|\.[0-9]+\)\(' + Exponent + '\)?'
|
Pointfloat = group('[0-9]+\.[0-9]*', '\.[0-9]+') + group(Exponent) + '?'
|
||||||
Expfloat = '[0-9]+' + Exponent
|
Expfloat = '[0-9]+' + Exponent
|
||||||
Floatnumber = Pointfloat + '\|' + Expfloat
|
Floatnumber = group(Pointfloat, Expfloat)
|
||||||
Number = Floatnumber + '\|' + Intnumber
|
Number = group(Floatnumber, Intnumber)
|
||||||
|
|
||||||
String = '\'\(\\\\.\|[^\\\n\']\)*\'' + '\|' + '"\(\\\\.\|[^\\\n"]\)*"'
|
Single = group('^\'', '[^\]\'')
|
||||||
# Note: this module *recognizes* double quotes, but for backward
|
Double = group('^"', '[^\]"')
|
||||||
# compatibility, it doesn't *use* them!
|
Tsingle = group('^\'\'\'', '[^\]\'\'\'')
|
||||||
|
Tdouble = group('^"""', '[^\]"""')
|
||||||
|
Triple = group('\'\'\'', '"""')
|
||||||
|
String = group('\'' + group('[\].', '[^\'\]')+ '*' + group('\'', '[\]\n'),
|
||||||
|
'"' + group('[\].', '[^"\]') + '*' + group('"', '[\]\n'))
|
||||||
|
|
||||||
Operator = '~\|\+\|-\|\*\|/\|%\|\^\|&\||\|<<\|>>\|==\|<=\|<>\|!=\|>=\|=\|<\|>'
|
Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
|
||||||
|
'<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
|
||||||
Bracket = '[][(){}]'
|
Bracket = '[][(){}]'
|
||||||
Special = '[:;.,`\n]'
|
Special = group('[\]?\r?\n', '[:;.,`\f]')
|
||||||
Funny = Operator + '\|' + Bracket + '\|' + Special
|
Funny = group(Operator, Bracket, Special)
|
||||||
|
|
||||||
PlainToken = Name + '\|' + Number + '\|' + String + '\|' + Funny
|
PlainToken = group(Name, Number, Triple, String, Funny)
|
||||||
|
Token = Ignore + PlainToken
|
||||||
Token = Ignore + '\(' + PlainToken + '\)'
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
save_syntax = regex.set_syntax(0) # Use default syntax
|
save_syntax = regex.set_syntax(0) # use default syntax
|
||||||
tokenprog = regex.compile(Token)
|
tokenprog = regex.compile(Token)
|
||||||
|
endprogs = { '\'': regex.compile(Single), '"': regex.compile(Double),
|
||||||
|
'\'\'\'': regex.compile(Tsingle), '"""': regex.compile(Tdouble) }
|
||||||
finally:
|
finally:
|
||||||
if save_syntax != 0:
|
regex.set_syntax(save_syntax) # restore original syntax
|
||||||
dummy = regex.set_syntax(save_syntax) # Restore original syntax
|
|
||||||
|
|
||||||
|
tabsize = 8
|
||||||
|
TokenError = 'TokenError'
|
||||||
|
def printtoken(type, string, linenum, line, start, end): # for testing
|
||||||
|
print `linenum` + ':', tok_name[type], repr(string)
|
||||||
|
|
||||||
def test(file):
|
def tokenize(readline, tokeneater = printtoken):
|
||||||
f = open(file, 'r')
|
linenum = parenlev = continued = 0
|
||||||
while 1:
|
namechars, numchars = string.letters + '_', string.digits
|
||||||
line = f.readline()
|
contstr = ''
|
||||||
if not line: break
|
indents = [0]
|
||||||
i, n = 0, len(line)
|
while 1: # loop over lines in stream
|
||||||
while i < n:
|
line = readline()
|
||||||
j = tokenprog.match(line, i)
|
linenum = linenum + 1
|
||||||
if j < 0:
|
if line[-2:] == '\r\n': line = line[:-2] + '\n'
|
||||||
print 'No token at', `line[i:i+20]` + '...'
|
pos, max = 0, len(line)
|
||||||
i = i+1
|
|
||||||
else:
|
if contstr: # continued string
|
||||||
i = i+j
|
if not line: raise TokenError, "EOF within multi-line string"
|
||||||
a, b = tokenprog.regs[3]
|
if contstr[-2:] == '\\\n': contstr = contstr[:-2] + '\n'
|
||||||
if a < b:
|
if endprog.search(line) >= 0:
|
||||||
print 'Token:', `line[a:b]`
|
pos = end = endprog.regs[0][1]
|
||||||
|
tokeneater(STRING, contstr + line[:end], linenum, line, 0, 0)
|
||||||
|
contstr = ''
|
||||||
|
else:
|
||||||
|
contstr = contstr + line
|
||||||
|
continue
|
||||||
|
|
||||||
|
elif parenlev == 0 and not continued: # this is a new statement
|
||||||
|
if not line: break
|
||||||
|
column = 0
|
||||||
|
while 1: # measure leading whitespace
|
||||||
|
if line[pos] == ' ': column = column + 1
|
||||||
|
elif line[pos] == '\t': column = (column/tabsize + 1) * tabsize
|
||||||
|
elif line[pos] == '\f': column = 0
|
||||||
|
else: break
|
||||||
|
pos = pos + 1
|
||||||
|
if line[pos] in '#\n': continue # skip comments or blank lines
|
||||||
|
|
||||||
|
if column > indents[-1]: # count indents or dedents
|
||||||
|
indents.append(column)
|
||||||
|
tokeneater(INDENT, '\t', linenum, line, 0, 0)
|
||||||
|
while column < indents[-1]:
|
||||||
|
indents = indents[:-1]
|
||||||
|
tokeneater(DEDENT, '\t', linenum, line, 0, 0)
|
||||||
|
|
||||||
|
else: # continued statement
|
||||||
|
if not line: raise TokenError, "EOF within multi-line statement"
|
||||||
|
continued = 0
|
||||||
|
|
||||||
|
while pos < max:
|
||||||
|
if tokenprog.match(line, pos) > 0: # scan for tokens
|
||||||
|
start, end = tokenprog.regs[3]
|
||||||
|
token = line[start:end]
|
||||||
|
pos = end
|
||||||
|
|
||||||
|
if token[0] in namechars: # ordinary name
|
||||||
|
tokeneater(NAME, token, linenum, line, start, end)
|
||||||
|
elif token[0] in numchars: # ordinary number
|
||||||
|
tokeneater(NUMBER, token, linenum, line, start, end)
|
||||||
|
|
||||||
|
elif token in ('\'\'\'', '"""'): # triple-quoted
|
||||||
|
endprog = endprogs[token]
|
||||||
|
if endprog.search(line, pos) >= 0: # all on one line
|
||||||
|
pos = endprog.regs[0][1]
|
||||||
|
tokeneater(STRING, token, linenum, line, start, pos)
|
||||||
|
else:
|
||||||
|
contstr = line[start:] # multiple lines
|
||||||
|
break
|
||||||
|
elif token[0] in '\'"':
|
||||||
|
if token[-1] == '\n': # continued string
|
||||||
|
endprog, contstr = endprogs[token[0]], line[start:]
|
||||||
|
break
|
||||||
|
else: # ordinary string
|
||||||
|
tokeneater(STRING, token, linenum, line, start, end)
|
||||||
|
|
||||||
|
elif token[0] == '\n':
|
||||||
|
tokeneater(NEWLINE, token, linenum, line, start, end)
|
||||||
|
elif token[0] == '\\': # continued stmt
|
||||||
|
continued = 1
|
||||||
|
|
||||||
|
else:
|
||||||
|
if token[0] in '([{': parenlev = parenlev + 1
|
||||||
|
if token[0] in ')]}': parenlev = parenlev - 1
|
||||||
|
tokeneater(OP, token, linenum, line, start, end)
|
||||||
|
else:
|
||||||
|
tokeneater(ERRORTOKEN, line[pos], linenum, line, pos, pos + 1)
|
||||||
|
pos = pos + 1
|
||||||
|
|
||||||
|
for indent in indents[1:]: # pop remaining indent levels
|
||||||
|
tokeneater(DEDENT, '\t', linenum, line, 0, 0)
|
||||||
|
|
||||||
|
if __name__ == '__main__': # testing
|
||||||
|
import sys
|
||||||
|
file = open(sys.argv[-1])
|
||||||
|
tokenize(file.readline)
|
||||||
|
|
Loading…
Reference in New Issue