# This module compiles a regular expression that recognizes Python tokens.
# It is designed to match the behavior of the Python tokenizer exactly.
# It takes care of everything except indentation;
# note that unescaped newlines are tokens, too.
# After a match, tokenprog.regs[3] gives the location of the token itself,
# without any leading whitespace, line continuation or comment.
# It also defines various subexpressions, but doesn't compile them.
# See the function test() below for an example of how to use it.

import regex
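
# Note: to match a literal backslash, the regexp needs two backslashes,
# and each of those must itself be escaped in the Python string literal,
# so a quoted backslash ends up quadrupled in the source.
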
# Text skipped before a token: optional blanks/tabs, any number of
# backslash-newline continuations (each followed by optional blanks/tabs),
# and an optional comment running to the end of the line.
# This pattern contributes groups 1 and 2 to the final Token expression.
Ignore = '[ \t]*\(\\\\\n[ \t]*\)*\(#.*\)?'

# Identifiers: a letter or underscore followed by letters, digits, underscores.
Name = '[a-zA-Z_][a-zA-Z0-9_]*'

# Integer literals, each with an optional long suffix (l or L).
# Note that the hex pattern accepts zero digits after the 0x prefix.
Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lL]?'
Intnumber = Hexnumber + '\|' + Octnumber + '\|' + Decnumber

# Floating point literals: either a number containing a decimal point with
# an optional exponent, or plain digits with a mandatory exponent.
Exponent = '[eE][-+]?[0-9]+'
Pointfloat = '\([0-9]+\.[0-9]*\|\.[0-9]+\)\(' + Exponent + '\)?'
Expfloat = '[0-9]+' + Exponent
Floatnumber = Pointfloat + '\|' + Expfloat

# The float alternatives are listed before the integer ones.
Number = Floatnumber + '\|' + Intnumber

# String literals in single or double quotes.  Inside the quotes, either a
# backslash followed by any character, or any character that is not a
# backslash, a newline or the closing quote.
String = '\'\(\\\\.\|[^\\\n\']\)*\'' + '\|' + '"\(\\\\.\|[^\\\n"]\)*"'
# Note: this module *recognizes* double quotes, but for backward
# compatibility, it doesn't *use* them!

# Operators; the two-character operators are listed before the
# single-character ones they start with (e.g. '<=' before '<').
Operator = '~\|\+\|-\|\*\|/\|%\|\^\|&\||\|<<\|>>\|==\|<=\|<>\|!=\|>=\|=\|<\|>'
# Any one of the characters ] [ ( ) { }
Bracket = '[][(){}]'
# Colon, semicolon, dot, comma, backquote, or a newline.
Special = '[:;.,`\n]'
Funny = Operator + '\|' + Bracket + '\|' + Special

# Any single token, without the ignorable text in front of it.
PlainToken = Name + '\|' + Number + '\|' + String + '\|' + Funny

# The complete pattern: ignorable text followed by one token.  The group
# opened here is group 3, because Ignore already contains groups 1 and 2;
# do not add groups to Ignore without adjusting users of regs[3].
Token = Ignore + '\(' + PlainToken + '\)'

# Compile Token into tokenprog using the default regex syntax, then put
# back whatever syntax was selected before, even if compilation fails.
# The syntax is switched *before* entering the try block so that the
# finally clause never refers to an unbound save_syntax.
save_syntax = regex.set_syntax(0) # Use default syntax
try:
    tokenprog = regex.compile(Token)
finally:
    # A saved value of 0 means the default syntax was already in effect,
    # so there is nothing to restore.
    if save_syntax != 0:
        dummy = regex.set_syntax(save_syntax) # Restore original syntax


def test(file):
|
|
|
|
f = open(file, 'r')
|
|
|
|
while 1:
|
|
|
|
line = f.readline()
|
|
|
|
if not line: break
|
|
|
|
i, n = 0, len(line)
|
|
|
|
while i < n:
|
|
|
|
j = tokenprog.match(line, i)
|
|
|
|
if j < 0:
|
|
|
|
print 'No token at', `line[i:i+20]` + '...'
|
|
|
|
i = i+1
|
|
|
|
else:
|
|
|
|
i = i+j
|
|
|
|
a, b = tokenprog.regs[3]
|
|
|
|
if a < b:
|
|
|
|
print 'Token:', `line[a:b]`
|