diff --git a/Lib/tokenize.py b/Lib/tokenize.py new file mode 100644 index 00000000000..6b3d991a960 --- /dev/null +++ b/Lib/tokenize.py @@ -0,0 +1,60 @@ +# This module compiles a regular expression that recognizes Python tokens. +# It is designed to match the working of the Python tokenizer exactly. +# It takes care of everything except indentation; +# note that un-escaped newlines are tokens, too. +# tokenprog.regs[3] gives the location of the token without whitespace +# It also defines various subexpressions, but doesn't compile them. +# See the function test() below for an example of how to use. + +import regex + +# Note: to get a quoted backslash in a regexp, it must be quadrupled. + +Ignore = '[ \t]*\(\\\\\n[ \t]*\)*\(#.*\)?' + +Name = '[a-zA-Z_][a-zA-Z0-9_]*' + +Hexnumber = '0[xX][0-9a-fA-F]*[lL]?' +Octnumber = '0[0-7]*[lL]?' +Decnumber = '[1-9][0-9]*[lL]?' +Intnumber = Hexnumber + '\|' + Octnumber + '\|' + Decnumber +Exponent = '[eE][-+]?[0-9]+' +Pointfloat = '\([0-9]+\.[0-9]*\|\.[0-9]+\)\(' + Exponent + '\)?' +Expfloat = '[0-9]+' + Exponent +Floatnumber = Pointfloat + '\|' + Expfloat +Number = Intnumber + '\|' + Floatnumber + +String = '\'\(\\\\.\|[^\\\n\']\)*\'' + +Operator = '~\|\+\|-\|\*\|/\|%\|\^\|&\||\|<<\|>>\|==\|<=\|<>\|!=\|>=\|=\|<\|>' +Bracket = '[][(){}]' +Special = '[:;.,`\n]' +Funny = Operator + '\|' + Bracket + '\|' + Special + +PlainToken = Name + '\|' + Number + '\|' + String + '\|' + Funny + +Token = Ignore + '\(' + PlainToken + '\)' + +try: + save_syntax = regex.set_syntax(0) # Use default syntax + tokenprog = regex.compile(Token) +finally: + dummy = regex.set_syntax(save_syntax) # Restore original syntax + + +def test(file): + f = open(file, 'r') + while 1: + line = f.readline() + if not line: break + i, n = 0, len(line) + while i < n: + j = tokenprog.match(line, i) + if j < 0: + print 'No token at', `line[i:i+20]` + '...' + i = i+1 + else: + i = i+j + a, b = tokenprog.regs[3] + if a < b: + print 'Token:', `line[a:b]`