From 1aec32363f25693e0c3ff81feddf620850b4955d Mon Sep 17 00:00:00 2001
From: Guido van Rossum
Date: Tue, 8 Apr 1997 14:24:39 +0000
Subject: [PATCH] Ka-Ping's much improved version of March 26, 1997:

# Ignore now accepts \f as whitespace. Operator now includes '**'.
# Ignore and Special now accept \n or \r\n at the end of a line.
# Imagnumber is new. Expfloat is corrected to reject '0e4'.
---
 Lib/tokenize.py | 168 +++++++++++++++++++++++++++---------------------
 1 file changed, 96 insertions(+), 72 deletions(-)

diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 06ed74606ec..7fe6fc14558 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -1,158 +1,182 @@
 """Tokenization help for Python programs.
 
-This module compiles a regular expression that recognizes Python
-tokens in individual lines of text. The regular expression handles
-everything except indentation, continuations, and triple-quoted
-strings. The function 'tokenize.tokenize()' takes care of these
-things for streams of text. It accepts a readline-like function which
-is called repeatedly to come up with the next input line (or "" for
-EOF), and a "token-eater" function which is called for each token
-found, passing its type, a string containing the token, the line
-number, the line, and the starting and ending positions of the token
-within the line. It is designed to match the working of the Python
-tokenizer exactly.
+This module exports a function called 'tokenize()' that breaks a stream of
+text into Python tokens. It accepts a readline-like method which is called
+repeatedly to get the next line of input (or "" for EOF) and a "token-eater"
+function which is called once for each token found. The latter function is
+passed the token type, a string containing the token, the starting and
+ending (row, column) coordinates of the token, and the original line. It is
+designed to match the working of the Python tokenizer exactly, except that
+it produces COMMENT tokens for comments and gives type OP for all operators.
 
-"""
+For compatibility with the older 'tokenize' module, this also compiles a
+regular expression into 'tokenprog' that matches Python tokens in individual
+lines of text, leaving the token in 'tokenprog.group(3)', but does not
+handle indentation, continuations, or multi-line strings."""
 
-__version__ = "Ka-Ping Yee, 4 March 1997, updated by GvR, 10 March 1997"
+__version__ = "Ka-Ping Yee, 26 March 1997"
 
 import string, regex
 from token import *
 
+COMMENT = N_TOKENS
+tok_name[COMMENT] = 'COMMENT'
+
+# Changes from 1.3:
+# Ignore now accepts \f as whitespace. Operator now includes '**'.
+# Ignore and Special now accept \n or \r\n at the end of a line.
+# Imagnumber is new. Expfloat is corrected to reject '0e4'.
+# Note: to get a quoted backslash in a regex, it must be enclosed in brackets.
+
 def group(*choices): return '\(' + string.join(choices, '\|') + '\)'
 
-Ignore = '[ \f\t]*\([\]\r?\n[ \t]*\)*\(#.*\)?'
+Whitespace = '[ \f\t]*'
+Comment = '\(#[^\r\n]*\)'
+Ignore = Whitespace + group('[\]\r?\n' + Whitespace)+'*' + Comment+'?'
 Name = '[a-zA-Z_][a-zA-Z0-9_]*'
 
-ImagZero = '0[jJ]'     # This is not caught by any of the following
 Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
 Octnumber = '0[0-7]*[lL]?'
-Decnumber = '[1-9][0-9]*[lLjJ]?'
-Intnumber = group(ImagZero, Hexnumber, Octnumber, Decnumber)
+Decnumber = '[1-9][0-9]*[lL]?'
+Intnumber = group(Hexnumber, Octnumber, Decnumber)
 Exponent = '[eE][-+]?[0-9]+'
 Pointfloat = group('[0-9]+\.[0-9]*', '\.[0-9]+') + group(Exponent) + '?'
-Expfloat = '[0-9]+' + Exponent
-Floatnumber = group(Pointfloat, Expfloat) + "[jJ]?"
-Number = group(Floatnumber, Intnumber)
+Expfloat = '[1-9][0-9]*' + Exponent
+Floatnumber = group(Pointfloat, Expfloat)
+Imagnumber = group('0[jJ]', '[1-9][0-9]*[jJ]', Floatnumber + '[jJ]')
+Number = group(Imagnumber, Floatnumber, Intnumber)
 
-Single = group('^\'', '[^\]\'')
+Single = group("^'", "[^\]'")
 Double = group('^"', '[^\]"')
-Tsingle = group('^\'\'\'', '[^\]\'\'\'')
-Tdouble = group('^"""', '[^\]"""')
-Triple = group('\'\'\'', '"""')
-String = group('\'' + group('[\].', '[^\'\]')+ '*' + group('\'', '[\]\n'),
-               '"' + group('[\].', '[^"\]') + '*' + group('"', '[\]\n'))
+Single3 = group("^'''", "[^\]'''")
+Double3 = group('^"""', '[^\]"""')
+Triple = group("'''", '"""')
+String = group("'" + group('[\].', "[^\n'\]") + "*'",
+               '"' + group('[\].', '[^\n"\]') + '*"')
 Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
                  '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
 Bracket = '[][(){}]'
-Special = group('[\]?\r?\n', '[:;.,`\f]')
+Special = group('\r?\n', '[:;.,`]')
 Funny = group(Operator, Bracket, Special)
 
-PlainToken = group(Name, Number, Triple, String, Funny)
+PlainToken = group(Name, Number, String, Funny)
 Token = Ignore + PlainToken
 
+ContStr = group("'" + group('[\].', "[^\n'\]")+'*' + group("'", '[\]\r?\n'),
+                '"' + group('[\].', '[^\n"\]')+'*' + group('"', '[\]\r?\n'))
+PseudoExtras = group('[\]\r?\n', Comment, Triple)
+PseudoToken = Whitespace + group(PseudoExtras, Name, Number, ContStr, Funny)
+
 try:
-    save_syntax = regex.set_syntax(0)       # use default syntax
+    saved_syntax = regex.set_syntax(0)      # use default syntax
     tokenprog = regex.compile(Token)
+    pseudoprog = regex.compile(PseudoToken)
     endprogs = { '\'': regex.compile(Single), '"': regex.compile(Double),
-        '\'\'\'': regex.compile(Tsingle), '"""': regex.compile(Tdouble) }
+        '\'\'\'': regex.compile(Single3), '"""': regex.compile(Double3) }
 finally:
-    regex.set_syntax(save_syntax)           # restore original syntax
+    regex.set_syntax(saved_syntax)          # restore original syntax
 
 tabsize = 8
 TokenError = 'TokenError'
 
-def printtoken(type, string, linenum, line, start, end): # for testing
-    print `linenum` + ':', tok_name[type], repr(string)
+def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
+    print "%d,%d-%d,%d:\t%s\t%s" % \
+        (srow, scol, erow, ecol, tok_name[type], repr(token))
 
-def tokenize(readline, tokeneater = printtoken):
-    linenum = parenlev = continued = 0
+def tokenize(readline, tokeneater=printtoken):
+    lnum = parenlev = continued = 0
    namechars, numchars = string.letters + '_', string.digits
    contstr = ''
    indents = [0]
+
    while 1:                                   # loop over lines in stream
        line = readline()
-        linenum = linenum + 1
-        if line[-2:] == '\r\n': line = line[:-2] + '\n'
+        lnum = lnum + 1
        pos, max = 0, len(line)
+
        if contstr:                            # continued string
            if not line: raise TokenError, "EOF within multi-line string"
-            if contstr[-2:] == '\\\n': contstr = contstr[:-2] + '\n'
            if endprog.search(line) >= 0:
                pos = end = endprog.regs[0][1]
-                tokeneater(STRING, contstr + line[:end], linenum, line, 0, 0)
+                tokeneater(STRING, contstr + line[:end],
+                           strstart, (lnum, end), line)
                contstr = ''
            else:
                contstr = contstr + line
                continue
 
-        elif parenlev == 0 and not continued:  # this is a new statement
+        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
-            while 1:                           # measure leading whitespace
+            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
-                elif line[pos] == '\t': column = (column/tabsize + 1) * tabsize
+                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
-            if line[pos] in '#\n': continue    # skip comments or blank lines
+            if pos == max: break
+
+            if line[pos] in '#\r\n':           # skip comments or blank lines
+                tokeneater((NEWLINE, COMMENT)[line[pos] == '#'], line[pos:],
+                           (lnum, pos), (lnum, len(line)), line)
+                continue
 
            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
-                tokeneater(INDENT, '\t', linenum, line, 0, 0)
+                tokeneater(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
-                tokeneater(DEDENT, '\t', linenum, line, 0, 0)
+                tokeneater(DEDENT, line[:pos], (lnum, 0), (lnum, pos), line)
 
        else:                                  # continued statement
            if not line: raise TokenError, "EOF within multi-line statement"
            continued = 0
 
        while pos < max:
-            if tokenprog.match(line, pos) > 0:             # scan for tokens
-                start, end = tokenprog.regs[3]
-                token = line[start:end]
+            if pseudoprog.match(line, pos) > 0:            # scan for tokens
+                start, end = pseudoprog.regs[1]
+                spos, epos = (lnum, start), (lnum, end)
+                token, initial = line[start:end], line[start]
                pos = end
 
-                if token[0] in namechars:           # ordinary name
-                    tokeneater(NAME, token, linenum, line, start, end)
-                elif token[0] in numchars:          # ordinary number
-                    tokeneater(NUMBER, token, linenum, line, start, end)
-
+                if initial in namechars:            # ordinary name
+                    tokeneater(NAME, token, spos, epos, line)
+                elif initial in numchars:           # ordinary number
+                    tokeneater(NUMBER, token, spos, epos, line)
+                elif initial in '\r\n':
+                    tokeneater(NEWLINE, token, spos, epos, line)
+                elif initial == '#':
+                    tokeneater(COMMENT, token, spos, epos, line)
+                elif initial == '\\':               # continued stmt
+                    continued = 1
                elif token in ('\'\'\'', '"""'):    # triple-quoted
                    endprog = endprogs[token]
                    if endprog.search(line, pos) >= 0:  # all on one line
                        pos = endprog.regs[0][1]
-                        token = line[start:pos]
-                        tokeneater(STRING, token, linenum, line, start, pos)
+                        token = line[start:pos]
+                        tokeneater(STRING, token, spos, (lnum, pos), line)
                    else:
-                        contstr = line[start:]      # multiple lines
+                        strstart = (lnum, start)    # multiple lines
+                        contstr = line[start:]
                        break
-                elif token[0] in '\'"':
+                elif initial in '\'"':
                    if token[-1] == '\n':           # continued string
-                        endprog, contstr = endprogs[token[0]], line[start:]
+                        strstart = (lnum, start)
+                        endprog, contstr = endprogs[initial], line[start:]
                        break
                    else:                           # ordinary string
-                        tokeneater(STRING, token, linenum, line, start, end)
-
-                elif token[0] == '\n':
-                    tokeneater(NEWLINE, token, linenum, line, start, end)
-                elif token[0] == '\\':              # continued stmt
-                    continued = 1
-
+                        tokeneater(STRING, token, spos, epos, line)
                else:
-                    if token[0] in '([{': parenlev = parenlev + 1
-                    if token[0] in ')]}': parenlev = parenlev - 1
-                    tokeneater(OP, token, linenum, line, start, end)
+                    if initial in '([{': parenlev = parenlev + 1
+                    elif initial in ')]}': parenlev = parenlev - 1
+                    tokeneater(OP, token, spos, epos, line)
            else:
-                tokeneater(ERRORTOKEN, line[pos], linenum, line, pos, pos + 1)
+                tokeneater(ERRORTOKEN, line[pos], spos, (lnum, pos+1), line)
                pos = pos + 1
 
    for indent in indents[1:]:                 # pop remaining indent levels
-        tokeneater(DEDENT, '\t', linenum, line, 0, 0)
+        tokeneater(DEDENT, '', (lnum, 0), (lnum, 0), '')
 
 if __name__ == '__main__':                     # testing
    import sys
-    file = open(sys.argv[-1])
-    tokenize(file.readline)
+    tokenize(open(sys.argv[-1]).readline)
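
A quick usage sketch of the new interface (not part of the patch): the token-eater
now takes five arguments, receiving (row, column) tuples for the start and end of
each token instead of a line number and character offsets. The callback name and
input file below are illustrative, and the snippet targets the Python interpreter
of this era, since the module above still relies on the old 'regex' package.

    # Hypothetical driver for the patched tokenize.py; 'show_token' and
    # 'example.py' are made-up names, not part of the patch.
    import tokenize

    def show_token(type, token, (srow, scol), (erow, ecol), line):
        # Mirrors printtoken(): report each token with its coordinates.
        print tokenize.tok_name[type], (srow, scol), (erow, ecol), repr(token)

    f = open('example.py')              # any Python source file
    tokenize.tokenize(f.readline, show_token)
    f.close()

Because the module defines COMMENT and gives every operator the type OP, a
token-eater written this way also sees comments and newlines explicitly, which
the old line-number-based interface did not report.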