From 040d7ca498c68153abb0387a8f900176ea08e2bb Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 23 Aug 2002 01:36:01 +0000 Subject: [PATCH] Rewritten using the tokenize module, which gives us a real tokenizer rather than a number of approximating regular expressions. Alas, it is 3-4 times slower. Let that be a challenge for the tokenize module. --- Lib/pyclbr.py | 332 ++++++++++++++++++++++---------------------------- 1 file changed, 144 insertions(+), 188 deletions(-) diff --git a/Lib/pyclbr.py b/Lib/pyclbr.py index 9bc68c5ee73..1901a82be31 100644 --- a/Lib/pyclbr.py +++ b/Lib/pyclbr.py @@ -4,10 +4,11 @@ Parse enough of a Python file to recognize class and method definitions and to find out the superclasses of a class. The interface consists of a single function: - readmodule(module, path) + readmodule_ex(module [, path[, inpackage]]) module is the name of a Python module, path is an optional list of directories where the module is to be searched. If present, path is -prepended to the system search path sys.path. +prepended to the system search path sys.path. (inpackage is used +internally to search for a submodule of a package.) The return value is a dictionary. The keys of the dictionary are the names of the classes defined in the module (including classes that are defined via the from XXX import YYY construct). The values @@ -28,12 +29,10 @@ string giving the name of the super class. Since import statements are recognized and imported modules are scanned as well, this shouldn't happen often. +XXX describe the Function class. + BUGS -- Continuation lines are not dealt with at all, except inside strings. - Nested classes and functions can confuse it. -- Code that doesn't pass tabnanny or python -t will confuse it, unless - you set the module TABWIDTH vrbl (default 8) to the correct tab width - for the file. PACKAGE RELATED BUGS - If you have a package and a module inside that or another package @@ -52,69 +51,11 @@ PACKAGE RELATED BUGS import sys import imp -import re -import string +import tokenize # Python tokenizer +from token import NAME __all__ = ["readmodule"] -TABWIDTH = 8 - -_getnext = re.compile(r""" - (?P - \""" [^"\\]* (?: - (?: \\. | "(?!"") ) - [^"\\]* - )* - \""" - - | ''' [^'\\]* (?: - (?: \\. | '(?!'') ) - [^'\\]* - )* - ''' - - | " [^"\\\n]* (?: \\. [^"\\\n]*)* " - - | ' [^'\\\n]* (?: \\. [^'\\\n]*)* ' - ) - -| (?P - ^ - (?P [ \t]* ) - def [ \t]+ - (?P [a-zA-Z_] \w* ) - [ \t]* \( - ) - -| (?P - ^ - (?P [ \t]* ) - class [ \t]+ - (?P [a-zA-Z_] \w* ) - [ \t]* - (?P \( [^)\n]* \) )? - [ \t]* : - ) - -| (?P - ^ import [ \t]+ - (?P [^#;\n]+ ) - ) - -| (?P - ^ from [ \t]+ - (?P - [a-zA-Z_] \w* - (?: - [ \t]* \. [ \t]* [a-zA-Z_] \w* - )* - ) - [ \t]+ - import [ \t]+ - (?P [^#;\n]+ ) - ) -""", re.VERBOSE | re.DOTALL | re.MULTILINE).search - _modules = {} # cache of modules we've seen # each Python class is represented by an instance of this class @@ -140,7 +81,7 @@ class Function(Class): def _addmethod(self, name, lineno): assert 0, "Function._addmethod() shouldn't be called" -def readmodule(module, path=[], inpackage=0): +def readmodule(module, path=[], inpackage=False): '''Backwards compatible interface. Like readmodule_ex() but strips Function objects from the @@ -153,7 +94,7 @@ def readmodule(module, path=[], inpackage=0): res[key] = value return res -def readmodule_ex(module, path=[], inpackage=0): +def readmodule_ex(module, path=[], inpackage=False): '''Read a module file and return a dictionary of classes. Search for MODULE in PATH and sys.path, read and parse the @@ -168,7 +109,7 @@ def readmodule_ex(module, path=[], inpackage=0): package = module[:i].strip() submodule = module[i+1:].strip() parent = readmodule_ex(package, path, inpackage) - child = readmodule_ex(submodule, parent['__path__'], 1) + child = readmodule_ex(submodule, parent['__path__'], True) return child if module in _modules: @@ -204,129 +145,144 @@ def readmodule_ex(module, path=[], inpackage=0): _modules[module] = dict classstack = [] # stack of (class, indent) pairs - src = f.read() - f.close() - # To avoid having to stop the regexp at each newline, instead - # when we need a line number we simply count the number of - # newlines in the string since the last time we did this; i.e., - # lineno += src.count('\n', last_lineno_pos, here) - # last_lineno_pos = here - lineno, last_lineno_pos = 1, 0 - i = 0 - while 1: - m = _getnext(src, i) - if not m: - break - start, i = m.span() - - if m.start("Method") >= 0: - # found a method definition or function - thisindent = _indent(m.group("MethodIndent")) - meth_name = m.group("MethodName") - lineno += src.count('\n', last_lineno_pos, start) - last_lineno_pos = start - # close all classes indented at least as much - while classstack and \ - classstack[-1][1] >= thisindent: - del classstack[-1] - if classstack: - # it's a class method - cur_class = classstack[-1][0] - cur_class._addmethod(meth_name, lineno) - else: - # it's a function - f = Function(module, meth_name, - file, lineno) - dict[meth_name] = f - - elif m.start("String") >= 0: - pass - - elif m.start("Class") >= 0: - # we found a class definition - thisindent = _indent(m.group("ClassIndent")) - # close all classes indented at least as much - while classstack and \ - classstack[-1][1] >= thisindent: - del classstack[-1] - lineno += src.count('\n', last_lineno_pos, start) - last_lineno_pos = start - class_name = m.group("ClassName") - inherit = m.group("ClassSupers") - if inherit: - # the class inherits from other classes - inherit = inherit[1:-1].strip() - names = [] - for n in inherit.split(','): - n = n.strip() - if n in dict: - # we know this super class - n = dict[n] - else: - c = n.split('.') - if len(c) > 1: - # super class - # is of the - # form module.class: - # look in - # module for class - m = c[-2] - c = c[-1] - if m in _modules: - d = _modules[m] - if c in d: - n = d[c] - names.append(n) - inherit = names - # remember this class - cur_class = Class(module, class_name, inherit, - file, lineno) - dict[class_name] = cur_class - classstack.append((cur_class, thisindent)) - - elif m.start("Import") >= 0: - # import module - for n in m.group("ImportList").split(','): - n = n.strip() + g = tokenize.generate_tokens(f.readline) + try: + for tokentype, token, start, end, line in g: + if token == 'def': + lineno, thisindent = start + tokentype, meth_name, start, end, line = g.next() + if tokentype != NAME: + continue # Syntax error + # close all classes indented at least as much + while classstack and \ + classstack[-1][1] >= thisindent: + del classstack[-1] + if classstack: + # it's a class method + cur_class = classstack[-1][0] + cur_class._addmethod(meth_name, lineno) + else: + # it's a function + dict[meth_name] = Function(module, meth_name, file, lineno) + elif token == 'class': + lineno, thisindent = start + tokentype, class_name, start, end, line = g.next() + if tokentype != NAME: + continue # Syntax error + # close all classes indented at least as much + while classstack and \ + classstack[-1][1] >= thisindent: + del classstack[-1] + # parse what follows the class name + tokentype, token, start, end, line = g.next() + inherit = None + if token == '(': + names = [] # List of superclasses + # there's a list of superclasses + level = 1 + super = [] # Tokens making up current superclass + while True: + tokentype, token, start, end, line = g.next() + if token in (')', ',') and level == 1: + n = "".join(super) + if n in dict: + # we know this super class + n = dict[n] + else: + c = n.split('.') + if len(c) > 1: + # super class is of the form + # module.class: look in module for + # class + m = c[-2] + c = c[-1] + if m in _modules: + d = _modules[m] + if c in d: + n = d[c] + names.append(n) + if token == '(': + level += 1 + elif token == ')': + level -= 1 + if level == 0: + break + elif token == ',' and level == 1: + pass + else: + super.append(token) + inherit = names + cur_class = Class(module, class_name, inherit, file, lineno) + dict[class_name] = cur_class + classstack.append((cur_class, thisindent)) + elif token == 'import' and start[1] == 0: + modules = _getnamelist(g) + for mod, mod2 in modules: + readmodule_ex(mod, path, inpackage) + elif token == 'from' and start[1] == 0: + mod, token = _getname(g) + if not mod or token != "import": + continue + names = _getnamelist(g) try: # recursively read the imported module - d = readmodule_ex(n, path, inpackage) + d = readmodule_ex(mod, path, inpackage) except: - ##print 'module', n, 'not found' - pass - - elif m.start("ImportFrom") >= 0: - # from module import stuff - mod = m.group("ImportFromPath") - names = m.group("ImportFromList").split(',') - try: - # recursively read the imported module - d = readmodule_ex(mod, path, inpackage) - except: - ##print 'module', mod, 'not found' - continue - # add any classes that were defined in the - # imported module to our name space if they - # were mentioned in the list - for n in names: - n = n.strip() - if n in d: - dict[n] = d[n] - elif n == '*': - # only add a name if not - # already there (to mimic what - # Python does internally) - # also don't add names that - # start with _ - for n in d: - if n[0] != '_' and \ - not n in dict: - dict[n] = d[n] - else: - assert 0, "regexp _getnext found something unexpected" + continue + # add any classes that were defined in the imported module + # to our name space if they were mentioned in the list + for n, n2 in names: + if n in d: + dict[n2 or n] = d[n] + elif n == '*': + # only add a name if not already there (to mimic + # what Python does internally) also don't add + # names that start with _ + for n in d: + if n[0] != '_' and not n in dict: + dict[n] = d[n] + except StopIteration: + pass + f.close() return dict -def _indent(ws, _expandtabs=string.expandtabs): - return len(_expandtabs(ws, TABWIDTH)) +def _getnamelist(g): + # Helper to get a comma-separated list of dotted names plus 'as' + # clauses. Return a list of pairs (name, name2) where name2 is + # the 'as' name, or None if there is no 'as' clause. + names = [] + while True: + name, token = _getname(g) + if not name: + break + if token == 'as': + name2, token = _getname(g) + else: + name2 = None + names.append((name, name2)) + while token != "," and "\n" not in token: + tokentype, token, start, end, line = g.next() + if token != ",": + break + return names + +def _getname(g): + # Helper to get a dotted name, return a pair (name, token) where + # name is the dotted name, or None if there was no dotted name, + # and token is the next input token. + parts = [] + tokentype, token, start, end, line = g.next() + if tokentype != NAME and token != '*': + return (None, token) + parts.append(token) + while True: + tokentype, token, start, end, line = g.next() + if token != '.': + break + tokentype, token, start, end, line = g.next() + if tokentype != NAME: + break + parts.append(token) + return (".".join(parts), token)