Rewritten using the tokenize module, which gives us a real tokenizer

rather than a number of approximating regular expressions. Alas, it is 3-4 times slower. Let that be a challenge for the tokenize module.
2002-08-23 01:36:01 +00:00 · 2002-08-23 01:36:01 +00:00 · 040d7ca498
parent fd372aa8e9
commit 040d7ca498
1 changed files with 144 additions and 188 deletions
--- a/Lib/pyclbr.py
+++ b/Lib/pyclbr.py
@ -4,10 +4,11 @@ Parse enough of a Python file to recognize class and method
 definitions and to find out the superclasses of a class.

 The interface consists of a single function:
-        readmodule(module, path)
+        readmodule_ex(module [, path[, inpackage]])
 module is the name of a Python module, path is an optional list of
 directories where the module is to be searched.  If present, path is
-prepended to the system search path sys.path.
+prepended to the system search path sys.path.  (inpackage is used
+internally to search for a submodule of a package.)
 The return value is a dictionary.  The keys of the dictionary are
 the names of the classes defined in the module (including classes
 that are defined via the from XXX import YYY construct).  The values
@ -28,12 +29,10 @@ string giving the name of the super class.  Since import statements
 are recognized and imported modules are scanned as well, this
 shouldn't happen often.

+XXX describe the Function class.
+
 BUGS
- Continuation lines are not dealt with at all, except inside strings.
 - Nested classes and functions can confuse it.
- Code that doesn't pass tabnanny or python -t will confuse it, unless
-  you set the module TABWIDTH vrbl (default 8) to the correct tab width
-  for the file.

 PACKAGE RELATED BUGS
 - If you have a package and a module inside that or another package
@ -52,69 +51,11 @@ PACKAGE RELATED BUGS

 import sys
 import imp
-import re
-import string
+import tokenize # Python tokenizer
+from token import NAME

 __all__ = ["readmodule"]

-TABWIDTH = 8
-
-_getnext = re.compile(r"""
-    (?P<String>
-       \""" [^"\\]* (?:
-                        (?: \\. | "(?!"") )
-                        [^"\\]*
-                    )*
-       \"""
-
-    |   ''' [^'\\]* (?:
-                        (?: \\. | '(?!'') )
-                        [^'\\]*
-                    )*
-        '''
-
-    |   " [^"\\\n]* (?: \\. [^"\\\n]*)* "
-
-    |   ' [^'\\\n]* (?: \\. [^'\\\n]*)* '
-    )
-
-|   (?P<Method>
-        ^
-        (?P<MethodIndent> [ \t]* )
-        def [ \t]+
-        (?P<MethodName> [a-zA-Z_] \w* )
-        [ \t]* \(
-    )
-
-|   (?P<Class>
-        ^
-        (?P<ClassIndent> [ \t]* )
-        class [ \t]+
-        (?P<ClassName> [a-zA-Z_] \w* )
-        [ \t]*
-        (?P<ClassSupers> \( [^)\n]* \) )?
-        [ \t]* :
-    )
-
-|   (?P<Import>
-        ^ import [ \t]+
-        (?P<ImportList> [^#;\n]+ )
-    )
-
-|   (?P<ImportFrom>
-        ^ from [ \t]+
-        (?P<ImportFromPath>
-            [a-zA-Z_] \w*
-            (?:
-                [ \t]* \. [ \t]* [a-zA-Z_] \w*
-            )*
-        )
-        [ \t]+
-        import [ \t]+
-        (?P<ImportFromList> [^#;\n]+ )
-    )
-""", re.VERBOSE | re.DOTALL | re.MULTILINE).search
-
 _modules = {}                           # cache of modules we've seen

 # each Python class is represented by an instance of this class
@ -140,7 +81,7 @@ class Function(Class):
    def _addmethod(self, name, lineno):
        assert 0, "Function._addmethod() shouldn't be called"

-def readmodule(module, path=[], inpackage=0):
+def readmodule(module, path=[], inpackage=False):
    '''Backwards compatible interface.

    Like readmodule_ex() but strips Function objects from the
@ -153,7 +94,7 @@ def readmodule(module, path=[], inpackage=0):
            res[key] = value
    return res

-def readmodule_ex(module, path=[], inpackage=0):
+def readmodule_ex(module, path=[], inpackage=False):
    '''Read a module file and return a dictionary of classes.

    Search for MODULE in PATH and sys.path, read and parse the
@ -168,7 +109,7 @@ def readmodule_ex(module, path=[], inpackage=0):
        package = module[:i].strip()
        submodule = module[i+1:].strip()
        parent = readmodule_ex(package, path, inpackage)
-        child = readmodule_ex(submodule, parent['__path__'], 1)
+        child = readmodule_ex(submodule, parent['__path__'], True)
        return child

    if module in _modules:
@ -204,129 +145,144 @@ def readmodule_ex(module, path=[], inpackage=0):

    _modules[module] = dict
    classstack = [] # stack of (class, indent) pairs
-    src = f.read()
-    f.close()

-    # To avoid having to stop the regexp at each newline, instead
-    # when we need a line number we simply count the number of
-    # newlines in the string since the last time we did this; i.e.,
-    #    lineno += src.count('\n', last_lineno_pos, here)
-    #    last_lineno_pos = here
-    lineno, last_lineno_pos = 1, 0
-    i = 0
-    while 1:
-        m = _getnext(src, i)
-        if not m:
-            break
-        start, i = m.span()
-
-        if m.start("Method") >= 0:
-            # found a method definition or function
-            thisindent = _indent(m.group("MethodIndent"))
-            meth_name = m.group("MethodName")
-            lineno += src.count('\n', last_lineno_pos, start)
-            last_lineno_pos = start
-            # close all classes indented at least as much
-            while classstack and \
-                  classstack[-1][1] >= thisindent:
-                del classstack[-1]
-            if classstack:
-                # it's a class method
-                cur_class = classstack[-1][0]
-                cur_class._addmethod(meth_name, lineno)
-            else:
-                # it's a function
-                f = Function(module, meth_name,
-                             file, lineno)
-                dict[meth_name] = f
-
-        elif m.start("String") >= 0:
-            pass
-
-        elif m.start("Class") >= 0:
-            # we found a class definition
-            thisindent = _indent(m.group("ClassIndent"))
-            # close all classes indented at least as much
-            while classstack and \
-                  classstack[-1][1] >= thisindent:
-                del classstack[-1]
-            lineno += src.count('\n', last_lineno_pos, start)
-            last_lineno_pos = start
-            class_name = m.group("ClassName")
-            inherit = m.group("ClassSupers")
-            if inherit:
-                # the class inherits from other classes
-                inherit = inherit[1:-1].strip()
-                names = []
-                for n in inherit.split(','):
-                    n = n.strip()
-                    if n in dict:
-                        # we know this super class
-                        n = dict[n]
-                    else:
-                        c = n.split('.')
-                        if len(c) > 1:
-                            # super class
-                            # is of the
-                            # form module.class:
-                            # look in
-                            # module for class
-                            m = c[-2]
-                            c = c[-1]
-                            if m in _modules:
-                                d = _modules[m]
-                                if c in d:
-                                    n = d[c]
-                    names.append(n)
-                inherit = names
-            # remember this class
-            cur_class = Class(module, class_name, inherit,
-                              file, lineno)
-            dict[class_name] = cur_class
-            classstack.append((cur_class, thisindent))
-
-        elif m.start("Import") >= 0:
-            # import module
-            for n in m.group("ImportList").split(','):
-                n = n.strip()
+    g = tokenize.generate_tokens(f.readline)
+    try:
+        for tokentype, token, start, end, line in g:
+            if token == 'def':
+                lineno, thisindent = start
+                tokentype, meth_name, start, end, line = g.next()
+                if tokentype != NAME:
+                    continue # Syntax error
+                # close all classes indented at least as much
+                while classstack and \
+                      classstack[-1][1] >= thisindent:
+                    del classstack[-1]
+                if classstack:
+                    # it's a class method
+                    cur_class = classstack[-1][0]
+                    cur_class._addmethod(meth_name, lineno)
+                else:
+                    # it's a function
+                    dict[meth_name] = Function(module, meth_name, file, lineno)
+            elif token == 'class':
+                lineno, thisindent = start
+                tokentype, class_name, start, end, line = g.next()
+                if tokentype != NAME:
+                    continue # Syntax error
+                # close all classes indented at least as much
+                while classstack and \
+                      classstack[-1][1] >= thisindent:
+                    del classstack[-1]
+                # parse what follows the class name
+                tokentype, token, start, end, line = g.next()
+                inherit = None
+                if token == '(':
+                    names = [] # List of superclasses
+                    # there's a list of superclasses
+                    level = 1
+                    super = [] # Tokens making up current superclass
+                    while True:
+                        tokentype, token, start, end, line = g.next()
+                        if token in (')', ',') and level == 1:
+                            n = "".join(super)
+                            if n in dict:
+                                # we know this super class
+                                n = dict[n]
+                            else:
+                                c = n.split('.')
+                                if len(c) > 1:
+                                    # super class is of the form
+                                    # module.class: look in module for
+                                    # class
+                                    m = c[-2]
+                                    c = c[-1]
+                                    if m in _modules:
+                                        d = _modules[m]
+                                        if c in d:
+                                            n = d[c]
+                            names.append(n)
+                        if token == '(':
+                            level += 1
+                        elif token == ')':
+                            level -= 1
+                            if level == 0:
+                                break
+                        elif token == ',' and level == 1:
+                            pass
+                        else:
+                            super.append(token)
+                    inherit = names
+                cur_class = Class(module, class_name, inherit, file, lineno)
+                dict[class_name] = cur_class
+                classstack.append((cur_class, thisindent))
+            elif token == 'import' and start[1] == 0:
+                modules = _getnamelist(g)
+                for mod, mod2 in modules:
+                    readmodule_ex(mod, path, inpackage)
+            elif token == 'from' and start[1] == 0:
+                mod, token = _getname(g)
+                if not mod or token != "import":
+                    continue
+                names = _getnamelist(g)
                try:
                    # recursively read the imported module
-                    d = readmodule_ex(n, path, inpackage)
+                    d = readmodule_ex(mod, path, inpackage)
                except:
-                    ##print 'module', n, 'not found'
-                    pass
-
-        elif m.start("ImportFrom") >= 0:
-            # from module import stuff
-            mod = m.group("ImportFromPath")
-            names = m.group("ImportFromList").split(',')
-            try:
-                # recursively read the imported module
-                d = readmodule_ex(mod, path, inpackage)
-            except:
-                ##print 'module', mod, 'not found'
-                continue
-            # add any classes that were defined in the
-            # imported module to our name space if they
-            # were mentioned in the list
-            for n in names:
-                n = n.strip()
-                if n in d:
-                    dict[n] = d[n]
-                elif n == '*':
-                    # only add a name if not
-                    # already there (to mimic what
-                    # Python does internally)
-                    # also don't add names that
-                    # start with _
-                    for n in d:
-                        if n[0] != '_' and \
-                           not n in dict:
-                            dict[n] = d[n]
-        else:
-            assert 0, "regexp _getnext found something unexpected"
+                    continue
+                # add any classes that were defined in the imported module
+                # to our name space if they were mentioned in the list
+                for n, n2 in names:
+                    if n in d:
+                        dict[n2 or n] = d[n]
+                    elif n == '*':
+                        # only add a name if not already there (to mimic
+                        # what Python does internally) also don't add
+                        # names that start with _
+                        for n in d:
+                            if n[0] != '_' and not n in dict:
+                                dict[n] = d[n]
+    except StopIteration:
+        pass

+    f.close()
    return dict

-def _indent(ws, _expandtabs=string.expandtabs):
-    return len(_expandtabs(ws, TABWIDTH))
+def _getnamelist(g):
+    # Helper to get a comma-separated list of dotted names plus 'as'
+    # clauses.  Return a list of pairs (name, name2) where name2 is
+    # the 'as' name, or None if there is no 'as' clause.
+    names = []
+    while True:
+        name, token = _getname(g)
+        if not name:
+            break
+        if token == 'as':
+            name2, token = _getname(g)
+        else:
+            name2 = None
+        names.append((name, name2))
+        while token != "," and "\n" not in token:
+            tokentype, token, start, end, line = g.next()
+        if token != ",":
+            break
+    return names
+
+def _getname(g):
+    # Helper to get a dotted name, return a pair (name, token) where
+    # name is the dotted name, or None if there was no dotted name,
+    # and token is the next input token.
+    parts = []
+    tokentype, token, start, end, line = g.next()
+    if tokentype != NAME and token != '*':
+        return (None, token)
+    parts.append(token)
+    while True:
+        tokentype, token, start, end, line = g.next()
+        if token != '.':
+            break
+        tokentype, token, start, end, line = g.next()
+        if tokentype != NAME:
+            break
+        parts.append(token)
+    return (".".join(parts), token)