Rewritten using the tokenize module, which gives us a real tokenizer
rather than a number of approximating regular expressions.
Alas, it is 3-4 times slower.  Let that be a challenge for the
tokenize module.
This commit is contained in:
Guido van Rossum 2002-08-23 01:36:01 +00:00
parent fd372aa8e9
commit 040d7ca498
1 changed files with 144 additions and 188 deletions

View File

@ -4,10 +4,11 @@ Parse enough of a Python file to recognize class and method
definitions and to find out the superclasses of a class.
The interface consists of a single function:
readmodule(module, path)
readmodule_ex(module [, path[, inpackage]])
module is the name of a Python module, path is an optional list of
directories where the module is to be searched. If present, path is
prepended to the system search path sys.path.
prepended to the system search path sys.path. (inpackage is used
internally to search for a submodule of a package.)
The return value is a dictionary. The keys of the dictionary are
the names of the classes defined in the module (including classes
that are defined via the from XXX import YYY construct). The values
@ -28,12 +29,10 @@ string giving the name of the super class. Since import statements
are recognized and imported modules are scanned as well, this
shouldn't happen often.
XXX describe the Function class.
BUGS
- Continuation lines are not dealt with at all, except inside strings.
- Nested classes and functions can confuse it.
- Code that doesn't pass tabnanny or python -t will confuse it, unless
you set the module-level TABWIDTH variable (default 8) to the correct
tab width for the file.
PACKAGE RELATED BUGS
- If you have a package and a module inside that or another package
@ -52,69 +51,11 @@ PACKAGE RELATED BUGS
import sys
import imp
import re
import string
import tokenize # Python tokenizer
from token import NAME
__all__ = ["readmodule"]
TABWIDTH = 8
_getnext = re.compile(r"""
(?P<String>
\""" [^"\\]* (?:
(?: \\. | "(?!"") )
[^"\\]*
)*
\"""
| ''' [^'\\]* (?:
(?: \\. | '(?!'') )
[^'\\]*
)*
'''
| " [^"\\\n]* (?: \\. [^"\\\n]*)* "
| ' [^'\\\n]* (?: \\. [^'\\\n]*)* '
)
| (?P<Method>
^
(?P<MethodIndent> [ \t]* )
def [ \t]+
(?P<MethodName> [a-zA-Z_] \w* )
[ \t]* \(
)
| (?P<Class>
^
(?P<ClassIndent> [ \t]* )
class [ \t]+
(?P<ClassName> [a-zA-Z_] \w* )
[ \t]*
(?P<ClassSupers> \( [^)\n]* \) )?
[ \t]* :
)
| (?P<Import>
^ import [ \t]+
(?P<ImportList> [^#;\n]+ )
)
| (?P<ImportFrom>
^ from [ \t]+
(?P<ImportFromPath>
[a-zA-Z_] \w*
(?:
[ \t]* \. [ \t]* [a-zA-Z_] \w*
)*
)
[ \t]+
import [ \t]+
(?P<ImportFromList> [^#;\n]+ )
)
""", re.VERBOSE | re.DOTALL | re.MULTILINE).search
_modules = {} # cache of modules we've seen
# each Python class is represented by an instance of this class
@ -140,7 +81,7 @@ class Function(Class):
# Stub that must never be invoked: the unconditional `assert 0` raises
# AssertionError with the message below whenever this is called
# (note that assert statements are skipped when Python runs with -O).
def _addmethod(self, name, lineno):
assert 0, "Function._addmethod() shouldn't be called"
def readmodule(module, path=[], inpackage=0):
def readmodule(module, path=[], inpackage=False):
'''Backwards compatible interface.
Like readmodule_ex() but strips Function objects from the
@ -153,7 +94,7 @@ def readmodule(module, path=[], inpackage=0):
res[key] = value
return res
def readmodule_ex(module, path=[], inpackage=0):
def readmodule_ex(module, path=[], inpackage=False):
'''Read a module file and return a dictionary of classes.
Search for MODULE in PATH and sys.path, read and parse the
@ -168,7 +109,7 @@ def readmodule_ex(module, path=[], inpackage=0):
package = module[:i].strip()
submodule = module[i+1:].strip()
parent = readmodule_ex(package, path, inpackage)
child = readmodule_ex(submodule, parent['__path__'], 1)
child = readmodule_ex(submodule, parent['__path__'], True)
return child
if module in _modules:
@ -204,129 +145,144 @@ def readmodule_ex(module, path=[], inpackage=0):
_modules[module] = dict
classstack = [] # stack of (class, indent) pairs
src = f.read()
f.close()
# To avoid having to stop the regexp at each newline, instead
# when we need a line number we simply count the number of
# newlines in the string since the last time we did this; i.e.,
# lineno += src.count('\n', last_lineno_pos, here)
# last_lineno_pos = here
lineno, last_lineno_pos = 1, 0
i = 0
while 1:
m = _getnext(src, i)
if not m:
break
start, i = m.span()
if m.start("Method") >= 0:
# found a method definition or function
thisindent = _indent(m.group("MethodIndent"))
meth_name = m.group("MethodName")
lineno += src.count('\n', last_lineno_pos, start)
last_lineno_pos = start
# close all classes indented at least as much
while classstack and \
classstack[-1][1] >= thisindent:
del classstack[-1]
if classstack:
# it's a class method
cur_class = classstack[-1][0]
cur_class._addmethod(meth_name, lineno)
else:
# it's a function
f = Function(module, meth_name,
file, lineno)
dict[meth_name] = f
elif m.start("String") >= 0:
pass
elif m.start("Class") >= 0:
# we found a class definition
thisindent = _indent(m.group("ClassIndent"))
# close all classes indented at least as much
while classstack and \
classstack[-1][1] >= thisindent:
del classstack[-1]
lineno += src.count('\n', last_lineno_pos, start)
last_lineno_pos = start
class_name = m.group("ClassName")
inherit = m.group("ClassSupers")
if inherit:
# the class inherits from other classes
inherit = inherit[1:-1].strip()
names = []
for n in inherit.split(','):
n = n.strip()
if n in dict:
# we know this super class
n = dict[n]
else:
c = n.split('.')
if len(c) > 1:
# super class
# is of the
# form module.class:
# look in
# module for class
m = c[-2]
c = c[-1]
if m in _modules:
d = _modules[m]
if c in d:
n = d[c]
names.append(n)
inherit = names
# remember this class
cur_class = Class(module, class_name, inherit,
file, lineno)
dict[class_name] = cur_class
classstack.append((cur_class, thisindent))
elif m.start("Import") >= 0:
# import module
for n in m.group("ImportList").split(','):
n = n.strip()
g = tokenize.generate_tokens(f.readline)
try:
for tokentype, token, start, end, line in g:
if token == 'def':
lineno, thisindent = start
tokentype, meth_name, start, end, line = g.next()
if tokentype != NAME:
continue # Syntax error
# close all classes indented at least as much
while classstack and \
classstack[-1][1] >= thisindent:
del classstack[-1]
if classstack:
# it's a class method
cur_class = classstack[-1][0]
cur_class._addmethod(meth_name, lineno)
else:
# it's a function
dict[meth_name] = Function(module, meth_name, file, lineno)
elif token == 'class':
lineno, thisindent = start
tokentype, class_name, start, end, line = g.next()
if tokentype != NAME:
continue # Syntax error
# close all classes indented at least as much
while classstack and \
classstack[-1][1] >= thisindent:
del classstack[-1]
# parse what follows the class name
tokentype, token, start, end, line = g.next()
inherit = None
if token == '(':
names = [] # List of superclasses
# there's a list of superclasses
level = 1
super = [] # Tokens making up current superclass
while True:
tokentype, token, start, end, line = g.next()
if token in (')', ',') and level == 1:
n = "".join(super)
if n in dict:
# we know this super class
n = dict[n]
else:
c = n.split('.')
if len(c) > 1:
# super class is of the form
# module.class: look in module for
# class
m = c[-2]
c = c[-1]
if m in _modules:
d = _modules[m]
if c in d:
n = d[c]
names.append(n)
if token == '(':
level += 1
elif token == ')':
level -= 1
if level == 0:
break
elif token == ',' and level == 1:
pass
else:
super.append(token)
inherit = names
cur_class = Class(module, class_name, inherit, file, lineno)
dict[class_name] = cur_class
classstack.append((cur_class, thisindent))
elif token == 'import' and start[1] == 0:
modules = _getnamelist(g)
for mod, mod2 in modules:
readmodule_ex(mod, path, inpackage)
elif token == 'from' and start[1] == 0:
mod, token = _getname(g)
if not mod or token != "import":
continue
names = _getnamelist(g)
try:
# recursively read the imported module
d = readmodule_ex(n, path, inpackage)
d = readmodule_ex(mod, path, inpackage)
except:
##print 'module', n, 'not found'
pass
elif m.start("ImportFrom") >= 0:
# from module import stuff
mod = m.group("ImportFromPath")
names = m.group("ImportFromList").split(',')
try:
# recursively read the imported module
d = readmodule_ex(mod, path, inpackage)
except:
##print 'module', mod, 'not found'
continue
# add any classes that were defined in the
# imported module to our name space if they
# were mentioned in the list
for n in names:
n = n.strip()
if n in d:
dict[n] = d[n]
elif n == '*':
# only add a name if not
# already there (to mimic what
# Python does internally)
# also don't add names that
# start with _
for n in d:
if n[0] != '_' and \
not n in dict:
dict[n] = d[n]
else:
assert 0, "regexp _getnext found something unexpected"
continue
# add any classes that were defined in the imported module
# to our name space if they were mentioned in the list
for n, n2 in names:
if n in d:
dict[n2 or n] = d[n]
elif n == '*':
# only add a name if not already there (to mimic
# what Python does internally) also don't add
# names that start with _
for n in d:
if n[0] != '_' and not n in dict:
dict[n] = d[n]
except StopIteration:
pass
f.close()
return dict
def _indent(ws, _expandtabs=None):
    """Return the width of the whitespace string *ws* after tab expansion.

    Tabs in *ws* are expanded using the module-level TABWIDTH setting and
    the length of the expanded string is returned.

    *_expandtabs* is an optional callable taking (string, tabsize) and
    returning the expanded string.  When it is omitted, the built-in
    str.expandtabs method is used; this avoids depending on the
    deprecated string.expandtabs module function, which no longer exists
    in Python 3.
    """
    if _expandtabs is None:
        return len(ws.expandtabs(TABWIDTH))
    return len(_expandtabs(ws, TABWIDTH))
def _getnamelist(g):
    """Parse a comma-separated list of dotted names with optional 'as' clauses.

    *g* is a token iterator such as the one returned by
    tokenize.generate_tokens(); each item is a tuple whose second element
    is the token string.

    Return a list of pairs (name, name2) where name is the dotted name and
    name2 is the 'as' name, or None if there is no 'as' clause.

    StopIteration raised by an exhausted *g* is not caught here; it
    propagates to the caller.
    """
    names = []
    while True:
        name, token = _getname(g)
        if not name:
            # No (further) dotted name: the list is finished.
            break
        if token == 'as':
            name2, token = _getname(g)
        else:
            name2 = None
        names.append((name, name2))
        # Skip any unexpected tokens up to the next comma or end of line.
        while token != "," and "\n" not in token:
            # next(g) rather than g.next(): works with any iterator on
            # Python 2.6+ and Python 3.
            token = next(g)[1]
        if token != ",":
            break
    return names
def _getname(g):
    """Read one dotted name from the token iterator *g*.

    *g* yields tuples whose first two elements are the token type and the
    token string (as tokenize.generate_tokens() does).

    Return a pair (name, token): name is the dotted name (a lone '*' is
    accepted as a name), or None if the first token read does not start a
    name; token is the string of the last token read, i.e. the input
    token that follows the name.

    StopIteration raised by an exhausted *g* propagates to the caller.
    """
    parts = []
    # next(g) rather than g.next(): works with any iterator on
    # Python 2.6+ and Python 3.
    tokentype, token = next(g)[:2]
    if tokentype != NAME and token != '*':
        return (None, token)
    parts.append(token)
    while True:
        tokentype, token = next(g)[:2]
        if token != '.':
            break
        # A dot must be followed by another NAME to extend the dotted name.
        tokentype, token = next(g)[:2]
        if tokentype != NAME:
            break
        parts.append(token)
    return (".".join(parts), token)