cpython/Lib/lib2to3/btm_matcher.py

164 lines
6.5 KiB
Python
Raw Permalink Normal View History

Merged revisions 83852-83853,83857,84042,84216,84274-84276,84375,85388,85478,85506-85508 via svnmerge from svn+ssh://pythondev@svn.python.org/sandbox/trunk/2to3/lib2to3 ........ r83852 | benjamin.peterson | 2010-08-08 15:45:44 -0500 (Sun, 08 Aug 2010) | 1 line wrap with parens ........ r83853 | benjamin.peterson | 2010-08-08 15:46:31 -0500 (Sun, 08 Aug 2010) | 1 line use parens ........ r83857 | benjamin.peterson | 2010-08-08 15:59:49 -0500 (Sun, 08 Aug 2010) | 1 line things which use touch_import should be pre order ........ r84042 | george.boutsioukis | 2010-08-14 16:10:19 -0500 (Sat, 14 Aug 2010) | 2 lines This revision incorporates into the 2to3 tool the new, faster, tree matching algorithm developed during a GSOC project. The algorithm resides in the two added modules, btm_matcher and btm_utils. New code has been added to drive the new matching process in refactor.py and a few minor changes were made in other modules. A BM_compatible flag(False by default) has been added in fixer_base and it is set to True in most of the current fixers. ........ r84216 | benjamin.peterson | 2010-08-19 16:44:05 -0500 (Thu, 19 Aug 2010) | 1 line allow star_expr in testlist_gexp ........ r84274 | benjamin.peterson | 2010-08-22 18:40:46 -0500 (Sun, 22 Aug 2010) | 1 line wrap long line ........ r84275 | benjamin.peterson | 2010-08-22 18:42:22 -0500 (Sun, 22 Aug 2010) | 1 line cleanup ........ r84276 | benjamin.peterson | 2010-08-22 18:51:01 -0500 (Sun, 22 Aug 2010) | 1 line when there's a None value and a traceback, don't call type with it #9661 ........ r84375 | george.boutsioukis | 2010-08-31 08:38:53 -0500 (Tue, 31 Aug 2010) | 3 lines Idiomatic code changes & stylistic issues fixed in the BottomMatcher module. Thanks to Benjamin Peterson for taking the time to review the code. ........ r85388 | benjamin.peterson | 2010-10-12 17:27:44 -0500 (Tue, 12 Oct 2010) | 1 line fix urllib fixer with multiple as imports on a line #10069 ........ r85478 | benjamin.peterson | 2010-10-14 08:09:56 -0500 (Thu, 14 Oct 2010) | 1 line stop abusing docstrings ........ r85506 | benjamin.peterson | 2010-10-14 17:45:19 -0500 (Thu, 14 Oct 2010) | 1 line kill sibling import ........ r85507 | benjamin.peterson | 2010-10-14 17:54:15 -0500 (Thu, 14 Oct 2010) | 1 line remove trailing whitespace ........ r85508 | benjamin.peterson | 2010-10-14 17:55:28 -0500 (Thu, 14 Oct 2010) | 1 line typo ........
2010-10-14 20:00:04 -03:00
"""A bottom-up tree matching algorithm implementation meant to speed
up 2to3's matching process. After the tree patterns are reduced to
their rarest linear path, a linear Aho-Corasick automaton is
created. The linear automaton traverses the linear paths from the
leaves to the root of the AST and returns a set of nodes for further
matching. This reduces significantly the number of candidate nodes."""
__author__ = "George Boutsioukis <gboutsioukis@gmail.com>"
import logging
import itertools
from collections import defaultdict
from . import pytree
from .btm_utils import reduce_tree
class BMNode(object):
"""Class for a node of the Aho-Corasick automaton used in matching"""
count = itertools.count()
def __init__(self):
self.transition_table = {}
self.fixers = []
self.id = next(BMNode.count)
self.content = ''
class BottomMatcher(object):
"""The main matcher class. After instantiating the patterns should
be added using the add_fixer method"""
def __init__(self):
self.match = set()
self.root = BMNode()
self.nodes = [self.root]
self.fixers = []
self.logger = logging.getLogger("RefactoringTool")
def add_fixer(self, fixer):
"""Reduces a fixer's pattern tree to a linear path and adds it
to the matcher(a common Aho-Corasick automaton). The fixer is
appended on the matching states and called when they are
reached"""
self.fixers.append(fixer)
tree = reduce_tree(fixer.pattern_tree)
linear = tree.get_linear_subpattern()
match_nodes = self.add(linear, start=self.root)
for match_node in match_nodes:
match_node.fixers.append(fixer)
def add(self, pattern, start):
"Recursively adds a linear pattern to the AC automaton"
#print("adding pattern", pattern, "to", start)
if not pattern:
#print("empty pattern")
return [start]
if isinstance(pattern[0], tuple):
#alternatives
#print("alternatives")
match_nodes = []
for alternative in pattern[0]:
#add all alternatives, and add the rest of the pattern
#to each end node
end_nodes = self.add(alternative, start=start)
for end in end_nodes:
match_nodes.extend(self.add(pattern[1:], end))
return match_nodes
else:
#single token
#not last
if pattern[0] not in start.transition_table:
#transition did not exist, create new
next_node = BMNode()
start.transition_table[pattern[0]] = next_node
else:
#transition exists already, follow
next_node = start.transition_table[pattern[0]]
if pattern[1:]:
end_nodes = self.add(pattern[1:], start=next_node)
else:
end_nodes = [next_node]
return end_nodes
def run(self, leaves):
"""The main interface with the bottom matcher. The tree is
traversed from the bottom using the constructed
automaton. Nodes are only checked once as the tree is
retraversed. When the automaton fails, we give it one more
shot(in case the above tree matches as a whole with the
rejected leaf), then we break for the next leaf. There is the
special case of multiple arguments(see code comments) where we
recheck the nodes
Args:
The leaves of the AST tree to be matched
Returns:
A dictionary of node matches with fixers as the keys
"""
current_ac_node = self.root
results = defaultdict(list)
for leaf in leaves:
current_ast_node = leaf
while current_ast_node:
current_ast_node.was_checked = True
for child in current_ast_node.children:
# multiple statements, recheck
if isinstance(child, pytree.Leaf) and child.value == ";":
current_ast_node.was_checked = False
break
if current_ast_node.type == 1:
#name
node_token = current_ast_node.value
else:
node_token = current_ast_node.type
if node_token in current_ac_node.transition_table:
#token matches
current_ac_node = current_ac_node.transition_table[node_token]
for fixer in current_ac_node.fixers:
results[fixer].append(current_ast_node)
else:
#matching failed, reset automaton
current_ac_node = self.root
if (current_ast_node.parent is not None
and current_ast_node.parent.was_checked):
#the rest of the tree upwards has been checked, next leaf
break
#recheck the rejected node once from the root
if node_token in current_ac_node.transition_table:
#token matches
current_ac_node = current_ac_node.transition_table[node_token]
for fixer in current_ac_node.fixers:
results[fixer].append(current_ast_node)
current_ast_node = current_ast_node.parent
return results
def print_ac(self):
"Prints a graphviz diagram of the BM automaton(for debugging)"
print("digraph g{")
def print_node(node):
for subnode_key in node.transition_table.keys():
subnode = node.transition_table[subnode_key]
print("%d -> %d [label=%s] //%s" %
(node.id, subnode.id, type_repr(subnode_key), str(subnode.fixers)))
if subnode_key == 1:
print(subnode.content)
print_node(subnode)
print_node(self.root)
print("}")
# taken from pytree.py for debugging; only used by print_ac
_type_reprs = {}
def type_repr(type_num):
global _type_reprs
if not _type_reprs:
from .pygram import python_symbols
# printing tokens is possible but not as useful
# from .pgen2 import token // token.__dict__.items():
for name, val in python_symbols.__dict__.items():
if type(val) == int: _type_reprs[val] = name
return _type_reprs.setdefault(type_num, type_num)