284 lines
9.8 KiB
Python
284 lines
9.8 KiB
Python
"Utility functions used by the btm_matcher module"
|
|
|
|
from . import pytree
|
|
from .pgen2 import grammar, token
|
|
from .pygram import pattern_symbols, python_symbols
|
|
|
|
syms = pattern_symbols
|
|
pysyms = python_symbols
|
|
tokens = grammar.opmap
|
|
token_labels = token
|
|
|
|
TYPE_ANY = -1
|
|
TYPE_ALTERNATIVES = -2
|
|
TYPE_GROUP = -3
|
|
|
|
class MinNode(object):
|
|
"""This class serves as an intermediate representation of the
|
|
pattern tree during the conversion to sets of leaf-to-root
|
|
subpatterns"""
|
|
|
|
def __init__(self, type=None, name=None):
|
|
self.type = type
|
|
self.name = name
|
|
self.children = []
|
|
self.leaf = False
|
|
self.parent = None
|
|
self.alternatives = []
|
|
self.group = []
|
|
|
|
def __repr__(self):
|
|
return str(self.type) + ' ' + str(self.name)
|
|
|
|
def leaf_to_root(self):
|
|
"""Internal method. Returns a characteristic path of the
|
|
pattern tree. This method must be run for all leaves until the
|
|
linear subpatterns are merged into a single"""
|
|
node = self
|
|
subp = []
|
|
while node:
|
|
if node.type == TYPE_ALTERNATIVES:
|
|
node.alternatives.append(subp)
|
|
if len(node.alternatives) == len(node.children):
|
|
#last alternative
|
|
subp = [tuple(node.alternatives)]
|
|
node.alternatives = []
|
|
node = node.parent
|
|
continue
|
|
else:
|
|
node = node.parent
|
|
subp = None
|
|
break
|
|
|
|
if node.type == TYPE_GROUP:
|
|
node.group.append(subp)
|
|
#probably should check the number of leaves
|
|
if len(node.group) == len(node.children):
|
|
subp = get_characteristic_subpattern(node.group)
|
|
node.group = []
|
|
node = node.parent
|
|
continue
|
|
else:
|
|
node = node.parent
|
|
subp = None
|
|
break
|
|
|
|
if node.type == token_labels.NAME and node.name:
|
|
#in case of type=name, use the name instead
|
|
subp.append(node.name)
|
|
else:
|
|
subp.append(node.type)
|
|
|
|
node = node.parent
|
|
return subp
|
|
|
|
def get_linear_subpattern(self):
|
|
"""Drives the leaf_to_root method. The reason that
|
|
leaf_to_root must be run multiple times is because we need to
|
|
reject 'group' matches; for example the alternative form
|
|
(a | b c) creates a group [b c] that needs to be matched. Since
|
|
matching multiple linear patterns overcomes the automaton's
|
|
capabilities, leaf_to_root merges each group into a single
|
|
choice based on 'characteristic'ity,
|
|
|
|
i.e. (a|b c) -> (a|b) if b more characteristic than c
|
|
|
|
Returns: The most 'characteristic'(as defined by
|
|
get_characteristic_subpattern) path for the compiled pattern
|
|
tree.
|
|
"""
|
|
|
|
for l in self.leaves():
|
|
subp = l.leaf_to_root()
|
|
if subp:
|
|
return subp
|
|
|
|
def leaves(self):
|
|
"Generator that returns the leaves of the tree"
|
|
for child in self.children:
|
|
for x in child.leaves():
|
|
yield x
|
|
if not self.children:
|
|
yield self
|
|
|
|
def reduce_tree(node, parent=None):
|
|
"""
|
|
Internal function. Reduces a compiled pattern tree to an
|
|
intermediate representation suitable for feeding the
|
|
automaton. This also trims off any optional pattern elements(like
|
|
[a], a*).
|
|
"""
|
|
|
|
new_node = None
|
|
#switch on the node type
|
|
if node.type == syms.Matcher:
|
|
#skip
|
|
node = node.children[0]
|
|
|
|
if node.type == syms.Alternatives :
|
|
#2 cases
|
|
if len(node.children) <= 2:
|
|
#just a single 'Alternative', skip this node
|
|
new_node = reduce_tree(node.children[0], parent)
|
|
else:
|
|
#real alternatives
|
|
new_node = MinNode(type=TYPE_ALTERNATIVES)
|
|
#skip odd children('|' tokens)
|
|
for child in node.children:
|
|
if node.children.index(child)%2:
|
|
continue
|
|
reduced = reduce_tree(child, new_node)
|
|
if reduced is not None:
|
|
new_node.children.append(reduced)
|
|
elif node.type == syms.Alternative:
|
|
if len(node.children) > 1:
|
|
|
|
new_node = MinNode(type=TYPE_GROUP)
|
|
for child in node.children:
|
|
reduced = reduce_tree(child, new_node)
|
|
if reduced:
|
|
new_node.children.append(reduced)
|
|
if not new_node.children:
|
|
# delete the group if all of the children were reduced to None
|
|
new_node = None
|
|
|
|
else:
|
|
new_node = reduce_tree(node.children[0], parent)
|
|
|
|
elif node.type == syms.Unit:
|
|
if (isinstance(node.children[0], pytree.Leaf) and
|
|
node.children[0].value == '('):
|
|
#skip parentheses
|
|
return reduce_tree(node.children[1], parent)
|
|
if ((isinstance(node.children[0], pytree.Leaf) and
|
|
node.children[0].value == '[')
|
|
or
|
|
(len(node.children)>1 and
|
|
hasattr(node.children[1], "value") and
|
|
node.children[1].value == '[')):
|
|
#skip whole unit if its optional
|
|
return None
|
|
|
|
leaf = True
|
|
details_node = None
|
|
alternatives_node = None
|
|
has_repeater = False
|
|
repeater_node = None
|
|
has_variable_name = False
|
|
|
|
for child in node.children:
|
|
if child.type == syms.Details:
|
|
leaf = False
|
|
details_node = child
|
|
elif child.type == syms.Repeater:
|
|
has_repeater = True
|
|
repeater_node = child
|
|
elif child.type == syms.Alternatives:
|
|
alternatives_node = child
|
|
if hasattr(child, 'value') and child.value == '=': # variable name
|
|
has_variable_name = True
|
|
|
|
#skip variable name
|
|
if has_variable_name:
|
|
#skip variable name, '='
|
|
name_leaf = node.children[2]
|
|
if hasattr(name_leaf, 'value') and name_leaf.value == '(':
|
|
# skip parenthesis
|
|
name_leaf = node.children[3]
|
|
else:
|
|
name_leaf = node.children[0]
|
|
|
|
#set node type
|
|
if name_leaf.type == token_labels.NAME:
|
|
#(python) non-name or wildcard
|
|
if name_leaf.value == 'any':
|
|
new_node = MinNode(type=TYPE_ANY)
|
|
else:
|
|
if hasattr(token_labels, name_leaf.value):
|
|
new_node = MinNode(type=getattr(token_labels, name_leaf.value))
|
|
else:
|
|
new_node = MinNode(type=getattr(pysyms, name_leaf.value))
|
|
|
|
elif name_leaf.type == token_labels.STRING:
|
|
#(python) name or character; remove the apostrophes from
|
|
#the string value
|
|
name = name_leaf.value.strip("'")
|
|
if name in tokens:
|
|
new_node = MinNode(type=tokens[name])
|
|
else:
|
|
new_node = MinNode(type=token_labels.NAME, name=name)
|
|
elif name_leaf.type == syms.Alternatives:
|
|
new_node = reduce_tree(alternatives_node, parent)
|
|
|
|
#handle repeaters
|
|
if has_repeater:
|
|
if repeater_node.children[0].value == '*':
|
|
#reduce to None
|
|
new_node = None
|
|
elif repeater_node.children[0].value == '+':
|
|
#reduce to a single occurrence i.e. do nothing
|
|
pass
|
|
else:
|
|
#TODO: handle {min, max} repeaters
|
|
raise NotImplementedError
|
|
pass
|
|
|
|
#add children
|
|
if details_node and new_node is not None:
|
|
for child in details_node.children[1:-1]:
|
|
#skip '<', '>' markers
|
|
reduced = reduce_tree(child, new_node)
|
|
if reduced is not None:
|
|
new_node.children.append(reduced)
|
|
if new_node:
|
|
new_node.parent = parent
|
|
return new_node
|
|
|
|
|
|
def get_characteristic_subpattern(subpatterns):
|
|
"""Picks the most characteristic from a list of linear patterns
|
|
Current order used is:
|
|
names > common_names > common_chars
|
|
"""
|
|
if not isinstance(subpatterns, list):
|
|
return subpatterns
|
|
if len(subpatterns)==1:
|
|
return subpatterns[0]
|
|
|
|
# first pick out the ones containing variable names
|
|
subpatterns_with_names = []
|
|
subpatterns_with_common_names = []
|
|
common_names = ['in', 'for', 'if' , 'not', 'None']
|
|
subpatterns_with_common_chars = []
|
|
common_chars = "[]().,:"
|
|
for subpattern in subpatterns:
|
|
if any(rec_test(subpattern, lambda x: type(x) is str)):
|
|
if any(rec_test(subpattern,
|
|
lambda x: isinstance(x, str) and x in common_chars)):
|
|
subpatterns_with_common_chars.append(subpattern)
|
|
elif any(rec_test(subpattern,
|
|
lambda x: isinstance(x, str) and x in common_names)):
|
|
subpatterns_with_common_names.append(subpattern)
|
|
|
|
else:
|
|
subpatterns_with_names.append(subpattern)
|
|
|
|
if subpatterns_with_names:
|
|
subpatterns = subpatterns_with_names
|
|
elif subpatterns_with_common_names:
|
|
subpatterns = subpatterns_with_common_names
|
|
elif subpatterns_with_common_chars:
|
|
subpatterns = subpatterns_with_common_chars
|
|
# of the remaining subpatterns pick out the longest one
|
|
return max(subpatterns, key=len)
|
|
|
|
def rec_test(sequence, test_func):
|
|
"""Tests test_func on all items of sequence and items of included
|
|
sub-iterables"""
|
|
for x in sequence:
|
|
if isinstance(x, (list, tuple)):
|
|
for y in rec_test(x, test_func):
|
|
yield y
|
|
else:
|
|
yield test_func(x)
|