2019-08-21 22:38:39 -03:00
|
|
|
|
"""Classes representing state-machine concepts"""
|
|
|
|
|
|
|
|
|
|
class NFA:
    """A non-deterministic finite automaton.

    A non-deterministic automaton is a form of finite state machine whose
    rules are less restrictive than those of a DFA. The NFA rules are:

       * A transition can be non-deterministic and can result in
         nothing, one, or two or more states.

       * An epsilon transition consuming empty input is valid.
         Transitions consuming labeled symbols are also permitted.

    This class assumes that there is only one starting state and one
    accepting (ending) state.

    Attributes:
        name (str): The name of the rule the NFA is representing.
        start (NFAState): The starting state.
        end (NFAState): The ending state.
    """

    def __init__(self, start, end):
        # The rule name is taken from the starting state.
        self.name = start.rule_name
        self.start = start
        self.end = end

    def __repr__(self):
        return "NFA(start={}, end={})".format(self.start, self.end)

    def _numbered_states(self):
        """Walk the states reachable from ``self.start`` and number them.

        States are numbered in order of discovery: the starting state is 0
        and every other state receives the next free number the first time
        an arc pointing to it is seen.

        Yields:
            Tuples ``(index, state, arcs)`` where *arcs* is a list of
            ``(label, target_index)`` pairs, one per outgoing arc of
            *state*, in the order the arcs are stored on the state.
        """
        discovered = [self.start]
        # States are used as dictionary keys, so numbering is O(1) per arc
        # instead of a linear scan over the list of discovered states.
        number_of = {self.start: 0}
        # NB: *discovered* grows while it is being iterated; this is what
        # drives the traversal.
        for index, state in enumerate(discovered):
            numbered_arcs = []
            for arc in state.arcs:
                target = arc.target
                target_index = number_of.get(target)
                if target_index is None:
                    target_index = len(discovered)
                    number_of[target] = target_index
                    discovered.append(target)
                numbered_arcs.append((arc.label, target_index))
            yield index, state, numbered_arcs

    def dump(self, writer=print):
        """Dump a textual representation of the NFA.

        Parameters:
            writer (Callable): Called once per output line. State headers
                are passed as three positional arguments, arcs as a single
                preformatted string. Defaults to ``print``.
        """
        for i, state, arcs in self._numbered_states():
            writer(" State", i, "(final)" if state is self.end else "")
            for label, j in arcs:
                if label is None:
                    # Epsilon transition: there is no label to show.
                    writer(" -> %d" % j)
                else:
                    writer(" %s -> %d" % (label, j))

    def dump_graph(self, writer):
        """Dump a DOT representation of the NFA.

        Parameters:
            writer (Callable[[str], Any]): Called with each chunk of DOT
                source, newline included.
        """
        writer('digraph %s_nfa {\n' % self.name)
        for i, state, arcs in self._numbered_states():
            writer(' %d [label="State %d %s"];\n' % (i, i, "(final)" if state is self.end else ""))
            for label, j in arcs:
                if label is None:
                    # Epsilon transitions are drawn as dotted edges.
                    writer(" %d -> %d [style=dotted label=ε];\n" % (i, j))
                else:
                    # Single quotes in the label are replaced by double
                    # quotes before it is written as the edge label.
                    writer(" %d -> %d [label=%s];\n" % (i, j, label.replace("'", '"')))
        writer('}\n')
|
|
|
|
|
|
2019-08-21 22:38:39 -03:00
|
|
|
|
|
|
|
|
|
class NFAArc:
    """A transition (arc) leading to an NFA state.

    There are two kinds of arcs between NFA states:

       * Label transitions, which can only be taken by consuming an
         input equal to the arc's label.
       * Epsilon transitions, which can be taken without consuming
         any input symbol.

    Attributes:
        target (NFAState): The state the arc leads to.
        label (Optional[str]): The label that has to be consumed to take
            the arc; `None` marks an epsilon transition.
    """

    def __init__(self, target, label):
        self.label = label
        self.target = target

    def __repr__(self):
        class_name = type(self).__name__
        return "<{}: {}>".format(class_name, self.label)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class NFAState:
    """A state of an NFA (non-deterministic finite automaton).

    Attributes:
        rule_name (str): The name of the rule the state belongs to.
        arcs (List[NFAArc]): The outgoing transitions of the state, in the
            order in which they were added. Several arcs may carry the same
            label, and arcs with a `None` label are epsilon transitions.
    """

    def __init__(self, rule_name):
        self.rule_name = rule_name
        self.arcs = []

    def add_arc(self, target, label=None):
        """Add a new arc to connect the state to a target state within the NFA.

        The method appends a new arc to the list of arcs available as
        transitions from the present state. An optional label indicates a
        named transition that consumes an input, while the absence of a label
        represents an epsilon transition.

        Parameters:
            target (NFAState): The end of the transition that the arc represents.
            label (Optional[str]): The label that must be consumed for making
                the transition. If the label is not provided the transition is
                assumed to be an epsilon-transition.
        """
        assert label is None or isinstance(label, str)
        assert isinstance(target, NFAState)
        self.arcs.append(NFAArc(target, label))

    def __repr__(self):
        return "<%s: from %s>" % (self.__class__.__name__, self.rule_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DFA:
    """A deterministic finite automaton.

    A deterministic finite automaton is a form of finite state machine
    that obeys the following rules:

       * Each of the transitions is uniquely determined by
         the source state and input symbol.
       * Reading an input symbol is required for each state
         transition (no epsilon transitions).

    The finite-state machine will accept or reject a string of symbols
    and only produces a unique computation of the automaton for each input
    string. The DFA must have a unique starting state (represented as the first
    element in the list of states) but can have multiple final states.

    Attributes:
        name (str): The name of the rule the DFA is representing.
        states (List[DFAState]): A collection of DFA states.
    """

    def __init__(self, name, states):
        self.name = name
        self.states = states

    @classmethod
    def from_nfa(cls, nfa):
        """Constructs a DFA from a NFA using the Rabin–Scott construction algorithm.

        To simulate the operation of a DFA on a given input string, it's
        necessary to keep track of a single state at any time, or more precisely,
        the state that the automaton will reach after seeing a prefix of the
        input. In contrast, to simulate an NFA, it's necessary to keep track of
        a set of states: all of the states that the automaton could reach after
        seeing the same prefix of the input, according to the nondeterministic
        choices made by the automaton. There are two possible sources of
        non-determinism:

        1) Multiple (one or more) transitions with the same label

                           'A'     +-------+
                      +----------->+ State +----------->+
                      |            |   2   |
                  +-------+        +-------+
                  | State |
                  |   1   |        +-------+
                  +-------+        | State |
                      +----------->+   3   +----------->+
                           'A'     +-------+

        2) Epsilon transitions (transitions that can be taken without consuming any input)

                  +-------+            +-------+
                  | State |     ε      | State |
                  |   1   +----------->+   2   +----------->+
                  +-------+            +-------+

        Looking at the first case above, we can't determine which of the two
        transitions should be followed when given an input A. In the second
        case the problem is that we can choose either to follow the epsilon
        transition or not to follow it. To solve this problem we can imagine
        that we follow all possibilities at the same time and we construct new
        states from the set of all possible reachable states. For every case
        in the previous example:

        1) For multiple transitions with the same label we collapse all of the
           final states under the same one

                  +-------+            +-------+
                  | State |    'A'     | State |
                  |   1   +----------->+  2-3  +----------->+
                  +-------+            +-------+

        2) For epsilon transitions we collapse all epsilon-reachable states
           into the same one

                  +-------+
                  | State |
                  |  1-2  +----------->
                  +-------+

        Because the DFA states consist of sets of NFA states, an n-state NFA
        may be converted to a DFA with at most 2**n states. Notice that the
        constructed DFA is not minimal and can be simplified or reduced
        afterwards.

        Parameters:
            nfa (NFA): The NFA to transform to DFA.

        Returns:
            DFA: A new DFA named after *nfa* whose first state is built from
            the epsilon-closure of the NFA starting state.
        """
        assert isinstance(nfa, NFA)

        def add_closure(nfa_state, base_nfa_set):
            """Calculate the epsilon-closure of a given state

            Add to the *base_nfa_set* all the states that are
            reachable from *nfa_state* via epsilon-transitions.
            """
            assert isinstance(nfa_state, NFAState)
            if nfa_state in base_nfa_set:
                # Already visited; this also stops the recursion on cycles.
                return
            base_nfa_set.add(nfa_state)
            for nfa_arc in nfa_state.arcs:
                if nfa_arc.label is None:
                    add_closure(nfa_arc.target, base_nfa_set)

        # Calculate the epsilon-closure of the starting state
        base_nfa_set = set()
        add_closure(nfa.start, base_nfa_set)

        # Start by visiting the NFA starting state (there is only one).
        states = [DFAState(nfa.name, base_nfa_set, nfa.end)]

        for state in states:  # NB states grow while we're iterating

            # Find transitions from the current state to other reachable states
            # and store them in a mapping that correlates the label to all the
            # possible reachable states that can be obtained by consuming a
            # token equal to the label. Each set of all the states that can
            # be reached after following a label will be a DFA state.
            arcs = {}
            for nfa_state in state.nfa_set:
                for nfa_arc in nfa_state.arcs:
                    if nfa_arc.label is not None:
                        nfa_set = arcs.setdefault(nfa_arc.label, set())
                        # All states that can be reached by epsilon-transitions
                        # are also included in the set of reachable states.
                        add_closure(nfa_arc.target, nfa_set)

            # Now create new DFAs by visiting all possible transitions between
            # the current DFA state and the new power-set states (each nfa_set)
            # via the different labels. As the nodes are appended to *states* this
            # is performing a breadth-first search traversal over the power-set of
            # the states of the original NFA.
            for label, nfa_set in sorted(arcs.items()):
                for existing_state in states:
                    if existing_state.nfa_set == nfa_set:
                        # The DFA state already exists for this rule.
                        next_state = existing_state
                        break
                else:
                    # No existing DFA state was built from this set of NFA
                    # states: create it and queue it for a later visit.
                    next_state = DFAState(nfa.name, nfa_set, nfa.end)
                    states.append(next_state)

                # Add a transition between the current DFA state and the new
                # DFA state (the power-set state) via the current label.
                state.add_arc(next_state, label)

        return cls(nfa.name, states)

    def __iter__(self):
        return iter(self.states)

    def simplify(self):
        """Attempt to reduce the number of states of the DFA

        Transform the DFA into an equivalent DFA that has fewer states. Two
        classes of states can be removed or merged from the original DFA without
        affecting the language it accepts to minimize it:

            * Unreachable states can not be reached from the initial
              state of the DFA, for any input string.
            * Nondistinguishable states are those that cannot be distinguished
              from one another for any input string.

        This algorithm does not achieve the optimal fully-reduced solution, but it
        works well enough for the particularities of the Python grammar. The
        algorithm repeatedly looks for two states that have the same set of
        arcs (same labels pointing to the same nodes) and unifies them, until
        things stop changing.
        """
        changes = True
        while changes:
            changes = False
            for i, state_i in enumerate(self.states):
                for j in range(i + 1, len(self.states)):
                    state_j = self.states[j]
                    if state_i == state_j:
                        # Drop the duplicate and redirect every arc that
                        # pointed to it towards the state that is kept.
                        del self.states[j]
                        for state in self.states:
                            state.unifystate(state_j, state_i)
                        changes = True
                        # The list changed under the index loop, so stop
                        # scanning for this state_i; the enclosing while
                        # loop runs another full pass.
                        break

    def _state_indices(self):
        """Map ``id(state)`` to the position of the state in ``self.states``.

        The lookup is keyed by identity on purpose. ``self.states.index(x)``
        compares with ``==``, so when the states define a structural
        ``__eq__`` it can return the position of a different state that
        merely compares equal to ``x``.
        """
        return {id(state): index for index, state in enumerate(self.states)}

    def dump(self, writer=print):
        """Dump a textual representation of the DFA.

        Parameters:
            writer (Callable): Called once per output line. State headers
                are passed as three positional arguments, arcs as a single
                preformatted string. Defaults to ``print``.
        """
        index_of = self._state_indices()
        for i, state in enumerate(self.states):
            writer(" State", i, "(final)" if state.is_final else "")
            for label, target in sorted(state.arcs.items()):
                writer(" %s -> %d" % (label, index_of[id(target)]))

    def dump_graph(self, writer):
        """Dump a DOT representation of the DFA.

        Parameters:
            writer (Callable[[str], Any]): Called with each chunk of DOT
                source, newline included.
        """
        index_of = self._state_indices()
        writer('digraph %s_dfa {\n' % self.name)
        for i, state in enumerate(self.states):
            writer(' %d [label="State %d %s"];\n' % (i, i, "(final)" if state.is_final else ""))
            for label, target in sorted(state.arcs.items()):
                # Single quotes in the label are replaced by double quotes
                # before it is written as the edge label.
                writer(" %d -> %d [label=%s];\n" % (i, index_of[id(target)], label.replace("'", '"')))
        writer('}\n')
|
|
|
|
|
|
2019-08-21 22:38:39 -03:00
|
|
|
|
|
|
|
|
|
class DFAState:
    """A state of a DFA

    Attributes:
        rule_name (str): The name of the DFA rule containing the represented state.
        nfa_set (Set[NFAState]): The set of NFA states used to create this state.
        is_final (bool): True if the state represents an accepting state of the
            DFA containing this state, that is, if the NFA ending state is
            part of *nfa_set*.
        arcs (Dict[str, DFAState]): A mapping representing transitions between
            the current DFA state and another DFA state via following a label.
    """

    def __init__(self, rule_name, nfa_set, final):
        """Create a DFA state from a set of NFA states.

        Parameters:
            rule_name (str): The name of the rule the state belongs to.
            nfa_set (Set[NFAState]): The NFA states collapsed into this state.
            final (NFAState): The ending state of the NFA; the new state is
                final when *final* is a member of *nfa_set*.
        """
        assert isinstance(nfa_set, set)
        # Only one arbitrary member is inspected as a sanity check.
        assert isinstance(next(iter(nfa_set)), NFAState)
        assert isinstance(final, NFAState)
        self.rule_name = rule_name
        self.nfa_set = nfa_set
        self.arcs = {}  # map from terminals/nonterminals to DFAState
        self.is_final = final in nfa_set

    def add_arc(self, target, label):
        """Add a new arc to the current state.

        Parameters:
            target (DFAState): The DFA state at the end of the arc.
            label (str): The label representing the token that must be consumed
                to perform this transition.
        """
        assert isinstance(label, str)
        # A DFA has at most one transition per label.
        assert label not in self.arcs
        assert isinstance(target, DFAState)
        self.arcs[label] = target

    def unifystate(self, old, new):
        """Replace all arcs from the current node to *old* with *new*.

        Parameters:
            old (DFAState): The DFA state to remove from all existing arcs.
            new (DFAState): The DFA state to replace in all existing arcs.
        """
        # Only values of existing keys are reassigned, so the size of the
        # dictionary does not change while it is being iterated.
        for label, next_ in self.arcs.items():
            if next_ is old:
                self.arcs[label] = new

    def __eq__(self, other):
        """Return True if both states are final alike and have the same arcs.

        Two arcs are the same when they have the same label and point to the
        very same state object. Neither *nfa_set* nor *rule_name* takes part
        in the comparison.
        """
        if not isinstance(other, DFAState):
            # Let Python try the reflected comparison and finally fall back
            # to identity instead of failing on foreign operands.
            return NotImplemented
        if self.is_final != other.is_final:
            return False
        # We cannot just return self.arcs == other.arcs because that
        # would invoke this method recursively if there are any cycles.
        if len(self.arcs) != len(other.arcs):
            return False
        return all(
            next_ is other.arcs.get(label)
            for label, next_ in self.arcs.items()
        )

    __hash__ = None  # For Py3 compatibility.

    def __repr__(self):
        return "<%s: %s is_final=%s>" % (
            self.__class__.__name__,
            self.rule_name,
            self.is_final,
        )
|