import ast import contextlib import re from abc import abstractmethod from typing import ( IO, AbstractSet, Any, Dict, Iterable, Iterator, List, Optional, Set, Text, Tuple, Union, ) from pegen import sccutils from pegen.grammar import ( Alt, Cut, Forced, Gather, Grammar, GrammarError, GrammarVisitor, Group, Lookahead, NamedItem, NameLeaf, Opt, Plain, Repeat0, Repeat1, Rhs, Rule, StringLeaf, ) class RuleCollectorVisitor(GrammarVisitor): """Visitor that invokes a provieded callmaker visitor with just the NamedItem nodes""" def __init__(self, rules: Dict[str, Rule], callmakervisitor: GrammarVisitor) -> None: self.rulses = rules self.callmaker = callmakervisitor def visit_Rule(self, rule: Rule) -> None: self.visit(rule.flatten()) def visit_NamedItem(self, item: NamedItem) -> None: self.callmaker.visit(item) class KeywordCollectorVisitor(GrammarVisitor): """Visitor that collects all the keywods and soft keywords in the Grammar""" def __init__(self, gen: "ParserGenerator", keywords: Dict[str, int], soft_keywords: Set[str]): self.generator = gen self.keywords = keywords self.soft_keywords = soft_keywords def visit_StringLeaf(self, node: StringLeaf) -> None: val = ast.literal_eval(node.value) if re.match(r"[a-zA-Z_]\w*\Z", val): # This is a keyword if node.value.endswith("'") and node.value not in self.keywords: self.keywords[val] = self.generator.keyword_type() else: return self.soft_keywords.add(node.value.replace('"', "")) class RuleCheckingVisitor(GrammarVisitor): def __init__(self, rules: Dict[str, Rule], tokens: Set[str]): self.rules = rules self.tokens = tokens def visit_NameLeaf(self, node: NameLeaf) -> None: if node.value not in self.rules and node.value not in self.tokens: raise GrammarError(f"Dangling reference to rule {node.value!r}") def visit_NamedItem(self, node: NamedItem) -> None: if node.name and node.name.startswith("_"): raise GrammarError(f"Variable names cannot start with underscore: '{node.name}'") self.visit(node.item) class ParserGenerator: callmakervisitor: GrammarVisitor def __init__(self, grammar: Grammar, tokens: Set[str], file: Optional[IO[Text]]): self.grammar = grammar self.tokens = tokens self.keywords: Dict[str, int] = {} self.soft_keywords: Set[str] = set() self.rules = grammar.rules self.validate_rule_names() if "trailer" not in grammar.metas and "start" not in self.rules: raise GrammarError("Grammar without a trailer must have a 'start' rule") checker = RuleCheckingVisitor(self.rules, self.tokens) for rule in self.rules.values(): checker.visit(rule) self.file = file self.level = 0 self.first_graph, self.first_sccs = compute_left_recursives(self.rules) self.counter = 0 # For name_rule()/name_loop() self.keyword_counter = 499 # For keyword_type() self.all_rules: Dict[str, Rule] = self.rules.copy() # Rules + temporal rules self._local_variable_stack: List[List[str]] = [] def validate_rule_names(self) -> None: for rule in self.rules: if rule.startswith("_"): raise GrammarError(f"Rule names cannot start with underscore: '{rule}'") @contextlib.contextmanager def local_variable_context(self) -> Iterator[None]: self._local_variable_stack.append([]) yield self._local_variable_stack.pop() @property def local_variable_names(self) -> List[str]: return self._local_variable_stack[-1] @abstractmethod def generate(self, filename: str) -> None: raise NotImplementedError @contextlib.contextmanager def indent(self) -> Iterator[None]: self.level += 1 try: yield finally: self.level -= 1 def print(self, *args: object) -> None: if not args: print(file=self.file) else: print(" " * self.level, end="", file=self.file) print(*args, file=self.file) def printblock(self, lines: str) -> None: for line in lines.splitlines(): self.print(line) def collect_rules(self) -> None: keyword_collector = KeywordCollectorVisitor(self, self.keywords, self.soft_keywords) for rule in self.all_rules.values(): keyword_collector.visit(rule) rule_collector = RuleCollectorVisitor(self.rules, self.callmakervisitor) done: Set[str] = set() while True: computed_rules = list(self.all_rules) todo = [i for i in computed_rules if i not in done] if not todo: break done = set(self.all_rules) for rulename in todo: rule_collector.visit(self.all_rules[rulename]) def keyword_type(self) -> int: self.keyword_counter += 1 return self.keyword_counter def artifical_rule_from_rhs(self, rhs: Rhs) -> str: self.counter += 1 name = f"_tmp_{self.counter}" # TODO: Pick a nicer name. self.all_rules[name] = Rule(name, None, rhs) return name def artificial_rule_from_repeat(self, node: Plain, is_repeat1: bool) -> str: self.counter += 1 if is_repeat1: prefix = "_loop1_" else: prefix = "_loop0_" name = f"{prefix}{self.counter}" self.all_rules[name] = Rule(name, None, Rhs([Alt([NamedItem(None, node)])])) return name def artifical_rule_from_gather(self, node: Gather) -> str: self.counter += 1 name = f"_gather_{self.counter}" self.counter += 1 extra_function_name = f"_loop0_{self.counter}" extra_function_alt = Alt( [NamedItem(None, node.separator), NamedItem("elem", node.node)], action="elem", ) self.all_rules[extra_function_name] = Rule( extra_function_name, None, Rhs([extra_function_alt]), ) alt = Alt( [NamedItem("elem", node.node), NamedItem("seq", NameLeaf(extra_function_name))], ) self.all_rules[name] = Rule( name, None, Rhs([alt]), ) return name def dedupe(self, name: str) -> str: origname = name counter = 0 while name in self.local_variable_names: counter += 1 name = f"{origname}_{counter}" self.local_variable_names.append(name) return name class NullableVisitor(GrammarVisitor): def __init__(self, rules: Dict[str, Rule]) -> None: self.rules = rules self.visited: Set[Any] = set() self.nullables: Set[Union[Rule, NamedItem]] = set() def visit_Rule(self, rule: Rule) -> bool: if rule in self.visited: return False self.visited.add(rule) if self.visit(rule.rhs): self.nullables.add(rule) return rule in self.nullables def visit_Rhs(self, rhs: Rhs) -> bool: for alt in rhs.alts: if self.visit(alt): return True return False def visit_Alt(self, alt: Alt) -> bool: for item in alt.items: if not self.visit(item): return False return True def visit_Forced(self, force: Forced) -> bool: return True def visit_LookAhead(self, lookahead: Lookahead) -> bool: return True def visit_Opt(self, opt: Opt) -> bool: return True def visit_Repeat0(self, repeat: Repeat0) -> bool: return True def visit_Repeat1(self, repeat: Repeat1) -> bool: return False def visit_Gather(self, gather: Gather) -> bool: return False def visit_Cut(self, cut: Cut) -> bool: return False def visit_Group(self, group: Group) -> bool: return self.visit(group.rhs) def visit_NamedItem(self, item: NamedItem) -> bool: if self.visit(item.item): self.nullables.add(item) return item in self.nullables def visit_NameLeaf(self, node: NameLeaf) -> bool: if node.value in self.rules: return self.visit(self.rules[node.value]) # Token or unknown; never empty. return False def visit_StringLeaf(self, node: StringLeaf) -> bool: # The string token '' is considered empty. return not node.value def compute_nullables(rules: Dict[str, Rule]) -> Set[Any]: """Compute which rules in a grammar are nullable. Thanks to TatSu (tatsu/leftrec.py) for inspiration. """ nullable_visitor = NullableVisitor(rules) for rule in rules.values(): nullable_visitor.visit(rule) return nullable_visitor.nullables class InitialNamesVisitor(GrammarVisitor): def __init__(self, rules: Dict[str, Rule]) -> None: self.rules = rules self.nullables = compute_nullables(rules) def generic_visit(self, node: Iterable[Any], *args: Any, **kwargs: Any) -> Set[Any]: names: Set[str] = set() for value in node: if isinstance(value, list): for item in value: names |= self.visit(item, *args, **kwargs) else: names |= self.visit(value, *args, **kwargs) return names def visit_Alt(self, alt: Alt) -> Set[Any]: names: Set[str] = set() for item in alt.items: names |= self.visit(item) if item not in self.nullables: break return names def visit_Forced(self, force: Forced) -> Set[Any]: return set() def visit_LookAhead(self, lookahead: Lookahead) -> Set[Any]: return set() def visit_Cut(self, cut: Cut) -> Set[Any]: return set() def visit_NameLeaf(self, node: NameLeaf) -> Set[Any]: return {node.value} def visit_StringLeaf(self, node: StringLeaf) -> Set[Any]: return set() def compute_left_recursives( rules: Dict[str, Rule] ) -> Tuple[Dict[str, AbstractSet[str]], List[AbstractSet[str]]]: graph = make_first_graph(rules) sccs = list(sccutils.strongly_connected_components(graph.keys(), graph)) for scc in sccs: if len(scc) > 1: for name in scc: rules[name].left_recursive = True # Try to find a leader such that all cycles go through it. leaders = set(scc) for start in scc: for cycle in sccutils.find_cycles_in_scc(graph, scc, start): # print("Cycle:", " -> ".join(cycle)) leaders -= scc - set(cycle) if not leaders: raise ValueError( f"SCC {scc} has no leadership candidate (no element is included in all cycles)" ) # print("Leaders:", leaders) leader = min(leaders) # Pick an arbitrary leader from the candidates. rules[leader].leader = True else: name = min(scc) # The only element. if name in graph[name]: rules[name].left_recursive = True rules[name].leader = True return graph, sccs def make_first_graph(rules: Dict[str, Rule]) -> Dict[str, AbstractSet[str]]: """Compute the graph of left-invocations. There's an edge from A to B if A may invoke B at its initial position. Note that this requires the nullable flags to have been computed. """ initial_name_visitor = InitialNamesVisitor(rules) graph = {} vertices: Set[str] = set() for rulename, rhs in rules.items(): graph[rulename] = names = initial_name_visitor.visit(rhs) vertices |= names for vertex in vertices: graph.setdefault(vertex, set()) return graph