from __future__ import annotations

import collections
import dataclasses as dc
import re
import shlex
from typing import Any

import libclinic
from libclinic import fail, ClinicError
from libclinic.language import Language
from libclinic.function import (
    Module, Class, Function)


@dc.dataclass(slots=True, repr=False)
class Block:
    r"""
    Represents a single block of text embedded in
    another file.  If dsl_name is None, the block represents
    verbatim text, raw original text from the file, in
    which case "input" will be the only non-false member.
    If dsl_name is not None, the block represents a Clinic
    block.

    input is always str, with embedded \n characters.
    input represents the original text from the file;
    if it's a Clinic block, it is the original text with
    the body_prefix and redundant leading whitespace removed.

    dsl_name is either str or None.  If str, it's the text
    found on the start line of the block between the square
    brackets.

    signatures is a list.
    It may only contain clinic.Module, clinic.Class, and
    clinic.Function objects.  At the moment it should
    contain at most one of each.

    output is either str or None.  If str, it's the output
    from this block, with embedded '\n' characters.

    indent is a str.  It's the leading whitespace
    that was found on every line of input.  (If body_prefix is
    not empty, this is the indent *after* removing the
    body_prefix.)

    "indent" is different from the concept of "preindent"
    (which is not stored as state on Block objects).
    "preindent" is the whitespace that
    was found in front of every line of input *before* the
    "body_prefix" (see the Language object).  If body_prefix
    is empty, preindent must always be empty too.

    To illustrate the difference between "indent" and "preindent":

    Assume that '_' represents whitespace.
    If the block processed was in a Python file, and looked like this:
      ____#/*[python]
      ____#__for a in range(20):
      ____#____print(a)
      ____#[python]*/
    "preindent" would be "____" and "indent" would be "__".

    """
    input: str
    dsl_name: str | None = None
    signatures: list[Module | Class | Function] = dc.field(default_factory=list)
    output: Any = None  # TODO: Very dynamic; probably untypeable in its current form?
    indent: str = ''

    def __repr__(self) -> str:
        """Return a short summary: the DSL name plus truncated input/output."""
        dsl_name = self.dsl_name or "text"

        def summarize(s: object) -> str:
            # Truncate long reprs; s[0] re-appends the opening quote
            # character so the shortened string repr still looks closed.
            s = repr(s)
            if len(s) > 30:
                return s[:26] + "..." + s[0]
            return s

        parts = (
            repr(dsl_name),
            f"input={summarize(self.input)}",
            f"output={summarize(self.output)}"
        )
        # Previously this returned an empty string and discarded "parts".
        return f"<clinic.Block {' '.join(parts)}>"


class BlockParser:
    """
    Block-oriented parser for Argument Clinic.
    Iterator, yields Block objects.
    """

    def __init__(
            self,
            input: str,
            language: Language,
            *,
            verify: bool = True
    ) -> None:
        r"""
        "input" should be a str object
        with embedded \n characters.

        "language" should be a Language object.

        "verify" controls whether the checksum found on a checksum
        line is compared against the checksum of the existing output.
        """
        language.validate()

        # Lines are stored reversed so that deque.pop() yields them in
        # file order and lines can be pushed back with extend().
        self.input = collections.deque(reversed(input.splitlines(keepends=True)))
        self.block_start_line_number = self.line_number = 0

        self.language = language
        before, _, after = language.start_line.partition('{dsl_name}')
        assert _ == '{dsl_name}'
        self.find_start_re = libclinic.create_regex(before, after,
                                                    whole_line=False)
        self.start_re = libclinic.create_regex(before, after)
        self.verify = verify
        # Cache of the most recently built checksum regex, keyed by DSL name.
        self.last_checksum_re: re.Pattern[str] | None = None
        self.last_dsl_name: str | None = None
        # Set by parse_verbatim_block() when it consumes a start line;
        # tells __next__() that a Clinic block comes next.
        self.dsl_name: str | None = None
        self.first_block = True

    def __iter__(self) -> BlockParser:
        return self

    def __next__(self) -> Block:
        """
        Return the next Block, raising StopIteration once the input
        is exhausted.  A ClinicError raised while parsing a Clinic block
        is annotated with the filename and current line number and
        re-raised.
        """
        while True:
            if not self.input:
                raise StopIteration

            if self.dsl_name:
                try:
                    return_value = self.parse_clinic_block(self.dsl_name)
                except ClinicError as exc:
                    exc.filename = self.language.filename
                    exc.lineno = self.line_number
                    raise
                self.dsl_name = None
                self.first_block = False
                return return_value
            block = self.parse_verbatim_block()
            # Skip an empty verbatim block at the very start of the input
            # (i.e. the input begins with a start line).
            if self.first_block and not block.input:
                continue
            self.first_block = False
            return block

    def is_start_line(self, line: str) -> str | None:
        """Return the DSL name if "line" is a block start line, else None."""
        match = self.start_re.match(line.lstrip())
        return match.group(1) if match else None

    def _line(self, lookahead: bool = False) -> str:
        """
        Pop and return the next input line, advancing line_number.
        Unless "lookahead" is true, the line is also passed to
        language.parse_line().
        """
        self.line_number += 1
        line = self.input.pop()
        if not lookahead:
            self.language.parse_line(line)
        return line

    def parse_verbatim_block(self) -> Block:
        """
        Collect raw lines up to (but not including) the next start line
        and return them as a verbatim Block.  The start line itself is
        consumed and its DSL name is recorded in self.dsl_name.
        """
        lines = []
        self.block_start_line_number = self.line_number

        while self.input:
            line = self._line()
            dsl_name = self.is_start_line(line)
            if dsl_name:
                self.dsl_name = dsl_name
                break
            lines.append(line)

        return Block("".join(lines))

    def parse_clinic_block(self, dsl_name: str) -> Block:
        """
        Parse the body of a Clinic block whose start line has already
        been consumed, plus any existing output and checksum line that
        follow it.  Returns a Block with the body as "input" and, if a
        checksum line was found, the existing output as "output".

        Calls fail() on a malformed stop line, a mangled checksum line,
        or (when verifying) a checksum mismatch.
        """
        in_lines = []
        self.block_start_line_number = self.line_number + 1
        stop_line = self.language.stop_line.format(dsl_name=dsl_name)
        body_prefix = self.language.body_prefix.format(dsl_name=dsl_name)

        def is_stop_line(line: str) -> bool:
            # make sure to recognize stop line even if it
            # doesn't end with EOL (it could be the very end of the file)
            if line.startswith(stop_line):
                remainder = line.removeprefix(stop_line)
                if remainder and not remainder.isspace():
                    fail(f"Garbage after stop line: {remainder!r}")
                return True
            else:
                # gh-92256: don't allow incorrectly formatted stop lines
                if line.lstrip().startswith(stop_line):
                    fail(f"Whitespace is not allowed before the stop line: {line!r}")
                return False

        # consume body of program
        while self.input:
            line = self._line()
            if is_stop_line(line) or self.is_start_line(line):
                break
            if body_prefix:
                line = line.lstrip()
                assert line.startswith(body_prefix)
                line = line.removeprefix(body_prefix)
            in_lines.append(line)

        # consume output and checksum line, if present.
        if self.last_dsl_name == dsl_name:
            # Same DSL as the previous block: reuse the cached regex.
            checksum_re = self.last_checksum_re
        else:
            before, _, after = self.language.checksum_line.format(
                dsl_name=dsl_name, arguments='{arguments}'
            ).partition('{arguments}')
            assert _ == '{arguments}'
            checksum_re = libclinic.create_regex(before, after, word=False)
            self.last_dsl_name = dsl_name
            self.last_checksum_re = checksum_re
        assert checksum_re is not None

        # scan forward for checksum line
        out_lines = []
        arguments = None
        while self.input:
            # Lookahead: these lines may be pushed back below, so they
            # are not reported to language.parse_line().
            line = self._line(lookahead=True)
            match = checksum_re.match(line.lstrip())
            arguments = match.group(1) if match else None
            if arguments:
                break
            out_lines.append(line)
            if self.is_start_line(line):
                break

        output: str | None
        output = "".join(out_lines)
        if arguments:
            d = {}
            for field in shlex.split(arguments):
                name, equals, value = field.partition('=')
                if not equals:
                    fail(f"Mangled Argument Clinic marker line: {line!r}")
                d[name.strip()] = value.strip()

            if self.verify:
                # The checksum is read from the 'output' field when an
                # 'input' field is present, otherwise from 'checksum'.
                if 'input' in d:
                    checksum = d['output']
                else:
                    checksum = d['checksum']

                computed = libclinic.compute_checksum(output, len(checksum))
                if checksum != computed:
                    fail("Checksum mismatch! "
                         f"Expected {checksum!r}, computed {computed!r}. "
                         "Suggested fix: remove all generated code including "
                         "the end marker, or use the '-f' option.")
        else:
            # put back output
            output_lines = output.splitlines(keepends=True)
            self.line_number -= len(output_lines)
            self.input.extend(reversed(output_lines))
            output = None

        return Block("".join(in_lines), dsl_name, output=output)