diff --git a/Tools/clinic/clinic.py b/Tools/clinic/clinic.py index 6488d913168..ac205866f9d 100755 --- a/Tools/clinic/clinic.py +++ b/Tools/clinic/clinic.py @@ -6,11 +6,9 @@ # from __future__ import annotations -import abc import argparse import ast import builtins as bltns -import collections import contextlib import dataclasses as dc import enum @@ -57,6 +55,8 @@ from libclinic.function import ( ClassDict, ModuleDict, FunctionKind, CALLABLE, STATIC_METHOD, CLASS_METHOD, METHOD_INIT, METHOD_NEW, GETTER, SETTER) +from libclinic.language import Language, PythonLanguage +from libclinic.block_parser import Block, BlockParser # TODO: @@ -144,96 +144,6 @@ class CRenderData: self.unlock: list[str] = [] -class Language(metaclass=abc.ABCMeta): - - start_line = "" - body_prefix = "" - stop_line = "" - checksum_line = "" - - def __init__(self, filename: str) -> None: - self.filename = filename - - @abc.abstractmethod - def render( - self, - clinic: Clinic, - signatures: Iterable[Module | Class | Function] - ) -> str: - ... - - def parse_line(self, line: str) -> None: - ... - - def validate(self) -> None: - def assert_only_one( - attr: str, - *additional_fields: str - ) -> None: - """ - Ensures that the string found at getattr(self, attr) - contains exactly one formatter replacement string for - each valid field. The list of valid fields is - ['dsl_name'] extended by additional_fields. - - e.g. - self.fmt = "{dsl_name} {a} {b}" - - # this passes - self.assert_only_one('fmt', 'a', 'b') - - # this fails, the format string has a {b} in it - self.assert_only_one('fmt', 'a') - - # this fails, the format string doesn't have a {c} in it - self.assert_only_one('fmt', 'a', 'b', 'c') - - # this fails, the format string has two {a}s in it, - # it must contain exactly one - self.fmt2 = '{dsl_name} {a} {a}' - self.assert_only_one('fmt2', 'a') - - """ - fields = ['dsl_name'] - fields.extend(additional_fields) - line: str = getattr(self, attr) - fcf = libclinic.FormatCounterFormatter() - fcf.format(line) - def local_fail(should_be_there_but_isnt: bool) -> None: - if should_be_there_but_isnt: - fail("{} {} must contain {{{}}} exactly once!".format( - self.__class__.__name__, attr, name)) - else: - fail("{} {} must not contain {{{}}}!".format( - self.__class__.__name__, attr, name)) - - for name, count in fcf.counts.items(): - if name in fields: - if count > 1: - local_fail(True) - else: - local_fail(False) - for name in fields: - if fcf.counts.get(name) != 1: - local_fail(True) - - assert_only_one('start_line') - assert_only_one('stop_line') - - field = "arguments" if "{arguments}" in self.checksum_line else "checksum" - assert_only_one('checksum_line', field) - - - -class PythonLanguage(Language): - - language = 'Python' - start_line = "#/*[{dsl_name} input]" - body_prefix = "#" - stop_line = "#[{dsl_name} start generated code]*/" - checksum_line = "#/*[{dsl_name} end generated code: {arguments}]*/" - - ParamTuple = tuple["Parameter", ...] @@ -1646,250 +1556,6 @@ class CLanguage(Language): return clinic.get_destination('block').dump() -@dc.dataclass(slots=True, repr=False) -class Block: - r""" - Represents a single block of text embedded in - another file. If dsl_name is None, the block represents - verbatim text, raw original text from the file, in - which case "input" will be the only non-false member. - If dsl_name is not None, the block represents a Clinic - block. - - input is always str, with embedded \n characters. - input represents the original text from the file; - if it's a Clinic block, it is the original text with - the body_prefix and redundant leading whitespace removed. - - dsl_name is either str or None. If str, it's the text - found on the start line of the block between the square - brackets. - - signatures is a list. - It may only contain clinic.Module, clinic.Class, and - clinic.Function objects. At the moment it should - contain at most one of each. - - output is either str or None. If str, it's the output - from this block, with embedded '\n' characters. - - indent is a str. It's the leading whitespace - that was found on every line of input. (If body_prefix is - not empty, this is the indent *after* removing the - body_prefix.) - - "indent" is different from the concept of "preindent" - (which is not stored as state on Block objects). - "preindent" is the whitespace that - was found in front of every line of input *before* the - "body_prefix" (see the Language object). If body_prefix - is empty, preindent must always be empty too. - - To illustrate the difference between "indent" and "preindent": - - Assume that '_' represents whitespace. - If the block processed was in a Python file, and looked like this: - ____#/*[python] - ____#__for a in range(20): - ____#____print(a) - ____#[python]*/ - "preindent" would be "____" and "indent" would be "__". - - """ - input: str - dsl_name: str | None = None - signatures: list[Module | Class | Function] = dc.field(default_factory=list) - output: Any = None # TODO: Very dynamic; probably untypeable in its current form? - indent: str = '' - - def __repr__(self) -> str: - dsl_name = self.dsl_name or "text" - def summarize(s: object) -> str: - s = repr(s) - if len(s) > 30: - return s[:26] + "..." + s[0] - return s - parts = ( - repr(dsl_name), - f"input={summarize(self.input)}", - f"output={summarize(self.output)}" - ) - return f"" - - -class BlockParser: - """ - Block-oriented parser for Argument Clinic. - Iterator, yields Block objects. - """ - - def __init__( - self, - input: str, - language: Language, - *, - verify: bool = True - ) -> None: - """ - "input" should be a str object - with embedded \n characters. - - "language" should be a Language object. - """ - language.validate() - - self.input = collections.deque(reversed(input.splitlines(keepends=True))) - self.block_start_line_number = self.line_number = 0 - - self.language = language - before, _, after = language.start_line.partition('{dsl_name}') - assert _ == '{dsl_name}' - self.find_start_re = libclinic.create_regex(before, after, - whole_line=False) - self.start_re = libclinic.create_regex(before, after) - self.verify = verify - self.last_checksum_re: re.Pattern[str] | None = None - self.last_dsl_name: str | None = None - self.dsl_name: str | None = None - self.first_block = True - - def __iter__(self) -> BlockParser: - return self - - def __next__(self) -> Block: - while True: - if not self.input: - raise StopIteration - - if self.dsl_name: - try: - return_value = self.parse_clinic_block(self.dsl_name) - except ClinicError as exc: - exc.filename = self.language.filename - exc.lineno = self.line_number - raise - self.dsl_name = None - self.first_block = False - return return_value - block = self.parse_verbatim_block() - if self.first_block and not block.input: - continue - self.first_block = False - return block - - - def is_start_line(self, line: str) -> str | None: - match = self.start_re.match(line.lstrip()) - return match.group(1) if match else None - - def _line(self, lookahead: bool = False) -> str: - self.line_number += 1 - line = self.input.pop() - if not lookahead: - self.language.parse_line(line) - return line - - def parse_verbatim_block(self) -> Block: - lines = [] - self.block_start_line_number = self.line_number - - while self.input: - line = self._line() - dsl_name = self.is_start_line(line) - if dsl_name: - self.dsl_name = dsl_name - break - lines.append(line) - - return Block("".join(lines)) - - def parse_clinic_block(self, dsl_name: str) -> Block: - in_lines = [] - self.block_start_line_number = self.line_number + 1 - stop_line = self.language.stop_line.format(dsl_name=dsl_name) - body_prefix = self.language.body_prefix.format(dsl_name=dsl_name) - - def is_stop_line(line: str) -> bool: - # make sure to recognize stop line even if it - # doesn't end with EOL (it could be the very end of the file) - if line.startswith(stop_line): - remainder = line.removeprefix(stop_line) - if remainder and not remainder.isspace(): - fail(f"Garbage after stop line: {remainder!r}") - return True - else: - # gh-92256: don't allow incorrectly formatted stop lines - if line.lstrip().startswith(stop_line): - fail(f"Whitespace is not allowed before the stop line: {line!r}") - return False - - # consume body of program - while self.input: - line = self._line() - if is_stop_line(line) or self.is_start_line(line): - break - if body_prefix: - line = line.lstrip() - assert line.startswith(body_prefix) - line = line.removeprefix(body_prefix) - in_lines.append(line) - - # consume output and checksum line, if present. - if self.last_dsl_name == dsl_name: - checksum_re = self.last_checksum_re - else: - before, _, after = self.language.checksum_line.format(dsl_name=dsl_name, arguments='{arguments}').partition('{arguments}') - assert _ == '{arguments}' - checksum_re = libclinic.create_regex(before, after, word=False) - self.last_dsl_name = dsl_name - self.last_checksum_re = checksum_re - assert checksum_re is not None - - # scan forward for checksum line - out_lines = [] - arguments = None - while self.input: - line = self._line(lookahead=True) - match = checksum_re.match(line.lstrip()) - arguments = match.group(1) if match else None - if arguments: - break - out_lines.append(line) - if self.is_start_line(line): - break - - output: str | None - output = "".join(out_lines) - if arguments: - d = {} - for field in shlex.split(arguments): - name, equals, value = field.partition('=') - if not equals: - fail(f"Mangled Argument Clinic marker line: {line!r}") - d[name.strip()] = value.strip() - - if self.verify: - if 'input' in d: - checksum = d['output'] - else: - checksum = d['checksum'] - - computed = libclinic.compute_checksum(output, len(checksum)) - if checksum != computed: - fail("Checksum mismatch! " - f"Expected {checksum!r}, computed {computed!r}. " - "Suggested fix: remove all generated code including " - "the end marker, or use the '-f' option.") - else: - # put back output - output_lines = output.splitlines(keepends=True) - self.line_number -= len(output_lines) - self.input.extend(reversed(output_lines)) - output = None - - return Block("".join(in_lines), dsl_name, output=output) - - @dc.dataclass(slots=True, frozen=True) class Include: """ diff --git a/Tools/clinic/libclinic/block_parser.py b/Tools/clinic/libclinic/block_parser.py new file mode 100644 index 00000000000..4c0198b5359 --- /dev/null +++ b/Tools/clinic/libclinic/block_parser.py @@ -0,0 +1,256 @@ +from __future__ import annotations +import collections +import dataclasses as dc +import re +import shlex +from typing import Any + +import libclinic +from libclinic import fail, ClinicError +from libclinic.language import Language +from libclinic.function import ( + Module, Class, Function) + + +@dc.dataclass(slots=True, repr=False) +class Block: + r""" + Represents a single block of text embedded in + another file. If dsl_name is None, the block represents + verbatim text, raw original text from the file, in + which case "input" will be the only non-false member. + If dsl_name is not None, the block represents a Clinic + block. + + input is always str, with embedded \n characters. + input represents the original text from the file; + if it's a Clinic block, it is the original text with + the body_prefix and redundant leading whitespace removed. + + dsl_name is either str or None. If str, it's the text + found on the start line of the block between the square + brackets. + + signatures is a list. + It may only contain clinic.Module, clinic.Class, and + clinic.Function objects. At the moment it should + contain at most one of each. + + output is either str or None. If str, it's the output + from this block, with embedded '\n' characters. + + indent is a str. It's the leading whitespace + that was found on every line of input. (If body_prefix is + not empty, this is the indent *after* removing the + body_prefix.) + + "indent" is different from the concept of "preindent" + (which is not stored as state on Block objects). + "preindent" is the whitespace that + was found in front of every line of input *before* the + "body_prefix" (see the Language object). If body_prefix + is empty, preindent must always be empty too. + + To illustrate the difference between "indent" and "preindent": + + Assume that '_' represents whitespace. + If the block processed was in a Python file, and looked like this: + ____#/*[python] + ____#__for a in range(20): + ____#____print(a) + ____#[python]*/ + "preindent" would be "____" and "indent" would be "__". + + """ + input: str + dsl_name: str | None = None + signatures: list[Module | Class | Function] = dc.field(default_factory=list) + output: Any = None # TODO: Very dynamic; probably untypeable in its current form? + indent: str = '' + + def __repr__(self) -> str: + dsl_name = self.dsl_name or "text" + def summarize(s: object) -> str: + s = repr(s) + if len(s) > 30: + return s[:26] + "..." + s[0] + return s + parts = ( + repr(dsl_name), + f"input={summarize(self.input)}", + f"output={summarize(self.output)}" + ) + return f"" + + +class BlockParser: + """ + Block-oriented parser for Argument Clinic. + Iterator, yields Block objects. + """ + + def __init__( + self, + input: str, + language: Language, + *, + verify: bool = True + ) -> None: + """ + "input" should be a str object + with embedded \n characters. + + "language" should be a Language object. + """ + language.validate() + + self.input = collections.deque(reversed(input.splitlines(keepends=True))) + self.block_start_line_number = self.line_number = 0 + + self.language = language + before, _, after = language.start_line.partition('{dsl_name}') + assert _ == '{dsl_name}' + self.find_start_re = libclinic.create_regex(before, after, + whole_line=False) + self.start_re = libclinic.create_regex(before, after) + self.verify = verify + self.last_checksum_re: re.Pattern[str] | None = None + self.last_dsl_name: str | None = None + self.dsl_name: str | None = None + self.first_block = True + + def __iter__(self) -> BlockParser: + return self + + def __next__(self) -> Block: + while True: + if not self.input: + raise StopIteration + + if self.dsl_name: + try: + return_value = self.parse_clinic_block(self.dsl_name) + except ClinicError as exc: + exc.filename = self.language.filename + exc.lineno = self.line_number + raise + self.dsl_name = None + self.first_block = False + return return_value + block = self.parse_verbatim_block() + if self.first_block and not block.input: + continue + self.first_block = False + return block + + + def is_start_line(self, line: str) -> str | None: + match = self.start_re.match(line.lstrip()) + return match.group(1) if match else None + + def _line(self, lookahead: bool = False) -> str: + self.line_number += 1 + line = self.input.pop() + if not lookahead: + self.language.parse_line(line) + return line + + def parse_verbatim_block(self) -> Block: + lines = [] + self.block_start_line_number = self.line_number + + while self.input: + line = self._line() + dsl_name = self.is_start_line(line) + if dsl_name: + self.dsl_name = dsl_name + break + lines.append(line) + + return Block("".join(lines)) + + def parse_clinic_block(self, dsl_name: str) -> Block: + in_lines = [] + self.block_start_line_number = self.line_number + 1 + stop_line = self.language.stop_line.format(dsl_name=dsl_name) + body_prefix = self.language.body_prefix.format(dsl_name=dsl_name) + + def is_stop_line(line: str) -> bool: + # make sure to recognize stop line even if it + # doesn't end with EOL (it could be the very end of the file) + if line.startswith(stop_line): + remainder = line.removeprefix(stop_line) + if remainder and not remainder.isspace(): + fail(f"Garbage after stop line: {remainder!r}") + return True + else: + # gh-92256: don't allow incorrectly formatted stop lines + if line.lstrip().startswith(stop_line): + fail(f"Whitespace is not allowed before the stop line: {line!r}") + return False + + # consume body of program + while self.input: + line = self._line() + if is_stop_line(line) or self.is_start_line(line): + break + if body_prefix: + line = line.lstrip() + assert line.startswith(body_prefix) + line = line.removeprefix(body_prefix) + in_lines.append(line) + + # consume output and checksum line, if present. + if self.last_dsl_name == dsl_name: + checksum_re = self.last_checksum_re + else: + before, _, after = self.language.checksum_line.format(dsl_name=dsl_name, arguments='{arguments}').partition('{arguments}') + assert _ == '{arguments}' + checksum_re = libclinic.create_regex(before, after, word=False) + self.last_dsl_name = dsl_name + self.last_checksum_re = checksum_re + assert checksum_re is not None + + # scan forward for checksum line + out_lines = [] + arguments = None + while self.input: + line = self._line(lookahead=True) + match = checksum_re.match(line.lstrip()) + arguments = match.group(1) if match else None + if arguments: + break + out_lines.append(line) + if self.is_start_line(line): + break + + output: str | None + output = "".join(out_lines) + if arguments: + d = {} + for field in shlex.split(arguments): + name, equals, value = field.partition('=') + if not equals: + fail(f"Mangled Argument Clinic marker line: {line!r}") + d[name.strip()] = value.strip() + + if self.verify: + if 'input' in d: + checksum = d['output'] + else: + checksum = d['checksum'] + + computed = libclinic.compute_checksum(output, len(checksum)) + if checksum != computed: + fail("Checksum mismatch! " + f"Expected {checksum!r}, computed {computed!r}. " + "Suggested fix: remove all generated code including " + "the end marker, or use the '-f' option.") + else: + # put back output + output_lines = output.splitlines(keepends=True) + self.line_number -= len(output_lines) + self.input.extend(reversed(output_lines)) + output = None + + return Block("".join(in_lines), dsl_name, output=output) diff --git a/Tools/clinic/libclinic/language.py b/Tools/clinic/libclinic/language.py new file mode 100644 index 00000000000..a90a9bb24e2 --- /dev/null +++ b/Tools/clinic/libclinic/language.py @@ -0,0 +1,103 @@ +from __future__ import annotations +import abc +import typing +from collections.abc import ( + Iterable, +) + +import libclinic +from libclinic import fail +from libclinic.function import ( + Module, Class, Function) + +if typing.TYPE_CHECKING: + from clinic import Clinic + + +class Language(metaclass=abc.ABCMeta): + + start_line = "" + body_prefix = "" + stop_line = "" + checksum_line = "" + + def __init__(self, filename: str) -> None: + self.filename = filename + + @abc.abstractmethod + def render( + self, + clinic: Clinic, + signatures: Iterable[Module | Class | Function] + ) -> str: + ... + + def parse_line(self, line: str) -> None: + ... + + def validate(self) -> None: + def assert_only_one( + attr: str, + *additional_fields: str + ) -> None: + """ + Ensures that the string found at getattr(self, attr) + contains exactly one formatter replacement string for + each valid field. The list of valid fields is + ['dsl_name'] extended by additional_fields. + + e.g. + self.fmt = "{dsl_name} {a} {b}" + + # this passes + self.assert_only_one('fmt', 'a', 'b') + + # this fails, the format string has a {b} in it + self.assert_only_one('fmt', 'a') + + # this fails, the format string doesn't have a {c} in it + self.assert_only_one('fmt', 'a', 'b', 'c') + + # this fails, the format string has two {a}s in it, + # it must contain exactly one + self.fmt2 = '{dsl_name} {a} {a}' + self.assert_only_one('fmt2', 'a') + + """ + fields = ['dsl_name'] + fields.extend(additional_fields) + line: str = getattr(self, attr) + fcf = libclinic.FormatCounterFormatter() + fcf.format(line) + def local_fail(should_be_there_but_isnt: bool) -> None: + if should_be_there_but_isnt: + fail("{} {} must contain {{{}}} exactly once!".format( + self.__class__.__name__, attr, name)) + else: + fail("{} {} must not contain {{{}}}!".format( + self.__class__.__name__, attr, name)) + + for name, count in fcf.counts.items(): + if name in fields: + if count > 1: + local_fail(True) + else: + local_fail(False) + for name in fields: + if fcf.counts.get(name) != 1: + local_fail(True) + + assert_only_one('start_line') + assert_only_one('stop_line') + + field = "arguments" if "{arguments}" in self.checksum_line else "checksum" + assert_only_one('checksum_line', field) + + +class PythonLanguage(Language): + + language = 'Python' + start_line = "#/*[{dsl_name} input]" + body_prefix = "#" + stop_line = "#[{dsl_name} start generated code]*/" + checksum_line = "#/*[{dsl_name} end generated code: {arguments}]*/"