gh-113317, AC: Add libclinic.block_parser module (#116819)

* Move Block and BlockParser classes to a new libclinic.block_parser module.
* Move Language and PythonLanguage classes to a new libclinic.language module.
2024-03-14 17:11:39 +01:00 · 2024-03-14 17:11:39 +01:00 · b54d7c87aa
parent bae6579b46
commit b54d7c87aa
3 changed files with 361 additions and 336 deletions
--- a/Tools/clinic/clinic.py
+++ b/Tools/clinic/clinic.py
@@ -6,11 +6,9 @@
 #
 from __future__ import annotations

-import abc
 import argparse
 import ast
 import builtins as bltns
-import collections
 import contextlib
 import dataclasses as dc
 import enum
@@ -57,6 +55,8 @@ from libclinic.function import (
    ClassDict, ModuleDict, FunctionKind,
    CALLABLE, STATIC_METHOD, CLASS_METHOD, METHOD_INIT, METHOD_NEW,
    GETTER, SETTER)
+from libclinic.language import Language, PythonLanguage
+from libclinic.block_parser import Block, BlockParser


 # TODO:
@@ -144,96 +144,6 @@ class CRenderData:
        self.unlock: list[str] = []


-class Language(metaclass=abc.ABCMeta):
-
-    start_line = ""
-    body_prefix = ""
-    stop_line = ""
-    checksum_line = ""
-
-    def __init__(self, filename: str) -> None:
-        self.filename = filename
-
-    @abc.abstractmethod
-    def render(
-            self,
-            clinic: Clinic,
-            signatures: Iterable[Module | Class | Function]
-    ) -> str:
-        ...
-
-    def parse_line(self, line: str) -> None:
-        ...
-
-    def validate(self) -> None:
-        def assert_only_one(
-                attr: str,
-                *additional_fields: str
-        ) -> None:
-            """
-            Ensures that the string found at getattr(self, attr)
-            contains exactly one formatter replacement string for
-            each valid field.  The list of valid fields is
-            ['dsl_name'] extended by additional_fields.
-
-            e.g.
-                self.fmt = "{dsl_name} {a} {b}"
-
-                # this passes
-                self.assert_only_one('fmt', 'a', 'b')
-
-                # this fails, the format string has a {b} in it
-                self.assert_only_one('fmt', 'a')
-
-                # this fails, the format string doesn't have a {c} in it
-                self.assert_only_one('fmt', 'a', 'b', 'c')
-
-                # this fails, the format string has two {a}s in it,
-                # it must contain exactly one
-                self.fmt2 = '{dsl_name} {a} {a}'
-                self.assert_only_one('fmt2', 'a')
-
-            """
-            fields = ['dsl_name']
-            fields.extend(additional_fields)
-            line: str = getattr(self, attr)
-            fcf = libclinic.FormatCounterFormatter()
-            fcf.format(line)
-            def local_fail(should_be_there_but_isnt: bool) -> None:
-                if should_be_there_but_isnt:
-                    fail("{} {} must contain {{{}}} exactly once!".format(
-                        self.__class__.__name__, attr, name))
-                else:
-                    fail("{} {} must not contain {{{}}}!".format(
-                        self.__class__.__name__, attr, name))
-
-            for name, count in fcf.counts.items():
-                if name in fields:
-                    if count > 1:
-                        local_fail(True)
-                else:
-                    local_fail(False)
-            for name in fields:
-                if fcf.counts.get(name) != 1:
-                    local_fail(True)
-
-        assert_only_one('start_line')
-        assert_only_one('stop_line')
-
-        field = "arguments" if "{arguments}" in self.checksum_line else "checksum"
-        assert_only_one('checksum_line', field)
-
-
-
-class PythonLanguage(Language):
-
-    language      = 'Python'
-    start_line    = "#/*[{dsl_name} input]"
-    body_prefix   = "#"
-    stop_line     = "#[{dsl_name} start generated code]*/"
-    checksum_line = "#/*[{dsl_name} end generated code: {arguments}]*/"
-
-
 ParamTuple = tuple["Parameter", ...]


@@ -1646,250 +1556,6 @@ class CLanguage(Language):
        return clinic.get_destination('block').dump()


-@dc.dataclass(slots=True, repr=False)
-class Block:
-    r"""
-    Represents a single block of text embedded in
-    another file.  If dsl_name is None, the block represents
-    verbatim text, raw original text from the file, in
-    which case "input" will be the only non-false member.
-    If dsl_name is not None, the block represents a Clinic
-    block.
-
-    input is always str, with embedded \n characters.
-    input represents the original text from the file;
-    if it's a Clinic block, it is the original text with
-    the body_prefix and redundant leading whitespace removed.
-
-    dsl_name is either str or None.  If str, it's the text
-    found on the start line of the block between the square
-    brackets.
-
-    signatures is a list.
-    It may only contain clinic.Module, clinic.Class, and
-    clinic.Function objects.  At the moment it should
-    contain at most one of each.
-
-    output is either str or None.  If str, it's the output
-    from this block, with embedded '\n' characters.
-
-    indent is a str.  It's the leading whitespace
-    that was found on every line of input.  (If body_prefix is
-    not empty, this is the indent *after* removing the
-    body_prefix.)
-
-    "indent" is different from the concept of "preindent"
-    (which is not stored as state on Block objects).
-    "preindent" is the whitespace that
-    was found in front of every line of input *before* the
-    "body_prefix" (see the Language object).  If body_prefix
-    is empty, preindent must always be empty too.
-
-    To illustrate the difference between "indent" and "preindent":
-
-    Assume that '_' represents whitespace.
-    If the block processed was in a Python file, and looked like this:
-      ____#/*[python]
-      ____#__for a in range(20):
-      ____#____print(a)
-      ____#[python]*/
-    "preindent" would be "____" and "indent" would be "__".
-
-    """
-    input: str
-    dsl_name: str | None = None
-    signatures: list[Module | Class | Function] = dc.field(default_factory=list)
-    output: Any = None  # TODO: Very dynamic; probably untypeable in its current form?
-    indent: str = ''
-
-    def __repr__(self) -> str:
-        dsl_name = self.dsl_name or "text"
-        def summarize(s: object) -> str:
-            s = repr(s)
-            if len(s) > 30:
-                return s[:26] + "..." + s[0]
-            return s
-        parts = (
-            repr(dsl_name),
-            f"input={summarize(self.input)}",
-            f"output={summarize(self.output)}"
-        )
-        return f"<clinic.Block {' '.join(parts)}>"
-
-
-class BlockParser:
-    """
-    Block-oriented parser for Argument Clinic.
-    Iterator, yields Block objects.
-    """
-
-    def __init__(
-            self,
-            input: str,
-            language: Language,
-            *,
-            verify: bool = True
-    ) -> None:
-        """
-        "input" should be a str object
-        with embedded \n characters.
-
-        "language" should be a Language object.
-        """
-        language.validate()
-
-        self.input = collections.deque(reversed(input.splitlines(keepends=True)))
-        self.block_start_line_number = self.line_number = 0
-
-        self.language = language
-        before, _, after = language.start_line.partition('{dsl_name}')
-        assert _ == '{dsl_name}'
-        self.find_start_re = libclinic.create_regex(before, after,
-                                                    whole_line=False)
-        self.start_re = libclinic.create_regex(before, after)
-        self.verify = verify
-        self.last_checksum_re: re.Pattern[str] | None = None
-        self.last_dsl_name: str | None = None
-        self.dsl_name: str | None = None
-        self.first_block = True
-
-    def __iter__(self) -> BlockParser:
-        return self
-
-    def __next__(self) -> Block:
-        while True:
-            if not self.input:
-                raise StopIteration
-
-            if self.dsl_name:
-                try:
-                    return_value = self.parse_clinic_block(self.dsl_name)
-                except ClinicError as exc:
-                    exc.filename = self.language.filename
-                    exc.lineno = self.line_number
-                    raise
-                self.dsl_name = None
-                self.first_block = False
-                return return_value
-            block = self.parse_verbatim_block()
-            if self.first_block and not block.input:
-                continue
-            self.first_block = False
-            return block
-
-
-    def is_start_line(self, line: str) -> str | None:
-        match = self.start_re.match(line.lstrip())
-        return match.group(1) if match else None
-
-    def _line(self, lookahead: bool = False) -> str:
-        self.line_number += 1
-        line = self.input.pop()
-        if not lookahead:
-            self.language.parse_line(line)
-        return line
-
-    def parse_verbatim_block(self) -> Block:
-        lines = []
-        self.block_start_line_number = self.line_number
-
-        while self.input:
-            line = self._line()
-            dsl_name = self.is_start_line(line)
-            if dsl_name:
-                self.dsl_name = dsl_name
-                break
-            lines.append(line)
-
-        return Block("".join(lines))
-
-    def parse_clinic_block(self, dsl_name: str) -> Block:
-        in_lines = []
-        self.block_start_line_number = self.line_number + 1
-        stop_line = self.language.stop_line.format(dsl_name=dsl_name)
-        body_prefix = self.language.body_prefix.format(dsl_name=dsl_name)
-
-        def is_stop_line(line: str) -> bool:
-            # make sure to recognize stop line even if it
-            # doesn't end with EOL (it could be the very end of the file)
-            if line.startswith(stop_line):
-                remainder = line.removeprefix(stop_line)
-                if remainder and not remainder.isspace():
-                    fail(f"Garbage after stop line: {remainder!r}")
-                return True
-            else:
-                # gh-92256: don't allow incorrectly formatted stop lines
-                if line.lstrip().startswith(stop_line):
-                    fail(f"Whitespace is not allowed before the stop line: {line!r}")
-                return False
-
-        # consume body of program
-        while self.input:
-            line = self._line()
-            if is_stop_line(line) or self.is_start_line(line):
-                break
-            if body_prefix:
-                line = line.lstrip()
-                assert line.startswith(body_prefix)
-                line = line.removeprefix(body_prefix)
-            in_lines.append(line)
-
-        # consume output and checksum line, if present.
-        if self.last_dsl_name == dsl_name:
-            checksum_re = self.last_checksum_re
-        else:
-            before, _, after = self.language.checksum_line.format(dsl_name=dsl_name, arguments='{arguments}').partition('{arguments}')
-            assert _ == '{arguments}'
-            checksum_re = libclinic.create_regex(before, after, word=False)
-            self.last_dsl_name = dsl_name
-            self.last_checksum_re = checksum_re
-        assert checksum_re is not None
-
-        # scan forward for checksum line
-        out_lines = []
-        arguments = None
-        while self.input:
-            line = self._line(lookahead=True)
-            match = checksum_re.match(line.lstrip())
-            arguments = match.group(1) if match else None
-            if arguments:
-                break
-            out_lines.append(line)
-            if self.is_start_line(line):
-                break
-
-        output: str | None
-        output = "".join(out_lines)
-        if arguments:
-            d = {}
-            for field in shlex.split(arguments):
-                name, equals, value = field.partition('=')
-                if not equals:
-                    fail(f"Mangled Argument Clinic marker line: {line!r}")
-                d[name.strip()] = value.strip()
-
-            if self.verify:
-                if 'input' in d:
-                    checksum = d['output']
-                else:
-                    checksum = d['checksum']
-
-                computed = libclinic.compute_checksum(output, len(checksum))
-                if checksum != computed:
-                    fail("Checksum mismatch! "
-                         f"Expected {checksum!r}, computed {computed!r}. "
-                         "Suggested fix: remove all generated code including "
-                         "the end marker, or use the '-f' option.")
-        else:
-            # put back output
-            output_lines = output.splitlines(keepends=True)
-            self.line_number -= len(output_lines)
-            self.input.extend(reversed(output_lines))
-            output = None
-
-        return Block("".join(in_lines), dsl_name, output=output)
-
-
 @dc.dataclass(slots=True, frozen=True)
 class Include:
    """
--- a/Tools/clinic/libclinic/block_parser.py
+++ b/Tools/clinic/libclinic/block_parser.py
@@ -0,0 +1,256 @@
+from __future__ import annotations
+import collections
+import dataclasses as dc
+import re
+import shlex
+from typing import Any
+
+import libclinic
+from libclinic import fail, ClinicError
+from libclinic.language import Language
+from libclinic.function import (
+    Module, Class, Function)
+
+
+@dc.dataclass(slots=True, repr=False)
+class Block:
+    r"""
+    Represents a single block of text embedded in
+    another file.  If dsl_name is None, the block represents
+    verbatim text, raw original text from the file, in
+    which case "input" will be the only non-false member.
+    If dsl_name is not None, the block represents a Clinic
+    block.
+
+    input is always str, with embedded \n characters.
+    input represents the original text from the file;
+    if it's a Clinic block, it is the original text with
+    the body_prefix and redundant leading whitespace removed.
+
+    dsl_name is either str or None.  If str, it's the text
+    found on the start line of the block between the square
+    brackets.
+
+    signatures is a list.
+    It may only contain clinic.Module, clinic.Class, and
+    clinic.Function objects.  At the moment it should
+    contain at most one of each.
+
+    output is either str or None.  If str, it's the output
+    from this block, with embedded '\n' characters.
+
+    indent is a str.  It's the leading whitespace
+    that was found on every line of input.  (If body_prefix is
+    not empty, this is the indent *after* removing the
+    body_prefix.)
+
+    "indent" is different from the concept of "preindent"
+    (which is not stored as state on Block objects).
+    "preindent" is the whitespace that
+    was found in front of every line of input *before* the
+    "body_prefix" (see the Language object).  If body_prefix
+    is empty, preindent must always be empty too.
+
+    To illustrate the difference between "indent" and "preindent":
+
+    Assume that '_' represents whitespace.
+    If the block processed was in a Python file, and looked like this:
+      ____#/*[python]
+      ____#__for a in range(20):
+      ____#____print(a)
+      ____#[python]*/
+    "preindent" would be "____" and "indent" would be "__".
+
+    """
+    input: str
+    dsl_name: str | None = None
+    signatures: list[Module | Class | Function] = dc.field(default_factory=list)
+    output: Any = None  # TODO: Very dynamic; probably untypeable in its current form?
+    indent: str = ''
+
+    def __repr__(self) -> str:
+        dsl_name = self.dsl_name or "text"
+        def summarize(s: object) -> str:
+            s = repr(s)
+            if len(s) > 30:
+                return s[:26] + "..." + s[0]
+            return s
+        parts = (
+            repr(dsl_name),
+            f"input={summarize(self.input)}",
+            f"output={summarize(self.output)}"
+        )
+        return f"<clinic.Block {' '.join(parts)}>"
+
+
+class BlockParser:
+    """
+    Block-oriented parser for Argument Clinic.
+    Iterator, yields Block objects.
+    """
+
+    def __init__(
+            self,
+            input: str,
+            language: Language,
+            *,
+            verify: bool = True
+    ) -> None:
+        """
+        "input" should be a str object
+        with embedded \n characters.
+
+        "language" should be a Language object.
+        """
+        language.validate()
+
+        self.input = collections.deque(reversed(input.splitlines(keepends=True)))
+        self.block_start_line_number = self.line_number = 0
+
+        self.language = language
+        before, _, after = language.start_line.partition('{dsl_name}')
+        assert _ == '{dsl_name}'
+        self.find_start_re = libclinic.create_regex(before, after,
+                                                    whole_line=False)
+        self.start_re = libclinic.create_regex(before, after)
+        self.verify = verify
+        self.last_checksum_re: re.Pattern[str] | None = None
+        self.last_dsl_name: str | None = None
+        self.dsl_name: str | None = None
+        self.first_block = True
+
+    def __iter__(self) -> BlockParser:
+        return self
+
+    def __next__(self) -> Block:
+        while True:
+            if not self.input:
+                raise StopIteration
+
+            if self.dsl_name:
+                try:
+                    return_value = self.parse_clinic_block(self.dsl_name)
+                except ClinicError as exc:
+                    exc.filename = self.language.filename
+                    exc.lineno = self.line_number
+                    raise
+                self.dsl_name = None
+                self.first_block = False
+                return return_value
+            block = self.parse_verbatim_block()
+            if self.first_block and not block.input:
+                continue
+            self.first_block = False
+            return block
+
+
+    def is_start_line(self, line: str) -> str | None:
+        match = self.start_re.match(line.lstrip())
+        return match.group(1) if match else None
+
+    def _line(self, lookahead: bool = False) -> str:
+        self.line_number += 1
+        line = self.input.pop()
+        if not lookahead:
+            self.language.parse_line(line)
+        return line
+
+    def parse_verbatim_block(self) -> Block:
+        lines = []
+        self.block_start_line_number = self.line_number
+
+        while self.input:
+            line = self._line()
+            dsl_name = self.is_start_line(line)
+            if dsl_name:
+                self.dsl_name = dsl_name
+                break
+            lines.append(line)
+
+        return Block("".join(lines))
+
+    def parse_clinic_block(self, dsl_name: str) -> Block:
+        in_lines = []
+        self.block_start_line_number = self.line_number + 1
+        stop_line = self.language.stop_line.format(dsl_name=dsl_name)
+        body_prefix = self.language.body_prefix.format(dsl_name=dsl_name)
+
+        def is_stop_line(line: str) -> bool:
+            # make sure to recognize stop line even if it
+            # doesn't end with EOL (it could be the very end of the file)
+            if line.startswith(stop_line):
+                remainder = line.removeprefix(stop_line)
+                if remainder and not remainder.isspace():
+                    fail(f"Garbage after stop line: {remainder!r}")
+                return True
+            else:
+                # gh-92256: don't allow incorrectly formatted stop lines
+                if line.lstrip().startswith(stop_line):
+                    fail(f"Whitespace is not allowed before the stop line: {line!r}")
+                return False
+
+        # consume body of program
+        while self.input:
+            line = self._line()
+            if is_stop_line(line) or self.is_start_line(line):
+                break
+            if body_prefix:
+                line = line.lstrip()
+                assert line.startswith(body_prefix)
+                line = line.removeprefix(body_prefix)
+            in_lines.append(line)
+
+        # consume output and checksum line, if present.
+        if self.last_dsl_name == dsl_name:
+            checksum_re = self.last_checksum_re
+        else:
+            before, _, after = self.language.checksum_line.format(dsl_name=dsl_name, arguments='{arguments}').partition('{arguments}')
+            assert _ == '{arguments}'
+            checksum_re = libclinic.create_regex(before, after, word=False)
+            self.last_dsl_name = dsl_name
+            self.last_checksum_re = checksum_re
+        assert checksum_re is not None
+
+        # scan forward for checksum line
+        out_lines = []
+        arguments = None
+        while self.input:
+            line = self._line(lookahead=True)
+            match = checksum_re.match(line.lstrip())
+            arguments = match.group(1) if match else None
+            if arguments:
+                break
+            out_lines.append(line)
+            if self.is_start_line(line):
+                break
+
+        output: str | None
+        output = "".join(out_lines)
+        if arguments:
+            d = {}
+            for field in shlex.split(arguments):
+                name, equals, value = field.partition('=')
+                if not equals:
+                    fail(f"Mangled Argument Clinic marker line: {line!r}")
+                d[name.strip()] = value.strip()
+
+            if self.verify:
+                if 'input' in d:
+                    checksum = d['output']
+                else:
+                    checksum = d['checksum']
+
+                computed = libclinic.compute_checksum(output, len(checksum))
+                if checksum != computed:
+                    fail("Checksum mismatch! "
+                         f"Expected {checksum!r}, computed {computed!r}. "
+                         "Suggested fix: remove all generated code including "
+                         "the end marker, or use the '-f' option.")
+        else:
+            # put back output
+            output_lines = output.splitlines(keepends=True)
+            self.line_number -= len(output_lines)
+            self.input.extend(reversed(output_lines))
+            output = None
+
+        return Block("".join(in_lines), dsl_name, output=output)
--- a/Tools/clinic/libclinic/language.py
+++ b/Tools/clinic/libclinic/language.py
@@ -0,0 +1,103 @@
+from __future__ import annotations
+import abc
+import typing
+from collections.abc import (
+    Iterable,
+)
+
+import libclinic
+from libclinic import fail
+from libclinic.function import (
+    Module, Class, Function)
+
+if typing.TYPE_CHECKING:
+    from clinic import Clinic
+
+
+class Language(metaclass=abc.ABCMeta):
+
+    start_line = ""
+    body_prefix = ""
+    stop_line = ""
+    checksum_line = ""
+
+    def __init__(self, filename: str) -> None:
+        self.filename = filename
+
+    @abc.abstractmethod
+    def render(
+            self,
+            clinic: Clinic,
+            signatures: Iterable[Module | Class | Function]
+    ) -> str:
+        ...
+
+    def parse_line(self, line: str) -> None:
+        ...
+
+    def validate(self) -> None:
+        def assert_only_one(
+                attr: str,
+                *additional_fields: str
+        ) -> None:
+            """
+            Ensures that the string found at getattr(self, attr)
+            contains exactly one formatter replacement string for
+            each valid field.  The list of valid fields is
+            ['dsl_name'] extended by additional_fields.
+
+            e.g.
+                self.fmt = "{dsl_name} {a} {b}"
+
+                # this passes
+                self.assert_only_one('fmt', 'a', 'b')
+
+                # this fails, the format string has a {b} in it
+                self.assert_only_one('fmt', 'a')
+
+                # this fails, the format string doesn't have a {c} in it
+                self.assert_only_one('fmt', 'a', 'b', 'c')
+
+                # this fails, the format string has two {a}s in it,
+                # it must contain exactly one
+                self.fmt2 = '{dsl_name} {a} {a}'
+                self.assert_only_one('fmt2', 'a')
+
+            """
+            fields = ['dsl_name']
+            fields.extend(additional_fields)
+            line: str = getattr(self, attr)
+            fcf = libclinic.FormatCounterFormatter()
+            fcf.format(line)
+            def local_fail(should_be_there_but_isnt: bool) -> None:
+                if should_be_there_but_isnt:
+                    fail("{} {} must contain {{{}}} exactly once!".format(
+                        self.__class__.__name__, attr, name))
+                else:
+                    fail("{} {} must not contain {{{}}}!".format(
+                        self.__class__.__name__, attr, name))
+
+            for name, count in fcf.counts.items():
+                if name in fields:
+                    if count > 1:
+                        local_fail(True)
+                else:
+                    local_fail(False)
+            for name in fields:
+                if fcf.counts.get(name) != 1:
+                    local_fail(True)
+
+        assert_only_one('start_line')
+        assert_only_one('stop_line')
+
+        field = "arguments" if "{arguments}" in self.checksum_line else "checksum"
+        assert_only_one('checksum_line', field)
+
+
+class PythonLanguage(Language):
+
+    language      = 'Python'
+    start_line    = "#/*[{dsl_name} input]"
+    body_prefix   = "#"
+    stop_line     = "#[{dsl_name} start generated code]*/"
+    checksum_line = "#/*[{dsl_name} end generated code: {arguments}]*/"