gh-113317, AC: Add libclinic.block_parser module (#116819)

* Move Block and BlockParser classes to a new libclinic.block_parser module. * Move Language and PythonLanguage classes to a new libclinic.language module.
2024-03-14 17:11:39 +01:00 · 2024-03-14 17:11:39 +01:00 · b54d7c87aa
parent bae6579b46
commit b54d7c87aa
3 changed files with 361 additions and 336 deletions
--- a/Tools/clinic/clinic.py
+++ b/Tools/clinic/clinic.py
@ -6,11 +6,9 @@
 #
 from __future__ import annotations
 import abc
 import argparse
 import ast
 import builtins as bltns
 import collections
 import contextlib
 import dataclasses as dc
 import enum
@ -57,6 +55,8 @@ from libclinic.function import (
    ClassDict, ModuleDict, FunctionKind,
    CALLABLE, STATIC_METHOD, CLASS_METHOD, METHOD_INIT, METHOD_NEW,
    GETTER, SETTER)
 from libclinic.language import Language, PythonLanguage
 from libclinic.block_parser import Block, BlockParser
 # TODO:
@ -144,96 +144,6 @@ class CRenderData:
        self.unlock: list[str] = []
 class Language(metaclass=abc.ABCMeta):
    start_line = ""
    body_prefix = ""
    stop_line = ""
    checksum_line = ""
    def __init__(self, filename: str) -> None:
        self.filename = filename
    @abc.abstractmethod
    def render(
            self,
            clinic: Clinic,
            signatures: Iterable[Module | Class | Function]
    ) -> str:
        ...
    def parse_line(self, line: str) -> None:
        ...
    def validate(self) -> None:
        def assert_only_one(
                attr: str,
                *additional_fields: str
        ) -> None:
            """
            Ensures that the string found at getattr(self, attr)
            contains exactly one formatter replacement string for
            each valid field.  The list of valid fields is
            ['dsl_name'] extended by additional_fields.
            e.g.
                self.fmt = "{dsl_name} {a} {b}"
                # this passes
                self.assert_only_one('fmt', 'a', 'b')
                # this fails, the format string has a {b} in it
                self.assert_only_one('fmt', 'a')
                # this fails, the format string doesn't have a {c} in it
                self.assert_only_one('fmt', 'a', 'b', 'c')
                # this fails, the format string has two {a}s in it,
                # it must contain exactly one
                self.fmt2 = '{dsl_name} {a} {a}'
                self.assert_only_one('fmt2', 'a')
            """
            fields = ['dsl_name']
            fields.extend(additional_fields)
            line: str = getattr(self, attr)
            fcf = libclinic.FormatCounterFormatter()
            fcf.format(line)
            def local_fail(should_be_there_but_isnt: bool) -> None:
                if should_be_there_but_isnt:
                    fail("{} {} must contain {{{}}} exactly once!".format(
                        self.__class__.__name__, attr, name))
                else:
                    fail("{} {} must not contain {{{}}}!".format(
                        self.__class__.__name__, attr, name))
            for name, count in fcf.counts.items():
                if name in fields:
                    if count > 1:
                        local_fail(True)
                else:
                    local_fail(False)
            for name in fields:
                if fcf.counts.get(name) != 1:
                    local_fail(True)
        assert_only_one('start_line')
        assert_only_one('stop_line')
        field = "arguments" if "{arguments}" in self.checksum_line else "checksum"
        assert_only_one('checksum_line', field)
 class PythonLanguage(Language):
    language      = 'Python'
    start_line    = "#/*[{dsl_name} input]"
    body_prefix   = "#"
    stop_line     = "#[{dsl_name} start generated code]*/"
    checksum_line = "#/*[{dsl_name} end generated code: {arguments}]*/"
 ParamTuple = tuple["Parameter", ...]
@ -1646,250 +1556,6 @@ class CLanguage(Language):
        return clinic.get_destination('block').dump()
@dc.dataclass(slots=True, repr=False)
 class Block:
    r"""
    Represents a single block of text embedded in
    another file.  If dsl_name is None, the block represents
    verbatim text, raw original text from the file, in
    which case "input" will be the only non-false member.
    If dsl_name is not None, the block represents a Clinic
    block.
    input is always str, with embedded \n characters.
    input represents the original text from the file;
    if it's a Clinic block, it is the original text with
    the body_prefix and redundant leading whitespace removed.
    dsl_name is either str or None.  If str, it's the text
    found on the start line of the block between the square
    brackets.
    signatures is a list.
    It may only contain clinic.Module, clinic.Class, and
    clinic.Function objects.  At the moment it should
    contain at most one of each.
    output is either str or None.  If str, it's the output
    from this block, with embedded '\n' characters.
    indent is a str.  It's the leading whitespace
    that was found on every line of input.  (If body_prefix is
    not empty, this is the indent *after* removing the
    body_prefix.)
    "indent" is different from the concept of "preindent"
    (which is not stored as state on Block objects).
    "preindent" is the whitespace that
    was found in front of every line of input *before* the
    "body_prefix" (see the Language object).  If body_prefix
    is empty, preindent must always be empty too.
    To illustrate the difference between "indent" and "preindent":
    Assume that '_' represents whitespace.
    If the block processed was in a Python file, and looked like this:
      ____#/*[python]
      ____#__for a in range(20):
      ____#____print(a)
      ____#[python]*/
    "preindent" would be "____" and "indent" would be "__".
    """
    input: str
    dsl_name: str | None = None
    signatures: list[Module | Class | Function] = dc.field(default_factory=list)
    output: Any = None  # TODO: Very dynamic; probably untypeable in its current form?
    indent: str = ''
    def __repr__(self) -> str:
        dsl_name = self.dsl_name or "text"
        def summarize(s: object) -> str:
            s = repr(s)
            if len(s) > 30:
                return s[:26] + "..." + s[0]
            return s
        parts = (
            repr(dsl_name),
            f"input={summarize(self.input)}",
            f"output={summarize(self.output)}"
        )
        return f"<clinic.Block {' '.join(parts)}>"
 class BlockParser:
    """
    Block-oriented parser for Argument Clinic.
    Iterator, yields Block objects.
    """
    def __init__(
            self,
            input: str,
            language: Language,
            *,
            verify: bool = True
    ) -> None:
        """
        "input" should be a str object
        with embedded \n characters.
        "language" should be a Language object.
        """
        language.validate()
        self.input = collections.deque(reversed(input.splitlines(keepends=True)))
        self.block_start_line_number = self.line_number = 0
        self.language = language
        before, _, after = language.start_line.partition('{dsl_name}')
        assert _ == '{dsl_name}'
        self.find_start_re = libclinic.create_regex(before, after,
                                                    whole_line=False)
        self.start_re = libclinic.create_regex(before, after)
        self.verify = verify
        self.last_checksum_re: re.Pattern[str] | None = None
        self.last_dsl_name: str | None = None
        self.dsl_name: str | None = None
        self.first_block = True
    def __iter__(self) -> BlockParser:
        return self
    def __next__(self) -> Block:
        while True:
            if not self.input:
                raise StopIteration
            if self.dsl_name:
                try:
                    return_value = self.parse_clinic_block(self.dsl_name)
                except ClinicError as exc:
                    exc.filename = self.language.filename
                    exc.lineno = self.line_number
                    raise
                self.dsl_name = None
                self.first_block = False
                return return_value
            block = self.parse_verbatim_block()
            if self.first_block and not block.input:
                continue
            self.first_block = False
            return block
    def is_start_line(self, line: str) -> str | None:
        match = self.start_re.match(line.lstrip())
        return match.group(1) if match else None
    def _line(self, lookahead: bool = False) -> str:
        self.line_number += 1
        line = self.input.pop()
        if not lookahead:
            self.language.parse_line(line)
        return line
    def parse_verbatim_block(self) -> Block:
        lines = []
        self.block_start_line_number = self.line_number
        while self.input:
            line = self._line()
            dsl_name = self.is_start_line(line)
            if dsl_name:
                self.dsl_name = dsl_name
                break
            lines.append(line)
        return Block("".join(lines))
    def parse_clinic_block(self, dsl_name: str) -> Block:
        in_lines = []
        self.block_start_line_number = self.line_number + 1
        stop_line = self.language.stop_line.format(dsl_name=dsl_name)
        body_prefix = self.language.body_prefix.format(dsl_name=dsl_name)
        def is_stop_line(line: str) -> bool:
            # make sure to recognize stop line even if it
            # doesn't end with EOL (it could be the very end of the file)
            if line.startswith(stop_line):
                remainder = line.removeprefix(stop_line)
                if remainder and not remainder.isspace():
                    fail(f"Garbage after stop line: {remainder!r}")
                return True
            else:
                # gh-92256: don't allow incorrectly formatted stop lines
                if line.lstrip().startswith(stop_line):
                    fail(f"Whitespace is not allowed before the stop line: {line!r}")
                return False
        # consume body of program
        while self.input:
            line = self._line()
            if is_stop_line(line) or self.is_start_line(line):
                break
            if body_prefix:
                line = line.lstrip()
                assert line.startswith(body_prefix)
                line = line.removeprefix(body_prefix)
            in_lines.append(line)
        # consume output and checksum line, if present.
        if self.last_dsl_name == dsl_name:
            checksum_re = self.last_checksum_re
        else:
            before, _, after = self.language.checksum_line.format(dsl_name=dsl_name, arguments='{arguments}').partition('{arguments}')
            assert _ == '{arguments}'
            checksum_re = libclinic.create_regex(before, after, word=False)
            self.last_dsl_name = dsl_name
            self.last_checksum_re = checksum_re
        assert checksum_re is not None
        # scan forward for checksum line
        out_lines = []
        arguments = None
        while self.input:
            line = self._line(lookahead=True)
            match = checksum_re.match(line.lstrip())
            arguments = match.group(1) if match else None
            if arguments:
                break
            out_lines.append(line)
            if self.is_start_line(line):
                break
        output: str | None
        output = "".join(out_lines)
        if arguments:
            d = {}
            for field in shlex.split(arguments):
                name, equals, value = field.partition('=')
                if not equals:
                    fail(f"Mangled Argument Clinic marker line: {line!r}")
                d[name.strip()] = value.strip()
            if self.verify:
                if 'input' in d:
                    checksum = d['output']
                else:
                    checksum = d['checksum']
                computed = libclinic.compute_checksum(output, len(checksum))
                if checksum != computed:
                    fail("Checksum mismatch! "
                         f"Expected {checksum!r}, computed {computed!r}. "
                         "Suggested fix: remove all generated code including "
                         "the end marker, or use the '-f' option.")
        else:
            # put back output
            output_lines = output.splitlines(keepends=True)
            self.line_number -= len(output_lines)
            self.input.extend(reversed(output_lines))
            output = None
        return Block("".join(in_lines), dsl_name, output=output)
@dc.dataclass(slots=True, frozen=True)
 class Include:
    """
--- a/Tools/clinic/libclinic/block_parser.py
+++ b/Tools/clinic/libclinic/block_parser.py
@ -0,0 +1,256 @@
 from __future__ import annotations
 import collections
 import dataclasses as dc
 import re
 import shlex
 from typing import Any
 import libclinic
 from libclinic import fail, ClinicError
 from libclinic.language import Language
 from libclinic.function import (
    Module, Class, Function)
@dc.dataclass(slots=True, repr=False)
 class Block:
    r"""
    Represents a single block of text embedded in
    another file.  If dsl_name is None, the block represents
    verbatim text, raw original text from the file, in
    which case "input" will be the only non-false member.
    If dsl_name is not None, the block represents a Clinic
    block.
    input is always str, with embedded \n characters.
    input represents the original text from the file;
    if it's a Clinic block, it is the original text with
    the body_prefix and redundant leading whitespace removed.
    dsl_name is either str or None.  If str, it's the text
    found on the start line of the block between the square
    brackets.
    signatures is a list.
    It may only contain clinic.Module, clinic.Class, and
    clinic.Function objects.  At the moment it should
    contain at most one of each.
    output is either str or None.  If str, it's the output
    from this block, with embedded '\n' characters.
    indent is a str.  It's the leading whitespace
    that was found on every line of input.  (If body_prefix is
    not empty, this is the indent *after* removing the
    body_prefix.)
    "indent" is different from the concept of "preindent"
    (which is not stored as state on Block objects).
    "preindent" is the whitespace that
    was found in front of every line of input *before* the
    "body_prefix" (see the Language object).  If body_prefix
    is empty, preindent must always be empty too.
    To illustrate the difference between "indent" and "preindent":
    Assume that '_' represents whitespace.
    If the block processed was in a Python file, and looked like this:
      ____#/*[python]
      ____#__for a in range(20):
      ____#____print(a)
      ____#[python]*/
    "preindent" would be "____" and "indent" would be "__".
    """
    input: str
    dsl_name: str | None = None
    signatures: list[Module | Class | Function] = dc.field(default_factory=list)
    output: Any = None  # TODO: Very dynamic; probably untypeable in its current form?
    indent: str = ''
    def __repr__(self) -> str:
        dsl_name = self.dsl_name or "text"
        def summarize(s: object) -> str:
            s = repr(s)
            if len(s) > 30:
                return s[:26] + "..." + s[0]
            return s
        parts = (
            repr(dsl_name),
            f"input={summarize(self.input)}",
            f"output={summarize(self.output)}"
        )
        return f"<clinic.Block {' '.join(parts)}>"
 class BlockParser:
    """
    Block-oriented parser for Argument Clinic.
    Iterator, yields Block objects.
    """
    def __init__(
            self,
            input: str,
            language: Language,
            *,
            verify: bool = True
    ) -> None:
        """
        "input" should be a str object
        with embedded \n characters.
        "language" should be a Language object.
        """
        language.validate()
        self.input = collections.deque(reversed(input.splitlines(keepends=True)))
        self.block_start_line_number = self.line_number = 0
        self.language = language
        before, _, after = language.start_line.partition('{dsl_name}')
        assert _ == '{dsl_name}'
        self.find_start_re = libclinic.create_regex(before, after,
                                                    whole_line=False)
        self.start_re = libclinic.create_regex(before, after)
        self.verify = verify
        self.last_checksum_re: re.Pattern[str] | None = None
        self.last_dsl_name: str | None = None
        self.dsl_name: str | None = None
        self.first_block = True
    def __iter__(self) -> BlockParser:
        return self
    def __next__(self) -> Block:
        while True:
            if not self.input:
                raise StopIteration
            if self.dsl_name:
                try:
                    return_value = self.parse_clinic_block(self.dsl_name)
                except ClinicError as exc:
                    exc.filename = self.language.filename
                    exc.lineno = self.line_number
                    raise
                self.dsl_name = None
                self.first_block = False
                return return_value
            block = self.parse_verbatim_block()
            if self.first_block and not block.input:
                continue
            self.first_block = False
            return block
    def is_start_line(self, line: str) -> str | None:
        match = self.start_re.match(line.lstrip())
        return match.group(1) if match else None
    def _line(self, lookahead: bool = False) -> str:
        self.line_number += 1
        line = self.input.pop()
        if not lookahead:
            self.language.parse_line(line)
        return line
    def parse_verbatim_block(self) -> Block:
        lines = []
        self.block_start_line_number = self.line_number
        while self.input:
            line = self._line()
            dsl_name = self.is_start_line(line)
            if dsl_name:
                self.dsl_name = dsl_name
                break
            lines.append(line)
        return Block("".join(lines))
    def parse_clinic_block(self, dsl_name: str) -> Block:
        in_lines = []
        self.block_start_line_number = self.line_number + 1
        stop_line = self.language.stop_line.format(dsl_name=dsl_name)
        body_prefix = self.language.body_prefix.format(dsl_name=dsl_name)
        def is_stop_line(line: str) -> bool:
            # make sure to recognize stop line even if it
            # doesn't end with EOL (it could be the very end of the file)
            if line.startswith(stop_line):
                remainder = line.removeprefix(stop_line)
                if remainder and not remainder.isspace():
                    fail(f"Garbage after stop line: {remainder!r}")
                return True
            else:
                # gh-92256: don't allow incorrectly formatted stop lines
                if line.lstrip().startswith(stop_line):
                    fail(f"Whitespace is not allowed before the stop line: {line!r}")
                return False
        # consume body of program
        while self.input:
            line = self._line()
            if is_stop_line(line) or self.is_start_line(line):
                break
            if body_prefix:
                line = line.lstrip()
                assert line.startswith(body_prefix)
                line = line.removeprefix(body_prefix)
            in_lines.append(line)
        # consume output and checksum line, if present.
        if self.last_dsl_name == dsl_name:
            checksum_re = self.last_checksum_re
        else:
            before, _, after = self.language.checksum_line.format(dsl_name=dsl_name, arguments='{arguments}').partition('{arguments}')
            assert _ == '{arguments}'
            checksum_re = libclinic.create_regex(before, after, word=False)
            self.last_dsl_name = dsl_name
            self.last_checksum_re = checksum_re
        assert checksum_re is not None
        # scan forward for checksum line
        out_lines = []
        arguments = None
        while self.input:
            line = self._line(lookahead=True)
            match = checksum_re.match(line.lstrip())
            arguments = match.group(1) if match else None
            if arguments:
                break
            out_lines.append(line)
            if self.is_start_line(line):
                break
        output: str | None
        output = "".join(out_lines)
        if arguments:
            d = {}
            for field in shlex.split(arguments):
                name, equals, value = field.partition('=')
                if not equals:
                    fail(f"Mangled Argument Clinic marker line: {line!r}")
                d[name.strip()] = value.strip()
            if self.verify:
                if 'input' in d:
                    checksum = d['output']
                else:
                    checksum = d['checksum']
                computed = libclinic.compute_checksum(output, len(checksum))
                if checksum != computed:
                    fail("Checksum mismatch! "
                         f"Expected {checksum!r}, computed {computed!r}. "
                         "Suggested fix: remove all generated code including "
                         "the end marker, or use the '-f' option.")
        else:
            # put back output
            output_lines = output.splitlines(keepends=True)
            self.line_number -= len(output_lines)
            self.input.extend(reversed(output_lines))
            output = None
        return Block("".join(in_lines), dsl_name, output=output)
--- a/Tools/clinic/libclinic/language.py
+++ b/Tools/clinic/libclinic/language.py
@ -0,0 +1,103 @@
 from __future__ import annotations
 import abc
 import typing
 from collections.abc import (
    Iterable,
 )
 import libclinic
 from libclinic import fail
 from libclinic.function import (
    Module, Class, Function)
 if typing.TYPE_CHECKING:
    from clinic import Clinic
 class Language(metaclass=abc.ABCMeta):
    start_line = ""
    body_prefix = ""
    stop_line = ""
    checksum_line = ""
    def __init__(self, filename: str) -> None:
        self.filename = filename
    @abc.abstractmethod
    def render(
            self,
            clinic: Clinic,
            signatures: Iterable[Module | Class | Function]
    ) -> str:
        ...
    def parse_line(self, line: str) -> None:
        ...
    def validate(self) -> None:
        def assert_only_one(
                attr: str,
                *additional_fields: str
        ) -> None:
            """
            Ensures that the string found at getattr(self, attr)
            contains exactly one formatter replacement string for
            each valid field.  The list of valid fields is
            ['dsl_name'] extended by additional_fields.
            e.g.
                self.fmt = "{dsl_name} {a} {b}"
                # this passes
                self.assert_only_one('fmt', 'a', 'b')
                # this fails, the format string has a {b} in it
                self.assert_only_one('fmt', 'a')
                # this fails, the format string doesn't have a {c} in it
                self.assert_only_one('fmt', 'a', 'b', 'c')
                # this fails, the format string has two {a}s in it,
                # it must contain exactly one
                self.fmt2 = '{dsl_name} {a} {a}'
                self.assert_only_one('fmt2', 'a')
            """
            fields = ['dsl_name']
            fields.extend(additional_fields)
            line: str = getattr(self, attr)
            fcf = libclinic.FormatCounterFormatter()
            fcf.format(line)
            def local_fail(should_be_there_but_isnt: bool) -> None:
                if should_be_there_but_isnt:
                    fail("{} {} must contain {{{}}} exactly once!".format(
                        self.__class__.__name__, attr, name))
                else:
                    fail("{} {} must not contain {{{}}}!".format(
                        self.__class__.__name__, attr, name))
            for name, count in fcf.counts.items():
                if name in fields:
                    if count > 1:
                        local_fail(True)
                else:
                    local_fail(False)
            for name in fields:
                if fcf.counts.get(name) != 1:
                    local_fail(True)
        assert_only_one('start_line')
        assert_only_one('stop_line')
        field = "arguments" if "{arguments}" in self.checksum_line else "checksum"
        assert_only_one('checksum_line', field)
 class PythonLanguage(Language):
    language      = 'Python'
    start_line    = "#/*[{dsl_name} input]"
    body_prefix   = "#"
    stop_line     = "#[{dsl_name} start generated code]*/"
    checksum_line = "#/*[{dsl_name} end generated code: {arguments}]*/"