GH-125498: Update JIT builds to use LLVM 19 and preserve_none (GH-125499)

This commit is contained in:
Savannah Ostrowski 2024-10-30 12:03:31 -07:00 committed by GitHub
parent 597d814334
commit c29bbe2101
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 69 additions and 78 deletions

View File

@ -61,7 +61,7 @@ jobs:
- true - true
- false - false
llvm: llvm:
- 18 - 19
include: include:
- target: i686-pc-windows-msvc/msvc - target: i686-pc-windows-msvc/msvc
architecture: Win32 architecture: Win32
@ -121,10 +121,15 @@ jobs:
choco install llvm --allow-downgrade --no-progress --version ${{ matrix.llvm }}.1.0 choco install llvm --allow-downgrade --no-progress --version ${{ matrix.llvm }}.1.0
./PCbuild/build.bat --experimental-jit ${{ matrix.debug && '-d' || '' }} -p ${{ matrix.architecture }} ./PCbuild/build.bat --experimental-jit ${{ matrix.debug && '-d' || '' }} -p ${{ matrix.architecture }}
# The `find` line is required as a result of https://github.com/actions/runner-images/issues/9966.
# This is a bug in the macOS runner image where the pre-installed Python is installed in the same
# directory as the Homebrew Python, which causes the build to fail for macos-13. This line removes
# the symlink to the pre-installed Python so that the Homebrew Python is used instead.
- name: Native macOS - name: Native macOS
if: runner.os == 'macOS' if: runner.os == 'macOS'
run: | run: |
brew update brew update
find /usr/local/bin -lname '*/Library/Frameworks/Python.framework/*' -delete
brew install llvm@${{ matrix.llvm }} brew install llvm@${{ matrix.llvm }}
SDKROOT="$(xcrun --show-sdk-path)" \ SDKROOT="$(xcrun --show-sdk-path)" \
./configure --enable-experimental-jit ${{ matrix.debug && '--with-pydebug' || '--enable-optimizations --with-lto' }} ./configure --enable-experimental-jit ${{ matrix.debug && '--with-pydebug' || '--enable-optimizations --with-lto' }}
@ -165,6 +170,10 @@ jobs:
name: Free-Threaded (Debug) name: Free-Threaded (Debug)
needs: interpreter needs: interpreter
runs-on: ubuntu-latest runs-on: ubuntu-latest
strategy:
matrix:
llvm:
- 19
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- uses: actions/setup-python@v5 - uses: actions/setup-python@v5
@ -172,8 +181,8 @@ jobs:
python-version: '3.11' python-version: '3.11'
- name: Build with JIT enabled and GIL disabled - name: Build with JIT enabled and GIL disabled
run: | run: |
sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" ./llvm.sh 18 sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" ./llvm.sh ${{ matrix.llvm }}
export PATH="$(llvm-config-18 --bindir):$PATH" export PATH="$(llvm-config-${{ matrix.llvm }} --bindir):$PATH"
./configure --enable-experimental-jit --with-pydebug --disable-gil ./configure --enable-experimental-jit --with-pydebug --disable-gil
make all --jobs 4 make all --jobs 4
- name: Run tests - name: Run tests

View File

@ -0,0 +1 @@
Update JIT compilation to use LLVM 19

View File

@ -0,0 +1,4 @@
The JIT has been updated to leverage Clang 19's new ``preserve_none`` attribute,
which supports more platforms and is more useful than LLVM's existing ``ghccc``
calling convention. This also removes the need to manually patch the calling
convention in LLVM IR, simplifying the JIT compilation process.

View File

@ -7,49 +7,46 @@ This version of CPython can be built with an experimental just-in-time compiler[
The JIT compiler does not require end users to install any third-party dependencies, but part of it must be *built* using LLVM[^why-llvm]. You are *not* required to build the rest of CPython using LLVM, or even the same version of LLVM (in fact, this is uncommon). The JIT compiler does not require end users to install any third-party dependencies, but part of it must be *built* using LLVM[^why-llvm]. You are *not* required to build the rest of CPython using LLVM, or even the same version of LLVM (in fact, this is uncommon).
LLVM version 18 is required. Both `clang` and `llvm-readobj` need to be installed and discoverable (version suffixes, like `clang-18`, are okay). It's highly recommended that you also have `llvm-objdump` available, since this allows the build script to dump human-readable assembly for the generated code. LLVM version 19 is required. Both `clang` and `llvm-readobj` need to be installed and discoverable (version suffixes, like `clang-19`, are okay). It's highly recommended that you also have `llvm-objdump` available, since this allows the build script to dump human-readable assembly for the generated code.
It's easy to install all of the required tools: It's easy to install all of the required tools:
### Linux ### Linux
Install LLVM 18 on Ubuntu/Debian: Install LLVM 19 on Ubuntu/Debian:
```sh ```sh
wget https://apt.llvm.org/llvm.sh wget https://apt.llvm.org/llvm.sh
chmod +x llvm.sh chmod +x llvm.sh
sudo ./llvm.sh 18 sudo ./llvm.sh 19
``` ```
Install LLVM 18 on Fedora Linux 40 or newer: Install LLVM 19 on Fedora Linux 40 or newer:
```sh ```sh
sudo dnf install 'clang(major) = 18' 'llvm(major) = 18' sudo dnf install 'clang(major) = 19' 'llvm(major) = 19'
``` ```
### macOS ### macOS
Install LLVM 18 with [Homebrew](https://brew.sh): Install LLVM 19 with [Homebrew](https://brew.sh):
```sh ```sh
brew install llvm@18 brew install llvm@19
``` ```
Homebrew won't add any of the tools to your `$PATH`. That's okay; the build script knows how to find them. Homebrew won't add any of the tools to your `$PATH`. That's okay; the build script knows how to find them.
### Windows ### Windows
Install LLVM 18 [by searching for it on LLVM's GitHub releases page](https://github.com/llvm/llvm-project/releases?q=18), clicking on "Assets", downloading the appropriate Windows installer for your platform (likely the file ending with `-win64.exe`), and running it. **When installing, be sure to select the option labeled "Add LLVM to the system PATH".** Install LLVM 19 [by searching for it on LLVM's GitHub releases page](https://github.com/llvm/llvm-project/releases?q=19), clicking on "Assets", downloading the appropriate Windows installer for your platform (likely the file ending with `-win64.exe`), and running it. **When installing, be sure to select the option labeled "Add LLVM to the system PATH".**
Alternatively, you can use [chocolatey](https://chocolatey.org): Alternatively, you can use [chocolatey](https://chocolatey.org):
```sh ```sh
choco install llvm --version=18.1.6 choco install llvm --version=19.1.0
``` ```
### Dev Containers
If you are working on CPython in a [Codespaces instance](https://devguide.python.org/getting-started/setup-building/#using-codespaces), there's no need to install LLVM as the Fedora 40 base image includes LLVM 18 out of the box.
## Building ## Building

View File

@ -8,7 +8,7 @@ import shlex
import subprocess import subprocess
import typing import typing
_LLVM_VERSION = 18 _LLVM_VERSION = 19
_LLVM_VERSION_PATTERN = re.compile(rf"version\s+{_LLVM_VERSION}\.\d+\.\d+\S*\s+") _LLVM_VERSION_PATTERN = re.compile(rf"version\s+{_LLVM_VERSION}\.\d+\.\d+\S*\s+")
_P = typing.ParamSpec("_P") _P = typing.ParamSpec("_P")

View File

@ -2,6 +2,7 @@
import dataclasses import dataclasses
import enum import enum
import sys
import typing import typing
import _schema import _schema
@ -132,8 +133,18 @@ class Hole:
def __post_init__(self) -> None: def __post_init__(self) -> None:
self.func = _PATCH_FUNCS[self.kind] self.func = _PATCH_FUNCS[self.kind]
def fold(self, other: typing.Self) -> typing.Self | None: def fold(self, other: typing.Self, body: bytes) -> typing.Self | None:
"""Combine two holes into a single hole, if possible.""" """Combine two holes into a single hole, if possible."""
instruction_a = int.from_bytes(
body[self.offset : self.offset + 4], byteorder=sys.byteorder
)
instruction_b = int.from_bytes(
body[other.offset : other.offset + 4], byteorder=sys.byteorder
)
reg_a = instruction_a & 0b11111
reg_b1 = instruction_b & 0b11111
reg_b2 = (instruction_b >> 5) & 0b11111
if ( if (
self.offset + 4 == other.offset self.offset + 4 == other.offset
and self.value == other.value and self.value == other.value
@ -141,6 +152,7 @@ class Hole:
and self.addend == other.addend and self.addend == other.addend
and self.func == "patch_aarch64_21rx" and self.func == "patch_aarch64_21rx"
and other.func == "patch_aarch64_12x" and other.func == "patch_aarch64_12x"
and reg_a == reg_b1 == reg_b2
): ):
# These can *only* be properly relaxed when they appear together and # These can *only* be properly relaxed when they appear together and
# patch the same value: # patch the same value:

View File

@ -26,7 +26,6 @@ CPYTHON = TOOLS.parent
PYTHON_EXECUTOR_CASES_C_H = CPYTHON / "Python" / "executor_cases.c.h" PYTHON_EXECUTOR_CASES_C_H = CPYTHON / "Python" / "executor_cases.c.h"
TOOLS_JIT_TEMPLATE_C = TOOLS_JIT / "template.c" TOOLS_JIT_TEMPLATE_C = TOOLS_JIT / "template.c"
_S = typing.TypeVar("_S", _schema.COFFSection, _schema.ELFSection, _schema.MachOSection) _S = typing.TypeVar("_S", _schema.COFFSection, _schema.ELFSection, _schema.MachOSection)
_R = typing.TypeVar( _R = typing.TypeVar(
"_R", _schema.COFFRelocation, _schema.ELFRelocation, _schema.MachORelocation "_R", _schema.COFFRelocation, _schema.ELFRelocation, _schema.MachORelocation
@ -39,7 +38,6 @@ class _Target(typing.Generic[_S, _R]):
_: dataclasses.KW_ONLY _: dataclasses.KW_ONLY
alignment: int = 1 alignment: int = 1
args: typing.Sequence[str] = () args: typing.Sequence[str] = ()
ghccc: bool = False
prefix: str = "" prefix: str = ""
stable: bool = False stable: bool = False
debug: bool = False debug: bool = False
@ -88,11 +86,7 @@ class _Target(typing.Generic[_S, _R]):
sections: list[dict[typing.Literal["Section"], _S]] = json.loads(output) sections: list[dict[typing.Literal["Section"], _S]] = json.loads(output)
for wrapped_section in sections: for wrapped_section in sections:
self._handle_section(wrapped_section["Section"], group) self._handle_section(wrapped_section["Section"], group)
# The trampoline's entry point is just named "_ENTRY", since on some assert group.symbols["_JIT_ENTRY"] == (_stencils.HoleValue.CODE, 0)
# platforms we later assume that any function starting with "_JIT_" uses
# the GHC calling convention:
entry_symbol = "_JIT_ENTRY" if "_JIT_ENTRY" in group.symbols else "_ENTRY"
assert group.symbols[entry_symbol] == (_stencils.HoleValue.CODE, 0)
if group.data.body: if group.data.body:
line = f"0: {str(bytes(group.data.body)).removeprefix('b')}" line = f"0: {str(bytes(group.data.body)).removeprefix('b')}"
group.data.disassembly.append(line) group.data.disassembly.append(line)
@ -112,9 +106,6 @@ class _Target(typing.Generic[_S, _R]):
async def _compile( async def _compile(
self, opname: str, c: pathlib.Path, tempdir: pathlib.Path self, opname: str, c: pathlib.Path, tempdir: pathlib.Path
) -> _stencils.StencilGroup: ) -> _stencils.StencilGroup:
# "Compile" the trampoline to an empty stencil group if it's not needed:
if opname == "trampoline" and not self.ghccc:
return _stencils.StencilGroup()
o = tempdir / f"{opname}.o" o = tempdir / f"{opname}.o"
args = [ args = [
f"--target={self.triple}", f"--target={self.triple}",
@ -128,6 +119,7 @@ class _Target(typing.Generic[_S, _R]):
f"-I{CPYTHON / 'Include' / 'internal'}", f"-I{CPYTHON / 'Include' / 'internal'}",
f"-I{CPYTHON / 'Include' / 'internal' / 'mimalloc'}", f"-I{CPYTHON / 'Include' / 'internal' / 'mimalloc'}",
f"-I{CPYTHON / 'Python'}", f"-I{CPYTHON / 'Python'}",
f"-I{CPYTHON / 'Tools' / 'jit'}",
"-O3", "-O3",
"-c", "-c",
# This debug info isn't necessary, and bloats out the JIT'ed code. # This debug info isn't necessary, and bloats out the JIT'ed code.
@ -143,44 +135,12 @@ class _Target(typing.Generic[_S, _R]):
# Don't call stack-smashing canaries that we can't find or patch: # Don't call stack-smashing canaries that we can't find or patch:
"-fno-stack-protector", "-fno-stack-protector",
"-std=c11", "-std=c11",
"-o",
f"{o}",
f"{c}",
*self.args, *self.args,
] ]
if self.ghccc: await _llvm.run("clang", args, echo=self.verbose)
# This is a bit of an ugly workaround, but it makes the code much
# smaller and faster, so it's worth it. We want to use the GHC
# calling convention, but Clang doesn't support it. So, we *first*
# compile the code to LLVM IR, perform some text replacements on the
# IR to change the calling convention(!), and then compile *that*.
# Once we have access to Clang 19, we can get rid of this and use
# __attribute__((preserve_none)) directly in the C code instead:
ll = tempdir / f"{opname}.ll"
args_ll = args + [
# -fomit-frame-pointer is necessary because the GHC calling
# convention uses RBP to pass arguments:
"-S",
"-emit-llvm",
"-fomit-frame-pointer",
"-o",
f"{ll}",
f"{c}",
]
await _llvm.run("clang", args_ll, echo=self.verbose)
ir = ll.read_text()
# This handles declarations, definitions, and calls to named symbols
# starting with "_JIT_":
ir = re.sub(
r"(((noalias|nonnull|noundef) )*ptr @_JIT_\w+\()", r"ghccc \1", ir
)
# This handles calls to anonymous callees, since anything with
# "musttail" needs to use the same calling convention:
ir = ir.replace("musttail call", "musttail call ghccc")
# Sometimes *both* replacements happen at the same site, so fix it:
ir = ir.replace("ghccc ghccc", "ghccc")
ll.write_text(ir)
args_o = args + ["-Wno-unused-command-line-argument", "-o", f"{o}", f"{ll}"]
else:
args_o = args + ["-o", f"{o}", f"{c}"]
await _llvm.run("clang", args_o, echo=self.verbose)
return await self._parse(o) return await self._parse(o)
async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]: async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]:
@ -519,7 +479,6 @@ class _MachO(
def get_target(host: str) -> _COFF | _ELF | _MachO: def get_target(host: str) -> _COFF | _ELF | _MachO:
"""Build a _Target for the given host "triple" and options.""" """Build a _Target for the given host "triple" and options."""
# ghccc currently crashes Clang when combined with musttail on aarch64. :(
target: _COFF | _ELF | _MachO target: _COFF | _ELF | _MachO
if re.fullmatch(r"aarch64-apple-darwin.*", host): if re.fullmatch(r"aarch64-apple-darwin.*", host):
target = _MachO(host, alignment=8, prefix="_") target = _MachO(host, alignment=8, prefix="_")
@ -535,16 +494,20 @@ def get_target(host: str) -> _COFF | _ELF | _MachO:
] ]
target = _ELF(host, alignment=8, args=args) target = _ELF(host, alignment=8, args=args)
elif re.fullmatch(r"i686-pc-windows-msvc", host): elif re.fullmatch(r"i686-pc-windows-msvc", host):
args = ["-DPy_NO_ENABLE_SHARED"] args = [
target = _COFF(host, args=args, ghccc=True, prefix="_") "-DPy_NO_ENABLE_SHARED",
# __attribute__((preserve_none)) is not supported
"-Wno-ignored-attributes",
]
target = _COFF(host, args=args, prefix="_")
elif re.fullmatch(r"x86_64-apple-darwin.*", host): elif re.fullmatch(r"x86_64-apple-darwin.*", host):
target = _MachO(host, ghccc=True, prefix="_") target = _MachO(host, prefix="_")
elif re.fullmatch(r"x86_64-pc-windows-msvc", host): elif re.fullmatch(r"x86_64-pc-windows-msvc", host):
args = ["-fms-runtime-lib=dll"] args = ["-fms-runtime-lib=dll"]
target = _COFF(host, args=args, ghccc=True) target = _COFF(host, args=args)
elif re.fullmatch(r"x86_64-.*-linux-gnu", host): elif re.fullmatch(r"x86_64-.*-linux-gnu", host):
args = ["-fpic"] args = ["-fpic"]
target = _ELF(host, args=args, ghccc=True) target = _ELF(host, args=args)
else: else:
raise ValueError(host) raise ValueError(host)
return target return target

View File

@ -65,7 +65,7 @@ def _dump_stencil(opname: str, group: _stencils.StencilGroup) -> typing.Iterator
if skip: if skip:
skip = False skip = False
continue continue
if pair and (folded := hole.fold(pair)): if pair and (folded := hole.fold(pair, stencil.body)):
skip = True skip = True
hole = folded hole = folded
yield f" {hole.as_c(part)}" yield f" {hole.as_c(part)}"

4
Tools/jit/jit.h Normal file
View File

@ -0,0 +1,4 @@
// To use preserve_none in JIT builds, we need to declare a separate function
// pointer with __attribute__((preserve_none)), since this attribute may not be
// supported by the compiler used to build the rest of the interpreter.
typedef jit_func __attribute__((preserve_none)) jit_func_preserve_none;

View File

@ -21,6 +21,8 @@
#include "ceval_macros.h" #include "ceval_macros.h"
#include "jit.h"
#undef CURRENT_OPARG #undef CURRENT_OPARG
#define CURRENT_OPARG() (_oparg) #define CURRENT_OPARG() (_oparg)
@ -49,7 +51,7 @@
do { \ do { \
OPT_STAT_INC(traces_executed); \ OPT_STAT_INC(traces_executed); \
__attribute__((musttail)) \ __attribute__((musttail)) \
return ((jit_func)((EXECUTOR)->jit_side_entry))(frame, stack_pointer, tstate); \ return ((jit_func_preserve_none)((EXECUTOR)->jit_side_entry))(frame, stack_pointer, tstate); \
} while (0) } while (0)
#undef GOTO_TIER_ONE #undef GOTO_TIER_ONE
@ -72,7 +74,7 @@ do { \
do { \ do { \
PyAPI_DATA(void) ALIAS; \ PyAPI_DATA(void) ALIAS; \
__attribute__((musttail)) \ __attribute__((musttail)) \
return ((jit_func)&ALIAS)(frame, stack_pointer, tstate); \ return ((jit_func_preserve_none)&ALIAS)(frame, stack_pointer, tstate); \
} while (0) } while (0)
#undef JUMP_TO_JUMP_TARGET #undef JUMP_TO_JUMP_TARGET
@ -86,7 +88,7 @@ do { \
#define TIER_TWO 2 #define TIER_TWO 2
_Py_CODEUNIT * __attribute__((preserve_none)) _Py_CODEUNIT *
_JIT_ENTRY(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate) _JIT_ENTRY(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate)
{ {
// Locals that the instruction implementations expect to exist: // Locals that the instruction implementations expect to exist:

View File

@ -4,11 +4,10 @@
#include "pycore_frame.h" #include "pycore_frame.h"
#include "pycore_jit.h" #include "pycore_jit.h"
// This is where the calling convention changes, on platforms that require it. #include "jit.h"
// The actual change is patched in while the JIT compiler is being built, in
// Tools/jit/_targets.py. On other platforms, this function compiles to nothing.
_Py_CODEUNIT * _Py_CODEUNIT *
_ENTRY(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate) _JIT_ENTRY(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate)
{ {
// This is subtle. The actual trace will return to us once it exits, so we // This is subtle. The actual trace will return to us once it exits, so we
// need to make sure that we stay alive until then. If our trace side-exits // need to make sure that we stay alive until then. If our trace side-exits
@ -19,7 +18,7 @@ _ENTRY(_PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *ts
Py_INCREF(executor); Py_INCREF(executor);
// Note that this is *not* a tail call: // Note that this is *not* a tail call:
PyAPI_DATA(void) _JIT_CONTINUE; PyAPI_DATA(void) _JIT_CONTINUE;
_Py_CODEUNIT *target = ((jit_func)&_JIT_CONTINUE)(frame, stack_pointer, tstate); _Py_CODEUNIT *target = ((jit_func_preserve_none)&_JIT_CONTINUE)(frame, stack_pointer, tstate);
Py_SETREF(tstate->previous_executor, executor); Py_SETREF(tstate->previous_executor, executor);
return target; return target;
} }