gh-106812: Refactor cases_generator to allow uops with array stack effects (#107564)

Introducing a new file, stacking.py, that takes over several responsibilities related to symbolic evaluation of push/pop operations, with more generality.
Guido van Rossum 2023-08-04 09:35:56 -07:00 committed by GitHub
parent 407d7fda94
commit 400835ea16
12 changed files with 1798 additions and 1098 deletions
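
The "symbolic evaluation of push/pop operations" mentioned in the commit message refers to computing stack-effect sizes as expressions in oparg rather than fixed integers, so array effects such as values[oparg*2] can be represented. A minimal, hypothetical sketch of that idea follows; effect_size and as_expression are illustrative names, not functions from this commit:

def effect_size(effects: list[tuple[str, str]]) -> tuple[int, list[str]]:
    # effects: (name, size) pairs; an empty size means a single stack item.
    constant, symbolic = 0, []
    for _name, size in effects:
        if size:
            symbolic.append(size)
        else:
            constant += 1
    return constant, symbolic

def as_expression(constant: int, symbolic: list[str]) -> str:
    # Render e.g. (2, ["oparg*2"]) as "2 + oparg*2".
    terms = ([str(constant)] if constant or not symbolic else []) + symbolic
    return " + ".join(terms)

# Popping `below, values[oparg*2], above` removes "2 + oparg*2" stack items.
print(as_expression(*effect_size([("below", ""), ("values", "oparg*2"), ("above", "")])))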

View File

@ -679,9 +679,9 @@ _PyOpcode_num_pushed(int opcode, int oparg, bool jump) {
case LOAD_GLOBAL:
return ((oparg & 1) ? 1 : 0) + 1;
case LOAD_GLOBAL_MODULE:
return ((oparg & 1) ? 1 : 0) + 1;
return (oparg & 1 ? 1 : 0) + 1;
case LOAD_GLOBAL_BUILTIN:
return ((oparg & 1) ? 1 : 0) + 1;
return (oparg & 1 ? 1 : 0) + 1;
case DELETE_FAST:
return 0;
case MAKE_CELL:
@ -739,7 +739,7 @@ _PyOpcode_num_pushed(int opcode, int oparg, bool jump) {
case LOAD_METHOD:
return ((oparg & 1) ? 1 : 0) + 1;
case LOAD_ATTR_INSTANCE_VALUE:
return ((oparg & 1) ? 1 : 0) + 1;
return (oparg & 1 ? 1 : 0) + 1;
case LOAD_ATTR_MODULE:
return ((oparg & 1) ? 1 : 0) + 1;
case LOAD_ATTR_WITH_HINT:
@ -944,7 +944,18 @@ _PyOpcode_num_pushed(int opcode, int oparg, bool jump) {
}
#endif
enum InstructionFormat { INSTR_FMT_IB, INSTR_FMT_IBC, INSTR_FMT_IBC00, INSTR_FMT_IBC000, INSTR_FMT_IBC00000000, INSTR_FMT_IX, INSTR_FMT_IXC, INSTR_FMT_IXC0, INSTR_FMT_IXC00, INSTR_FMT_IXC000 };
enum InstructionFormat {
INSTR_FMT_IB,
INSTR_FMT_IBC,
INSTR_FMT_IBC00,
INSTR_FMT_IBC000,
INSTR_FMT_IBC00000000,
INSTR_FMT_IX,
INSTR_FMT_IXC,
INSTR_FMT_IXC0,
INSTR_FMT_IXC00,
INSTR_FMT_IXC000,
};
#define IS_VALID_OPCODE(OP) \
(((OP) >= 0) && ((OP) < OPCODE_METADATA_SIZE) && \

View File

@ -6,9 +6,9 @@ from test import test_tools
test_tools.skip_if_missing('cases_generator')
with test_tools.imports_under_tool('cases_generator'):
import generate_cases
import analysis
import formatting
import generate_cases
from parsing import StackEffect
@ -46,28 +46,11 @@ class TestEffects(unittest.TestCase):
(2, "(oparg<<1)"),
)
self.assertEqual(
formatting.string_effect_size(
formatting.list_effect_size(input_effects),
), "1 + oparg + oparg*2",
)
self.assertEqual(
formatting.string_effect_size(
formatting.list_effect_size(output_effects),
),
"2 + oparg*4",
)
self.assertEqual(
formatting.string_effect_size(
formatting.list_effect_size(other_effects),
),
"2 + (oparg<<1)",
)
class TestGeneratedCases(unittest.TestCase):
def setUp(self) -> None:
super().setUp()
self.maxDiff = None
self.temp_dir = tempfile.gettempdir()
self.temp_input_filename = os.path.join(self.temp_dir, "input.txt")
@ -140,7 +123,8 @@ class TestGeneratedCases(unittest.TestCase):
"""
output = """
TARGET(OP) {
PyObject *value = stack_pointer[-1];
PyObject *value;
value = stack_pointer[-1];
spam();
STACK_SHRINK(1);
DISPATCH();
@ -173,8 +157,9 @@ class TestGeneratedCases(unittest.TestCase):
"""
output = """
TARGET(OP) {
PyObject *value = stack_pointer[-1];
PyObject *value;
PyObject *res;
value = stack_pointer[-1];
spam();
stack_pointer[-1] = res;
DISPATCH();
@ -190,9 +175,11 @@ class TestGeneratedCases(unittest.TestCase):
"""
output = """
TARGET(OP) {
PyObject *right = stack_pointer[-1];
PyObject *left = stack_pointer[-2];
PyObject *right;
PyObject *left;
PyObject *res;
right = stack_pointer[-1];
left = stack_pointer[-2];
spam();
STACK_SHRINK(1);
stack_pointer[-1] = res;
@ -209,9 +196,11 @@ class TestGeneratedCases(unittest.TestCase):
"""
output = """
TARGET(OP) {
PyObject *right = stack_pointer[-1];
PyObject *left = stack_pointer[-2];
PyObject *right;
PyObject *left;
PyObject *result;
right = stack_pointer[-1];
left = stack_pointer[-2];
spam();
stack_pointer[-1] = result;
DISPATCH();
@ -235,8 +224,9 @@ class TestGeneratedCases(unittest.TestCase):
}
TARGET(OP3) {
PyObject *arg = stack_pointer[-1];
PyObject *arg;
PyObject *res;
arg = stack_pointer[-1];
DEOPT_IF(xxx, OP1);
stack_pointer[-1] = res;
CHECK_EVAL_BREAKER();
@ -281,9 +271,11 @@ class TestGeneratedCases(unittest.TestCase):
"""
output = """
TARGET(OP) {
PyObject *right = stack_pointer[-1];
PyObject *left = stack_pointer[-2];
PyObject *right;
PyObject *left;
PyObject *res;
right = stack_pointer[-1];
left = stack_pointer[-2];
if (cond) goto pop_2_label;
STACK_SHRINK(1);
stack_pointer[-1] = res;
@ -299,7 +291,8 @@ class TestGeneratedCases(unittest.TestCase):
"""
output = """
TARGET(OP) {
PyObject *value = stack_pointer[-1];
PyObject *value;
value = stack_pointer[-1];
uint16_t counter = read_u16(&next_instr[0].cache);
uint32_t extra = read_u32(&next_instr[1].cache);
STACK_SHRINK(1);
@ -338,8 +331,10 @@ class TestGeneratedCases(unittest.TestCase):
"""
output = """
TARGET(OP1) {
PyObject *right = stack_pointer[-1];
PyObject *left = stack_pointer[-2];
PyObject *right;
PyObject *left;
right = stack_pointer[-1];
left = stack_pointer[-2];
uint16_t counter = read_u16(&next_instr[0].cache);
op1(left, right);
next_instr += 1;
@ -347,38 +342,38 @@ class TestGeneratedCases(unittest.TestCase):
}
TARGET(OP) {
PyObject *_tmp_1 = stack_pointer[-1];
PyObject *_tmp_2 = stack_pointer[-2];
PyObject *_tmp_3 = stack_pointer[-3];
static_assert(INLINE_CACHE_ENTRIES_OP == 5, "incorrect cache size");
PyObject *right;
PyObject *left;
PyObject *arg2;
PyObject *res;
// OP1
right = stack_pointer[-1];
left = stack_pointer[-2];
{
PyObject *right = _tmp_1;
PyObject *left = _tmp_2;
uint16_t counter = read_u16(&next_instr[0].cache);
op1(left, right);
_tmp_2 = left;
_tmp_1 = right;
}
// OP2
arg2 = stack_pointer[-3];
{
PyObject *right = _tmp_1;
PyObject *left = _tmp_2;
PyObject *arg2 = _tmp_3;
PyObject *res;
uint32_t extra = read_u32(&next_instr[3].cache);
res = op2(arg2, left, right);
_tmp_3 = res;
}
next_instr += 5;
static_assert(INLINE_CACHE_ENTRIES_OP == 5, "incorrect cache size");
STACK_SHRINK(2);
stack_pointer[-1] = _tmp_3;
stack_pointer[-1] = res;
next_instr += 5;
DISPATCH();
}
TARGET(OP3) {
PyObject *right = stack_pointer[-1];
PyObject *left = stack_pointer[-2];
PyObject *arg2 = stack_pointer[-3];
PyObject *right;
PyObject *left;
PyObject *arg2;
PyObject *res;
right = stack_pointer[-1];
left = stack_pointer[-2];
arg2 = stack_pointer[-3];
res = op3(arg2, left, right);
STACK_SHRINK(2);
stack_pointer[-1] = res;
@ -396,9 +391,12 @@ class TestGeneratedCases(unittest.TestCase):
"""
output = """
TARGET(OP) {
PyObject *above = stack_pointer[-1];
PyObject **values = (stack_pointer - (1 + oparg*2));
PyObject *below = stack_pointer[-(2 + oparg*2)];
PyObject *above;
PyObject **values;
PyObject *below;
above = stack_pointer[-1];
values = stack_pointer - 1 - oparg*2;
below = stack_pointer[-2 - oparg*2];
spam();
STACK_SHRINK(oparg*2);
STACK_SHRINK(2);
@ -416,12 +414,13 @@ class TestGeneratedCases(unittest.TestCase):
output = """
TARGET(OP) {
PyObject *below;
PyObject **values = stack_pointer - (2) + 1;
PyObject **values;
PyObject *above;
values = stack_pointer - 1;
spam(values, oparg);
STACK_GROW(oparg*3);
stack_pointer[-2 - oparg*3] = below;
stack_pointer[-1] = above;
stack_pointer[-(2 + oparg*3)] = below;
DISPATCH();
}
"""
@ -435,8 +434,9 @@ class TestGeneratedCases(unittest.TestCase):
"""
output = """
TARGET(OP) {
PyObject **values = (stack_pointer - oparg);
PyObject **values;
PyObject *above;
values = stack_pointer - oparg;
spam(values, oparg);
STACK_GROW(1);
stack_pointer[-1] = above;
@ -453,8 +453,10 @@ class TestGeneratedCases(unittest.TestCase):
"""
output = """
TARGET(OP) {
PyObject **values = (stack_pointer - oparg);
PyObject *extra = stack_pointer[-(1 + oparg)];
PyObject **values;
PyObject *extra;
values = stack_pointer - oparg;
extra = stack_pointer[-1 - oparg];
if (oparg == 0) { STACK_SHRINK(oparg); goto pop_1_somewhere; }
STACK_SHRINK(oparg);
STACK_SHRINK(1);
@ -471,18 +473,21 @@ class TestGeneratedCases(unittest.TestCase):
"""
output = """
TARGET(OP) {
PyObject *cc = stack_pointer[-1];
PyObject *input = ((oparg & 1) == 1) ? stack_pointer[-(1 + (((oparg & 1) == 1) ? 1 : 0))] : NULL;
PyObject *aa = stack_pointer[-(2 + (((oparg & 1) == 1) ? 1 : 0))];
PyObject *cc;
PyObject *input = NULL;
PyObject *aa;
PyObject *xx;
PyObject *output = NULL;
PyObject *zz;
cc = stack_pointer[-1];
if ((oparg & 1) == 1) { input = stack_pointer[-1 - ((oparg & 1) == 1 ? 1 : 0)]; }
aa = stack_pointer[-2 - ((oparg & 1) == 1 ? 1 : 0)];
output = spam(oparg, input);
STACK_SHRINK((((oparg & 1) == 1) ? 1 : 0));
STACK_GROW(((oparg & 2) ? 1 : 0));
stack_pointer[-2 - (oparg & 2 ? 1 : 0)] = xx;
if (oparg & 2) { stack_pointer[-1 - (oparg & 2 ? 1 : 0)] = output; }
stack_pointer[-1] = zz;
if (oparg & 2) { stack_pointer[-(1 + ((oparg & 2) ? 1 : 0))] = output; }
stack_pointer[-(2 + ((oparg & 2) ? 1 : 0))] = xx;
DISPATCH();
}
"""
@ -500,29 +505,28 @@ class TestGeneratedCases(unittest.TestCase):
"""
output = """
TARGET(M) {
PyObject *_tmp_1 = stack_pointer[-1];
PyObject *_tmp_2 = stack_pointer[-2];
PyObject *_tmp_3 = stack_pointer[-3];
PyObject *right;
PyObject *middle;
PyObject *left;
PyObject *deep;
PyObject *extra = NULL;
PyObject *res;
// A
right = stack_pointer[-1];
middle = stack_pointer[-2];
left = stack_pointer[-3];
{
PyObject *right = _tmp_1;
PyObject *middle = _tmp_2;
PyObject *left = _tmp_3;
# Body of A
}
// B
{
PyObject *deep;
PyObject *extra = NULL;
PyObject *res;
# Body of B
_tmp_3 = deep;
if (oparg) { _tmp_2 = extra; }
_tmp_1 = res;
}
STACK_SHRINK(1);
STACK_GROW((oparg ? 1 : 0));
stack_pointer[-1] = _tmp_1;
if (oparg) { stack_pointer[-2] = _tmp_2; }
stack_pointer[-3] = _tmp_3;
stack_pointer[-2 - (oparg ? 1 : 0)] = deep;
if (oparg) { stack_pointer[-1 - (oparg ? 1 : 0)] = extra; }
stack_pointer[-1] = res;
DISPATCH();
}
"""

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -13,7 +13,6 @@ from instructions import (
MacroParts,
OverriddenInstructionPlaceHolder,
PseudoInstruction,
StackEffectMapping,
)
import parsing
from parsing import StackEffect
@ -34,11 +33,12 @@ class Analyzer:
input_filenames: list[str]
errors: int = 0
warnings: int = 0
def __init__(self, input_filenames: list[str]):
self.input_filenames = input_filenames
def error(self, msg: str, node: parsing.Node) -> None:
def message(self, msg: str, node: parsing.Node) -> None:
lineno = 0
filename = "<unknown file>"
if context := node.context:
@ -49,8 +49,18 @@ class Analyzer:
if token.kind != "COMMENT":
break
print(f"{filename}:{lineno}: {msg}", file=sys.stderr)
def error(self, msg: str, node: parsing.Node) -> None:
self.message("error: " + msg, node)
self.errors += 1
def warning(self, msg: str, node: parsing.Node) -> None:
self.message("warning: " + msg, node)
self.warnings += 1
def note(self, msg: str, node: parsing.Node) -> None:
self.message("note: " + msg, node)
everything: list[
parsing.InstDef
| parsing.Macro
@ -83,8 +93,15 @@ class Analyzer:
self.parse_file(filename, instrs_idx)
files = " + ".join(self.input_filenames)
n_instrs = 0
n_ops = 0
for instr in self.instrs.values():
if instr.kind == "op":
n_ops += 1
else:
n_instrs += 1
print(
f"Read {len(self.instrs)} instructions/ops, "
f"Read {n_instrs} instructions, {n_ops} ops, "
f"{len(self.macros)} macros, {len(self.pseudos)} pseudos, "
f"and {len(self.families)} families from {files}",
file=sys.stderr,
@ -270,14 +287,70 @@ class Analyzer:
self.macro_instrs = {}
self.pseudo_instrs = {}
for name, macro in self.macros.items():
self.macro_instrs[name] = self.analyze_macro(macro)
self.macro_instrs[name] = mac = self.analyze_macro(macro)
self.check_macro_consistency(mac)
for name, pseudo in self.pseudos.items():
self.pseudo_instrs[name] = self.analyze_pseudo(pseudo)
# TODO: Merge with similar code in stacking.py, write_components()
def check_macro_consistency(self, mac: MacroInstruction) -> None:
def get_var_names(instr: Instruction) -> dict[str, StackEffect]:
vars: dict[str, StackEffect] = {}
for eff in instr.input_effects + instr.output_effects:
if eff.name in vars:
if vars[eff.name] != eff:
self.error(
f"Instruction {instr.name!r} has "
f"inconsistent type/cond/size for variable "
f"{eff.name!r}: {vars[eff.name]} vs {eff}",
instr.inst,
)
else:
vars[eff.name] = eff
return vars
all_vars: dict[str, StackEffect] = {}
# print("Checking", mac.name)
prevop: Instruction | None = None
for part in mac.parts:
if not isinstance(part, Component):
continue
vars = get_var_names(part.instr)
# print(" //", part.instr.name, "//", vars)
for name, eff in vars.items():
if name in all_vars:
if all_vars[name] != eff:
self.error(
f"Macro {mac.name!r} has "
f"inconsistent type/cond/size for variable "
f"{name!r}: "
f"{all_vars[name]} vs {eff} in {part.instr.name!r}",
mac.macro,
)
else:
all_vars[name] = eff
if prevop is not None:
pushes = list(prevop.output_effects)
pops = list(reversed(part.instr.input_effects))
copies: list[tuple[StackEffect, StackEffect]] = []
while pushes and pops and pushes[-1] == pops[0]:
src, dst = pushes.pop(), pops.pop(0)
if src.name == dst.name or dst.name is UNUSED:
continue
copies.append((src, dst))
reads = set(copy[0].name for copy in copies)
writes = set(copy[1].name for copy in copies)
if reads & writes:
self.error(
f"Macro {mac.name!r} has conflicting copies "
f"(source of one copy is destination of another): "
f"{reads & writes}",
mac.macro,
)
prevop = part.instr
def analyze_macro(self, macro: parsing.Macro) -> MacroInstruction:
components = self.check_macro_components(macro)
stack, initial_sp = self.stack_analysis(components)
sp = initial_sp
parts: MacroParts = []
flags = InstructionFlags.newEmpty()
offset = 0
@ -287,20 +360,15 @@ class Analyzer:
parts.append(ceffect)
offset += ceffect.size
case Instruction() as instr:
part, sp, offset = self.analyze_instruction(
instr, stack, sp, offset
)
part, offset = self.analyze_instruction(instr, offset)
parts.append(part)
flags.add(instr.instr_flags)
case _:
typing.assert_never(component)
final_sp = sp
format = "IB"
if offset:
format += "C" + "0" * (offset - 1)
return MacroInstruction(
macro.name, stack, initial_sp, final_sp, format, flags, macro, parts, offset
)
return MacroInstruction(macro.name, format, flags, macro, parts, offset)
def analyze_pseudo(self, pseudo: parsing.Pseudo) -> PseudoInstruction:
targets = [self.instrs[target] for target in pseudo.targets]
@ -312,24 +380,15 @@ class Analyzer:
return PseudoInstruction(pseudo.name, targets, fmts[0], targets[0].instr_flags)
def analyze_instruction(
self, instr: Instruction, stack: list[StackEffect], sp: int, offset: int
) -> tuple[Component, int, int]:
input_mapping: StackEffectMapping = []
for ieffect in reversed(instr.input_effects):
sp -= 1
input_mapping.append((stack[sp], ieffect))
output_mapping: StackEffectMapping = []
for oeffect in instr.output_effects:
output_mapping.append((stack[sp], oeffect))
sp += 1
self, instr: Instruction, offset: int
) -> tuple[Component, int]:
active_effects: list[ActiveCacheEffect] = []
for ceffect in instr.cache_effects:
if ceffect.name != UNUSED:
active_effects.append(ActiveCacheEffect(ceffect, offset))
offset += ceffect.size
return (
Component(instr, input_mapping, output_mapping, active_effects),
sp,
Component(instr, active_effects),
offset,
)
@ -348,65 +407,3 @@ class Analyzer:
case _:
typing.assert_never(uop)
return components
def stack_analysis(
self, components: typing.Iterable[InstructionOrCacheEffect]
) -> tuple[list[StackEffect], int]:
"""Analyze a macro.
Ignore cache effects.
Return the list of variables (as StackEffects) and the initial stack pointer.
"""
lowest = current = highest = 0
conditions: dict[int, str] = {} # Indexed by 'current'.
last_instr: Instruction | None = None
for thing in components:
if isinstance(thing, Instruction):
last_instr = thing
for thing in components:
match thing:
case Instruction() as instr:
if any(
eff.size for eff in instr.input_effects + instr.output_effects
):
# TODO: Eventually this will be needed, at least for macros.
self.error(
f"Instruction {instr.name!r} has variable-sized stack effect, "
"which are not supported in macro instructions",
instr.inst, # TODO: Pass name+location of macro
)
if any(eff.cond for eff in instr.input_effects):
self.error(
f"Instruction {instr.name!r} has conditional input stack effect, "
"which are not supported in macro instructions",
instr.inst, # TODO: Pass name+location of macro
)
if (
any(eff.cond for eff in instr.output_effects)
and instr is not last_instr
):
self.error(
f"Instruction {instr.name!r} has conditional output stack effect, "
"but is not the last instruction in a macro",
instr.inst, # TODO: Pass name+location of macro
)
current -= len(instr.input_effects)
lowest = min(lowest, current)
for eff in instr.output_effects:
if eff.cond:
conditions[current] = eff.cond
current += 1
highest = max(highest, current)
case parsing.CacheEffect():
pass
case _:
typing.assert_never(thing)
# At this point, 'current' is the net stack effect,
# and 'lowest' and 'highest' are the extremes.
# Note that 'lowest' may be negative.
stack = [
StackEffect(f"_tmp_{i}", "", conditions.get(highest - i, ""))
for i in reversed(range(1, highest - lowest + 1))
]
return stack, -lowest

View File

@ -49,9 +49,9 @@ class InstructionFlags:
if value:
setattr(self, name, value)
def names(self, value=None):
def names(self, value=None) -> list[str]:
if value is None:
return dataclasses.asdict(self).keys()
return list(dataclasses.asdict(self).keys())
return [n for n, v in dataclasses.asdict(self).items() if v == value]
def bitmap(self) -> int:

View File

@ -2,7 +2,7 @@ import contextlib
import re
import typing
from parsing import StackEffect
from parsing import StackEffect, Family
UNUSED = "unused"
@ -19,8 +19,11 @@ class Formatter:
nominal_filename: str
def __init__(
self, stream: typing.TextIO, indent: int,
emit_line_directives: bool = False, comment: str = "//",
self,
stream: typing.TextIO,
indent: int,
emit_line_directives: bool = False,
comment: str = "//",
) -> None:
self.stream = stream
self.prefix = " " * indent
@ -93,8 +96,11 @@ class Formatter:
typ = f"{dst.type}" if dst.type else "PyObject *"
if src:
cast = self.cast(dst, src)
init = f" = {cast}{src.name}"
elif dst.cond:
initexpr = f"{cast}{src.name}"
if src.cond and src.cond != "1":
initexpr = f"{parenthesize_cond(src.cond)} ? {initexpr} : NULL"
init = f" = {initexpr}"
elif dst.cond and dst.cond != "1":
init = " = NULL"
else:
init = ""
@ -102,10 +108,7 @@ class Formatter:
self.emit(f"{typ}{sepa}{dst.name}{init};")
def assign(self, dst: StackEffect, src: StackEffect):
if src.name == UNUSED:
return
if src.size:
# Don't write sized arrays -- it's up to the user code.
if src.name == UNUSED or dst.name == UNUSED:
return
cast = self.cast(dst, src)
if re.match(r"^REG\(oparg(\d+)\)$", dst.name):
@ -122,6 +125,23 @@ class Formatter:
def cast(self, dst: StackEffect, src: StackEffect) -> str:
return f"({dst.type or 'PyObject *'})" if src.type != dst.type else ""
def static_assert_family_size(
self, name: str, family: Family | None, cache_offset: int
) -> None:
"""Emit a static_assert for the size of a family, if known.
This will fail at compile time if the cache size computed from
the instruction definition does not match the size of the struct
used by specialize.c.
"""
if family and name == family.name:
cache_size = family.size
if cache_size:
self.emit(
f"static_assert({cache_size} == {cache_offset}, "
f'"incorrect cache size");'
)
def prettify_filename(filename: str) -> str:
# Make filename more user-friendly and less platform-specific,
@ -178,11 +198,8 @@ def maybe_parenthesize(sym: str) -> str:
return f"({sym})"
def string_effect_size(arg: tuple[int, str]) -> str:
numeric, symbolic = arg
if numeric and symbolic:
return f"{numeric} + {symbolic}"
elif symbolic:
return symbolic
else:
return str(numeric)
def parenthesize_cond(cond: str) -> str:
"""Parenthesize a condition, but only if it contains ?: itself."""
if "?" in cond:
cond = f"({cond})"
return cond

View File

@ -4,14 +4,14 @@ Writes the cases to generated_cases.c.h, which is #included in ceval.c.
"""
import argparse
import contextlib
import os
import posixpath
import sys
import typing
import stacking # Early import to avoid circular import
from analysis import Analyzer
from formatting import Formatter, list_effect_size, maybe_parenthesize
from formatting import Formatter, list_effect_size
from flags import InstructionFlags, variable_used
from instructions import (
AnyInstruction,
@ -118,41 +118,7 @@ class Generator(Analyzer):
pushed = ""
case parsing.Macro():
instr = self.macro_instrs[thing.name]
parts = [comp for comp in instr.parts if isinstance(comp, Component)]
# Note: stack_analysis() already verifies that macro components
# have no variable-sized stack effects.
low = 0
sp = 0
high = 0
pushed_symbolic: list[str] = []
for comp in parts:
for effect in comp.instr.input_effects:
assert not effect.cond, effect
assert not effect.size, effect
sp -= 1
low = min(low, sp)
for effect in comp.instr.output_effects:
assert not effect.size, effect
if effect.cond:
if effect.cond in ("0", "1"):
pushed_symbolic.append(effect.cond)
else:
pushed_symbolic.append(
maybe_parenthesize(
f"{maybe_parenthesize(effect.cond)} ? 1 : 0"
)
)
sp += 1
high = max(sp, high)
if high != max(0, sp):
# If you get this, intermediate stack growth occurs,
# and stack size calculations may go awry.
# E.g. [push, pop]. The fix would be for stack size
# calculations to use the micro ops.
self.error("Macro has virtual stack growth", thing)
popped = str(-low)
pushed_symbolic.append(str(sp - low - len(pushed_symbolic)))
pushed = " + ".join(pushed_symbolic)
popped, pushed = stacking.get_stack_effect_info_for_macro(instr)
case parsing.Pseudo():
instr = self.pseudo_instrs[thing.name]
popped = pushed = None
@ -258,7 +224,8 @@ class Generator(Analyzer):
case _:
typing.assert_never(thing)
all_formats.add(format)
# Turn it into a list of enum definitions.
# Turn it into a sorted list of enum values.
format_enums = [INSTR_FMT_PREFIX + format for format in sorted(all_formats)]
with open(metadata_filename, "w") as f:
@ -276,8 +243,10 @@ class Generator(Analyzer):
self.write_stack_effect_functions()
# Write type definitions
self.out.emit(f"enum InstructionFormat {{ {', '.join(format_enums)} }};")
# Write the enum definition for instruction formats.
with self.out.block("enum InstructionFormat", ";"):
for enum in format_enums:
self.out.emit(enum + ",")
self.out.emit("")
self.out.emit(
@ -374,7 +343,7 @@ class Generator(Analyzer):
# Since an 'op' is not a bytecode, it has no expansion; but 'inst' is
if instr.kind == "inst" and instr.is_viable_uop():
# Construct a dummy Component -- input/output mappings are not used
part = Component(instr, [], [], instr.active_caches)
part = Component(instr, instr.active_caches)
self.write_macro_expansions(instr.name, [part])
elif instr.kind == "inst" and variable_used(
instr.inst, "oparg1"
@ -468,7 +437,15 @@ class Generator(Analyzer):
if isinstance(part, Component):
# All component instructions must be viable uops
if not part.instr.is_viable_uop():
print(f"NOTE: Part {part.instr.name} of {name} is not a viable uop")
# This note just reminds us about macros that cannot
# be expanded to Tier 2 uops. It is not an error.
# It is sometimes emitted for macros that have a
# manual translation in translate_bytecode_to_trace()
# in Python/optimizer.c.
self.note(
f"Part {part.instr.name} of {name} is not a viable uop",
part.instr.inst,
)
return
if not part.active_caches:
size, offset = OPARG_SIZES["OPARG_FULL"], 0
@ -512,7 +489,7 @@ class Generator(Analyzer):
instr2 = self.instrs[name2]
assert not instr1.active_caches, f"{name1} has active caches"
assert not instr2.active_caches, f"{name2} has active caches"
expansions = [
expansions: list[tuple[str, int, int]] = [
(name1, OPARG_SIZES["OPARG_TOP"], 0),
(name2, OPARG_SIZES["OPARG_BOTTOM"], 0),
]
@ -563,7 +540,6 @@ class Generator(Analyzer):
# Write and count instructions of all kinds
n_instrs = 0
n_macros = 0
n_pseudos = 0
for thing in self.everything:
match thing:
case OverriddenInstructionPlaceHolder():
@ -574,15 +550,17 @@ class Generator(Analyzer):
self.write_instr(self.instrs[thing.name])
case parsing.Macro():
n_macros += 1
self.write_macro(self.macro_instrs[thing.name])
mac = self.macro_instrs[thing.name]
stacking.write_macro_instr(mac, self.out, self.families.get(mac.name))
# self.write_macro(self.macro_instrs[thing.name])
case parsing.Pseudo():
n_pseudos += 1
pass
case _:
typing.assert_never(thing)
print(
f"Wrote {n_instrs} instructions, {n_macros} macros, "
f"and {n_pseudos} pseudos to {output_filename}",
f"Wrote {n_instrs} instructions and {n_macros} macros "
f"to {output_filename}",
file=sys.stderr,
)
@ -590,6 +568,8 @@ class Generator(Analyzer):
self, executor_filename: str, emit_line_directives: bool
) -> None:
"""Generate cases for the Tier 2 interpreter."""
n_instrs = 0
n_uops = 0
with open(executor_filename, "w") as f:
self.out = Formatter(f, 8, emit_line_directives)
self.write_provenance_header()
@ -601,6 +581,10 @@ class Generator(Analyzer):
case parsing.InstDef():
instr = self.instrs[thing.name]
if instr.is_viable_uop():
if instr.kind == "op":
n_uops += 1
else:
n_instrs += 1
self.out.emit("")
with self.out.block(f"case {thing.name}:"):
instr.write(self.out, tier=TIER_TWO)
@ -616,7 +600,7 @@ class Generator(Analyzer):
case _:
typing.assert_never(thing)
print(
f"Wrote some stuff to {executor_filename}",
f"Wrote {n_instrs} instructions and {n_uops} ops to {executor_filename}",
file=sys.stderr,
)
@ -642,69 +626,6 @@ class Generator(Analyzer):
self.out.emit("CHECK_EVAL_BREAKER();")
self.out.emit(f"DISPATCH();")
def write_macro(self, mac: MacroInstruction) -> None:
"""Write code for a macro instruction."""
last_instr: Instruction | None = None
with self.wrap_macro(mac):
cache_adjust = 0
for part in mac.parts:
match part:
case parsing.CacheEffect(size=size):
cache_adjust += size
case Component() as comp:
last_instr = comp.instr
comp.write_body(self.out)
cache_adjust += comp.instr.cache_offset
if cache_adjust:
self.out.emit(f"next_instr += {cache_adjust};")
if (
(family := self.families.get(mac.name))
and mac.name == family.name
and (cache_size := family.size)
):
self.out.emit(
f"static_assert({cache_size} == "
f'{cache_adjust}, "incorrect cache size");'
)
@contextlib.contextmanager
def wrap_macro(self, mac: MacroInstruction):
"""Boilerplate for macro instructions."""
# TODO: Somewhere (where?) make it so that if one instruction
# has an output that is input to another, and the variable names
# and types match and don't conflict with other instructions,
# that variable is declared with the right name and type in the
# outer block, rather than trusting the compiler to optimize it.
self.out.emit("")
with self.out.block(f"TARGET({mac.name})"):
if mac.predicted:
self.out.emit(f"PREDICTED({mac.name});")
# The input effects should have no conditionals.
# Only the output effects do (for now).
ieffects = [
StackEffect(eff.name, eff.type) if eff.cond else eff
for eff in mac.stack
]
for i, var in reversed(list(enumerate(ieffects))):
src = None
if i < mac.initial_sp:
src = StackEffect(f"stack_pointer[-{mac.initial_sp - i}]", "")
self.out.declare(var, src)
yield
self.out.stack_adjust(ieffects[: mac.initial_sp], mac.stack[: mac.final_sp])
for i, var in enumerate(reversed(mac.stack[: mac.final_sp]), 1):
dst = StackEffect(f"stack_pointer[-{i}]", "")
self.out.assign(dst, var)
self.out.emit(f"DISPATCH();")
def main():
"""Parse command line, parse input, analyze, write output."""

View File

@ -2,17 +2,16 @@ import dataclasses
import re
import typing
from flags import InstructionFlags, variable_used_unspecialized
from flags import InstructionFlags, variable_used, variable_used_unspecialized
from formatting import (
Formatter,
UNUSED,
string_effect_size,
list_effect_size,
maybe_parenthesize,
)
import lexer as lx
import parsing
from parsing import StackEffect
import stacking
BITS_PER_CODE_UNIT = 16
@ -61,6 +60,7 @@ class Instruction:
# Computed by constructor
always_exits: bool
has_deopt: bool
cache_offset: int
cache_effects: list[parsing.CacheEffect]
input_effects: list[StackEffect]
@ -83,6 +83,7 @@ class Instruction:
self.block
)
self.always_exits = always_exits(self.block_text)
self.has_deopt = variable_used(self.inst, "DEOPT_IF")
self.cache_effects = [
effect for effect in inst.inputs if isinstance(effect, parsing.CacheEffect)
]
@ -93,7 +94,7 @@ class Instruction:
self.output_effects = inst.outputs # For consistency/completeness
unmoved_names: set[str] = set()
for ieffect, oeffect in zip(self.input_effects, self.output_effects):
if ieffect.name == oeffect.name:
if ieffect == oeffect and ieffect.name == oeffect.name:
unmoved_names.add(ieffect.name)
else:
break
@ -141,84 +142,17 @@ class Instruction:
def write(self, out: Formatter, tier: Tiers = TIER_ONE) -> None:
"""Write one instruction, sans prologue and epilogue."""
# Write a static assertion that a family's cache size is correct
if family := self.family:
if self.name == family.name:
if cache_size := family.size:
out.emit(
f"static_assert({cache_size} == "
f'{self.cache_offset}, "incorrect cache size");'
)
out.static_assert_family_size(self.name, self.family, self.cache_offset)
# Write input stack effect variable declarations and initializations
ieffects = list(reversed(self.input_effects))
for i, ieffect in enumerate(ieffects):
isize = string_effect_size(
list_effect_size([ieff for ieff in ieffects[: i + 1]])
)
if ieffect.size:
src = StackEffect(
f"(stack_pointer - {maybe_parenthesize(isize)})", "PyObject **"
)
elif ieffect.cond:
src = StackEffect(
f"({ieffect.cond}) ? stack_pointer[-{maybe_parenthesize(isize)}] : NULL",
"",
)
else:
src = StackEffect(f"stack_pointer[-{maybe_parenthesize(isize)}]", "")
out.declare(ieffect, src)
# Write output stack effect variable declarations
isize = string_effect_size(list_effect_size(self.input_effects))
input_names = {ieffect.name for ieffect in self.input_effects}
for i, oeffect in enumerate(self.output_effects):
if oeffect.name not in input_names:
if oeffect.size:
osize = string_effect_size(
list_effect_size([oeff for oeff in self.output_effects[:i]])
)
offset = "stack_pointer"
if isize != osize:
if isize != "0":
offset += f" - ({isize})"
if osize != "0":
offset += f" + {osize}"
src = StackEffect(offset, "PyObject **")
out.declare(oeffect, src)
else:
out.declare(oeffect, None)
# out.emit(f"next_instr += OPSIZE({self.inst.name}) - 1;")
self.write_body(out, 0, self.active_caches, tier=tier)
stacking.write_single_instr(self, out, tier)
# Skip the rest if the block always exits
if self.always_exits:
return
# Write net stack growth/shrinkage
out.stack_adjust(
[ieff for ieff in self.input_effects],
[oeff for oeff in self.output_effects],
)
# Write output stack effect assignments
oeffects = list(reversed(self.output_effects))
for i, oeffect in enumerate(oeffects):
if oeffect.name in self.unmoved_names:
continue
osize = string_effect_size(
list_effect_size([oeff for oeff in oeffects[: i + 1]])
)
if oeffect.size:
dst = StackEffect(
f"stack_pointer - {maybe_parenthesize(osize)}", "PyObject **"
)
else:
dst = StackEffect(f"stack_pointer[-{maybe_parenthesize(osize)}]", "")
out.assign(dst, oeffect)
# Write cache effect
if tier == TIER_ONE and self.cache_offset:
out.emit(f"next_instr += {self.cache_offset};")
@ -274,7 +208,12 @@ class Instruction:
# These aren't DECREF'ed so they can stay.
ieffs = list(self.input_effects)
oeffs = list(self.output_effects)
while ieffs and oeffs and ieffs[0] == oeffs[0]:
while (
ieffs
and oeffs
and ieffs[0] == oeffs[0]
and ieffs[0].name == oeffs[0].name
):
ieffs.pop(0)
oeffs.pop(0)
ninputs, symbolic = list_effect_size(ieffs)
@ -307,30 +246,13 @@ class Instruction:
InstructionOrCacheEffect = Instruction | parsing.CacheEffect
StackEffectMapping = list[tuple[StackEffect, StackEffect]]
@dataclasses.dataclass
class Component:
instr: Instruction
input_mapping: StackEffectMapping
output_mapping: StackEffectMapping
active_caches: list[ActiveCacheEffect]
def write_body(self, out: Formatter) -> None:
with out.block(""):
input_names = {ieffect.name for _, ieffect in self.input_mapping}
for var, ieffect in self.input_mapping:
out.declare(ieffect, var)
for _, oeffect in self.output_mapping:
if oeffect.name not in input_names:
out.declare(oeffect, None)
self.instr.write_body(out, -4, self.active_caches)
for var, oeffect in self.output_mapping:
out.assign(var, oeffect)
MacroParts = list[Component | parsing.CacheEffect]
@ -340,9 +262,6 @@ class MacroInstruction:
"""A macro instruction."""
name: str
stack: list[StackEffect]
initial_sp: int
final_sp: int
instr_fmt: str
instr_flags: InstructionFlags
macro: parsing.Macro

View File

@ -69,12 +69,18 @@ class Block(Node):
@dataclass
class StackEffect(Node):
name: str
name: str = field(compare=False) # __eq__ only uses type, cond, size
type: str = "" # Optional `:type`
cond: str = "" # Optional `if (cond)`
size: str = "" # Optional `[size]`
# Note: size cannot be combined with type or cond
def __repr__(self):
items = [self.name, self.type, self.cond, self.size]
while items and items[-1] == "":
del items[-1]
return f"StackEffect({', '.join(repr(item) for item in items)})"
@dataclass
class Expression(Node):
@ -130,6 +136,7 @@ class Family(Node):
size: str # Variable giving the cache size in code units
members: list[str]
@dataclass
class Pseudo(Node):
name: str
@ -154,7 +161,13 @@ class Parser(PLexer):
if hdr := self.inst_header():
if block := self.block():
return InstDef(
hdr.override, hdr.register, hdr.kind, hdr.name, hdr.inputs, hdr.outputs, block
hdr.override,
hdr.register,
hdr.kind,
hdr.name,
hdr.inputs,
hdr.outputs,
block,
)
raise self.make_syntax_error("Expected block")
return None
@ -371,9 +384,7 @@ class Parser(PLexer):
raise self.make_syntax_error("Expected {")
if members := self.members():
if self.expect(lx.RBRACE) and self.expect(lx.SEMI):
return Pseudo(
tkn.text, members
)
return Pseudo(tkn.text, members)
return None
def members(self) -> list[str] | None:

View File

@ -1,4 +1,5 @@
import lexer as lx
Token = lx.Token
@ -64,7 +65,9 @@ class PLexer:
tkn = self.next()
if tkn is not None and tkn.kind == kind:
return tkn
raise self.make_syntax_error(f"Expected {kind!r} but got {tkn and tkn.text!r}", tkn)
raise self.make_syntax_error(
f"Expected {kind!r} but got {tkn and tkn.text!r}", tkn
)
def extract_line(self, lineno: int) -> str:
# Return source line `lineno` (1-based)
@ -73,18 +76,20 @@ class PLexer:
return ""
return lines[lineno - 1]
def make_syntax_error(self, message: str, tkn: Token|None = None) -> SyntaxError:
def make_syntax_error(self, message: str, tkn: Token | None = None) -> SyntaxError:
# Construct a SyntaxError instance from message and token
if tkn is None:
tkn = self.peek()
if tkn is None:
tkn = self.tokens[-1]
return lx.make_syntax_error(message,
self.filename, tkn.line, tkn.column, self.extract_line(tkn.line))
return lx.make_syntax_error(
message, self.filename, tkn.line, tkn.column, self.extract_line(tkn.line)
)
if __name__ == "__main__":
import sys
if sys.argv[1:]:
filename = sys.argv[1]
if filename == "-c" and sys.argv[2:]:

View File

@ -0,0 +1,400 @@
import dataclasses
import typing
from formatting import (
Formatter,
UNUSED,
maybe_parenthesize,
parenthesize_cond,
)
from instructions import (
ActiveCacheEffect,
Instruction,
MacroInstruction,
Component,
Tiers,
TIER_ONE,
)
from parsing import StackEffect, CacheEffect, Family
@dataclasses.dataclass
class StackOffset:
"""Represent the stack offset for a PEEK or POKE.
- At stack_pointer[0], deep and high are both empty.
(Note that that is an invalid stack reference.)
- Below stack top, only deep is non-empty.
- Above stack top, only high is non-empty.
- In complex cases, both deep and high may be non-empty.
All this would be much simpler if all stack entries were the same
size, but with conditional and array effects, they aren't.
The offsets are each represented by a list of StackEffect objects.
The name in the StackEffects is unused.
"""
deep: list[StackEffect] = dataclasses.field(default_factory=list)
high: list[StackEffect] = dataclasses.field(default_factory=list)
def clone(self) -> "StackOffset":
return StackOffset(list(self.deep), list(self.high))
def negate(self) -> "StackOffset":
return StackOffset(list(self.high), list(self.deep))
def deeper(self, eff: StackEffect) -> None:
if eff in self.high:
self.high.remove(eff)
else:
self.deep.append(eff)
def higher(self, eff: StackEffect) -> None:
if eff in self.deep:
self.deep.remove(eff)
else:
self.high.append(eff)
def as_terms(self) -> list[tuple[str, str]]:
num = 0
terms: list[tuple[str, str]] = []
for eff in self.deep:
if eff.size:
terms.append(("-", maybe_parenthesize(eff.size)))
elif eff.cond and eff.cond != "1":
terms.append(("-", f"({parenthesize_cond(eff.cond)} ? 1 : 0)"))
elif eff.cond != "0":
num -= 1
for eff in self.high:
if eff.size:
terms.append(("+", maybe_parenthesize(eff.size)))
elif eff.cond and eff.cond != "1":
terms.append(("+", f"({parenthesize_cond(eff.cond)} ? 1 : 0)"))
elif eff.cond != "0":
num += 1
if num < 0:
terms.insert(0, ("-", str(-num)))
elif num > 0:
terms.append(("+", str(num)))
return terms
def as_index(self) -> str:
terms = self.as_terms()
return make_index(terms)
def make_index(terms: list[tuple[str, str]]) -> str:
# Produce an index expression from the terms honoring PEP 8,
# surrounding binary ops with spaces but not unary minus
index = ""
for sign, term in terms:
if index:
index += f" {sign} {term}"
elif sign == "+":
index = term
else:
index = sign + term
return index or "0"
@dataclasses.dataclass
class StackItem:
offset: StackOffset
effect: StackEffect
def as_variable(self, lax: bool = False) -> str:
"""Return e.g. stack_pointer[-1]."""
terms = self.offset.as_terms()
if self.effect.size:
terms.insert(0, ("+", "stack_pointer"))
index = make_index(terms)
if self.effect.size:
res = index
else:
res = f"stack_pointer[{index}]"
if not lax:
# Check that we're not reading or writing above stack top.
# Skip this for output variable initialization (lax=True).
assert (
self.effect in self.offset.deep and not self.offset.high
), f"Push or pop above current stack level: {res}"
return res
@dataclasses.dataclass
class CopyEffect:
src: StackEffect
dst: StackEffect
class EffectManager:
"""Manage stack effects and offsets for an instruction."""
instr: Instruction
active_caches: list[ActiveCacheEffect]
peeks: list[StackItem]
pokes: list[StackItem]
copies: list[CopyEffect] # See merge()
# Track offsets from stack pointer
min_offset: StackOffset
final_offset: StackOffset
def __init__(
self,
instr: Instruction,
active_caches: list[ActiveCacheEffect],
pred: "EffectManager | None" = None,
):
self.instr = instr
self.active_caches = active_caches
self.peeks = []
self.pokes = []
self.copies = []
self.final_offset = pred.final_offset.clone() if pred else StackOffset()
for eff in reversed(instr.input_effects):
self.final_offset.deeper(eff)
self.peeks.append(StackItem(offset=self.final_offset.clone(), effect=eff))
self.min_offset = self.final_offset.clone()
for eff in instr.output_effects:
self.pokes.append(StackItem(offset=self.final_offset.clone(), effect=eff))
self.final_offset.higher(eff)
if pred:
# Replace push(x) + pop(y) with copy(x, y).
# Check that the sources and destinations are disjoint.
sources: set[str] = set()
destinations: set[str] = set()
while (
pred.pokes
and self.peeks
and pred.pokes[-1].effect == self.peeks[-1].effect
):
src = pred.pokes.pop(-1).effect
dst = self.peeks.pop(0).effect
pred.final_offset.deeper(src)
if dst.name != UNUSED:
destinations.add(dst.name)
if dst.name != src.name:
sources.add(src.name)
self.copies.append(CopyEffect(src, dst))
# TODO: Turn this into an error (pass an Analyzer instance?)
assert sources & destinations == set(), (
pred.instr.name,
self.instr.name,
sources,
destinations,
)
def adjust_deeper(self, eff: StackEffect) -> None:
for peek in self.peeks:
peek.offset.deeper(eff)
for poke in self.pokes:
poke.offset.deeper(eff)
self.min_offset.deeper(eff)
self.final_offset.deeper(eff)
def adjust_higher(self, eff: StackEffect) -> None:
for peek in self.peeks:
peek.offset.higher(eff)
for poke in self.pokes:
poke.offset.higher(eff)
self.min_offset.higher(eff)
self.final_offset.higher(eff)
def adjust(self, offset: StackOffset) -> None:
for down in offset.deep:
self.adjust_deeper(down)
for up in offset.high:
self.adjust_higher(up)
def adjust_inverse(self, offset: StackOffset) -> None:
for down in offset.deep:
self.adjust_higher(down)
for up in offset.high:
self.adjust_deeper(up)
def collect_vars(self) -> dict[str, StackEffect]:
"""Collect all variables, skipping unused ones."""
vars: dict[str, StackEffect] = {}
def add(eff: StackEffect) -> None:
if eff.name != UNUSED:
if eff.name in vars:
# TODO: Make this an error
assert vars[eff.name] == eff, (
self.instr.name,
eff.name,
vars[eff.name],
eff,
)
else:
vars[eff.name] = eff
for copy in self.copies:
add(copy.src)
add(copy.dst)
for peek in self.peeks:
add(peek.effect)
for poke in self.pokes:
add(poke.effect)
return vars
def less_than(a: StackOffset, b: StackOffset) -> bool:
# TODO: Handle more cases
if a.high != b.high:
return False
return a.deep[: len(b.deep)] == b.deep
def get_managers(parts: list[Component]) -> list[EffectManager]:
managers: list[EffectManager] = []
pred: EffectManager | None = None
for part in parts:
mgr = EffectManager(part.instr, part.active_caches, pred)
managers.append(mgr)
pred = mgr
return managers
def get_stack_effect_info_for_macro(mac: MacroInstruction) -> tuple[str, str]:
"""Get the stack effect info for a macro instruction.
Returns a tuple (popped, pushed) where each is a string giving a
symbolic expression for the number of values popped/pushed.
"""
parts = [part for part in mac.parts if isinstance(part, Component)]
managers = get_managers(parts)
popped = StackOffset()
for mgr in managers:
if less_than(mgr.min_offset, popped):
popped = mgr.min_offset.clone()
# Compute pushed = final - popped
pushed = managers[-1].final_offset.clone()
for effect in popped.deep:
pushed.higher(effect)
for effect in popped.high:
pushed.deeper(effect)
return popped.negate().as_index(), pushed.as_index()
def write_single_instr(
instr: Instruction, out: Formatter, tier: Tiers = TIER_ONE
) -> None:
try:
write_components(
[Component(instr, instr.active_caches)],
out,
tier,
)
except AssertionError as err:
raise AssertionError(f"Error writing instruction {instr.name}") from err
def write_macro_instr(
mac: MacroInstruction, out: Formatter, family: Family | None
) -> None:
parts = [part for part in mac.parts if isinstance(part, Component)]
cache_adjust = 0
for part in mac.parts:
match part:
case CacheEffect(size=size):
cache_adjust += size
case Component(instr=instr):
cache_adjust += instr.cache_offset
case _:
typing.assert_never(part)
out.emit("")
with out.block(f"TARGET({mac.name})"):
if mac.predicted:
out.emit(f"PREDICTED({mac.name});")
out.static_assert_family_size(mac.name, family, cache_adjust)
try:
write_components(parts, out, TIER_ONE)
except AssertionError as err:
raise AssertionError(f"Error writing macro {mac.name}") from err
if cache_adjust:
out.emit(f"next_instr += {cache_adjust};")
out.emit("DISPATCH();")
def write_components(
parts: list[Component],
out: Formatter,
tier: Tiers,
) -> None:
managers = get_managers(parts)
all_vars: dict[str, StackEffect] = {}
for mgr in managers:
for name, eff in mgr.collect_vars().items():
if name in all_vars:
# TODO: Turn this into an error -- variable conflict
assert all_vars[name] == eff, (
name,
mgr.instr.name,
all_vars[name],
eff,
)
else:
all_vars[name] = eff
# Declare all variables
for name, eff in all_vars.items():
out.declare(eff, None)
for mgr in managers:
if len(parts) > 1:
out.emit(f"// {mgr.instr.name}")
for copy in mgr.copies:
if copy.src.name != copy.dst.name:
out.assign(copy.dst, copy.src)
for peek in mgr.peeks:
out.assign(
peek.effect,
StackEffect(
peek.as_variable(),
peek.effect.type,
peek.effect.cond,
peek.effect.size,
),
)
# Initialize array outputs
for poke in mgr.pokes:
if poke.effect.size and poke.effect.name not in mgr.instr.unmoved_names:
out.assign(
poke.effect,
StackEffect(
poke.as_variable(lax=True),
poke.effect.type,
poke.effect.cond,
poke.effect.size,
),
)
if len(parts) == 1:
mgr.instr.write_body(out, 0, mgr.active_caches, tier)
else:
with out.block(""):
mgr.instr.write_body(out, -4, mgr.active_caches, tier)
if mgr is managers[-1]:
out.stack_adjust(mgr.final_offset.deep, mgr.final_offset.high)
# Use clone() since adjust_inverse() mutates final_offset
mgr.adjust_inverse(mgr.final_offset.clone())
for poke in mgr.pokes:
if not poke.effect.size and poke.effect.name not in mgr.instr.unmoved_names:
out.assign(
StackEffect(
poke.as_variable(),
poke.effect.type,
poke.effect.cond,
poke.effect.size,
),
poke.effect,
)
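
Assuming the working directory is Tools/cases_generator (so that parsing.py and the new stacking.py above are importable), a small usage sketch of StackOffset follows. Because StackEffect equality now ignores the name (see the field(compare=False) change above), a pushed effect cancels a matching popped effect in the offset arithmetic; the printed indices should come out as "-2 - oparg" and then "-1 - oparg":

from parsing import StackEffect
from stacking import StackOffset

off = StackOffset()
off.deeper(StackEffect("right"))                    # pop a plain value
off.deeper(StackEffect("left"))                     # pop another plain value
off.deeper(StackEffect("values", "", "", "oparg"))  # pop an array of oparg values
print(off.as_index())  # -2 - oparg

off.higher(StackEffect("res"))  # push one value; cancels one plain pop
print(off.as_index())  # -1 - oparg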