From 61c7249759ce88465ea655d5c19d17d03ff3f74b Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Thu, 17 Aug 2023 11:29:58 -0700 Subject: [PATCH] gh-106581: Project through calls (#108067) This finishes the work begun in gh-107760. When, while projecting a superblock, we encounter a call to a short, simple function, the superblock will now enter the function using `_PUSH_FRAME`, continue through it, and leave it using `_POP_FRAME`, and then continue through the original code. Multiple frame pushes and pops are even possible. It is also possible to stop appending to the superblock in the middle of a called function, when running out of space or encountering an unsupported bytecode. --- Include/internal/pycore_ceval.h | 1 + Include/internal/pycore_function.h | 9 ++ Include/internal/pycore_opcode_metadata.h | 87 +++++++++-------- Lib/test/test_capi/test_misc.py | 1 + Lib/test/test_code.py | 2 +- Objects/codeobject.c | 3 + Objects/funcobject.c | 79 +++++++++++++++- Python/abstract_interp_cases.c.h | 9 ++ Python/bytecodes.c | 56 ++++++----- Python/ceval.c | 14 +-- Python/ceval_macros.h | 4 + Python/executor_cases.c.h | 46 ++++++++- Python/generated_cases.c.h | 109 ++++++++++++++++------ Python/optimizer.c | 90 +++++++++++++++++- Tools/cases_generator/analysis.py | 6 +- Tools/cases_generator/stacking.py | 2 +- 16 files changed, 409 insertions(+), 109 deletions(-) diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h index 05b73805978..0e3a99be8c3 100644 --- a/Include/internal/pycore_ceval.h +++ b/Include/internal/pycore_ceval.h @@ -171,6 +171,7 @@ void _PyEval_FormatKwargsError(PyThreadState *tstate, PyObject *func, PyObject * PyObject *_PyEval_MatchClass(PyThreadState *tstate, PyObject *subject, PyObject *type, Py_ssize_t nargs, PyObject *kwargs); PyObject *_PyEval_MatchKeys(PyThreadState *tstate, PyObject *map, PyObject *keys); int _PyEval_UnpackIterable(PyThreadState *tstate, PyObject *v, int argcnt, int argcntafter, PyObject **sp); +void _PyEval_FrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame *frame); #ifdef __cplusplus diff --git a/Include/internal/pycore_function.h b/Include/internal/pycore_function.h index e844d323ec7..3f3da8a44b7 100644 --- a/Include/internal/pycore_function.h +++ b/Include/internal/pycore_function.h @@ -16,13 +16,22 @@ extern PyObject* _PyFunction_Vectorcall( #define FUNC_MAX_WATCHERS 8 +#define FUNC_VERSION_CACHE_SIZE (1<<12) /* Must be a power of 2 */ struct _py_func_state { uint32_t next_version; + // Borrowed references to function objects whose + // func_version % FUNC_VERSION_CACHE_SIZE + // once was equal to the index in the table. + // They are cleared when the function is deallocated. + PyFunctionObject *func_version_cache[FUNC_VERSION_CACHE_SIZE]; }; extern PyFunctionObject* _PyFunction_FromConstructor(PyFrameConstructor *constr); extern uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func); +extern void _PyFunction_SetVersion(PyFunctionObject *func, uint32_t version); +PyFunctionObject *_PyFunction_LookupByVersion(uint32_t version); + extern PyObject *_Py_set_function_type_params( PyThreadState* unused, PyObject *func, PyObject *type_params); diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index afe8aa172b7..396d194ed27 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -33,35 +33,36 @@ #define _BINARY_OP_SUBTRACT_FLOAT 309 #define _GUARD_BOTH_UNICODE 310 #define _BINARY_OP_ADD_UNICODE 311 -#define _LOAD_LOCALS 312 -#define _LOAD_FROM_DICT_OR_GLOBALS 313 -#define _GUARD_GLOBALS_VERSION 314 -#define _GUARD_BUILTINS_VERSION 315 -#define _LOAD_GLOBAL_MODULE 316 -#define _LOAD_GLOBAL_BUILTINS 317 -#define _GUARD_TYPE_VERSION 318 -#define _CHECK_MANAGED_OBJECT_HAS_VALUES 319 -#define _LOAD_ATTR_INSTANCE_VALUE 320 -#define IS_NONE 321 -#define _ITER_CHECK_LIST 322 -#define _IS_ITER_EXHAUSTED_LIST 323 -#define _ITER_NEXT_LIST 324 -#define _ITER_CHECK_TUPLE 325 -#define _IS_ITER_EXHAUSTED_TUPLE 326 -#define _ITER_NEXT_TUPLE 327 -#define _ITER_CHECK_RANGE 328 -#define _IS_ITER_EXHAUSTED_RANGE 329 -#define _ITER_NEXT_RANGE 330 -#define _CHECK_PEP_523 331 -#define _CHECK_FUNCTION_EXACT_ARGS 332 -#define _CHECK_STACK_SPACE 333 -#define _INIT_CALL_PY_EXACT_ARGS 334 -#define _PUSH_FRAME 335 -#define _POP_JUMP_IF_FALSE 336 -#define _POP_JUMP_IF_TRUE 337 -#define JUMP_TO_TOP 338 -#define SAVE_CURRENT_IP 339 -#define INSERT 340 +#define _POP_FRAME 312 +#define _LOAD_LOCALS 313 +#define _LOAD_FROM_DICT_OR_GLOBALS 314 +#define _GUARD_GLOBALS_VERSION 315 +#define _GUARD_BUILTINS_VERSION 316 +#define _LOAD_GLOBAL_MODULE 317 +#define _LOAD_GLOBAL_BUILTINS 318 +#define _GUARD_TYPE_VERSION 319 +#define _CHECK_MANAGED_OBJECT_HAS_VALUES 320 +#define _LOAD_ATTR_INSTANCE_VALUE 321 +#define IS_NONE 322 +#define _ITER_CHECK_LIST 323 +#define _IS_ITER_EXHAUSTED_LIST 324 +#define _ITER_NEXT_LIST 325 +#define _ITER_CHECK_TUPLE 326 +#define _IS_ITER_EXHAUSTED_TUPLE 327 +#define _ITER_NEXT_TUPLE 328 +#define _ITER_CHECK_RANGE 329 +#define _IS_ITER_EXHAUSTED_RANGE 330 +#define _ITER_NEXT_RANGE 331 +#define _CHECK_PEP_523 332 +#define _CHECK_FUNCTION_EXACT_ARGS 333 +#define _CHECK_STACK_SPACE 334 +#define _INIT_CALL_PY_EXACT_ARGS 335 +#define _PUSH_FRAME 336 +#define _POP_JUMP_IF_FALSE 337 +#define _POP_JUMP_IF_TRUE 338 +#define JUMP_TO_TOP 339 +#define SAVE_CURRENT_IP 340 +#define INSERT 341 extern int _PyOpcode_num_popped(int opcode, int oparg, bool jump); #ifdef NEED_OPCODE_METADATA @@ -197,6 +198,8 @@ int _PyOpcode_num_popped(int opcode, int oparg, bool jump) { return oparg; case INTERPRETER_EXIT: return 1; + case _POP_FRAME: + return 1; case RETURN_VALUE: return 1; case INSTRUMENTED_RETURN_VALUE: @@ -723,6 +726,8 @@ int _PyOpcode_num_pushed(int opcode, int oparg, bool jump) { return 0; case INTERPRETER_EXIT: return 0; + case _POP_FRAME: + return 0; case RETURN_VALUE: return 0; case INSTRUMENTED_RETURN_VALUE: @@ -1191,7 +1196,7 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[OPCODE_METADATA_SIZE] = { [STORE_FAST_STORE_FAST] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_LOCAL_FLAG }, [POP_TOP] = { true, INSTR_FMT_IX, 0 }, [PUSH_NULL] = { true, INSTR_FMT_IX, 0 }, - [END_FOR] = { true, INSTR_FMT_IB, 0 }, + [END_FOR] = { true, INSTR_FMT_IX, 0 }, [INSTRUMENTED_END_FOR] = { true, INSTR_FMT_IX, 0 }, [END_SEND] = { true, INSTR_FMT_IX, 0 }, [INSTRUMENTED_END_SEND] = { true, INSTR_FMT_IX, 0 }, @@ -1205,14 +1210,14 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[OPCODE_METADATA_SIZE] = { [TO_BOOL_STR] = { true, INSTR_FMT_IXC00, 0 }, [TO_BOOL_ALWAYS_TRUE] = { true, INSTR_FMT_IXC00, 0 }, [UNARY_INVERT] = { true, INSTR_FMT_IX, 0 }, - [BINARY_OP_MULTIPLY_INT] = { true, INSTR_FMT_IBC, 0 }, - [BINARY_OP_ADD_INT] = { true, INSTR_FMT_IBC, 0 }, - [BINARY_OP_SUBTRACT_INT] = { true, INSTR_FMT_IBC, 0 }, - [BINARY_OP_MULTIPLY_FLOAT] = { true, INSTR_FMT_IBC, 0 }, - [BINARY_OP_ADD_FLOAT] = { true, INSTR_FMT_IBC, 0 }, - [BINARY_OP_SUBTRACT_FLOAT] = { true, INSTR_FMT_IBC, 0 }, - [BINARY_OP_ADD_UNICODE] = { true, INSTR_FMT_IBC, 0 }, - [BINARY_OP_INPLACE_ADD_UNICODE] = { true, INSTR_FMT_IB, HAS_LOCAL_FLAG }, + [BINARY_OP_MULTIPLY_INT] = { true, INSTR_FMT_IXC, 0 }, + [BINARY_OP_ADD_INT] = { true, INSTR_FMT_IXC, 0 }, + [BINARY_OP_SUBTRACT_INT] = { true, INSTR_FMT_IXC, 0 }, + [BINARY_OP_MULTIPLY_FLOAT] = { true, INSTR_FMT_IXC, 0 }, + [BINARY_OP_ADD_FLOAT] = { true, INSTR_FMT_IXC, 0 }, + [BINARY_OP_SUBTRACT_FLOAT] = { true, INSTR_FMT_IXC, 0 }, + [BINARY_OP_ADD_UNICODE] = { true, INSTR_FMT_IXC, 0 }, + [BINARY_OP_INPLACE_ADD_UNICODE] = { true, INSTR_FMT_IX, HAS_LOCAL_FLAG }, [BINARY_SUBSCR] = { true, INSTR_FMT_IXC, 0 }, [BINARY_SLICE] = { true, INSTR_FMT_IX, 0 }, [STORE_SLICE] = { true, INSTR_FMT_IX, 0 }, @@ -1259,7 +1264,7 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[OPCODE_METADATA_SIZE] = { [DELETE_ATTR] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_NAME_FLAG }, [STORE_GLOBAL] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_NAME_FLAG }, [DELETE_GLOBAL] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_NAME_FLAG }, - [LOAD_LOCALS] = { true, INSTR_FMT_IB, 0 }, + [LOAD_LOCALS] = { true, INSTR_FMT_IX, 0 }, [LOAD_NAME] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_NAME_FLAG }, [LOAD_FROM_DICT_OR_GLOBALS] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_NAME_FLAG }, [LOAD_GLOBAL] = { true, INSTR_FMT_IBC000, HAS_ARG_FLAG | HAS_NAME_FLAG }, @@ -1400,6 +1405,7 @@ extern const struct opcode_macro_expansion _PyOpcode_macro_expansion[OPCODE_MACR #ifdef NEED_OPCODE_METADATA const struct opcode_macro_expansion _PyOpcode_macro_expansion[OPCODE_MACRO_EXPANSION_SIZE] = { [NOP] = { .nuops = 1, .uops = { { NOP, 0, 0 } } }, + [RESUME] = { .nuops = 1, .uops = { { RESUME, 0, 0 } } }, [LOAD_FAST_CHECK] = { .nuops = 1, .uops = { { LOAD_FAST_CHECK, 0, 0 } } }, [LOAD_FAST] = { .nuops = 1, .uops = { { LOAD_FAST, 0, 0 } } }, [LOAD_FAST_AND_CLEAR] = { .nuops = 1, .uops = { { LOAD_FAST_AND_CLEAR, 0, 0 } } }, @@ -1444,6 +1450,8 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[OPCODE_MACRO_EXPAN [DELETE_SUBSCR] = { .nuops = 1, .uops = { { DELETE_SUBSCR, 0, 0 } } }, [CALL_INTRINSIC_1] = { .nuops = 1, .uops = { { CALL_INTRINSIC_1, 0, 0 } } }, [CALL_INTRINSIC_2] = { .nuops = 1, .uops = { { CALL_INTRINSIC_2, 0, 0 } } }, + [RETURN_VALUE] = { .nuops = 3, .uops = { { SAVE_IP, 7, 0 }, { SAVE_CURRENT_IP, 0, 0 }, { _POP_FRAME, 0, 0 } } }, + [RETURN_CONST] = { .nuops = 4, .uops = { { LOAD_CONST, 0, 0 }, { SAVE_IP, 7, 0 }, { SAVE_CURRENT_IP, 0, 0 }, { _POP_FRAME, 0, 0 } } }, [GET_AITER] = { .nuops = 1, .uops = { { GET_AITER, 0, 0 } } }, [GET_ANEXT] = { .nuops = 1, .uops = { { GET_ANEXT, 0, 0 } } }, [GET_AWAITABLE] = { .nuops = 1, .uops = { { GET_AWAITABLE, 0, 0 } } }, @@ -1545,6 +1553,7 @@ const char * const _PyOpcode_uop_name[OPCODE_UOP_NAME_SIZE] = { [_BINARY_OP_SUBTRACT_FLOAT] = "_BINARY_OP_SUBTRACT_FLOAT", [_GUARD_BOTH_UNICODE] = "_GUARD_BOTH_UNICODE", [_BINARY_OP_ADD_UNICODE] = "_BINARY_OP_ADD_UNICODE", + [_POP_FRAME] = "_POP_FRAME", [_LOAD_LOCALS] = "_LOAD_LOCALS", [_LOAD_FROM_DICT_OR_GLOBALS] = "_LOAD_FROM_DICT_OR_GLOBALS", [_GUARD_GLOBALS_VERSION] = "_GUARD_GLOBALS_VERSION", diff --git a/Lib/test/test_capi/test_misc.py b/Lib/test/test_capi/test_misc.py index 3dfbfdc26e7..18a0476122d 100644 --- a/Lib/test/test_capi/test_misc.py +++ b/Lib/test/test_capi/test_misc.py @@ -2633,6 +2633,7 @@ class TestUops(unittest.TestCase): self.assertIsNotNone(ex) uops = {opname for opname, _, _ in ex} self.assertIn("_PUSH_FRAME", uops) + self.assertIn("_BINARY_OP_ADD_INT", uops) diff --git a/Lib/test/test_code.py b/Lib/test/test_code.py index ca06a39f5df..e056c16466e 100644 --- a/Lib/test/test_code.py +++ b/Lib/test/test_code.py @@ -264,7 +264,7 @@ class CodeTest(unittest.TestCase): ("co_posonlyargcount", 0), ("co_kwonlyargcount", 0), ("co_nlocals", 1), - ("co_stacksize", 0), + ("co_stacksize", 1), ("co_flags", code.co_flags | inspect.CO_COROUTINE), ("co_firstlineno", 100), ("co_code", code2.co_code), diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 2c9c8cec77f..4d6efe938f4 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -396,6 +396,9 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con) int nlocals, ncellvars, nfreevars; get_localsplus_counts(con->localsplusnames, con->localspluskinds, &nlocals, &ncellvars, &nfreevars); + if (con->stacksize == 0) { + con->stacksize = 1; + } co->co_filename = Py_NewRef(con->filename); co->co_name = Py_NewRef(con->name); diff --git a/Objects/funcobject.c b/Objects/funcobject.c index 8c0bface3ac..33191d23f18 100644 --- a/Objects/funcobject.c +++ b/Objects/funcobject.c @@ -223,7 +223,73 @@ error: return NULL; } -uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func) +/* +Function versions +----------------- + +Function versions are used to detect when a function object has been +updated, invalidating inline cache data used by the `CALL` bytecode +(notably `CALL_PY_EXACT_ARGS` and a few other `CALL` specializations). + +They are also used by the Tier 2 superblock creation code to find +the function being called (and from there the code object). + +How does a function's `func_version` field get initialized? + +- `PyFunction_New` and friends initialize it to 0. +- The `MAKE_FUNCTION` instruction sets it from the code's `co_version`. +- It is reset to 0 when various attributes like `__code__` are set. +- A new version is allocated by `_PyFunction_GetVersionForCurrentState` + when the specializer needs a version and the version is 0. + +The latter allocates versions using a counter in the interpreter state; +when the counter wraps around to 0, no more versions are allocated. +There is one other special case: functions with a non-standard +`vectorcall` field are not given a version. + +When the function version is 0, the `CALL` bytecode is not specialized. + +Code object versions +-------------------- + +So where to code objects get their `co_version`? There is a single +static global counter, `_Py_next_func_version`. This is initialized in +the generated (!) file `Python/deepfreeze/deepfreeze.c`, to 1 plus the +number of deep-frozen function objects in that file. +(In `_bootstrap_python.c` and `freeze_module.c` it is initialized to 1.) + +Code objects get a new `co_version` allocated from this counter upon +creation. Since code objects are nominally immutable, `co_version` can +not be invalidated. The only way it can be 0 is when 2**32 or more +code objects have been created during the process's lifetime. +(The counter isn't reset by `fork()`, extending the lifetime.) +*/ + +void +_PyFunction_SetVersion(PyFunctionObject *func, uint32_t version) +{ + func->func_version = version; + if (version != 0) { + PyInterpreterState *interp = _PyInterpreterState_GET(); + interp->func_state.func_version_cache[ + version % FUNC_VERSION_CACHE_SIZE] = func; + } +} + +PyFunctionObject * +_PyFunction_LookupByVersion(uint32_t version) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + PyFunctionObject *func = interp->func_state.func_version_cache[ + version % FUNC_VERSION_CACHE_SIZE]; + if (func != NULL && func->func_version == version) { + return (PyFunctionObject *)Py_NewRef(func); + } + return NULL; +} + +uint32_t +_PyFunction_GetVersionForCurrentState(PyFunctionObject *func) { if (func->func_version != 0) { return func->func_version; @@ -236,7 +302,7 @@ uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func) return 0; } uint32_t v = interp->func_state.next_version++; - func->func_version = v; + _PyFunction_SetVersion(func, v); return v; } @@ -851,6 +917,15 @@ func_dealloc(PyFunctionObject *op) if (op->func_weakreflist != NULL) { PyObject_ClearWeakRefs((PyObject *) op); } + if (op->func_version != 0) { + PyInterpreterState *interp = _PyInterpreterState_GET(); + PyFunctionObject **slot = + interp->func_state.func_version_cache + + (op->func_version % FUNC_VERSION_CACHE_SIZE); + if (*slot == op) { + *slot = NULL; + } + } (void)func_clear(op); // These aren't cleared by func_clear(). Py_DECREF(op->func_code); diff --git a/Python/abstract_interp_cases.c.h b/Python/abstract_interp_cases.c.h index eef071119bc..1b99b929fa8 100644 --- a/Python/abstract_interp_cases.c.h +++ b/Python/abstract_interp_cases.c.h @@ -7,6 +7,10 @@ break; } + case RESUME: { + break; + } + case POP_TOP: { STACK_SHRINK(1); break; @@ -191,6 +195,11 @@ break; } + case _POP_FRAME: { + STACK_SHRINK(1); + break; + } + case GET_AITER: { PARTITIONNODE_OVERWRITE((_Py_PARTITIONNODE_t *)PARTITIONNODE_NULLROOT, PEEK(-(-1)), true); break; diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 6f17472e04e..ae459cabadd 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -133,6 +133,7 @@ dummy_func( } inst(RESUME, (--)) { + #if TIER_ONE assert(frame == tstate->current_frame); /* Possibly combine this with eval breaker */ if (_PyFrame_GetCode(frame)->_co_instrumentation_version != tstate->interp->monitoring_version) { @@ -140,7 +141,9 @@ dummy_func( ERROR_IF(err, error); next_instr--; } - else if (oparg < 2) { + else + #endif + if (oparg < 2) { CHECK_EVAL_BREAKER(); } } @@ -757,21 +760,37 @@ dummy_func( return retval; } - inst(RETURN_VALUE, (retval --)) { - STACK_SHRINK(1); + // The stack effect here is ambiguous. + // We definitely pop the return value off the stack on entry. + // We also push it onto the stack on exit, but that's a + // different frame, and it's accounted for by _PUSH_FRAME. + op(_POP_FRAME, (retval --)) { assert(EMPTY()); _PyFrame_SetStackPointer(frame, stack_pointer); _Py_LeaveRecursiveCallPy(tstate); - assert(frame != &entry_frame); // GH-99729: We need to unlink the frame *before* clearing it: _PyInterpreterFrame *dying = frame; + #if TIER_ONE + assert(frame != &entry_frame); + #endif frame = tstate->current_frame = dying->previous; - _PyEvalFrameClearAndPop(tstate, dying); + _PyEval_FrameClearAndPop(tstate, dying); frame->prev_instr += frame->return_offset; _PyFrame_StackPush(frame, retval); + #if TIER_ONE goto resume_frame; + #endif + #if TIER_TWO + stack_pointer = _PyFrame_GetStackPointer(frame); + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; + #endif } + macro(RETURN_VALUE) = + SAVE_IP + // Tier 2 only; special-cased oparg + SAVE_CURRENT_IP + // Sets frame->prev_instr + _POP_FRAME; + inst(INSTRUMENTED_RETURN_VALUE, (retval --)) { int err = _Py_call_instrumentation_arg( tstate, PY_MONITORING_EVENT_PY_RETURN, @@ -785,27 +804,17 @@ dummy_func( // GH-99729: We need to unlink the frame *before* clearing it: _PyInterpreterFrame *dying = frame; frame = tstate->current_frame = dying->previous; - _PyEvalFrameClearAndPop(tstate, dying); + _PyEval_FrameClearAndPop(tstate, dying); frame->prev_instr += frame->return_offset; _PyFrame_StackPush(frame, retval); goto resume_frame; } - inst(RETURN_CONST, (--)) { - PyObject *retval = GETITEM(FRAME_CO_CONSTS, oparg); - Py_INCREF(retval); - assert(EMPTY()); - _PyFrame_SetStackPointer(frame, stack_pointer); - _Py_LeaveRecursiveCallPy(tstate); - assert(frame != &entry_frame); - // GH-99729: We need to unlink the frame *before* clearing it: - _PyInterpreterFrame *dying = frame; - frame = tstate->current_frame = dying->previous; - _PyEvalFrameClearAndPop(tstate, dying); - frame->prev_instr += frame->return_offset; - _PyFrame_StackPush(frame, retval); - goto resume_frame; - } + macro(RETURN_CONST) = + LOAD_CONST + + SAVE_IP + // Tier 2 only; special-cased oparg + SAVE_CURRENT_IP + // Sets frame->prev_instr + _POP_FRAME; inst(INSTRUMENTED_RETURN_CONST, (--)) { PyObject *retval = GETITEM(FRAME_CO_CONSTS, oparg); @@ -821,7 +830,7 @@ dummy_func( // GH-99729: We need to unlink the frame *before* clearing it: _PyInterpreterFrame *dying = frame; frame = tstate->current_frame = dying->previous; - _PyEvalFrameClearAndPop(tstate, dying); + _PyEval_FrameClearAndPop(tstate, dying); frame->prev_instr += frame->return_offset; _PyFrame_StackPush(frame, retval); goto resume_frame; @@ -3545,7 +3554,8 @@ dummy_func( goto error; } - func_obj->func_version = ((PyCodeObject *)codeobj)->co_version; + _PyFunction_SetVersion( + func_obj, ((PyCodeObject *)codeobj)->co_version); func = (PyObject *)func_obj; } diff --git a/Python/ceval.c b/Python/ceval.c index 1e2262c1f18..329a1a17cf0 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -222,8 +222,6 @@ _PyEvalFramePushAndInit(PyThreadState *tstate, PyFunctionObject *func, static _PyInterpreterFrame * _PyEvalFramePushAndInit_Ex(PyThreadState *tstate, PyFunctionObject *func, PyObject *locals, Py_ssize_t nargs, PyObject *callargs, PyObject *kwargs); -static void -_PyEvalFrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame *frame); #ifdef HAVE_ERRNO_H #include @@ -603,10 +601,6 @@ int _Py_CheckRecursiveCallPy( } -static inline void _Py_LeaveRecursiveCallPy(PyThreadState *tstate) { - tstate->py_recursion_remaining++; -} - static const _Py_CODEUNIT _Py_INTERPRETER_TRAMPOLINE_INSTRUCTIONS[] = { /* Put a NOP at the start, so that the IP points into * the code, rather than before it */ @@ -731,7 +725,7 @@ resume_frame: // When tracing executed uops, also trace bytecode char *uop_debug = Py_GETENV("PYTHONUOPSDEBUG"); if (uop_debug != NULL && *uop_debug >= '0') { - lltrace = (*uop_debug - '0') >= 4; // TODO: Parse an int and all that + lltrace = (*uop_debug - '0') >= 5; // TODO: Parse an int and all that } } } @@ -918,7 +912,7 @@ exit_unwind: // GH-99729: We need to unlink the frame *before* clearing it: _PyInterpreterFrame *dying = frame; frame = tstate->current_frame = dying->previous; - _PyEvalFrameClearAndPop(tstate, dying); + _PyEval_FrameClearAndPop(tstate, dying); frame->return_offset = 0; if (frame == &entry_frame) { /* Restore previous frame and exit */ @@ -1487,8 +1481,8 @@ clear_gen_frame(PyThreadState *tstate, _PyInterpreterFrame * frame) frame->previous = NULL; } -static void -_PyEvalFrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame * frame) +void +_PyEval_FrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame * frame) { if (frame->owner == FRAME_OWNED_BY_THREAD) { clear_thread_frame(tstate, frame); diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 08f19cd9a39..635b8e501e5 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -369,3 +369,7 @@ static inline int _Py_EnterRecursivePy(PyThreadState *tstate) { return (tstate->py_recursion_remaining-- <= 0) && _Py_CheckRecursiveCallPy(tstate); } + +static inline void _Py_LeaveRecursiveCallPy(PyThreadState *tstate) { + tstate->py_recursion_remaining++; +} diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 9fbf026f164..89a5bbfecde 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -7,6 +7,23 @@ break; } + case RESUME: { + #if TIER_ONE + assert(frame == tstate->current_frame); + /* Possibly combine this with eval breaker */ + if (_PyFrame_GetCode(frame)->_co_instrumentation_version != tstate->interp->monitoring_version) { + int err = _Py_Instrument(_PyFrame_GetCode(frame), tstate->interp); + if (err) goto error; + next_instr--; + } + else + #endif + if (oparg < 2) { + CHECK_EVAL_BREAKER(); + } + break; + } + case LOAD_FAST_CHECK: { PyObject *value; value = GETLOCAL(oparg); @@ -666,6 +683,32 @@ break; } + case _POP_FRAME: { + PyObject *retval; + retval = stack_pointer[-1]; + STACK_SHRINK(1); + assert(EMPTY()); + _PyFrame_SetStackPointer(frame, stack_pointer); + _Py_LeaveRecursiveCallPy(tstate); + // GH-99729: We need to unlink the frame *before* clearing it: + _PyInterpreterFrame *dying = frame; + #if TIER_ONE + assert(frame != &entry_frame); + #endif + frame = tstate->current_frame = dying->previous; + _PyEval_FrameClearAndPop(tstate, dying); + frame->prev_instr += frame->return_offset; + _PyFrame_StackPush(frame, retval); + #if TIER_ONE + goto resume_frame; + #endif + #if TIER_TWO + stack_pointer = _PyFrame_GetStackPointer(frame); + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; + #endif + break; + } + case GET_AITER: { PyObject *obj; PyObject *iter; @@ -2607,7 +2650,8 @@ goto error; } - func_obj->func_version = ((PyCodeObject *)codeobj)->co_version; + _PyFunction_SetVersion( + func_obj, ((PyCodeObject *)codeobj)->co_version); func = (PyObject *)func_obj; stack_pointer[-1] = func; break; diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 80af8a7bcd5..f6322df566c 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -8,6 +8,7 @@ } TARGET(RESUME) { + #if TIER_ONE assert(frame == tstate->current_frame); /* Possibly combine this with eval breaker */ if (_PyFrame_GetCode(frame)->_co_instrumentation_version != tstate->interp->monitoring_version) { @@ -15,7 +16,9 @@ if (err) goto error; next_instr--; } - else if (oparg < 2) { + else + #endif + if (oparg < 2) { CHECK_EVAL_BREAKER(); } DISPATCH(); @@ -970,20 +973,40 @@ TARGET(RETURN_VALUE) { PyObject *retval; + // SAVE_CURRENT_IP + { + #if TIER_ONE + frame->prev_instr = next_instr - 1; + #endif + #if TIER_TWO + // Relies on a preceding SAVE_IP + frame->prev_instr--; + #endif + } + // _POP_FRAME retval = stack_pointer[-1]; STACK_SHRINK(1); - assert(EMPTY()); - _PyFrame_SetStackPointer(frame, stack_pointer); - _Py_LeaveRecursiveCallPy(tstate); - assert(frame != &entry_frame); - // GH-99729: We need to unlink the frame *before* clearing it: - _PyInterpreterFrame *dying = frame; - frame = tstate->current_frame = dying->previous; - _PyEvalFrameClearAndPop(tstate, dying); - frame->prev_instr += frame->return_offset; - _PyFrame_StackPush(frame, retval); - goto resume_frame; - STACK_SHRINK(1); + { + assert(EMPTY()); + _PyFrame_SetStackPointer(frame, stack_pointer); + _Py_LeaveRecursiveCallPy(tstate); + // GH-99729: We need to unlink the frame *before* clearing it: + _PyInterpreterFrame *dying = frame; + #if TIER_ONE + assert(frame != &entry_frame); + #endif + frame = tstate->current_frame = dying->previous; + _PyEval_FrameClearAndPop(tstate, dying); + frame->prev_instr += frame->return_offset; + _PyFrame_StackPush(frame, retval); + #if TIER_ONE + goto resume_frame; + #endif + #if TIER_TWO + stack_pointer = _PyFrame_GetStackPointer(frame); + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; + #endif + } } TARGET(INSTRUMENTED_RETURN_VALUE) { @@ -1001,7 +1024,7 @@ // GH-99729: We need to unlink the frame *before* clearing it: _PyInterpreterFrame *dying = frame; frame = tstate->current_frame = dying->previous; - _PyEvalFrameClearAndPop(tstate, dying); + _PyEval_FrameClearAndPop(tstate, dying); frame->prev_instr += frame->return_offset; _PyFrame_StackPush(frame, retval); goto resume_frame; @@ -1009,19 +1032,46 @@ } TARGET(RETURN_CONST) { - PyObject *retval = GETITEM(FRAME_CO_CONSTS, oparg); - Py_INCREF(retval); - assert(EMPTY()); - _PyFrame_SetStackPointer(frame, stack_pointer); - _Py_LeaveRecursiveCallPy(tstate); - assert(frame != &entry_frame); - // GH-99729: We need to unlink the frame *before* clearing it: - _PyInterpreterFrame *dying = frame; - frame = tstate->current_frame = dying->previous; - _PyEvalFrameClearAndPop(tstate, dying); - frame->prev_instr += frame->return_offset; - _PyFrame_StackPush(frame, retval); - goto resume_frame; + PyObject *value; + PyObject *retval; + // LOAD_CONST + { + value = GETITEM(FRAME_CO_CONSTS, oparg); + Py_INCREF(value); + } + // SAVE_CURRENT_IP + { + #if TIER_ONE + frame->prev_instr = next_instr - 1; + #endif + #if TIER_TWO + // Relies on a preceding SAVE_IP + frame->prev_instr--; + #endif + } + // _POP_FRAME + retval = value; + { + assert(EMPTY()); + _PyFrame_SetStackPointer(frame, stack_pointer); + _Py_LeaveRecursiveCallPy(tstate); + // GH-99729: We need to unlink the frame *before* clearing it: + _PyInterpreterFrame *dying = frame; + #if TIER_ONE + assert(frame != &entry_frame); + #endif + frame = tstate->current_frame = dying->previous; + _PyEval_FrameClearAndPop(tstate, dying); + frame->prev_instr += frame->return_offset; + _PyFrame_StackPush(frame, retval); + #if TIER_ONE + goto resume_frame; + #endif + #if TIER_TWO + stack_pointer = _PyFrame_GetStackPointer(frame); + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; + #endif + } } TARGET(INSTRUMENTED_RETURN_CONST) { @@ -1038,7 +1088,7 @@ // GH-99729: We need to unlink the frame *before* clearing it: _PyInterpreterFrame *dying = frame; frame = tstate->current_frame = dying->previous; - _PyEvalFrameClearAndPop(tstate, dying); + _PyEval_FrameClearAndPop(tstate, dying); frame->prev_instr += frame->return_offset; _PyFrame_StackPush(frame, retval); goto resume_frame; @@ -4575,7 +4625,8 @@ goto error; } - func_obj->func_version = ((PyCodeObject *)codeobj)->co_version; + _PyFunction_SetVersion( + func_obj, ((PyCodeObject *)codeobj)->co_version); func = (PyObject *)func_obj; stack_pointer[-1] = func; DISPATCH(); diff --git a/Python/optimizer.c b/Python/optimizer.c index 559c4ae9872..57518404c3f 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -373,6 +373,8 @@ static PyTypeObject UOpExecutor_Type = { .tp_as_sequence = &uop_as_sequence, }; +#define TRACE_STACK_SIZE 5 + static int translate_bytecode_to_trace( PyCodeObject *code, @@ -380,10 +382,16 @@ translate_bytecode_to_trace( _PyUOpInstruction *trace, int buffer_size) { + PyCodeObject *initial_code = code; _Py_CODEUNIT *initial_instr = instr; int trace_length = 0; int max_length = buffer_size; int reserved = 0; + struct { + PyCodeObject *code; + _Py_CODEUNIT *instr; + } trace_stack[TRACE_STACK_SIZE]; + int trace_stack_depth = 0; #ifdef Py_DEBUG char *uop_debug = Py_GETENV("PYTHONUOPSDEBUG"); @@ -441,6 +449,24 @@ translate_bytecode_to_trace( // Reserve space for main+stub uops, plus 2 for SAVE_IP and EXIT_TRACE #define RESERVE(main, stub) RESERVE_RAW((main) + (stub) + 2, uop_name(opcode)) +// Trace stack operations (used by _PUSH_FRAME, _POP_FRAME) +#define TRACE_STACK_PUSH() \ + if (trace_stack_depth >= TRACE_STACK_SIZE) { \ + DPRINTF(2, "Trace stack overflow\n"); \ + ADD_TO_TRACE(SAVE_IP, 0, 0); \ + goto done; \ + } \ + trace_stack[trace_stack_depth].code = code; \ + trace_stack[trace_stack_depth].instr = instr; \ + trace_stack_depth++; +#define TRACE_STACK_POP() \ + if (trace_stack_depth <= 0) { \ + Py_FatalError("Trace stack underflow\n"); \ + } \ + trace_stack_depth--; \ + code = trace_stack[trace_stack_depth].code; \ + instr = trace_stack[trace_stack_depth].instr; + DPRINTF(4, "Optimizing %s (%s:%d) at byte offset %d\n", PyUnicode_AsUTF8(code->co_qualname), @@ -448,6 +474,7 @@ translate_bytecode_to_trace( code->co_firstlineno, 2 * INSTR_IP(initial_instr, code)); +top: // Jump here after _PUSH_FRAME for (;;) { RESERVE_RAW(2, "epilogue"); // Always need space for SAVE_IP and EXIT_TRACE ADD_TO_TRACE(SAVE_IP, INSTR_IP(instr, code), 0); @@ -508,7 +535,7 @@ pop_jump_if_bool: case JUMP_BACKWARD: { - if (instr + 2 - oparg == initial_instr) { + if (instr + 2 - oparg == initial_instr && code == initial_code) { RESERVE(1, 0); ADD_TO_TRACE(JUMP_TO_TOP, 0, 0); } @@ -573,6 +600,14 @@ pop_jump_if_bool: // Reserve space for nuops (+ SAVE_IP + EXIT_TRACE) int nuops = expansion->nuops; RESERVE(nuops, 0); + if (expansion->uops[nuops-1].uop == _POP_FRAME) { + // Check for trace stack underflow now: + // We can't bail e.g. in the middle of + // LOAD_CONST + _POP_FRAME. + if (trace_stack_depth == 0) { + DPRINTF(2, "Trace stack underflow\n"); + goto done;} + } uint32_t orig_oparg = oparg; // For OPARG_TOP/BOTTOM for (int i = 0; i < nuops; i++) { oparg = orig_oparg; @@ -619,8 +654,57 @@ pop_jump_if_bool: Py_FatalError("garbled expansion"); } ADD_TO_TRACE(expansion->uops[i].uop, oparg, operand); + if (expansion->uops[i].uop == _POP_FRAME) { + TRACE_STACK_POP(); + DPRINTF(2, + "Returning to %s (%s:%d) at byte offset %d\n", + PyUnicode_AsUTF8(code->co_qualname), + PyUnicode_AsUTF8(code->co_filename), + code->co_firstlineno, + 2 * INSTR_IP(instr, code)); + goto top; + } if (expansion->uops[i].uop == _PUSH_FRAME) { assert(i + 1 == nuops); + int func_version_offset = + offsetof(_PyCallCache, func_version)/sizeof(_Py_CODEUNIT) + // Add one to account for the actual opcode/oparg pair: + + 1; + uint32_t func_version = read_u32(&instr[func_version_offset].cache); + PyFunctionObject *func = _PyFunction_LookupByVersion(func_version); + DPRINTF(3, "Function object: %p\n", func); + if (func != NULL) { + PyCodeObject *new_code = (PyCodeObject *)PyFunction_GET_CODE(func); + if (new_code == code) { + // Recursive call, bail (we could be here forever). + DPRINTF(2, "Bailing on recursive call to %s (%s:%d)\n", + PyUnicode_AsUTF8(new_code->co_qualname), + PyUnicode_AsUTF8(new_code->co_filename), + new_code->co_firstlineno); + ADD_TO_TRACE(SAVE_IP, 0, 0); + goto done; + } + if (new_code->co_version != func_version) { + // func.__code__ was updated. + // Perhaps it may happen again, so don't bother tracing. + // TODO: Reason about this -- is it better to bail or not? + DPRINTF(2, "Bailing because co_version != func_version\n"); + ADD_TO_TRACE(SAVE_IP, 0, 0); + goto done; + } + // Increment IP to the return address + instr += _PyOpcode_Caches[_PyOpcode_Deopt[opcode]] + 1; + TRACE_STACK_PUSH(); + code = new_code; + instr = _PyCode_CODE(code); + DPRINTF(2, + "Continuing in %s (%s:%d) at byte offset %d\n", + PyUnicode_AsUTF8(code->co_qualname), + PyUnicode_AsUTF8(code->co_filename), + code->co_firstlineno, + 2 * INSTR_IP(instr, code)); + goto top; + } ADD_TO_TRACE(SAVE_IP, 0, 0); goto done; } @@ -639,6 +723,10 @@ pop_jump_if_bool: } // End for (;;) done: + while (trace_stack_depth > 0) { + TRACE_STACK_POP(); + } + assert(code == initial_code); // Skip short traces like SAVE_IP, LOAD_FAST, SAVE_IP, EXIT_TRACE if (trace_length > 3) { ADD_TO_TRACE(EXIT_TRACE, 0, 0); diff --git a/Tools/cases_generator/analysis.py b/Tools/cases_generator/analysis.py index 2db1cd01c19..48f2db981c9 100644 --- a/Tools/cases_generator/analysis.py +++ b/Tools/cases_generator/analysis.py @@ -364,10 +364,12 @@ class Analyzer: case Instruction() as instr: part, offset = self.analyze_instruction(instr, offset) parts.append(part) - flags.add(instr.instr_flags) + if instr.name != "SAVE_IP": + # SAVE_IP in a macro is a no-op in Tier 1 + flags.add(instr.instr_flags) case _: typing.assert_never(component) - format = "IB" + format = "IB" if flags.HAS_ARG_FLAG else "IX" if offset: format += "C" + "0" * (offset - 1) return MacroInstruction(macro.name, format, flags, macro, parts, offset) diff --git a/Tools/cases_generator/stacking.py b/Tools/cases_generator/stacking.py index 8361eb99f88..632298a567d 100644 --- a/Tools/cases_generator/stacking.py +++ b/Tools/cases_generator/stacking.py @@ -380,7 +380,7 @@ def write_components( poke.as_stack_effect(lax=True), ) - if mgr.instr.name == "_PUSH_FRAME": + if mgr.instr.name in ("_PUSH_FRAME", "_POP_FRAME"): # Adjust stack to min_offset (input effects materialized) out.stack_adjust(mgr.min_offset.deep, mgr.min_offset.high) # Use clone() since adjust_inverse() mutates final_offset