From 7b21403ccd16c480812a1e857c0ee2deca592be0 Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Tue, 20 Feb 2024 09:39:55 +0000 Subject: [PATCH] GH-112354: Initial implementation of warm up on exits and trace-stitching (GH-114142) --- .gitattributes | 1 + Include/cpython/optimizer.h | 25 ++- Include/cpython/pystate.h | 2 + Include/internal/pycore_interp.h | 5 +- Include/internal/pycore_jit.h | 2 +- Include/internal/pycore_opcode_metadata.h | 54 ++--- Include/internal/pycore_uop_ids.h | 8 +- Include/internal/pycore_uop_metadata.h | 36 +-- Lib/test/test_frame.py | 1 + Lib/test/test_generated_cases.py | 11 + Modules/_testinternalcapi.c | 4 +- Python/bytecodes.c | 122 ++++++++--- Python/ceval.c | 47 ++-- Python/ceval_macros.h | 31 ++- Python/executor_cases.c.h | 96 ++++++-- Python/generated_cases.c.h | 31 +-- Python/jit.c | 13 +- Python/optimizer.c | 205 ++++++++++++++---- Python/pylifecycle.c | 4 +- Python/pystate.c | 2 + Python/tier2_engine.md | 150 +++++++++++++ Python/tier2_redundancy_eliminator_cases.c.h | 14 +- Tools/c-analyzer/cpython/_parser.py | 1 + Tools/c-analyzer/cpython/ignored.tsv | 5 + Tools/cases_generator/analyzer.py | 22 +- Tools/cases_generator/generators_common.py | 3 + .../opcode_metadata_generator.py | 1 + Tools/cases_generator/tier2_generator.py | 16 ++ Tools/jit/template.c | 30 ++- 29 files changed, 744 insertions(+), 198 deletions(-) create mode 100644 Python/tier2_engine.md diff --git a/.gitattributes b/.gitattributes index c984797e1ac..159cd83ff74 100644 --- a/.gitattributes +++ b/.gitattributes @@ -77,6 +77,7 @@ Include/internal/pycore_opcode.h generated Include/internal/pycore_opcode_metadata.h generated Include/internal/pycore_*_generated.h generated Include/internal/pycore_uop_ids.h generated +Include/internal/pycore_uop_metadata.h generated Include/opcode.h generated Include/opcode_ids.h generated Include/token.h generated diff --git a/Include/cpython/optimizer.h b/Include/cpython/optimizer.h index f710ca76b2b..fe54d1ddfe6 100644 --- a/Include/cpython/optimizer.h +++ b/Include/cpython/optimizer.h @@ -33,16 +33,28 @@ typedef struct { typedef struct { uint16_t opcode; uint16_t oparg; - uint32_t target; + union { + uint32_t target; + uint32_t exit_index; + }; uint64_t operand; // A cache entry } _PyUOpInstruction; +typedef struct _exit_data { + uint32_t target; + int16_t temperature; + const struct _PyExecutorObject *executor; +} _PyExitData; + typedef struct _PyExecutorObject { PyObject_VAR_HEAD + const _PyUOpInstruction *trace; _PyVMData vm_data; /* Used by the VM, but opaque to the optimizer */ - void *jit_code; + uint32_t exit_count; + uint32_t code_size; size_t jit_size; - _PyUOpInstruction trace[1]; + void *jit_code; + _PyExitData exits[1]; } _PyExecutorObject; typedef struct _PyOptimizerObject _PyOptimizerObject; @@ -59,6 +71,7 @@ typedef struct _PyOptimizerObject { /* These thresholds are treated as signed so do not exceed INT16_MAX * Use INT16_MAX to indicate that the optimizer should never be called */ uint16_t resume_threshold; + uint16_t side_threshold; uint16_t backedge_threshold; /* Data needed by the optimizer goes here, but is opaque to the VM */ } _PyOptimizerObject; @@ -73,16 +86,16 @@ PyAPI_FUNC(int) PyUnstable_Replace_Executor(PyCodeObject *code, _Py_CODEUNIT *in _PyOptimizerObject *_Py_SetOptimizer(PyInterpreterState *interp, _PyOptimizerObject* optimizer); -PyAPI_FUNC(void) PyUnstable_SetOptimizer(_PyOptimizerObject* optimizer); +PyAPI_FUNC(int) PyUnstable_SetOptimizer(_PyOptimizerObject* optimizer); PyAPI_FUNC(_PyOptimizerObject *) 
PyUnstable_GetOptimizer(void); PyAPI_FUNC(_PyExecutorObject *) PyUnstable_GetExecutor(PyCodeObject *code, int offset); int -_PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject **stack_pointer); +_PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject **stack_pointer, _PyExecutorObject **exec_ptr); -void _Py_ExecutorInit(_PyExecutorObject *, _PyBloomFilter *); +void _Py_ExecutorInit(_PyExecutorObject *, const _PyBloomFilter *); void _Py_ExecutorClear(_PyExecutorObject *); void _Py_BloomFilter_Init(_PyBloomFilter *); void _Py_BloomFilter_Add(_PyBloomFilter *bloom, void *obj); diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h index 9bc8758e72b..b99450a8a8d 100644 --- a/Include/cpython/pystate.h +++ b/Include/cpython/pystate.h @@ -212,6 +212,8 @@ struct _ts { /* The thread's exception stack entry. (Always the last entry.) */ _PyErr_StackItem exc_state; + PyObject *previous_executor; + }; #ifdef Py_DEBUG diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index 567d6a9bd51..06eba665c80 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -237,10 +237,13 @@ struct _is { struct callable_cache callable_cache; _PyOptimizerObject *optimizer; _PyExecutorObject *executor_list_head; - /* These values are shifted and offset to speed up check in JUMP_BACKWARD */ + + /* These two values are shifted and offset to speed up check in JUMP_BACKWARD */ uint32_t optimizer_resume_threshold; uint32_t optimizer_backedge_threshold; + uint16_t optimizer_side_threshold; + uint32_t next_func_version; _rare_events rare_events; PyDict_WatchCallback builtins_dict_watcher; diff --git a/Include/internal/pycore_jit.h b/Include/internal/pycore_jit.h index 0b71eb6f758..17bd23f0752 100644 --- a/Include/internal/pycore_jit.h +++ b/Include/internal/pycore_jit.h @@ -13,7 +13,7 @@ extern "C" { typedef _Py_CODEUNIT *(*jit_func)(_PyInterpreterFrame *frame, PyObject **stack_pointer, PyThreadState *tstate); -int _PyJIT_Compile(_PyExecutorObject *executor, _PyUOpInstruction *trace, size_t length); +int _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size_t length); void _PyJIT_Free(_PyExecutorObject *executor); #endif // _Py_JIT diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index 6b60a6fbffd..177dd302f73 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -909,8 +909,9 @@ enum InstructionFormat { #define HAS_DEOPT_FLAG (128) #define HAS_ERROR_FLAG (256) #define HAS_ESCAPES_FLAG (512) -#define HAS_PURE_FLAG (1024) -#define HAS_PASSTHROUGH_FLAG (2048) +#define HAS_EXIT_FLAG (1024) +#define HAS_PURE_FLAG (2048) +#define HAS_PASSTHROUGH_FLAG (4096) #define OPCODE_HAS_ARG(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_ARG_FLAG)) #define OPCODE_HAS_CONST(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_CONST_FLAG)) #define OPCODE_HAS_NAME(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_NAME_FLAG)) @@ -921,6 +922,7 @@ enum InstructionFormat { #define OPCODE_HAS_DEOPT(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_DEOPT_FLAG)) #define OPCODE_HAS_ERROR(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_ERROR_FLAG)) #define OPCODE_HAS_ESCAPES(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_ESCAPES_FLAG)) +#define OPCODE_HAS_EXIT(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_EXIT_FLAG)) #define OPCODE_HAS_PURE(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_PURE_FLAG)) 
#define OPCODE_HAS_PASSTHROUGH(OP) (_PyOpcode_opcode_metadata[OP].flags & (HAS_PASSTHROUGH_FLAG)) @@ -945,14 +947,14 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[268] = { [BEFORE_ASYNC_WITH] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, [BEFORE_WITH] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, [BINARY_OP] = { true, INSTR_FMT_IBC, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, - [BINARY_OP_ADD_FLOAT] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG }, - [BINARY_OP_ADD_INT] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG | HAS_ERROR_FLAG }, - [BINARY_OP_ADD_UNICODE] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, - [BINARY_OP_INPLACE_ADD_UNICODE] = { true, INSTR_FMT_IXC, HAS_LOCAL_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, - [BINARY_OP_MULTIPLY_FLOAT] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG }, - [BINARY_OP_MULTIPLY_INT] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG | HAS_ERROR_FLAG }, - [BINARY_OP_SUBTRACT_FLOAT] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG }, - [BINARY_OP_SUBTRACT_INT] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG | HAS_ERROR_FLAG }, + [BINARY_OP_ADD_FLOAT] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [BINARY_OP_ADD_INT] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ERROR_FLAG }, + [BINARY_OP_ADD_UNICODE] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [BINARY_OP_INPLACE_ADD_UNICODE] = { true, INSTR_FMT_IXC, HAS_LOCAL_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [BINARY_OP_MULTIPLY_FLOAT] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [BINARY_OP_MULTIPLY_INT] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ERROR_FLAG }, + [BINARY_OP_SUBTRACT_FLOAT] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [BINARY_OP_SUBTRACT_INT] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ERROR_FLAG }, [BINARY_SLICE] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, [BINARY_SUBSCR] = { true, INSTR_FMT_IXC, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, [BINARY_SUBSCR_DICT] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, @@ -1060,16 +1062,16 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[268] = { [LOAD_ATTR] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, [LOAD_ATTR_CLASS] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG }, [LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, - [LOAD_ATTR_INSTANCE_VALUE] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG }, - [LOAD_ATTR_METHOD_LAZY_DICT] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, - [LOAD_ATTR_METHOD_NO_DICT] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, - [LOAD_ATTR_METHOD_WITH_VALUES] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_ATTR_INSTANCE_VALUE] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [LOAD_ATTR_METHOD_LAZY_DICT] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_ATTR_METHOD_NO_DICT] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_ATTR_METHOD_WITH_VALUES] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG | 
HAS_EXIT_FLAG | HAS_ESCAPES_FLAG }, [LOAD_ATTR_MODULE] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG }, - [LOAD_ATTR_NONDESCRIPTOR_NO_DICT] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG }, - [LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG }, + [LOAD_ATTR_NONDESCRIPTOR_NO_DICT] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, [LOAD_ATTR_PROPERTY] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, - [LOAD_ATTR_SLOT] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG }, - [LOAD_ATTR_WITH_HINT] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_ATTR_SLOT] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [LOAD_ATTR_WITH_HINT] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ESCAPES_FLAG }, [LOAD_BUILD_CLASS] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, [LOAD_CONST] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_CONST_FLAG | HAS_PURE_FLAG }, [LOAD_DEREF] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_FREE_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, @@ -1118,8 +1120,8 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[268] = { [SET_FUNCTION_ATTRIBUTE] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ESCAPES_FLAG }, [SET_UPDATE] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, [STORE_ATTR] = { true, INSTR_FMT_IBC000, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, - [STORE_ATTR_INSTANCE_VALUE] = { true, INSTR_FMT_IXC000, HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, - [STORE_ATTR_SLOT] = { true, INSTR_FMT_IXC000, HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, + [STORE_ATTR_INSTANCE_VALUE] = { true, INSTR_FMT_IXC000, HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ESCAPES_FLAG }, + [STORE_ATTR_SLOT] = { true, INSTR_FMT_IXC000, HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ESCAPES_FLAG }, [STORE_ATTR_WITH_HINT] = { true, INSTR_FMT_IBC000, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, [STORE_DEREF] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_FREE_FLAG | HAS_ESCAPES_FLAG }, [STORE_FAST] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_LOCAL_FLAG }, @@ -1133,12 +1135,12 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[268] = { [STORE_SUBSCR_LIST_INT] = { true, INSTR_FMT_IXC, HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, [SWAP] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_PURE_FLAG }, [TO_BOOL] = { true, INSTR_FMT_IXC00, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, - [TO_BOOL_ALWAYS_TRUE] = { true, INSTR_FMT_IXC00, HAS_DEOPT_FLAG }, - [TO_BOOL_BOOL] = { true, INSTR_FMT_IXC00, HAS_DEOPT_FLAG }, - [TO_BOOL_INT] = { true, INSTR_FMT_IXC00, HAS_DEOPT_FLAG }, - [TO_BOOL_LIST] = { true, INSTR_FMT_IXC00, HAS_DEOPT_FLAG }, - [TO_BOOL_NONE] = { true, INSTR_FMT_IXC00, HAS_DEOPT_FLAG }, - [TO_BOOL_STR] = { true, INSTR_FMT_IXC00, HAS_DEOPT_FLAG }, + [TO_BOOL_ALWAYS_TRUE] = { true, INSTR_FMT_IXC00, HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [TO_BOOL_BOOL] = { true, INSTR_FMT_IXC00, HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [TO_BOOL_INT] = { true, INSTR_FMT_IXC00, HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [TO_BOOL_LIST] = { true, INSTR_FMT_IXC00, HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [TO_BOOL_NONE] = { true, INSTR_FMT_IXC00, HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, + [TO_BOOL_STR] = { true, 
INSTR_FMT_IXC00, HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, [UNARY_INVERT] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, [UNARY_NEGATIVE] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, [UNARY_NOT] = { true, INSTR_FMT_IX, HAS_PURE_FLAG }, diff --git a/Include/internal/pycore_uop_ids.h b/Include/internal/pycore_uop_ids.h index 9bb537d3550..ed800a73796 100644 --- a/Include/internal/pycore_uop_ids.h +++ b/Include/internal/pycore_uop_ids.h @@ -139,7 +139,6 @@ extern "C" { #define _CONTAINS_OP CONTAINS_OP #define _CHECK_EG_MATCH CHECK_EG_MATCH #define _CHECK_EXC_MATCH CHECK_EXC_MATCH -#define _JUMP_BACKWARD JUMP_BACKWARD #define _POP_JUMP_IF_FALSE 339 #define _POP_JUMP_IF_TRUE 340 #define _IS_NONE 341 @@ -237,8 +236,11 @@ extern "C" { #define _CHECK_GLOBALS 384 #define _CHECK_BUILTINS 385 #define _INTERNAL_INCREMENT_OPT_COUNTER 386 -#define _CHECK_VALIDITY_AND_SET_IP 387 -#define MAX_UOP_ID 387 +#define _COLD_EXIT 387 +#define _START_EXECUTOR 388 +#define _FATAL_ERROR 389 +#define _CHECK_VALIDITY_AND_SET_IP 390 +#define MAX_UOP_ID 390 #ifdef __cplusplus } diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h index 163a0320aa2..1e2dfecd9cf 100644 --- a/Include/internal/pycore_uop_metadata.h +++ b/Include/internal/pycore_uop_metadata.h @@ -32,22 +32,22 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = { [_UNARY_NEGATIVE] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, [_UNARY_NOT] = HAS_PURE_FLAG, [_TO_BOOL] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, - [_TO_BOOL_BOOL] = HAS_DEOPT_FLAG | HAS_PASSTHROUGH_FLAG, - [_TO_BOOL_INT] = HAS_DEOPT_FLAG, - [_TO_BOOL_LIST] = HAS_DEOPT_FLAG, - [_TO_BOOL_NONE] = HAS_DEOPT_FLAG, - [_TO_BOOL_STR] = HAS_DEOPT_FLAG, - [_TO_BOOL_ALWAYS_TRUE] = HAS_DEOPT_FLAG, + [_TO_BOOL_BOOL] = HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_PASSTHROUGH_FLAG, + [_TO_BOOL_INT] = HAS_DEOPT_FLAG | HAS_EXIT_FLAG, + [_TO_BOOL_LIST] = HAS_DEOPT_FLAG | HAS_EXIT_FLAG, + [_TO_BOOL_NONE] = HAS_DEOPT_FLAG | HAS_EXIT_FLAG, + [_TO_BOOL_STR] = HAS_DEOPT_FLAG | HAS_EXIT_FLAG, + [_TO_BOOL_ALWAYS_TRUE] = HAS_DEOPT_FLAG | HAS_EXIT_FLAG, [_UNARY_INVERT] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, - [_GUARD_BOTH_INT] = HAS_DEOPT_FLAG | HAS_PASSTHROUGH_FLAG, + [_GUARD_BOTH_INT] = HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_PASSTHROUGH_FLAG, [_BINARY_OP_MULTIPLY_INT] = HAS_ERROR_FLAG | HAS_PURE_FLAG, [_BINARY_OP_ADD_INT] = HAS_ERROR_FLAG | HAS_PURE_FLAG, [_BINARY_OP_SUBTRACT_INT] = HAS_ERROR_FLAG | HAS_PURE_FLAG, - [_GUARD_BOTH_FLOAT] = HAS_DEOPT_FLAG | HAS_PASSTHROUGH_FLAG, + [_GUARD_BOTH_FLOAT] = HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_PASSTHROUGH_FLAG, [_BINARY_OP_MULTIPLY_FLOAT] = HAS_PURE_FLAG, [_BINARY_OP_ADD_FLOAT] = HAS_PURE_FLAG, [_BINARY_OP_SUBTRACT_FLOAT] = HAS_PURE_FLAG, - [_GUARD_BOTH_UNICODE] = HAS_DEOPT_FLAG | HAS_PASSTHROUGH_FLAG, + [_GUARD_BOTH_UNICODE] = HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_PASSTHROUGH_FLAG, [_BINARY_OP_ADD_UNICODE] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG | HAS_PURE_FLAG, [_BINARY_SUBSCR] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, [_BINARY_SLICE] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, @@ -112,7 +112,7 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = { [_LOAD_SUPER_ATTR_ATTR] = HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, [_LOAD_SUPER_ATTR_METHOD] = HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, [_LOAD_ATTR] = HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, - [_GUARD_TYPE_VERSION] = HAS_DEOPT_FLAG | HAS_PASSTHROUGH_FLAG, + [_GUARD_TYPE_VERSION] = HAS_DEOPT_FLAG | HAS_EXIT_FLAG | 
HAS_PASSTHROUGH_FLAG, [_CHECK_MANAGED_OBJECT_HAS_VALUES] = HAS_DEOPT_FLAG | HAS_PASSTHROUGH_FLAG, [_LOAD_ATTR_INSTANCE_VALUE] = HAS_ARG_FLAG | HAS_DEOPT_FLAG, [_CHECK_ATTR_MODULE] = HAS_DEOPT_FLAG | HAS_PASSTHROUGH_FLAG, @@ -193,14 +193,14 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = { [_COPY] = HAS_ARG_FLAG | HAS_PURE_FLAG, [_BINARY_OP] = HAS_ARG_FLAG | HAS_ERROR_FLAG, [_SWAP] = HAS_ARG_FLAG | HAS_PURE_FLAG, - [_GUARD_IS_TRUE_POP] = HAS_DEOPT_FLAG, - [_GUARD_IS_FALSE_POP] = HAS_DEOPT_FLAG, - [_GUARD_IS_NONE_POP] = HAS_DEOPT_FLAG, - [_GUARD_IS_NOT_NONE_POP] = HAS_DEOPT_FLAG, + [_GUARD_IS_TRUE_POP] = HAS_DEOPT_FLAG | HAS_EXIT_FLAG, + [_GUARD_IS_FALSE_POP] = HAS_DEOPT_FLAG | HAS_EXIT_FLAG, + [_GUARD_IS_NONE_POP] = HAS_DEOPT_FLAG | HAS_EXIT_FLAG, + [_GUARD_IS_NOT_NONE_POP] = HAS_DEOPT_FLAG | HAS_EXIT_FLAG, [_JUMP_TO_TOP] = HAS_EVAL_BREAK_FLAG, [_SET_IP] = 0, [_SAVE_RETURN_OFFSET] = HAS_ARG_FLAG, - [_EXIT_TRACE] = HAS_DEOPT_FLAG, + [_EXIT_TRACE] = HAS_DEOPT_FLAG | HAS_EXIT_FLAG, [_CHECK_VALIDITY] = HAS_DEOPT_FLAG, [_LOAD_CONST_INLINE] = HAS_PURE_FLAG, [_LOAD_CONST_INLINE_BORROW] = HAS_PURE_FLAG, @@ -209,6 +209,9 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = { [_CHECK_GLOBALS] = HAS_DEOPT_FLAG, [_CHECK_BUILTINS] = HAS_DEOPT_FLAG, [_INTERNAL_INCREMENT_OPT_COUNTER] = 0, + [_COLD_EXIT] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_START_EXECUTOR] = 0, + [_FATAL_ERROR] = HAS_ESCAPES_FLAG, [_CHECK_VALIDITY_AND_SET_IP] = HAS_DEOPT_FLAG, }; @@ -266,6 +269,7 @@ const char *const _PyOpcode_uop_name[MAX_UOP_ID+1] = { [_CHECK_STACK_SPACE] = "_CHECK_STACK_SPACE", [_CHECK_VALIDITY] = "_CHECK_VALIDITY", [_CHECK_VALIDITY_AND_SET_IP] = "_CHECK_VALIDITY_AND_SET_IP", + [_COLD_EXIT] = "_COLD_EXIT", [_COMPARE_OP] = "_COMPARE_OP", [_COMPARE_OP_FLOAT] = "_COMPARE_OP_FLOAT", [_COMPARE_OP_INT] = "_COMPARE_OP_INT", @@ -285,6 +289,7 @@ const char *const _PyOpcode_uop_name[MAX_UOP_ID+1] = { [_END_SEND] = "_END_SEND", [_EXIT_INIT_CHECK] = "_EXIT_INIT_CHECK", [_EXIT_TRACE] = "_EXIT_TRACE", + [_FATAL_ERROR] = "_FATAL_ERROR", [_FORMAT_SIMPLE] = "_FORMAT_SIMPLE", [_FORMAT_WITH_SPEC] = "_FORMAT_WITH_SPEC", [_FOR_ITER_TIER_TWO] = "_FOR_ITER_TIER_TWO", @@ -377,6 +382,7 @@ const char *const _PyOpcode_uop_name[MAX_UOP_ID+1] = { [_SET_FUNCTION_ATTRIBUTE] = "_SET_FUNCTION_ATTRIBUTE", [_SET_IP] = "_SET_IP", [_SET_UPDATE] = "_SET_UPDATE", + [_START_EXECUTOR] = "_START_EXECUTOR", [_STORE_ATTR] = "_STORE_ATTR", [_STORE_ATTR_INSTANCE_VALUE] = "_STORE_ATTR_INSTANCE_VALUE", [_STORE_ATTR_SLOT] = "_STORE_ATTR_SLOT", diff --git a/Lib/test/test_frame.py b/Lib/test/test_frame.py index f88206de550..f8812c281c2 100644 --- a/Lib/test/test_frame.py +++ b/Lib/test/test_frame.py @@ -331,6 +331,7 @@ class TestIncompleteFrameAreInvisible(unittest.TestCase): # on the *very next* allocation: gc.collect() gc.set_threshold(1, 0, 0) + sys._clear_internal_caches() # Okay, so here's the nightmare scenario: # - We're tracing the resumption of a generator, which creates a new # frame object. 
diff --git a/Lib/test/test_generated_cases.py b/Lib/test/test_generated_cases.py index a7ad6c7320b..0d2ccd558d4 100644 --- a/Lib/test/test_generated_cases.py +++ b/Lib/test/test_generated_cases.py @@ -794,6 +794,17 @@ class TestGeneratedCases(unittest.TestCase): self.run_cases_test(input, output) + def test_deopt_and_exit(self): + input = """ + pure op(OP, (arg1 -- out)) { + DEOPT_IF(1); + EXIT_IF(1); + } + """ + output = "" + with self.assertRaises(Exception): + self.run_cases_test(input, output) + class TestGeneratedAbstractCases(unittest.TestCase): def setUp(self) -> None: super().setUp() diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 3834f00009c..bcc431a2700 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -977,7 +977,9 @@ set_optimizer(PyObject *self, PyObject *opt) if (opt == Py_None) { opt = NULL; } - PyUnstable_SetOptimizer((_PyOptimizerObject*)opt); + if (PyUnstable_SetOptimizer((_PyOptimizerObject*)opt) < 0) { + return NULL; + } Py_RETURN_NONE; } diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 6822e772e91..2e0008e63f6 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -340,12 +340,12 @@ dummy_func( macro(TO_BOOL) = _SPECIALIZE_TO_BOOL + unused/2 + _TO_BOOL; inst(TO_BOOL_BOOL, (unused/1, unused/2, value -- value)) { - DEOPT_IF(!PyBool_Check(value)); + EXIT_IF(!PyBool_Check(value)); STAT_INC(TO_BOOL, hit); } inst(TO_BOOL_INT, (unused/1, unused/2, value -- res)) { - DEOPT_IF(!PyLong_CheckExact(value)); + EXIT_IF(!PyLong_CheckExact(value)); STAT_INC(TO_BOOL, hit); if (_PyLong_IsZero((PyLongObject *)value)) { assert(_Py_IsImmortal(value)); @@ -358,7 +358,7 @@ dummy_func( } inst(TO_BOOL_LIST, (unused/1, unused/2, value -- res)) { - DEOPT_IF(!PyList_CheckExact(value)); + EXIT_IF(!PyList_CheckExact(value)); STAT_INC(TO_BOOL, hit); res = Py_SIZE(value) ? 
Py_True : Py_False; DECREF_INPUTS(); @@ -366,13 +366,13 @@ dummy_func( inst(TO_BOOL_NONE, (unused/1, unused/2, value -- res)) { // This one is a bit weird, because we expect *some* failures: - DEOPT_IF(!Py_IsNone(value)); + EXIT_IF(!Py_IsNone(value)); STAT_INC(TO_BOOL, hit); res = Py_False; } inst(TO_BOOL_STR, (unused/1, unused/2, value -- res)) { - DEOPT_IF(!PyUnicode_CheckExact(value)); + EXIT_IF(!PyUnicode_CheckExact(value)); STAT_INC(TO_BOOL, hit); if (value == &_Py_STR(empty)) { assert(_Py_IsImmortal(value)); @@ -388,7 +388,7 @@ dummy_func( inst(TO_BOOL_ALWAYS_TRUE, (unused/1, version/2, value -- res)) { // This one is a bit weird, because we expect *some* failures: assert(version); - DEOPT_IF(Py_TYPE(value)->tp_version_tag != version); + EXIT_IF(Py_TYPE(value)->tp_version_tag != version); STAT_INC(TO_BOOL, hit); DECREF_INPUTS(); res = Py_True; @@ -412,8 +412,8 @@ dummy_func( }; op(_GUARD_BOTH_INT, (left, right -- left, right)) { - DEOPT_IF(!PyLong_CheckExact(left)); - DEOPT_IF(!PyLong_CheckExact(right)); + EXIT_IF(!PyLong_CheckExact(left)); + EXIT_IF(!PyLong_CheckExact(right)); } pure op(_BINARY_OP_MULTIPLY_INT, (left, right -- res)) { @@ -448,8 +448,8 @@ dummy_func( _GUARD_BOTH_INT + unused/1 + _BINARY_OP_SUBTRACT_INT; op(_GUARD_BOTH_FLOAT, (left, right -- left, right)) { - DEOPT_IF(!PyFloat_CheckExact(left)); - DEOPT_IF(!PyFloat_CheckExact(right)); + EXIT_IF(!PyFloat_CheckExact(left)); + EXIT_IF(!PyFloat_CheckExact(right)); } pure op(_BINARY_OP_MULTIPLY_FLOAT, (left, right -- res)) { @@ -484,8 +484,8 @@ dummy_func( _GUARD_BOTH_FLOAT + unused/1 + _BINARY_OP_SUBTRACT_FLOAT; op(_GUARD_BOTH_UNICODE, (left, right -- left, right)) { - DEOPT_IF(!PyUnicode_CheckExact(left)); - DEOPT_IF(!PyUnicode_CheckExact(right)); + EXIT_IF(!PyUnicode_CheckExact(left)); + EXIT_IF(!PyUnicode_CheckExact(right)); } pure op(_BINARY_OP_ADD_UNICODE, (left, right -- res)) { @@ -1904,7 +1904,7 @@ dummy_func( op(_GUARD_TYPE_VERSION, (type_version/2, owner -- owner)) { PyTypeObject *tp = Py_TYPE(owner); assert(type_version != 0); - DEOPT_IF(tp->tp_version_tag != type_version); + EXIT_IF(tp->tp_version_tag != type_version); } op(_CHECK_MANAGED_OBJECT_HAS_VALUES, (owner -- owner)) { @@ -2314,6 +2314,7 @@ dummy_func( } inst(JUMP_BACKWARD, (unused/1 --)) { + TIER_ONE_ONLY CHECK_EVAL_BREAKER(); assert(oparg <= INSTR_OFFSET()); JUMPBY(-oparg); @@ -2335,13 +2336,13 @@ dummy_func( oparg >>= 8; start--; } - int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer); + _PyExecutorObject *executor; + int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor); ERROR_IF(optimized < 0, error); if (optimized) { - // Rewind and enter the executor: - assert(start->op.code == ENTER_EXECUTOR); - next_instr = start; - this_instr[1].cache &= OPTIMIZER_BITS_MASK; + assert(tstate->previous_executor == NULL); + tstate->previous_executor = Py_None; + GOTO_TIER_TWO(executor); } else { int backoff = this_instr[1].cache & OPTIMIZER_BITS_MASK; @@ -2371,14 +2372,15 @@ dummy_func( inst(ENTER_EXECUTOR, (--)) { TIER_ONE_ONLY CHECK_EVAL_BREAKER(); - PyCodeObject *code = _PyFrame_GetCode(frame); - current_executor = code->co_executors->executors[oparg & 255]; - assert(current_executor->vm_data.index == INSTR_OFFSET() - 1); - assert(current_executor->vm_data.code == code); - assert(current_executor->vm_data.valid); - Py_INCREF(current_executor); - GOTO_TIER_TWO(); + _PyExecutorObject *executor = code->co_executors->executors[oparg & 255]; + assert(executor->vm_data.index == INSTR_OFFSET() - 1); + 
assert(executor->vm_data.code == code); + assert(executor->vm_data.valid); + assert(tstate->previous_executor == NULL); + tstate->previous_executor = Py_None; + Py_INCREF(executor); + GOTO_TIER_TWO(executor); } replaced op(_POP_JUMP_IF_FALSE, (cond -- )) { @@ -3997,26 +3999,26 @@ dummy_func( inst(CACHE, (--)) { TIER_ONE_ONLY assert(0 && "Executing a cache."); - Py_UNREACHABLE(); + Py_FatalError("Executing a cache."); } inst(RESERVED, (--)) { TIER_ONE_ONLY assert(0 && "Executing RESERVED instruction."); - Py_UNREACHABLE(); + Py_FatalError("Executing RESERVED instruction."); } ///////// Tier-2 only opcodes ///////// op (_GUARD_IS_TRUE_POP, (flag -- )) { SYNC_SP(); - DEOPT_IF(!Py_IsTrue(flag)); + EXIT_IF(!Py_IsTrue(flag)); assert(Py_IsTrue(flag)); } op (_GUARD_IS_FALSE_POP, (flag -- )) { SYNC_SP(); - DEOPT_IF(!Py_IsFalse(flag)); + EXIT_IF(!Py_IsFalse(flag)); assert(Py_IsFalse(flag)); } @@ -4024,18 +4026,20 @@ dummy_func( SYNC_SP(); if (!Py_IsNone(val)) { Py_DECREF(val); - DEOPT_IF(1); + EXIT_IF(1); } } op (_GUARD_IS_NOT_NONE_POP, (val -- )) { SYNC_SP(); - DEOPT_IF(Py_IsNone(val)); + EXIT_IF(Py_IsNone(val)); Py_DECREF(val); } op(_JUMP_TO_TOP, (--)) { - next_uop = current_executor->trace; +#ifndef _Py_JIT + next_uop = &current_executor->trace[1]; +#endif CHECK_EVAL_BREAKER(); } @@ -4055,7 +4059,7 @@ dummy_func( op(_EXIT_TRACE, (--)) { TIER_TWO_ONLY - DEOPT_IF(1); + EXIT_IF(1); } op(_CHECK_VALIDITY, (--)) { @@ -4101,6 +4105,58 @@ dummy_func( exe->count++; } + /* Only used for handling cold side exits, should never appear in + * a normal trace or as part of an instruction. + */ + op(_COLD_EXIT, (--)) { + TIER_TWO_ONLY + _PyExecutorObject *previous = (_PyExecutorObject *)tstate->previous_executor; + _PyExitData *exit = &previous->exits[oparg]; + exit->temperature++; + PyCodeObject *code = _PyFrame_GetCode(frame); + _Py_CODEUNIT *target = _PyCode_CODE(code) + exit->target; + if (exit->temperature < (int32_t)tstate->interp->optimizer_side_threshold) { + GOTO_TIER_ONE(target); + } + _PyExecutorObject *executor; + if (target->op.code == ENTER_EXECUTOR) { + executor = code->co_executors->executors[target->op.arg]; + Py_INCREF(executor); + } else { + int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor); + if (optimized <= 0) { + int32_t new_temp = -1 * tstate->interp->optimizer_side_threshold; + exit->temperature = (new_temp < INT16_MIN) ? INT16_MIN : new_temp; + if (optimized < 0) { + Py_DECREF(previous); + tstate->previous_executor = Py_None; + ERROR_IF(1, error); + } + GOTO_TIER_ONE(target); + } + } + /* We need two references. One to store in exit->executor and + * one to keep the executor alive when executing.
*/ + Py_INCREF(executor); + exit->executor = executor; + GOTO_TIER_TWO(executor); + } + + op(_START_EXECUTOR, (executor/4 --)) { + TIER_TWO_ONLY + Py_DECREF(tstate->previous_executor); + tstate->previous_executor = NULL; +#ifndef _Py_JIT + current_executor = (_PyExecutorObject*)executor; +#endif + } + + op(_FATAL_ERROR, (--)) { + TIER_TWO_ONLY + assert(0); + Py_FatalError("Fatal error uop executed."); + } + op(_CHECK_VALIDITY_AND_SET_IP, (instr_ptr/4 --)) { TIER_TWO_ONLY DEOPT_IF(!current_executor->vm_data.valid); diff --git a/Python/ceval.c b/Python/ceval.c index 4f208009086..adccf8fc00f 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -16,6 +16,7 @@ #include "pycore_moduleobject.h" // PyModuleObject #include "pycore_object.h" // _PyObject_GC_TRACK() #include "pycore_opcode_metadata.h" // EXTRA_CASES +#include "pycore_optimizer.h" // _PyUOpExecutor_Type #include "pycore_opcode_utils.h" // MAKE_FUNCTION_* #include "pycore_pyerrors.h" // _PyErr_GetRaisedException() #include "pycore_pystate.h" // _PyInterpreterState_GET() @@ -738,15 +739,16 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int goto resume_with_error; } - /* State shared between Tier 1 and Tier 2 interpreter */ - _PyExecutorObject *current_executor = NULL; - /* Local "register" variables. * These are cached values from the frame and code object. */ - _Py_CODEUNIT *next_instr; PyObject **stack_pointer; +#ifndef _Py_JIT + /* Tier 2 interpreter state */ + _PyExecutorObject *current_executor = NULL; + const _PyUOpInstruction *next_uop = NULL; +#endif start_frame: if (_Py_EnterRecursivePy(tstate)) { @@ -960,18 +962,7 @@ resume_with_error: enter_tier_two: #ifdef _Py_JIT - - ; // ;) - jit_func jitted = current_executor->jit_code; - next_instr = jitted(frame, stack_pointer, tstate); - frame = tstate->current_frame; - Py_DECREF(current_executor); - if (next_instr == NULL) { - goto resume_with_error; - } - stack_pointer = _PyFrame_GetStackPointer(frame); - DISPATCH(); - + assert(0); #else #undef LOAD_IP @@ -1007,12 +998,12 @@ enter_tier_two: #endif OPT_STAT_INC(traces_executed); - _PyUOpInstruction *next_uop = current_executor->trace; uint16_t uopcode; #ifdef Py_STATS uint64_t trace_uop_execution_counter = 0; #endif + assert(next_uop->opcode == _START_EXECUTOR || next_uop->opcode == _COLD_EXIT); for (;;) { uopcode = next_uop->opcode; DPRINTF(3, @@ -1075,23 +1066,39 @@ error_tier_two: frame->return_offset = 0; // Don't leave this random _PyFrame_SetStackPointer(frame, stack_pointer); Py_DECREF(current_executor); + tstate->previous_executor = NULL; goto resume_with_error; // Jump here from DEOPT_IF() deoptimize: next_instr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame)); - DPRINTF(2, "DEOPT: [UOp %d (%s), oparg %d, operand %" PRIu64 ", target %d @ %d -> %s]\n", + DPRINTF(2, "DEOPT: [UOp %d (%s), oparg %d, operand %" PRIu64 ", target %d -> %s]\n", uopcode, _PyUOpName(uopcode), next_uop[-1].oparg, next_uop[-1].operand, next_uop[-1].target, - (int)(next_uop - current_executor->trace - 1), - _PyOpcode_OpName[frame->instr_ptr->op.code]); + _PyOpcode_OpName[next_instr->op.code]); OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); UOP_STAT_INC(uopcode, miss); Py_DECREF(current_executor); + tstate->previous_executor = NULL; DISPATCH(); +// Jump here from EXIT_IF() +side_exit: + OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); + UOP_STAT_INC(uopcode, miss); + uint32_t exit_index = next_uop[-1].exit_index; + assert(exit_index < current_executor->exit_count); + _PyExitData *exit 
= &current_executor->exits[exit_index]; + DPRINTF(2, "SIDE EXIT: [UOp %d (%s), oparg %d, operand %" PRIu64 ", exit %u, temp %d, target %d -> %s]\n", + uopcode, _PyUOpName(uopcode), next_uop[-1].oparg, next_uop[-1].operand, exit_index, exit->temperature, + exit->target, _PyOpcode_OpName[_PyCode_CODE(_PyFrame_GetCode(frame))[exit->target].op.code]); + Py_INCREF(exit->executor); + tstate->previous_executor = (PyObject *)current_executor; + GOTO_TIER_TWO(exit->executor); + #endif // _Py_JIT } + #if defined(__GNUC__) # pragma GCC diagnostic pop #elif defined(_MSC_VER) /* MS_WINDOWS */ diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 1043966c9a8..f796b603356 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -394,7 +394,36 @@ stack_pointer = _PyFrame_GetStackPointer(frame); /* Tier-switching macros. */ -#define GOTO_TIER_TWO() goto enter_tier_two; +#ifdef _Py_JIT +#define GOTO_TIER_TWO(EXECUTOR) \ +do { \ + jit_func jitted = (EXECUTOR)->jit_code; \ + next_instr = jitted(frame, stack_pointer, tstate); \ + Py_DECREF(tstate->previous_executor); \ + tstate->previous_executor = NULL; \ + frame = tstate->current_frame; \ + if (next_instr == NULL) { \ + goto resume_with_error; \ + } \ + stack_pointer = _PyFrame_GetStackPointer(frame); \ + DISPATCH(); \ +} while (0) +#else +#define GOTO_TIER_TWO(EXECUTOR) \ +do { \ + next_uop = (EXECUTOR)->trace; \ + assert(next_uop->opcode == _START_EXECUTOR || next_uop->opcode == _COLD_EXIT); \ + goto enter_tier_two; \ +} while (0) +#endif + +#define GOTO_TIER_ONE(TARGET) \ +do { \ + Py_DECREF(tstate->previous_executor); \ + tstate->previous_executor = NULL; \ + next_instr = (TARGET); \ + DISPATCH(); \ +} while (0) #define CURRENT_OPARG() (next_uop[-1].oparg) diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 11e2a1fe85d..a18284d89ab 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -141,7 +141,7 @@ case _TO_BOOL_BOOL: { PyObject *value; value = stack_pointer[-1]; - if (!PyBool_Check(value)) goto deoptimize; + if (!PyBool_Check(value)) goto side_exit; STAT_INC(TO_BOOL, hit); break; } @@ -150,7 +150,7 @@ PyObject *value; PyObject *res; value = stack_pointer[-1]; - if (!PyLong_CheckExact(value)) goto deoptimize; + if (!PyLong_CheckExact(value)) goto side_exit; STAT_INC(TO_BOOL, hit); if (_PyLong_IsZero((PyLongObject *)value)) { assert(_Py_IsImmortal(value)); @@ -168,7 +168,7 @@ PyObject *value; PyObject *res; value = stack_pointer[-1]; - if (!PyList_CheckExact(value)) goto deoptimize; + if (!PyList_CheckExact(value)) goto side_exit; STAT_INC(TO_BOOL, hit); res = Py_SIZE(value) ?
Py_True : Py_False; Py_DECREF(value); @@ -181,7 +181,7 @@ PyObject *res; value = stack_pointer[-1]; // This one is a bit weird, because we expect *some* failures: - if (!Py_IsNone(value)) goto deoptimize; + if (!Py_IsNone(value)) goto side_exit; STAT_INC(TO_BOOL, hit); res = Py_False; stack_pointer[-1] = res; @@ -192,7 +192,7 @@ PyObject *value; PyObject *res; value = stack_pointer[-1]; - if (!PyUnicode_CheckExact(value)) goto deoptimize; + if (!PyUnicode_CheckExact(value)) goto side_exit; STAT_INC(TO_BOOL, hit); if (value == &_Py_STR(empty)) { assert(_Py_IsImmortal(value)); @@ -214,7 +214,7 @@ uint32_t version = (uint32_t)CURRENT_OPERAND(); // This one is a bit weird, because we expect *some* failures: assert(version); - if (Py_TYPE(value)->tp_version_tag != version) goto deoptimize; + if (Py_TYPE(value)->tp_version_tag != version) goto side_exit; STAT_INC(TO_BOOL, hit); Py_DECREF(value); res = Py_True; @@ -238,8 +238,8 @@ PyObject *left; right = stack_pointer[-1]; left = stack_pointer[-2]; - if (!PyLong_CheckExact(left)) goto deoptimize; - if (!PyLong_CheckExact(right)) goto deoptimize; + if (!PyLong_CheckExact(left)) goto side_exit; + if (!PyLong_CheckExact(right)) goto side_exit; break; } @@ -296,8 +296,8 @@ PyObject *left; right = stack_pointer[-1]; left = stack_pointer[-2]; - if (!PyFloat_CheckExact(left)) goto deoptimize; - if (!PyFloat_CheckExact(right)) goto deoptimize; + if (!PyFloat_CheckExact(left)) goto side_exit; + if (!PyFloat_CheckExact(right)) goto side_exit; break; } @@ -354,8 +354,8 @@ PyObject *left; right = stack_pointer[-1]; left = stack_pointer[-2]; - if (!PyUnicode_CheckExact(left)) goto deoptimize; - if (!PyUnicode_CheckExact(right)) goto deoptimize; + if (!PyUnicode_CheckExact(left)) goto side_exit; + if (!PyUnicode_CheckExact(right)) goto side_exit; break; } @@ -1623,7 +1623,7 @@ uint32_t type_version = (uint32_t)CURRENT_OPERAND(); PyTypeObject *tp = Py_TYPE(owner); assert(type_version != 0); - if (tp->tp_version_tag != type_version) goto deoptimize; + if (tp->tp_version_tag != type_version) goto side_exit; break; } @@ -2013,8 +2013,6 @@ break; } - /* _JUMP_BACKWARD is not a viable micro-op for tier 2 */ - /* _POP_JUMP_IF_FALSE is not a viable micro-op for tier 2 */ /* _POP_JUMP_IF_TRUE is not a viable micro-op for tier 2 */ @@ -3318,7 +3316,7 @@ PyObject *flag; flag = stack_pointer[-1]; stack_pointer += -1; - if (!Py_IsTrue(flag)) goto deoptimize; + if (!Py_IsTrue(flag)) goto side_exit; assert(Py_IsTrue(flag)); break; } @@ -3327,7 +3325,7 @@ PyObject *flag; flag = stack_pointer[-1]; stack_pointer += -1; - if (!Py_IsFalse(flag)) goto deoptimize; + if (!Py_IsFalse(flag)) goto side_exit; assert(Py_IsFalse(flag)); break; } @@ -3338,7 +3336,7 @@ PyObject *val; val = stack_pointer[-1]; stack_pointer += -1; if (!Py_IsNone(val)) { Py_DECREF(val); - if (1) goto deoptimize; + if (1) goto side_exit; } break; } @@ -3347,13 +3345,15 @@ PyObject *val; val = stack_pointer[-1]; stack_pointer += -1; - if (Py_IsNone(val)) goto deoptimize; + if (Py_IsNone(val)) goto side_exit; Py_DECREF(val); break; } case _JUMP_TO_TOP: { - next_uop = current_executor->trace; + #ifndef _Py_JIT + next_uop = &current_executor->trace[1]; + #endif CHECK_EVAL_BREAKER(); break; } @@ -3378,7 +3378,7 @@ case _EXIT_TRACE: { TIER_TWO_ONLY - if (1) goto deoptimize; + if (1) goto side_exit; break; } @@ -3457,6 +3457,60 @@ break; } + case _COLD_EXIT: { + oparg = CURRENT_OPARG(); + TIER_TWO_ONLY + _PyExecutorObject *previous = (_PyExecutorObject *)tstate->previous_executor; + _PyExitData *exit = &previous->exits[oparg]; + exit->temperature++; +
PyCodeObject *code = _PyFrame_GetCode(frame); + _Py_CODEUNIT *target = _PyCode_CODE(code) + exit->target; + if (exit->temperature < (int32_t)tstate->interp->optimizer_side_threshold) { + GOTO_TIER_ONE(target); + } + _PyExecutorObject *executor; + if (target->op.code == ENTER_EXECUTOR) { + executor = code->co_executors->executors[target->op.arg]; + Py_INCREF(executor); + } else { + int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor); + if (optimized <= 0) { + int32_t new_temp = -1 * tstate->interp->optimizer_side_threshold; + exit->temperature = (new_temp < INT16_MIN) ? INT16_MIN : new_temp; + if (optimized < 0) { + Py_DECREF(previous); + tstate->previous_executor = Py_None; + if (1) goto error_tier_two; + } + GOTO_TIER_ONE(target); + } + } + /* We need two references. One to store in exit->executor and + * one to keep the executor alive when executing. */ + Py_INCREF(executor); + exit->executor = executor; + GOTO_TIER_TWO(executor); + break; + } + + case _START_EXECUTOR: { + PyObject *executor = (PyObject *)CURRENT_OPERAND(); + TIER_TWO_ONLY + Py_DECREF(tstate->previous_executor); + tstate->previous_executor = NULL; + #ifndef _Py_JIT + current_executor = (_PyExecutorObject*)executor; + #endif + break; + } + + case _FATAL_ERROR: { + TIER_TWO_ONLY + assert(0); + Py_FatalError("Fatal error uop executed."); + break; + } + case _CHECK_VALIDITY_AND_SET_IP: { PyObject *instr_ptr = (PyObject *)CURRENT_OPERAND(); TIER_TWO_ONLY diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 6c19adc60c6..a520d042a4a 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -741,7 +741,8 @@ INSTRUCTION_STATS(CACHE); TIER_ONE_ONLY assert(0 && "Executing a cache."); - Py_UNREACHABLE(); + Py_FatalError("Executing a cache."); + DISPATCH(); } TARGET(CALL) { @@ -2369,12 +2370,14 @@ TIER_ONE_ONLY CHECK_EVAL_BREAKER(); PyCodeObject *code = _PyFrame_GetCode(frame); - current_executor = code->co_executors->executors[oparg & 255]; - assert(current_executor->vm_data.index == INSTR_OFFSET() - 1); - assert(current_executor->vm_data.code == code); - assert(current_executor->vm_data.valid); - Py_INCREF(current_executor); - GOTO_TIER_TWO(); + _PyExecutorObject *executor = code->co_executors->executors[oparg & 255]; + assert(executor->vm_data.index == INSTR_OFFSET() - 1); + assert(executor->vm_data.code == code); + assert(executor->vm_data.valid); + assert(tstate->previous_executor == NULL); + tstate->previous_executor = Py_None; + Py_INCREF(executor); + GOTO_TIER_TWO(executor); DISPATCH(); } @@ -3262,6 +3265,7 @@ next_instr += 2; INSTRUCTION_STATS(JUMP_BACKWARD); /* Skip 1 cache entry */ + TIER_ONE_ONLY CHECK_EVAL_BREAKER(); assert(oparg <= INSTR_OFFSET()); JUMPBY(-oparg); @@ -3283,13 +3287,13 @@ oparg >>= 8; start--; } - int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer); + _PyExecutorObject *executor; + int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor); if (optimized < 0) goto error; if (optimized) { - // Rewind and enter the executor: - assert(start->op.code == ENTER_EXECUTOR); - next_instr = start; - this_instr[1].cache &= OPTIMIZER_BITS_MASK; + assert(tstate->previous_executor == NULL); + tstate->previous_executor = Py_None; + GOTO_TIER_TWO(executor); } else { int backoff = this_instr[1].cache & OPTIMIZER_BITS_MASK; @@ -4778,7 +4782,8 @@ INSTRUCTION_STATS(RESERVED); TIER_ONE_ONLY assert(0 && "Executing RESERVED instruction."); - Py_UNREACHABLE(); + Py_FatalError("Executing RESERVED instruction."); + DISPATCH(); } 
TARGET(RESUME) { diff --git a/Python/jit.c b/Python/jit.c index 22949c082da..839414bd810 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -300,13 +300,13 @@ emit(const StencilGroup *group, uint64_t patches[]) // Compiles executor in-place. Don't forget to call _PyJIT_Free later! int -_PyJIT_Compile(_PyExecutorObject *executor, _PyUOpInstruction *trace, size_t length) +_PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size_t length) { // Loop once to find the total compiled size: size_t code_size = 0; size_t data_size = 0; for (size_t i = 0; i < length; i++) { - _PyUOpInstruction *instruction = &trace[i]; + _PyUOpInstruction *instruction = (_PyUOpInstruction *)&trace[i]; const StencilGroup *group = &stencil_groups[instruction->opcode]; code_size += group->code.body_size; data_size += group->data.body_size; @@ -323,8 +323,13 @@ _PyJIT_Compile(_PyExecutorObject *executor, _PyUOpInstruction *trace, size_t len // Loop again to emit the code: char *code = memory; char *data = memory + code_size; + char *top = code; + if (trace[0].opcode == _START_EXECUTOR) { + // Don't want to execute this more than once: + top += stencil_groups[_START_EXECUTOR].code.body_size; + } for (size_t i = 0; i < length; i++) { - _PyUOpInstruction *instruction = &trace[i]; + _PyUOpInstruction *instruction = (_PyUOpInstruction *)&trace[i]; const StencilGroup *group = &stencil_groups[instruction->opcode]; // Think of patches as a dictionary mapping HoleValue to uint64_t: uint64_t patches[] = GET_PATCHES(); @@ -335,7 +340,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, _PyUOpInstruction *trace, size_t len patches[HoleValue_OPARG] = instruction->oparg; patches[HoleValue_OPERAND] = instruction->operand; patches[HoleValue_TARGET] = instruction->target; - patches[HoleValue_TOP] = (uint64_t)memory; + patches[HoleValue_TOP] = (uint64_t)top; patches[HoleValue_ZERO] = 0; emit(group, patches); code += group->code.body_size; diff --git a/Python/optimizer.c b/Python/optimizer.c index efa19680c9b..acc1d545d2b 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -2,6 +2,7 @@ #include "opcode.h" #include "pycore_interp.h" #include "pycore_bitutils.h" // _Py_popcount32() +#include "pycore_object.h" // _PyObject_GC_UNTRACK() #include "pycore_opcode_metadata.h" // _PyOpcode_OpName[] #include "pycore_opcode_utils.h" // MAX_REAL_OPCODE #include "pycore_optimizer.h" // _Py_uop_analyze_and_optimize() @@ -128,10 +129,11 @@ static _PyOptimizerObject _PyOptimizer_Default = { .optimize = never_optimize, .resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD, .backedge_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD, + .side_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD, }; static uint32_t -shift_and_offset_threshold(uint16_t threshold) +shift_and_offset_threshold(uint32_t threshold) { return (threshold << OPTIMIZER_BITS_IN_COUNTER) + (1 << 15); } @@ -140,41 +142,74 @@ _PyOptimizerObject * PyUnstable_GetOptimizer(void) { PyInterpreterState *interp = _PyInterpreterState_GET(); - if (interp->optimizer == &_PyOptimizer_Default) { - return NULL; - } assert(interp->optimizer_backedge_threshold == shift_and_offset_threshold(interp->optimizer->backedge_threshold)); assert(interp->optimizer_resume_threshold == shift_and_offset_threshold(interp->optimizer->resume_threshold)); + if (interp->optimizer == &_PyOptimizer_Default) { + return NULL; + } Py_INCREF(interp->optimizer); return interp->optimizer; } +static _PyExecutorObject * +make_executor_from_uops(_PyUOpInstruction *buffer, const _PyBloomFilter *dependencies); + +static int 
+init_cold_exit_executor(_PyExecutorObject *executor, int oparg); + +static int cold_exits_initialized = 0; +static _PyExecutorObject COLD_EXITS[UOP_MAX_TRACE_LENGTH] = { 0 }; + +static const _PyBloomFilter EMPTY_FILTER = { 0 }; + _PyOptimizerObject * _Py_SetOptimizer(PyInterpreterState *interp, _PyOptimizerObject *optimizer) { if (optimizer == NULL) { optimizer = &_PyOptimizer_Default; } + else if (cold_exits_initialized == 0) { + cold_exits_initialized = 1; + for (int i = 0; i < UOP_MAX_TRACE_LENGTH; i++) { + if (init_cold_exit_executor(&COLD_EXITS[i], i)) { + return NULL; + } + } + } _PyOptimizerObject *old = interp->optimizer; + if (old == NULL) { + old = &_PyOptimizer_Default; + } Py_INCREF(optimizer); interp->optimizer = optimizer; interp->optimizer_backedge_threshold = shift_and_offset_threshold(optimizer->backedge_threshold); interp->optimizer_resume_threshold = shift_and_offset_threshold(optimizer->resume_threshold); + interp->optimizer_side_threshold = optimizer->side_threshold; + if (optimizer == &_PyOptimizer_Default) { + assert(interp->optimizer_backedge_threshold > (1 << 16)); + assert(interp->optimizer_resume_threshold > (1 << 16)); + } return old; } -void +int PyUnstable_SetOptimizer(_PyOptimizerObject *optimizer) { PyInterpreterState *interp = _PyInterpreterState_GET(); _PyOptimizerObject *old = _Py_SetOptimizer(interp, optimizer); - Py_DECREF(old); + Py_XDECREF(old); + return old == NULL ? -1 : 0; } +/* Returns 1 if optimized, 0 if not optimized, and -1 for an error. + * If optimized, *executor_ptr contains a new reference to the executor + */ int -_PyOptimizer_Optimize(_PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject **stack_pointer) +_PyOptimizer_Optimize( + _PyInterpreterFrame *frame, _Py_CODEUNIT *start, + PyObject **stack_pointer, _PyExecutorObject **executor_ptr) { PyCodeObject *code = (PyCodeObject *)frame->f_executable; assert(PyCode_Check(code)); @@ -183,12 +218,11 @@ _PyOptimizer_Optimize(_PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject return 0; } _PyOptimizerObject *opt = interp->optimizer; - _PyExecutorObject *executor = NULL; - int err = opt->optimize(opt, frame, start, &executor, (int)(stack_pointer - _PyFrame_Stackbase(frame))); + int err = opt->optimize(opt, frame, start, executor_ptr, (int)(stack_pointer - _PyFrame_Stackbase(frame))); if (err <= 0) { - assert(executor == NULL); return err; } + assert(*executor_ptr != NULL); int index = get_index_for_executor(code, start); if (index < 0) { /* Out of memory. Don't raise and assume that the @@ -197,11 +231,11 @@ _PyOptimizer_Optimize(_PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject * If an optimizer has already produced an executor, * it might get confused by the executor disappearing, * but there is not much we can do about that here. 
*/ - Py_DECREF(executor); + Py_DECREF(*executor_ptr); return 0; } - insert_executor(code, start, index, executor); - Py_DECREF(executor); + insert_executor(code, start, index, *executor_ptr); + assert((*executor_ptr)->vm_data.valid); return 1; } @@ -237,11 +271,12 @@ static PyMethodDef executor_methods[] = { static void uop_dealloc(_PyExecutorObject *self) { + _PyObject_GC_UNTRACK(self); _Py_ExecutorClear(self); #ifdef _Py_JIT _PyJIT_Free(self); #endif - PyObject_Free(self); + PyObject_GC_Del(self); } const char * @@ -253,7 +288,7 @@ _PyUOpName(int index) static Py_ssize_t uop_len(_PyExecutorObject *self) { - return Py_SIZE(self); + return self->code_size; } static PyObject * @@ -292,15 +327,34 @@ PySequenceMethods uop_as_sequence = { .sq_item = (ssizeargfunc)uop_item, }; +static int +executor_clear(PyObject *o) +{ + _Py_ExecutorClear((_PyExecutorObject *)o); + return 0; +} + +static int +executor_traverse(PyObject *o, visitproc visit, void *arg) +{ + _PyExecutorObject *executor = (_PyExecutorObject *)o; + for (uint32_t i = 0; i < executor->exit_count; i++) { + Py_VISIT(executor->exits[i].executor); + } + return 0; +} + PyTypeObject _PyUOpExecutor_Type = { PyVarObject_HEAD_INIT(&PyType_Type, 0) .tp_name = "uop_executor", - .tp_basicsize = offsetof(_PyExecutorObject, trace), - .tp_itemsize = sizeof(_PyUOpInstruction), - .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION, + .tp_basicsize = offsetof(_PyExecutorObject, exits), + .tp_itemsize = 1, + .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC, .tp_dealloc = (destructor)uop_dealloc, .tp_as_sequence = &uop_as_sequence, .tp_methods = executor_methods, + .tp_traverse = executor_traverse, + .tp_clear = executor_clear, }; /* TO DO -- Generate these tables */ @@ -324,6 +378,7 @@ BRANCH_TO_GUARD[4][2] = { [POP_JUMP_IF_NOT_NONE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_NOT_NONE_POP, }; + #define CONFIDENCE_RANGE 1000 #define CONFIDENCE_CUTOFF 333 @@ -726,9 +781,10 @@ done: * NOPs are excluded from the count. */ static int -compute_used(_PyUOpInstruction *buffer, uint32_t *used) +compute_used(_PyUOpInstruction *buffer, uint32_t *used, int *exit_count_ptr) { int count = 0; + int exit_count = 0; SET_BIT(used, 0); for (int i = 0; i < UOP_MAX_TRACE_LENGTH; i++) { if (!BIT_IS_SET(used, i)) { @@ -736,6 +792,9 @@ compute_used(_PyUOpInstruction *buffer, uint32_t *used) } count++; int opcode = buffer[i].opcode; + if (_PyUop_Flags[opcode] & HAS_EXIT_FLAG) { + exit_count++; + } if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) { continue; } @@ -751,44 +810,76 @@ compute_used(_PyUOpInstruction *buffer, uint32_t *used) UNSET_BIT(used, i); } } + *exit_count_ptr = exit_count; return count; } +/* Executor side exits */ + +static _PyExecutorObject * +allocate_executor(int exit_count, int length) +{ + int size = exit_count*sizeof(_PyExitData) + length*sizeof(_PyUOpInstruction); + _PyExecutorObject *res = PyObject_GC_NewVar(_PyExecutorObject, &_PyUOpExecutor_Type, size); + if (res == NULL) { + return NULL; + } + res->trace = (_PyUOpInstruction *)(res->exits + exit_count); + res->code_size = length; + res->exit_count = exit_count; + return res; +} + /* Makes an executor from a buffer of uops. * Account for the buffer having gaps and NOPs by computing a "used" * bit vector and only copying the used uops. Here "used" means reachable * and not a NOP. 
*/ static _PyExecutorObject * -make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies) +make_executor_from_uops(_PyUOpInstruction *buffer, const _PyBloomFilter *dependencies) { uint32_t used[(UOP_MAX_TRACE_LENGTH + 31)/32] = { 0 }; - int length = compute_used(buffer, used); - _PyExecutorObject *executor = PyObject_NewVar(_PyExecutorObject, &_PyUOpExecutor_Type, length); + int exit_count; + int length = compute_used(buffer, used, &exit_count); + _PyExecutorObject *executor = allocate_executor(exit_count, length+1); if (executor == NULL) { return NULL; } - int dest = length - 1; + /* Initialize exits */ + for (int i = 0; i < exit_count; i++) { + executor->exits[i].executor = &COLD_EXITS[i]; + executor->exits[i].temperature = 0; + } + int next_exit = exit_count-1; + _PyUOpInstruction *dest = (_PyUOpInstruction *)&executor->trace[length]; /* Scan backwards, so that we see the destinations of jumps before the jumps themselves. */ for (int i = UOP_MAX_TRACE_LENGTH-1; i >= 0; i--) { if (!BIT_IS_SET(used, i)) { continue; } - executor->trace[dest] = buffer[i]; + *dest = buffer[i]; int opcode = buffer[i].opcode; if (opcode == _POP_JUMP_IF_FALSE || opcode == _POP_JUMP_IF_TRUE) { /* The oparg of the target will already have been set to its new offset */ - int oparg = executor->trace[dest].oparg; - executor->trace[dest].oparg = buffer[oparg].oparg; + int oparg = dest->oparg; + dest->oparg = buffer[oparg].oparg; + } + if (_PyUop_Flags[opcode] & HAS_EXIT_FLAG) { + executor->exits[next_exit].target = buffer[i].target; + dest->exit_index = next_exit; + next_exit--; } /* Set the oparg to be the destination offset, * so that we can set the oparg of earlier jumps correctly. */ - buffer[i].oparg = dest; + buffer[i].oparg = (uint16_t)(dest - executor->trace); dest--; } - assert(dest == -1); + assert(next_exit == -1); + assert(dest == executor->trace); + dest->opcode = _START_EXECUTOR; + dest->operand = (uintptr_t)executor; _Py_ExecutorInit(executor, dependencies); #ifdef Py_DEBUG char *python_lltrace = Py_GETENV("PYTHON_LLTRACE"); @@ -811,14 +902,40 @@ make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies) #ifdef _Py_JIT executor->jit_code = NULL; executor->jit_size = 0; - if (_PyJIT_Compile(executor, executor->trace, Py_SIZE(executor))) { + if (_PyJIT_Compile(executor, executor->trace, length+1)) { Py_DECREF(executor); return NULL; } #endif + _PyObject_GC_TRACK(executor); return executor; } +static int +init_cold_exit_executor(_PyExecutorObject *executor, int oparg) +{ + _Py_SetImmortal(executor); + Py_SET_TYPE(executor, &_PyUOpExecutor_Type); + executor->trace = (_PyUOpInstruction *)executor->exits; + executor->code_size = 1; + executor->exit_count = 0; + _PyUOpInstruction *inst = (_PyUOpInstruction *)&executor->trace[0]; + inst->opcode = _COLD_EXIT; + inst->oparg = oparg; + executor->vm_data.valid = true; + for (int i = 0; i < BLOOM_FILTER_WORDS; i++) { + assert(executor->vm_data.bloom.bits[i] == 0); + } +#ifdef _Py_JIT + executor->jit_code = NULL; + executor->jit_size = 0; + if (_PyJIT_Compile(executor, executor->trace, 1)) { + return -1; + } +#endif + return 0; +} + static int uop_optimize( _PyOptimizerObject *self, @@ -880,13 +997,15 @@ PyUnstable_Optimizer_NewUOpOptimizer(void) opt->resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD; // Need a few iterations to settle specializations, // and to ammortize the cost of optimization. 
+ opt->side_threshold = 16; opt->backedge_threshold = 16; return (PyObject *)opt; } static void counter_dealloc(_PyExecutorObject *self) { - PyObject *opt = (PyObject *)self->trace[0].operand; + /* The optimizer is the operand of the second uop. */ + PyObject *opt = (PyObject *)self->trace[1].operand; Py_DECREF(opt); uop_dealloc(self); } @@ -894,11 +1013,13 @@ counter_dealloc(_PyExecutorObject *self) { PyTypeObject _PyCounterExecutor_Type = { PyVarObject_HEAD_INIT(&PyType_Type, 0) .tp_name = "counting_executor", - .tp_basicsize = offsetof(_PyExecutorObject, trace), - .tp_itemsize = sizeof(_PyUOpInstruction), - .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION, + .tp_basicsize = offsetof(_PyExecutorObject, exits), + .tp_itemsize = 1, + .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC, .tp_dealloc = (destructor)counter_dealloc, .tp_methods = executor_methods, + .tp_traverse = executor_traverse, + .tp_clear = executor_clear, }; static int @@ -926,9 +1047,7 @@ counter_optimize( { .opcode = _INTERNAL_INCREMENT_OPT_COUNTER }, { .opcode = _EXIT_TRACE, .target = (uint32_t)(target - _PyCode_CODE(code)) } }; - _PyBloomFilter empty; - _Py_BloomFilter_Init(&empty); - _PyExecutorObject *executor = make_executor_from_uops(buffer, &empty); + _PyExecutorObject *executor = make_executor_from_uops(buffer, &EMPTY_FILTER); if (executor == NULL) { return -1; } @@ -968,6 +1087,7 @@ PyUnstable_Optimizer_NewCounter(void) } opt->base.optimize = counter_optimize; opt->base.resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD; + opt->base.side_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD; opt->base.backedge_threshold = 0; opt->count = 0; return (PyObject *)opt; @@ -1091,9 +1211,6 @@ link_executor(_PyExecutorObject *executor) static void unlink_executor(_PyExecutorObject *executor) { - if (!executor->vm_data.valid) { - return; - } _PyExecutorLinkListNode *links = &executor->vm_data.links; _PyExecutorObject *next = links->next; _PyExecutorObject *prev = links->previous; @@ -1114,7 +1231,7 @@ unlink_executor(_PyExecutorObject *executor) /* This must be called by optimizers before using the executor */ void -_Py_ExecutorInit(_PyExecutorObject *executor, _PyBloomFilter *dependency_set) +_Py_ExecutorInit(_PyExecutorObject *executor, const _PyBloomFilter *dependency_set) { executor->vm_data.valid = true; for (int i = 0; i < BLOOM_FILTER_WORDS; i++) { @@ -1127,11 +1244,19 @@ _Py_ExecutorInit(_PyExecutorObject *executor, _PyBloomFilter *dependency_set) void _Py_ExecutorClear(_PyExecutorObject *executor) { + if (!executor->vm_data.valid) { + return; + } unlink_executor(executor); PyCodeObject *code = executor->vm_data.code; if (code == NULL) { return; } + for (uint32_t i = 0; i < executor->exit_count; i++) { + Py_DECREF(executor->exits[i].executor); + executor->exits[i].executor = &COLD_EXITS[i]; + executor->exits[i].temperature = INT16_MIN; + } _Py_CODEUNIT *instruction = &_PyCode_CODE(code)[executor->vm_data.index]; assert(instruction->op.code == ENTER_EXECUTOR); int index = instruction->op.arg; diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index b354c033ae7..7b537af8c87 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1261,7 +1261,9 @@ init_interp_main(PyThreadState *tstate) if (opt == NULL) { return _PyStatus_ERR("can't initialize optimizer"); } - PyUnstable_SetOptimizer((_PyOptimizerObject *)opt); + if (PyUnstable_SetOptimizer((_PyOptimizerObject *)opt)) { + return _PyStatus_ERR("can't initialize optimizer"); + } Py_DECREF(opt); } } diff 
--git a/Python/pystate.c b/Python/pystate.c
index c2ccc276449..3484beab7aa 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -782,6 +782,7 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate)
     }
 
     _PyOptimizerObject *old = _Py_SetOptimizer(interp, NULL);
+    assert(old != NULL);
     Py_DECREF(old);
 
     /* It is possible that any of the objects below have a finalizer
@@ -1346,6 +1347,7 @@ init_threadstate(_PyThreadStateImpl *_tstate,
     tstate->datastack_top = NULL;
     tstate->datastack_limit = NULL;
     tstate->what_event = -1;
+    tstate->previous_executor = NULL;
 
 #ifdef Py_GIL_DISABLED
     // Initialize biased reference counting inter-thread queue
diff --git a/Python/tier2_engine.md b/Python/tier2_engine.md
new file mode 100644
index 00000000000..df9f6c12450
--- /dev/null
+++ b/Python/tier2_engine.md
@@ -0,0 +1,150 @@
+# The tier 2 execution engine
+
+## General idea
+
+When execution in tier 1 becomes "hot", that is, when the counter for
+that point in the code reaches some threshold, we create an executor
+and execute it instead of the tier 1 bytecode.
+
+Since each executor must exit, we also track the "hotness" of those
+exits and attach new executors to the exits that become hot.
+
+As the program executes, and the hot parts of the program get optimized,
+a graph of executors forms.
+
+## Superblocks and Executors
+
+Once a point in the code has become hot enough, we want to optimize it.
+Starting from that point, we project the likely path of execution,
+using information gathered by tier 1 to guide that projection to
+form a "superblock", a mostly linear sequence of micro-ops.
+Although mostly linear, it may include a single loop.
+
+We then optimize this superblock to form an optimized superblock,
+which is equivalent but more efficient.
+
+A superblock is a representation of the code we want to execute,
+but it is not in executable form.
+The executable form is known as an executor.
+
+Executors are semantically equivalent to the superblock they are
+created from, but are in a form that can be executed efficiently.
+
+There are two execution engines for executors, and two types of executors:
+* The hardware, which runs machine code executors created by the JIT compiler.
+* The tier 2 interpreter, which runs bytecode executors.
+
+It would be very wasteful to support both a tier 2 interpreter and
+a JIT compiler in the same process.
+For now, we will make the choice of engine a configuration option,
+but we could make it a command line option in the future if that
+proves useful.
+
+### Tier 2 Interpreter
+
+For platforms without a JIT, and for testing, we need an interpreter
+for executors. It is similar in design to the tier 1 interpreter, but has a
+different instruction set, and does not adapt.
+
+### JIT compiler
+
+The JIT compiler converts superblocks into machine code executors.
+These have identical behavior to interpreted executors, except that
+they consume more memory for the generated machine code and are a lot faster.
+
+## Transferring control
+
+There are three types of control transfer that we need to consider:
+* Tier 1 to tier 2
+* Tier 2 to tier 1
+* One executor to another within tier 2
+
+Since we expect the graph of executors to span most of the hot
+part of the program, transfers from one executor to another should
+be the most common.
+Therefore, we want to make those transfers fast.
+
+### Tier 2 to tier 2
+
+#### Cold exits
+
+All side exits start cold and most stay cold, but a few become
+hot. We want to keep the memory consumption small for the many
+cold exits, but those that become hot need to be fast.
+However, we cannot know in advance which will be which.
+
+So that tier 2 to tier 2 transfers are fast for hot exits,
+exits must be implemented as executors. In order to patch
+executor exits when they get hot, a pointer to the current
+executor must be passed to the exit executor.
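+
+As an illustration, here is a sketch, in C-like pseudocode, of what a
+cold exit does when it runs. This is not the actual implementation:
+`optimize_from` is a made-up helper, and the threshold plumbing is
+simplified.
+
+```c
+/* `oparg` is the index of this exit in the executor we just left;
+ * that executor is passed in tstate->previous_executor. */
+_PyExecutorObject *previous = (_PyExecutorObject *)tstate->previous_executor;
+_PyExitData *exit = &previous->exits[oparg];
+exit->temperature++;
+if (exit->temperature < side_threshold) {
+    /* Still cold: resume in tier 1 at the exit's target. */
+    GOTO_TIER_ONE(_PyCode_CODE(_PyFrame_GetCode(frame)) + exit->target);
+}
+/* Hot: create an executor for the target and patch the exit, so that
+ * future transfers go straight to the new executor. */
+_PyExecutorObject *new_exec = optimize_from(frame, exit->target);
+if (new_exec == NULL) {
+    GOTO_TIER_ONE(_PyCode_CODE(_PyFrame_GetCode(frame)) + exit->target);
+}
+exit->executor = new_exec;  /* the old value was an immortal cold exit */
+Py_INCREF(new_exec);        /* reference for the transfer below */
+GOTO_TIER_TWO(new_exec);
+```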
+
+#### Handling reference counts
+
+There must be an implicit reference to the currently executing
+executor, otherwise it might be freed while it is still running.
+Consequently, we must increment the reference count of an
+executor just before executing it, and decrement it just after
+executing it.
+
+We want to minimize the amount of data that is passed from
+one executor to the next. In the JIT, this reduces the number
+of arguments in the tailcall, freeing up registers for other uses.
+It is less important in the interpreter, but following the same
+design as the JIT simplifies debugging and is good for performance.
+
+Provided that we incref the new executor before executing it, we
+can jump directly to the code of the executor, without needing
+to pass a reference to that executor object.
+However, we do need a reference to the previous executor,
+so that it can be decref'd and so that cold exits can be handled.
+To avoid messing up the JIT's register allocation, we pass a
+reference to the previous executor in the thread state's
+`previous_executor` field.
+
+#### The interpreter
+
+The tier 2 interpreter has a variable `current_executor` which
+points to the currently live executor. When transferring from executor
+`A` to executor `B`, we do the following
+(initially `current_executor` points to `A`, and the refcount of
+`A` is elevated by one):
+
+1. Set the instruction pointer to start at the beginning of `B`
+2. Increment the reference count of `B`
+3. Start executing `B`
+
+We also make the first instruction in `B` do the following:
+1. Set `current_executor` to point to `B`
+2. Decrement the reference count of `A` (`A` is referenced by `tstate->previous_executor`)
+
+The net effect of the above is to safely decrement the refcount of `A`,
+increment the refcount of `B` and set `current_executor` to point to `B`.
+A sketch of this protocol is given in the example at the end of this
+document.
+
+#### In the JIT
+
+Transferring control from one executor to another is done via tailcalls.
+
+A compiled executor does the same as the interpreter, except that there
+is no local variable `current_executor`.
+
+### Tier 1 to tier 2
+
+Since the executor doesn't know if the previous code was tier 1 or tier 2,
+we need to make a transfer from tier 1 to tier 2 look like a tier 2 to tier 2
+transfer to the executor.
+
+We can then perform a tier 1 to tier 2 transfer by setting `current_executor`
+to `None`, and then performing a tier 2 to tier 2 transfer as above.
+
+### Tier 2 to tier 1
+
+Each micro-op that might exit to tier 1 contains a `target` value,
+which is the offset of the tier 1 instruction to exit to in the
+current code object.
+
+## Counters
+
+TO DO.
+The implementation will change soon, so there is no point in
+documenting it until then.
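+
+## Example: a transfer in the interpreter
+
+To make the protocol described under "Transferring control" concrete,
+here is a sketch in C-like pseudocode of the tier 2 interpreter
+transferring from executor `A` to executor `B`. It is illustrative
+only: in the real code this work is split between the exit micro-ops
+and the `_START_EXECUTOR` micro-op at the start of every executor.
+
+```c
+/* In executor A, taking an exit whose executor is B.
+ * A's refcount is already elevated by one. */
+Py_INCREF(B);                               /* keep B alive while it runs */
+tstate->previous_executor = (PyObject *)A;  /* so that B can decref A */
+next_uop = B->trace;                        /* jump to the start of B */
+
+/* _START_EXECUTOR, the first micro-op of B: */
+current_executor = B;                       /* B is now the live executor */
+Py_DECREF(tstate->previous_executor);       /* safely drop the ref to A */
+tstate->previous_executor = NULL;
+```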
+ diff --git a/Python/tier2_redundancy_eliminator_cases.c.h b/Python/tier2_redundancy_eliminator_cases.c.h index be2fbb9106f..98f0bdca01f 100644 --- a/Python/tier2_redundancy_eliminator_cases.c.h +++ b/Python/tier2_redundancy_eliminator_cases.c.h @@ -1053,8 +1053,6 @@ break; } - /* _JUMP_BACKWARD is not a viable micro-op for tier 2 */ - /* _POP_JUMP_IF_FALSE is not a viable micro-op for tier 2 */ /* _POP_JUMP_IF_TRUE is not a viable micro-op for tier 2 */ @@ -1739,6 +1737,18 @@ break; } + case _COLD_EXIT: { + break; + } + + case _START_EXECUTOR: { + break; + } + + case _FATAL_ERROR: { + break; + } + case _CHECK_VALIDITY_AND_SET_IP: { break; } diff --git a/Tools/c-analyzer/cpython/_parser.py b/Tools/c-analyzer/cpython/_parser.py index 61cd41ea8f3..8482bf849aa 100644 --- a/Tools/c-analyzer/cpython/_parser.py +++ b/Tools/c-analyzer/cpython/_parser.py @@ -321,6 +321,7 @@ MAX_SIZES = { _abs('Objects/stringlib/unicode_format.h'): (10_000, 400), _abs('Objects/typeobject.c'): (35_000, 200), _abs('Python/compile.c'): (20_000, 500), + _abs('Python/optimizer.c'): (100_000, 5_000), _abs('Python/parking_lot.c'): (40_000, 1000), _abs('Python/pylifecycle.c'): (500_000, 5000), _abs('Python/pystate.c'): (500_000, 5000), diff --git a/Tools/c-analyzer/cpython/ignored.tsv b/Tools/c-analyzer/cpython/ignored.tsv index 14bcd85b9ea..94be3375849 100644 --- a/Tools/c-analyzer/cpython/ignored.tsv +++ b/Tools/c-analyzer/cpython/ignored.tsv @@ -382,6 +382,11 @@ Python/optimizer.c - _PyCounterOptimizer_Type - Python/optimizer.c - _PyUOpExecutor_Type - Python/optimizer.c - _PyUOpOptimizer_Type - Python/optimizer.c - _PyOptimizer_Default - +Python/optimizer.c - _ColdExit_Type - +Python/optimizer.c - COLD_EXITS - +Python/optimizer.c - Py_FatalErrorExecutor - +Python/optimizer.c - EMPTY_FILTER - +Python/optimizer.c - cold_exits_initialized - ##----------------------- ## test code diff --git a/Tools/cases_generator/analyzer.py b/Tools/cases_generator/analyzer.py index 3497b7fcdf3..bcffd75269a 100644 --- a/Tools/cases_generator/analyzer.py +++ b/Tools/cases_generator/analyzer.py @@ -21,6 +21,7 @@ class Properties: uses_co_names: bool uses_locals: bool has_free: bool + side_exit: bool pure: bool passthrough: bool @@ -48,6 +49,7 @@ class Properties: uses_co_names=any(p.uses_co_names for p in properties), uses_locals=any(p.uses_locals for p in properties), has_free=any(p.has_free for p in properties), + side_exit=any(p.side_exit for p in properties), pure=all(p.pure for p in properties), passthrough=all(p.passthrough for p in properties), ) @@ -69,6 +71,7 @@ SKIP_PROPERTIES = Properties( uses_co_names=False, uses_locals=False, has_free=False, + side_exit=False, pure=False, passthrough=False, ) @@ -269,9 +272,7 @@ def override_error( def convert_stack_item(item: parser.StackEffect) -> StackItem: - return StackItem( - item.name, item.type, item.cond, (item.size or "1") - ) + return StackItem(item.name, item.type, item.cond, (item.size or "1")) def analyze_stack(op: parser.InstDef) -> StackEffect: @@ -448,13 +449,24 @@ def compute_properties(op: parser.InstDef) -> Properties: or variable_used(op, "PyCell_GET") or variable_used(op, "PyCell_SET") ) + deopts_if = variable_used(op, "DEOPT_IF") + exits_if = variable_used(op, "EXIT_IF") + if deopts_if and exits_if: + tkn = op.tokens[0] + raise lexer.make_syntax_error( + "Op cannot contain both EXIT_IF and DEOPT_IF", + tkn.filename, + tkn.line, + tkn.column, + op.name, + ) infallible = is_infallible(op) - deopts = variable_used(op, "DEOPT_IF") passthrough = stack_effect_only_peeks(op) 
and infallible
     return Properties(
         escapes=makes_escaping_api_call(op),
         infallible=infallible,
-        deopts=deopts,
+        deopts=deopts_if or exits_if,
+        side_exit=exits_if,
         oparg=variable_used(op, "oparg"),
         jumps=variable_used(op, "JUMPBY"),
         eval_breaker=variable_used(op, "CHECK_EVAL_BREAKER"),
diff --git a/Tools/cases_generator/generators_common.py b/Tools/cases_generator/generators_common.py
index 2fc2ab11532..54ffea71b53 100644
--- a/Tools/cases_generator/generators_common.py
+++ b/Tools/cases_generator/generators_common.py
@@ -154,6 +154,7 @@ def replace_check_eval_breaker(
 
 
 REPLACEMENT_FUNCTIONS = {
+    "EXIT_IF": replace_deopt,
     "DEOPT_IF": replace_deopt,
     "ERROR_IF": replace_error,
     "DECREF_INPUTS": replace_decrefs,
@@ -205,6 +206,8 @@ def cflags(p: Properties) -> str:
         flags.append("HAS_EVAL_BREAK_FLAG")
     if p.deopts:
         flags.append("HAS_DEOPT_FLAG")
+    if p.side_exit:
+        flags.append("HAS_EXIT_FLAG")
     if not p.infallible:
         flags.append("HAS_ERROR_FLAG")
     if p.escapes:
diff --git a/Tools/cases_generator/opcode_metadata_generator.py b/Tools/cases_generator/opcode_metadata_generator.py
index 3e9fa3e26da..52dc09e1499 100644
--- a/Tools/cases_generator/opcode_metadata_generator.py
+++ b/Tools/cases_generator/opcode_metadata_generator.py
@@ -50,6 +50,7 @@ FLAGS = [
     "DEOPT",
     "ERROR",
     "ESCAPES",
+    "EXIT",
     "PURE",
     "PASSTHROUGH",
 ]
diff --git a/Tools/cases_generator/tier2_generator.py b/Tools/cases_generator/tier2_generator.py
index 7897b89b275..8b4d1646dd5 100644
--- a/Tools/cases_generator/tier2_generator.py
+++ b/Tools/cases_generator/tier2_generator.py
@@ -98,9 +98,25 @@ def tier2_replace_deopt(
     out.emit(") goto deoptimize;\n")
 
 
+def tier2_replace_exit_if(
+    out: CWriter,
+    tkn: Token,
+    tkn_iter: Iterator[Token],
+    uop: Uop,
+    unused: Stack,
+    inst: Instruction | None,
+) -> None:
+    out.emit_at("if ", tkn)
+    out.emit(next(tkn_iter))
+    emit_to(out, tkn_iter, "RPAREN")
+    next(tkn_iter)  # Semicolon
+    out.emit(") goto side_exit;\n")
+
+
 TIER2_REPLACEMENT_FUNCTIONS = REPLACEMENT_FUNCTIONS.copy()
 TIER2_REPLACEMENT_FUNCTIONS["ERROR_IF"] = tier2_replace_error
 TIER2_REPLACEMENT_FUNCTIONS["DEOPT_IF"] = tier2_replace_deopt
+TIER2_REPLACEMENT_FUNCTIONS["EXIT_IF"] = tier2_replace_exit_if
 
 
 def write_uop(uop: Uop, out: CWriter, stack: Stack) -> None:
diff --git a/Tools/jit/template.c b/Tools/jit/template.c
index 12303a550d8..d79c6efb8f6 100644
--- a/Tools/jit/template.c
+++ b/Tools/jit/template.c
@@ -38,6 +38,20 @@
     goto LABEL ## _tier_two; \
 } while (0)
 
+#undef GOTO_TIER_TWO
+#define GOTO_TIER_TWO(EXECUTOR) \
+do { \
+    __attribute__((musttail)) \
+    return ((jit_func)((EXECUTOR)->jit_code))(frame, stack_pointer, tstate); \
+} while (0)
+
+#undef GOTO_TIER_ONE
+#define GOTO_TIER_ONE(TARGET) \
+do { \
+    _PyFrame_SetStackPointer(frame, stack_pointer); \
+    return TARGET; \
+} while (0)
+
 #undef LOAD_IP
 #define LOAD_IP(UNUSED) \
 do { \
@@ -59,7 +73,6 @@ _JIT_ENTRY(_PyInterpreterFrame *frame, PyObject **stack_pointer, PyThreadState *
     PATCH_VALUE(_PyExecutorObject *, current_executor, _JIT_EXECUTOR)
     int oparg;
     int opcode = _JIT_OPCODE;
-    _PyUOpInstruction *next_uop;
     // Other stuff we need handy:
     PATCH_VALUE(uint16_t, _oparg, _JIT_OPARG)
     PATCH_VALUE(uint64_t, _operand, _JIT_OPERAND)
@@ -90,9 +103,16 @@ pop_2_error_tier_two:
     STACK_SHRINK(1);
 pop_1_error_tier_two:
     STACK_SHRINK(1);
 error_tier_two:
-    _PyFrame_SetStackPointer(frame, stack_pointer);
-    return NULL;
+    tstate->previous_executor = (PyObject *)current_executor;
+    GOTO_TIER_ONE(NULL);
 deoptimize:
-    _PyFrame_SetStackPointer(frame, stack_pointer);
-    return _PyCode_CODE(_PyFrame_GetCode(frame)) + _target;
+    tstate->previous_executor = (PyObject *)current_executor;
+    GOTO_TIER_ONE(_PyCode_CODE(_PyFrame_GetCode(frame)) + _target);
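+    /* Side exit: _target is the index of this exit in the executor's
+     * exits array. Take a reference to the exit's executor (initially a
+     * shared, immortal cold exit), stash the current executor in
+     * tstate->previous_executor so that the next executor can decref it,
+     * then tail-call the exit's code. */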
+side_exit:
+    {
+        _PyExitData *exit = &current_executor->exits[_target];
+        Py_INCREF(exit->executor);
+        tstate->previous_executor = (PyObject *)current_executor;
+        GOTO_TIER_TWO(exit->executor);
+    }
 }