mirror of https://github.com/python/cpython
GH-123516: Improve JIT memory consumption by invalidating cold executors (GH-124443)
Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
parent 23e812b84a
commit 65f1237098

@@ -283,6 +283,7 @@ PyAPI_FUNC(PyObject *) _PyEval_LoadName(PyThreadState *tstate, _PyInterpreterFra
#define _PY_GC_SCHEDULED_BIT (1U << 4)
#define _PY_EVAL_PLEASE_STOP_BIT (1U << 5)
#define _PY_EVAL_EXPLICIT_MERGE_BIT (1U << 6)
#define _PY_EVAL_JIT_INVALIDATE_COLD_BIT (1U << 7)

/* Reserve a few bits for future use */
#define _PY_EVAL_EVENTS_BITS 8

@@ -261,7 +261,7 @@ struct _is {
    struct callable_cache callable_cache;
    _PyOptimizerObject *optimizer;
    _PyExecutorObject *executor_list_head;
    size_t trace_run_counter;
    _rare_events rare_events;
    PyDict_WatchCallback builtins_dict_watcher;

@@ -29,9 +29,10 @@ typedef struct {
typedef struct {
    uint8_t opcode;
    uint8_t oparg;
    uint16_t valid:1;
    uint16_t linked:1;
    uint16_t chain_depth:14; // Must be big engough for MAX_CHAIN_DEPTH - 1.
    uint8_t valid:1;
    uint8_t linked:1;
    uint8_t chain_depth:6; // Must be big enough for MAX_CHAIN_DEPTH - 1.
    bool warm;
    int index; // Index of ENTER_EXECUTOR (if code isn't NULL, below).
    _PyBloomFilter bloom;
    _PyExecutorLinkListNode links;

@@ -123,11 +124,18 @@ PyAPI_FUNC(PyObject *) _PyOptimizer_NewUOpOptimizer(void);
#ifdef _Py_TIER2
PyAPI_FUNC(void) _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation);
PyAPI_FUNC(void) _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation);
PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyInterpreterState *interp);

#else
# define _Py_Executors_InvalidateDependency(A, B, C) ((void)0)
# define _Py_Executors_InvalidateAll(A, B) ((void)0)
# define _Py_Executors_InvalidateCold(A) ((void)0)

#endif

// Used as the threshold to trigger executor invalidation when
// trace_run_counter is greater than this value.
#define JIT_CLEANUP_THRESHOLD 100000

// This is the length of the trace we project initially.
#define UOP_MAX_TRACE_LENGTH 800

@@ -222,64 +222,65 @@ extern "C" {
#define _LOAD_SUPER_ATTR_METHOD LOAD_SUPER_ATTR_METHOD
#define _MAKE_CELL MAKE_CELL
#define _MAKE_FUNCTION MAKE_FUNCTION
#define _MAKE_WARM 439
#define _MAP_ADD MAP_ADD
#define _MATCH_CLASS MATCH_CLASS
#define _MATCH_KEYS MATCH_KEYS
#define _MATCH_MAPPING MATCH_MAPPING
#define _MATCH_SEQUENCE MATCH_SEQUENCE
#define _MAYBE_EXPAND_METHOD 439
#define _MONITOR_CALL 440
#define _MONITOR_JUMP_BACKWARD 441
#define _MONITOR_RESUME 442
#define _MAYBE_EXPAND_METHOD 440
#define _MONITOR_CALL 441
#define _MONITOR_JUMP_BACKWARD 442
#define _MONITOR_RESUME 443
#define _NOP NOP
#define _POP_EXCEPT POP_EXCEPT
#define _POP_JUMP_IF_FALSE 443
#define _POP_JUMP_IF_TRUE 444
#define _POP_JUMP_IF_FALSE 444
#define _POP_JUMP_IF_TRUE 445
#define _POP_TOP POP_TOP
#define _POP_TOP_LOAD_CONST_INLINE_BORROW 445
#define _POP_TOP_LOAD_CONST_INLINE_BORROW 446
#define _PUSH_EXC_INFO PUSH_EXC_INFO
#define _PUSH_FRAME 446
#define _PUSH_FRAME 447
#define _PUSH_NULL PUSH_NULL
#define _PY_FRAME_GENERAL 447
#define _PY_FRAME_KW 448
#define _QUICKEN_RESUME 449
#define _REPLACE_WITH_TRUE 450
#define _PY_FRAME_GENERAL 448
#define _PY_FRAME_KW 449
#define _QUICKEN_RESUME 450
#define _REPLACE_WITH_TRUE 451
#define _RESUME_CHECK RESUME_CHECK
#define _RETURN_GENERATOR RETURN_GENERATOR
#define _RETURN_VALUE RETURN_VALUE
#define _SAVE_RETURN_OFFSET 451
#define _SEND 452
#define _SEND_GEN_FRAME 453
#define _SAVE_RETURN_OFFSET 452
#define _SEND 453
#define _SEND_GEN_FRAME 454
#define _SETUP_ANNOTATIONS SETUP_ANNOTATIONS
#define _SET_ADD SET_ADD
#define _SET_FUNCTION_ATTRIBUTE SET_FUNCTION_ATTRIBUTE
#define _SET_UPDATE SET_UPDATE
#define _START_EXECUTOR 454
#define _STORE_ATTR 455
#define _STORE_ATTR_INSTANCE_VALUE 456
#define _STORE_ATTR_SLOT 457
#define _STORE_ATTR_WITH_HINT 458
#define _START_EXECUTOR 455
#define _STORE_ATTR 456
#define _STORE_ATTR_INSTANCE_VALUE 457
#define _STORE_ATTR_SLOT 458
#define _STORE_ATTR_WITH_HINT 459
#define _STORE_DEREF STORE_DEREF
#define _STORE_FAST 459
#define _STORE_FAST_0 460
#define _STORE_FAST_1 461
#define _STORE_FAST_2 462
#define _STORE_FAST_3 463
#define _STORE_FAST_4 464
#define _STORE_FAST_5 465
#define _STORE_FAST_6 466
#define _STORE_FAST_7 467
#define _STORE_FAST 460
#define _STORE_FAST_0 461
#define _STORE_FAST_1 462
#define _STORE_FAST_2 463
#define _STORE_FAST_3 464
#define _STORE_FAST_4 465
#define _STORE_FAST_5 466
#define _STORE_FAST_6 467
#define _STORE_FAST_7 468
#define _STORE_FAST_LOAD_FAST STORE_FAST_LOAD_FAST
#define _STORE_FAST_STORE_FAST STORE_FAST_STORE_FAST
#define _STORE_GLOBAL STORE_GLOBAL
#define _STORE_NAME STORE_NAME
#define _STORE_SLICE 468
#define _STORE_SUBSCR 469
#define _STORE_SLICE 469
#define _STORE_SUBSCR 470
#define _STORE_SUBSCR_DICT STORE_SUBSCR_DICT
#define _STORE_SUBSCR_LIST_INT STORE_SUBSCR_LIST_INT
#define _SWAP SWAP
#define _TIER2_RESUME_CHECK 470
#define _TO_BOOL 471
#define _TIER2_RESUME_CHECK 471
#define _TO_BOOL 472
#define _TO_BOOL_BOOL TO_BOOL_BOOL
#define _TO_BOOL_INT TO_BOOL_INT
#define _TO_BOOL_LIST TO_BOOL_LIST

@@ -289,14 +290,14 @@ extern "C" {
#define _UNARY_NEGATIVE UNARY_NEGATIVE
#define _UNARY_NOT UNARY_NOT
#define _UNPACK_EX UNPACK_EX
#define _UNPACK_SEQUENCE 472
#define _UNPACK_SEQUENCE 473
#define _UNPACK_SEQUENCE_LIST UNPACK_SEQUENCE_LIST
#define _UNPACK_SEQUENCE_TUPLE UNPACK_SEQUENCE_TUPLE
#define _UNPACK_SEQUENCE_TWO_TUPLE UNPACK_SEQUENCE_TWO_TUPLE
#define _WITH_EXCEPT_START WITH_EXCEPT_START
#define _YIELD_VALUE YIELD_VALUE
#define __DO_CALL_FUNCTION_EX _DO_CALL_FUNCTION_EX
#define MAX_UOP_ID 472
#define MAX_UOP_ID 473

#ifdef __cplusplus
}

@@ -274,6 +274,7 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = {
    [_INTERNAL_INCREMENT_OPT_COUNTER] = 0,
    [_DYNAMIC_EXIT] = HAS_ESCAPES_FLAG,
    [_START_EXECUTOR] = 0,
    [_MAKE_WARM] = 0,
    [_FATAL_ERROR] = 0,
    [_CHECK_VALIDITY_AND_SET_IP] = HAS_DEOPT_FLAG,
    [_DEOPT] = 0,

@@ -481,6 +482,7 @@ const char *const _PyOpcode_uop_name[MAX_UOP_ID+1] = {
    [_LOAD_SUPER_ATTR_METHOD] = "_LOAD_SUPER_ATTR_METHOD",
    [_MAKE_CELL] = "_MAKE_CELL",
    [_MAKE_FUNCTION] = "_MAKE_FUNCTION",
    [_MAKE_WARM] = "_MAKE_WARM",
    [_MAP_ADD] = "_MAP_ADD",
    [_MATCH_CLASS] = "_MATCH_CLASS",
    [_MATCH_KEYS] = "_MATCH_KEYS",

@@ -1062,6 +1064,8 @@ int _PyUop_num_popped(int opcode, int oparg)
        return 0;
    case _START_EXECUTOR:
        return 0;
    case _MAKE_WARM:
        return 0;
    case _FATAL_ERROR:
        return 0;
    case _CHECK_VALIDITY_AND_SET_IP:

@@ -0,0 +1,2 @@
Improved JIT memory consumption by periodically freeing memory used by infrequently-executed code.
This change is especially likely to improve the memory footprint of long-running programs.

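The scheme behind this NEWS entry is, in effect, a second-chance (clock-style) sweep: every trace run marks its executor warm and decrements a countdown, and when the countdown reaches zero, anything that was not marked since the previous sweep is freed and all marks are cleared for the next interval. The standalone C sketch below illustrates that policy on a plain linked list; `cache_entry`, `run_entry`, `sweep_cold`, and the threshold of 8 are illustrative stand-ins rather than CPython APIs, and the real sweep is deferred to a safe point instead of running inline (see the eval-breaker handling further down).

```c
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-in for an executor: anything owning memory that we would
 * like to reclaim once it stops being used. */
typedef struct cache_entry {
    int id;
    bool warm;                       /* set on every use, cleared by each sweep */
    struct cache_entry *next;
} cache_entry;

static cache_entry *head = NULL;
static long runs_until_sweep = 8;    /* plays the role of JIT_CLEANUP_THRESHOLD */

static cache_entry *add_entry(int id)
{
    cache_entry *e = malloc(sizeof(*e));
    if (e == NULL) {
        exit(EXIT_FAILURE);
    }
    e->id = id;
    e->warm = true;                  /* new entries start warm: survive one sweep */
    e->next = head;
    head = e;
    return e;
}

/* Free everything that was not used since the previous sweep, then clear the
 * marks so the next interval starts fresh. (The real code does this in two
 * passes; see the collect-then-clear sketch later on.) */
static void sweep_cold(void)
{
    cache_entry **link = &head;
    while (*link != NULL) {
        cache_entry *e = *link;
        if (!e->warm) {
            *link = e->next;
            printf("freeing cold entry %d\n", e->id);
            free(e);
        }
        else {
            e->warm = false;         /* second chance: must be used again to stay */
            link = &e->next;
        }
    }
}

/* Called on every use, like _MAKE_WARM at the start of a trace. */
static void run_entry(cache_entry *e)
{
    e->warm = true;
    if (--runs_until_sweep == 0) {
        sweep_cold();
        runs_until_sweep = 8;
    }
}

int main(void)
{
    cache_entry *hot = add_entry(1);
    add_entry(2);                    /* never run again: becomes cold */
    add_entry(3);                    /* never run again: becomes cold */
    for (int i = 0; i < 20; i++) {
        run_entry(hot);              /* only entry 1 keeps getting re-marked */
    }
    for (cache_entry *e = head; e != NULL; e = e->next) {
        printf("surviving entry %d\n", e->id);
    }
    return 0;
}
```

Because new entries start warm, nothing is reclaimed before it has survived at least one full interval, which mirrors the `executor->vm_data.warm = true` initialization added later in this diff.
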
@@ -4836,6 +4836,14 @@ dummy_func(
            assert(((_PyExecutorObject *)executor)->vm_data.valid);
        }

        tier2 op(_MAKE_WARM, (--)) {
            current_executor->vm_data.warm = true;
            // It's okay if this ends up going negative.
            if (--tstate->interp->trace_run_counter == 0) {
                _Py_set_eval_breaker_bit(tstate, _PY_EVAL_JIT_INVALIDATE_COLD_BIT);
            }
        }

        tier2 op(_FATAL_ERROR, (--)) {
            assert(0);
            Py_FatalError("Fatal error uop executed.");

@@ -1289,6 +1289,12 @@ _Py_HandlePending(PyThreadState *tstate)
        _Py_RunGC(tstate);
    }

    if ((breaker & _PY_EVAL_JIT_INVALIDATE_COLD_BIT) != 0) {
        _Py_unset_eval_breaker_bit(tstate, _PY_EVAL_JIT_INVALIDATE_COLD_BIT);
        _Py_Executors_InvalidateCold(tstate->interp);
        tstate->interp->trace_run_counter = JIT_CLEANUP_THRESHOLD;
    }

    /* GIL drop request */
    if ((breaker & _PY_GIL_DROP_REQUEST_BIT) != 0) {
        /* Give another thread a chance */

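Note the split in the two hunks above: the tier-2 uop only raises `_PY_EVAL_JIT_INVALIDATE_COLD_BIT`, while the expensive sweep runs later in `_Py_HandlePending`, where arbitrary work is safe. A minimal sketch of that defer-work-via-flag pattern, using C11 atomics and the hypothetical names `pending_sweep`, `hot_path`, and `safe_point`:

```c
#include <stdatomic.h>
#include <stdio.h>

/* One bit of pending work, in the spirit of an eval-breaker bit:
 * the hot path only sets it, a safe point consumes it. */
static atomic_bool pending_sweep = false;

/* Hot path: stays cheap, never performs the cleanup itself. */
static void hot_path(long *counter)
{
    if (--(*counter) == 0) {
        atomic_store(&pending_sweep, true);
    }
}

/* Safe point: a spot where running arbitrary bookkeeping is fine,
 * like the pending-work check at the top of the interpreter loop. */
static void safe_point(long *counter, long threshold)
{
    if (atomic_exchange(&pending_sweep, false)) {
        printf("running deferred sweep\n");   /* stand-in for the real cleanup */
        *counter = threshold;                 /* re-arm the countdown */
    }
}

int main(void)
{
    const long threshold = 5;
    long counter = threshold;
    for (int i = 0; i < 12; i++) {
        hot_path(&counter);
        safe_point(&counter, threshold);      /* prints twice over 12 iterations */
    }
    return 0;
}
```

The payoff is that the per-trace cost of `_MAKE_WARM` stays at a flag store plus a decrement, while the sweep itself runs where the interpreter already checks for pending work.
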
@@ -5435,6 +5435,15 @@
            break;
        }

        case _MAKE_WARM: {
            current_executor->vm_data.warm = true;
            // It's okay if this ends up going negative.
            if (--tstate->interp->trace_run_counter == 0) {
                _Py_set_eval_breaker_bit(tstate, _PY_EVAL_JIT_INVALIDATE_COLD_BIT);
            }
            break;
        }

        case _FATAL_ERROR: {
            assert(0);
            Py_FatalError("Fatal error uop executed.");

@@ -565,6 +565,7 @@ translate_bytecode_to_trace(
            code->co_firstlineno,
            2 * INSTR_IP(initial_instr, code));
    ADD_TO_TRACE(_START_EXECUTOR, 0, (uintptr_t)instr, INSTR_IP(instr, code));
    ADD_TO_TRACE(_MAKE_WARM, 0, 0, 0);
    uint32_t target = 0;

    for (;;) {

@@ -1194,6 +1195,9 @@ make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFil
    executor->jit_code = NULL;
    executor->jit_side_entry = NULL;
    executor->jit_size = 0;
    // This is initialized to true so we can prevent the executor
    // from being immediately detected as cold and invalidated.
    executor->vm_data.warm = true;
    if (_PyJIT_Compile(executor, executor->trace, length)) {
        Py_DECREF(executor);
        return NULL;

@@ -1659,4 +1663,42 @@ _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation)
    }
}

void
_Py_Executors_InvalidateCold(PyInterpreterState *interp)
{
    /* Walk the list of executors */
    /* TO DO -- Use a tree to avoid traversing as many objects */
    PyObject *invalidate = PyList_New(0);
    if (invalidate == NULL) {
        goto error;
    }

    /* Clearing an executor can deallocate others, so we need to make a list of
     * executors to invalidate first */
    for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) {
        assert(exec->vm_data.valid);
        _PyExecutorObject *next = exec->vm_data.links.next;

        if (!exec->vm_data.warm && PyList_Append(invalidate, (PyObject *)exec) < 0) {
            goto error;
        }
        else {
            exec->vm_data.warm = false;
        }

        exec = next;
    }
    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) {
        _PyExecutorObject *exec = (_PyExecutorObject *)PyList_GET_ITEM(invalidate, i);
        executor_clear(exec);
    }
    Py_DECREF(invalidate);
    return;
error:
    PyErr_Clear();
    Py_XDECREF(invalidate);
    // If we're truly out of memory, wiping out everything is a fine fallback
    _Py_Executors_InvalidateAll(interp, 0);
}

#endif /* _Py_TIER2 */

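As the comment in `_Py_Executors_InvalidateCold` explains, clearing one executor can deallocate others, so the cold executors are first collected into a Python list (which also keeps them alive) and only cleared once the traversal is done. A self-contained sketch of that collect-then-destroy pattern on an ordinary linked list, with the illustrative names `node` and `destroy_cold_nodes`:

```c
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct node {
    int id;
    bool cold;
    struct node *next;
} node;

/* Destroying a node may have side effects on the rest of the structure (in
 * CPython, executor_clear can deallocate other executors), so collect the
 * victims first and run the destructive step only after traversal finishes. */
static void destroy_cold_nodes(node **head)
{
    size_t total = 0, count = 0;
    for (node *p = *head; p != NULL; p = p->next) {
        total++;                     /* upper bound on how many we might collect */
    }
    node **victims = malloc(total * sizeof(*victims));
    if (victims == NULL) {
        return;                      /* the real code falls back to invalidating everything */
    }
    /* Pass 1: unlink the cold nodes and remember them. */
    for (node **link = head; *link != NULL;) {
        node *p = *link;
        if (p->cold) {
            *link = p->next;
            victims[count++] = p;
        }
        else {
            link = &p->next;
        }
    }
    /* Pass 2: now it is safe to run arbitrary teardown code. */
    for (size_t i = 0; i < count; i++) {
        printf("destroying node %d\n", victims[i]->id);
        free(victims[i]);
    }
    free(victims);
}

int main(void)
{
    node *head = NULL;
    for (int id = 1; id <= 4; id++) {
        node *p = malloc(sizeof(*p));
        if (p == NULL) {
            return 1;
        }
        p->id = id;
        p->cold = (id % 2 == 0);     /* mark the even ids cold */
        p->next = head;
        head = p;
    }
    destroy_cold_nodes(&head);       /* frees 2 and 4 */
    for (node *p = head; p != NULL; p = p->next) {
        printf("kept node %d\n", p->id);
    }
    while (head != NULL) {           /* tidy up the survivors */
        node *next = head->next;
        free(head);
        head = next;
    }
    return 0;
}
```
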
@@ -2381,6 +2381,10 @@
            break;
        }

        case _MAKE_WARM: {
            break;
        }

        case _FATAL_ERROR: {
            break;
        }

@@ -660,6 +660,7 @@ init_interpreter(PyInterpreterState *interp,
#ifdef _Py_TIER2
    (void)_Py_SetOptimizer(interp, NULL);
    interp->executor_list_head = NULL;
    interp->trace_run_counter = JIT_CLEANUP_THRESHOLD;
#endif
    if (interp != &runtime->_main_interpreter) {
        /* Fix the self-referential, statically initialized fields. */

@@ -540,6 +540,7 @@ NON_ESCAPING_FUNCTIONS = (
    "_PyList_FromStackRefSteal",
    "_PyTuple_FromArraySteal",
    "_PyTuple_FromStackRefSteal",
    "_Py_set_eval_breaker_bit"
)

ESCAPING_FUNCTIONS = (

@@ -139,6 +139,9 @@ class _Target(typing.Generic[_S, _R]):
            "-fno-plt",
            # Don't call stack-smashing canaries that we can't find or patch:
            "-fno-stack-protector",
            # On aarch64 Linux, intrinsics were being emitted and this flag
            # was required to disable them.
            "-mno-outline-atomics",
            "-std=c11",
            *self.args,
        ]