GH-113710: Backedge counter improvements. (GH-115166)

Mark Shannon, 2024-02-13 14:16:37 +00:00, committed by GitHub
parent 7cce857622
commit f9f6156c5a
7 changed files with 81 additions and 55 deletions


@@ -71,6 +71,8 @@ typedef struct {
 PyAPI_FUNC(int) PyUnstable_Replace_Executor(PyCodeObject *code, _Py_CODEUNIT *instr, _PyExecutorObject *executor);
 
+_PyOptimizerObject *_Py_SetOptimizer(PyInterpreterState *interp, _PyOptimizerObject* optimizer);
+
 PyAPI_FUNC(void) PyUnstable_SetOptimizer(_PyOptimizerObject* optimizer);
 
 PyAPI_FUNC(_PyOptimizerObject *) PyUnstable_GetOptimizer(void);
@@ -80,8 +82,6 @@ PyAPI_FUNC(_PyExecutorObject *) PyUnstable_GetExecutor(PyCodeObject *code, int o
 int
 _PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject **stack_pointer);
 
-extern _PyOptimizerObject _PyOptimizer_Default;
-
 void _Py_ExecutorInit(_PyExecutorObject *, _PyBloomFilter *);
 void _Py_ExecutorClear(_PyExecutorObject *);
 void _Py_BloomFilter_Init(_PyBloomFilter *);
@@ -96,7 +96,11 @@ PyAPI_FUNC(PyObject *)PyUnstable_Optimizer_NewUOpOptimizer(void);
 #define OPTIMIZER_BITS_IN_COUNTER 4
 /* Minimum of 16 additional executions before retry */
-#define MINIMUM_TIER2_BACKOFF 4
+#define MIN_TIER2_BACKOFF 4
+#define MAX_TIER2_BACKOFF (15 - OPTIMIZER_BITS_IN_COUNTER)
+#define OPTIMIZER_BITS_MASK ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1)
+/* A value <= UINT16_MAX but large enough that when shifted is > UINT16_MAX */
+#define OPTIMIZER_UNREACHABLE_THRESHOLD UINT16_MAX
 
 #define _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS 3
 #define _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS 6
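For background: these macros describe a 16-bit counter word whose low OPTIMIZER_BITS_IN_COUNTER bits hold an exponential-backoff exponent and whose remaining high bits hold the execution counter that JUMP_BACKWARD increments. A minimal standalone sketch of that packing (not from the commit; macro values as above):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define OPTIMIZER_BITS_IN_COUNTER 4
    #define OPTIMIZER_BITS_MASK ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1)

    int main(void)
    {
        /* Pack a counter value and a backoff exponent into one 16-bit word,
         * the layout used by JUMP_BACKWARD's cache entry. */
        uint16_t counter = 3;
        int backoff = 5;
        uint16_t cache = (uint16_t)((counter << OPTIMIZER_BITS_IN_COUNTER) | backoff);
        assert((cache & OPTIMIZER_BITS_MASK) == backoff);        /* low 4 bits */
        assert((cache >> OPTIMIZER_BITS_IN_COUNTER) == counter); /* high 12 bits */
        printf("cache = 0x%04x\n", cache);
        return 0;
    }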


@@ -239,8 +239,10 @@ struct _is {
     struct callable_cache callable_cache;
     _PyOptimizerObject *optimizer;
     _PyExecutorObject *executor_list_head;
-    uint16_t optimizer_resume_threshold;
-    uint16_t optimizer_backedge_threshold;
+    /* These values are shifted and offset to speed up check in JUMP_BACKWARD */
+    uint32_t optimizer_resume_threshold;
+    uint32_t optimizer_backedge_threshold;
     uint32_t next_func_version;
     _rare_events rare_events;
     PyDict_WatchCallback builtins_dict_watcher;
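The point of widening these fields to uint32_t and storing them pre-shifted (by OPTIMIZER_BITS_IN_COUNTER) and pre-offset (by 1 << 15, via the shift_and_offset_threshold() helper added later in this diff) is that the hot JUMP_BACKWARD path reduces to one XOR and one unsigned compare against the raw cache word. A rough model of that fast path (a sketch, not code from the tree):

    #include <stdint.h>

    /* `cache` is the raw 16-bit counter word (backoff bits included);
     * `threshold` is the pre-shifted, pre-offset value stored on the
     * interpreter state. */
    static int reached_threshold(uint16_t cache, uint32_t threshold)
    {
        uint32_t offset_counter = cache ^ (1 << 15); /* signed order, compared unsigned */
        return offset_counter >= threshold;
    }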


@@ -2318,13 +2318,16 @@ dummy_func(
             assert(oparg <= INSTR_OFFSET());
             JUMPBY(-oparg);
             #if ENABLE_SPECIALIZATION
-            this_instr[1].cache += (1 << OPTIMIZER_BITS_IN_COUNTER);
-            /* We are using unsigned values, but we really want signed values, so
-             * do the 2s complement comparison manually */
-            uint16_t ucounter = this_instr[1].cache + (1 << 15);
-            uint16_t threshold = tstate->interp->optimizer_backedge_threshold + (1 << 15);
-            // Double-check that the opcode isn't instrumented or something:
-            if (ucounter > threshold && this_instr->op.code == JUMP_BACKWARD) {
+            uint16_t counter = this_instr[1].cache;
+            this_instr[1].cache = counter + (1 << OPTIMIZER_BITS_IN_COUNTER);
+            /* We are using unsigned values, but we really want signed values, so
+             * do the 2s complement adjustment manually */
+            uint32_t offset_counter = counter ^ (1 << 15);
+            uint32_t threshold = tstate->interp->optimizer_backedge_threshold;
+            assert((threshold & OPTIMIZER_BITS_MASK) == 0);
+            // Use '>=' not '>' so that the optimizer/backoff bits do not effect the result.
+            // Double-check that the opcode isn't instrumented or something:
+            if (offset_counter >= threshold && this_instr->op.code == JUMP_BACKWARD) {
                 OPT_STAT_INC(attempts);
                 _Py_CODEUNIT *start = this_instr;
                 /* Back up over EXTENDED_ARGs so optimizer sees the whole instruction */
@@ -2338,18 +2341,18 @@ dummy_func(
                     // Rewind and enter the executor:
                     assert(start->op.code == ENTER_EXECUTOR);
                     next_instr = start;
-                    this_instr[1].cache &= ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1);
+                    this_instr[1].cache &= OPTIMIZER_BITS_MASK;
                 }
                 else {
-                    int backoff = this_instr[1].cache & ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1);
-                    if (backoff < MINIMUM_TIER2_BACKOFF) {
-                        backoff = MINIMUM_TIER2_BACKOFF;
+                    int backoff = this_instr[1].cache & OPTIMIZER_BITS_MASK;
+                    backoff++;
+                    if (backoff < MIN_TIER2_BACKOFF) {
+                        backoff = MIN_TIER2_BACKOFF;
                     }
-                    else if (backoff < 15 - OPTIMIZER_BITS_IN_COUNTER) {
-                        backoff++;
+                    else if (backoff > MAX_TIER2_BACKOFF) {
+                        backoff = MAX_TIER2_BACKOFF;
                     }
-                    assert(backoff <= 15 - OPTIMIZER_BITS_IN_COUNTER);
-                    this_instr[1].cache = ((1 << 16) - ((1 << OPTIMIZER_BITS_IN_COUNTER) << backoff)) | backoff;
+                    this_instr[1].cache = ((UINT16_MAX << OPTIMIZER_BITS_IN_COUNTER) << backoff) | backoff;
                 }
             }
             #endif /* ENABLE_SPECIALIZATION */
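Stripped of diff markers, the new counter discipline can be read as three small steps. This is a reconstruction for readability, not code from the tree (the function names are invented):

    #include <stdint.h>

    #define OPTIMIZER_BITS_IN_COUNTER 4
    #define MIN_TIER2_BACKOFF 4
    #define MAX_TIER2_BACKOFF (15 - OPTIMIZER_BITS_IN_COUNTER)
    #define OPTIMIZER_BITS_MASK ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1)

    /* Step 1: every backward jump bumps the counter (in the high bits) and
     * asks whether the pre-shifted, pre-offset threshold has been crossed. */
    static int counter_triggers(uint16_t *cache, uint32_t threshold)
    {
        uint16_t counter = *cache;
        *cache = counter + (1 << OPTIMIZER_BITS_IN_COUNTER);
        return (uint32_t)(counter ^ (1 << 15)) >= threshold;
    }

    /* Step 2a: optimization succeeded -- zero the counter, keep the backoff. */
    static void on_optimized(uint16_t *cache)
    {
        *cache &= OPTIMIZER_BITS_MASK;
    }

    /* Step 2b: optimization failed -- wait roughly twice as long next time. */
    static void on_failed_attempt(uint16_t *cache)
    {
        int backoff = *cache & OPTIMIZER_BITS_MASK;
        backoff++;
        if (backoff < MIN_TIER2_BACKOFF) {
            backoff = MIN_TIER2_BACKOFF;
        }
        else if (backoff > MAX_TIER2_BACKOFF) {
            backoff = MAX_TIER2_BACKOFF;
        }
        /* Truncation to 16 bits re-seeds the counter roughly 2**backoff
         * scaled increments below the trigger point, so at least that many
         * more executions happen before the next attempt. */
        *cache = (uint16_t)(((UINT16_MAX << OPTIMIZER_BITS_IN_COUNTER) << backoff) | backoff);
    }

Note the behavioral fix folded in here: the old code stopped incrementing backoff once it reached 15 - OPTIMIZER_BITS_IN_COUNTER, whereas the new code increments first and then clamps to [MIN_TIER2_BACKOFF, MAX_TIER2_BACKOFF].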


@@ -3263,13 +3263,16 @@
             assert(oparg <= INSTR_OFFSET());
             JUMPBY(-oparg);
             #if ENABLE_SPECIALIZATION
-            this_instr[1].cache += (1 << OPTIMIZER_BITS_IN_COUNTER);
-            /* We are using unsigned values, but we really want signed values, so
-             * do the 2s complement comparison manually */
-            uint16_t ucounter = this_instr[1].cache + (1 << 15);
-            uint16_t threshold = tstate->interp->optimizer_backedge_threshold + (1 << 15);
-            // Double-check that the opcode isn't instrumented or something:
-            if (ucounter > threshold && this_instr->op.code == JUMP_BACKWARD) {
+            uint16_t counter = this_instr[1].cache;
+            this_instr[1].cache = counter + (1 << OPTIMIZER_BITS_IN_COUNTER);
+            /* We are using unsigned values, but we really want signed values, so
+             * do the 2s complement adjustment manually */
+            uint32_t offset_counter = counter ^ (1 << 15);
+            uint32_t threshold = tstate->interp->optimizer_backedge_threshold;
+            assert((threshold & OPTIMIZER_BITS_MASK) == 0);
+            // Use '>=' not '>' so that the optimizer/backoff bits do not effect the result.
+            // Double-check that the opcode isn't instrumented or something:
+            if (offset_counter >= threshold && this_instr->op.code == JUMP_BACKWARD) {
                 OPT_STAT_INC(attempts);
                 _Py_CODEUNIT *start = this_instr;
                 /* Back up over EXTENDED_ARGs so optimizer sees the whole instruction */
@@ -3283,18 +3286,18 @@
                     // Rewind and enter the executor:
                     assert(start->op.code == ENTER_EXECUTOR);
                     next_instr = start;
-                    this_instr[1].cache &= ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1);
+                    this_instr[1].cache &= OPTIMIZER_BITS_MASK;
                 }
                 else {
-                    int backoff = this_instr[1].cache & ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1);
-                    if (backoff < MINIMUM_TIER2_BACKOFF) {
-                        backoff = MINIMUM_TIER2_BACKOFF;
+                    int backoff = this_instr[1].cache & OPTIMIZER_BITS_MASK;
+                    backoff++;
+                    if (backoff < MIN_TIER2_BACKOFF) {
+                        backoff = MIN_TIER2_BACKOFF;
                     }
-                    else if (backoff < 15 - OPTIMIZER_BITS_IN_COUNTER) {
-                        backoff++;
+                    else if (backoff > MAX_TIER2_BACKOFF) {
+                        backoff = MAX_TIER2_BACKOFF;
                     }
-                    assert(backoff <= 15 - OPTIMIZER_BITS_IN_COUNTER);
-                    this_instr[1].cache = ((1 << 16) - ((1 << OPTIMIZER_BITS_IN_COUNTER) << backoff)) | backoff;
+                    this_instr[1].cache = ((UINT16_MAX << OPTIMIZER_BITS_IN_COUNTER) << backoff) | backoff;
                 }
             }
             #endif /* ENABLE_SPECIALIZATION */


@@ -109,6 +109,9 @@ never_optimize(
     _PyExecutorObject **exec,
     int Py_UNUSED(stack_entries))
 {
+    /* Although it should be benign for this to be called,
+     * it shouldn't happen, so fail in debug builds. */
+    assert(0 && "never optimize should never be called");
     return 0;
 }
@@ -120,13 +123,19 @@ PyTypeObject _PyDefaultOptimizer_Type = {
     .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
 };
 
-_PyOptimizerObject _PyOptimizer_Default = {
+static _PyOptimizerObject _PyOptimizer_Default = {
     PyObject_HEAD_INIT(&_PyDefaultOptimizer_Type)
     .optimize = never_optimize,
-    .resume_threshold = INT16_MAX,
-    .backedge_threshold = INT16_MAX,
+    .resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD,
+    .backedge_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD,
 };
 
+static uint32_t
+shift_and_offset_threshold(uint16_t threshold)
+{
+    return (threshold << OPTIMIZER_BITS_IN_COUNTER) + (1 << 15);
+}
+
 _PyOptimizerObject *
 PyUnstable_GetOptimizer(void)
 {
@@ -134,24 +143,33 @@ PyUnstable_GetOptimizer(void)
     if (interp->optimizer == &_PyOptimizer_Default) {
         return NULL;
     }
-    assert(interp->optimizer_backedge_threshold == interp->optimizer->backedge_threshold);
-    assert(interp->optimizer_resume_threshold == interp->optimizer->resume_threshold);
+    assert(interp->optimizer_backedge_threshold ==
+           shift_and_offset_threshold(interp->optimizer->backedge_threshold));
+    assert(interp->optimizer_resume_threshold ==
+           shift_and_offset_threshold(interp->optimizer->resume_threshold));
     Py_INCREF(interp->optimizer);
     return interp->optimizer;
 }
 
-void
-PyUnstable_SetOptimizer(_PyOptimizerObject *optimizer)
+_PyOptimizerObject *
+_Py_SetOptimizer(PyInterpreterState *interp, _PyOptimizerObject *optimizer)
 {
-    PyInterpreterState *interp = _PyInterpreterState_GET();
     if (optimizer == NULL) {
         optimizer = &_PyOptimizer_Default;
     }
     _PyOptimizerObject *old = interp->optimizer;
     Py_INCREF(optimizer);
     interp->optimizer = optimizer;
-    interp->optimizer_backedge_threshold = optimizer->backedge_threshold;
-    interp->optimizer_resume_threshold = optimizer->resume_threshold;
+    interp->optimizer_backedge_threshold = shift_and_offset_threshold(optimizer->backedge_threshold);
+    interp->optimizer_resume_threshold = shift_and_offset_threshold(optimizer->resume_threshold);
+    return old;
+}
+
+void
+PyUnstable_SetOptimizer(_PyOptimizerObject *optimizer)
+{
+    PyInterpreterState *interp = _PyInterpreterState_GET();
+    _PyOptimizerObject *old = _Py_SetOptimizer(interp, optimizer);
     Py_DECREF(old);
 }
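
To make the new encoding concrete, here is the arithmetic as a standalone check (an illustration, not part of the commit): a backedge_threshold of 16 is stored as (16 << 4) + (1 << 15) = 33024, while OPTIMIZER_UNREACHABLE_THRESHOLD stores as a value no 16-bit counter can reach.

    #include <assert.h>
    #include <stdint.h>

    #define OPTIMIZER_BITS_IN_COUNTER 4
    #define OPTIMIZER_UNREACHABLE_THRESHOLD UINT16_MAX

    /* Same arithmetic as the shift_and_offset_threshold() added above. */
    static uint32_t shift_and_offset(uint16_t threshold)
    {
        return ((uint32_t)threshold << OPTIMIZER_BITS_IN_COUNTER) + (1 << 15);
    }

    int main(void)
    {
        assert(shift_and_offset(16) == 33024);
        /* (0xFFFF << 4) + 0x8000 = 1081328 > UINT16_MAX, while
         * counter ^ (1 << 15) never exceeds UINT16_MAX, so a threshold of
         * OPTIMIZER_UNREACHABLE_THRESHOLD can never fire in JUMP_BACKWARD. */
        assert(shift_and_offset(OPTIMIZER_UNREACHABLE_THRESHOLD) > UINT16_MAX);
        return 0;
    }
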
@@ -860,10 +878,10 @@ PyUnstable_Optimizer_NewUOpOptimizer(void)
         return NULL;
     }
     opt->optimize = uop_optimize;
-    opt->resume_threshold = INT16_MAX;
-    // Need at least 3 iterations to settle specializations.
-    // A few lower bits of the counter are reserved for other flags.
-    opt->backedge_threshold = 16 << OPTIMIZER_BITS_IN_COUNTER;
+    opt->resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD;
+    // Need a few iterations to settle specializations,
+    // and to ammortize the cost of optimization.
+    opt->backedge_threshold = 16;
     return (PyObject *)opt;
 }
@@ -950,7 +968,7 @@ PyUnstable_Optimizer_NewCounter(void)
         return NULL;
     }
     opt->base.optimize = counter_optimize;
-    opt->base.resume_threshold = INT16_MAX;
+    opt->base.resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD;
     opt->base.backedge_threshold = 0;
     opt->count = 0;
     return (PyObject *)opt;


@@ -1627,8 +1627,8 @@ finalize_modules(PyThreadState *tstate)
     // Invalidate all executors and turn off tier 2 optimizer
     _Py_Executors_InvalidateAll(interp);
-    Py_XDECREF(interp->optimizer);
-    interp->optimizer = &_PyOptimizer_Default;
+    _PyOptimizerObject *old = _Py_SetOptimizer(interp, NULL);
+    Py_XDECREF(old);
 
     // Stop watching __builtin__ modifications
     PyDict_Unwatch(0, interp->builtins);
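All three lifecycle call sites (finalize, init, clear, below) now share one ownership rule: _Py_SetOptimizer() installs the new optimizer (the default when passed NULL) and returns the previous one with its reference transferred to the caller, who must release it. A toy refcount model of that contract (the types and names here are illustrative, not CPython's):

    #include <stddef.h>

    typedef struct { int refcount; } Optimizer;

    static Optimizer default_optimizer = { 1 };
    static Optimizer *current = NULL;

    static Optimizer *set_optimizer(Optimizer *opt)
    {
        if (opt == NULL) {
            opt = &default_optimizer;
        }
        opt->refcount++;              /* the interpreter now holds a reference */
        Optimizer *old = current;
        current = opt;
        return old;                   /* ownership of `old` passes to the caller */
    }

    int main(void)
    {
        Optimizer *old = set_optimizer(NULL);  /* like init_interpreter() */
        if (old != NULL) {                     /* NULL only on the first call */
            old->refcount--;
        }
        old = set_optimizer(NULL);             /* like interpreter_clear() */
        old->refcount--;                       /* caller must drop the old ref */
        return (old == &default_optimizer) ? 0 : 1;
    }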


@@ -625,9 +625,7 @@ init_interpreter(PyInterpreterState *interp,
     }
     interp->sys_profile_initialized = false;
     interp->sys_trace_initialized = false;
-    interp->optimizer = &_PyOptimizer_Default;
-    interp->optimizer_backedge_threshold = _PyOptimizer_Default.backedge_threshold;
-    interp->optimizer_resume_threshold = _PyOptimizer_Default.backedge_threshold;
+    (void)_Py_SetOptimizer(interp, NULL);
     interp->next_func_version = 1;
     interp->executor_list_head = NULL;
     if (interp != &runtime->_main_interpreter) {
@@ -780,10 +778,8 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate)
         tstate->_status.cleared = 0;
     }
 
-    Py_CLEAR(interp->optimizer);
-    interp->optimizer = &_PyOptimizer_Default;
-    interp->optimizer_backedge_threshold = _PyOptimizer_Default.backedge_threshold;
-    interp->optimizer_resume_threshold = _PyOptimizer_Default.backedge_threshold;
+    _PyOptimizerObject *old = _Py_SetOptimizer(interp, NULL);
+    Py_DECREF(old);
 
     /* It is possible that any of the objects below have a finalizer
        that runs Python code or otherwise relies on a thread state