GH-113710: Backedge counter improvements. (GH-115166)

Mark Shannon 2024-02-13 14:16:37 +00:00 committed by GitHub
parent 7cce857622
commit f9f6156c5a
7 changed files with 81 additions and 55 deletions

Include/cpython/optimizer.h

@@ -71,6 +71,8 @@ typedef struct {
 PyAPI_FUNC(int) PyUnstable_Replace_Executor(PyCodeObject *code, _Py_CODEUNIT *instr, _PyExecutorObject *executor);
+_PyOptimizerObject *_Py_SetOptimizer(PyInterpreterState *interp, _PyOptimizerObject* optimizer);
 PyAPI_FUNC(void) PyUnstable_SetOptimizer(_PyOptimizerObject* optimizer);
 PyAPI_FUNC(_PyOptimizerObject *) PyUnstable_GetOptimizer(void);
@@ -80,8 +82,6 @@ PyAPI_FUNC(_PyExecutorObject *) PyUnstable_GetExecutor(PyCodeObject *code, int o
 int
 _PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject **stack_pointer);
-extern _PyOptimizerObject _PyOptimizer_Default;
 void _Py_ExecutorInit(_PyExecutorObject *, _PyBloomFilter *);
 void _Py_ExecutorClear(_PyExecutorObject *);
 void _Py_BloomFilter_Init(_PyBloomFilter *);
@@ -96,7 +96,11 @@ PyAPI_FUNC(PyObject *)PyUnstable_Optimizer_NewUOpOptimizer(void);
 #define OPTIMIZER_BITS_IN_COUNTER 4
 /* Minimum of 16 additional executions before retry */
-#define MINIMUM_TIER2_BACKOFF 4
+#define MIN_TIER2_BACKOFF 4
+#define MAX_TIER2_BACKOFF (15 - OPTIMIZER_BITS_IN_COUNTER)
+#define OPTIMIZER_BITS_MASK ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1)
+/* A value <= UINT16_MAX but large enough that when shifted is > UINT16_MAX */
+#define OPTIMIZER_UNREACHABLE_THRESHOLD UINT16_MAX
 #define _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS 3
 #define _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS 6
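
The two new limit constants encode invariants of the 16-bit counter layout used by JUMP_BACKWARD (low OPTIMIZER_BITS_IN_COUNTER bits: backoff exponent; upper 12 bits: backedge counter). A minimal standalone sketch of both, illustrative only and not part of the commit:

    #include <assert.h>
    #include <stdint.h>

    #define OPTIMIZER_BITS_IN_COUNTER 4

    int main(void)
    {
        /* 1. A threshold of UINT16_MAX (OPTIMIZER_UNREACHABLE_THRESHOLD),
         * once shifted past the backoff bits and offset by 1 << 15 as
         * _Py_SetOptimizer now stores it, exceeds anything the 16-bit
         * counter xor'ed with 1 << 15 can produce, so the JUMP_BACKWARD
         * test can never fire: "never optimize". */
        uint32_t stored =
            ((uint32_t)UINT16_MAX << OPTIMIZER_BITS_IN_COUNTER) + (1 << 15);
        assert(stored > (uint32_t)UINT16_MAX);

        /* 2. The backoff exponent is capped at 15 - 4 = 11 because the
         * reset value -(1 << backoff), kept in the 12 counter bits, wraps
         * to zero one shift later and the delay would be lost. */
        assert((uint16_t)(((uint32_t)UINT16_MAX << OPTIMIZER_BITS_IN_COUNTER) << 11) != 0);
        assert((uint16_t)(((uint32_t)UINT16_MAX << OPTIMIZER_BITS_IN_COUNTER) << 12) == 0);
        return 0;
    }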

Include/internal/pycore_interp.h

@@ -239,8 +239,10 @@ struct _is {
     struct callable_cache callable_cache;
     _PyOptimizerObject *optimizer;
     _PyExecutorObject *executor_list_head;
-    uint16_t optimizer_resume_threshold;
-    uint16_t optimizer_backedge_threshold;
+    /* These values are shifted and offset to speed up check in JUMP_BACKWARD */
+    uint32_t optimizer_resume_threshold;
+    uint32_t optimizer_backedge_threshold;
     uint32_t next_func_version;
     _rare_events rare_events;
     PyDict_WatchCallback builtins_dict_watcher;
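
The widening to uint32_t follows directly from the new storage scheme: a threshold pre-shifted by OPTIMIZER_BITS_IN_COUNTER and offset by 1 << 15 can need up to 21 bits. A one-assert sketch (illustrative, not commit code):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* Largest value these fields must now hold:
         * (UINT16_MAX << 4) + (1 << 15) = 1081328, a 21-bit number. */
        uint32_t max_stored = ((uint32_t)UINT16_MAX << 4) + (1u << 15);
        assert(max_stored == 1081328);
        assert(max_stored > UINT16_MAX);  /* would truncate in a uint16_t */
        return 0;
    }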

Python/bytecodes.c

@@ -2318,13 +2318,16 @@ dummy_func(
             assert(oparg <= INSTR_OFFSET());
             JUMPBY(-oparg);
             #if ENABLE_SPECIALIZATION
-            this_instr[1].cache += (1 << OPTIMIZER_BITS_IN_COUNTER);
+            uint16_t counter = this_instr[1].cache;
+            this_instr[1].cache = counter + (1 << OPTIMIZER_BITS_IN_COUNTER);
             /* We are using unsigned values, but we really want signed values, so
-             * do the 2s complement comparison manually */
-            uint16_t ucounter = this_instr[1].cache + (1 << 15);
-            uint16_t threshold = tstate->interp->optimizer_backedge_threshold + (1 << 15);
+             * do the 2s complement adjustment manually */
+            uint32_t offset_counter = counter ^ (1 << 15);
+            uint32_t threshold = tstate->interp->optimizer_backedge_threshold;
+            assert((threshold & OPTIMIZER_BITS_MASK) == 0);
+            // Use '>=' not '>' so that the optimizer/backoff bits do not effect the result.
             // Double-check that the opcode isn't instrumented or something:
-            if (ucounter > threshold && this_instr->op.code == JUMP_BACKWARD) {
+            if (offset_counter >= threshold && this_instr->op.code == JUMP_BACKWARD) {
                 OPT_STAT_INC(attempts);
                 _Py_CODEUNIT *start = this_instr;
                 /* Back up over EXTENDED_ARGs so optimizer sees the whole instruction */
@@ -2338,18 +2341,18 @@ dummy_func(
                     // Rewind and enter the executor:
                     assert(start->op.code == ENTER_EXECUTOR);
                     next_instr = start;
-                    this_instr[1].cache &= ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1);
+                    this_instr[1].cache &= OPTIMIZER_BITS_MASK;
                 }
                 else {
-                    int backoff = this_instr[1].cache & ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1);
-                    if (backoff < MINIMUM_TIER2_BACKOFF) {
-                        backoff = MINIMUM_TIER2_BACKOFF;
+                    int backoff = this_instr[1].cache & OPTIMIZER_BITS_MASK;
+                    backoff++;
+                    if (backoff < MIN_TIER2_BACKOFF) {
+                        backoff = MIN_TIER2_BACKOFF;
                     }
-                    else if (backoff < 15 - OPTIMIZER_BITS_IN_COUNTER) {
-                        backoff++;
+                    else if (backoff > MAX_TIER2_BACKOFF) {
+                        backoff = MAX_TIER2_BACKOFF;
                     }
-                    assert(backoff <= 15 - OPTIMIZER_BITS_IN_COUNTER);
-                    this_instr[1].cache = ((1 << 16) - ((1 << OPTIMIZER_BITS_IN_COUNTER) << backoff)) | backoff;
+                    this_instr[1].cache = ((UINT16_MAX << OPTIMIZER_BITS_IN_COUNTER) << backoff) | backoff;
                 }
             }
             #endif /* ENABLE_SPECIALIZATION */
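
The rewritten instruction keeps the hot path to one add, one xor and one compare, with all threshold preparation moved to _Py_SetOptimizer(). The simulation below (a self-contained sketch built from the constants above, not CPython source; the raw threshold of 16 matches PyUnstable_Optimizer_NewUOpOptimizer further down) prints how many backedges separate successive optimization attempts when every attempt fails:

    #include <stdint.h>
    #include <stdio.h>

    #define OPTIMIZER_BITS_IN_COUNTER 4
    #define MIN_TIER2_BACKOFF 4
    #define MAX_TIER2_BACKOFF (15 - OPTIMIZER_BITS_IN_COUNTER)
    #define OPTIMIZER_BITS_MASK ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1)

    int main(void)
    {
        uint16_t cache = 0;  /* fresh cache entry: counter 0, backoff 0 */
        /* Pre-shifted and offset, as _Py_SetOptimizer now does once. */
        uint32_t threshold =
            ((uint32_t)16 << OPTIMIZER_BITS_IN_COUNTER) + (1 << 15);
        long backedges = 0;

        for (int attempt = 1; attempt <= 5; attempt++) {
            for (;;) {
                backedges++;
                uint16_t counter = cache;
                cache = counter + (1 << OPTIMIZER_BITS_IN_COUNTER);
                uint32_t offset_counter = counter ^ (1 << 15);
                if (offset_counter >= threshold) {
                    break;  /* here the interpreter would call _PyOptimizer_Optimize */
                }
            }
            printf("attempt %d at backedge %ld\n", attempt, backedges);
            /* Pretend the attempt failed; apply the new backoff logic. */
            int backoff = (cache & OPTIMIZER_BITS_MASK) + 1;
            if (backoff < MIN_TIER2_BACKOFF) {
                backoff = MIN_TIER2_BACKOFF;
            }
            else if (backoff > MAX_TIER2_BACKOFF) {
                backoff = MAX_TIER2_BACKOFF;
            }
            cache = (uint16_t)((((uint32_t)UINT16_MAX << OPTIMIZER_BITS_IN_COUNTER) << backoff) | backoff);
        }
        return 0;
    }

The gap between attempts grows with each failure as the exponent climbs from MIN_TIER2_BACKOFF toward MAX_TIER2_BACKOFF, after which retries settle at the widest spacing the 16-bit counter can encode.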

Python/generated_cases.c.h (generated from Python/bytecodes.c)

@@ -3263,13 +3263,16 @@
             assert(oparg <= INSTR_OFFSET());
             JUMPBY(-oparg);
             #if ENABLE_SPECIALIZATION
-            this_instr[1].cache += (1 << OPTIMIZER_BITS_IN_COUNTER);
+            uint16_t counter = this_instr[1].cache;
+            this_instr[1].cache = counter + (1 << OPTIMIZER_BITS_IN_COUNTER);
             /* We are using unsigned values, but we really want signed values, so
-             * do the 2s complement comparison manually */
-            uint16_t ucounter = this_instr[1].cache + (1 << 15);
-            uint16_t threshold = tstate->interp->optimizer_backedge_threshold + (1 << 15);
+             * do the 2s complement adjustment manually */
+            uint32_t offset_counter = counter ^ (1 << 15);
+            uint32_t threshold = tstate->interp->optimizer_backedge_threshold;
+            assert((threshold & OPTIMIZER_BITS_MASK) == 0);
+            // Use '>=' not '>' so that the optimizer/backoff bits do not effect the result.
             // Double-check that the opcode isn't instrumented or something:
-            if (ucounter > threshold && this_instr->op.code == JUMP_BACKWARD) {
+            if (offset_counter >= threshold && this_instr->op.code == JUMP_BACKWARD) {
                 OPT_STAT_INC(attempts);
                 _Py_CODEUNIT *start = this_instr;
                 /* Back up over EXTENDED_ARGs so optimizer sees the whole instruction */
@@ -3283,18 +3286,18 @@
                     // Rewind and enter the executor:
                     assert(start->op.code == ENTER_EXECUTOR);
                     next_instr = start;
-                    this_instr[1].cache &= ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1);
+                    this_instr[1].cache &= OPTIMIZER_BITS_MASK;
                 }
                 else {
-                    int backoff = this_instr[1].cache & ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1);
-                    if (backoff < MINIMUM_TIER2_BACKOFF) {
-                        backoff = MINIMUM_TIER2_BACKOFF;
+                    int backoff = this_instr[1].cache & OPTIMIZER_BITS_MASK;
+                    backoff++;
+                    if (backoff < MIN_TIER2_BACKOFF) {
+                        backoff = MIN_TIER2_BACKOFF;
                     }
-                    else if (backoff < 15 - OPTIMIZER_BITS_IN_COUNTER) {
-                        backoff++;
+                    else if (backoff > MAX_TIER2_BACKOFF) {
+                        backoff = MAX_TIER2_BACKOFF;
                     }
-                    assert(backoff <= 15 - OPTIMIZER_BITS_IN_COUNTER);
-                    this_instr[1].cache = ((1 << 16) - ((1 << OPTIMIZER_BITS_IN_COUNTER) << backoff)) | backoff;
+                    this_instr[1].cache = ((UINT16_MAX << OPTIMIZER_BITS_IN_COUNTER) << backoff) | backoff;
                 }
             }
             #endif /* ENABLE_SPECIALIZATION */

Python/optimizer.c

@@ -109,6 +109,9 @@ never_optimize(
     _PyExecutorObject **exec,
     int Py_UNUSED(stack_entries))
 {
+    /* Although it should be benign for this to be called,
+     * it shouldn't happen, so fail in debug builds. */
+    assert(0 && "never optimize should never be called");
     return 0;
 }
@@ -120,13 +123,19 @@ PyTypeObject _PyDefaultOptimizer_Type = {
     .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
 };
-_PyOptimizerObject _PyOptimizer_Default = {
+static _PyOptimizerObject _PyOptimizer_Default = {
     PyObject_HEAD_INIT(&_PyDefaultOptimizer_Type)
     .optimize = never_optimize,
-    .resume_threshold = INT16_MAX,
-    .backedge_threshold = INT16_MAX,
+    .resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD,
+    .backedge_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD,
 };
+static uint32_t
+shift_and_offset_threshold(uint16_t threshold)
+{
+    return (threshold << OPTIMIZER_BITS_IN_COUNTER) + (1 << 15);
+}
 _PyOptimizerObject *
 PyUnstable_GetOptimizer(void)
 {
@@ -134,24 +143,33 @@ PyUnstable_GetOptimizer(void)
     if (interp->optimizer == &_PyOptimizer_Default) {
         return NULL;
     }
-    assert(interp->optimizer_backedge_threshold == interp->optimizer->backedge_threshold);
-    assert(interp->optimizer_resume_threshold == interp->optimizer->resume_threshold);
+    assert(interp->optimizer_backedge_threshold ==
+           shift_and_offset_threshold(interp->optimizer->backedge_threshold));
+    assert(interp->optimizer_resume_threshold ==
+           shift_and_offset_threshold(interp->optimizer->resume_threshold));
     Py_INCREF(interp->optimizer);
     return interp->optimizer;
 }
-void
-PyUnstable_SetOptimizer(_PyOptimizerObject *optimizer)
+_PyOptimizerObject *
+_Py_SetOptimizer(PyInterpreterState *interp, _PyOptimizerObject *optimizer)
 {
-    PyInterpreterState *interp = _PyInterpreterState_GET();
     if (optimizer == NULL) {
         optimizer = &_PyOptimizer_Default;
     }
     _PyOptimizerObject *old = interp->optimizer;
     Py_INCREF(optimizer);
     interp->optimizer = optimizer;
-    interp->optimizer_backedge_threshold = optimizer->backedge_threshold;
-    interp->optimizer_resume_threshold = optimizer->resume_threshold;
+    interp->optimizer_backedge_threshold = shift_and_offset_threshold(optimizer->backedge_threshold);
+    interp->optimizer_resume_threshold = shift_and_offset_threshold(optimizer->resume_threshold);
+    return old;
+}
+
+void
+PyUnstable_SetOptimizer(_PyOptimizerObject *optimizer)
+{
+    PyInterpreterState *interp = _PyInterpreterState_GET();
+    _PyOptimizerObject *old = _Py_SetOptimizer(interp, optimizer);
     Py_DECREF(old);
 }
@@ -860,10 +878,10 @@ PyUnstable_Optimizer_NewUOpOptimizer(void)
         return NULL;
     }
     opt->optimize = uop_optimize;
-    opt->resume_threshold = INT16_MAX;
-    // Need at least 3 iterations to settle specializations.
-    // A few lower bits of the counter are reserved for other flags.
-    opt->backedge_threshold = 16 << OPTIMIZER_BITS_IN_COUNTER;
+    opt->resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD;
+    // Need a few iterations to settle specializations,
+    // and to ammortize the cost of optimization.
+    opt->backedge_threshold = 16;
     return (PyObject *)opt;
 }
@@ -950,7 +968,7 @@ PyUnstable_Optimizer_NewCounter(void)
         return NULL;
     }
     opt->base.optimize = counter_optimize;
-    opt->base.resume_threshold = INT16_MAX;
+    opt->base.resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD;
     opt->base.backedge_threshold = 0;
     opt->count = 0;
     return (PyObject *)opt;
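
shift_and_offset_threshold() is the other half of the JUMP_BACKWARD change: adding 1 << 15 bakes the old per-backedge two's-complement offset into the stored threshold once, and shifting by OPTIMIZER_BITS_IN_COUNTER aligns it with the counter field so the backoff bits drop out of the comparison (hence the new assert that the threshold's low bits are zero). For backedge_threshold = 16 the stored value is (16 << 4) + (1 << 15) = 33024. A brute-force sketch of the equivalence, illustrative only and assuming two's-complement int16_t:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint16_t t = 16;  /* the optimizer's raw backedge_threshold */
        uint32_t stored = ((uint32_t)t << 4) + (1 << 15);
        assert(stored == 33024);

        for (uint32_t c = 0; c <= UINT16_MAX; c++) {
            /* The one-compare check JUMP_BACKWARD now performs... */
            int fast = (c ^ (1u << 15)) >= stored;
            /* ...equals the signed comparison the old code built out of
             * two additions per backedge (two's complement assumed). */
            int slow = (int16_t)c >= (int16_t)(t << 4);
            assert(fast == slow);
        }
        return 0;
    }

Folding the bookkeeping into _Py_SetOptimizer() also retires the duplicated inline updates at the call sites; notice that the old copies in Python/pystate.c below assigned _PyOptimizer_Default.backedge_threshold to both threshold fields.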

Python/pylifecycle.c

@@ -1627,8 +1627,8 @@ finalize_modules(PyThreadState *tstate)
     // Invalidate all executors and turn off tier 2 optimizer
     _Py_Executors_InvalidateAll(interp);
-    Py_XDECREF(interp->optimizer);
-    interp->optimizer = &_PyOptimizer_Default;
+    _PyOptimizerObject *old = _Py_SetOptimizer(interp, NULL);
+    Py_XDECREF(old);
     // Stop watching __builtin__ modifications
     PyDict_Unwatch(0, interp->builtins);

Python/pystate.c

@@ -625,9 +625,7 @@ init_interpreter(PyInterpreterState *interp,
     }
     interp->sys_profile_initialized = false;
     interp->sys_trace_initialized = false;
-    interp->optimizer = &_PyOptimizer_Default;
-    interp->optimizer_backedge_threshold = _PyOptimizer_Default.backedge_threshold;
-    interp->optimizer_resume_threshold = _PyOptimizer_Default.backedge_threshold;
+    (void)_Py_SetOptimizer(interp, NULL);
     interp->next_func_version = 1;
     interp->executor_list_head = NULL;
     if (interp != &runtime->_main_interpreter) {
@@ -780,10 +778,8 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate)
         tstate->_status.cleared = 0;
     }
-    Py_CLEAR(interp->optimizer);
-    interp->optimizer = &_PyOptimizer_Default;
-    interp->optimizer_backedge_threshold = _PyOptimizer_Default.backedge_threshold;
-    interp->optimizer_resume_threshold = _PyOptimizer_Default.backedge_threshold;
+    _PyOptimizerObject *old = _Py_SetOptimizer(interp, NULL);
+    Py_DECREF(old);
     /* It is possible that any of the objects below have a finalizer
        that runs Python code or otherwise relies on a thread state