mirror of https://github.com/python/cpython
GH-113710: Backedge counter improvements. (GH-115166)
This commit is contained in:
parent 7cce857622
commit f9f6156c5a
@@ -71,6 +71,8 @@ typedef struct {
 
 PyAPI_FUNC(int) PyUnstable_Replace_Executor(PyCodeObject *code, _Py_CODEUNIT *instr, _PyExecutorObject *executor);
 
+_PyOptimizerObject *_Py_SetOptimizer(PyInterpreterState *interp, _PyOptimizerObject* optimizer);
+
 PyAPI_FUNC(void) PyUnstable_SetOptimizer(_PyOptimizerObject* optimizer);
 
 PyAPI_FUNC(_PyOptimizerObject *) PyUnstable_GetOptimizer(void);
@@ -80,8 +82,6 @@ PyAPI_FUNC(_PyExecutorObject *) PyUnstable_GetExecutor(PyCodeObject *code, int o
 int
 _PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject **stack_pointer);
 
-extern _PyOptimizerObject _PyOptimizer_Default;
-
 void _Py_ExecutorInit(_PyExecutorObject *, _PyBloomFilter *);
 void _Py_ExecutorClear(_PyExecutorObject *);
 void _Py_BloomFilter_Init(_PyBloomFilter *);
@@ -96,7 +96,11 @@ PyAPI_FUNC(PyObject *)PyUnstable_Optimizer_NewUOpOptimizer(void);
 
 #define OPTIMIZER_BITS_IN_COUNTER 4
 /* Minimum of 16 additional executions before retry */
-#define MINIMUM_TIER2_BACKOFF 4
+#define MIN_TIER2_BACKOFF 4
+#define MAX_TIER2_BACKOFF (15 - OPTIMIZER_BITS_IN_COUNTER)
+#define OPTIMIZER_BITS_MASK ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1)
+/* A value <= UINT16_MAX but large enough that when shifted is > UINT16_MAX */
+#define OPTIMIZER_UNREACHABLE_THRESHOLD UINT16_MAX
 
 #define _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS 3
 #define _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS 6
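
These macros describe the layout of each JUMP_BACKWARD's 16-bit inline cache entry: the execution counter lives in the upper 16 - OPTIMIZER_BITS_IN_COUNTER (= 12) bits and the backoff exponent in the low 4 bits. The JUMP_BACKWARD hunks further down compare `counter ^ (1 << 15)` against a pre-shifted threshold; XOR with the sign bit maps two's-complement order onto unsigned order, so one unsigned compare stands in for a signed compare. A minimal standalone sketch of that trick (not CPython code):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* XOR with the sign bit is an order-preserving map from signed
         * 16-bit values to unsigned 16-bit values ("offset binary"). */
        for (int32_t a = INT16_MIN; a <= INT16_MAX; a++) {
            uint16_t ua = (uint16_t)a ^ (1 << 15);
            assert(ua == (uint32_t)(a - INT16_MIN));  /* monotone in a */
        }
        return 0;
    }
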
@@ -239,8 +239,10 @@ struct _is {
     struct callable_cache callable_cache;
     _PyOptimizerObject *optimizer;
     _PyExecutorObject *executor_list_head;
-    uint16_t optimizer_resume_threshold;
-    uint16_t optimizer_backedge_threshold;
+    /* These values are shifted and offset to speed up check in JUMP_BACKWARD */
+    uint32_t optimizer_resume_threshold;
+    uint32_t optimizer_backedge_threshold;
+
     uint32_t next_func_version;
     _rare_events rare_events;
     PyDict_WatchCallback builtins_dict_watcher;
@@ -2318,13 +2318,16 @@ dummy_func(
             assert(oparg <= INSTR_OFFSET());
             JUMPBY(-oparg);
             #if ENABLE_SPECIALIZATION
-            this_instr[1].cache += (1 << OPTIMIZER_BITS_IN_COUNTER);
+            uint16_t counter = this_instr[1].cache;
+            this_instr[1].cache = counter + (1 << OPTIMIZER_BITS_IN_COUNTER);
             /* We are using unsigned values, but we really want signed values, so
-             * do the 2s complement comparison manually */
-            uint16_t ucounter = this_instr[1].cache + (1 << 15);
-            uint16_t threshold = tstate->interp->optimizer_backedge_threshold + (1 << 15);
+             * do the 2s complement adjustment manually */
+            uint32_t offset_counter = counter ^ (1 << 15);
+            uint32_t threshold = tstate->interp->optimizer_backedge_threshold;
+            assert((threshold & OPTIMIZER_BITS_MASK) == 0);
+            // Use '>=' not '>' so that the optimizer/backoff bits do not affect the result.
             // Double-check that the opcode isn't instrumented or something:
-            if (ucounter > threshold && this_instr->op.code == JUMP_BACKWARD) {
+            if (offset_counter >= threshold && this_instr->op.code == JUMP_BACKWARD) {
                 OPT_STAT_INC(attempts);
                 _Py_CODEUNIT *start = this_instr;
                 /* Back up over EXTENDED_ARGs so optimizer sees the whole instruction */
@@ -2338,18 +2341,18 @@ dummy_func(
                     // Rewind and enter the executor:
                     assert(start->op.code == ENTER_EXECUTOR);
                     next_instr = start;
-                    this_instr[1].cache &= ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1);
+                    this_instr[1].cache &= OPTIMIZER_BITS_MASK;
                 }
                 else {
-                    int backoff = this_instr[1].cache & ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1);
-                    if (backoff < MINIMUM_TIER2_BACKOFF) {
-                        backoff = MINIMUM_TIER2_BACKOFF;
+                    int backoff = this_instr[1].cache & OPTIMIZER_BITS_MASK;
+                    backoff++;
+                    if (backoff < MIN_TIER2_BACKOFF) {
+                        backoff = MIN_TIER2_BACKOFF;
                     }
-                    else if (backoff < 15 - OPTIMIZER_BITS_IN_COUNTER) {
-                        backoff++;
+                    else if (backoff > MAX_TIER2_BACKOFF) {
+                        backoff = MAX_TIER2_BACKOFF;
                     }
-                    assert(backoff <= 15 - OPTIMIZER_BITS_IN_COUNTER);
-                    this_instr[1].cache = ((UINT16_MAX << OPTIMIZER_BITS_IN_COUNTER) << backoff) | backoff;
+                    this_instr[1].cache = ((1 << 16) - ((1 << OPTIMIZER_BITS_IN_COUNTER) << backoff)) | backoff;
                 }
             }
             #endif /* ENABLE_SPECIALIZATION */
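
The re-arm value `((1 << 16) - ((1 << OPTIMIZER_BITS_IN_COUNTER) << backoff)) | backoff` parks the counter roughly `1 << backoff` increments below the wrap point while stashing the exponent in the low bits, so each failed optimization attempt roughly doubles the wait before the next one. A hypothetical standalone model of that behavior (the threshold value is an assumption, taken from the uop optimizer's 16 as shifted and offset by `_Py_SetOptimizer` below):

    #include <stdint.h>
    #include <stdio.h>

    #define OPTIMIZER_BITS_IN_COUNTER 4
    #define MIN_TIER2_BACKOFF 4
    #define MAX_TIER2_BACKOFF (15 - OPTIMIZER_BITS_IN_COUNTER)

    /* How many backward jumps run before the optimizer is retried, for
     * each backoff exponent.  Mirrors the counter logic in the hunk above. */
    static int runs_until_retry(int backoff, uint32_t threshold)
    {
        uint16_t cache = (uint16_t)(((1 << 16) -
            ((1 << OPTIMIZER_BITS_IN_COUNTER) << backoff)) | backoff);
        int runs = 0;
        for (;;) {
            uint16_t counter = cache;
            cache = counter + (1 << OPTIMIZER_BITS_IN_COUNTER);
            runs++;
            if ((uint32_t)(counter ^ (1 << 15)) >= threshold) {
                return runs;
            }
        }
    }

    int main(void)
    {
        /* Assumed threshold: the uop optimizer's 16, stored shifted and
         * offset the way _Py_SetOptimizer records it on the interpreter. */
        uint32_t threshold = (16 << OPTIMIZER_BITS_IN_COUNTER) + (1 << 15);
        for (int b = MIN_TIER2_BACKOFF; b <= MAX_TIER2_BACKOFF; b++) {
            printf("backoff %2d -> retry after %d runs\n",
                   b, runs_until_retry(b, threshold));
        }
        return 0;
    }
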
@@ -3263,13 +3263,16 @@
             assert(oparg <= INSTR_OFFSET());
             JUMPBY(-oparg);
             #if ENABLE_SPECIALIZATION
-            this_instr[1].cache += (1 << OPTIMIZER_BITS_IN_COUNTER);
+            uint16_t counter = this_instr[1].cache;
+            this_instr[1].cache = counter + (1 << OPTIMIZER_BITS_IN_COUNTER);
             /* We are using unsigned values, but we really want signed values, so
-             * do the 2s complement comparison manually */
-            uint16_t ucounter = this_instr[1].cache + (1 << 15);
-            uint16_t threshold = tstate->interp->optimizer_backedge_threshold + (1 << 15);
+             * do the 2s complement adjustment manually */
+            uint32_t offset_counter = counter ^ (1 << 15);
+            uint32_t threshold = tstate->interp->optimizer_backedge_threshold;
+            assert((threshold & OPTIMIZER_BITS_MASK) == 0);
+            // Use '>=' not '>' so that the optimizer/backoff bits do not affect the result.
             // Double-check that the opcode isn't instrumented or something:
-            if (ucounter > threshold && this_instr->op.code == JUMP_BACKWARD) {
+            if (offset_counter >= threshold && this_instr->op.code == JUMP_BACKWARD) {
                 OPT_STAT_INC(attempts);
                 _Py_CODEUNIT *start = this_instr;
                 /* Back up over EXTENDED_ARGs so optimizer sees the whole instruction */
@@ -3283,18 +3286,18 @@
                     // Rewind and enter the executor:
                     assert(start->op.code == ENTER_EXECUTOR);
                     next_instr = start;
-                    this_instr[1].cache &= ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1);
+                    this_instr[1].cache &= OPTIMIZER_BITS_MASK;
                 }
                 else {
-                    int backoff = this_instr[1].cache & ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1);
-                    if (backoff < MINIMUM_TIER2_BACKOFF) {
-                        backoff = MINIMUM_TIER2_BACKOFF;
+                    int backoff = this_instr[1].cache & OPTIMIZER_BITS_MASK;
+                    backoff++;
+                    if (backoff < MIN_TIER2_BACKOFF) {
+                        backoff = MIN_TIER2_BACKOFF;
                     }
-                    else if (backoff < 15 - OPTIMIZER_BITS_IN_COUNTER) {
-                        backoff++;
+                    else if (backoff > MAX_TIER2_BACKOFF) {
+                        backoff = MAX_TIER2_BACKOFF;
                     }
-                    assert(backoff <= 15 - OPTIMIZER_BITS_IN_COUNTER);
-                    this_instr[1].cache = ((UINT16_MAX << OPTIMIZER_BITS_IN_COUNTER) << backoff) | backoff;
+                    this_instr[1].cache = ((1 << 16) - ((1 << OPTIMIZER_BITS_IN_COUNTER) << backoff)) | backoff;
                 }
             }
             #endif /* ENABLE_SPECIALIZATION */
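
(The two hunks above repeat the bytecodes.c change: this is evidently the regenerated Python/generated_cases.c.h, which the cases generator produces from the dummy_func definition in Python/bytecodes.c, so the JUMP_BACKWARD change appears once in each file.)
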
@@ -109,6 +109,9 @@ never_optimize(
                _PyExecutorObject **exec,
                int Py_UNUSED(stack_entries))
 {
+    /* Although it should be benign for this to be called,
+     * it shouldn't happen, so fail in debug builds. */
+    assert(0 && "never optimize should never be called");
     return 0;
 }
 
@@ -120,13 +123,19 @@ PyTypeObject _PyDefaultOptimizer_Type = {
     .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
 };
 
-_PyOptimizerObject _PyOptimizer_Default = {
+static _PyOptimizerObject _PyOptimizer_Default = {
     PyObject_HEAD_INIT(&_PyDefaultOptimizer_Type)
     .optimize = never_optimize,
-    .resume_threshold = INT16_MAX,
-    .backedge_threshold = INT16_MAX,
+    .resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD,
+    .backedge_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD,
 };
 
+static uint32_t
+shift_and_offset_threshold(uint16_t threshold)
+{
+    return (threshold << OPTIMIZER_BITS_IN_COUNTER) + (1 << 15);
+}
+
 _PyOptimizerObject *
 PyUnstable_GetOptimizer(void)
 {
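
shift_and_offset_threshold centralizes the transformation the JUMP_BACKWARD comparison expects, and it explains the new OPTIMIZER_UNREACHABLE_THRESHOLD default: once shifted and offset, UINT16_MAX lands above anything a 16-bit counter can produce, so the default optimizer can never fire. A small standalone sketch of that invariant (not CPython code):

    #include <assert.h>
    #include <stdint.h>

    #define OPTIMIZER_BITS_IN_COUNTER 4

    static uint32_t
    shift_and_offset_threshold(uint16_t threshold)
    {
        return ((uint32_t)threshold << OPTIMIZER_BITS_IN_COUNTER) + (1 << 15);
    }

    int main(void)
    {
        /* offset_counter is (uint16_t)counter ^ (1 << 15), so it can
         * never exceed UINT16_MAX; the shifted default always does. */
        assert(shift_and_offset_threshold(UINT16_MAX) > UINT16_MAX);
        /* The uop optimizer's threshold of 16 stays reachable: */
        assert(shift_and_offset_threshold(16) <= UINT16_MAX);
        return 0;
    }
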
@@ -134,24 +143,33 @@ PyUnstable_GetOptimizer(void)
     if (interp->optimizer == &_PyOptimizer_Default) {
         return NULL;
     }
-    assert(interp->optimizer_backedge_threshold == interp->optimizer->backedge_threshold);
-    assert(interp->optimizer_resume_threshold == interp->optimizer->resume_threshold);
+    assert(interp->optimizer_backedge_threshold ==
+           shift_and_offset_threshold(interp->optimizer->backedge_threshold));
+    assert(interp->optimizer_resume_threshold ==
+           shift_and_offset_threshold(interp->optimizer->resume_threshold));
     Py_INCREF(interp->optimizer);
     return interp->optimizer;
 }
 
-void
-PyUnstable_SetOptimizer(_PyOptimizerObject *optimizer)
+_PyOptimizerObject *
+_Py_SetOptimizer(PyInterpreterState *interp, _PyOptimizerObject *optimizer)
 {
-    PyInterpreterState *interp = _PyInterpreterState_GET();
     if (optimizer == NULL) {
         optimizer = &_PyOptimizer_Default;
     }
     _PyOptimizerObject *old = interp->optimizer;
     Py_INCREF(optimizer);
     interp->optimizer = optimizer;
-    interp->optimizer_backedge_threshold = optimizer->backedge_threshold;
-    interp->optimizer_resume_threshold = optimizer->resume_threshold;
+    interp->optimizer_backedge_threshold = shift_and_offset_threshold(optimizer->backedge_threshold);
+    interp->optimizer_resume_threshold = shift_and_offset_threshold(optimizer->resume_threshold);
+    return old;
+}
+
+void
+PyUnstable_SetOptimizer(_PyOptimizerObject *optimizer)
+{
+    PyInterpreterState *interp = _PyInterpreterState_GET();
+    _PyOptimizerObject *old = _Py_SetOptimizer(interp, optimizer);
     Py_DECREF(old);
 }
 
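
Note the refactoring pattern here: the new internal _Py_SetOptimizer(interp, NULL) installs the (now static) default optimizer and hands back an owned reference to the old one, so each caller decides how to release it. PyUnstable_SetOptimizer keeps its public behavior by immediately Py_DECREF'ing the returned reference, and the finalization hunks below do the same with Py_XDECREF/Py_DECREF.
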
@@ -860,10 +878,10 @@ PyUnstable_Optimizer_NewUOpOptimizer(void)
         return NULL;
     }
     opt->optimize = uop_optimize;
-    opt->resume_threshold = INT16_MAX;
-    // Need at least 3 iterations to settle specializations.
-    // A few lower bits of the counter are reserved for other flags.
-    opt->backedge_threshold = 16 << OPTIMIZER_BITS_IN_COUNTER;
+    opt->resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD;
+    // Need a few iterations to settle specializations,
+    // and to amortize the cost of optimization.
+    opt->backedge_threshold = 16;
     return (PyObject *)opt;
 }
 
@@ -950,7 +968,7 @@ PyUnstable_Optimizer_NewCounter(void)
         return NULL;
     }
     opt->base.optimize = counter_optimize;
-    opt->base.resume_threshold = INT16_MAX;
+    opt->base.resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD;
     opt->base.backedge_threshold = 0;
     opt->count = 0;
     return (PyObject *)opt;
@@ -1627,8 +1627,8 @@ finalize_modules(PyThreadState *tstate)
 
     // Invalidate all executors and turn off tier 2 optimizer
     _Py_Executors_InvalidateAll(interp);
-    Py_XDECREF(interp->optimizer);
-    interp->optimizer = &_PyOptimizer_Default;
+    _PyOptimizerObject *old = _Py_SetOptimizer(interp, NULL);
+    Py_XDECREF(old);
 
     // Stop watching __builtin__ modifications
     PyDict_Unwatch(0, interp->builtins);
@@ -625,9 +625,7 @@ init_interpreter(PyInterpreterState *interp,
     }
     interp->sys_profile_initialized = false;
     interp->sys_trace_initialized = false;
-    interp->optimizer = &_PyOptimizer_Default;
-    interp->optimizer_backedge_threshold = _PyOptimizer_Default.backedge_threshold;
-    interp->optimizer_resume_threshold = _PyOptimizer_Default.backedge_threshold;
+    (void)_Py_SetOptimizer(interp, NULL);
     interp->next_func_version = 1;
     interp->executor_list_head = NULL;
     if (interp != &runtime->_main_interpreter) {
@@ -780,10 +778,8 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate)
         tstate->_status.cleared = 0;
     }
 
-    Py_CLEAR(interp->optimizer);
-    interp->optimizer = &_PyOptimizer_Default;
-    interp->optimizer_backedge_threshold = _PyOptimizer_Default.backedge_threshold;
-    interp->optimizer_resume_threshold = _PyOptimizer_Default.backedge_threshold;
+    _PyOptimizerObject *old = _Py_SetOptimizer(interp, NULL);
+    Py_DECREF(old);
 
     /* It is possible that any of the objects below have a finalizer
        that runs Python code or otherwise relies on a thread state