mirror of https://github.com/python/cpython
GH-118093: Handle some polymorphism before requiring progress in tier two (GH-122843)
This commit is contained in:
parent
503af8fe9a
commit
9621a7d017
|
@ -29,8 +29,9 @@ typedef struct {
|
||||||
typedef struct {
|
typedef struct {
|
||||||
uint8_t opcode;
|
uint8_t opcode;
|
||||||
uint8_t oparg;
|
uint8_t oparg;
|
||||||
uint8_t valid;
|
uint16_t valid:1;
|
||||||
uint8_t linked;
|
uint16_t linked:1;
|
||||||
|
uint16_t chain_depth:14; // Must be big engough for MAX_CHAIN_DEPTH - 1.
|
||||||
int index; // Index of ENTER_EXECUTOR (if code isn't NULL, below).
|
int index; // Index of ENTER_EXECUTOR (if code isn't NULL, below).
|
||||||
_PyBloomFilter bloom;
|
_PyBloomFilter bloom;
|
||||||
_PyExecutorLinkListNode links;
|
_PyExecutorLinkListNode links;
|
||||||
|
@ -83,7 +84,7 @@ typedef struct _PyOptimizerObject _PyOptimizerObject;
|
||||||
typedef int (*_Py_optimize_func)(
|
typedef int (*_Py_optimize_func)(
|
||||||
_PyOptimizerObject* self, struct _PyInterpreterFrame *frame,
|
_PyOptimizerObject* self, struct _PyInterpreterFrame *frame,
|
||||||
_Py_CODEUNIT *instr, _PyExecutorObject **exec_ptr,
|
_Py_CODEUNIT *instr, _PyExecutorObject **exec_ptr,
|
||||||
int curr_stackentries);
|
int curr_stackentries, bool progress_needed);
|
||||||
|
|
||||||
struct _PyOptimizerObject {
|
struct _PyOptimizerObject {
|
||||||
PyObject_HEAD
|
PyObject_HEAD
|
||||||
|
@ -182,6 +183,12 @@ static inline uint16_t uop_get_error_target(const _PyUOpInstruction *inst)
|
||||||
// Need extras for root frame and for overflow frame (see TRACE_STACK_PUSH())
|
// Need extras for root frame and for overflow frame (see TRACE_STACK_PUSH())
|
||||||
#define MAX_ABSTRACT_FRAME_DEPTH (TRACE_STACK_SIZE + 2)
|
#define MAX_ABSTRACT_FRAME_DEPTH (TRACE_STACK_SIZE + 2)
|
||||||
|
|
||||||
|
// The maximum number of side exits that we can take before requiring forward
|
||||||
|
// progress (and inserting a new ENTER_EXECUTOR instruction). In practice, this
|
||||||
|
// is the "maximum amount of polymorphism" that an isolated trace tree can
|
||||||
|
// handle before rejoining the rest of the program.
|
||||||
|
#define MAX_CHAIN_DEPTH 4
|
||||||
|
|
||||||
typedef struct _Py_UopsSymbol _Py_UopsSymbol;
|
typedef struct _Py_UopsSymbol _Py_UopsSymbol;
|
||||||
|
|
||||||
struct _Py_UOpsAbstractFrame {
|
struct _Py_UOpsAbstractFrame {
|
||||||
|
@ -257,7 +264,7 @@ extern int _Py_uop_frame_pop(_Py_UOpsContext *ctx);
|
||||||
|
|
||||||
PyAPI_FUNC(PyObject *) _Py_uop_symbols_test(PyObject *self, PyObject *ignored);
|
PyAPI_FUNC(PyObject *) _Py_uop_symbols_test(PyObject *self, PyObject *ignored);
|
||||||
|
|
||||||
PyAPI_FUNC(int) _PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, _PyStackRef *stack_pointer, _PyExecutorObject **exec_ptr);
|
PyAPI_FUNC(int) _PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, _PyStackRef *stack_pointer, _PyExecutorObject **exec_ptr, int chain_depth);
|
||||||
|
|
||||||
static inline int is_terminator(const _PyUOpInstruction *uop)
|
static inline int is_terminator(const _PyUOpInstruction *uop)
|
||||||
{
|
{
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
Improve the experimental JIT's handling of polymorphic code.
|
|
@ -2501,7 +2501,7 @@ dummy_func(
|
||||||
start--;
|
start--;
|
||||||
}
|
}
|
||||||
_PyExecutorObject *executor;
|
_PyExecutorObject *executor;
|
||||||
int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor);
|
int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor, 0);
|
||||||
ERROR_IF(optimized < 0, error);
|
ERROR_IF(optimized < 0, error);
|
||||||
if (optimized) {
|
if (optimized) {
|
||||||
assert(tstate->previous_executor == NULL);
|
assert(tstate->previous_executor == NULL);
|
||||||
|
@ -4543,7 +4543,8 @@ dummy_func(
|
||||||
Py_INCREF(executor);
|
Py_INCREF(executor);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor);
|
int chain_depth = current_executor->vm_data.chain_depth + 1;
|
||||||
|
int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, chain_depth);
|
||||||
if (optimized <= 0) {
|
if (optimized <= 0) {
|
||||||
exit->temperature = restart_backoff_counter(temperature);
|
exit->temperature = restart_backoff_counter(temperature);
|
||||||
if (optimized < 0) {
|
if (optimized < 0) {
|
||||||
|
@ -4626,7 +4627,7 @@ dummy_func(
|
||||||
exit->temperature = advance_backoff_counter(exit->temperature);
|
exit->temperature = advance_backoff_counter(exit->temperature);
|
||||||
GOTO_TIER_ONE(target);
|
GOTO_TIER_ONE(target);
|
||||||
}
|
}
|
||||||
int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor);
|
int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, 0);
|
||||||
if (optimized <= 0) {
|
if (optimized <= 0) {
|
||||||
exit->temperature = restart_backoff_counter(exit->temperature);
|
exit->temperature = restart_backoff_counter(exit->temperature);
|
||||||
if (optimized < 0) {
|
if (optimized < 0) {
|
||||||
|
|
|
@ -5015,7 +5015,8 @@
|
||||||
Py_INCREF(executor);
|
Py_INCREF(executor);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor);
|
int chain_depth = current_executor->vm_data.chain_depth + 1;
|
||||||
|
int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, chain_depth);
|
||||||
if (optimized <= 0) {
|
if (optimized <= 0) {
|
||||||
exit->temperature = restart_backoff_counter(temperature);
|
exit->temperature = restart_backoff_counter(temperature);
|
||||||
if (optimized < 0) {
|
if (optimized < 0) {
|
||||||
|
@ -5147,7 +5148,7 @@
|
||||||
exit->temperature = advance_backoff_counter(exit->temperature);
|
exit->temperature = advance_backoff_counter(exit->temperature);
|
||||||
GOTO_TIER_ONE(target);
|
GOTO_TIER_ONE(target);
|
||||||
}
|
}
|
||||||
int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor);
|
int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, 0);
|
||||||
if (optimized <= 0) {
|
if (optimized <= 0) {
|
||||||
exit->temperature = restart_backoff_counter(exit->temperature);
|
exit->temperature = restart_backoff_counter(exit->temperature);
|
||||||
if (optimized < 0) {
|
if (optimized < 0) {
|
||||||
|
|
|
@ -4259,7 +4259,7 @@
|
||||||
start--;
|
start--;
|
||||||
}
|
}
|
||||||
_PyExecutorObject *executor;
|
_PyExecutorObject *executor;
|
||||||
int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor);
|
int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor, 0);
|
||||||
if (optimized < 0) goto error;
|
if (optimized < 0) goto error;
|
||||||
if (optimized) {
|
if (optimized) {
|
||||||
assert(tstate->previous_executor == NULL);
|
assert(tstate->previous_executor == NULL);
|
||||||
|
|
|
@ -111,7 +111,8 @@ never_optimize(
|
||||||
_PyInterpreterFrame *frame,
|
_PyInterpreterFrame *frame,
|
||||||
_Py_CODEUNIT *instr,
|
_Py_CODEUNIT *instr,
|
||||||
_PyExecutorObject **exec,
|
_PyExecutorObject **exec,
|
||||||
int Py_UNUSED(stack_entries))
|
int Py_UNUSED(stack_entries),
|
||||||
|
bool Py_UNUSED(progress_needed))
|
||||||
{
|
{
|
||||||
// This may be called if the optimizer is reset
|
// This may be called if the optimizer is reset
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -176,32 +177,44 @@ _Py_SetTier2Optimizer(_PyOptimizerObject *optimizer)
|
||||||
int
|
int
|
||||||
_PyOptimizer_Optimize(
|
_PyOptimizer_Optimize(
|
||||||
_PyInterpreterFrame *frame, _Py_CODEUNIT *start,
|
_PyInterpreterFrame *frame, _Py_CODEUNIT *start,
|
||||||
_PyStackRef *stack_pointer, _PyExecutorObject **executor_ptr)
|
_PyStackRef *stack_pointer, _PyExecutorObject **executor_ptr, int chain_depth)
|
||||||
{
|
{
|
||||||
|
// The first executor in a chain and the MAX_CHAIN_DEPTH'th executor *must*
|
||||||
|
// make progress in order to avoid infinite loops or excessively-long
|
||||||
|
// side-exit chains. We can only insert the executor into the bytecode if
|
||||||
|
// this is true, since a deopt won't infinitely re-enter the executor:
|
||||||
|
chain_depth %= MAX_CHAIN_DEPTH;
|
||||||
|
bool progress_needed = chain_depth == 0;
|
||||||
PyCodeObject *code = _PyFrame_GetCode(frame);
|
PyCodeObject *code = _PyFrame_GetCode(frame);
|
||||||
assert(PyCode_Check(code));
|
assert(PyCode_Check(code));
|
||||||
PyInterpreterState *interp = _PyInterpreterState_GET();
|
PyInterpreterState *interp = _PyInterpreterState_GET();
|
||||||
if (!has_space_for_executor(code, start)) {
|
if (progress_needed && !has_space_for_executor(code, start)) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
_PyOptimizerObject *opt = interp->optimizer;
|
_PyOptimizerObject *opt = interp->optimizer;
|
||||||
int err = opt->optimize(opt, frame, start, executor_ptr, (int)(stack_pointer - _PyFrame_Stackbase(frame)));
|
int err = opt->optimize(opt, frame, start, executor_ptr, (int)(stack_pointer - _PyFrame_Stackbase(frame)), progress_needed);
|
||||||
if (err <= 0) {
|
if (err <= 0) {
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
assert(*executor_ptr != NULL);
|
assert(*executor_ptr != NULL);
|
||||||
int index = get_index_for_executor(code, start);
|
if (progress_needed) {
|
||||||
if (index < 0) {
|
int index = get_index_for_executor(code, start);
|
||||||
/* Out of memory. Don't raise and assume that the
|
if (index < 0) {
|
||||||
* error will show up elsewhere.
|
/* Out of memory. Don't raise and assume that the
|
||||||
*
|
* error will show up elsewhere.
|
||||||
* If an optimizer has already produced an executor,
|
*
|
||||||
* it might get confused by the executor disappearing,
|
* If an optimizer has already produced an executor,
|
||||||
* but there is not much we can do about that here. */
|
* it might get confused by the executor disappearing,
|
||||||
Py_DECREF(*executor_ptr);
|
* but there is not much we can do about that here. */
|
||||||
return 0;
|
Py_DECREF(*executor_ptr);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
insert_executor(code, start, index, *executor_ptr);
|
||||||
}
|
}
|
||||||
insert_executor(code, start, index, *executor_ptr);
|
else {
|
||||||
|
(*executor_ptr)->vm_data.code = NULL;
|
||||||
|
}
|
||||||
|
(*executor_ptr)->vm_data.chain_depth = chain_depth;
|
||||||
assert((*executor_ptr)->vm_data.valid);
|
assert((*executor_ptr)->vm_data.valid);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
@ -530,9 +543,9 @@ translate_bytecode_to_trace(
|
||||||
_Py_CODEUNIT *instr,
|
_Py_CODEUNIT *instr,
|
||||||
_PyUOpInstruction *trace,
|
_PyUOpInstruction *trace,
|
||||||
int buffer_size,
|
int buffer_size,
|
||||||
_PyBloomFilter *dependencies)
|
_PyBloomFilter *dependencies, bool progress_needed)
|
||||||
{
|
{
|
||||||
bool progress_needed = true;
|
bool first = true;
|
||||||
PyCodeObject *code = _PyFrame_GetCode(frame);
|
PyCodeObject *code = _PyFrame_GetCode(frame);
|
||||||
PyFunctionObject *func = (PyFunctionObject *)frame->f_funcobj;
|
PyFunctionObject *func = (PyFunctionObject *)frame->f_funcobj;
|
||||||
assert(PyFunction_Check(func));
|
assert(PyFunction_Check(func));
|
||||||
|
@ -576,7 +589,7 @@ translate_bytecode_to_trace(
|
||||||
uint32_t opcode = instr->op.code;
|
uint32_t opcode = instr->op.code;
|
||||||
uint32_t oparg = instr->op.arg;
|
uint32_t oparg = instr->op.arg;
|
||||||
|
|
||||||
if (!progress_needed && instr == initial_instr) {
|
if (!first && instr == initial_instr) {
|
||||||
// We have looped around to the start:
|
// We have looped around to the start:
|
||||||
RESERVE(1);
|
RESERVE(1);
|
||||||
ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0);
|
ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0);
|
||||||
|
@ -585,14 +598,6 @@ translate_bytecode_to_trace(
|
||||||
|
|
||||||
DPRINTF(2, "%d: %s(%d)\n", target, _PyOpcode_OpName[opcode], oparg);
|
DPRINTF(2, "%d: %s(%d)\n", target, _PyOpcode_OpName[opcode], oparg);
|
||||||
|
|
||||||
if (opcode == ENTER_EXECUTOR) {
|
|
||||||
assert(oparg < 256);
|
|
||||||
_PyExecutorObject *executor = code->co_executors->executors[oparg];
|
|
||||||
opcode = executor->vm_data.opcode;
|
|
||||||
DPRINTF(2, " * ENTER_EXECUTOR -> %s\n", _PyOpcode_OpName[opcode]);
|
|
||||||
oparg = executor->vm_data.oparg;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (opcode == EXTENDED_ARG) {
|
if (opcode == EXTENDED_ARG) {
|
||||||
instr++;
|
instr++;
|
||||||
opcode = instr->op.code;
|
opcode = instr->op.code;
|
||||||
|
@ -602,13 +607,27 @@ translate_bytecode_to_trace(
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (opcode == ENTER_EXECUTOR) {
|
||||||
|
// We have a couple of options here. We *could* peek "underneath"
|
||||||
|
// this executor and continue tracing, which could give us a longer,
|
||||||
|
// more optimizeable trace (at the expense of lots of duplicated
|
||||||
|
// tier two code). Instead, we choose to just end here and stitch to
|
||||||
|
// the other trace, which allows a side-exit traces to rejoin the
|
||||||
|
// "main" trace periodically (and also helps protect us against
|
||||||
|
// pathological behavior where the amount of tier two code explodes
|
||||||
|
// for a medium-length, branchy code path). This seems to work
|
||||||
|
// better in practice, but in the future we could be smarter about
|
||||||
|
// what we do here:
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
assert(opcode != ENTER_EXECUTOR && opcode != EXTENDED_ARG);
|
assert(opcode != ENTER_EXECUTOR && opcode != EXTENDED_ARG);
|
||||||
RESERVE_RAW(2, "_CHECK_VALIDITY_AND_SET_IP");
|
RESERVE_RAW(2, "_CHECK_VALIDITY_AND_SET_IP");
|
||||||
ADD_TO_TRACE(_CHECK_VALIDITY_AND_SET_IP, 0, (uintptr_t)instr, target);
|
ADD_TO_TRACE(_CHECK_VALIDITY_AND_SET_IP, 0, (uintptr_t)instr, target);
|
||||||
|
|
||||||
/* Special case the first instruction,
|
/* Special case the first instruction,
|
||||||
* so that we can guarantee forward progress */
|
* so that we can guarantee forward progress */
|
||||||
if (progress_needed) {
|
if (first && progress_needed) {
|
||||||
|
assert(first);
|
||||||
if (OPCODE_HAS_EXIT(opcode) || OPCODE_HAS_DEOPT(opcode)) {
|
if (OPCODE_HAS_EXIT(opcode) || OPCODE_HAS_DEOPT(opcode)) {
|
||||||
opcode = _PyOpcode_Deopt[opcode];
|
opcode = _PyOpcode_Deopt[opcode];
|
||||||
}
|
}
|
||||||
|
@ -903,7 +922,7 @@ translate_bytecode_to_trace(
|
||||||
}
|
}
|
||||||
top:
|
top:
|
||||||
// Jump here after _PUSH_FRAME or likely branches.
|
// Jump here after _PUSH_FRAME or likely branches.
|
||||||
progress_needed = false;
|
first = false;
|
||||||
} // End for (;;)
|
} // End for (;;)
|
||||||
|
|
||||||
done:
|
done:
|
||||||
|
@ -912,7 +931,7 @@ done:
|
||||||
}
|
}
|
||||||
assert(code == initial_code);
|
assert(code == initial_code);
|
||||||
// Skip short traces where we can't even translate a single instruction:
|
// Skip short traces where we can't even translate a single instruction:
|
||||||
if (progress_needed) {
|
if (first) {
|
||||||
OPT_STAT_INC(trace_too_short);
|
OPT_STAT_INC(trace_too_short);
|
||||||
DPRINTF(2,
|
DPRINTF(2,
|
||||||
"No trace for %s (%s:%d) at byte offset %d (no progress)\n",
|
"No trace for %s (%s:%d) at byte offset %d (no progress)\n",
|
||||||
|
@ -1225,13 +1244,14 @@ uop_optimize(
|
||||||
_PyInterpreterFrame *frame,
|
_PyInterpreterFrame *frame,
|
||||||
_Py_CODEUNIT *instr,
|
_Py_CODEUNIT *instr,
|
||||||
_PyExecutorObject **exec_ptr,
|
_PyExecutorObject **exec_ptr,
|
||||||
int curr_stackentries)
|
int curr_stackentries,
|
||||||
|
bool progress_needed)
|
||||||
{
|
{
|
||||||
_PyBloomFilter dependencies;
|
_PyBloomFilter dependencies;
|
||||||
_Py_BloomFilter_Init(&dependencies);
|
_Py_BloomFilter_Init(&dependencies);
|
||||||
_PyUOpInstruction buffer[UOP_MAX_TRACE_LENGTH];
|
_PyUOpInstruction buffer[UOP_MAX_TRACE_LENGTH];
|
||||||
OPT_STAT_INC(attempts);
|
OPT_STAT_INC(attempts);
|
||||||
int length = translate_bytecode_to_trace(frame, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies);
|
int length = translate_bytecode_to_trace(frame, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies, progress_needed);
|
||||||
if (length <= 0) {
|
if (length <= 0) {
|
||||||
// Error or nothing translated
|
// Error or nothing translated
|
||||||
return length;
|
return length;
|
||||||
|
@ -1328,7 +1348,8 @@ counter_optimize(
|
||||||
_PyInterpreterFrame *frame,
|
_PyInterpreterFrame *frame,
|
||||||
_Py_CODEUNIT *instr,
|
_Py_CODEUNIT *instr,
|
||||||
_PyExecutorObject **exec_ptr,
|
_PyExecutorObject **exec_ptr,
|
||||||
int Py_UNUSED(curr_stackentries)
|
int Py_UNUSED(curr_stackentries),
|
||||||
|
bool Py_UNUSED(progress_needed)
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
PyCodeObject *code = _PyFrame_GetCode(frame);
|
PyCodeObject *code = _PyFrame_GetCode(frame);
|
||||||
|
|
Loading…
Reference in New Issue