GH-113710: Add a "globals to constants" pass (GH-114592)

Converts specializations of `LOAD_GLOBAL` into constants during tier 2 optimization.
This commit is contained in:
Mark Shannon 2024-02-02 12:14:34 +00:00 committed by GitHub
parent 2091fb2a85
commit 0e71a295e9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 375 additions and 55 deletions

View File

@ -17,6 +17,9 @@ typedef struct {
/* Dictionary version: globally unique, value change each time /* Dictionary version: globally unique, value change each time
the dictionary is modified */ the dictionary is modified */
#ifdef Py_BUILD_CORE #ifdef Py_BUILD_CORE
/* Bits 0-7 are for dict watchers.
* Bits 8-11 are for the watched mutation counter (used by tier2 optimization)
* The remaining bits (12-63) are the actual version tag. */
uint64_t ma_version_tag; uint64_t ma_version_tag;
#else #else
Py_DEPRECATED(3.12) uint64_t ma_version_tag; Py_DEPRECATED(3.12) uint64_t ma_version_tag;

View File

@ -47,7 +47,10 @@ typedef struct _PyExecutorObject {
typedef struct _PyOptimizerObject _PyOptimizerObject; typedef struct _PyOptimizerObject _PyOptimizerObject;
/* Should return > 0 if a new executor is created. 0 if no executor is produced and < 0 if an error occurred. */ /* Should return > 0 if a new executor is created. 0 if no executor is produced and < 0 if an error occurred. */
typedef int (*optimize_func)(_PyOptimizerObject* self, PyCodeObject *code, _Py_CODEUNIT *instr, _PyExecutorObject **, int curr_stackentries); typedef int (*optimize_func)(
_PyOptimizerObject* self, struct _PyInterpreterFrame *frame,
_Py_CODEUNIT *instr, _PyExecutorObject **exec_ptr,
int curr_stackentries);
typedef struct _PyOptimizerObject { typedef struct _PyOptimizerObject {
PyObject_HEAD PyObject_HEAD
@ -94,6 +97,9 @@ PyAPI_FUNC(PyObject *)PyUnstable_Optimizer_NewUOpOptimizer(void);
/* Minimum of 16 additional executions before retry */ /* Minimum of 16 additional executions before retry */
#define MINIMUM_TIER2_BACKOFF 4 #define MINIMUM_TIER2_BACKOFF 4
#define _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS 3
#define _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS 6
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -207,8 +207,8 @@ static inline PyDictUnicodeEntry* DK_UNICODE_ENTRIES(PyDictKeysObject *dk) {
#define DK_IS_UNICODE(dk) ((dk)->dk_kind != DICT_KEYS_GENERAL) #define DK_IS_UNICODE(dk) ((dk)->dk_kind != DICT_KEYS_GENERAL)
#define DICT_VERSION_INCREMENT (1 << DICT_MAX_WATCHERS) #define DICT_VERSION_INCREMENT (1 << (DICT_MAX_WATCHERS + DICT_WATCHED_MUTATION_BITS))
#define DICT_VERSION_MASK (DICT_VERSION_INCREMENT - 1) #define DICT_WATCHER_MASK ((1 << DICT_MAX_WATCHERS) - 1)
#ifdef Py_GIL_DISABLED #ifdef Py_GIL_DISABLED
#define DICT_NEXT_VERSION(INTERP) \ #define DICT_NEXT_VERSION(INTERP) \
@ -234,7 +234,7 @@ _PyDict_NotifyEvent(PyInterpreterState *interp,
PyObject *value) PyObject *value)
{ {
assert(Py_REFCNT((PyObject*)mp) > 0); assert(Py_REFCNT((PyObject*)mp) > 0);
int watcher_bits = mp->ma_version_tag & DICT_VERSION_MASK; int watcher_bits = mp->ma_version_tag & DICT_WATCHER_MASK;
if (watcher_bits) { if (watcher_bits) {
_PyDict_SendEvent(watcher_bits, event, mp, key, value); _PyDict_SendEvent(watcher_bits, event, mp, key, value);
return DICT_NEXT_VERSION(interp) | watcher_bits; return DICT_NEXT_VERSION(interp) | watcher_bits;

View File

@ -9,6 +9,7 @@ extern "C" {
#endif #endif
#define DICT_MAX_WATCHERS 8 #define DICT_MAX_WATCHERS 8
#define DICT_WATCHED_MUTATION_BITS 4
struct _Py_dict_state { struct _Py_dict_state {
/*Global counter used to set ma_version_tag field of dictionary. /*Global counter used to set ma_version_tag field of dictionary.

View File

@ -72,7 +72,6 @@ typedef struct _rare_events {
uint8_t set_eval_frame_func; uint8_t set_eval_frame_func;
/* Modifying the builtins, __builtins__.__dict__[var] = ... */ /* Modifying the builtins, __builtins__.__dict__[var] = ... */
uint8_t builtin_dict; uint8_t builtin_dict;
int builtins_dict_watcher_id;
/* Modifying a function, e.g. func.__defaults__ = ..., etc. */ /* Modifying a function, e.g. func.__defaults__ = ..., etc. */
uint8_t func_modification; uint8_t func_modification;
} _rare_events; } _rare_events;
@ -243,6 +242,7 @@ struct _is {
uint16_t optimizer_backedge_threshold; uint16_t optimizer_backedge_threshold;
uint32_t next_func_version; uint32_t next_func_version;
_rare_events rare_events; _rare_events rare_events;
PyDict_WatchCallback builtins_dict_watcher;
_Py_GlobalMonitors monitors; _Py_GlobalMonitors monitors;
bool sys_profile_initialized; bool sys_profile_initialized;

View File

@ -8,8 +8,9 @@ extern "C" {
# error "this header requires Py_BUILD_CORE define" # error "this header requires Py_BUILD_CORE define"
#endif #endif
int _Py_uop_analyze_and_optimize(PyCodeObject *code, int _Py_uop_analyze_and_optimize(_PyInterpreterFrame *frame,
_PyUOpInstruction *trace, int trace_len, int curr_stackentries); _PyUOpInstruction *trace, int trace_len, int curr_stackentries,
_PyBloomFilter *dependencies);
extern PyTypeObject _PyCounterExecutor_Type; extern PyTypeObject _PyCounterExecutor_Type;
extern PyTypeObject _PyCounterOptimizer_Type; extern PyTypeObject _PyCounterOptimizer_Type;

View File

@ -232,8 +232,12 @@ extern "C" {
#define _CHECK_VALIDITY 379 #define _CHECK_VALIDITY 379
#define _LOAD_CONST_INLINE 380 #define _LOAD_CONST_INLINE 380
#define _LOAD_CONST_INLINE_BORROW 381 #define _LOAD_CONST_INLINE_BORROW 381
#define _INTERNAL_INCREMENT_OPT_COUNTER 382 #define _LOAD_CONST_INLINE_WITH_NULL 382
#define MAX_UOP_ID 382 #define _LOAD_CONST_INLINE_BORROW_WITH_NULL 383
#define _CHECK_GLOBALS 384
#define _CHECK_BUILTINS 385
#define _INTERNAL_INCREMENT_OPT_COUNTER 386
#define MAX_UOP_ID 386
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -204,6 +204,10 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = {
[_CHECK_VALIDITY] = HAS_DEOPT_FLAG, [_CHECK_VALIDITY] = HAS_DEOPT_FLAG,
[_LOAD_CONST_INLINE] = 0, [_LOAD_CONST_INLINE] = 0,
[_LOAD_CONST_INLINE_BORROW] = 0, [_LOAD_CONST_INLINE_BORROW] = 0,
[_LOAD_CONST_INLINE_WITH_NULL] = 0,
[_LOAD_CONST_INLINE_BORROW_WITH_NULL] = 0,
[_CHECK_GLOBALS] = HAS_DEOPT_FLAG,
[_CHECK_BUILTINS] = HAS_DEOPT_FLAG,
[_INTERNAL_INCREMENT_OPT_COUNTER] = 0, [_INTERNAL_INCREMENT_OPT_COUNTER] = 0,
}; };
@ -250,10 +254,12 @@ const char *const _PyOpcode_uop_name[MAX_UOP_ID+1] = {
[_CHECK_ATTR_METHOD_LAZY_DICT] = "_CHECK_ATTR_METHOD_LAZY_DICT", [_CHECK_ATTR_METHOD_LAZY_DICT] = "_CHECK_ATTR_METHOD_LAZY_DICT",
[_CHECK_ATTR_MODULE] = "_CHECK_ATTR_MODULE", [_CHECK_ATTR_MODULE] = "_CHECK_ATTR_MODULE",
[_CHECK_ATTR_WITH_HINT] = "_CHECK_ATTR_WITH_HINT", [_CHECK_ATTR_WITH_HINT] = "_CHECK_ATTR_WITH_HINT",
[_CHECK_BUILTINS] = "_CHECK_BUILTINS",
[_CHECK_CALL_BOUND_METHOD_EXACT_ARGS] = "_CHECK_CALL_BOUND_METHOD_EXACT_ARGS", [_CHECK_CALL_BOUND_METHOD_EXACT_ARGS] = "_CHECK_CALL_BOUND_METHOD_EXACT_ARGS",
[_CHECK_EG_MATCH] = "_CHECK_EG_MATCH", [_CHECK_EG_MATCH] = "_CHECK_EG_MATCH",
[_CHECK_EXC_MATCH] = "_CHECK_EXC_MATCH", [_CHECK_EXC_MATCH] = "_CHECK_EXC_MATCH",
[_CHECK_FUNCTION_EXACT_ARGS] = "_CHECK_FUNCTION_EXACT_ARGS", [_CHECK_FUNCTION_EXACT_ARGS] = "_CHECK_FUNCTION_EXACT_ARGS",
[_CHECK_GLOBALS] = "_CHECK_GLOBALS",
[_CHECK_MANAGED_OBJECT_HAS_VALUES] = "_CHECK_MANAGED_OBJECT_HAS_VALUES", [_CHECK_MANAGED_OBJECT_HAS_VALUES] = "_CHECK_MANAGED_OBJECT_HAS_VALUES",
[_CHECK_PEP_523] = "_CHECK_PEP_523", [_CHECK_PEP_523] = "_CHECK_PEP_523",
[_CHECK_STACK_SPACE] = "_CHECK_STACK_SPACE", [_CHECK_STACK_SPACE] = "_CHECK_STACK_SPACE",
@ -332,6 +338,8 @@ const char *const _PyOpcode_uop_name[MAX_UOP_ID+1] = {
[_LOAD_CONST] = "_LOAD_CONST", [_LOAD_CONST] = "_LOAD_CONST",
[_LOAD_CONST_INLINE] = "_LOAD_CONST_INLINE", [_LOAD_CONST_INLINE] = "_LOAD_CONST_INLINE",
[_LOAD_CONST_INLINE_BORROW] = "_LOAD_CONST_INLINE_BORROW", [_LOAD_CONST_INLINE_BORROW] = "_LOAD_CONST_INLINE_BORROW",
[_LOAD_CONST_INLINE_BORROW_WITH_NULL] = "_LOAD_CONST_INLINE_BORROW_WITH_NULL",
[_LOAD_CONST_INLINE_WITH_NULL] = "_LOAD_CONST_INLINE_WITH_NULL",
[_LOAD_DEREF] = "_LOAD_DEREF", [_LOAD_DEREF] = "_LOAD_DEREF",
[_LOAD_FAST] = "_LOAD_FAST", [_LOAD_FAST] = "_LOAD_FAST",
[_LOAD_FAST_AND_CLEAR] = "_LOAD_FAST_AND_CLEAR", [_LOAD_FAST_AND_CLEAR] = "_LOAD_FAST_AND_CLEAR",

View File

@ -151,8 +151,8 @@ class TestDictWatchers(unittest.TestCase):
def test_watch_unassigned_watcher_id(self): def test_watch_unassigned_watcher_id(self):
d = {} d = {}
with self.assertRaisesRegex(ValueError, r"No dict watcher set for ID 1"): with self.assertRaisesRegex(ValueError, r"No dict watcher set for ID 3"):
self.watch(1, d) self.watch(3, d)
def test_unwatch_non_dict(self): def test_unwatch_non_dict(self):
with self.watcher() as wid: with self.watcher() as wid:
@ -168,8 +168,8 @@ class TestDictWatchers(unittest.TestCase):
def test_unwatch_unassigned_watcher_id(self): def test_unwatch_unassigned_watcher_id(self):
d = {} d = {}
with self.assertRaisesRegex(ValueError, r"No dict watcher set for ID 1"): with self.assertRaisesRegex(ValueError, r"No dict watcher set for ID 3"):
self.unwatch(1, d) self.unwatch(3, d)
def test_clear_out_of_range_watcher_id(self): def test_clear_out_of_range_watcher_id(self):
with self.assertRaisesRegex(ValueError, r"Invalid dict watcher ID -1"): with self.assertRaisesRegex(ValueError, r"Invalid dict watcher ID -1"):
@ -178,8 +178,8 @@ class TestDictWatchers(unittest.TestCase):
self.clear_watcher(8) # DICT_MAX_WATCHERS = 8 self.clear_watcher(8) # DICT_MAX_WATCHERS = 8
def test_clear_unassigned_watcher_id(self): def test_clear_unassigned_watcher_id(self):
with self.assertRaisesRegex(ValueError, r"No dict watcher set for ID 1"): with self.assertRaisesRegex(ValueError, r"No dict watcher set for ID 3"):
self.clear_watcher(1) self.clear_watcher(3)
class TestTypeWatchers(unittest.TestCase): class TestTypeWatchers(unittest.TestCase):

View File

@ -15,8 +15,8 @@ module _testcapi
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6361033e795369fc]*/ /*[clinic end generated code: output=da39a3ee5e6b4b0d input=6361033e795369fc]*/
// Test dict watching // Test dict watching
static PyObject *g_dict_watch_events; static PyObject *g_dict_watch_events = NULL;
static int g_dict_watchers_installed; static int g_dict_watchers_installed = 0;
static int static int
dict_watch_callback(PyDict_WatchEvent event, dict_watch_callback(PyDict_WatchEvent event,

View File

@ -5943,7 +5943,8 @@ PyDict_AddWatcher(PyDict_WatchCallback callback)
{ {
PyInterpreterState *interp = _PyInterpreterState_GET(); PyInterpreterState *interp = _PyInterpreterState_GET();
for (int i = 0; i < DICT_MAX_WATCHERS; i++) { /* Start at 2, as 0 and 1 are reserved for CPython */
for (int i = 2; i < DICT_MAX_WATCHERS; i++) {
if (!interp->dict_state.watchers[i]) { if (!interp->dict_state.watchers[i]) {
interp->dict_state.watchers[i] = callback; interp->dict_state.watchers[i] = callback;
return i; return i;

View File

@ -4071,13 +4071,37 @@ dummy_func(
} }
op(_LOAD_CONST_INLINE, (ptr/4 -- value)) { op(_LOAD_CONST_INLINE, (ptr/4 -- value)) {
TIER_TWO_ONLY
value = Py_NewRef(ptr); value = Py_NewRef(ptr);
} }
op(_LOAD_CONST_INLINE_BORROW, (ptr/4 -- value)) { op(_LOAD_CONST_INLINE_BORROW, (ptr/4 -- value)) {
TIER_TWO_ONLY
value = ptr; value = ptr;
} }
/* Pushes a new reference to the inline operand `ptr` (taken with
 * Py_NewRef), followed by a NULL entry. */
op(_LOAD_CONST_INLINE_WITH_NULL, (ptr/4 -- value, null)) {
TIER_TWO_ONLY
value = Py_NewRef(ptr);
null = NULL;
}
/* Pushes the inline operand `ptr` itself, without creating a new
 * reference, followed by a NULL entry. */
op(_LOAD_CONST_INLINE_BORROW_WITH_NULL, (ptr/4 -- value, null)) {
TIER_TWO_ONLY
value = ptr;
null = NULL;
}
/* Guard: deoptimizes unless GLOBALS() is the very dict object recorded
 * in the operand (pointer identity, no stack effect). */
op(_CHECK_GLOBALS, (dict/4 -- )) {
TIER_TWO_ONLY
DEOPT_IF(GLOBALS() != dict);
}
/* Guard: deoptimizes unless BUILTINS() is the very dict object recorded
 * in the operand (pointer identity, no stack effect). */
op(_CHECK_BUILTINS, (dict/4 -- )) {
TIER_TWO_ONLY
DEOPT_IF(BUILTINS() != dict);
}
/* Internal -- for testing executors */ /* Internal -- for testing executors */
op(_INTERNAL_INCREMENT_OPT_COUNTER, (opt --)) { op(_INTERNAL_INCREMENT_OPT_COUNTER, (opt --)) {
_PyCounterOptimizerObject *exe = (_PyCounterOptimizerObject *)opt; _PyCounterOptimizerObject *exe = (_PyCounterOptimizerObject *)opt;

View File

@ -3393,6 +3393,7 @@
case _LOAD_CONST_INLINE: { case _LOAD_CONST_INLINE: {
PyObject *value; PyObject *value;
PyObject *ptr = (PyObject *)CURRENT_OPERAND(); PyObject *ptr = (PyObject *)CURRENT_OPERAND();
TIER_TWO_ONLY
value = Py_NewRef(ptr); value = Py_NewRef(ptr);
stack_pointer[0] = value; stack_pointer[0] = value;
stack_pointer += 1; stack_pointer += 1;
@ -3402,12 +3403,53 @@
case _LOAD_CONST_INLINE_BORROW: { case _LOAD_CONST_INLINE_BORROW: {
PyObject *value; PyObject *value;
PyObject *ptr = (PyObject *)CURRENT_OPERAND(); PyObject *ptr = (PyObject *)CURRENT_OPERAND();
TIER_TWO_ONLY
value = ptr; value = ptr;
stack_pointer[0] = value; stack_pointer[0] = value;
stack_pointer += 1; stack_pointer += 1;
break; break;
} }
case _LOAD_CONST_INLINE_WITH_NULL: {
PyObject *value;
PyObject *null;
PyObject *ptr = (PyObject *)CURRENT_OPERAND();
TIER_TWO_ONLY
value = Py_NewRef(ptr);
null = NULL;
stack_pointer[0] = value;
stack_pointer[1] = null;
stack_pointer += 2;
break;
}
case _LOAD_CONST_INLINE_BORROW_WITH_NULL: {
PyObject *value;
PyObject *null;
PyObject *ptr = (PyObject *)CURRENT_OPERAND();
TIER_TWO_ONLY
value = ptr;
null = NULL;
stack_pointer[0] = value;
stack_pointer[1] = null;
stack_pointer += 2;
break;
}
case _CHECK_GLOBALS: {
PyObject *dict = (PyObject *)CURRENT_OPERAND();
TIER_TWO_ONLY
if (GLOBALS() != dict) goto deoptimize;
break;
}
case _CHECK_BUILTINS: {
PyObject *dict = (PyObject *)CURRENT_OPERAND();
TIER_TWO_ONLY
if (BUILTINS() != dict) goto deoptimize;
break;
}
case _INTERNAL_INCREMENT_OPT_COUNTER: { case _INTERNAL_INCREMENT_OPT_COUNTER: {
PyObject *opt; PyObject *opt;
opt = stack_pointer[-1]; opt = stack_pointer[-1];

View File

@ -108,16 +108,14 @@ PyUnstable_Replace_Executor(PyCodeObject *code, _Py_CODEUNIT *instr, _PyExecutor
} }
static int static int
error_optimize( never_optimize(
_PyOptimizerObject* self, _PyOptimizerObject* self,
PyCodeObject *code, _PyInterpreterFrame *frame,
_Py_CODEUNIT *instr, _Py_CODEUNIT *instr,
_PyExecutorObject **exec, _PyExecutorObject **exec,
int Py_UNUSED(stack_entries)) int Py_UNUSED(stack_entries))
{ {
assert(0); return 0;
PyErr_Format(PyExc_SystemError, "Should never call error_optimize");
return -1;
} }
PyTypeObject _PyDefaultOptimizer_Type = { PyTypeObject _PyDefaultOptimizer_Type = {
@ -130,7 +128,7 @@ PyTypeObject _PyDefaultOptimizer_Type = {
_PyOptimizerObject _PyOptimizer_Default = { _PyOptimizerObject _PyOptimizer_Default = {
PyObject_HEAD_INIT(&_PyDefaultOptimizer_Type) PyObject_HEAD_INIT(&_PyDefaultOptimizer_Type)
.optimize = error_optimize, .optimize = never_optimize,
.resume_threshold = INT16_MAX, .resume_threshold = INT16_MAX,
.backedge_threshold = INT16_MAX, .backedge_threshold = INT16_MAX,
}; };
@ -174,7 +172,7 @@ _PyOptimizer_Optimize(_PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject
} }
_PyOptimizerObject *opt = interp->optimizer; _PyOptimizerObject *opt = interp->optimizer;
_PyExecutorObject *executor = NULL; _PyExecutorObject *executor = NULL;
int err = opt->optimize(opt, code, start, &executor, (int)(stack_pointer - _PyFrame_Stackbase(frame))); int err = opt->optimize(opt, frame, start, &executor, (int)(stack_pointer - _PyFrame_Stackbase(frame)));
if (err <= 0) { if (err <= 0) {
assert(executor == NULL); assert(executor == NULL);
return err; return err;
@ -363,7 +361,8 @@ BRANCH_TO_GUARD[4][2] = {
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, 0); \ ADD_TO_TRACE(_EXIT_TRACE, 0, 0, 0); \
goto done; \ goto done; \
} \ } \
trace_stack[trace_stack_depth].code = code; \ assert(func->func_code == (PyObject *)code); \
trace_stack[trace_stack_depth].func = func; \
trace_stack[trace_stack_depth].instr = instr; \ trace_stack[trace_stack_depth].instr = instr; \
trace_stack_depth++; trace_stack_depth++;
#define TRACE_STACK_POP() \ #define TRACE_STACK_POP() \
@ -371,7 +370,8 @@ BRANCH_TO_GUARD[4][2] = {
Py_FatalError("Trace stack underflow\n"); \ Py_FatalError("Trace stack underflow\n"); \
} \ } \
trace_stack_depth--; \ trace_stack_depth--; \
code = trace_stack[trace_stack_depth].code; \ func = trace_stack[trace_stack_depth].func; \
code = (PyCodeObject *)trace_stack[trace_stack_depth].func->func_code; \
instr = trace_stack[trace_stack_depth].instr; instr = trace_stack[trace_stack_depth].instr;
/* Returns 1 on success, /* Returns 1 on success,
@ -380,20 +380,23 @@ BRANCH_TO_GUARD[4][2] = {
*/ */
static int static int
translate_bytecode_to_trace( translate_bytecode_to_trace(
PyCodeObject *code, _PyInterpreterFrame *frame,
_Py_CODEUNIT *instr, _Py_CODEUNIT *instr,
_PyUOpInstruction *trace, _PyUOpInstruction *trace,
int buffer_size, int buffer_size,
_PyBloomFilter *dependencies) _PyBloomFilter *dependencies)
{ {
bool progress_needed = true; bool progress_needed = true;
PyCodeObject *code = (PyCodeObject *)frame->f_executable;
PyFunctionObject *func = (PyFunctionObject *)frame->f_funcobj;
assert(PyFunction_Check(func));
PyCodeObject *initial_code = code; PyCodeObject *initial_code = code;
_Py_BloomFilter_Add(dependencies, initial_code); _Py_BloomFilter_Add(dependencies, initial_code);
_Py_CODEUNIT *initial_instr = instr; _Py_CODEUNIT *initial_instr = instr;
int trace_length = 0; int trace_length = 0;
int max_length = buffer_size; int max_length = buffer_size;
struct { struct {
PyCodeObject *code; PyFunctionObject *func;
_Py_CODEUNIT *instr; _Py_CODEUNIT *instr;
} trace_stack[TRACE_STACK_SIZE]; } trace_stack[TRACE_STACK_SIZE];
int trace_stack_depth = 0; int trace_stack_depth = 0;
@ -593,9 +596,9 @@ top: // Jump here after _PUSH_FRAME or likely branches
ADD_TO_TRACE(uop, oparg, operand, target); ADD_TO_TRACE(uop, oparg, operand, target);
if (uop == _POP_FRAME) { if (uop == _POP_FRAME) {
TRACE_STACK_POP(); TRACE_STACK_POP();
/* Set the operand to the code object returned to, /* Set the operand to the function object returned to,
* to assist optimization passes */ * to assist optimization passes */
trace[trace_length-1].operand = (uintptr_t)code; trace[trace_length-1].operand = (uintptr_t)func;
DPRINTF(2, DPRINTF(2,
"Returning to %s (%s:%d) at byte offset %d\n", "Returning to %s (%s:%d) at byte offset %d\n",
PyUnicode_AsUTF8(code->co_qualname), PyUnicode_AsUTF8(code->co_qualname),
@ -611,10 +614,10 @@ top: // Jump here after _PUSH_FRAME or likely branches
// Add one to account for the actual opcode/oparg pair: // Add one to account for the actual opcode/oparg pair:
+ 1; + 1;
uint32_t func_version = read_u32(&instr[func_version_offset].cache); uint32_t func_version = read_u32(&instr[func_version_offset].cache);
PyFunctionObject *func = _PyFunction_LookupByVersion(func_version); PyFunctionObject *new_func = _PyFunction_LookupByVersion(func_version);
DPRINTF(3, "Function object: %p\n", func); DPRINTF(3, "Function object: %p\n", func);
if (func != NULL) { if (new_func != NULL) {
PyCodeObject *new_code = (PyCodeObject *)PyFunction_GET_CODE(func); PyCodeObject *new_code = (PyCodeObject *)PyFunction_GET_CODE(new_func);
if (new_code == code) { if (new_code == code) {
// Recursive call, bail (we could be here forever). // Recursive call, bail (we could be here forever).
DPRINTF(2, "Bailing on recursive call to %s (%s:%d)\n", DPRINTF(2, "Bailing on recursive call to %s (%s:%d)\n",
@ -639,8 +642,9 @@ top: // Jump here after _PUSH_FRAME or likely branches
_Py_BloomFilter_Add(dependencies, new_code); _Py_BloomFilter_Add(dependencies, new_code);
/* Set the operand to the callee's code object, /* Set the operand to the callee's code object,
* to assist optimization passes */ * to assist optimization passes */
trace[trace_length-1].operand = (uintptr_t)new_code; trace[trace_length-1].operand = (uintptr_t)new_func;
code = new_code; code = new_code;
func = new_func;
instr = _PyCode_CODE(code); instr = _PyCode_CODE(code);
DPRINTF(2, DPRINTF(2,
"Continuing in %s (%s:%d) at byte offset %d\n", "Continuing in %s (%s:%d) at byte offset %d\n",
@ -808,7 +812,7 @@ make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies)
static int static int
uop_optimize( uop_optimize(
_PyOptimizerObject *self, _PyOptimizerObject *self,
PyCodeObject *code, _PyInterpreterFrame *frame,
_Py_CODEUNIT *instr, _Py_CODEUNIT *instr,
_PyExecutorObject **exec_ptr, _PyExecutorObject **exec_ptr,
int curr_stackentries) int curr_stackentries)
@ -816,7 +820,7 @@ uop_optimize(
_PyBloomFilter dependencies; _PyBloomFilter dependencies;
_Py_BloomFilter_Init(&dependencies); _Py_BloomFilter_Init(&dependencies);
_PyUOpInstruction buffer[UOP_MAX_TRACE_LENGTH]; _PyUOpInstruction buffer[UOP_MAX_TRACE_LENGTH];
int err = translate_bytecode_to_trace(code, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies); int err = translate_bytecode_to_trace(frame, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies);
if (err <= 0) { if (err <= 0) {
// Error or nothing translated // Error or nothing translated
return err; return err;
@ -824,9 +828,10 @@ uop_optimize(
OPT_STAT_INC(traces_created); OPT_STAT_INC(traces_created);
char *uop_optimize = Py_GETENV("PYTHONUOPSOPTIMIZE"); char *uop_optimize = Py_GETENV("PYTHONUOPSOPTIMIZE");
if (uop_optimize == NULL || *uop_optimize > '0') { if (uop_optimize == NULL || *uop_optimize > '0') {
err = _Py_uop_analyze_and_optimize(code, buffer, UOP_MAX_TRACE_LENGTH, curr_stackentries); err = _Py_uop_analyze_and_optimize(frame, buffer,
if (err < 0) { UOP_MAX_TRACE_LENGTH, curr_stackentries, &dependencies);
return -1; if (err <= 0) {
return err;
} }
} }
_PyExecutorObject *executor = make_executor_from_uops(buffer, &dependencies); _PyExecutorObject *executor = make_executor_from_uops(buffer, &dependencies);
@ -887,12 +892,13 @@ PyTypeObject _PyCounterExecutor_Type = {
static int static int
counter_optimize( counter_optimize(
_PyOptimizerObject* self, _PyOptimizerObject* self,
PyCodeObject *code, _PyInterpreterFrame *frame,
_Py_CODEUNIT *instr, _Py_CODEUNIT *instr,
_PyExecutorObject **exec_ptr, _PyExecutorObject **exec_ptr,
int Py_UNUSED(curr_stackentries) int Py_UNUSED(curr_stackentries)
) )
{ {
PyCodeObject *code = (PyCodeObject *)frame->f_executable;
int oparg = instr->op.arg; int oparg = instr->op.arg;
while (instr->op.code == EXTENDED_ARG) { while (instr->op.code == EXTENDED_ARG) {
instr++; instr++;

View File

@ -1,10 +1,12 @@
#include "Python.h" #include "Python.h"
#include "opcode.h" #include "opcode.h"
#include "pycore_dict.h"
#include "pycore_interp.h" #include "pycore_interp.h"
#include "pycore_opcode_metadata.h" #include "pycore_opcode_metadata.h"
#include "pycore_opcode_utils.h" #include "pycore_opcode_utils.h"
#include "pycore_pystate.h" // _PyInterpreterState_GET() #include "pycore_pystate.h" // _PyInterpreterState_GET()
#include "pycore_uop_metadata.h" #include "pycore_uop_metadata.h"
#include "pycore_dict.h"
#include "pycore_long.h" #include "pycore_long.h"
#include "cpython/optimizer.h" #include "cpython/optimizer.h"
#include <stdbool.h> #include <stdbool.h>
@ -12,9 +14,210 @@
#include <stddef.h> #include <stddef.h>
#include "pycore_optimizer.h" #include "pycore_optimizer.h"
/* Read the watched-mutation counter of `dict`.
 *
 * The counter is the DICT_WATCHED_MUTATION_BITS-wide field of
 * ma_version_tag that sits directly above the DICT_MAX_WATCHERS
 * low bits. `dict` must be an exact dict (asserted). */
static int
get_mutations(PyObject* dict) {
    assert(PyDict_CheckExact(dict));
    const uint64_t counter_mask = (1 << DICT_WATCHED_MUTATION_BITS) - 1;
    uint64_t tag = ((PyDictObject *)dict)->ma_version_tag;
    return (tag >> DICT_MAX_WATCHERS) & counter_mask;
}
/* increment_mutations: add one to the watched-mutation counter of `dict`
 * by adding (1 << DICT_MAX_WATCHERS) to its ma_version_tag.
 * `dict` must be an exact dict (asserted). No overflow check is done
 * here; the caller visible in this file only calls it while the counter
 * is below _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS. */
static void static void
peephole_opt(PyCodeObject *co, _PyUOpInstruction *buffer, int buffer_size) increment_mutations(PyObject* dict) {
assert(PyDict_CheckExact(dict));
PyDictObject *d = (PyDictObject *)dict;
d->ma_version_tag += (1 << DICT_MAX_WATCHERS);
}
/* Dict-watcher callback for watched globals dictionaries.
 *
 * PyDict_EVENT_CLONED events are ignored. For any other event:
 *  - while the dict's watched-mutation counter is below
 *    _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS, executors depending on the
 *    dict are invalidated and the counter is incremented;
 *  - once the limit has been reached, the dict is unwatched instead.
 *
 * `key` and `new_value` are not used. Always returns 0. */
static int
globals_watcher_callback(PyDict_WatchEvent event, PyObject* dict,
PyObject* key, PyObject* new_value)
{ {
if (event == PyDict_EVENT_CLONED) {
return 0;
}
uint64_t watched_mutations = get_mutations(dict);
if (watched_mutations < _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS) {
_Py_Executors_InvalidateDependency(_PyInterpreterState_GET(), dict);
increment_mutations(dict);
}
else {
/* Watcher ID 1 is the value GLOBALS_WATCHER_ID is defined as
 * further down in this file. */
PyDict_Unwatch(1, dict);
}
return 0;
}
/* Turn a _LOAD_GLOBAL_MODULE / _LOAD_GLOBAL_BUILTINS uop into an inline
 * constant load.
 *
 * `obj` must be an exact dict with unicode keys; the uop's operand is
 * used as the index into that dict's unicode entries. If the entry has
 * no value the instruction is left untouched. Otherwise the opcode
 * becomes one of the _LOAD_CONST_INLINE* variants (BORROW when the value
 * is immortal, WITH_NULL when the low bit of oparg is set) and the
 * operand becomes the value's address. */
static void
global_to_const(_PyUOpInstruction *inst, PyObject *obj)
{
    assert(inst->opcode == _LOAD_GLOBAL_MODULE || inst->opcode == _LOAD_GLOBAL_BUILTINS);
    assert(PyDict_CheckExact(obj));
    PyDictObject *mapping = (PyDictObject *)obj;
    assert(mapping->ma_keys->dk_kind == DICT_KEYS_UNICODE);
    assert(inst->operand <= UINT16_MAX);
    PyObject *value = DK_UNICODE_ENTRIES(mapping->ma_keys)[inst->operand].me_value;
    if (value == NULL) {
        return;
    }
    /* Low bit of oparg selects the variant that also pushes NULL. */
    const int wants_null = inst->oparg & 1;
    if (_Py_IsImmortal(value)) {
        if (wants_null) {
            inst->opcode = _LOAD_CONST_INLINE_BORROW_WITH_NULL;
        }
        else {
            inst->opcode = _LOAD_CONST_INLINE_BORROW;
        }
    }
    else {
        if (wants_null) {
            inst->opcode = _LOAD_CONST_INLINE_WITH_NULL;
        }
        else {
            inst->opcode = _LOAD_CONST_INLINE;
        }
    }
    inst->operand = (uint64_t)value;
}
/* Return 1 if `obj` cannot be used to resolve `inst`, else 0.
 *
 * `obj` is unusable when it is not an exact dict, or when the version of
 * its keys object differs from the version stored in the uop's operand. */
static int
incorrect_keys(_PyUOpInstruction *inst, PyObject *obj)
{
    if (PyDict_CheckExact(obj)) {
        PyDictObject *mapping = (PyDictObject *)obj;
        return mapping->ma_keys->dk_version != inst->operand;
    }
    return 1;
}
/* The first two dict watcher IDs are reserved for CPython,
* so we don't need to check that they haven't been used */
#define BUILTINS_WATCHER_ID 0
#define GLOBALS_WATCHER_ID 1
/* Convert guarded global/builtin loads in a uop trace into constants.
 *
 * Walks `buffer` from the start until _JUMP_TO_TOP or _EXIT_TRACE:
 *  - _GUARD_BUILTINS_VERSION / _GUARD_GLOBALS_VERSION are replaced by a
 *    pointer-identity check (_CHECK_BUILTINS / _CHECK_GLOBALS) the first
 *    time they are seen in a frame and by NOP afterwards; the dict is
 *    put under watch, and the globals dict is added to `dependencies`.
 *  - _LOAD_GLOBAL_MODULE / _LOAD_GLOBAL_BUILTINS are rewritten through
 *    global_to_const() once the required dicts are checked and watched.
 *  - _PUSH_FRAME / _POP_FRAME switch to the globals/builtins of the
 *    function object stored in the uop's operand.
 *
 * Returns 1 if successfully optimized
 *         0 if the trace is not suitable for optimization (yet)
 *        -1 if there was an error (no path in this function currently
 *           returns -1).
 * A return of 1 may happen part-way through the trace (e.g. unknown
 * callee); rewrites made up to that point are kept. */
static int
remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer,
               int buffer_size, _PyBloomFilter *dependencies)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyObject *builtins = frame->f_builtins;
    if (builtins != interp->builtins) {
        /* Only the interpreter's own builtins dict is handled. */
        return 1;
    }
    PyObject *globals = frame->f_globals;
    assert(PyFunction_Check(((PyFunctionObject *)frame->f_funcobj)));
    assert(((PyFunctionObject *)frame->f_funcobj)->func_builtins == builtins);
    assert(((PyFunctionObject *)frame->f_funcobj)->func_globals == globals);
    /* In order to treat globals as constants, we need to
     * know that the globals dict is the one we expected, and
     * that it hasn't changed
     * In order to treat builtins as constants,  we need to
     * know that the builtins dict is the one we expected, and
     * that it hasn't changed and that the global dictionary's
     * keys have not changed */

    /* These values represent stacks of booleans (one bool per bit).
     * Pushing a frame shifts left, popping a frame shifts right. */
    uint32_t builtins_checked = 0;
    uint32_t builtins_watched = 0;
    uint32_t globals_checked = 0;
    uint32_t globals_watched = 0;
    /* Lazily install the globals watcher in its reserved slot. */
    if (interp->dict_state.watchers[GLOBALS_WATCHER_ID] == NULL) {
        interp->dict_state.watchers[GLOBALS_WATCHER_ID] = globals_watcher_callback;
    }
    for (int pc = 0; pc < buffer_size; pc++) {
        _PyUOpInstruction *inst = &buffer[pc];
        int opcode = inst->opcode;
        switch(opcode) {
            case _GUARD_BUILTINS_VERSION:
                if (incorrect_keys(inst, builtins)) {
                    return 0;
                }
                if (interp->rare_events.builtin_dict >= _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS) {
                    /* Builtins modified too often: keep the version guard. */
                    continue;
                }
                if ((builtins_watched & 1) == 0) {
                    PyDict_Watch(BUILTINS_WATCHER_ID, builtins);
                    builtins_watched |= 1;
                }
                if (builtins_checked & 1) {
                    /* Already checked in this frame: guard is redundant. */
                    buffer[pc].opcode = NOP;
                }
                else {
                    buffer[pc].opcode = _CHECK_BUILTINS;
                    buffer[pc].operand = (uintptr_t)builtins;
                    builtins_checked |= 1;
                }
                break;
            case _GUARD_GLOBALS_VERSION:
            {
                if (incorrect_keys(inst, globals)) {
                    return 0;
                }
                uint64_t watched_mutations = get_mutations(globals);
                if (watched_mutations >= _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS) {
                    /* Globals modified too often: keep the version guard. */
                    continue;
                }
                if ((globals_watched & 1) == 0) {
                    PyDict_Watch(GLOBALS_WATCHER_ID, globals);
                    _Py_BloomFilter_Add(dependencies, globals);
                    globals_watched |= 1;
                }
                if (globals_checked & 1) {
                    /* Already checked in this frame: guard is redundant. */
                    buffer[pc].opcode = NOP;
                }
                else {
                    buffer[pc].opcode = _CHECK_GLOBALS;
                    buffer[pc].operand = (uintptr_t)globals;
                    globals_checked |= 1;
                }
                break;
            }
            case _LOAD_GLOBAL_BUILTINS:
                /* A builtin load needs both dicts checked and watched. */
                if (globals_checked & builtins_checked & globals_watched & builtins_watched & 1) {
                    global_to_const(inst, builtins);
                }
                break;
            case _LOAD_GLOBAL_MODULE:
                if (globals_checked & globals_watched & 1) {
                    global_to_const(inst, globals);
                }
                break;
            case _PUSH_FRAME:
            {
                globals_checked <<= 1;
                globals_watched <<= 1;
                builtins_checked <<= 1;
                builtins_watched <<= 1;
                PyFunctionObject *func = (PyFunctionObject *)buffer[pc].operand;
                if (func == NULL) {
                    /* Callee unknown: stop here, keep what was done. */
                    return 1;
                }
                assert(PyFunction_Check(func));
                globals = func->func_globals;
                builtins = func->func_builtins;
                if (builtins != interp->builtins) {
                    return 1;
                }
                break;
            }
            case _POP_FRAME:
            {
                globals_checked >>= 1;
                globals_watched >>= 1;
                builtins_checked >>= 1;
                builtins_watched >>= 1;
                PyFunctionObject *func = (PyFunctionObject *)buffer[pc].operand;
                assert(PyFunction_Check(func));
                globals = func->func_globals;
                builtins = func->func_builtins;
                break;
            }
            case _JUMP_TO_TOP:
            case _EXIT_TRACE:
                return 1;
            default:
                /* All other uops are irrelevant to this pass. */
                break;
        }
    }
    /* Ran off the end of the buffer without seeing a terminating uop. */
    return 0;
}
static void
peephole_opt(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_size)
{
PyCodeObject *co = (PyCodeObject *)frame->f_executable;
for (int pc = 0; pc < buffer_size; pc++) { for (int pc = 0; pc < buffer_size; pc++) {
int opcode = buffer[pc].opcode; int opcode = buffer[pc].opcode;
switch(opcode) { switch(opcode) {
@ -36,8 +239,17 @@ peephole_opt(PyCodeObject *co, _PyUOpInstruction *buffer, int buffer_size)
} }
case _PUSH_FRAME: case _PUSH_FRAME:
case _POP_FRAME: case _POP_FRAME:
co = (PyCodeObject *)buffer[pc].operand; {
PyFunctionObject *func = (PyFunctionObject *)buffer[pc].operand;
if (func == NULL) {
co = NULL;
}
else {
assert(PyFunction_Check(func));
co = (PyCodeObject *)func->func_code;
}
break; break;
}
case _JUMP_TO_TOP: case _JUMP_TO_TOP:
case _EXIT_TRACE: case _EXIT_TRACE:
return; return;
@ -83,16 +295,20 @@ remove_unneeded_uops(_PyUOpInstruction *buffer, int buffer_size)
} }
} }
int int
_Py_uop_analyze_and_optimize( _Py_uop_analyze_and_optimize(
PyCodeObject *co, _PyInterpreterFrame *frame,
_PyUOpInstruction *buffer, _PyUOpInstruction *buffer,
int buffer_size, int buffer_size,
int curr_stacklen int curr_stacklen,
_PyBloomFilter *dependencies
) )
{ {
peephole_opt(co, buffer, buffer_size); int err = remove_globals(frame, buffer, buffer_size, dependencies);
if (err <= 0) {
return err;
}
peephole_opt(frame, buffer, buffer_size);
remove_unneeded_uops(buffer, buffer_size); remove_unneeded_uops(buffer, buffer_size);
return 0; return 1;
} }

View File

@ -32,6 +32,7 @@
#include "pycore_typevarobject.h" // _Py_clear_generic_types() #include "pycore_typevarobject.h" // _Py_clear_generic_types()
#include "pycore_unicodeobject.h" // _PyUnicode_InitTypes() #include "pycore_unicodeobject.h" // _PyUnicode_InitTypes()
#include "pycore_weakref.h" // _PyWeakref_GET_REF() #include "pycore_weakref.h" // _PyWeakref_GET_REF()
#include "cpython/optimizer.h" // _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS
#include "pycore_obmalloc.h" // _PyMem_init_obmalloc() #include "pycore_obmalloc.h" // _PyMem_init_obmalloc()
#include "opcode.h" #include "opcode.h"
@ -609,7 +610,11 @@ init_interp_create_gil(PyThreadState *tstate, int gil)
static int static int
builtins_dict_watcher(PyDict_WatchEvent event, PyObject *dict, PyObject *key, PyObject *new_value) builtins_dict_watcher(PyDict_WatchEvent event, PyObject *dict, PyObject *key, PyObject *new_value)
{ {
RARE_EVENT_INC(builtin_dict); PyInterpreterState *interp = _PyInterpreterState_GET();
if (event != PyDict_EVENT_CLONED && interp->rare_events.builtin_dict < _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS) {
_Py_Executors_InvalidateAll(interp);
}
RARE_EVENT_INTERP_INC(interp, builtin_dict);
return 0; return 0;
} }
@ -1287,11 +1292,9 @@ init_interp_main(PyThreadState *tstate)
} }
} }
if ((interp->rare_events.builtins_dict_watcher_id = PyDict_AddWatcher(&builtins_dict_watcher)) == -1) {
return _PyStatus_ERR("failed to add builtin dict watcher");
}
if (PyDict_Watch(interp->rare_events.builtins_dict_watcher_id, interp->builtins) != 0) { interp->dict_state.watchers[0] = &builtins_dict_watcher;
if (PyDict_Watch(0, interp->builtins) != 0) {
return _PyStatus_ERR("failed to set builtin dict watcher"); return _PyStatus_ERR("failed to set builtin dict watcher");
} }
@ -1622,8 +1625,13 @@ finalize_modules(PyThreadState *tstate)
{ {
PyInterpreterState *interp = tstate->interp; PyInterpreterState *interp = tstate->interp;
// Stop collecting stats on __builtin__ modifications during teardown // Invalidate all executors and turn off tier 2 optimizer
PyDict_Unwatch(interp->rare_events.builtins_dict_watcher_id, interp->builtins); _Py_Executors_InvalidateAll(interp);
Py_XDECREF(interp->optimizer);
interp->optimizer = &_PyOptimizer_Default;
// Stop watching __builtin__ modifications
PyDict_Unwatch(0, interp->builtins);
PyObject *modules = _PyImport_GetModules(interp); PyObject *modules = _PyImport_GetModules(interp);
if (modules == NULL) { if (modules == NULL) {