gh-104584: Baby steps towards generating and executing traces (#105924)

Added a new, experimental tracing optimizer and interpreter (a.k.a. "tier 2"). It currently pessimizes performance, so don't use it yet; this is infrastructure so we can experiment with optimization passes. To enable it, pass ``-Xuops`` or set ``PYTHONUOPS=1``. To get debug output, set ``PYTHONUOPSDEBUG=N``, where ``N`` is a debug level (0-4; 0 means no debug output and 4 is excessively verbose).
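
For illustration only (not part of the commit text), one way to try the new switches from Python is to launch a child interpreter with them set; the snippet below is a hypothetical example, and PYTHONUOPSDEBUG only produces output on debug (LLTRACE) builds:

import os
import subprocess
import sys

# Hypothetical demo: run a small workload with the tier 2 interpreter enabled.
# Debug output (PYTHONUOPSDEBUG) is only emitted by Py_DEBUG / LLTRACE builds.
env = dict(os.environ, PYTHONUOPS="1", PYTHONUOPSDEBUG="2")
subprocess.run(
    [sys.executable, "-X", "uops", "-c", "print(sum(range(100_000)))"],
    env=env,
    check=True,
)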

All of this code is likely to change dramatically before the 3.13 feature freeze. But this is a first step.
Guido van Rossum 2023-06-26 19:02:57 -07:00 committed by GitHub
parent d3af83b934
commit 51fc725117
21 changed files with 2559 additions and 305 deletions

.gitattributes

@ -86,6 +86,7 @@ Parser/token.c generated
Programs/test_frozenmain.h generated
Python/Python-ast.c generated
Python/generated_cases.c.h generated
Python/executor_cases.c.h generated
Python/opcode_targets.h generated
Python/stdlib_module_names.h generated
Tools/peg_generator/pegen/grammar_parser.py generated


@ -45,6 +45,7 @@ extern _PyOptimizerObject _PyOptimizer_Default;
/* For testing */
PyAPI_FUNC(PyObject *)PyUnstable_Optimizer_NewCounter(void);
PyAPI_FUNC(PyObject *)PyUnstable_Optimizer_NewUOpOptimizer(void);
#define OPTIMIZER_BITS_IN_COUNTER 4


@ -0,0 +1,31 @@
#ifndef Py_INTERNAL_UOPS_H
#define Py_INTERNAL_UOPS_H
#ifdef __cplusplus
extern "C" {
#endif
#ifndef Py_BUILD_CORE
# error "this header requires Py_BUILD_CORE define"
#endif
#define _Py_UOP_MAX_TRACE_LENGTH 16
typedef struct {
int opcode;
uint64_t operand; // Sometimes oparg, sometimes a cache entry
} _PyUOpInstruction;
typedef struct {
_PyExecutorObject base;
_PyUOpInstruction trace[_Py_UOP_MAX_TRACE_LENGTH]; // TODO: variable length
} _PyUOpExecutorObject;
_PyInterpreterFrame *_PyUopExecute(
_PyExecutorObject *executor,
_PyInterpreterFrame *frame,
PyObject **stack_pointer);
#ifdef __cplusplus
}
#endif
#endif /* !Py_INTERNAL_UOPS_H */
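
As a reading aid (not part of the header), a rough Python mirror of the two structs above:

from dataclasses import dataclass, field

_PY_UOP_MAX_TRACE_LENGTH = 16          # mirrors _Py_UOP_MAX_TRACE_LENGTH

@dataclass
class UOpInstruction:                  # rough analogue of _PyUOpInstruction
    opcode: int
    operand: int = 0                   # sometimes an oparg, sometimes a cache entry

@dataclass
class UOpExecutor:                     # rough analogue of _PyUOpExecutorObject
    trace: list[UOpInstruction] = field(default_factory=list)  # at most 16 entries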


@ -71,6 +71,9 @@ typedef struct _object_stats {
uint64_t type_cache_dunder_misses;
uint64_t type_cache_collisions;
uint64_t optimization_attempts;
uint64_t optimization_traces_created;
uint64_t optimization_traces_executed;
uint64_t optimization_uops_executed;
} ObjectStats;
typedef struct _stats {


@ -1542,19 +1542,9 @@ regen-opcode-targets:
.PHONY: regen-cases
regen-cases:
# Regenerate Python/generated_cases.c.h
# and Python/opcode_metadata.h
# from Python/bytecodes.c
# using Tools/cases_generator/generate_cases.py
# Regenerate various files from Python/bytecodes.c
PYTHONPATH=$(srcdir)/Tools/cases_generator \
$(PYTHON_FOR_REGEN) \
$(srcdir)/Tools/cases_generator/generate_cases.py \
--emit-line-directives \
-o $(srcdir)/Python/generated_cases.c.h.new \
-m $(srcdir)/Python/opcode_metadata.h.new \
$(srcdir)/Python/bytecodes.c
$(UPDATE_FILE) $(srcdir)/Python/generated_cases.c.h $(srcdir)/Python/generated_cases.c.h.new
$(UPDATE_FILE) $(srcdir)/Python/opcode_metadata.h $(srcdir)/Python/opcode_metadata.h.new
$(PYTHON_FOR_REGEN) $(srcdir)/Tools/cases_generator/generate_cases.py -l
Python/compile.o: $(srcdir)/Python/opcode_metadata.h
@ -1565,6 +1555,13 @@ Python/ceval.o: \
$(srcdir)/Python/opcode_metadata.h \
$(srcdir)/Python/opcode_targets.h
Python/flowgraph.o: \
$(srcdir)/Python/opcode_metadata.h
Python/optimizer.o: \
$(srcdir)/Python/executor_cases.c.h \
$(srcdir)/Python/opcode_metadata.h
Python/frozen.o: $(FROZEN_FILES_OUT)
# Generate DTrace probe macros, then rename them (PYTHON_ -> PyDTrace_) to


@ -0,0 +1 @@
Added a new, experimental tracing optimizer and interpreter (a.k.a. "tier 2"). It currently pessimizes performance, so don't use it yet; this is infrastructure so we can experiment with optimization passes. To enable it, pass ``-Xuops`` or set ``PYTHONUOPS=1``. To get debug output, set ``PYTHONUOPSDEBUG=N``, where ``N`` is a debug level (0-4; 0 means no debug output and 4 is excessively verbose).


@ -830,6 +830,12 @@ get_counter_optimizer(PyObject *self, PyObject *arg)
return PyUnstable_Optimizer_NewCounter();
}
static PyObject *
get_uop_optimizer(PyObject *self, PyObject *arg)
{
return PyUnstable_Optimizer_NewUOpOptimizer();
}
static PyObject *
set_optimizer(PyObject *self, PyObject *opt)
{
@ -994,6 +1000,7 @@ static PyMethodDef module_functions[] = {
{"get_optimizer", get_optimizer, METH_NOARGS, NULL},
{"set_optimizer", set_optimizer, METH_O, NULL},
{"get_counter_optimizer", get_counter_optimizer, METH_NOARGS, NULL},
{"get_uop_optimizer", get_uop_optimizer, METH_NOARGS, NULL},
{"pending_threadfunc", _PyCFunction_CAST(pending_threadfunc),
METH_VARARGS | METH_KEYWORDS},
// {"pending_fd_identify", pending_fd_identify, METH_VARARGS, NULL},


@ -52,8 +52,6 @@
#define family(name, ...) static int family_##name
#define pseudo(name) static int pseudo_##name
typedef PyObject *(*convertion_func_ptr)(PyObject *);
// Dummy variables for stack effects.
static PyObject *value, *value1, *value2, *left, *right, *res, *sum, *prod, *sub;
static PyObject *container, *start, *stop, *v, *lhs, *rhs, *res2;
@ -2182,7 +2180,7 @@ dummy_func(
frame = executor->execute(executor, frame, stack_pointer);
if (frame == NULL) {
frame = cframe.current_frame;
goto error;
goto resume_with_error;
}
goto resume_frame;
}


@ -22,6 +22,7 @@
#include "pycore_sysmodule.h" // _PySys_Audit()
#include "pycore_tuple.h" // _PyTuple_ITEMS()
#include "pycore_typeobject.h" // _PySuper_Lookup()
#include "pycore_uops.h" // _PyUOpExecutorObject
#include "pycore_emscripten_signal.h" // _Py_CHECK_EMSCRIPTEN_SIGNALS
#include "pycore_dict.h"
@ -223,14 +224,6 @@ _PyEvalFramePushAndInit_Ex(PyThreadState *tstate, PyFunctionObject *func,
static void
_PyEvalFrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame *frame);
typedef PyObject *(*convertion_func_ptr)(PyObject *);
static const convertion_func_ptr CONVERSION_FUNCTIONS[4] = {
[FVC_STR] = PyObject_Str,
[FVC_REPR] = PyObject_Repr,
[FVC_ASCII] = PyObject_ASCII
};
#define UNBOUNDLOCAL_ERROR_MSG \
"cannot access local variable '%s' where it is not associated with a value"
#define UNBOUNDFREE_ERROR_MSG \
@ -2771,3 +2764,131 @@ void Py_LeaveRecursiveCall(void)
{
_Py_LeaveRecursiveCall();
}
///////////////////// Experimental UOp Interpreter /////////////////////
// UPDATE_MISS_STATS (called by DEOPT_IF) uses next_instr
// TODO: Make it do something useful
#undef UPDATE_MISS_STATS
#define UPDATE_MISS_STATS(INSTNAME) ((void)0)
_PyInterpreterFrame *
_PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject **stack_pointer)
{
#ifdef LLTRACE
char *uop_debug = Py_GETENV("PYTHONUOPSDEBUG");
int lltrace = 0;
if (uop_debug != NULL && *uop_debug >= '0') {
lltrace = *uop_debug - '0'; // TODO: Parse an int and all that
}
if (lltrace >= 2) {
PyCodeObject *code = _PyFrame_GetCode(frame);
_Py_CODEUNIT *instr = frame->prev_instr + 1;
fprintf(stderr,
"Entering _PyUopExecute for %s (%s:%d) at offset %ld\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
(long)(instr - (_Py_CODEUNIT *)code->co_code_adaptive));
}
#endif
PyThreadState *tstate = _PyThreadState_GET();
_PyUOpExecutorObject *self = (_PyUOpExecutorObject *)executor;
// Equivalent to CHECK_EVAL_BREAKER()
_Py_CHECK_EMSCRIPTEN_SIGNALS_PERIODICALLY();
if (_Py_atomic_load_relaxed_int32(&tstate->interp->ceval.eval_breaker)) {
if (_Py_HandlePending(tstate) != 0) {
goto error;
}
}
OBJECT_STAT_INC(optimization_traces_executed);
_Py_CODEUNIT *ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive - 1;
int pc = 0;
int opcode;
uint64_t operand;
int oparg;
for (;;) {
opcode = self->trace[pc].opcode;
operand = self->trace[pc].operand;
oparg = (int)operand;
#ifdef LLTRACE
if (lltrace >= 3) {
const char *opname = opcode < 256 ? _PyOpcode_OpName[opcode] : "";
int stack_level = (int)(stack_pointer - _PyFrame_Stackbase(frame));
fprintf(stderr, " uop %s %d, operand %" PRIu64 ", stack_level %d\n",
opname, opcode, operand, stack_level);
}
#endif
pc++;
OBJECT_STAT_INC(optimization_uops_executed);
switch (opcode) {
#undef ENABLE_SPECIALIZATION
#define ENABLE_SPECIALIZATION 0
#include "executor_cases.c.h"
case SET_IP:
{
frame->prev_instr = ip_offset + oparg;
break;
}
case EXIT_TRACE:
{
_PyFrame_SetStackPointer(frame, stack_pointer);
Py_DECREF(self);
return frame;
}
default:
{
fprintf(stderr, "Unknown uop %d, operand %" PRIu64 "\n", opcode, operand);
Py_FatalError("Unknown uop");
abort(); // Unreachable
for (;;) {}
// Really unreachable
}
}
}
pop_4_error:
STACK_SHRINK(1);
pop_3_error:
STACK_SHRINK(1);
pop_2_error:
STACK_SHRINK(1);
pop_1_error:
STACK_SHRINK(1);
error:
// On ERROR_IF we return NULL as the frame.
// The caller recovers the frame from cframe.current_frame.
#ifdef LLTRACE
if (lltrace >= 2) {
fprintf(stderr, "Error: [Opcode %d, operand %" PRIu64 "]\n", opcode, operand);
}
#endif
_PyFrame_SetStackPointer(frame, stack_pointer);
Py_DECREF(self);
return NULL;
PREDICTED(UNPACK_SEQUENCE)
PREDICTED(COMPARE_OP)
PREDICTED(LOAD_SUPER_ATTR)
PREDICTED(STORE_SUBSCR)
PREDICTED(BINARY_SUBSCR)
PREDICTED(BINARY_OP)
// On DEOPT_IF we just repeat the last instruction.
// This presumes nothing was popped from the stack (nor pushed).
#ifdef LLTRACE
if (lltrace >= 2) {
fprintf(stderr, "DEOPT: [Opcode %d, operand %" PRIu64 "]\n", opcode, operand);
}
#endif
_PyFrame_SetStackPointer(frame, stack_pointer);
Py_DECREF(self);
return frame;
}
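
A toy Python model of the dispatch loop above, useful only to show the control flow (the names and trace format are simplifications, not the C API):

def run_trace(trace, handlers):
    """Toy model of _PyUopExecute: run uops until EXIT_TRACE, tracking the resume IP."""
    last_ip = None
    pc = 0
    while True:
        opcode, operand = trace[pc]
        pc += 1
        if opcode == "SET_IP":
            last_ip = operand              # where tier 1 should resume
        elif opcode == "EXIT_TRACE":
            return last_ip                 # hand control back to the main interpreter
        else:
            handlers[opcode](operand)      # bodies live in executor_cases.c.h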


@ -1,4 +1,4 @@
// Macros needed by ceval.c and bytecodes.c
// Macros and other things needed by ceval.c and bytecodes.c
/* Computed GOTOs, or
the-optimization-commonly-but-improperly-known-as-"threaded code"
@ -339,3 +339,11 @@ do { \
goto error; \
} \
} while (0);
typedef PyObject *(*convertion_func_ptr)(PyObject *);
static const convertion_func_ptr CONVERSION_FUNCTIONS[4] = {
[FVC_STR] = PyObject_Str,
[FVC_REPR] = PyObject_Repr,
[FVC_ASCII] = PyObject_ASCII
};
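
The moved table maps the FVC_* format-conversion codes to the C-level equivalents of ``str()``, ``repr()`` and ``ascii()``; roughly, in Python terms (the numeric FVC values here are assumed, not taken from this diff):

# Rough Python analogue of CONVERSION_FUNCTIONS; FVC_* constants are assumed values.
FVC_STR, FVC_REPR, FVC_ASCII = 1, 2, 3
CONVERSION_FUNCTIONS = {
    FVC_STR: str,       # PyObject_Str
    FVC_REPR: repr,     # PyObject_Repr
    FVC_ASCII: ascii,   # PyObject_ASCII
}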

Python/executor_cases.c.h (generated, new file, 1606 lines)

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@ -18,6 +18,21 @@
((OP) == POP_BLOCK) || \
0
#define EXIT_TRACE 300
#define SET_IP 301
#define _GUARD_BOTH_INT 302
#define _BINARY_OP_MULTIPLY_INT 303
#define _BINARY_OP_ADD_INT 304
#define _BINARY_OP_SUBTRACT_INT 305
#define _GUARD_BOTH_FLOAT 306
#define _BINARY_OP_MULTIPLY_FLOAT 307
#define _BINARY_OP_ADD_FLOAT 308
#define _BINARY_OP_SUBTRACT_FLOAT 309
#define _GUARD_BOTH_UNICODE 310
#define _BINARY_OP_ADD_UNICODE 311
#define _LOAD_LOCALS 312
#define _LOAD_FROM_DICT_OR_GLOBALS 313
#ifndef NEED_OPCODE_METADATA
extern int _PyOpcode_num_popped(int opcode, int oparg, bool jump);
#else
@ -885,12 +900,19 @@ struct opcode_metadata {
int flags;
};
struct opcode_macro_expansion {
int nuops;
struct { int16_t uop; int8_t size; int8_t offset; } uops[8];
};
#define OPCODE_METADATA_FMT(OP) (_PyOpcode_opcode_metadata[(OP)].instr_format)
#define SAME_OPCODE_METADATA(OP1, OP2) \
(OPCODE_METADATA_FMT(OP1) == OPCODE_METADATA_FMT(OP2))
#ifndef NEED_OPCODE_METADATA
extern const struct opcode_metadata _PyOpcode_opcode_metadata[512];
extern const struct opcode_macro_expansion _PyOpcode_macro_expansion[256];
#else
const struct opcode_metadata _PyOpcode_opcode_metadata[512] = {
[NOP] = { true, INSTR_FMT_IX, 0 },
@ -1101,4 +1123,88 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[512] = {
[CACHE] = { true, INSTR_FMT_IX, 0 },
[RESERVED] = { true, INSTR_FMT_IX, 0 },
};
const struct opcode_macro_expansion _PyOpcode_macro_expansion[256] = {
[NOP] = { .nuops = 1, .uops = { { NOP, 0, 0 } } },
[LOAD_FAST] = { .nuops = 1, .uops = { { LOAD_FAST, 0, 0 } } },
[LOAD_FAST_AND_CLEAR] = { .nuops = 1, .uops = { { LOAD_FAST_AND_CLEAR, 0, 0 } } },
[LOAD_CONST] = { .nuops = 1, .uops = { { LOAD_CONST, 0, 0 } } },
[STORE_FAST] = { .nuops = 1, .uops = { { STORE_FAST, 0, 0 } } },
[POP_TOP] = { .nuops = 1, .uops = { { POP_TOP, 0, 0 } } },
[PUSH_NULL] = { .nuops = 1, .uops = { { PUSH_NULL, 0, 0 } } },
[END_SEND] = { .nuops = 1, .uops = { { END_SEND, 0, 0 } } },
[UNARY_NEGATIVE] = { .nuops = 1, .uops = { { UNARY_NEGATIVE, 0, 0 } } },
[UNARY_NOT] = { .nuops = 1, .uops = { { UNARY_NOT, 0, 0 } } },
[UNARY_INVERT] = { .nuops = 1, .uops = { { UNARY_INVERT, 0, 0 } } },
[BINARY_SLICE] = { .nuops = 1, .uops = { { BINARY_SLICE, 0, 0 } } },
[STORE_SLICE] = { .nuops = 1, .uops = { { STORE_SLICE, 0, 0 } } },
[BINARY_SUBSCR_LIST_INT] = { .nuops = 1, .uops = { { BINARY_SUBSCR_LIST_INT, 0, 0 } } },
[BINARY_SUBSCR_TUPLE_INT] = { .nuops = 1, .uops = { { BINARY_SUBSCR_TUPLE_INT, 0, 0 } } },
[BINARY_SUBSCR_DICT] = { .nuops = 1, .uops = { { BINARY_SUBSCR_DICT, 0, 0 } } },
[LIST_APPEND] = { .nuops = 1, .uops = { { LIST_APPEND, 0, 0 } } },
[SET_ADD] = { .nuops = 1, .uops = { { SET_ADD, 0, 0 } } },
[STORE_SUBSCR_LIST_INT] = { .nuops = 1, .uops = { { STORE_SUBSCR_LIST_INT, 0, 0 } } },
[STORE_SUBSCR_DICT] = { .nuops = 1, .uops = { { STORE_SUBSCR_DICT, 0, 0 } } },
[DELETE_SUBSCR] = { .nuops = 1, .uops = { { DELETE_SUBSCR, 0, 0 } } },
[CALL_INTRINSIC_1] = { .nuops = 1, .uops = { { CALL_INTRINSIC_1, 0, 0 } } },
[CALL_INTRINSIC_2] = { .nuops = 1, .uops = { { CALL_INTRINSIC_2, 0, 0 } } },
[GET_AITER] = { .nuops = 1, .uops = { { GET_AITER, 0, 0 } } },
[GET_ANEXT] = { .nuops = 1, .uops = { { GET_ANEXT, 0, 0 } } },
[GET_AWAITABLE] = { .nuops = 1, .uops = { { GET_AWAITABLE, 0, 0 } } },
[POP_EXCEPT] = { .nuops = 1, .uops = { { POP_EXCEPT, 0, 0 } } },
[LOAD_ASSERTION_ERROR] = { .nuops = 1, .uops = { { LOAD_ASSERTION_ERROR, 0, 0 } } },
[LOAD_BUILD_CLASS] = { .nuops = 1, .uops = { { LOAD_BUILD_CLASS, 0, 0 } } },
[STORE_NAME] = { .nuops = 1, .uops = { { STORE_NAME, 0, 0 } } },
[DELETE_NAME] = { .nuops = 1, .uops = { { DELETE_NAME, 0, 0 } } },
[UNPACK_SEQUENCE_TWO_TUPLE] = { .nuops = 1, .uops = { { UNPACK_SEQUENCE_TWO_TUPLE, 0, 0 } } },
[UNPACK_SEQUENCE_TUPLE] = { .nuops = 1, .uops = { { UNPACK_SEQUENCE_TUPLE, 0, 0 } } },
[UNPACK_SEQUENCE_LIST] = { .nuops = 1, .uops = { { UNPACK_SEQUENCE_LIST, 0, 0 } } },
[UNPACK_EX] = { .nuops = 1, .uops = { { UNPACK_EX, 0, 0 } } },
[DELETE_ATTR] = { .nuops = 1, .uops = { { DELETE_ATTR, 0, 0 } } },
[STORE_GLOBAL] = { .nuops = 1, .uops = { { STORE_GLOBAL, 0, 0 } } },
[DELETE_GLOBAL] = { .nuops = 1, .uops = { { DELETE_GLOBAL, 0, 0 } } },
[DELETE_DEREF] = { .nuops = 1, .uops = { { DELETE_DEREF, 0, 0 } } },
[LOAD_FROM_DICT_OR_DEREF] = { .nuops = 1, .uops = { { LOAD_FROM_DICT_OR_DEREF, 0, 0 } } },
[LOAD_DEREF] = { .nuops = 1, .uops = { { LOAD_DEREF, 0, 0 } } },
[STORE_DEREF] = { .nuops = 1, .uops = { { STORE_DEREF, 0, 0 } } },
[COPY_FREE_VARS] = { .nuops = 1, .uops = { { COPY_FREE_VARS, 0, 0 } } },
[BUILD_STRING] = { .nuops = 1, .uops = { { BUILD_STRING, 0, 0 } } },
[BUILD_TUPLE] = { .nuops = 1, .uops = { { BUILD_TUPLE, 0, 0 } } },
[BUILD_LIST] = { .nuops = 1, .uops = { { BUILD_LIST, 0, 0 } } },
[LIST_EXTEND] = { .nuops = 1, .uops = { { LIST_EXTEND, 0, 0 } } },
[SET_UPDATE] = { .nuops = 1, .uops = { { SET_UPDATE, 0, 0 } } },
[BUILD_SET] = { .nuops = 1, .uops = { { BUILD_SET, 0, 0 } } },
[BUILD_MAP] = { .nuops = 1, .uops = { { BUILD_MAP, 0, 0 } } },
[SETUP_ANNOTATIONS] = { .nuops = 1, .uops = { { SETUP_ANNOTATIONS, 0, 0 } } },
[BUILD_CONST_KEY_MAP] = { .nuops = 1, .uops = { { BUILD_CONST_KEY_MAP, 0, 0 } } },
[DICT_UPDATE] = { .nuops = 1, .uops = { { DICT_UPDATE, 0, 0 } } },
[DICT_MERGE] = { .nuops = 1, .uops = { { DICT_MERGE, 0, 0 } } },
[MAP_ADD] = { .nuops = 1, .uops = { { MAP_ADD, 0, 0 } } },
[LOAD_SUPER_ATTR_ATTR] = { .nuops = 1, .uops = { { LOAD_SUPER_ATTR_ATTR, 0, 0 } } },
[LOAD_SUPER_ATTR_METHOD] = { .nuops = 1, .uops = { { LOAD_SUPER_ATTR_METHOD, 0, 0 } } },
[COMPARE_OP_FLOAT] = { .nuops = 1, .uops = { { COMPARE_OP_FLOAT, 0, 0 } } },
[COMPARE_OP_INT] = { .nuops = 1, .uops = { { COMPARE_OP_INT, 0, 0 } } },
[COMPARE_OP_STR] = { .nuops = 1, .uops = { { COMPARE_OP_STR, 0, 0 } } },
[IS_OP] = { .nuops = 1, .uops = { { IS_OP, 0, 0 } } },
[CONTAINS_OP] = { .nuops = 1, .uops = { { CONTAINS_OP, 0, 0 } } },
[CHECK_EG_MATCH] = { .nuops = 1, .uops = { { CHECK_EG_MATCH, 0, 0 } } },
[CHECK_EXC_MATCH] = { .nuops = 1, .uops = { { CHECK_EXC_MATCH, 0, 0 } } },
[GET_LEN] = { .nuops = 1, .uops = { { GET_LEN, 0, 0 } } },
[MATCH_CLASS] = { .nuops = 1, .uops = { { MATCH_CLASS, 0, 0 } } },
[MATCH_MAPPING] = { .nuops = 1, .uops = { { MATCH_MAPPING, 0, 0 } } },
[MATCH_SEQUENCE] = { .nuops = 1, .uops = { { MATCH_SEQUENCE, 0, 0 } } },
[MATCH_KEYS] = { .nuops = 1, .uops = { { MATCH_KEYS, 0, 0 } } },
[GET_ITER] = { .nuops = 1, .uops = { { GET_ITER, 0, 0 } } },
[GET_YIELD_FROM_ITER] = { .nuops = 1, .uops = { { GET_YIELD_FROM_ITER, 0, 0 } } },
[WITH_EXCEPT_START] = { .nuops = 1, .uops = { { WITH_EXCEPT_START, 0, 0 } } },
[PUSH_EXC_INFO] = { .nuops = 1, .uops = { { PUSH_EXC_INFO, 0, 0 } } },
[EXIT_INIT_CHECK] = { .nuops = 1, .uops = { { EXIT_INIT_CHECK, 0, 0 } } },
[MAKE_FUNCTION] = { .nuops = 1, .uops = { { MAKE_FUNCTION, 0, 0 } } },
[SET_FUNCTION_ATTRIBUTE] = { .nuops = 1, .uops = { { SET_FUNCTION_ATTRIBUTE, 0, 0 } } },
[BUILD_SLICE] = { .nuops = 1, .uops = { { BUILD_SLICE, 0, 0 } } },
[CONVERT_VALUE] = { .nuops = 1, .uops = { { CONVERT_VALUE, 0, 0 } } },
[FORMAT_SIMPLE] = { .nuops = 1, .uops = { { FORMAT_SIMPLE, 0, 0 } } },
[FORMAT_WITH_SPEC] = { .nuops = 1, .uops = { { FORMAT_WITH_SPEC, 0, 0 } } },
[COPY] = { .nuops = 1, .uops = { { COPY, 0, 0 } } },
[SWAP] = { .nuops = 1, .uops = { { SWAP, 0, 0 } } },
};
#endif
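
Conceptually, the table maps each traceable instruction to a list of (uop, cache size, cache offset) triples; a small Python model (names hypothetical) of how a consumer reads it:

# Hypothetical Python view of _PyOpcode_macro_expansion: in this commit every
# supported instruction expands to exactly one uop with no cache operand.
MACRO_EXPANSION = {
    "LOAD_FAST": [("LOAD_FAST", 0, 0)],      # (uop, size, offset)
    "STORE_FAST": [("STORE_FAST", 0, 0)],
    "BINARY_SLICE": [("BINARY_SLICE", 0, 0)],
}

def expansion_for(opcode):
    """Return the uop sequence for a tier 1 instruction, or None if it is not traceable."""
    return MACRO_EXPANSION.get(opcode)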


@ -3,7 +3,9 @@
#include "opcode.h"
#include "pycore_interp.h"
#include "pycore_opcode.h"
#include "opcode_metadata.h"
#include "pycore_pystate.h"
#include "pycore_uops.h"
#include "cpython/optimizer.h"
#include <stdbool.h>
#include <stdint.h>
@ -278,3 +280,200 @@ PyUnstable_Optimizer_NewCounter(void)
opt->count = 0;
return (PyObject *)opt;
}
///////////////////// Experimental UOp Optimizer /////////////////////
#ifdef Py_DEBUG
/* For debugging the interpreter: */
# define LLTRACE 1 /* Low-level trace feature */
#endif
static void
uop_dealloc(_PyUOpExecutorObject *self) {
PyObject_Free(self);
}
static PyTypeObject UOpExecutor_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
.tp_name = "uop_executor",
.tp_basicsize = sizeof(_PyUOpExecutorObject),
.tp_itemsize = 0,
.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
.tp_dealloc = (destructor)uop_dealloc,
};
static int
translate_bytecode_to_trace(
PyCodeObject *code,
_Py_CODEUNIT *instr,
_PyUOpInstruction *trace,
int max_length)
{
#ifdef LLTRACE
char *uop_debug = Py_GETENV("PYTHONUOPSDEBUG");
int lltrace = 0;
if (uop_debug != NULL && *uop_debug >= '0') {
lltrace = *uop_debug - '0'; // TODO: Parse an int and all that
}
if (lltrace >= 4) {
fprintf(stderr,
"Optimizing %s (%s:%d) at offset %ld\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
(long)(instr - (_Py_CODEUNIT *)code->co_code_adaptive));
}
#define ADD_TO_TRACE(OPCODE, OPERAND) \
if (lltrace >= 2) { \
const char *opname = (OPCODE) < 256 ? _PyOpcode_OpName[(OPCODE)] : ""; \
fprintf(stderr, " ADD_TO_TRACE(%s %d, %" PRIu64 ")\n", opname, (OPCODE), (uint64_t)(OPERAND)); \
} \
trace[trace_length].opcode = (OPCODE); \
trace[trace_length].operand = (OPERAND); \
trace_length++;
#else
#define ADD_TO_TRACE(OPCODE, OPERAND) \
trace[trace_length].opcode = (OPCODE); \
trace[trace_length].operand = (OPERAND); \
trace_length++;
#endif
int trace_length = 0;
// Always reserve space for one uop, plus SET_IP, plus EXIT_TRACE
while (trace_length + 3 <= max_length) {
int opcode = instr->op.code;
uint64_t operand = instr->op.arg;
switch (opcode) {
case LOAD_FAST_LOAD_FAST:
{
// Reserve space for two uops (+ SETUP + EXIT_TRACE)
if (trace_length + 4 > max_length) {
goto done;
}
uint64_t oparg1 = operand >> 4;
uint64_t oparg2 = operand & 15;
ADD_TO_TRACE(LOAD_FAST, oparg1);
ADD_TO_TRACE(LOAD_FAST, oparg2);
break;
}
default:
{
const struct opcode_macro_expansion *expansion = &_PyOpcode_macro_expansion[opcode];
if (expansion->nuops > 0) {
// Reserve space for nuops (+ SETUP + EXIT_TRACE)
int nuops = expansion->nuops;
if (trace_length + nuops + 2 > max_length) {
goto done;
}
for (int i = 0; i < nuops; i++) {
int offset = expansion->uops[i].offset;
switch (expansion->uops[i].size) {
case 0:
break;
case 1:
operand = read_u16(&instr[offset].cache);
break;
case 2:
operand = read_u32(&instr[offset].cache);
break;
case 4:
operand = read_u64(&instr[offset].cache);
break;
default:
fprintf(stderr,
"opcode=%d, operand=%" PRIu64 "; nuops=%d, i=%d; size=%d, offset=%d\n",
opcode, operand, nuops, i,
expansion->uops[i].size,
expansion->uops[i].offset);
Py_FatalError("garbled expansion");
}
ADD_TO_TRACE(expansion->uops[i].uop, operand);
assert(expansion->uops[0].size == 0); // TODO
}
break;
}
// fprintf(stderr, "Unsupported opcode %d\n", opcode);
goto done; // Break out of while loop
}
}
instr++;
// Add cache size for opcode
instr += _PyOpcode_Caches[_PyOpcode_Deopt[opcode]];
ADD_TO_TRACE(SET_IP, (int)(instr - (_Py_CODEUNIT *)code->co_code_adaptive));
}
done:
if (trace_length > 0) {
ADD_TO_TRACE(EXIT_TRACE, 0);
#ifdef LLTRACE
if (lltrace >= 1) {
fprintf(stderr,
"Created a trace for %s (%s:%d) at offset %ld -- length %d\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
(long)(instr - (_Py_CODEUNIT *)code->co_code_adaptive),
trace_length);
}
#endif
}
else {
#ifdef LLTRACE
if (lltrace >= 4) {
fprintf(stderr,
"No trace for %s (%s:%d) at offset %ld\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
(long)(instr - (_Py_CODEUNIT *)code->co_code_adaptive));
}
#endif
}
return trace_length;
#undef ADD_TO_TRACE
}
static int
uop_optimize(
_PyOptimizerObject *self,
PyCodeObject *code,
_Py_CODEUNIT *instr,
_PyExecutorObject **exec_ptr)
{
_PyUOpInstruction trace[_Py_UOP_MAX_TRACE_LENGTH];
int trace_length = translate_bytecode_to_trace(code, instr, trace, _Py_UOP_MAX_TRACE_LENGTH);
if (trace_length <= 0) {
// Error or nothing translated
return trace_length;
}
OBJECT_STAT_INC(optimization_traces_created);
_PyUOpExecutorObject *executor = (_PyUOpExecutorObject *)_PyObject_New(&UOpExecutor_Type);
if (executor == NULL) {
return -1;
}
executor->base.execute = _PyUopExecute;
memcpy(executor->trace, trace, trace_length * sizeof(_PyUOpInstruction));
*exec_ptr = (_PyExecutorObject *)executor;
return 1;
}
static PyTypeObject UOpOptimizer_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
.tp_name = "uop_optimizer",
.tp_basicsize = sizeof(_PyOptimizerObject),
.tp_itemsize = 0,
.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
};
PyObject *
PyUnstable_Optimizer_NewUOpOptimizer(void)
{
_PyOptimizerObject *opt = (_PyOptimizerObject *)_PyObject_New(&UOpOptimizer_Type);
if (opt == NULL) {
return NULL;
}
opt->optimize = uop_optimize;
opt->resume_threshold = UINT16_MAX;
opt->backedge_threshold = 0;
return (PyObject *)opt;
}
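
A condensed Python sketch of the loop in translate_bytecode_to_trace (illustrative only; EXPANSION stands in for _PyOpcode_macro_expansion, and cache reads, super-instructions and debug output are omitted):

MAX_TRACE = 16                                 # _Py_UOP_MAX_TRACE_LENGTH

def translate(code_units, expansion):
    """Sketch: turn tier 1 instructions into a uop trace terminated by EXIT_TRACE."""
    trace = []
    ip = 0
    while ip < len(code_units) and len(trace) + 3 <= MAX_TRACE:   # room for a uop + SET_IP + EXIT_TRACE
        opcode, oparg = code_units[ip]
        uops = expansion.get(opcode)
        if not uops or len(trace) + len(uops) + 2 > MAX_TRACE:
            break                              # unsupported opcode, or out of space
        for uop, size, offset in uops:
            trace.append((uop, oparg))         # the real code reads caches when size > 0
        ip += 1                                # the real code also skips cache entries
        trace.append(("SET_IP", ip))           # resume point for the tier 1 interpreter
    if trace:
        trace.append(("EXIT_TRACE", 0))
    return trace

Running it over a straight-line stretch of supported instructions yields a trace of the shape [uop, SET_IP, uop, SET_IP, ..., EXIT_TRACE], matching what the lltrace output above reports.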


@ -1181,6 +1181,19 @@ init_interp_main(PyThreadState *tstate)
#endif
}
// Turn on experimental tier 2 (uops-based) optimizer
if (is_main_interp) {
char *envvar = Py_GETENV("PYTHONUOPS");
int enabled = envvar != NULL && *envvar > '0';
if (_Py_get_xoption(&config->xoptions, L"uops") != NULL) {
enabled = 1;
}
if (enabled) {
PyObject *opt = PyUnstable_Optimizer_NewUOpOptimizer();
PyUnstable_SetOptimizer((_PyOptimizerObject *)opt);
}
}
assert(!_PyErr_Occurred(tstate));
return _PyStatus_OK();
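
For completeness, a quick, hedged way to check the same two knobs from inside a running interpreter (this mirrors, but does not reuse, the C logic above):

import os
import sys

# True if -Xuops was passed; -X options are exposed via sys._xoptions.
uops_xoption = "uops" in sys._xoptions
# Mirrors `*envvar > '0'`: only the first character of PYTHONUOPS is inspected.
uops_env = os.environ.get("PYTHONUOPS", "")[:1] > "0"
print(uops_xoption or uops_env)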


@ -195,6 +195,10 @@ print_object_stats(FILE *out, ObjectStats *stats)
fprintf(out, "Object method cache collisions: %" PRIu64 "\n", stats->type_cache_collisions);
fprintf(out, "Object method cache dunder hits: %" PRIu64 "\n", stats->type_cache_dunder_hits);
fprintf(out, "Object method cache dunder misses: %" PRIu64 "\n", stats->type_cache_dunder_misses);
fprintf(out, "Optimization attempts: %" PRIu64 "\n", stats->optimization_attempts);
fprintf(out, "Optimization traces created: %" PRIu64 "\n", stats->optimization_traces_created);
fprintf(out, "Optimization traces executed: %" PRIu64 "\n", stats->optimization_traces_executed);
fprintf(out, "Optimization uops executed: %" PRIu64 "\n", stats->optimization_uops_executed);
}
static void


@ -83,6 +83,7 @@ Objects/unicodetype_db.h
Python/deepfreeze/*.c
Python/frozen_modules/*.h
Python/generated_cases.c.h
Python/executor_cases.c.h
# not actually source
Python/bytecodes.c


@ -366,6 +366,8 @@ Python/sysmodule.c - whatstrings -
Python/optimizer.c - DefaultOptimizer_Type -
Python/optimizer.c - CounterExecutor_Type -
Python/optimizer.c - CounterOptimizer_Type -
Python/optimizer.c - UOpExecutor_Type -
Python/optimizer.c - UOpOptimizer_Type -
Python/optimizer.c - _PyOptimizer_Default -
##-----------------------



@ -9,7 +9,7 @@ What's currently here:
- `plexer.py`: OO interface on top of lexer.py; main class: `PLexer`
- `parser.py`: Parser for instruction definition DSL; main class `Parser`
- `generate_cases.py`: driver script to read `Python/bytecodes.c` and
write `Python/generated_cases.c.h`
write `Python/generated_cases.c.h` (and several other files)
- `test_generator.py`: tests, require manual running using `pytest`
Note that there is some dummy C code at the top and bottom of


@ -29,6 +29,9 @@ DEFAULT_METADATA_OUTPUT = os.path.relpath(
DEFAULT_PYMETADATA_OUTPUT = os.path.relpath(
os.path.join(ROOT, "Lib/_opcode_metadata.py")
)
DEFAULT_EXECUTOR_OUTPUT = os.path.relpath(
os.path.join(ROOT, "Python/executor_cases.c.h")
)
BEGIN_MARKER = "// BEGIN BYTECODES //"
END_MARKER = "// END BYTECODES //"
RE_PREDICTED = (
@ -61,6 +64,13 @@ arg_parser.add_argument(
arg_parser.add_argument(
"input", nargs=argparse.REMAINDER, help="Instruction definition file(s)"
)
arg_parser.add_argument(
"-e",
"--executor-cases",
type=str,
help="Write executor cases to this file",
default=DEFAULT_EXECUTOR_OUTPUT,
)
def effect_size(effect: StackEffect) -> tuple[int, str]:
@ -176,14 +186,14 @@ class Formatter:
self.prefix = self.prefix[:-4]
@contextlib.contextmanager
def block(self, head: str):
def block(self, head: str, tail: str = ""):
if head:
self.emit(head + " {")
else:
self.emit("{")
with self.indent():
yield
self.emit("}")
self.emit("}" + tail)
def stack_adjust(
self,
@ -290,6 +300,29 @@ class InstructionFlags:
f"(_PyOpcode_opcode_metadata[(OP)].flags & ({name}))")
FORBIDDEN_NAMES_IN_UOPS = (
"resume_with_error", # Proxy for "goto", which isn't an IDENTIFIER
"unbound_local_error",
"kwnames",
"next_instr",
"oparg1", # Proxy for super-instructions like LOAD_FAST_LOAD_FAST
"JUMPBY",
"DISPATCH",
"INSTRUMENTED_JUMP",
"throwflag",
"exception_unwind",
"import_from",
"import_name",
"_PyObject_CallNoArgs", # Proxy for BEFORE_WITH
)
# Interpreter tiers
TIER_ONE = 1 # Specializing adaptive interpreter (PEP 659)
TIER_TWO = 2 # Experimental tracing interpreter
Tiers: typing.TypeAlias = typing.Literal[1, 2]
@dataclasses.dataclass
class Instruction:
"""An instruction with additional data and code."""
@ -353,7 +386,32 @@ class Instruction:
cache = "0"
self.instr_fmt = fmt
def write(self, out: Formatter) -> None:
def is_viable_uop(self) -> bool:
"""Whether this instruction is viable as a uop."""
if self.always_exits:
return False
if self.instr_flags.HAS_ARG_FLAG:
# If the instruction uses oparg, it cannot use any caches
for c in self.cache_effects:
if c.name != UNUSED:
return False
else:
# If it doesn't use oparg, it can have one cache entry
caches: list[parser.CacheEffect] = []
cache_offset = 0
for c in self.cache_effects:
if c.name != UNUSED:
caches.append(c)
cache_offset += c.size
if len(caches) > 1:
return False
for forbidden in FORBIDDEN_NAMES_IN_UOPS:
# TODO: Don't check in '#ifdef ENABLE_SPECIALIZATION' regions
if variable_used(self.inst, forbidden):
return False
return True
def write(self, out: Formatter, tier: Tiers = TIER_ONE) -> None:
"""Write one instruction, sans prologue and epilogue."""
# Write a static assertion that a family's cache size is correct
if family := self.family:
@ -400,7 +458,7 @@ class Instruction:
# out.emit(f"next_instr += OPSIZE({self.inst.name}) - 1;")
self.write_body(out, 0)
self.write_body(out, 0, tier=tier)
# Skip the rest if the block always exits
if self.always_exits:
@ -427,10 +485,16 @@ class Instruction:
out.assign(dst, oeffect)
# Write cache effect
if self.cache_offset:
if tier == TIER_ONE and self.cache_offset:
out.emit(f"next_instr += {self.cache_offset};")
def write_body(self, out: Formatter, dedent: int, cache_adjust: int = 0) -> None:
def write_body(
self,
out: Formatter,
dedent: int,
cache_adjust: int = 0,
tier: Tiers = TIER_ONE,
) -> None:
"""Write the instruction body."""
# Write cache effect variable declarations and initializations
cache_offset = cache_adjust
@ -447,9 +511,12 @@ class Instruction:
else:
typ = f"uint{bits}_t "
func = f"read_u{bits}"
out.emit(
f"{typ}{ceffect.name} = {func}(&next_instr[{cache_offset}].cache);"
)
if tier == TIER_ONE:
out.emit(
f"{typ}{ceffect.name} = {func}(&next_instr[{cache_offset}].cache);"
)
else:
out.emit(f"{typ}{ceffect.name} = operand;")
cache_offset += ceffect.size
assert cache_offset == self.cache_offset + cache_adjust
@ -573,16 +640,24 @@ class Analyzer:
output_filename: str
metadata_filename: str
pymetadata_filename: str
executor_filename: str
errors: int = 0
emit_line_directives: bool = False
def __init__(self, input_filenames: list[str], output_filename: str,
metadata_filename: str, pymetadata_filename: str):
def __init__(
self,
input_filenames: list[str],
output_filename: str,
metadata_filename: str,
pymetadata_filename: str,
executor_filename: str,
):
"""Read the input file."""
self.input_filenames = input_filenames
self.output_filename = output_filename
self.metadata_filename = metadata_filename
self.pymetadata_filename = pymetadata_filename
self.executor_filename = executor_filename
def error(self, msg: str, node: parser.Node) -> None:
lineno = 0
@ -1107,6 +1182,8 @@ class Analyzer:
self.write_pseudo_instrs()
self.write_uop_defines()
self.write_stack_effect_functions()
# Write type definitions
@ -1114,12 +1191,17 @@ class Analyzer:
InstructionFlags.emit_macros(self.out)
self.out.emit("struct opcode_metadata {")
with self.out.indent():
with self.out.block("struct opcode_metadata", ";"):
self.out.emit("bool valid_entry;")
self.out.emit("enum InstructionFormat instr_format;")
self.out.emit("int flags;")
self.out.emit("};")
self.out.emit("")
with self.out.block("struct opcode_macro_expansion", ";"):
self.out.emit("int nuops;")
self.out.emit("struct { int16_t uop; int8_t size; int8_t offset; } uops[8];")
self.out.emit("")
self.out.emit("")
self.out.emit("#define OPCODE_METADATA_FMT(OP) "
"(_PyOpcode_opcode_metadata[(OP)].instr_format)")
@ -1130,7 +1212,9 @@ class Analyzer:
# Write metadata array declaration
self.out.emit("#ifndef NEED_OPCODE_METADATA")
self.out.emit("extern const struct opcode_metadata _PyOpcode_opcode_metadata[512];")
self.out.emit("extern const struct opcode_macro_expansion _PyOpcode_macro_expansion[256];")
self.out.emit("#else")
self.out.emit("const struct opcode_metadata _PyOpcode_opcode_metadata[512] = {")
# Write metadata for each instruction
@ -1150,6 +1234,31 @@ class Analyzer:
# Write end of array
self.out.emit("};")
with self.out.block(
"const struct opcode_macro_expansion _PyOpcode_macro_expansion[256] =",
";",
):
# Write macro expansion for each non-pseudo instruction
for thing in self.everything:
match thing:
case OverriddenInstructionPlaceHolder():
pass
case parser.InstDef(name=name):
instr = self.instrs[name]
if instr.kind != "op" and instr.is_viable_uop():
self.out.emit(
f"[{name}] = "
f"{{ .nuops = 1, .uops = {{ {{ {name}, 0, 0 }} }} }},"
)
case parser.Macro():
# TODO: emit expansion if all parts are viable uops
pass
case parser.Pseudo():
pass
case _:
typing.assert_never(thing)
self.out.emit("#endif")
with open(self.pymetadata_filename, "w") as f:
@ -1184,7 +1293,6 @@ class Analyzer:
"opcode for family in _specializations.values() for opcode in family"
"]")
def write_pseudo_instrs(self) -> None:
"""Write the IS_PSEUDO_INSTR macro"""
self.out.emit("\n\n#define IS_PSEUDO_INSTR(OP) \\")
@ -1192,6 +1300,20 @@ class Analyzer:
self.out.emit(f" ((OP) == {op}) || \\")
self.out.emit(f" 0")
def write_uop_defines(self) -> None:
"""Write '#define XXX NNN' for each uop"""
self.out.emit("")
counter = 300
def add(name: str) -> None:
nonlocal counter
self.out.emit(f"#define {name} {counter}")
counter += 1
add("EXIT_TRACE")
add("SET_IP")
for instr in self.instrs.values():
if instr.kind == "op" and instr.is_viable_uop():
add(instr.name)
def emit_metadata_entry(
self, name: str, fmt: str, flags: InstructionFlags
) -> None:
@ -1221,10 +1343,7 @@ class Analyzer:
# Create formatter
self.out = Formatter(f, 8, self.emit_line_directives)
# Write provenance header
self.out.write_raw(f"{self.out.comment} This file is generated by {THIS}\n")
self.out.write_raw(self.from_source_files())
self.out.write_raw(f"{self.out.comment} Do not edit!\n")
self.write_provenance_header()
# Write and count instructions of all kinds
n_instrs = 0
@ -1252,6 +1371,33 @@ class Analyzer:
file=sys.stderr,
)
def write_executor_instructions(self) -> None:
"""Generate cases for the Tier 2 interpreter."""
with open(self.executor_filename, "w") as f:
self.out = Formatter(f, 8)
self.write_provenance_header()
for thing in self.everything:
match thing:
case OverriddenInstructionPlaceHolder():
self.write_overridden_instr_place_holder(thing)
case parser.InstDef():
instr = self.instrs[thing.name]
if instr.is_viable_uop():
self.out.emit("")
with self.out.block(f"case {thing.name}:"):
instr.write(self.out, tier=TIER_TWO)
self.out.emit("break;")
case parser.Macro():
pass # TODO
case parser.Pseudo():
pass
case _:
typing.assert_never(thing)
print(
f"Wrote some stuff to {self.executor_filename}",
file=sys.stderr,
)
def write_overridden_instr_place_holder(self,
place_holder: OverriddenInstructionPlaceHolder) -> None:
self.out.emit("")
@ -1405,7 +1551,7 @@ def main():
args.input.append(DEFAULT_INPUT)
# Raises OSError if input unreadable
a = Analyzer(args.input, args.output, args.metadata, args.pymetadata)
a = Analyzer(args.input, args.output, args.metadata, args.pymetadata, args.executor_cases)
if args.emit_line_directives:
a.emit_line_directives = True
@ -1415,6 +1561,7 @@ def main():
sys.exit(f"Found {a.errors} errors")
a.write_instructions() # Raises OSError if output can't be written
a.write_metadata()
a.write_executor_instructions()
if __name__ == "__main__":


@ -44,7 +44,15 @@ def run_cases_test(input: str, expected: str):
temp_input.flush()
temp_output = tempfile.NamedTemporaryFile("w+")
temp_metadata = tempfile.NamedTemporaryFile("w+")
a = generate_cases.Analyzer([temp_input.name], temp_output.name, temp_metadata.name)
temp_pymetadata = tempfile.NamedTemporaryFile("w+")
temp_executor = tempfile.NamedTemporaryFile("w+")
a = generate_cases.Analyzer(
[temp_input.name],
temp_output.name,
temp_metadata.name,
temp_pymetadata.name,
temp_executor.name,
)
a.parse()
a.analyze()
if a.errors: