bpo-26219: per opcode cache for LOAD_GLOBAL (GH-12884)
This patch implements a per-opcode cache mechanism and uses it only for the LOAD_GLOBAL opcode. Based on Yury's opcache3.patch in bpo-26219.
This commit is contained in:
parent
29ec422810
commit
91234a1636
|
@ -860,6 +860,10 @@ Optimizations
|
|||
methods up to 20--50%. (Contributed by Serhiy Storchaka in :issue:`23867`,
|
||||
:issue:`35582` and :issue:`36127`.)
|
||||
|
||||
* The ``LOAD_GLOBAL`` instruction now uses the new "per opcode cache" mechanism.
|
||||
It is about 40% faster now. (Contributed by Yury Selivanov and Inada Naoki in
|
||||
:issue:`26219`.)
|
||||
|
||||
|
||||
Build and C API Changes
|
||||
=======================
|
||||
|
|
|
@ -17,6 +17,8 @@ typedef uint16_t _Py_CODEUNIT;
|
|||
# define _Py_OPARG(word) ((word) >> 8)
|
||||
#endif
|
||||
|
||||
typedef struct _PyOpcache _PyOpcache;
|
||||
|
||||
/* Bytecode object */
|
||||
typedef struct {
|
||||
PyObject_HEAD
|
||||
|
@ -49,6 +51,21 @@ typedef struct {
|
|||
Type is a void* to keep the format private in codeobject.c to force
|
||||
people to go through the proper APIs. */
|
||||
void *co_extra;
|
||||
|
||||
/* Per opcodes just-in-time cache
|
||||
*
|
||||
* To reduce cache size, we use indirect mapping from opcode index to
|
||||
* cache object:
|
||||
* cache = co_opcache[co_opcache_map[next_instr - first_instr] - 1]
|
||||
*/
|
||||
|
||||
// co_opcache_map is indexed by (next_instr - first_instr).
|
||||
// * 0 means there is no cache for this opcode.
|
||||
// * n > 0 means there is cache in co_opcache[n-1].
|
||||
unsigned char *co_opcache_map;
|
||||
_PyOpcache *co_opcache;
|
||||
int co_opcache_flag; // used to determine when create a cache.
|
||||
unsigned char co_opcache_size; // length of co_opcache.
|
||||
} PyCodeObject;
|
||||
|
||||
/* Masks for co_flags above */
|
||||
|
|
|
@ -31,6 +31,9 @@ PyAPI_FUNC(void) _PyEval_SignalAsyncExc(
|
|||
PyAPI_FUNC(void) _PyEval_ReInitThreads(
|
||||
_PyRuntimeState *runtime);
|
||||
|
||||
/* Private function */
|
||||
void _PyEval_Fini(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
#ifndef Py_INTERNAL_CODE_H
|
||||
#define Py_INTERNAL_CODE_H
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Cached result of one LOAD_GLOBAL instruction.  The entry is only trusted
   while both recorded dict version tags still match the frame's globals and
   builtins dicts. */
typedef struct {
|
||||
PyObject *ptr; /* Cached pointer (borrowed reference) */
|
||||
uint64_t globals_ver; /* ma_version_tag of the globals dict when ptr was cached */
|
||||
uint64_t builtins_ver; /* ma_version_tag of the builtins dict when ptr was cached */
|
||||
} _PyOpcache_LoadGlobal;
|
||||
|
||||
/* One per-opcode cache slot.  Slots are zero-initialized when the cache
   array is allocated. */
struct _PyOpcache {
|
||||
/* Opcode-specific payload; LOAD_GLOBAL is the only user so far. */
union {
|
||||
_PyOpcache_LoadGlobal lg;
|
||||
} u;
|
||||
/* State of this slot as interpreted by ceval.c:
     0  -> nothing cached yet,
   > 0  -> payload in `u` has been filled in,
   < 0  -> slot is ignored by OPCACHE_CHECK().
   NOTE(review): plain `char` may be unsigned on some platforms, in which
   case the `< 0` state can never be observed -- confirm before relying
   on negative values. */
char optimized;
|
||||
};
|
||||
|
||||
/* Private API */
|
||||
int _PyCode_InitOpcache(PyCodeObject *co);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif /* !Py_INTERNAL_CODE_H */
|
|
@ -80,14 +80,14 @@ class DictVersionTests(unittest.TestCase):
|
|||
|
||||
# setting a key to the same value with dict.__setitem__
|
||||
# must change the version
|
||||
self.check_version_changed(d, d.__setitem__, 'key', value)
|
||||
self.check_version_dont_change(d, d.__setitem__, 'key', value)
|
||||
|
||||
# setting a key to the same value with dict.update
|
||||
# must change the version
|
||||
self.check_version_changed(d, d.update, key=value)
|
||||
self.check_version_dont_change(d, d.update, key=value)
|
||||
|
||||
d2 = self.new_dict(key=value)
|
||||
self.check_version_changed(d, d.update, d2)
|
||||
self.check_version_dont_change(d, d.update, d2)
|
||||
|
||||
def test_setitem_equal(self):
|
||||
class AlwaysEqual:
|
||||
|
|
|
@ -1070,6 +1070,7 @@ PYTHON_HEADERS= \
|
|||
$(srcdir)/Include/internal/pycore_accu.h \
|
||||
$(srcdir)/Include/internal/pycore_atomic.h \
|
||||
$(srcdir)/Include/internal/pycore_ceval.h \
|
||||
$(srcdir)/Include/internal/pycore_code.h \
|
||||
$(srcdir)/Include/internal/pycore_condvar.h \
|
||||
$(srcdir)/Include/internal/pycore_context.h \
|
||||
$(srcdir)/Include/internal/pycore_fileutils.h \
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
Implemented a per opcode cache mechanism; the ``LOAD_GLOBAL`` instruction now uses
|
||||
it. ``LOAD_GLOBAL`` is now about 40% faster. Contributed by Yury Selivanov,
|
||||
and Inada Naoki.
|
|
@ -2,7 +2,9 @@
|
|||
|
||||
#include "Python.h"
|
||||
#include "code.h"
|
||||
#include "opcode.h"
|
||||
#include "structmember.h"
|
||||
#include "pycore_code.h"
|
||||
#include "pycore_pystate.h"
|
||||
#include "pycore_tupleobject.h"
|
||||
#include "clinic/codeobject.c.h"
|
||||
|
@ -233,9 +235,56 @@ PyCode_New(int argcount, int posonlyargcount, int kwonlyargcount,
|
|||
co->co_zombieframe = NULL;
|
||||
co->co_weakreflist = NULL;
|
||||
co->co_extra = NULL;
|
||||
|
||||
co->co_opcache_map = NULL;
|
||||
co->co_opcache = NULL;
|
||||
co->co_opcache_flag = 0;
|
||||
co->co_opcache_size = 0;
|
||||
return co;
|
||||
}
|
||||
|
||||
int
|
||||
_PyCode_InitOpcache(PyCodeObject *co)
|
||||
{
|
||||
Py_ssize_t co_size = PyBytes_Size(co->co_code) / sizeof(_Py_CODEUNIT);
|
||||
co->co_opcache_map = (unsigned char *)PyMem_Calloc(co_size, 1);
|
||||
if (co->co_opcache_map == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
_Py_CODEUNIT *opcodes = (_Py_CODEUNIT*)PyBytes_AS_STRING(co->co_code);
|
||||
Py_ssize_t opts = 0;
|
||||
|
||||
for (Py_ssize_t i = 0; i < co_size;) {
|
||||
unsigned char opcode = _Py_OPCODE(opcodes[i]);
|
||||
i++; // 'i' is now aligned to (next_instr - first_instr)
|
||||
|
||||
// TODO: LOAD_METHOD, LOAD_ATTR
|
||||
if (opcode == LOAD_GLOBAL) {
|
||||
co->co_opcache_map[i] = ++opts;
|
||||
if (opts > 254) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (opts) {
|
||||
co->co_opcache = (_PyOpcache *)PyMem_Calloc(opts, sizeof(_PyOpcache));
|
||||
if (co->co_opcache == NULL) {
|
||||
PyMem_FREE(co->co_opcache_map);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
PyMem_FREE(co->co_opcache_map);
|
||||
co->co_opcache_map = NULL;
|
||||
co->co_opcache = NULL;
|
||||
}
|
||||
|
||||
co->co_opcache_size = opts;
|
||||
return 0;
|
||||
}
|
||||
|
||||
PyCodeObject *
|
||||
PyCode_NewEmpty(const char *filename, const char *funcname, int firstlineno)
|
||||
{
|
||||
|
@ -458,6 +507,15 @@ code_new(PyTypeObject *type, PyObject *args, PyObject *kw)
|
|||
static void
|
||||
code_dealloc(PyCodeObject *co)
|
||||
{
|
||||
if (co->co_opcache != NULL) {
|
||||
PyMem_FREE(co->co_opcache);
|
||||
}
|
||||
if (co->co_opcache_map != NULL) {
|
||||
PyMem_FREE(co->co_opcache_map);
|
||||
}
|
||||
co->co_opcache_flag = 0;
|
||||
co->co_opcache_size = 0;
|
||||
|
||||
if (co->co_extra != NULL) {
|
||||
PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
|
||||
_PyCodeObjectExtra *co_extra = co->co_extra;
|
||||
|
@ -504,6 +562,13 @@ code_sizeof(PyCodeObject *co, PyObject *Py_UNUSED(args))
|
|||
res += sizeof(_PyCodeObjectExtra) +
|
||||
(co_extra->ce_size-1) * sizeof(co_extra->ce_extras[0]);
|
||||
}
|
||||
if (co->co_opcache != NULL) {
|
||||
assert(co->co_opcache_map != NULL);
|
||||
// co_opcache_map
|
||||
res += PyBytes_GET_SIZE(co->co_code) / sizeof(_Py_CODEUNIT);
|
||||
// co_opcache
|
||||
res += co->co_opcache_size * sizeof(_PyOpcache);
|
||||
}
|
||||
return PyLong_FromSsize_t(res);
|
||||
}
|
||||
|
||||
|
|
|
@ -1080,6 +1080,7 @@ insertdict(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject *value)
|
|||
return 0;
|
||||
}
|
||||
|
||||
if (old_value != value) {
|
||||
if (_PyDict_HasSplitTable(mp)) {
|
||||
mp->ma_values[ix] = value;
|
||||
if (old_value == NULL) {
|
||||
|
@ -1092,8 +1093,8 @@ insertdict(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject *value)
|
|||
assert(old_value != NULL);
|
||||
DK_ENTRIES(mp->ma_keys)[ix].me_value = value;
|
||||
}
|
||||
|
||||
mp->ma_version_tag = DICT_NEXT_VERSION();
|
||||
}
|
||||
Py_XDECREF(old_value); /* which **CAN** re-enter (see issue #22653) */
|
||||
ASSERT_CONSISTENT(mp);
|
||||
Py_DECREF(key);
|
||||
|
|
|
@ -158,6 +158,7 @@
|
|||
<ClInclude Include="..\Include\import.h" />
|
||||
<ClInclude Include="..\Include\internal\pycore_accu.h" />
|
||||
<ClInclude Include="..\Include\internal\pycore_atomic.h" />
|
||||
<ClInclude Include="..\Include\internal\pycore_code.h" />
|
||||
<ClInclude Include="..\Include\internal\pycore_ceval.h" />
|
||||
<ClInclude Include="..\Include\internal\pycore_condvar.h" />
|
||||
<ClInclude Include="..\Include\internal\pycore_context.h" />
|
||||
|
|
|
@ -177,6 +177,9 @@
|
|||
<ClInclude Include="..\Include\internal\pycore_atomic.h">
|
||||
<Filter>Include</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\Include\internal\pycore_code.h">
|
||||
<Filter>Include</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\Include\internal\pycore_ceval.h">
|
||||
<Filter>Include</Filter>
|
||||
</ClInclude>
|
||||
|
|
143
Python/ceval.c
143
Python/ceval.c
|
@ -11,6 +11,7 @@
|
|||
|
||||
#include "Python.h"
|
||||
#include "pycore_ceval.h"
|
||||
#include "pycore_code.h"
|
||||
#include "pycore_object.h"
|
||||
#include "pycore_pyerrors.h"
|
||||
#include "pycore_pylifecycle.h"
|
||||
|
@ -101,6 +102,20 @@ static long dxp[256];
|
|||
#endif
|
||||
#endif
|
||||
|
||||
/* per opcode cache */
|
||||
#define OPCACHE_MIN_RUNS 1024 /* build a code object's opcache once it has been executed this many times */
|
||||
#define OPCACHE_STATS 0 /* set to non-zero to collect cache stats and print them in _PyEval_Fini() */
|
||||
|
||||
#if OPCACHE_STATS
|
||||
static size_t opcache_code_objects = 0;
|
||||
static size_t opcache_code_objects_extra_mem = 0;
|
||||
|
||||
static size_t opcache_global_opts = 0;
|
||||
static size_t opcache_global_hits = 0;
|
||||
static size_t opcache_global_misses = 0;
|
||||
#endif
|
||||
|
||||
|
||||
/* This can set eval_breaker to 0 even though gil_drop_request became
|
||||
1. We believe this is all right because the eval loop will release
|
||||
the GIL eventually anyway. */
|
||||
|
@ -225,6 +240,35 @@ exit_thread_if_finalizing(PyThreadState *tstate)
|
|||
}
|
||||
}
|
||||
|
||||
/* Print opcode-cache statistics to stderr.
 *
 * Called from Py_FinalizeEx().  Does nothing unless this file is compiled
 * with OPCACHE_STATS set to a non-zero value.
 */
void
_PyEval_Fini(void)
{
#if OPCACHE_STATS
    /* The counters are size_t, so they are printed with %zu.  The total may
       be zero (no cached LOAD_GLOBAL was ever executed); guard the
       percentages so we never convert 0.0/0 (NaN) to int. */
    size_t global_total = opcache_global_hits + opcache_global_misses;
    int hit_percent = 0;
    int miss_percent = 0;
    if (global_total > 0) {
        hit_percent = (int) (100.0 * opcache_global_hits / global_total);
        miss_percent = (int) (100.0 * opcache_global_misses / global_total);
    }

    fprintf(stderr, "-- Opcode cache number of objects = %zu\n",
            opcache_code_objects);

    fprintf(stderr, "-- Opcode cache total extra mem = %zu\n",
            opcache_code_objects_extra_mem);

    fprintf(stderr, "\n");

    fprintf(stderr, "-- Opcode cache LOAD_GLOBAL hits = %zu (%d%%)\n",
            opcache_global_hits, hit_percent);

    fprintf(stderr, "-- Opcode cache LOAD_GLOBAL misses = %zu (%d%%)\n",
            opcache_global_misses, miss_percent);

    fprintf(stderr, "-- Opcode cache LOAD_GLOBAL opts = %zu\n",
            opcache_global_opts);

    fprintf(stderr, "\n");
#endif
}
|
||||
|
||||
void
|
||||
PyEval_AcquireLock(void)
|
||||
{
|
||||
|
@ -799,6 +843,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
|
|||
const _Py_CODEUNIT *first_instr;
|
||||
PyObject *names;
|
||||
PyObject *consts;
|
||||
_PyOpcache *co_opcache;
|
||||
|
||||
#ifdef LLTRACE
|
||||
_Py_IDENTIFIER(__ltrace__);
|
||||
|
@ -1061,6 +1106,49 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
|
|||
Py_XDECREF(traceback); \
|
||||
} while(0)
|
||||
|
||||
/* macros for opcode cache */
|
||||
#define OPCACHE_CHECK() \
|
||||
do { \
|
||||
co_opcache = NULL; \
|
||||
if (co->co_opcache != NULL) { \
|
||||
unsigned char co_opt_offset = \
|
||||
co->co_opcache_map[next_instr - first_instr]; \
|
||||
if (co_opt_offset > 0) { \
|
||||
assert(co_opt_offset <= co->co_opcache_size); \
|
||||
co_opcache = &co->co_opcache[co_opt_offset - 1]; \
|
||||
assert(co_opcache != NULL); \
|
||||
if (co_opcache->optimized < 0) { \
|
||||
co_opcache = NULL; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#if OPCACHE_STATS
|
||||
|
||||
#define OPCACHE_STAT_GLOBAL_HIT() \
|
||||
do { \
|
||||
if (co->co_opcache != NULL) opcache_global_hits++; \
|
||||
} while (0)
|
||||
|
||||
#define OPCACHE_STAT_GLOBAL_MISS() \
|
||||
do { \
|
||||
if (co->co_opcache != NULL) opcache_global_misses++; \
|
||||
} while (0)
|
||||
|
||||
#define OPCACHE_STAT_GLOBAL_OPT() \
|
||||
do { \
|
||||
if (co->co_opcache != NULL) opcache_global_opts++; \
|
||||
} while (0)
|
||||
|
||||
#else /* OPCACHE_STATS */
|
||||
|
||||
#define OPCACHE_STAT_GLOBAL_HIT()
|
||||
#define OPCACHE_STAT_GLOBAL_MISS()
|
||||
#define OPCACHE_STAT_GLOBAL_OPT()
|
||||
|
||||
#endif
|
||||
|
||||
/* Start of code */
|
||||
|
||||
/* push frame */
|
||||
|
@ -1142,6 +1230,20 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
|
|||
f->f_stacktop = NULL; /* remains NULL unless yield suspends frame */
|
||||
f->f_executing = 1;
|
||||
|
||||
if (co->co_opcache_flag < OPCACHE_MIN_RUNS) {
|
||||
co->co_opcache_flag++;
|
||||
if (co->co_opcache_flag == OPCACHE_MIN_RUNS) {
|
||||
if (_PyCode_InitOpcache(co) < 0) {
|
||||
return NULL;
|
||||
}
|
||||
#if OPCACHE_STATS
|
||||
opcache_code_objects_extra_mem +=
|
||||
PyBytes_Size(co->co_code) / sizeof(_Py_CODEUNIT) +
|
||||
sizeof(_PyOpcache) * co->co_opcache_size;
|
||||
opcache_code_objects++;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef LLTRACE
|
||||
lltrace = _PyDict_GetItemId(f->f_globals, &PyId___ltrace__) != NULL;
|
||||
|
@ -2451,11 +2553,30 @@ main_loop:
|
|||
}
|
||||
|
||||
case TARGET(LOAD_GLOBAL): {
|
||||
PyObject *name = GETITEM(names, oparg);
|
||||
PyObject *name;
|
||||
PyObject *v;
|
||||
if (PyDict_CheckExact(f->f_globals)
|
||||
&& PyDict_CheckExact(f->f_builtins))
|
||||
{
|
||||
OPCACHE_CHECK();
|
||||
if (co_opcache != NULL && co_opcache->optimized > 0) {
|
||||
_PyOpcache_LoadGlobal *lg = &co_opcache->u.lg;
|
||||
|
||||
if (lg->globals_ver ==
|
||||
((PyDictObject *)f->f_globals)->ma_version_tag
|
||||
&& lg->builtins_ver ==
|
||||
((PyDictObject *)f->f_builtins)->ma_version_tag)
|
||||
{
|
||||
PyObject *ptr = lg->ptr;
|
||||
OPCACHE_STAT_GLOBAL_HIT();
|
||||
assert(ptr != NULL);
|
||||
Py_INCREF(ptr);
|
||||
PUSH(ptr);
|
||||
DISPATCH();
|
||||
}
|
||||
}
|
||||
|
||||
name = GETITEM(names, oparg);
|
||||
v = _PyDict_LoadGlobal((PyDictObject *)f->f_globals,
|
||||
(PyDictObject *)f->f_builtins,
|
||||
name);
|
||||
|
@ -2468,12 +2589,32 @@ main_loop:
|
|||
}
|
||||
goto error;
|
||||
}
|
||||
|
||||
if (co_opcache != NULL) {
|
||||
_PyOpcache_LoadGlobal *lg = &co_opcache->u.lg;
|
||||
|
||||
if (co_opcache->optimized == 0) {
|
||||
/* Wasn't optimized before. */
|
||||
OPCACHE_STAT_GLOBAL_OPT();
|
||||
} else {
|
||||
OPCACHE_STAT_GLOBAL_MISS();
|
||||
}
|
||||
|
||||
co_opcache->optimized = 1;
|
||||
lg->globals_ver =
|
||||
((PyDictObject *)f->f_globals)->ma_version_tag;
|
||||
lg->builtins_ver =
|
||||
((PyDictObject *)f->f_builtins)->ma_version_tag;
|
||||
lg->ptr = v; /* borrowed */
|
||||
}
|
||||
|
||||
Py_INCREF(v);
|
||||
}
|
||||
else {
|
||||
/* Slow-path if globals or builtins is not a dict */
|
||||
|
||||
/* namespace 1: globals */
|
||||
name = GETITEM(names, oparg);
|
||||
v = PyObject_GetItem(f->f_globals, name);
|
||||
if (v == NULL) {
|
||||
if (!_PyErr_ExceptionMatches(tstate, PyExc_KeyError)) {
|
||||
|
|
|
@ -1242,6 +1242,9 @@ Py_FinalizeEx(void)
|
|||
/* Destroy all modules */
|
||||
PyImport_Cleanup();
|
||||
|
||||
/* Print debug stats if any */
|
||||
_PyEval_Fini();
|
||||
|
||||
/* Flush sys.stdout and sys.stderr (again, in case more was printed) */
|
||||
if (flush_std_files() < 0) {
|
||||
status = -1;
|
||||
|
|
Loading…
Reference in New Issue