diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst
index 9474a2f4aaf..da97a728509 100644
--- a/Doc/whatsnew/3.8.rst
+++ b/Doc/whatsnew/3.8.rst
@@ -860,6 +860,10 @@ Optimizations
methods up to 20--50%. (Contributed by Serhiy Storchaka in :issue:`23867`,
:issue:`35582` and :issue:`36127`.)
+* ``LOAD_GLOBAL`` instruction now uses new "per opcode cache" mechanism.
+ It is about 40% faster now. (Contributed by Yury Selivanov and Inada Naoki in
+ :issue:`26219`.)
+
Build and C API Changes
=======================
diff --git a/Include/code.h b/Include/code.h
index 933de97d078..b79d977394e 100644
--- a/Include/code.h
+++ b/Include/code.h
@@ -17,6 +17,8 @@ typedef uint16_t _Py_CODEUNIT;
# define _Py_OPARG(word) ((word) >> 8)
#endif
+typedef struct _PyOpcache _PyOpcache;
+
/* Bytecode object */
typedef struct {
PyObject_HEAD
@@ -49,6 +51,21 @@ typedef struct {
Type is a void* to keep the format private in codeobject.c to force
people to go through the proper APIs. */
void *co_extra;
+
+ /* Per opcodes just-in-time cache
+ *
+ * To reduce cache size, we use indirect mapping from opcode index to
+ * cache object:
+ * cache = co_opcache[co_opcache_map[next_instr - first_instr] - 1]
+ */
+
+ // co_opcache_map is indexed by (next_instr - first_instr).
+ // * 0 means there is no cache for this opcode.
+ // * n > 0 means there is cache in co_opcache[n-1].
+ unsigned char *co_opcache_map;
+ _PyOpcache *co_opcache;
+ int co_opcache_flag; // used to determine when create a cache.
+ unsigned char co_opcache_size; // length of co_opcache.
} PyCodeObject;
/* Masks for co_flags above */
diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h
index d44afdf4fa4..2c6df9a9af9 100644
--- a/Include/internal/pycore_ceval.h
+++ b/Include/internal/pycore_ceval.h
@@ -31,6 +31,9 @@ PyAPI_FUNC(void) _PyEval_SignalAsyncExc(
PyAPI_FUNC(void) _PyEval_ReInitThreads(
_PyRuntimeState *runtime);
+/* Private function */
+void _PyEval_Fini(void);
+
#ifdef __cplusplus
}
#endif
diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h
new file mode 100644
index 00000000000..88956f109b4
--- /dev/null
+++ b/Include/internal/pycore_code.h
@@ -0,0 +1,27 @@
+#ifndef Py_INTERNAL_CODE_H
+#define Py_INTERNAL_CODE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ PyObject *ptr; /* Cached pointer (borrowed reference) */
+ uint64_t globals_ver; /* ma_version of global dict */
+ uint64_t builtins_ver; /* ma_version of builtin dict */
+} _PyOpcache_LoadGlobal;
+
+struct _PyOpcache {
+ union {
+ _PyOpcache_LoadGlobal lg;
+ } u;
+ char optimized;
+};
+
+/* Private API */
+int _PyCode_InitOpcache(PyCodeObject *co);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_CODE_H */
diff --git a/Lib/test/test_dict_version.py b/Lib/test/test_dict_version.py
index 5671f9fc7c2..cb79e7434c2 100644
--- a/Lib/test/test_dict_version.py
+++ b/Lib/test/test_dict_version.py
@@ -80,14 +80,14 @@ class DictVersionTests(unittest.TestCase):
# setting a key to the same value with dict.__setitem__
# must change the version
- self.check_version_changed(d, d.__setitem__, 'key', value)
+ self.check_version_dont_change(d, d.__setitem__, 'key', value)
# setting a key to the same value with dict.update
# must change the version
- self.check_version_changed(d, d.update, key=value)
+ self.check_version_dont_change(d, d.update, key=value)
d2 = self.new_dict(key=value)
- self.check_version_changed(d, d.update, d2)
+ self.check_version_dont_change(d, d.update, d2)
def test_setitem_equal(self):
class AlwaysEqual:
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 0bf5df4b68e..a0bc9c1f1cc 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -1070,6 +1070,7 @@ PYTHON_HEADERS= \
$(srcdir)/Include/internal/pycore_accu.h \
$(srcdir)/Include/internal/pycore_atomic.h \
$(srcdir)/Include/internal/pycore_ceval.h \
+ $(srcdir)/Include/internal/pycore_code.h \
$(srcdir)/Include/internal/pycore_condvar.h \
$(srcdir)/Include/internal/pycore_context.h \
$(srcdir)/Include/internal/pycore_fileutils.h \
diff --git a/Misc/NEWS.d/next/Core and Builtins/2019-05-29-22-03-09.bpo-26219.Ovf1Qs.rst b/Misc/NEWS.d/next/Core and Builtins/2019-05-29-22-03-09.bpo-26219.Ovf1Qs.rst
new file mode 100644
index 00000000000..a4ee7b580bc
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2019-05-29-22-03-09.bpo-26219.Ovf1Qs.rst
@@ -0,0 +1,3 @@
+Implemented per opcode cache mechanism and ``LOAD_GLOBAL`` instruction use
+it. ``LOAD_GLOBAL`` is now about 40% faster. Contributed by Yury Selivanov,
+and Inada Naoki.
diff --git a/Objects/codeobject.c b/Objects/codeobject.c
index 233307562a5..0d9e5d16e76 100644
--- a/Objects/codeobject.c
+++ b/Objects/codeobject.c
@@ -2,7 +2,9 @@
#include "Python.h"
#include "code.h"
+#include "opcode.h"
#include "structmember.h"
+#include "pycore_code.h"
#include "pycore_pystate.h"
#include "pycore_tupleobject.h"
#include "clinic/codeobject.c.h"
@@ -233,9 +235,56 @@ PyCode_New(int argcount, int posonlyargcount, int kwonlyargcount,
co->co_zombieframe = NULL;
co->co_weakreflist = NULL;
co->co_extra = NULL;
+
+ co->co_opcache_map = NULL;
+ co->co_opcache = NULL;
+ co->co_opcache_flag = 0;
+ co->co_opcache_size = 0;
return co;
}
+int
+_PyCode_InitOpcache(PyCodeObject *co)
+{
+ Py_ssize_t co_size = PyBytes_Size(co->co_code) / sizeof(_Py_CODEUNIT);
+ co->co_opcache_map = (unsigned char *)PyMem_Calloc(co_size, 1);
+ if (co->co_opcache_map == NULL) {
+ return -1;
+ }
+
+ _Py_CODEUNIT *opcodes = (_Py_CODEUNIT*)PyBytes_AS_STRING(co->co_code);
+ Py_ssize_t opts = 0;
+
+ for (Py_ssize_t i = 0; i < co_size;) {
+ unsigned char opcode = _Py_OPCODE(opcodes[i]);
+ i++; // 'i' is now aligned to (next_instr - first_instr)
+
+ // TODO: LOAD_METHOD, LOAD_ATTR
+ if (opcode == LOAD_GLOBAL) {
+ co->co_opcache_map[i] = ++opts;
+ if (opts > 254) {
+ break;
+ }
+ }
+ }
+
+ if (opts) {
+ co->co_opcache = (_PyOpcache *)PyMem_Calloc(opts, sizeof(_PyOpcache));
+ if (co->co_opcache == NULL) {
+ PyMem_FREE(co->co_opcache_map);
+ return -1;
+ }
+ }
+ else {
+ PyMem_FREE(co->co_opcache_map);
+ co->co_opcache_map = NULL;
+ co->co_opcache = NULL;
+ }
+
+ co->co_opcache_size = opts;
+ return 0;
+}
+
PyCodeObject *
PyCode_NewEmpty(const char *filename, const char *funcname, int firstlineno)
{
@@ -458,6 +507,15 @@ code_new(PyTypeObject *type, PyObject *args, PyObject *kw)
static void
code_dealloc(PyCodeObject *co)
{
+ if (co->co_opcache != NULL) {
+ PyMem_FREE(co->co_opcache);
+ }
+ if (co->co_opcache_map != NULL) {
+ PyMem_FREE(co->co_opcache_map);
+ }
+ co->co_opcache_flag = 0;
+ co->co_opcache_size = 0;
+
if (co->co_extra != NULL) {
PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
_PyCodeObjectExtra *co_extra = co->co_extra;
@@ -504,6 +562,13 @@ code_sizeof(PyCodeObject *co, PyObject *Py_UNUSED(args))
res += sizeof(_PyCodeObjectExtra) +
(co_extra->ce_size-1) * sizeof(co_extra->ce_extras[0]);
}
+ if (co->co_opcache != NULL) {
+ assert(co->co_opcache_map != NULL);
+ // co_opcache_map
+ res += PyBytes_GET_SIZE(co->co_code) / sizeof(_Py_CODEUNIT);
+ // co_opcache
+ res += co->co_opcache_size * sizeof(_PyOpcache);
+ }
return PyLong_FromSsize_t(res);
}
diff --git a/Objects/dictobject.c b/Objects/dictobject.c
index 2b04b0b6796..0cc14437500 100644
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@@ -1080,20 +1080,21 @@ insertdict(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject *value)
return 0;
}
- if (_PyDict_HasSplitTable(mp)) {
- mp->ma_values[ix] = value;
- if (old_value == NULL) {
- /* pending state */
- assert(ix == mp->ma_used);
- mp->ma_used++;
+ if (old_value != value) {
+ if (_PyDict_HasSplitTable(mp)) {
+ mp->ma_values[ix] = value;
+ if (old_value == NULL) {
+ /* pending state */
+ assert(ix == mp->ma_used);
+ mp->ma_used++;
+ }
}
+ else {
+ assert(old_value != NULL);
+ DK_ENTRIES(mp->ma_keys)[ix].me_value = value;
+ }
+ mp->ma_version_tag = DICT_NEXT_VERSION();
}
- else {
- assert(old_value != NULL);
- DK_ENTRIES(mp->ma_keys)[ix].me_value = value;
- }
-
- mp->ma_version_tag = DICT_NEXT_VERSION();
Py_XDECREF(old_value); /* which **CAN** re-enter (see issue #22653) */
ASSERT_CONSISTENT(mp);
Py_DECREF(key);
diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj
index 329f9feb2bd..89625da944e 100644
--- a/PCbuild/pythoncore.vcxproj
+++ b/PCbuild/pythoncore.vcxproj
@@ -158,6 +158,7 @@
+
diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters
index d80d05fb15a..63ab88b4a01 100644
--- a/PCbuild/pythoncore.vcxproj.filters
+++ b/PCbuild/pythoncore.vcxproj.filters
@@ -177,6 +177,9 @@
Include
+
+ Include
+
Include
diff --git a/Python/ceval.c b/Python/ceval.c
index a092a235564..411ba3d73c5 100644
--- a/Python/ceval.c
+++ b/Python/ceval.c
@@ -11,6 +11,7 @@
#include "Python.h"
#include "pycore_ceval.h"
+#include "pycore_code.h"
#include "pycore_object.h"
#include "pycore_pyerrors.h"
#include "pycore_pylifecycle.h"
@@ -101,6 +102,20 @@ static long dxp[256];
#endif
#endif
+/* per opcode cache */
+#define OPCACHE_MIN_RUNS 1024 /* create opcache when code executed this time */
+#define OPCACHE_STATS 0 /* Enable stats */
+
+#if OPCACHE_STATS
+static size_t opcache_code_objects = 0;
+static size_t opcache_code_objects_extra_mem = 0;
+
+static size_t opcache_global_opts = 0;
+static size_t opcache_global_hits = 0;
+static size_t opcache_global_misses = 0;
+#endif
+
+
/* This can set eval_breaker to 0 even though gil_drop_request became
1. We believe this is all right because the eval loop will release
the GIL eventually anyway. */
@@ -225,6 +240,35 @@ exit_thread_if_finalizing(PyThreadState *tstate)
}
}
+void
+_PyEval_Fini(void)
+{
+#if OPCACHE_STATS
+ fprintf(stderr, "-- Opcode cache number of objects = %zd\n",
+ opcache_code_objects);
+
+ fprintf(stderr, "-- Opcode cache total extra mem = %zd\n",
+ opcache_code_objects_extra_mem);
+
+ fprintf(stderr, "\n");
+
+ fprintf(stderr, "-- Opcode cache LOAD_GLOBAL hits = %zd (%d%%)\n",
+ opcache_global_hits,
+ (int) (100.0 * opcache_global_hits /
+ (opcache_global_hits + opcache_global_misses)));
+
+ fprintf(stderr, "-- Opcode cache LOAD_GLOBAL misses = %zd (%d%%)\n",
+ opcache_global_misses,
+ (int) (100.0 * opcache_global_misses /
+ (opcache_global_hits + opcache_global_misses)));
+
+ fprintf(stderr, "-- Opcode cache LOAD_GLOBAL opts = %zd\n",
+ opcache_global_opts);
+
+ fprintf(stderr, "\n");
+#endif
+}
+
void
PyEval_AcquireLock(void)
{
@@ -799,6 +843,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
const _Py_CODEUNIT *first_instr;
PyObject *names;
PyObject *consts;
+ _PyOpcache *co_opcache;
#ifdef LLTRACE
_Py_IDENTIFIER(__ltrace__);
@@ -1061,6 +1106,49 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
Py_XDECREF(traceback); \
} while(0)
+ /* macros for opcode cache */
+#define OPCACHE_CHECK() \
+ do { \
+ co_opcache = NULL; \
+ if (co->co_opcache != NULL) { \
+ unsigned char co_opt_offset = \
+ co->co_opcache_map[next_instr - first_instr]; \
+ if (co_opt_offset > 0) { \
+ assert(co_opt_offset <= co->co_opcache_size); \
+ co_opcache = &co->co_opcache[co_opt_offset - 1]; \
+ assert(co_opcache != NULL); \
+ if (co_opcache->optimized < 0) { \
+ co_opcache = NULL; \
+ } \
+ } \
+ } \
+ } while (0)
+
+#if OPCACHE_STATS
+
+#define OPCACHE_STAT_GLOBAL_HIT() \
+ do { \
+ if (co->co_opcache != NULL) opcache_global_hits++; \
+ } while (0)
+
+#define OPCACHE_STAT_GLOBAL_MISS() \
+ do { \
+ if (co->co_opcache != NULL) opcache_global_misses++; \
+ } while (0)
+
+#define OPCACHE_STAT_GLOBAL_OPT() \
+ do { \
+ if (co->co_opcache != NULL) opcache_global_opts++; \
+ } while (0)
+
+#else /* OPCACHE_STATS */
+
+#define OPCACHE_STAT_GLOBAL_HIT()
+#define OPCACHE_STAT_GLOBAL_MISS()
+#define OPCACHE_STAT_GLOBAL_OPT()
+
+#endif
+
/* Start of code */
/* push frame */
@@ -1142,6 +1230,20 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
f->f_stacktop = NULL; /* remains NULL unless yield suspends frame */
f->f_executing = 1;
+ if (co->co_opcache_flag < OPCACHE_MIN_RUNS) {
+ co->co_opcache_flag++;
+ if (co->co_opcache_flag == OPCACHE_MIN_RUNS) {
+ if (_PyCode_InitOpcache(co) < 0) {
+ return NULL;
+ }
+#if OPCACHE_STATS
+ opcache_code_objects_extra_mem +=
+ PyBytes_Size(co->co_code) / sizeof(_Py_CODEUNIT) +
+ sizeof(_PyOpcache) * co->co_opcache_size;
+ opcache_code_objects++;
+#endif
+ }
+ }
#ifdef LLTRACE
lltrace = _PyDict_GetItemId(f->f_globals, &PyId___ltrace__) != NULL;
@@ -2451,11 +2553,30 @@ main_loop:
}
case TARGET(LOAD_GLOBAL): {
- PyObject *name = GETITEM(names, oparg);
+ PyObject *name;
PyObject *v;
if (PyDict_CheckExact(f->f_globals)
&& PyDict_CheckExact(f->f_builtins))
{
+ OPCACHE_CHECK();
+ if (co_opcache != NULL && co_opcache->optimized > 0) {
+ _PyOpcache_LoadGlobal *lg = &co_opcache->u.lg;
+
+ if (lg->globals_ver ==
+ ((PyDictObject *)f->f_globals)->ma_version_tag
+ && lg->builtins_ver ==
+ ((PyDictObject *)f->f_builtins)->ma_version_tag)
+ {
+ PyObject *ptr = lg->ptr;
+ OPCACHE_STAT_GLOBAL_HIT();
+ assert(ptr != NULL);
+ Py_INCREF(ptr);
+ PUSH(ptr);
+ DISPATCH();
+ }
+ }
+
+ name = GETITEM(names, oparg);
v = _PyDict_LoadGlobal((PyDictObject *)f->f_globals,
(PyDictObject *)f->f_builtins,
name);
@@ -2468,12 +2589,32 @@ main_loop:
}
goto error;
}
+
+ if (co_opcache != NULL) {
+ _PyOpcache_LoadGlobal *lg = &co_opcache->u.lg;
+
+ if (co_opcache->optimized == 0) {
+ /* Wasn't optimized before. */
+ OPCACHE_STAT_GLOBAL_OPT();
+ } else {
+ OPCACHE_STAT_GLOBAL_MISS();
+ }
+
+ co_opcache->optimized = 1;
+ lg->globals_ver =
+ ((PyDictObject *)f->f_globals)->ma_version_tag;
+ lg->builtins_ver =
+ ((PyDictObject *)f->f_builtins)->ma_version_tag;
+ lg->ptr = v; /* borrowed */
+ }
+
Py_INCREF(v);
}
else {
/* Slow-path if globals or builtins is not a dict */
/* namespace 1: globals */
+ name = GETITEM(names, oparg);
v = PyObject_GetItem(f->f_globals, name);
if (v == NULL) {
if (!_PyErr_ExceptionMatches(tstate, PyExc_KeyError)) {
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index 3de5528811a..fc7e5510b2c 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -1242,6 +1242,9 @@ Py_FinalizeEx(void)
/* Destroy all modules */
PyImport_Cleanup();
+ /* Print debug stats if any */
+ _PyEval_Fini();
+
/* Flush sys.stdout and sys.stderr (again, in case more was printed) */
if (flush_std_files() < 0) {
status = -1;