From ea251806b8dffff11b30d2182af1e589caf88acf Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Sat, 26 Dec 2020 02:58:33 +0100 Subject: [PATCH] bpo-40521: Per-interpreter interned strings (GH-20085) Make the Unicode dictionary of interned strings compatible with subinterpreters. Remove the INTERN_NAME_STRINGS macro in typeobject.c: names are always now interned (even if EXPERIMENTAL_ISOLATED_SUBINTERPRETERS macro is defined). _PyUnicode_ClearInterned() now uses PyDict_Next() to no longer allocate memory, to ensure that the interned dictionary is cleared. --- Include/internal/pycore_interp.h | 11 +++ .../2020-05-14-02-55-39.bpo-40521.dIlXsZ.rst | 2 + Objects/typeobject.c | 22 ----- Objects/unicodeobject.c | 91 ++++++------------- Python/pylifecycle.c | 2 + 5 files changed, 44 insertions(+), 84 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2020-05-14-02-55-39.bpo-40521.dIlXsZ.rst diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index 339c2c4c61f..58b11267126 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -76,6 +76,17 @@ struct _Py_unicode_state { shared as well. */ PyObject *latin1[256]; struct _Py_unicode_fs_codec fs_codec; + + /* This dictionary holds all interned unicode strings. Note that references + to strings in this dictionary are *not* counted in the string's ob_refcnt. + When the interned string reaches a refcnt of 0 the string deallocation + function will delete the reference from this dictionary. + + Another way to look at this is that to say that the actual reference + count of a string is: s->ob_refcnt + (s->state ? 2 : 0) + */ + PyObject *interned; + // Unicode identifiers (_Py_Identifier): see _PyUnicode_FromId() struct _Py_unicode_ids ids; }; diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-14-02-55-39.bpo-40521.dIlXsZ.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-14-02-55-39.bpo-40521.dIlXsZ.rst new file mode 100644 index 00000000000..654757a5f90 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-14-02-55-39.bpo-40521.dIlXsZ.rst @@ -0,0 +1,2 @@ +Make the Unicode dictionary of interned strings compatible with +subinterpreters. Patch by Victor Stinner. diff --git a/Objects/typeobject.c b/Objects/typeobject.c index 661ccb71ab0..43c499a0452 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -48,11 +48,6 @@ typedef struct PySlot_Offset { } PySlot_Offset; -/* bpo-40521: Interned strings are shared by all subinterpreters */ -#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS -# define INTERN_NAME_STRINGS -#endif - /* alphabetical order */ _Py_IDENTIFIER(__abstractmethods__); _Py_IDENTIFIER(__class__); @@ -3527,7 +3522,6 @@ type_setattro(PyTypeObject *type, PyObject *name, PyObject *value) if (name == NULL) return -1; } -#ifdef INTERN_NAME_STRINGS if (!PyUnicode_CHECK_INTERNED(name)) { PyUnicode_InternInPlace(&name); if (!PyUnicode_CHECK_INTERNED(name)) { @@ -3537,7 +3531,6 @@ type_setattro(PyTypeObject *type, PyObject *name, PyObject *value) return -1; } } -#endif } else { /* Will fail in _PyObject_GenericSetAttrWithDict. */ @@ -7683,17 +7676,10 @@ _PyTypes_InitSlotDefs(void) for (slotdef *p = slotdefs; p->name; p++) { /* Slots must be ordered by their offset in the PyHeapTypeObject. */ assert(!p[1].name || p->offset <= p[1].offset); -#ifdef INTERN_NAME_STRINGS p->name_strobj = PyUnicode_InternFromString(p->name); if (!p->name_strobj || !PyUnicode_CHECK_INTERNED(p->name_strobj)) { return _PyStatus_NO_MEMORY(); } -#else - p->name_strobj = PyUnicode_FromString(p->name); - if (!p->name_strobj) { - return _PyStatus_NO_MEMORY(); - } -#endif } slotdefs_initialized = 1; return _PyStatus_OK(); @@ -7718,24 +7704,16 @@ update_slot(PyTypeObject *type, PyObject *name) int offset; assert(PyUnicode_CheckExact(name)); -#ifdef INTERN_NAME_STRINGS assert(PyUnicode_CHECK_INTERNED(name)); -#endif assert(slotdefs_initialized); pp = ptrs; for (p = slotdefs; p->name; p++) { assert(PyUnicode_CheckExact(p->name_strobj)); assert(PyUnicode_CheckExact(name)); -#ifdef INTERN_NAME_STRINGS if (p->name_strobj == name) { *pp++ = p; } -#else - if (p->name_strobj == name || _PyUnicode_EQ(p->name_strobj, name)) { - *pp++ = p; - } -#endif } *pp = NULL; for (pp = ptrs; *pp; pp++) { diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 3238d1e692a..a03ca9a10d1 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -206,22 +206,6 @@ extern "C" { # define OVERALLOCATE_FACTOR 4 #endif -/* bpo-40521: Interned strings are shared by all interpreters. */ -#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS -# define INTERNED_STRINGS -#endif - -/* This dictionary holds all interned unicode strings. Note that references - to strings in this dictionary are *not* counted in the string's ob_refcnt. - When the interned string reaches a refcnt of 0 the string deallocation - function will delete the reference from this dictionary. - - Another way to look at this is that to say that the actual reference - count of a string is: s->ob_refcnt + (s->state ? 2 : 0) -*/ -#ifdef INTERNED_STRINGS -static PyObject *interned = NULL; -#endif static struct _Py_unicode_state* get_unicode_state(void) @@ -1946,7 +1930,8 @@ unicode_dealloc(PyObject *unicode) break; case SSTATE_INTERNED_MORTAL: -#ifdef INTERNED_STRINGS + { + struct _Py_unicode_state *state = get_unicode_state(); /* Revive the dead object temporarily. PyDict_DelItem() removes two references (key and value) which were ignored by PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2 @@ -1954,14 +1939,14 @@ unicode_dealloc(PyObject *unicode) PyDict_DelItem(). */ assert(Py_REFCNT(unicode) == 0); Py_SET_REFCNT(unicode, 3); - if (PyDict_DelItem(interned, unicode) != 0) { + if (PyDict_DelItem(state->interned, unicode) != 0) { _PyErr_WriteUnraisableMsg("deletion of interned string failed", NULL); } assert(Py_REFCNT(unicode) == 1); Py_SET_REFCNT(unicode, 0); -#endif break; + } case SSTATE_INTERNED_IMMORTAL: _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died"); @@ -11536,12 +11521,11 @@ _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right) if (PyUnicode_CHECK_INTERNED(left)) return 0; -#ifdef INTERNED_STRINGS assert(_PyUnicode_HASH(right_uni) != -1); Py_hash_t hash = _PyUnicode_HASH(left); - if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) + if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) { return 0; -#endif + } return unicode_compare_eq(left, right_uni); } @@ -15765,23 +15749,21 @@ PyUnicode_InternInPlace(PyObject **p) return; } -#ifdef INTERNED_STRINGS if (PyUnicode_READY(s) == -1) { PyErr_Clear(); return; } - if (interned == NULL) { - interned = PyDict_New(); - if (interned == NULL) { + struct _Py_unicode_state *state = get_unicode_state(); + if (state->interned == NULL) { + state->interned = PyDict_New(); + if (state->interned == NULL) { PyErr_Clear(); /* Don't leave an exception */ return; } } - PyObject *t; - t = PyDict_SetDefault(interned, s, s); - + PyObject *t = PyDict_SetDefault(state->interned, s, s); if (t == NULL) { PyErr_Clear(); return; @@ -15798,13 +15780,9 @@ PyUnicode_InternInPlace(PyObject **p) this. */ Py_SET_REFCNT(s, Py_REFCNT(s) - 2); _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; -#else - // PyDict expects that interned strings have their hash - // (PyASCIIObject.hash) already computed. - (void)unicode_hash(s); -#endif } + void PyUnicode_InternImmortal(PyObject **p) { @@ -15838,35 +15816,25 @@ PyUnicode_InternFromString(const char *cp) void _PyUnicode_ClearInterned(PyThreadState *tstate) { - if (!_Py_IsMainInterpreter(tstate)) { - // interned dict is shared by all interpreters + struct _Py_unicode_state *state = &tstate->interp->unicode; + if (state->interned == NULL) { return; } - - if (interned == NULL) { - return; - } - assert(PyDict_CheckExact(interned)); - - PyObject *keys = PyDict_Keys(interned); - if (keys == NULL) { - PyErr_Clear(); - return; - } - assert(PyList_CheckExact(keys)); + assert(PyDict_CheckExact(state->interned)); /* Interned unicode strings are not forcibly deallocated; rather, we give them their stolen references back, and then clear and DECREF the interned dict. */ - Py_ssize_t n = PyList_GET_SIZE(keys); #ifdef INTERNED_STATS - fprintf(stderr, "releasing %zd interned strings\n", n); + fprintf(stderr, "releasing %zd interned strings\n", + PyDict_GET_SIZE(state->interned)); Py_ssize_t immortal_size = 0, mortal_size = 0; #endif - for (Py_ssize_t i = 0; i < n; i++) { - PyObject *s = PyList_GET_ITEM(keys, i); + Py_ssize_t pos = 0; + PyObject *s, *ignored_value; + while (PyDict_Next(state->interned, &pos, &s, &ignored_value)) { assert(PyUnicode_IS_READY(s)); switch (PyUnicode_CHECK_INTERNED(s)) { @@ -15896,10 +15864,9 @@ _PyUnicode_ClearInterned(PyThreadState *tstate) "total size of all interned strings: %zd/%zd mortal/immortal\n", mortal_size, immortal_size); #endif - Py_DECREF(keys); - PyDict_Clear(interned); - Py_CLEAR(interned); + PyDict_Clear(state->interned); + Py_CLEAR(state->interned); } @@ -16269,19 +16236,19 @@ _PyUnicode_EnableLegacyWindowsFSEncoding(void) void _PyUnicode_Fini(PyThreadState *tstate) { - // _PyUnicode_ClearInterned() must be called before - struct _Py_unicode_state *state = &tstate->interp->unicode; - Py_CLEAR(state->empty_string); + // _PyUnicode_ClearInterned() must be called before + assert(state->interned == NULL); + + _PyUnicode_FiniEncodings(&state->fs_codec); + + unicode_clear_identifiers(tstate); for (Py_ssize_t i = 0; i < 256; i++) { Py_CLEAR(state->latin1[i]); } - - unicode_clear_identifiers(tstate); - - _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec); + Py_CLEAR(state->empty_string); } diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index c3c1aa22e94..ccbacb48194 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1573,6 +1573,8 @@ finalize_interp_types(PyThreadState *tstate) _PyFrame_Fini(tstate); _PyAsyncGen_Fini(tstate); _PyContext_Fini(tstate); + // Call _PyUnicode_ClearInterned() before _PyDict_Fini() since it uses + // a dict internally. _PyUnicode_ClearInterned(tstate); _PyDict_Fini(tstate);