From bb09ba679223666e01f8da780f97888a29d07131 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Sat, 27 Jul 2024 10:27:06 +0200 Subject: [PATCH] gh-122291: Intern latin-1 one-byte strings at startup (GH-122303) --- InternalDocs/string_interning.md | 72 +++++++++++++++----------------- Objects/unicodeobject.c | 36 ++++------------ 2 files changed, 43 insertions(+), 65 deletions(-) diff --git a/InternalDocs/string_interning.md b/InternalDocs/string_interning.md index 930ea110d85..358e2c070cd 100644 --- a/InternalDocs/string_interning.md +++ b/InternalDocs/string_interning.md @@ -8,51 +8,50 @@ This is used to optimize dict and attribute lookups, among other things. -Python uses three different mechanisms to intern strings: +Python uses two different mechanisms to intern strings: singletons and +dynamic interning. -- Singleton strings marked in C source with `_Py_STR` and `_Py_ID` macros. - These are statically allocated, and collected using `make regen-global-objects` - (`Tools/build/generate_global_objects.py`), which generates code - for declaration, initialization and finalization. +## Singletons - The difference between the two kinds is not important. (A `_Py_ID` string is - a valid C name, with which we can refer to it; a `_Py_STR` may e.g. contain - non-identifier characters, so it needs a separate C-compatible name.) +The 256 possible one-character latin-1 strings, which can be retrieved with +`_Py_LATIN1_CHR(c)`, are stored in statically allocated arrays, +`_PyRuntime.static_objects.strings.ascii` and +`_PyRuntime.static_objects.strings.latin1`. - The empty string is in this category (as `_Py_STR(empty)`). +Longer singleton strings are marked in C source with `_Py_ID` (if the string +is a valid C identifier fragment) or `_Py_STR` (if it needs a separate +C-compatible name). +These are also stored in statically allocated arrays. 
+They are collected from CPython sources using `make regen-global-objects` +(`Tools/build/generate_global_objects.py`), which generates code +for declaration, initialization and finalization. - These singletons are interned in a runtime-global lookup table, - `_PyRuntime.cached_objects.interned_strings` (`INTERNED_STRINGS`), - at runtime initialization. +The empty string is one of the singletons: `_Py_STR(empty)`. -- The 256 possible one-character latin-1 strings are singletons, - which can be retrieved with `_Py_LATIN1_CHR(c)`, are stored in runtime-global - arrays, `_PyRuntime.static_objects.strings.ascii` and - `_PyRuntime.static_objects.strings.latin1`. - - These are NOT interned at startup in the normal build. - In the free-threaded build, they are; this avoids modifying the - global lookup table after threads are started. - - Interning a one-char latin-1 string will always intern the corresponding - singleton. - -- All other strings are allocated dynamically, and have their - `_PyUnicode_STATE(s).statically_allocated` flag set to zero. - When interned, such strings are added to an interpreter-wide dict, - `PyInterpreterState.cached_objects.interned_strings`. - - The key and value of each entry in this dict reference the same object. - -The three sets of singletons (`_Py_STR`, `_Py_ID`, `_Py_LATIN1_CHR`) +The three sets of singletons (`_Py_LATIN1_CHR`, `_Py_ID`, `_Py_STR`) are disjoint. If you have such a singleton, it (and no other copy) will be interned. +These singletons are interned in a runtime-global lookup table, +`_PyRuntime.cached_objects.interned_strings` (`INTERNED_STRINGS`), +at runtime initialization; the table is immutable until it's torn down +at runtime finalization. +It is shared across threads and interpreters without any synchronization. + + +## Dynamically allocated strings + +All other strings are allocated dynamically, and have their +`_PyUnicode_STATE(s).statically_allocated` flag set to zero. 
+When interned, such strings are added to an interpreter-wide dict, +`PyInterpreterState.cached_objects.interned_strings`. + +The key and value of each entry in this dict reference the same object. + ## Immortality and reference counting -Invariant: Every immortal string is interned, *except* the one-char latin-1 -singletons (which might but might not be interned). +Invariant: Every immortal string is interned. In practice, this means that you must not use `_Py_SetImmortal` on a string. (If you know it's already immortal, don't immortalize it; @@ -115,8 +114,5 @@ The valid transitions between these states are: Using `_PyUnicode_InternStatic` on these is an error; the other cases don't change the state. -- One-char latin-1 singletons can be interned (0 -> 3) using any interning - function; after that the functions don't change the state. - -- Other statically allocated strings are interned (0 -> 3) at runtime init; +- Singletons are interned (0 -> 3) at runtime init; after that all interning functions don't change the state. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 6196a8e766a..ffb879a6874 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -325,7 +325,8 @@ init_global_interned_strings(PyInterpreterState *interp) return _PyStatus_ERR("failed to create global interned dict"); } - /* Intern statically allocated string identifiers and deepfreeze strings. + /* Intern statically allocated string identifiers, deepfreeze strings, + * and one-byte latin-1 strings. * This must be done before any module initialization so that statically * allocated string identifiers are used instead of heap allocated strings. 
* Deepfreeze uses the interned identifiers if present to save space @@ -333,14 +334,11 @@ init_global_interned_strings(PyInterpreterState *interp) */ _PyUnicode_InitStaticStrings(interp); -#ifdef Py_GIL_DISABLED -// In the free-threaded build, intern the 1-byte strings as well for (int i = 0; i < 256; i++) { PyObject *s = LATIN1(i); _PyUnicode_InternStatic(interp, &s); assert(s == LATIN1(i)); } -#endif #ifdef Py_DEBUG assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1)); @@ -15355,26 +15353,14 @@ intern_static(PyInterpreterState *interp, PyObject *s /* stolen */) assert(s != NULL); assert(_PyUnicode_CHECK(s)); assert(_PyUnicode_STATE(s).statically_allocated); - - switch (PyUnicode_CHECK_INTERNED(s)) { - case SSTATE_NOT_INTERNED: - break; - case SSTATE_INTERNED_IMMORTAL_STATIC: - return s; - default: - Py_FatalError("_PyUnicode_InternStatic called on wrong string"); - } + assert(!PyUnicode_CHECK_INTERNED(s)); #ifdef Py_DEBUG /* We must not add process-global interned string if there's already a * per-interpreter interned_dict, which might contain duplicates. - * Except "short string" singletons: those are special-cased. */ + */ PyObject *interned = get_interned_dict(interp); - assert(interned == NULL || unicode_is_singleton(s)); -#ifdef Py_GIL_DISABLED - // In the free-threaded build, don't allow even the short strings. assert(interned == NULL); -#endif #endif /* Look in the global cache first. */ @@ -15446,11 +15432,6 @@ intern_common(PyInterpreterState *interp, PyObject *s /* stolen */, return s; } - /* Handle statically allocated strings. */ - if (_PyUnicode_STATE(s).statically_allocated) { - return intern_static(interp, s); - } - /* Is it already interned? */ switch (PyUnicode_CHECK_INTERNED(s)) { case SSTATE_NOT_INTERNED: @@ -15467,6 +15448,9 @@ intern_common(PyInterpreterState *interp, PyObject *s /* stolen */, return s; } + /* Statically allocated strings must be already interned. 
*/ + assert(!_PyUnicode_STATE(s).statically_allocated); + #if Py_GIL_DISABLED /* In the free-threaded build, all interned strings are immortal */ immortalize = 1; @@ -15477,13 +15461,11 @@ intern_common(PyInterpreterState *interp, PyObject *s /* stolen */, immortalize = 1; } - /* if it's a short string, get the singleton -- and intern it */ + /* if it's a short string, get the singleton */ if (PyUnicode_GET_LENGTH(s) == 1 && PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) { PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s)); - if (!PyUnicode_CHECK_INTERNED(r)) { - r = intern_static(interp, r); - } + assert(PyUnicode_CHECK_INTERNED(r)); Py_DECREF(s); return r; }