gh-122291: Intern latin-1 one-byte strings at startup (GH-122303)

This commit is contained in:
Petr Viktorin 2024-07-27 10:27:06 +02:00 committed by GitHub
parent c08696286f
commit bb09ba6792
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 43 additions and 65 deletions

View File

@ -8,51 +8,50 @@
This is used to optimize dict and attribute lookups, among other things. This is used to optimize dict and attribute lookups, among other things.
Python uses three different mechanisms to intern strings: Python uses two different mechanisms to intern strings: singletons and
dynamic interning.
- Singleton strings marked in C source with `_Py_STR` and `_Py_ID` macros. ## Singletons
These are statically allocated, and collected using `make regen-global-objects`
(`Tools/build/generate_global_objects.py`), which generates code
for declaration, initialization and finalization.
The difference between the two kinds is not important. (A `_Py_ID` string is The 256 possible one-character latin-1 strings, which can be retrieved with
a valid C name, with which we can refer to it; a `_Py_STR` may e.g. contain `_Py_LATIN1_CHR(c)`, are stored in statically allocated arrays,
non-identifier characters, so it needs a separate C-compatible name.) `_PyRuntime.static_objects.strings.ascii` and
`_PyRuntime.static_objects.strings.latin1`.
The empty string is in this category (as `_Py_STR(empty)`). Longer singleton strings are marked in C source with `_Py_ID` (if the string
is a valid C identifier fragment) or `_Py_STR` (if it needs a separate
C-compatible name).
These are also stored in statically allocated arrays.
They are collected from CPython sources using `make regen-global-objects`
(`Tools/build/generate_global_objects.py`), which generates code
for declaration, initialization and finalization.
These singletons are interned in a runtime-global lookup table, The empty string is one of the singletons: `_Py_STR(empty)`.
`_PyRuntime.cached_objects.interned_strings` (`INTERNED_STRINGS`),
at runtime initialization.
- The 256 possible one-character latin-1 strings are singletons, The three sets of singletons (`_Py_LATIN1_CHR`, `_Py_ID`, `_Py_STR`)
which can be retrieved with `_Py_LATIN1_CHR(c)`, are stored in runtime-global
arrays, `_PyRuntime.static_objects.strings.ascii` and
`_PyRuntime.static_objects.strings.latin1`.
These are NOT interned at startup in the normal build.
In the free-threaded build, they are; this avoids modifying the
global lookup table after threads are started.
Interning a one-char latin-1 string will always intern the corresponding
singleton.
- All other strings are allocated dynamically, and have their
`_PyUnicode_STATE(s).statically_allocated` flag set to zero.
When interned, such strings are added to an interpreter-wide dict,
`PyInterpreterState.cached_objects.interned_strings`.
The key and value of each entry in this dict reference the same object.
The three sets of singletons (`_Py_STR`, `_Py_ID`, `_Py_LATIN1_CHR`)
are disjoint. are disjoint.
If you have such a singleton, it (and no other copy) will be interned. If you have such a singleton, it (and no other copy) will be interned.
These singletons are interned in a runtime-global lookup table,
`_PyRuntime.cached_objects.interned_strings` (`INTERNED_STRINGS`),
at runtime initialization; the table is then immutable until it's torn down
at runtime finalization.
Because the table is never modified in between, it is shared across threads and interpreters without any synchronization.
## Dynamically allocated strings
All other strings are allocated dynamically, and have their
`_PyUnicode_STATE(s).statically_allocated` flag set to zero.
When interned, such strings are added to an interpreter-wide dict,
`PyInterpreterState.cached_objects.interned_strings`.
The key and value of each entry in this dict reference the same object.
## Immortality and reference counting ## Immortality and reference counting
Invariant: Every immortal string is interned, *except* the one-char latin-1 Invariant: Every immortal string is interned.
singletons (which might but might not be interned).
In practice, this means that you must not use `_Py_SetImmortal` on In practice, this means that you must not use `_Py_SetImmortal` on
a string. (If you know it's already immortal, don't immortalize it; a string. (If you know it's already immortal, don't immortalize it;
@ -115,8 +114,5 @@ The valid transitions between these states are:
Using `_PyUnicode_InternStatic` on these is an error; the other cases Using `_PyUnicode_InternStatic` on these is an error; the other cases
don't change the state. don't change the state.
- One-char latin-1 singletons can be interned (0 -> 3) using any interning - Singletons are interned (0 -> 3) at runtime init;
function; after that the functions don't change the state.
- Other statically allocated strings are interned (0 -> 3) at runtime init;
after that all interning functions don't change the state. after that all interning functions don't change the state.

View File

@ -325,7 +325,8 @@ init_global_interned_strings(PyInterpreterState *interp)
return _PyStatus_ERR("failed to create global interned dict"); return _PyStatus_ERR("failed to create global interned dict");
} }
/* Intern statically allocated string identifiers and deepfreeze strings. /* Intern statically allocated string identifiers, deepfreeze strings,
* and one-byte latin-1 strings.
* This must be done before any module initialization so that statically * This must be done before any module initialization so that statically
* allocated string identifiers are used instead of heap allocated strings. * allocated string identifiers are used instead of heap allocated strings.
* Deepfreeze uses the interned identifiers if present to save space * Deepfreeze uses the interned identifiers if present to save space
@ -333,14 +334,11 @@ init_global_interned_strings(PyInterpreterState *interp)
*/ */
_PyUnicode_InitStaticStrings(interp); _PyUnicode_InitStaticStrings(interp);
#ifdef Py_GIL_DISABLED
// In the free-threaded build, intern the 1-byte strings as well
for (int i = 0; i < 256; i++) { for (int i = 0; i < 256; i++) {
PyObject *s = LATIN1(i); PyObject *s = LATIN1(i);
_PyUnicode_InternStatic(interp, &s); _PyUnicode_InternStatic(interp, &s);
assert(s == LATIN1(i)); assert(s == LATIN1(i));
} }
#endif
#ifdef Py_DEBUG #ifdef Py_DEBUG
assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1)); assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
@ -15355,26 +15353,14 @@ intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
assert(s != NULL); assert(s != NULL);
assert(_PyUnicode_CHECK(s)); assert(_PyUnicode_CHECK(s));
assert(_PyUnicode_STATE(s).statically_allocated); assert(_PyUnicode_STATE(s).statically_allocated);
assert(!PyUnicode_CHECK_INTERNED(s));
switch (PyUnicode_CHECK_INTERNED(s)) {
case SSTATE_NOT_INTERNED:
break;
case SSTATE_INTERNED_IMMORTAL_STATIC:
return s;
default:
Py_FatalError("_PyUnicode_InternStatic called on wrong string");
}
#ifdef Py_DEBUG #ifdef Py_DEBUG
/* We must not add process-global interned string if there's already a /* We must not add process-global interned string if there's already a
* per-interpreter interned_dict, which might contain duplicates. * per-interpreter interned_dict, which might contain duplicates.
* Except "short string" singletons: those are special-cased. */ */
PyObject *interned = get_interned_dict(interp); PyObject *interned = get_interned_dict(interp);
assert(interned == NULL || unicode_is_singleton(s));
#ifdef Py_GIL_DISABLED
// In the free-threaded build, don't allow even the short strings.
assert(interned == NULL); assert(interned == NULL);
#endif
#endif #endif
/* Look in the global cache first. */ /* Look in the global cache first. */
@ -15446,11 +15432,6 @@ intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
return s; return s;
} }
/* Handle statically allocated strings. */
if (_PyUnicode_STATE(s).statically_allocated) {
return intern_static(interp, s);
}
/* Is it already interned? */ /* Is it already interned? */
switch (PyUnicode_CHECK_INTERNED(s)) { switch (PyUnicode_CHECK_INTERNED(s)) {
case SSTATE_NOT_INTERNED: case SSTATE_NOT_INTERNED:
@ -15467,6 +15448,9 @@ intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
return s; return s;
} }
/* Statically allocated strings must be already interned. */
assert(!_PyUnicode_STATE(s).statically_allocated);
#if Py_GIL_DISABLED #if Py_GIL_DISABLED
/* In the free-threaded build, all interned strings are immortal */ /* In the free-threaded build, all interned strings are immortal */
immortalize = 1; immortalize = 1;
@ -15477,13 +15461,11 @@ intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
immortalize = 1; immortalize = 1;
} }
/* if it's a short string, get the singleton -- and intern it */ /* if it's a short string, get the singleton */
if (PyUnicode_GET_LENGTH(s) == 1 && if (PyUnicode_GET_LENGTH(s) == 1 &&
PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) { PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s)); PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
if (!PyUnicode_CHECK_INTERNED(r)) { assert(PyUnicode_CHECK_INTERNED(r));
r = intern_static(interp, r);
}
Py_DECREF(s); Py_DECREF(s);
return r; return r;
} }