mirror of https://github.com/python/cpython
gh-122291: Intern latin-1 one-byte strings at startup (GH-122303)
This commit is contained in:
parent
c08696286f
commit
bb09ba6792
|
@ -8,51 +8,50 @@
|
||||||
|
|
||||||
This is used to optimize dict and attribute lookups, among other things.
|
This is used to optimize dict and attribute lookups, among other things.
|
||||||
|
|
||||||
Python uses three different mechanisms to intern strings:
|
Python uses two different mechanisms to intern strings: singletons and
|
||||||
|
dynamic interning.
|
||||||
|
|
||||||
- Singleton strings marked in C source with `_Py_STR` and `_Py_ID` macros.
|
## Singletons
|
||||||
These are statically allocated, and collected using `make regen-global-objects`
|
|
||||||
|
The 256 possible one-character latin-1 strings, which can be retrieved with
|
||||||
|
`_Py_LATIN1_CHR(c)`, are stored in statically allocated arrays,
|
||||||
|
`_PyRuntime.static_objects.strings.ascii` and
|
||||||
|
`_PyRuntime.static_objects.strings.latin1`.
|
||||||
|
|
||||||
|
Longer singleton strings are marked in C source with `_Py_ID` (if the string
|
||||||
|
is a valid C identifier fragment) or `_Py_STR` (if it needs a separate
|
||||||
|
C-compatible name).
|
||||||
|
These are also stored in statically allocated arrays.
|
||||||
|
They are collected from CPython sources using `make regen-global-objects`
|
||||||
(`Tools/build/generate_global_objects.py`), which generates code
|
(`Tools/build/generate_global_objects.py`), which generates code
|
||||||
for declaration, initialization and finalization.
|
for declaration, initialization and finalization.
|
||||||
|
|
||||||
The difference between the two kinds is not important. (A `_Py_ID` string is
|
The empty string is one of the singletons: `_Py_STR(empty)`.
|
||||||
a valid C name, with which we can refer to it; a `_Py_STR` may e.g. contain
|
|
||||||
non-identifier characters, so it needs a separate C-compatible name.)
|
|
||||||
|
|
||||||
The empty string is in this category (as `_Py_STR(empty)`).
|
The three sets of singletons (`_Py_LATIN1_CHR`, `_Py_ID`, `_Py_STR`)
|
||||||
|
are disjoint.
|
||||||
|
If you have such a singleton, it (and no other copy) will be interned.
|
||||||
|
|
||||||
These singletons are interned in a runtime-global lookup table,
|
These singletons are interned in a runtime-global lookup table,
|
||||||
`_PyRuntime.cached_objects.interned_strings` (`INTERNED_STRINGS`),
|
`_PyRuntime.cached_objects.interned_strings` (`INTERNED_STRINGS`),
|
||||||
at runtime initialization.
|
at runtime initialization; the table is immutable until it's torn down
|
||||||
|
at runtime finalization.
|
||||||
|
It is shared across threads and interpreters without any synchronization.
|
||||||
|
|
||||||
- The 256 possible one-character latin-1 strings are singletons,
|
|
||||||
which can be retrieved with `_Py_LATIN1_CHR(c)`, are stored in runtime-global
|
|
||||||
arrays, `_PyRuntime.static_objects.strings.ascii` and
|
|
||||||
`_PyRuntime.static_objects.strings.latin1`.
|
|
||||||
|
|
||||||
These are NOT interned at startup in the normal build.
|
## Dynamically allocated strings
|
||||||
In the free-threaded build, they are; this avoids modifying the
|
|
||||||
global lookup table after threads are started.
|
|
||||||
|
|
||||||
Interning a one-char latin-1 string will always intern the corresponding
|
All other strings are allocated dynamically, and have their
|
||||||
singleton.
|
|
||||||
|
|
||||||
- All other strings are allocated dynamically, and have their
|
|
||||||
`_PyUnicode_STATE(s).statically_allocated` flag set to zero.
|
`_PyUnicode_STATE(s).statically_allocated` flag set to zero.
|
||||||
When interned, such strings are added to an interpreter-wide dict,
|
When interned, such strings are added to an interpreter-wide dict,
|
||||||
`PyInterpreterState.cached_objects.interned_strings`.
|
`PyInterpreterState.cached_objects.interned_strings`.
|
||||||
|
|
||||||
The key and value of each entry in this dict reference the same object.
|
The key and value of each entry in this dict reference the same object.
|
||||||
|
|
||||||
The three sets of singletons (`_Py_STR`, `_Py_ID`, `_Py_LATIN1_CHR`)
|
|
||||||
are disjoint.
|
|
||||||
If you have such a singleton, it (and no other copy) will be interned.
|
|
||||||
|
|
||||||
|
|
||||||
## Immortality and reference counting
|
## Immortality and reference counting
|
||||||
|
|
||||||
Invariant: Every immortal string is interned, *except* the one-char latin-1
|
Invariant: Every immortal string is interned.
|
||||||
singletons (which might but might not be interned).
|
|
||||||
|
|
||||||
In practice, this means that you must not use `_Py_SetImmortal` on
|
In practice, this means that you must not use `_Py_SetImmortal` on
|
||||||
a string. (If you know it's already immortal, don't immortalize it;
|
a string. (If you know it's already immortal, don't immortalize it;
|
||||||
|
@ -115,8 +114,5 @@ The valid transitions between these states are:
|
||||||
Using `_PyUnicode_InternStatic` on these is an error; the other cases
|
Using `_PyUnicode_InternStatic` on these is an error; the other cases
|
||||||
don't change the state.
|
don't change the state.
|
||||||
|
|
||||||
- One-char latin-1 singletons can be interned (0 -> 3) using any interning
|
- Singletons are interned (0 -> 3) at runtime init;
|
||||||
function; after that the functions don't change the state.
|
|
||||||
|
|
||||||
- Other statically allocated strings are interned (0 -> 3) at runtime init;
|
|
||||||
after that, no interning function changes the state.
|
after that, no interning function changes the state.
|
||||||
|
|
|
@ -325,7 +325,8 @@ init_global_interned_strings(PyInterpreterState *interp)
|
||||||
return _PyStatus_ERR("failed to create global interned dict");
|
return _PyStatus_ERR("failed to create global interned dict");
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Intern statically allocated string identifiers and deepfreeze strings.
|
/* Intern statically allocated string identifiers, deepfreeze strings,
|
||||||
|
* and one-byte latin-1 strings.
|
||||||
* This must be done before any module initialization so that statically
|
* This must be done before any module initialization so that statically
|
||||||
* allocated string identifiers are used instead of heap allocated strings.
|
* allocated string identifiers are used instead of heap allocated strings.
|
||||||
* Deepfreeze uses the interned identifiers if present to save space
|
* Deepfreeze uses the interned identifiers if present to save space
|
||||||
|
@ -333,14 +334,11 @@ init_global_interned_strings(PyInterpreterState *interp)
|
||||||
*/
|
*/
|
||||||
_PyUnicode_InitStaticStrings(interp);
|
_PyUnicode_InitStaticStrings(interp);
|
||||||
|
|
||||||
#ifdef Py_GIL_DISABLED
|
|
||||||
// In the free-threaded build, intern the 1-byte strings as well
|
|
||||||
for (int i = 0; i < 256; i++) {
|
for (int i = 0; i < 256; i++) {
|
||||||
PyObject *s = LATIN1(i);
|
PyObject *s = LATIN1(i);
|
||||||
_PyUnicode_InternStatic(interp, &s);
|
_PyUnicode_InternStatic(interp, &s);
|
||||||
assert(s == LATIN1(i));
|
assert(s == LATIN1(i));
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
#ifdef Py_DEBUG
|
#ifdef Py_DEBUG
|
||||||
assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
|
assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
|
||||||
|
|
||||||
|
@ -15355,26 +15353,14 @@ intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
|
||||||
assert(s != NULL);
|
assert(s != NULL);
|
||||||
assert(_PyUnicode_CHECK(s));
|
assert(_PyUnicode_CHECK(s));
|
||||||
assert(_PyUnicode_STATE(s).statically_allocated);
|
assert(_PyUnicode_STATE(s).statically_allocated);
|
||||||
|
assert(!PyUnicode_CHECK_INTERNED(s));
|
||||||
switch (PyUnicode_CHECK_INTERNED(s)) {
|
|
||||||
case SSTATE_NOT_INTERNED:
|
|
||||||
break;
|
|
||||||
case SSTATE_INTERNED_IMMORTAL_STATIC:
|
|
||||||
return s;
|
|
||||||
default:
|
|
||||||
Py_FatalError("_PyUnicode_InternStatic called on wrong string");
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef Py_DEBUG
|
#ifdef Py_DEBUG
|
||||||
/* We must not add process-global interned string if there's already a
|
/* We must not add process-global interned string if there's already a
|
||||||
* per-interpreter interned_dict, which might contain duplicates.
|
* per-interpreter interned_dict, which might contain duplicates.
|
||||||
* Except "short string" singletons: those are special-cased. */
|
*/
|
||||||
PyObject *interned = get_interned_dict(interp);
|
PyObject *interned = get_interned_dict(interp);
|
||||||
assert(interned == NULL || unicode_is_singleton(s));
|
|
||||||
#ifdef Py_GIL_DISABLED
|
|
||||||
// In the free-threaded build, don't allow even the short strings.
|
|
||||||
assert(interned == NULL);
|
assert(interned == NULL);
|
||||||
#endif
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Look in the global cache first. */
|
/* Look in the global cache first. */
|
||||||
|
@ -15446,11 +15432,6 @@ intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Handle statically allocated strings. */
|
|
||||||
if (_PyUnicode_STATE(s).statically_allocated) {
|
|
||||||
return intern_static(interp, s);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Is it already interned? */
|
/* Is it already interned? */
|
||||||
switch (PyUnicode_CHECK_INTERNED(s)) {
|
switch (PyUnicode_CHECK_INTERNED(s)) {
|
||||||
case SSTATE_NOT_INTERNED:
|
case SSTATE_NOT_INTERNED:
|
||||||
|
@ -15467,6 +15448,9 @@ intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Statically allocated strings must be already interned. */
|
||||||
|
assert(!_PyUnicode_STATE(s).statically_allocated);
|
||||||
|
|
||||||
#if Py_GIL_DISABLED
|
#if Py_GIL_DISABLED
|
||||||
/* In the free-threaded build, all interned strings are immortal */
|
/* In the free-threaded build, all interned strings are immortal */
|
||||||
immortalize = 1;
|
immortalize = 1;
|
||||||
|
@ -15477,13 +15461,11 @@ intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
|
||||||
immortalize = 1;
|
immortalize = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if it's a short string, get the singleton -- and intern it */
|
/* if it's a short string, get the singleton */
|
||||||
if (PyUnicode_GET_LENGTH(s) == 1 &&
|
if (PyUnicode_GET_LENGTH(s) == 1 &&
|
||||||
PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
|
PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
|
||||||
PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
|
PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
|
||||||
if (!PyUnicode_CHECK_INTERNED(r)) {
|
assert(PyUnicode_CHECK_INTERNED(r));
|
||||||
r = intern_static(interp, r);
|
|
||||||
}
|
|
||||||
Py_DECREF(s);
|
Py_DECREF(s);
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue