gh-100227: Move the Dict of Interned Strings to PyInterpreterState (gh-102339)

We can revisit the options for keeping it global later, if desired.  For now, keeping it global seems quite complex, so we've gone with the simpler solution of isolating it per interpreter in the meantime.

https://github.com/python/cpython/issues/100227
This commit is contained in:
Eric Snow 2023-03-28 12:52:28 -06:00 committed by GitHub
parent 7703def37e
commit ba65a065cf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 727 additions and 718 deletions

View File

@ -23,13 +23,6 @@ extern "C" {
// Only immutable objects should be considered runtime-global.
// All others must be per-interpreter.
#define _Py_CACHED_OBJECT(NAME) \
_PyRuntime.cached_objects.NAME
struct _Py_cached_objects {
PyObject *interned_strings;
};
#define _Py_GLOBAL_OBJECT(NAME) \
_PyRuntime.static_objects.NAME
#define _Py_SINGLETON(NAME) \
@ -65,6 +58,8 @@ struct _Py_static_objects {
(interp)->cached_objects.NAME
struct _Py_interp_cached_objects {
PyObject *interned_strings;
/* AST */
PyObject *str_replace_inf;

View File

@ -163,7 +163,6 @@ typedef struct pyruntimestate {
} types;
/* All the objects that are shared by the runtime's interpreters. */
struct _Py_cached_objects cached_objects;
struct _Py_static_objects static_objects;
/* The following fields are here to avoid allocation during init.

View File

@ -59,6 +59,7 @@ struct _Py_unicode_state {
struct _Py_unicode_ids ids;
};
extern void _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p);
extern void _PyUnicode_ClearInterned(PyInterpreterState *interp);

File diff suppressed because it is too large Load Diff

View File

@ -231,14 +231,32 @@ static inline PyObject* unicode_new_empty(void)
Another way to look at this is to say that the actual reference
count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
*/
static inline PyObject *get_interned_dict(void)
static inline PyObject *get_interned_dict(PyInterpreterState *interp)
{
return _Py_CACHED_OBJECT(interned_strings);
return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
}
static inline void set_interned_dict(PyObject *dict)
/* Create the interned-strings dict for `interp` and store it in the
 * interpreter's cached objects.
 *
 * The slot must still be NULL when this is called (asserted).
 *
 * Returns 0 on success, or -1 if PyDict_New() fails; in that case the
 * slot is left untouched.
 */
static int
init_interned_dict(PyInterpreterState *interp)
{
    assert(get_interned_dict(interp) == NULL);
    /* Plain initialization: the previous form assigned `interned` inside
       its own initializer, which was redundant. */
    PyObject *interned = PyDict_New();
    if (interned == NULL) {
        return -1;
    }
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
    return 0;
}
/* Drop the interned-strings dict stored in `interp`'s cached objects.
 *
 * When the slot holds a dict, the dict is emptied, one reference to it is
 * released, and the slot is reset to NULL.  When the slot is already NULL
 * nothing happens.
 */
static void
clear_interned_dict(PyInterpreterState *interp)
{
    PyObject *dict = get_interned_dict(interp);
    if (dict == NULL) {
        return;
    }
    PyDict_Clear(dict);
    Py_DECREF(dict);
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
}
#define _Py_RETURN_UNICODE_EMPTY() \
@ -1520,12 +1538,12 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
static void
unicode_dealloc(PyObject *unicode)
{
PyInterpreterState *interp = _PyInterpreterState_GET();
#ifdef Py_DEBUG
if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
_Py_FatalRefcountError("deallocating an Unicode singleton");
}
#endif
PyObject *interned = get_interned_dict();
if (PyUnicode_CHECK_INTERNED(unicode)) {
/* Revive the dead object temporarily. PyDict_DelItem() removes two
references (key and value) which were ignored by
@ -1534,6 +1552,8 @@ unicode_dealloc(PyObject *unicode)
PyDict_DelItem(). */
assert(Py_REFCNT(unicode) == 0);
Py_SET_REFCNT(unicode, 3);
PyObject *interned = get_interned_dict(interp);
assert(interned != NULL);
if (PyDict_DelItem(interned, unicode) != 0) {
_PyErr_WriteUnraisableMsg("deletion of interned string failed",
NULL);
@ -14529,34 +14549,29 @@ _PyUnicode_InitState(PyInterpreterState *interp)
PyStatus
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
{
if (!_Py_IsMainInterpreter(interp)) {
return _PyStatus_OK();
}
// Initialize the per-interpreter interned dict
PyObject *interned = PyDict_New();
if (interned == NULL) {
if (init_interned_dict(interp)) {
PyErr_Clear();
return _PyStatus_ERR("failed to create interned dict");
}
set_interned_dict(interned);
/* Intern statically allocated string identifiers and deepfreeze strings.
* This must be done before any module initialization so that statically
* allocated string identifiers are used instead of heap allocated strings.
* Deepfreeze uses the interned identifiers if present to save space
* else generates them and they are interned to speed up dict lookups.
*/
_PyUnicode_InitStaticStrings();
if (_Py_IsMainInterpreter(interp)) {
/* Intern statically allocated string identifiers and deepfreeze strings.
* This must be done before any module initialization so that statically
* allocated string identifiers are used instead of heap allocated strings.
* Deepfreeze uses the interned identifiers if present to save space
* else generates them and they are interned to speed up dict lookups.
*/
_PyUnicode_InitStaticStrings(interp);
#ifdef Py_DEBUG
assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
for (int i = 0; i < 256; i++) {
assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
}
for (int i = 0; i < 256; i++) {
assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
}
#endif
}
return _PyStatus_OK();
}
@ -14586,7 +14601,7 @@ error:
void
PyUnicode_InternInPlace(PyObject **p)
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
{
PyObject *s = *p;
#ifdef Py_DEBUG
@ -14608,7 +14623,7 @@ PyUnicode_InternInPlace(PyObject **p)
return;
}
PyObject *interned = get_interned_dict();
PyObject *interned = get_interned_dict(interp);
assert(interned != NULL);
PyObject *t = PyDict_SetDefault(interned, s, s);
@ -14629,6 +14644,13 @@ PyUnicode_InternInPlace(PyObject **p)
_PyUnicode_STATE(s).interned = 1;
}
/* Thin wrapper: forwards `p` to _PyUnicode_InternInPlace() together with
 * the interpreter state obtained from _PyInterpreterState_GET(). */
void
PyUnicode_InternInPlace(PyObject **p)
{
    _PyUnicode_InternInPlace(_PyInterpreterState_GET(), p);
}
// Function kept for the stable ABI.
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
void
@ -14653,12 +14675,7 @@ PyUnicode_InternFromString(const char *cp)
void
_PyUnicode_ClearInterned(PyInterpreterState *interp)
{
if (!_Py_IsMainInterpreter(interp)) {
// interned dict is shared by all interpreters
return;
}
PyObject *interned = get_interned_dict();
PyObject *interned = get_interned_dict(interp);
if (interned == NULL) {
return;
}
@ -14693,9 +14710,7 @@ _PyUnicode_ClearInterned(PyInterpreterState *interp)
total_length);
#endif
PyDict_Clear(interned);
Py_DECREF(interned);
set_interned_dict(NULL);
clear_interned_dict(interp);
}
@ -15108,7 +15123,7 @@ _PyUnicode_EnableLegacyWindowsFSEncoding(void)
static inline int
unicode_is_finalizing(void)
{
return (get_interned_dict() == NULL);
return (get_interned_dict(_PyInterpreterState_Main()) == NULL);
}
#endif
@ -15131,14 +15146,13 @@ _PyUnicode_Fini(PyInterpreterState *interp)
{
struct _Py_unicode_state *state = &interp->unicode;
if (_Py_IsMainInterpreter(interp)) {
// _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
assert(get_interned_dict() == NULL);
// bpo-47182: force a unicodedata CAPI capsule re-import on
// subsequent initialization of main interpreter.
}
// _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
assert(get_interned_dict(interp) == NULL);
_PyUnicode_FiniEncodings(&state->fs_codec);
// bpo-47182: force a unicodedata CAPI capsule re-import on
// subsequent initialization of interpreter.
interp->unicode.ucnhash_capi = NULL;
unicode_clear_identifiers(state);

View File

@ -354,14 +354,14 @@ def generate_static_strings_initializer(identifiers, strings):
printer.write(before)
printer.write(START)
printer.write("static inline void")
with printer.block("_PyUnicode_InitStaticStrings(void)"):
with printer.block("_PyUnicode_InitStaticStrings(PyInterpreterState *interp)"):
printer.write(f'PyObject *string;')
for i in sorted(identifiers):
# This use of _Py_ID() is ignored by iter_global_strings()
# since iter_files() ignores .h files.
printer.write(f'string = &_Py_ID({i});')
printer.write(f'assert(_PyUnicode_CheckConsistency(string, 1));')
printer.write(f'PyUnicode_InternInPlace(&string);')
printer.write(f'_PyUnicode_InternInPlace(interp, &string);')
# XXX What about "strings"?
printer.write(END)
printer.write(after)