gh-116738: Make `_codecs` module thread-safe (#117530)

The module itself is a thin wrapper around calls to functions in
`Python/codecs.c`, so that's where the meaningful changes happened:

- Move codecs-related state that lives on `PyInterpreterState` to a
  struct declared in `pycore_codecs.h`.

- In free-threaded builds, add a mutex to `codecs_state` to synchronize
  operations on `search_path`. Because `search_path_mutex` is used as a
  normal mutex and not a critical section, we must be extremely careful
  with operations called while holding it.

- The codec registry is explicitly initialized as part of
  `_PyUnicode_InitEncodings` to simplify thread-safety.
This commit is contained in:
Brett Simmers 2024-05-02 15:25:36 -07:00 committed by GitHub
parent 4e2caf2aa0
commit f8290df63f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 120 additions and 79 deletions

View File

@ -8,6 +8,17 @@ extern "C" {
# error "this header requires Py_BUILD_CORE define" # error "this header requires Py_BUILD_CORE define"
#endif #endif
#include "pycore_lock.h" // PyMutex
/* Initialize codecs-related state for the given interpreter, including
registering the first codec search function. Must be called before any other
PyCodec-related functions, and while only one thread is active. */
extern PyStatus _PyCodec_InitRegistry(PyInterpreterState *interp);
/* Finalize codecs-related state for the given interpreter. No PyCodec-related
functions other than PyCodec_Unregister() may be called after this. */
extern void _PyCodec_Fini(PyInterpreterState *interp);
extern PyObject* _PyCodec_Lookup(const char *encoding); extern PyObject* _PyCodec_Lookup(const char *encoding);
/* Text codec specific encoding and decoding API. /* Text codec specific encoding and decoding API.
@ -48,6 +59,26 @@ extern PyObject* _PyCodecInfo_GetIncrementalEncoder(
PyObject *codec_info, PyObject *codec_info,
const char *errors); const char *errors);
// Per-interpreter state used by codecs.c.
struct codecs_state {
// A list of callable objects used to search for codecs.
PyObject *search_path;
// A dict mapping codec names to codecs returned from a callable in
// search_path.
PyObject *search_cache;
// A dict mapping error handling strategies to functions to implement them.
PyObject *error_registry;
#ifdef Py_GIL_DISABLED
// Used to safely delete a specific item from search_path.
PyMutex search_path_mutex;
#endif
// Whether or not the rest of the state is initialized.
int initialized;
};
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -14,6 +14,7 @@ extern "C" {
#include "pycore_atexit.h" // struct atexit_state #include "pycore_atexit.h" // struct atexit_state
#include "pycore_ceval_state.h" // struct _ceval_state #include "pycore_ceval_state.h" // struct _ceval_state
#include "pycore_code.h" // struct callable_cache #include "pycore_code.h" // struct callable_cache
#include "pycore_codecs.h" // struct codecs_state
#include "pycore_context.h" // struct _Py_context_state #include "pycore_context.h" // struct _Py_context_state
#include "pycore_crossinterp.h" // struct _xidregistry #include "pycore_crossinterp.h" // struct _xidregistry
#include "pycore_dict_state.h" // struct _Py_dict_state #include "pycore_dict_state.h" // struct _Py_dict_state
@ -182,10 +183,7 @@ struct _is {
possible to facilitate out-of-process observability possible to facilitate out-of-process observability
tools. */ tools. */
PyObject *codec_search_path; struct codecs_state codecs;
PyObject *codec_search_cache;
PyObject *codec_error_registry;
int codecs_initialized;
PyConfig config; PyConfig config;
unsigned long feature_flags; unsigned long feature_flags;

View File

@ -15441,7 +15441,11 @@ init_fs_encoding(PyThreadState *tstate)
PyStatus PyStatus
_PyUnicode_InitEncodings(PyThreadState *tstate) _PyUnicode_InitEncodings(PyThreadState *tstate)
{ {
PyStatus status = init_fs_encoding(tstate); PyStatus status = _PyCodec_InitRegistry(tstate->interp);
if (_PyStatus_EXCEPTION(status)) {
return status;
}
status = init_fs_encoding(tstate);
if (_PyStatus_EXCEPTION(status)) { if (_PyStatus_EXCEPTION(status)) {
return status; return status;
} }

View File

@ -11,6 +11,7 @@ Copyright (c) Corporation for National Research Initiatives.
#include "Python.h" #include "Python.h"
#include "pycore_call.h" // _PyObject_CallNoArgs() #include "pycore_call.h" // _PyObject_CallNoArgs()
#include "pycore_interp.h" // PyInterpreterState.codec_search_path #include "pycore_interp.h" // PyInterpreterState.codec_search_path
#include "pycore_lock.h" // PyMutex
#include "pycore_pyerrors.h" // _PyErr_FormatNote() #include "pycore_pyerrors.h" // _PyErr_FormatNote()
#include "pycore_pystate.h" // _PyInterpreterState_GET() #include "pycore_pystate.h" // _PyInterpreterState_GET()
#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
@ -19,24 +20,10 @@ const char *Py_hexdigits = "0123456789abcdef";
/* --- Codec Registry ----------------------------------------------------- */ /* --- Codec Registry ----------------------------------------------------- */
/* Import the standard encodings package which will register the first
codec search function.
This is done in a lazy way so that the Unicode implementation does
not downgrade startup time of scripts not needing it.
ImportErrors are silently ignored by this function. Only one try is
made.
*/
static int _PyCodecRegistry_Init(void); /* Forward */
int PyCodec_Register(PyObject *search_function) int PyCodec_Register(PyObject *search_function)
{ {
PyInterpreterState *interp = _PyInterpreterState_GET(); PyInterpreterState *interp = _PyInterpreterState_GET();
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) assert(interp->codecs.initialized);
goto onError;
if (search_function == NULL) { if (search_function == NULL) {
PyErr_BadArgument(); PyErr_BadArgument();
goto onError; goto onError;
@ -45,7 +32,14 @@ int PyCodec_Register(PyObject *search_function)
PyErr_SetString(PyExc_TypeError, "argument must be callable"); PyErr_SetString(PyExc_TypeError, "argument must be callable");
goto onError; goto onError;
} }
return PyList_Append(interp->codec_search_path, search_function); #ifdef Py_GIL_DISABLED
PyMutex_Lock(&interp->codecs.search_path_mutex);
#endif
int ret = PyList_Append(interp->codecs.search_path, search_function);
#ifdef Py_GIL_DISABLED
PyMutex_Unlock(&interp->codecs.search_path_mutex);
#endif
return ret;
onError: onError:
return -1; return -1;
@ -55,22 +49,34 @@ int
PyCodec_Unregister(PyObject *search_function) PyCodec_Unregister(PyObject *search_function)
{ {
PyInterpreterState *interp = _PyInterpreterState_GET(); PyInterpreterState *interp = _PyInterpreterState_GET();
PyObject *codec_search_path = interp->codec_search_path; if (interp->codecs.initialized != 1) {
/* Do nothing if codec_search_path is not created yet or was cleared. */ /* Do nothing if codecs state was cleared (only possible during
if (codec_search_path == NULL) { interpreter shutdown). */
return 0; return 0;
} }
PyObject *codec_search_path = interp->codecs.search_path;
assert(PyList_CheckExact(codec_search_path)); assert(PyList_CheckExact(codec_search_path));
Py_ssize_t n = PyList_GET_SIZE(codec_search_path); for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) {
for (Py_ssize_t i = 0; i < n; i++) { #ifdef Py_GIL_DISABLED
PyObject *item = PyList_GET_ITEM(codec_search_path, i); PyMutex_Lock(&interp->codecs.search_path_mutex);
#endif
PyObject *item = PyList_GetItemRef(codec_search_path, i);
int ret = 1;
if (item == search_function) { if (item == search_function) {
if (interp->codec_search_cache != NULL) { // We hold a reference to the item, so its destructor can't run
assert(PyDict_CheckExact(interp->codec_search_cache)); // while we hold search_path_mutex.
PyDict_Clear(interp->codec_search_cache); ret = PyList_SetSlice(codec_search_path, i, i+1, NULL);
} }
return PyList_SetSlice(codec_search_path, i, i+1, NULL); #ifdef Py_GIL_DISABLED
PyMutex_Unlock(&interp->codecs.search_path_mutex);
#endif
Py_DECREF(item);
if (ret != 1) {
assert(interp->codecs.search_cache != NULL);
assert(PyDict_CheckExact(interp->codecs.search_cache));
PyDict_Clear(interp->codecs.search_cache);
return ret;
} }
} }
return 0; return 0;
@ -132,9 +138,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
} }
PyInterpreterState *interp = _PyInterpreterState_GET(); PyInterpreterState *interp = _PyInterpreterState_GET();
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) { assert(interp->codecs.initialized);
return NULL;
}
/* Convert the encoding to a normalized Python string: all /* Convert the encoding to a normalized Python string: all
characters are converted to lower case, spaces and hyphens are characters are converted to lower case, spaces and hyphens are
@ -147,7 +151,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
/* First, try to lookup the name in the registry dictionary */ /* First, try to lookup the name in the registry dictionary */
PyObject *result; PyObject *result;
if (PyDict_GetItemRef(interp->codec_search_cache, v, &result) < 0) { if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) {
goto onError; goto onError;
} }
if (result != NULL) { if (result != NULL) {
@ -156,7 +160,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
} }
/* Next, scan the search functions in order of registration */ /* Next, scan the search functions in order of registration */
const Py_ssize_t len = PyList_Size(interp->codec_search_path); const Py_ssize_t len = PyList_Size(interp->codecs.search_path);
if (len < 0) if (len < 0)
goto onError; goto onError;
if (len == 0) { if (len == 0) {
@ -170,14 +174,15 @@ PyObject *_PyCodec_Lookup(const char *encoding)
for (i = 0; i < len; i++) { for (i = 0; i < len; i++) {
PyObject *func; PyObject *func;
func = PyList_GetItem(interp->codec_search_path, i); func = PyList_GetItemRef(interp->codecs.search_path, i);
if (func == NULL) if (func == NULL)
goto onError; goto onError;
result = PyObject_CallOneArg(func, v); result = PyObject_CallOneArg(func, v);
Py_DECREF(func);
if (result == NULL) if (result == NULL)
goto onError; goto onError;
if (result == Py_None) { if (result == Py_None) {
Py_DECREF(result); Py_CLEAR(result);
continue; continue;
} }
if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) { if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
@ -188,7 +193,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
} }
break; break;
} }
if (i == len) { if (result == NULL) {
/* XXX Perhaps we should cache misses too ? */ /* XXX Perhaps we should cache misses too ? */
PyErr_Format(PyExc_LookupError, PyErr_Format(PyExc_LookupError,
"unknown encoding: %s", encoding); "unknown encoding: %s", encoding);
@ -196,7 +201,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
} }
/* Cache and return the result */ /* Cache and return the result */
if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) { if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) {
Py_DECREF(result); Py_DECREF(result);
goto onError; goto onError;
} }
@ -600,13 +605,12 @@ PyObject *_PyCodec_DecodeText(PyObject *object,
int PyCodec_RegisterError(const char *name, PyObject *error) int PyCodec_RegisterError(const char *name, PyObject *error)
{ {
PyInterpreterState *interp = _PyInterpreterState_GET(); PyInterpreterState *interp = _PyInterpreterState_GET();
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) assert(interp->codecs.initialized);
return -1;
if (!PyCallable_Check(error)) { if (!PyCallable_Check(error)) {
PyErr_SetString(PyExc_TypeError, "handler must be callable"); PyErr_SetString(PyExc_TypeError, "handler must be callable");
return -1; return -1;
} }
return PyDict_SetItemString(interp->codec_error_registry, return PyDict_SetItemString(interp->codecs.error_registry,
name, error); name, error);
} }
@ -616,13 +620,12 @@ int PyCodec_RegisterError(const char *name, PyObject *error)
PyObject *PyCodec_LookupError(const char *name) PyObject *PyCodec_LookupError(const char *name)
{ {
PyInterpreterState *interp = _PyInterpreterState_GET(); PyInterpreterState *interp = _PyInterpreterState_GET();
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) assert(interp->codecs.initialized);
return NULL;
if (name==NULL) if (name==NULL)
name = "strict"; name = "strict";
PyObject *handler; PyObject *handler;
if (PyDict_GetItemStringRef(interp->codec_error_registry, name, &handler) < 0) { if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) {
return NULL; return NULL;
} }
if (handler == NULL) { if (handler == NULL) {
@ -1375,7 +1378,8 @@ static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
return PyCodec_SurrogateEscapeErrors(exc); return PyCodec_SurrogateEscapeErrors(exc);
} }
static int _PyCodecRegistry_Init(void) PyStatus
_PyCodec_InitRegistry(PyInterpreterState *interp)
{ {
static struct { static struct {
const char *name; const char *name;
@ -1463,45 +1467,51 @@ static int _PyCodecRegistry_Init(void)
} }
}; };
PyInterpreterState *interp = _PyInterpreterState_GET(); assert(interp->codecs.initialized == 0);
PyObject *mod; interp->codecs.search_path = PyList_New(0);
if (interp->codecs.search_path == NULL) {
if (interp->codec_search_path != NULL) return PyStatus_NoMemory();
return 0;
interp->codec_search_path = PyList_New(0);
if (interp->codec_search_path == NULL) {
return -1;
} }
interp->codecs.search_cache = PyDict_New();
interp->codec_search_cache = PyDict_New(); if (interp->codecs.search_cache == NULL) {
if (interp->codec_search_cache == NULL) { return PyStatus_NoMemory();
return -1;
} }
interp->codecs.error_registry = PyDict_New();
interp->codec_error_registry = PyDict_New(); if (interp->codecs.error_registry == NULL) {
if (interp->codec_error_registry == NULL) { return PyStatus_NoMemory();
return -1;
} }
for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) { for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL); PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
if (!func) { if (func == NULL) {
return -1; return PyStatus_NoMemory();
} }
int res = PyCodec_RegisterError(methods[i].name, func); int res = PyDict_SetItemString(interp->codecs.error_registry,
methods[i].name, func);
Py_DECREF(func); Py_DECREF(func);
if (res) { if (res < 0) {
return -1; return PyStatus_Error("Failed to insert into codec error registry");
} }
} }
mod = PyImport_ImportModule("encodings"); interp->codecs.initialized = 1;
// Importing `encodings' will call back into this module to register codec
// search functions, so this is done after everything else is initialized.
PyObject *mod = PyImport_ImportModule("encodings");
if (mod == NULL) { if (mod == NULL) {
return -1; return PyStatus_Error("Failed to import encodings module");
} }
Py_DECREF(mod); Py_DECREF(mod);
interp->codecs_initialized = 1;
return 0; return PyStatus_Ok();
}
void
_PyCodec_Fini(PyInterpreterState *interp)
{
Py_CLEAR(interp->codecs.search_path);
Py_CLEAR(interp->codecs.search_cache);
Py_CLEAR(interp->codecs.error_registry);
interp->codecs.initialized = 0;
} }

View File

@ -843,9 +843,7 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate)
} }
PyConfig_Clear(&interp->config); PyConfig_Clear(&interp->config);
Py_CLEAR(interp->codec_search_path); _PyCodec_Fini(interp);
Py_CLEAR(interp->codec_search_cache);
Py_CLEAR(interp->codec_error_registry);
assert(interp->imports.modules == NULL); assert(interp->imports.modules == NULL);
assert(interp->imports.modules_by_index == NULL); assert(interp->imports.modules_by_index == NULL);

View File

@ -344,7 +344,7 @@ Python/ceval.c - _PyEval_BinaryOps -
Python/ceval.c - _Py_INTERPRETER_TRAMPOLINE_INSTRUCTIONS - Python/ceval.c - _Py_INTERPRETER_TRAMPOLINE_INSTRUCTIONS -
Python/codecs.c - Py_hexdigits - Python/codecs.c - Py_hexdigits -
Python/codecs.c - ucnhash_capi - Python/codecs.c - ucnhash_capi -
Python/codecs.c _PyCodecRegistry_Init methods - Python/codecs.c _PyCodec_InitRegistry methods -
Python/compile.c - NO_LABEL - Python/compile.c - NO_LABEL -
Python/compile.c - NO_LOCATION - Python/compile.c - NO_LOCATION -
Python/dynload_shlib.c - _PyImport_DynLoadFiletab - Python/dynload_shlib.c - _PyImport_DynLoadFiletab -

Can't render this file because it has a wrong number of fields in line 4.