From bf305cc6f05948f264349a6a6c6fd7d49c1839d3 Mon Sep 17 00:00:00 2001
From: Victor Stinner
Date: Wed, 5 Feb 2020 17:39:57 +0100
Subject: [PATCH] Add PyInterpreterState.fs_codec.utf8 (GH-18367)

Add a fast-path for UTF-8 encoding in PyUnicode_EncodeFSDefault()
and PyUnicode_DecodeFSDefaultAndSize().

Add _PyUnicode_FiniEncodings() helper function for _PyUnicode_Fini().
---
 Include/internal/pycore_pystate.h |   1 +
 Objects/unicodeobject.c           | 107 +++++++++++++++---------------
 2 files changed, 55 insertions(+), 53 deletions(-)

diff --git a/Include/internal/pycore_pystate.h b/Include/internal/pycore_pystate.h
index b78ed690425..405efb9f460 100644
--- a/Include/internal/pycore_pystate.h
+++ b/Include/internal/pycore_pystate.h
@@ -102,6 +102,7 @@ struct _is {
        Later, it is set to a non-NULL string by _PyUnicode_InitEncodings(). */
     struct {
         char *encoding;   /* Filesystem encoding (encoded to UTF-8) */
+        int utf8;         /* encoding=="utf-8"? */
         char *errors;     /* Filesystem errors (encoded to UTF-8) */
         _Py_error_handler error_handler;
     } fs_codec;
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 5f10437a152..7c8bc06252a 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3615,39 +3615,32 @@ PyObject *
 PyUnicode_EncodeFSDefault(PyObject *unicode)
 {
     PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
-#ifdef _Py_FORCE_UTF8_FS_ENCODING
-    if (interp->fs_codec.encoding) {
+    if (interp->fs_codec.utf8) {
         return unicode_encode_utf8(unicode,
                                    interp->fs_codec.error_handler,
                                    interp->fs_codec.errors);
     }
-    else {
-        const wchar_t *filesystem_errors = interp->config.filesystem_errors;
-        _Py_error_handler errors;
-        errors = get_error_handler_wide(filesystem_errors);
-        assert(errors != _Py_ERROR_UNKNOWN);
-        return unicode_encode_utf8(unicode, errors, NULL);
-    }
-#else
-    /* Bootstrap check: if the filesystem codec is implemented in Python, we
-       cannot use it to encode and decode filenames before it is loaded. Load
-       the Python codec requires to encode at least its own filename. Use the C
-       implementation of the locale codec until the codec registry is
-       initialized and the Python codec is loaded.
-       See _PyUnicode_InitEncodings(). */
-    if (interp->fs_codec.encoding) {
+#ifndef _Py_FORCE_UTF8_FS_ENCODING
+    else if (interp->fs_codec.encoding) {
         return PyUnicode_AsEncodedString(unicode,
                                          interp->fs_codec.encoding,
                                          interp->fs_codec.errors);
     }
-    else {
-        const wchar_t *filesystem_errors = interp->config.filesystem_errors;
-        _Py_error_handler errors;
-        errors = get_error_handler_wide(filesystem_errors);
-        assert(errors != _Py_ERROR_UNKNOWN);
-        return unicode_encode_locale(unicode, errors, 0);
-    }
 #endif
+    else {
+        /* Before _PyUnicode_InitEncodings() is called, the Python codec
+           machinery is not ready and so cannot be used:
+           use wcstombs() in this case. */
+        const wchar_t *filesystem_errors = interp->config.filesystem_errors;
+        assert(filesystem_errors != NULL);
+        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
+        assert(errors != _Py_ERROR_UNKNOWN);
+#ifdef _Py_FORCE_UTF8_FS_ENCODING
+        return unicode_encode_utf8(unicode, errors, NULL);
+#else
+        return unicode_encode_locale(unicode, errors, 0);
+#endif
+    }
 }
 
 PyObject *
@@ -3857,39 +3850,33 @@ PyObject*
 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
 {
     PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
-#ifdef _Py_FORCE_UTF8_FS_ENCODING
-    if (interp->fs_codec.encoding) {
+    if (interp->fs_codec.utf8) {
         return unicode_decode_utf8(s, size,
                                    interp->fs_codec.error_handler,
                                    interp->fs_codec.errors,
                                    NULL);
     }
-    else {
-        const wchar_t *filesystem_errors = interp->config.filesystem_errors;
-        _Py_error_handler errors;
-        errors = get_error_handler_wide(filesystem_errors);
-        assert(errors != _Py_ERROR_UNKNOWN);
-        return unicode_decode_utf8(s, size, errors, NULL, NULL);
-    }
-#else
-    /* Bootstrap check: if the filesystem codec is implemented in Python, we
-       cannot use it to encode and decode filenames before it is loaded. Load
-       the Python codec requires to encode at least its own filename. Use the C
-       implementation of the locale codec until the codec registry is
-       initialized and the Python codec is loaded.
-       See _PyUnicode_InitEncodings(). */
-    if (interp->fs_codec.encoding) {
+#ifndef _Py_FORCE_UTF8_FS_ENCODING
+    else if (interp->fs_codec.encoding) {
         return PyUnicode_Decode(s, size,
                                 interp->fs_codec.encoding,
                                 interp->fs_codec.errors);
     }
-    else {
-        const wchar_t *filesystem_errors = interp->config.filesystem_errors;
-        _Py_error_handler errors;
-        errors = get_error_handler_wide(filesystem_errors);
-        return unicode_decode_locale(s, size, errors, 0);
-    }
 #endif
+    else {
+        /* Before _PyUnicode_InitEncodings() is called, the Python codec
+           machinery is not ready and so cannot be used:
+           use mbstowcs() in this case. */
+        const wchar_t *filesystem_errors = interp->config.filesystem_errors;
+        assert(filesystem_errors != NULL);
+        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
+        assert(errors != _Py_ERROR_UNKNOWN);
+#ifdef _Py_FORCE_UTF8_FS_ENCODING
+        return unicode_decode_utf8(s, size, errors, NULL, NULL);
+#else
+        return unicode_decode_locale(s, size, errors, 0);
+#endif
+    }
 }
 
 
@@ -15849,10 +15836,16 @@ init_fs_codec(PyInterpreterState *interp)
 
     PyMem_RawFree(interp->fs_codec.encoding);
     interp->fs_codec.encoding = encoding;
+    /* encoding has been normalized by init_fs_encoding() */
+    interp->fs_codec.utf8 = (strcmp(encoding, "utf-8") == 0);
     PyMem_RawFree(interp->fs_codec.errors);
     interp->fs_codec.errors = errors;
     interp->fs_codec.error_handler = error_handler;
 
+#ifdef _Py_FORCE_UTF8_FS_ENCODING
+    assert(interp->fs_codec.utf8 == 1);
+#endif
+
     /* At this point, PyUnicode_EncodeFSDefault() and
        PyUnicode_DecodeFSDefault() can now use the Python codec rather than
        the C implementation of the filesystem encoding. */
@@ -15902,6 +15895,19 @@ _PyUnicode_InitEncodings(PyThreadState *tstate)
 }
 
 
+static void
+_PyUnicode_FiniEncodings(PyThreadState *tstate)
+{
+    PyInterpreterState *interp = tstate->interp;
+    PyMem_RawFree(interp->fs_codec.encoding);
+    interp->fs_codec.encoding = NULL;
+    interp->fs_codec.utf8 = 0;
+    PyMem_RawFree(interp->fs_codec.errors);
+    interp->fs_codec.errors = NULL;
+    interp->fs_codec.error_handler = _Py_ERROR_UNKNOWN;
+}
+
+
 #ifdef MS_WINDOWS
 int
 _PyUnicode_EnableLegacyWindowsFSEncoding(void)
@@ -15954,12 +15960,7 @@ _PyUnicode_Fini(PyThreadState *tstate)
         _PyUnicode_ClearStaticStrings();
     }
 
-    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
-    PyMem_RawFree(interp->fs_codec.encoding);
-    interp->fs_codec.encoding = NULL;
-    PyMem_RawFree(interp->fs_codec.errors);
-    interp->fs_codec.errors = NULL;
-    interp->config.filesystem_errors = (wchar_t *)_Py_ERROR_UNKNOWN;
+    _PyUnicode_FiniEncodings(tstate);
 }
 
 