Add PyInterpreterState.fs_codec.utf8 (GH-18367)

Add a fast-path for UTF-8 encoding in PyUnicode_EncodeFSDefault()
and PyUnicode_DecodeFSDefaultAndSize().

Add _PyUnicode_FiniEncodings() helper function for _PyUnicode_Fini().
This commit is contained in:
Victor Stinner 2020-02-05 17:39:57 +01:00 committed by GitHub
parent 0e4e735d06
commit bf305cc6f0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 55 additions and 53 deletions

View File

@ -102,6 +102,7 @@ struct _is {
Later, it is set to a non-NULL string by _PyUnicode_InitEncodings(). */
struct {
char *encoding; /* Filesystem encoding name (encoded to UTF-8);
                   released with PyMem_RawFree() */
int utf8; /* Non-zero when encoding compares equal to "utf-8" */
char *errors; /* Filesystem error handler name (encoded to UTF-8);
                 released with PyMem_RawFree() */
/* Error handler passed, together with `errors`, to the C UTF-8
   encoder/decoder by the filesystem encode/decode functions. */
_Py_error_handler error_handler;
} fs_codec;

View File

@ -3615,39 +3615,32 @@ PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
/* Encode `unicode` with the filesystem codec described by the current
   interpreter's fs_codec state.
   NOTE(review): this span appears to interleave lines from the old and
   the new version of the function (diff view without +/- markers);
   confirm against the real file before relying on the control flow. */
PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
#ifdef _Py_FORCE_UTF8_FS_ENCODING
if (interp->fs_codec.encoding) {
/* UTF-8 fast path: call the C UTF-8 encoder directly with the error
   handler stored in fs_codec. */
if (interp->fs_codec.utf8) {
return unicode_encode_utf8(unicode,
interp->fs_codec.error_handler,
interp->fs_codec.errors);
}
else {
const wchar_t *filesystem_errors = interp->config.filesystem_errors;
_Py_error_handler errors;
errors = get_error_handler_wide(filesystem_errors);
assert(errors != _Py_ERROR_UNKNOWN);
return unicode_encode_utf8(unicode, errors, NULL);
}
#else
/* Bootstrap check: if the filesystem codec is implemented in Python, we
   cannot use it to encode and decode filenames before it is loaded.
   Loading the Python codec requires encoding at least its own filename.
   Use the C implementation of the locale codec until the codec registry
   is initialized and the Python codec is loaded.
   See _PyUnicode_InitEncodings(). */
if (interp->fs_codec.encoding) {
#ifndef _Py_FORCE_UTF8_FS_ENCODING
/* fs_codec.encoding is set but is not UTF-8: encode by codec name. */
else if (interp->fs_codec.encoding) {
return PyUnicode_AsEncodedString(unicode,
interp->fs_codec.encoding,
interp->fs_codec.errors);
}
else {
const wchar_t *filesystem_errors = interp->config.filesystem_errors;
_Py_error_handler errors;
errors = get_error_handler_wide(filesystem_errors);
assert(errors != _Py_ERROR_UNKNOWN);
return unicode_encode_locale(unicode, errors, 0);
}
#endif
else {
/* Before _PyUnicode_InitEncodings() is called, the Python codec
machinery is not ready and so cannot be used:
use wcstombs() in this case. */
/* The error handler is taken from the interpreter configuration here
   because fs_codec has not been filled in yet. */
const wchar_t *filesystem_errors = interp->config.filesystem_errors;
assert(filesystem_errors != NULL);
_Py_error_handler errors = get_error_handler_wide(filesystem_errors);
assert(errors != _Py_ERROR_UNKNOWN);
#ifdef _Py_FORCE_UTF8_FS_ENCODING
return unicode_encode_utf8(unicode, errors, NULL);
#else
return unicode_encode_locale(unicode, errors, 0);
#endif
}
}
PyObject *
@ -3857,39 +3850,33 @@ PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
{
/* Decode the `size` bytes at `s` with the filesystem codec described by
   the current interpreter's fs_codec state.
   NOTE(review): this span appears to interleave lines from the old and
   the new version of the function (diff view without +/- markers);
   confirm against the real file before relying on the control flow. */
PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
#ifdef _Py_FORCE_UTF8_FS_ENCODING
if (interp->fs_codec.encoding) {
/* UTF-8 fast path: call the C UTF-8 decoder directly with the error
   handler stored in fs_codec. */
if (interp->fs_codec.utf8) {
return unicode_decode_utf8(s, size,
interp->fs_codec.error_handler,
interp->fs_codec.errors,
NULL);
}
else {
const wchar_t *filesystem_errors = interp->config.filesystem_errors;
_Py_error_handler errors;
errors = get_error_handler_wide(filesystem_errors);
assert(errors != _Py_ERROR_UNKNOWN);
return unicode_decode_utf8(s, size, errors, NULL, NULL);
}
#else
/* Bootstrap check: if the filesystem codec is implemented in Python, we
   cannot use it to encode and decode filenames before it is loaded.
   Loading the Python codec requires encoding at least its own filename.
   Use the C implementation of the locale codec until the codec registry
   is initialized and the Python codec is loaded.
   See _PyUnicode_InitEncodings(). */
if (interp->fs_codec.encoding) {
#ifndef _Py_FORCE_UTF8_FS_ENCODING
/* fs_codec.encoding is set but is not UTF-8: decode by codec name. */
else if (interp->fs_codec.encoding) {
return PyUnicode_Decode(s, size,
interp->fs_codec.encoding,
interp->fs_codec.errors);
}
else {
const wchar_t *filesystem_errors = interp->config.filesystem_errors;
_Py_error_handler errors;
errors = get_error_handler_wide(filesystem_errors);
return unicode_decode_locale(s, size, errors, 0);
}
#endif
else {
/* Before _PyUnicode_InitEncodings() is called, the Python codec
machinery is not ready and so cannot be used:
use mbstowcs() in this case. */
/* The error handler is taken from the interpreter configuration here
   because fs_codec has not been filled in yet. */
const wchar_t *filesystem_errors = interp->config.filesystem_errors;
assert(filesystem_errors != NULL);
_Py_error_handler errors = get_error_handler_wide(filesystem_errors);
assert(errors != _Py_ERROR_UNKNOWN);
#ifdef _Py_FORCE_UTF8_FS_ENCODING
return unicode_decode_utf8(s, size, errors, NULL, NULL);
#else
return unicode_decode_locale(s, size, errors, 0);
#endif
}
}
@ -15849,10 +15836,16 @@ init_fs_codec(PyInterpreterState *interp)
PyMem_RawFree(interp->fs_codec.encoding);
interp->fs_codec.encoding = encoding;
/* encoding has been normalized by init_fs_encoding() */
interp->fs_codec.utf8 = (strcmp(encoding, "utf-8") == 0);
PyMem_RawFree(interp->fs_codec.errors);
interp->fs_codec.errors = errors;
interp->fs_codec.error_handler = error_handler;
#ifdef _Py_FORCE_UTF8_FS_ENCODING
assert(interp->fs_codec.utf8 == 1);
#endif
/* At this point, PyUnicode_EncodeFSDefault() and
PyUnicode_DecodeFSDefault() can now use the Python codec rather than
the C implementation of the filesystem encoding. */
@ -15902,6 +15895,19 @@ _PyUnicode_InitEncodings(PyThreadState *tstate)
}
/* Tear down the filesystem codec state of the interpreter that owns
   `tstate`: free the two strings and put every fs_codec field back to
   its "not initialized" value. */
static void
_PyUnicode_FiniEncodings(PyThreadState *tstate)
{
    PyInterpreterState *owner = tstate->interp;

    /* Release the raw-allocated strings first... */
    PyMem_RawFree(owner->fs_codec.encoding);
    PyMem_RawFree(owner->fs_codec.errors);

    /* ...then clear the fields so no stale pointer or flag remains. */
    owner->fs_codec.encoding = NULL;
    owner->fs_codec.errors = NULL;
    owner->fs_codec.utf8 = 0;
    owner->fs_codec.error_handler = _Py_ERROR_UNKNOWN;
}
#ifdef MS_WINDOWS
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
@ -15954,12 +15960,7 @@ _PyUnicode_Fini(PyThreadState *tstate)
_PyUnicode_ClearStaticStrings();
}
PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
PyMem_RawFree(interp->fs_codec.encoding);
interp->fs_codec.encoding = NULL;
PyMem_RawFree(interp->fs_codec.errors);
interp->fs_codec.errors = NULL;
interp->config.filesystem_errors = (wchar_t *)_Py_ERROR_UNKNOWN;
_PyUnicode_FiniEncodings(tstate);
}