diff --git a/Doc/c-api/sys.rst b/Doc/c-api/sys.rst index 9760dca2df1..a6a939ccc29 100644 --- a/Doc/c-api/sys.rst +++ b/Doc/c-api/sys.rst @@ -47,6 +47,60 @@ Operating System Utilities not call those functions directly! :c:type:`PyOS_sighandler_t` is a typedef alias for :c:type:`void (\*)(int)`. +.. c:function:: wchar_t* Py_DecodeLocale(const char* arg, size_t *size) + + Decode a byte string from the locale encoding with the :ref:`surrogateescape + error handler <surrogateescape>`: undecodable bytes are decoded as + characters in range U+DC80..U+DCFF. If a byte sequence can be decoded as a + surrogate character, escape the bytes using the surrogateescape error + handler instead of decoding them. + + Return a pointer to a newly allocated wide character string, use + :c:func:`PyMem_RawFree` to free the memory. If *size* is not ``NULL``, write + the number of wide characters excluding the null character into ``*size``. + + Return ``NULL`` on decoding error or memory allocation error. If *size* is + not ``NULL``, ``*size`` is set to ``(size_t)-1`` on memory error or set to + ``(size_t)-2`` on decoding error. + + Decoding errors should never happen, unless there is a bug in the C + library. + + Use the :c:func:`Py_EncodeLocale` function to encode the character string + back to a byte string. + + .. seealso:: + + The :c:func:`PyUnicode_DecodeFSDefaultAndSize` and + :c:func:`PyUnicode_DecodeLocaleAndSize` functions. + + .. versionadded:: 3.5 + + +.. c:function:: char* Py_EncodeLocale(const wchar_t *text, size_t *error_pos) + + Encode a wide character string to the locale encoding with the + :ref:`surrogateescape error handler <surrogateescape>`: surrogate characters + in the range U+DC80..U+DCFF are converted to bytes 0x80..0xFF. + + Return a pointer to a newly allocated byte string, use :c:func:`PyMem_Free` + to free the memory. 
Return ``NULL`` on encoding error or memory allocation + error. + + If *error_pos* is not ``NULL``, ``*error_pos`` is set to the index of the + invalid character on encoding error, or set to ``(size_t)-1`` otherwise. + + Use the :c:func:`Py_DecodeLocale` function to decode the byte string back + to a wide character string. + + .. seealso:: + + The :c:func:`PyUnicode_EncodeFSDefault` and + :c:func:`PyUnicode_EncodeLocale` functions. + + .. versionadded:: 3.5 + + .. _systemfunctions: System Functions diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 4352351ccab..2d1bae131c0 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -758,11 +758,13 @@ system. *errors* is ``NULL``. *str* must end with a null character but cannot contain embedded null characters. + Use :c:func:`PyUnicode_DecodeFSDefaultAndSize` to decode a string from + :c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at + Python startup). + .. seealso:: - Use :c:func:`PyUnicode_DecodeFSDefaultAndSize` to decode a string from - :c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at - Python startup). + The :c:func:`Py_DecodeLocale` function. .. versionadded:: 3.3 @@ -783,11 +785,13 @@ system. *errors* is ``NULL``. Return a :class:`bytes` object. *str* cannot contain embedded null characters. + Use :c:func:`PyUnicode_EncodeFSDefault` to encode a string to + :c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at + Python startup). + .. seealso:: - Use :c:func:`PyUnicode_EncodeFSDefault` to encode a string to - :c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at - Python startup). + The :c:func:`Py_EncodeLocale` function. .. versionadded:: 3.3 @@ -832,12 +836,14 @@ used, passing :c:func:`PyUnicode_FSDecoder` as the conversion function: If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to the locale encoding. 
+ :c:data:`Py_FileSystemDefaultEncoding` is initialized at startup from the + locale encoding and cannot be modified later. If you need to decode a string + from the current locale encoding, use + :c:func:`PyUnicode_DecodeLocaleAndSize`. + .. seealso:: - :c:data:`Py_FileSystemDefaultEncoding` is initialized at startup from the - locale encoding and cannot be modified later. If you need to decode a - string from the current locale encoding, use - :c:func:`PyUnicode_DecodeLocaleAndSize`. + The :c:func:`Py_DecodeLocale` function. .. versionchanged:: 3.2 Use ``"strict"`` error handler on Windows. @@ -867,12 +873,13 @@ used, passing :c:func:`PyUnicode_FSDecoder` as the conversion function: If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to the locale encoding. + :c:data:`Py_FileSystemDefaultEncoding` is initialized at startup from the + locale encoding and cannot be modified later. If you need to encode a string + to the current locale encoding, use :c:func:`PyUnicode_EncodeLocale`. + .. seealso:: - :c:data:`Py_FileSystemDefaultEncoding` is initialized at startup from the - locale encoding and cannot be modified later. If you need to encode a - string to the current locale encoding, use - :c:func:`PyUnicode_EncodeLocale`. + The :c:func:`Py_EncodeLocale` function. .. versionadded:: 3.2 diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst index 36144e9ef23..4c2a0235707 100644 --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -318,6 +318,7 @@ and writing to platform dependent files: encodings. +.. _surrogateescape: .. _codec-base-classes: Codec Base Classes diff --git a/Doc/library/os.rst b/Doc/library/os.rst index 9cfc4729408..bf3a8d56952 100644 --- a/Doc/library/os.rst +++ b/Doc/library/os.rst @@ -78,9 +78,10 @@ uses the file system encoding to perform this conversion (see .. versionchanged:: 3.1 On some systems, conversion using the file system encoding may fail. 
In this - case, Python uses the ``surrogateescape`` encoding error handler, which means - that undecodable bytes are replaced by a Unicode character U+DCxx on - decoding, and these are again translated to the original byte on encoding. + case, Python uses the :ref:`surrogateescape encoding error handler + <surrogateescape>`, which means that undecodable bytes are replaced by a + Unicode character U+DCxx on decoding, and these are again translated to the + original byte on encoding. The file system encoding must guarantee to successfully decode all bytes diff --git a/Include/fileutils.h b/Include/fileutils.h index f2a43f75c4f..c5eebc5c072 100644 --- a/Include/fileutils.h +++ b/Include/fileutils.h @@ -7,11 +7,11 @@ extern "C" { PyAPI_FUNC(PyObject *) _Py_device_encoding(int); -PyAPI_FUNC(wchar_t *) _Py_char2wchar( +PyAPI_FUNC(wchar_t *) Py_DecodeLocale( const char *arg, size_t *size); -PyAPI_FUNC(char*) _Py_wchar2char( +PyAPI_FUNC(char*) Py_EncodeLocale( const wchar_t *text, size_t *error_pos); diff --git a/Misc/NEWS b/Misc/NEWS index e8e9ba54ae8..f771885581d 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,10 @@ Release date: TBA Core and Builtins ----------------- +- Issue #18395: Rename ``_Py_char2wchar()`` to :c:func:`Py_DecodeLocale`, + rename ``_Py_wchar2char()`` to :c:func:`Py_EncodeLocale`, and document + these functions. + - Issue #20179: Apply Argument Clinic to bytes and bytearray. Patch by Tal Einat. 
diff --git a/Misc/coverity_model.c b/Misc/coverity_model.c index 57f3aeb11ad..421d54d4191 100644 --- a/Misc/coverity_model.c +++ b/Misc/coverity_model.c @@ -85,7 +85,7 @@ PyObject *PyErr_SetFromErrnoWithFilename(PyObject *exc, const char *filename) } /* Python/fileutils.c */ -wchar_t *_Py_char2wchar(const char* arg, size_t *size) +wchar_t *Py_DecodeLocale(const char* arg, size_t *size) { wchar_t *w; __coverity_tainted_data_sink__(arg); diff --git a/Modules/getpath.c b/Modules/getpath.c index f26b8e9b2b7..de803f8fcb3 100644 --- a/Modules/getpath.c +++ b/Modules/getpath.c @@ -336,7 +336,7 @@ search_for_prefix(wchar_t *argv0_path, wchar_t *home, wchar_t *_prefix, joinpath(prefix, L"Modules/Setup"); if (isfile(prefix)) { /* Check VPATH to see if argv0_path is in the build directory. */ - vpath = _Py_char2wchar(VPATH, NULL); + vpath = Py_DecodeLocale(VPATH, NULL); if (vpath != NULL) { wcsncpy(prefix, argv0_path, MAXPATHLEN); prefix[MAXPATHLEN] = L'\0'; @@ -491,10 +491,10 @@ calculate_path(void) wchar_t *_pythonpath, *_prefix, *_exec_prefix; wchar_t *lib_python; - _pythonpath = _Py_char2wchar(PYTHONPATH, NULL); - _prefix = _Py_char2wchar(PREFIX, NULL); - _exec_prefix = _Py_char2wchar(EXEC_PREFIX, NULL); - lib_python = _Py_char2wchar("lib/python" VERSION, NULL); + _pythonpath = Py_DecodeLocale(PYTHONPATH, NULL); + _prefix = Py_DecodeLocale(PREFIX, NULL); + _exec_prefix = Py_DecodeLocale(EXEC_PREFIX, NULL); + lib_python = Py_DecodeLocale("lib/python" VERSION, NULL); if (!_pythonpath || !_prefix || !_exec_prefix || !lib_python) { Py_FatalError( @@ -503,7 +503,7 @@ calculate_path(void) } if (_path) { - path_buffer = _Py_char2wchar(_path, NULL); + path_buffer = Py_DecodeLocale(_path, NULL); path = path_buffer; } @@ -584,7 +584,7 @@ calculate_path(void) ** be running the interpreter in the build directory, so we use the ** build-directory-specific logic to find Lib and such. 
*/ - wchar_t* wbuf = _Py_char2wchar(modPath, NULL); + wchar_t* wbuf = Py_DecodeLocale(modPath, NULL); if (wbuf == NULL) { Py_FatalError("Cannot decode framework location"); } @@ -709,7 +709,7 @@ calculate_path(void) if (_rtpypath && _rtpypath[0] != '\0') { size_t rtpypath_len; - rtpypath = _Py_char2wchar(_rtpypath, &rtpypath_len); + rtpypath = Py_DecodeLocale(_rtpypath, &rtpypath_len); if (rtpypath != NULL) bufsz += rtpypath_len + 1; } diff --git a/Modules/main.c b/Modules/main.c index 1c25326d0cd..8a9f5a25eee 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -647,7 +647,7 @@ Py_Main(int argc, wchar_t **argv) /* Used by Mac/Tools/pythonw.c to forward * the argv0 of the stub executable */ - wchar_t* wbuf = _Py_char2wchar(pyvenv_launcher, NULL); + wchar_t* wbuf = Py_DecodeLocale(pyvenv_launcher, NULL); if (wbuf == NULL) { Py_FatalError("Cannot decode __PYVENV_LAUNCHER__"); @@ -730,7 +730,7 @@ Py_Main(int argc, wchar_t **argv) char *cfilename_buffer; const char *cfilename; int err = errno; - cfilename_buffer = _Py_wchar2char(filename, NULL); + cfilename_buffer = Py_EncodeLocale(filename, NULL); if (cfilename_buffer != NULL) cfilename = cfilename_buffer; else diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 72272c70528..263ca85b7b7 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3255,7 +3255,7 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) /* "surrogateescape" error handler */ char *str; - str = _Py_wchar2char(wstr, &error_pos); + str = Py_EncodeLocale(wstr, &error_pos); if (str == NULL) { if (error_pos == (size_t)-1) { PyErr_NoMemory(); @@ -3308,7 +3308,7 @@ encode_error: if (errmsg != NULL) { size_t errlen; - wstr = _Py_char2wchar(errmsg, &errlen); + wstr = Py_DecodeLocale(errmsg, &errlen); if (wstr != NULL) { reason = PyUnicode_FromWideChar(wstr, errlen); PyMem_RawFree(wstr); @@ -3526,7 +3526,7 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, if (surrogateescape) { /* "surrogateescape" 
error handler */ - wstr = _Py_char2wchar(str, &wlen); + wstr = Py_DecodeLocale(str, &wlen); if (wstr == NULL) { if (wlen == (size_t)-1) PyErr_NoMemory(); @@ -3581,7 +3581,7 @@ decode_error: error_pos = mbstowcs_errorpos(str, len); if (errmsg != NULL) { size_t errlen; - wstr = _Py_char2wchar(errmsg, &errlen); + wstr = Py_DecodeLocale(errmsg, &errlen); if (wstr != NULL) { reason = PyUnicode_FromWideChar(wstr, errlen); PyMem_RawFree(wstr); diff --git a/Programs/python.c b/Programs/python.c index 9811c01d491..2e5e4e368f0 100644 --- a/Programs/python.c +++ b/Programs/python.c @@ -52,7 +52,7 @@ main(int argc, char **argv) setlocale(LC_ALL, ""); for (i = 0; i < argc; i++) { - argv_copy[i] = _Py_char2wchar(argv[i], NULL); + argv_copy[i] = Py_DecodeLocale(argv[i], NULL); if (!argv_copy[i]) { PyMem_RawFree(oldloc); fprintf(stderr, "Fatal Python error: " diff --git a/Python/fileutils.c b/Python/fileutils.c index 065d3fd9741..227e92a79ef 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -82,11 +82,11 @@ extern int _Py_normalize_encoding(const char *, char *, size_t); Values of force_ascii: - 1: the workaround is used: _Py_wchar2char() uses - encode_ascii_surrogateescape() and _Py_char2wchar() uses + 1: the workaround is used: Py_EncodeLocale() uses + encode_ascii_surrogateescape() and Py_DecodeLocale() uses decode_ascii_surrogateescape() - 0: the workaround is not used: _Py_wchar2char() uses wcstombs() and - _Py_char2wchar() uses mbstowcs() + 0: the workaround is not used: Py_EncodeLocale() uses wcstombs() and + Py_DecodeLocale() uses mbstowcs() -1: unknown, need to call check_force_ascii() to get the value */ static int force_ascii = -1; @@ -241,24 +241,26 @@ decode_ascii_surrogateescape(const char *arg, size_t *size) /* Decode a byte string from the locale encoding with the - surrogateescape error handler (undecodable bytes are decoded as characters - in range U+DC80..U+DCFF). 
If a byte sequence can be decoded as a surrogate + surrogateescape error handler: undecodable bytes are decoded as characters + in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate character, escape the bytes using the surrogateescape error handler instead of decoding them. - Use _Py_wchar2char() to encode the character string back to a byte string. + Return a pointer to a newly allocated wide character string, use + PyMem_RawFree() to free the memory. If size is not NULL, write the number of + wide characters excluding the null character into *size. - Return a pointer to a newly allocated wide character string (use - PyMem_RawFree() to free the memory) and write the number of written wide - characters excluding the null character into *size if size is not NULL, or - NULL on error (decoding or memory allocation error). If size is not NULL, - *size is set to (size_t)-1 on memory error and (size_t)-2 on decoding - error. + Return NULL on decoding error or memory allocation error. If size is not + NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on + decoding error. - Conversion errors should never happen, unless there is a bug in the C - library. */ + Decoding errors should never happen, unless there is a bug in the C + library. + + Use the Py_EncodeLocale() function to encode the character string back to a + byte string. */ wchar_t* -_Py_char2wchar(const char* arg, size_t *size) +Py_DecodeLocale(const char* arg, size_t *size) { #ifdef __APPLE__ wchar_t *wstr; @@ -389,19 +391,20 @@ oom: #endif /* __APPLE__ */ } -/* Encode a (wide) character string to the locale encoding with the - surrogateescape error handler (characters in range U+DC80..U+DCFF are - converted to bytes 0x80..0xFF). +/* Encode a wide character string to the locale encoding with the + surrogateescape error handler: surrogate characters in the range + U+DC80..U+DCFF are converted to bytes 0x80..0xFF. - This function is the reverse of _Py_char2wchar(). 
+ Return a pointer to a newly allocated byte string, use PyMem_Free() to free + the memory. Return NULL on encoding or memory allocation error. - Return a pointer to a newly allocated byte string (use PyMem_Free() to free - the memory), or NULL on encoding or memory allocation error. + If error_pos is not NULL, *error_pos is set to the index of the invalid + character on encoding error, or set to (size_t)-1 otherwise. - If error_pos is not NULL: *error_pos is the index of the invalid character - on encoding error, or (size_t)-1 otherwise. */ + Use the Py_DecodeLocale() function to decode the byte string back to a wide + character string. */ char* -_Py_wchar2char(const wchar_t *text, size_t *error_pos) +Py_EncodeLocale(const wchar_t *text, size_t *error_pos) { #ifdef __APPLE__ Py_ssize_t len; @@ -520,7 +523,7 @@ _Py_wstat(const wchar_t* path, struct stat *buf) { int err; char *fname; - fname = _Py_wchar2char(path, NULL); + fname = Py_EncodeLocale(path, NULL); if (fname == NULL) { errno = EINVAL; return -1; @@ -784,7 +787,7 @@ _Py_wfopen(const wchar_t *path, const wchar_t *mode) errno = EINVAL; return NULL; } - cpath = _Py_wchar2char(path, NULL); + cpath = Py_EncodeLocale(path, NULL); if (cpath == NULL) return NULL; f = fopen(cpath, cmode); @@ -875,7 +878,7 @@ _Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz) int res; size_t r1; - cpath = _Py_wchar2char(path, NULL); + cpath = Py_EncodeLocale(path, NULL); if (cpath == NULL) { errno = EINVAL; return -1; @@ -889,7 +892,7 @@ _Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz) return -1; } cbuf[res] = '\0'; /* buf will be null terminated */ - wbuf = _Py_char2wchar(cbuf, &r1); + wbuf = Py_DecodeLocale(cbuf, &r1); if (wbuf == NULL) { errno = EINVAL; return -1; @@ -920,7 +923,7 @@ _Py_wrealpath(const wchar_t *path, wchar_t *wresolved_path; char *res; size_t r; - cpath = _Py_wchar2char(path, NULL); + cpath = Py_EncodeLocale(path, NULL); if (cpath == NULL) { errno = EINVAL; return NULL; @@ -930,7 
+933,7 @@ _Py_wrealpath(const wchar_t *path, if (res == NULL) return NULL; - wresolved_path = _Py_char2wchar(cresolved_path, &r); + wresolved_path = Py_DecodeLocale(cresolved_path, &r); if (wresolved_path == NULL) { errno = EINVAL; return NULL; @@ -963,7 +966,7 @@ _Py_wgetcwd(wchar_t *buf, size_t size) if (getcwd(fname, Py_ARRAY_LENGTH(fname)) == NULL) return NULL; - wname = _Py_char2wchar(fname, &len); + wname = Py_DecodeLocale(fname, &len); if (wname == NULL) return NULL; if (size <= len) { diff --git a/Python/frozenmain.c b/Python/frozenmain.c index 55d05fc26f0..cb84ed5f244 100644 --- a/Python/frozenmain.c +++ b/Python/frozenmain.c @@ -52,7 +52,7 @@ Py_FrozenMain(int argc, char **argv) setlocale(LC_ALL, ""); for (i = 0; i < argc; i++) { - argv_copy[i] = _Py_char2wchar(argv[i], NULL); + argv_copy[i] = Py_DecodeLocale(argv[i], NULL); argv_copy2[i] = argv_copy[i]; if (!argv_copy[i]) { fprintf(stderr, "Unable to decode the command line argument #%i\n",