diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
index 0f7d2bb4d18..2ac51dfccec 100644
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -1083,8 +1083,6 @@ These are the UTF-32 codec APIs:
    After completion, *\*byteorder* is set to the current byte order at the end
    of input data.
 
-   In a narrow build codepoints outside the BMP will be decoded as surrogate pairs.
-
    If *byteorder* is *NULL*, the codec starts in native order mode.
 
    Return *NULL* if an exception was raised by the codec.
diff --git a/Doc/reference/lexical_analysis.rst b/Doc/reference/lexical_analysis.rst
index bab39f9d367..94f219b0231 100644
--- a/Doc/reference/lexical_analysis.rst
+++ b/Doc/reference/lexical_analysis.rst
@@ -538,9 +538,7 @@ Notes:
    this escape sequence. Exactly four hex digits are required.
 
 (6)
-   Any Unicode character can be encoded this way, but characters outside the Basic
-   Multilingual Plane (BMP) will be encoded using a surrogate pair if Python is
-   compiled to use 16-bit code units (the default). Exactly eight hex digits
+   Any Unicode character can be encoded this way. Exactly eight hex digits
    are required.
 
 
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 135e4694b0f..a8f5b5df360 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1022,8 +1022,7 @@ PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
 
 /* Create a Unicode Object from the given Unicode code point ordinal.
 
-   The ordinal must be in range(0x10000) on narrow Python builds
-   (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
+   The ordinal must be in range(0x110000). A ValueError is
    raised in case it is not.
 
 */
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 87ac044e1c4..e90ee3f1a33 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -5772,18 +5772,12 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
     void *data;
     Py_ssize_t expandsize = 0;
 
-    /* Initial allocation is based on the longest-possible unichr
+    /* Initial allocation is based on the longest-possible character
        escape.
 
-       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
-       unichr, so in this case it's the longest unichr escape. In
-       narrow (UTF-16) builds this is five chars per source unichr
-       since there are two unichrs in the surrogate pair, so in narrow
-       (UTF-16) builds it's not the longest unichr escape.
-
-       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
-       so in the narrow (UTF-16) build case it's the longest unichr
-       escape.
+       For UCS1 strings it's '\xxx', 4 bytes per source character.
+       For UCS2 strings it's '\uxxxx', 6 bytes per source character.
+       For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
     */
 
     if (!PyUnicode_Check(unicode)) {