bpo-32030: Add _Py_EncodeUTF8_surrogateescape() (#4960)
Py_EncodeLocale() now calls _Py_EncodeUTF8_surrogateescape() instead of creating temporary unicode and bytes objects, so it no longer uses the Python C API.
This commit is contained in:
parent
fbd605151f
commit
e47e698da6
|
@ -5147,6 +5147,95 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
|
|||
}
|
||||
|
||||
|
||||
/* UTF-8 encoder using the surrogateescape error handler .
|
||||
|
||||
On success, return a pointer to a newly allocated character string (use
|
||||
PyMem_Free() to free the memory).
|
||||
|
||||
On encoding failure, return NULL and write the position of the invalid
|
||||
surrogate character into *error_pos (if error_pos is set).
|
||||
|
||||
On memory allocation failure, return NULL and write (size_t)-1 into
|
||||
*error_pos (if error_pos is set). */
|
||||
char*
|
||||
_Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos)
|
||||
{
|
||||
const Py_ssize_t max_char_size = 4;
|
||||
Py_ssize_t len = wcslen(text);
|
||||
|
||||
assert(len >= 0);
|
||||
|
||||
char *bytes;
|
||||
if (len <= PY_SSIZE_T_MAX / max_char_size - 1) {
|
||||
bytes = PyMem_Malloc((len + 1) * max_char_size);
|
||||
}
|
||||
else {
|
||||
bytes = NULL;
|
||||
}
|
||||
if (bytes == NULL) {
|
||||
if (error_pos != NULL) {
|
||||
*error_pos = (size_t)-1;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
char *p = bytes;
|
||||
Py_ssize_t i;
|
||||
for (i = 0; i < len;) {
|
||||
Py_UCS4 ch = text[i++];
|
||||
|
||||
if (ch < 0x80) {
|
||||
/* Encode ASCII */
|
||||
*p++ = (char) ch;
|
||||
|
||||
}
|
||||
else if (ch < 0x0800) {
|
||||
/* Encode Latin-1 */
|
||||
*p++ = (char)(0xc0 | (ch >> 6));
|
||||
*p++ = (char)(0x80 | (ch & 0x3f));
|
||||
}
|
||||
else if (Py_UNICODE_IS_SURROGATE(ch)) {
|
||||
/* surrogateescape error handler */
|
||||
if (!(0xDC80 <= ch && ch <= 0xDCFF)) {
|
||||
if (error_pos != NULL) {
|
||||
*error_pos = (size_t)i - 1;
|
||||
}
|
||||
goto error;
|
||||
}
|
||||
*p++ = (char)(ch & 0xff);
|
||||
}
|
||||
else if (ch < 0x10000) {
|
||||
*p++ = (char)(0xe0 | (ch >> 12));
|
||||
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
||||
*p++ = (char)(0x80 | (ch & 0x3f));
|
||||
}
|
||||
else { /* ch >= 0x10000 */
|
||||
assert(ch <= MAX_UNICODE);
|
||||
/* Encode UCS4 Unicode ordinals */
|
||||
*p++ = (char)(0xf0 | (ch >> 18));
|
||||
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
|
||||
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
||||
*p++ = (char)(0x80 | (ch & 0x3f));
|
||||
}
|
||||
}
|
||||
*p++ = '\0';
|
||||
|
||||
size_t final_size = (p - bytes);
|
||||
char *bytes2 = PyMem_Realloc(bytes, final_size);
|
||||
if (bytes2 == NULL) {
|
||||
if (error_pos != NULL) {
|
||||
*error_pos = (size_t)-1;
|
||||
}
|
||||
goto error;
|
||||
}
|
||||
return bytes2;
|
||||
|
||||
error:
|
||||
PyMem_Free(bytes);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
/* Primary internal function which creates utf8 encoded bytes objects.
|
||||
|
||||
Allocation strategy: if the string is short, convert into a stack buffer
|
||||
|
|
|
@ -22,6 +22,8 @@ extern int winerror_to_errno(int);
|
|||
|
||||
extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size,
|
||||
size_t *p_wlen);
|
||||
extern char* _Py_EncodeUTF8_surrogateescape(const wchar_t *text,
|
||||
size_t *error_pos);
|
||||
|
||||
#ifdef O_CLOEXEC
|
||||
/* Does open() support the O_CLOEXEC flag? Possible values:
|
||||
|
@ -418,42 +420,6 @@ Py_DecodeLocale(const char* arg, size_t *size)
|
|||
#endif /* __APPLE__ or __ANDROID__ */
|
||||
}
|
||||
|
||||
static char*
|
||||
_Py_EncodeLocaleUTF8(const wchar_t *text, size_t *error_pos)
|
||||
{
|
||||
Py_ssize_t len;
|
||||
PyObject *unicode, *bytes = NULL;
|
||||
char *cpath;
|
||||
|
||||
unicode = PyUnicode_FromWideChar(text, wcslen(text));
|
||||
if (unicode == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
|
||||
Py_DECREF(unicode);
|
||||
if (bytes == NULL) {
|
||||
PyErr_Clear();
|
||||
if (error_pos != NULL) {
|
||||
*error_pos = (size_t)-1;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
len = PyBytes_GET_SIZE(bytes);
|
||||
cpath = PyMem_Malloc(len+1);
|
||||
if (cpath == NULL) {
|
||||
PyErr_Clear();
|
||||
Py_DECREF(bytes);
|
||||
if (error_pos != NULL) {
|
||||
*error_pos = (size_t)-1;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
memcpy(cpath, PyBytes_AsString(bytes), len + 1);
|
||||
Py_DECREF(bytes);
|
||||
return cpath;
|
||||
}
|
||||
|
||||
#if !defined(__APPLE__) && !defined(__ANDROID__)
|
||||
static char*
|
||||
|
@ -537,10 +503,10 @@ char*
|
|||
Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
|
||||
{
|
||||
#if defined(__APPLE__) || defined(__ANDROID__)
|
||||
return _Py_EncodeLocaleUTF8(text, error_pos);
|
||||
return _Py_EncodeUTF8_surrogateescape(text, error_pos);
|
||||
#else /* __APPLE__ */
|
||||
if (Py_UTF8Mode == 1) {
|
||||
return _Py_EncodeLocaleUTF8(text, error_pos);
|
||||
return _Py_EncodeUTF8_surrogateescape(text, error_pos);
|
||||
}
|
||||
|
||||
#ifndef MS_WINDOWS
|
||||
|
|
Loading…
Reference in New Issue