bpo-32030: Add _Py_EncodeUTF8_surrogateescape() (#4960)

Py_EncodeLocale() now uses _Py_EncodeUTF8_surrogateescape(), instead
of using temporary unicode and bytes objects. So Py_EncodeLocale()
doesn't use the Python C API anymore.
This commit is contained in:
Victor Stinner 2017-12-21 15:45:16 +01:00 committed by GitHub
parent fbd605151f
commit e47e698da6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 93 additions and 38 deletions

View File

@ -5147,6 +5147,95 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
}
/* UTF-8 encoder using the surrogateescape error handler .
On success, return a pointer to a newly allocated character string (use
PyMem_Free() to free the memory).
On encoding failure, return NULL and write the position of the invalid
surrogate character into *error_pos (if error_pos is set).
On memory allocation failure, return NULL and write (size_t)-1 into
*error_pos (if error_pos is set). */
char*
_Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos)
{
const Py_ssize_t max_char_size = 4;
Py_ssize_t len = wcslen(text);
assert(len >= 0);
char *bytes;
if (len <= PY_SSIZE_T_MAX / max_char_size - 1) {
bytes = PyMem_Malloc((len + 1) * max_char_size);
}
else {
bytes = NULL;
}
if (bytes == NULL) {
if (error_pos != NULL) {
*error_pos = (size_t)-1;
}
return NULL;
}
char *p = bytes;
Py_ssize_t i;
for (i = 0; i < len;) {
Py_UCS4 ch = text[i++];
if (ch < 0x80) {
/* Encode ASCII */
*p++ = (char) ch;
}
else if (ch < 0x0800) {
/* Encode Latin-1 */
*p++ = (char)(0xc0 | (ch >> 6));
*p++ = (char)(0x80 | (ch & 0x3f));
}
else if (Py_UNICODE_IS_SURROGATE(ch)) {
/* surrogateescape error handler */
if (!(0xDC80 <= ch && ch <= 0xDCFF)) {
if (error_pos != NULL) {
*error_pos = (size_t)i - 1;
}
goto error;
}
*p++ = (char)(ch & 0xff);
}
else if (ch < 0x10000) {
*p++ = (char)(0xe0 | (ch >> 12));
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*p++ = (char)(0x80 | (ch & 0x3f));
}
else { /* ch >= 0x10000 */
assert(ch <= MAX_UNICODE);
/* Encode UCS4 Unicode ordinals */
*p++ = (char)(0xf0 | (ch >> 18));
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*p++ = (char)(0x80 | (ch & 0x3f));
}
}
*p++ = '\0';
size_t final_size = (p - bytes);
char *bytes2 = PyMem_Realloc(bytes, final_size);
if (bytes2 == NULL) {
if (error_pos != NULL) {
*error_pos = (size_t)-1;
}
goto error;
}
return bytes2;
error:
PyMem_Free(bytes);
return NULL;
}
/* Primary internal function which creates utf8 encoded bytes objects.
Allocation strategy: if the string is short, convert into a stack buffer

View File

@ -22,6 +22,8 @@ extern int winerror_to_errno(int);
/* UTF-8 helpers using the surrogateescape error handler; both are declared
   extern here and defined in another source file. The first converts a
   byte string of the given size to a wide character string (presumably
   storing the resulting length in *p_wlen -- confirm against the
   definition); the second converts a wide character string to a newly
   allocated byte string and reports failures through *error_pos. */
extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size,
size_t *p_wlen);
extern char* _Py_EncodeUTF8_surrogateescape(const wchar_t *text,
size_t *error_pos);
#ifdef O_CLOEXEC
/* Does open() support the O_CLOEXEC flag? Possible values:
@ -418,42 +420,6 @@ Py_DecodeLocale(const char* arg, size_t *size)
#endif /* __APPLE__ or __ANDROID__ */
}
/* Encode a NUL-terminated wide character string to UTF-8 with the
   "surrogateescape" error handler, going through a temporary unicode
   object and a temporary bytes object.

   On success, return a newly allocated, NUL-terminated copy of the encoded
   bytes (allocated with PyMem_Malloc()).

   On any failure (creating the unicode object, encoding it, or allocating
   the result), clear the pending Python exception, write (size_t)-1 into
   *error_pos (if error_pos is set) and return NULL. */
static char*
_Py_EncodeLocaleUTF8(const wchar_t *text, size_t *error_pos)
{
    PyObject *unicode;
    PyObject *bytes = NULL;
    char *cpath;
    Py_ssize_t len;

    unicode = PyUnicode_FromWideChar(text, wcslen(text));
    if (unicode == NULL) {
        /* Report this failure like the others instead of leaving the
           exception set and *error_pos untouched. */
        goto error;
    }

    bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
    Py_DECREF(unicode);
    if (bytes == NULL) {
        goto error;
    }

    /* Copy the encoded bytes, including the trailing NUL byte, into memory
       that the caller owns. */
    len = PyBytes_GET_SIZE(bytes);
    cpath = PyMem_Malloc(len + 1);
    if (cpath == NULL) {
        goto error;
    }
    memcpy(cpath, PyBytes_AsString(bytes), len + 1);
    Py_DECREF(bytes);
    return cpath;

error:
    PyErr_Clear();
    if (bytes != NULL) {
        Py_DECREF(bytes);
    }
    if (error_pos != NULL) {
        *error_pos = (size_t)-1;
    }
    return NULL;
}
#if !defined(__APPLE__) && !defined(__ANDROID__)
static char*
@ -537,10 +503,10 @@ char*
Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
{
#if defined(__APPLE__) || defined(__ANDROID__)
return _Py_EncodeLocaleUTF8(text, error_pos);
return _Py_EncodeUTF8_surrogateescape(text, error_pos);
#else /* __APPLE__ */
if (Py_UTF8Mode == 1) {
return _Py_EncodeLocaleUTF8(text, error_pos);
return _Py_EncodeUTF8_surrogateescape(text, error_pos);
}
#ifndef MS_WINDOWS