From 2f197078fbd04c1df8c77798f7a257537de53aa6 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Sat, 17 Dec 2011 07:08:30 +0100 Subject: [PATCH] The locale decoder raises a UnicodeDecodeError instead of an OSError Search the invalid character using mbrtowc(). --- Objects/unicodeobject.c | 103 +++++++++++++++++++++++++++++++++------- 1 file changed, 86 insertions(+), 17 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index b3d6de21960..3f70af7e379 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3084,9 +3084,7 @@ wcstombs_errorpos(const wchar_t *wstr) #endif char outbuf[MB_LEN_MAX]; const wchar_t *start, *previous; - int save_errno; - save_errno = errno; #if SIZEOF_WCHAR_T == 2 buf[2] = 0; #else @@ -3114,14 +3112,11 @@ wcstombs_errorpos(const wchar_t *wstr) wstr++; #endif len = wcstombs(outbuf, buf, sizeof(outbuf)); - if (len == (size_t)-1) { - errno = save_errno; + if (len == (size_t)-1) return previous - start; - } } /* failed to find the unencodable character */ - errno = save_errno; return 0; } @@ -3199,7 +3194,7 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) len = wcstombs(NULL, wstr, 0); if (len == (size_t)-1) { - error_pos = wcstombs_errorpos(wstr); + error_pos = (size_t)-1; goto encode_error; } @@ -3211,7 +3206,7 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1); if (len2 == (size_t)-1 || len2 > len) { - error_pos = wcstombs_errorpos(wstr); + error_pos = (size_t)-1; goto encode_error; } PyMem_Free(wstr); @@ -3221,12 +3216,23 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) encode_error: errmsg = strerror(errno); assert(errmsg != NULL); + + if (error_pos == (size_t)-1) + error_pos = wcstombs_errorpos(wstr); + PyMem_Free(wstr); Py_XDECREF(bytes); - if (errmsg != NULL) - reason = PyUnicode_DecodeLocale(errmsg, "surrogateescape"); - else + if (errmsg != NULL) { + size_t errlen; + wstr = _Py_char2wchar(errmsg, &errlen); + if (wstr != NULL) { + reason = PyUnicode_FromWideChar(wstr, errlen); + PyMem_Free(wstr); + } else + errmsg = NULL; + } + if (errmsg == NULL) reason = PyUnicode_FromString( "wcstombs() encountered an unencodable " "wide character"); @@ -3376,6 +3382,37 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode, return NULL; } +static size_t +mbstowcs_errorpos(const char *str, size_t len) +{ +#ifdef HAVE_MBRTOWC + const char *start = str; + mbstate_t mbs; + size_t converted; + wchar_t ch; + + memset(&mbs, 0, sizeof mbs); + while (len) + { + converted = mbrtowc(&ch, (char*)str, len, &mbs); + if (converted == 0) + /* Reached end of string */ + break; + if (converted == (size_t)-1 || converted == (size_t)-2) { + /* Conversion error or incomplete character */ + return str - start; + } + else { + str += converted; + len -= converted; + } + } + /* failed to find the undecodable byte sequence */ + return 0; +#endif + return 0; +} + PyObject* PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, const char *errors) @@ -3386,6 +3423,9 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, size_t wlen, wlen2; PyObject *unicode; int surrogateescape; + size_t error_pos; + char *errmsg; + PyObject *reason, *exc; if (locale_error_handler(errors, &surrogateescape) < 0) return NULL; @@ -3415,10 +3455,8 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, #else wlen = len; #endif - if (wlen == (size_t)-1) { - PyErr_SetFromErrno(PyExc_OSError); - return NULL; - } + if (wlen == (size_t)-1) + goto decode_error; if (wlen+1 <= smallbuf_len) { wstr = smallbuf; } @@ -3436,8 +3474,7 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, if (wlen2 == (size_t)-1) { if (wstr != smallbuf) PyMem_Free(wstr); - PyErr_SetFromErrno(PyExc_OSError); - return NULL; + goto decode_error; } #ifdef HAVE_BROKEN_MBSTOWCS assert(wlen2 == wlen); @@ -3447,6 +3484,38 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, PyMem_Free(wstr); } return unicode; + +decode_error: + errmsg = strerror(errno); + assert(errmsg != NULL); + + error_pos = mbstowcs_errorpos(str, len); + if (errmsg != NULL) { + size_t errlen; + wstr = _Py_char2wchar(errmsg, &errlen); + if (wstr != NULL) { + reason = PyUnicode_FromWideChar(wstr, errlen); + PyMem_Free(wstr); + } else + errmsg = NULL; + } + if (errmsg == NULL) + reason = PyUnicode_FromString( + "mbstowcs() encountered an invalid multibyte sequence"); + if (reason == NULL) + return NULL; + + exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO", + "locale", str, len, + (Py_ssize_t)error_pos, + (Py_ssize_t)(error_pos+1), + reason); + Py_DECREF(reason); + if (exc != NULL) { + PyCodec_StrictErrors(exc); + Py_XDECREF(exc); + } + return NULL; } PyObject*