diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst index 84593f29402..2a7abf96ab5 100644 --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -1280,12 +1280,13 @@ functions can be used directly if desired. .. module:: encodings.mbcs :synopsis: Windows ANSI codepage -Encode operand according to the ANSI codepage (CP_ACP). This codec only -supports ``'strict'`` and ``'replace'`` error handlers to encode, and -``'strict'`` and ``'ignore'`` error handlers to decode. +Encode operand according to the ANSI codepage (CP_ACP). Availability: Windows only. +.. versionchanged:: 3.3 + Support any error handler. + .. versionchanged:: 3.2 Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used to encode, and ``'ignore'`` to decode. diff --git a/Doc/whatsnew/3.3.rst b/Doc/whatsnew/3.3.rst index 945aa97757f..eb62968606d 100644 --- a/Doc/whatsnew/3.3.rst +++ b/Doc/whatsnew/3.3.rst @@ -197,6 +197,11 @@ The :mod:`array` module supports the :c:type:`long long` type using ``q`` and codecs ------ +The :mod:`~encodings.mbcs` codec has been rewritten to correctly handle +``replace`` and ``ignore`` error handlers on all Windows versions. The +:mod:`~encodings.mbcs` codec now supports all error handlers, instead of +only ``replace`` to encode and ``ignore`` to decode. + Multibyte CJK decoders now resynchronize faster. They only ignore the first byte of an invalid byte sequence. For example, ``b'\xff\n'.decode('gb2312', 'replace')`` now returns a ``\n`` after the replacement character. 
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index a1725e5d0d7..99ec44c7ae2 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -1466,6 +1466,14 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( Py_ssize_t *consumed /* bytes consumed */ ); +PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( + int code_page, /* code page number */ + const char *string, /* encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + Py_ssize_t *consumed /* bytes consumed */ + ); + PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( PyObject *unicode /* Unicode object */ ); @@ -1473,11 +1481,17 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( #ifndef Py_LIMITED_API PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( const Py_UNICODE *data, /* Unicode char buffer */ - Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ + Py_ssize_t length, /* number of Py_UNICODE chars to encode */ const char *errors /* error handling */ ); #endif +PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( + int code_page, /* code page number */ + PyObject *unicode, /* Unicode object */ + const char *errors /* error handling */ + ); + #endif /* HAVE_MBCS */ /* --- Decimal Encoder ---------------------------------------------------- */ diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index e9ce95a8666..f714a44d35e 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1744,6 +1744,203 @@ class TransformCodecTest(unittest.TestCase): self.assertEqual(sout, b"\x80") +class CodePageTest(unittest.TestCase): + CP_UTF8 = 65001 + vista_or_later = (sys.getwindowsversion().major >= 6) + + def test_invalid_code_page(self): + self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a') + self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a') + self.assertRaises(WindowsError, codecs.code_page_encode, 123, 'a') + self.assertRaises(WindowsError, codecs.code_page_decode, 123, b'a') + + def 
test_code_page_name(self): + self.assertRaisesRegex(UnicodeEncodeError, 'cp932', + codecs.code_page_encode, 932, '\xff') + self.assertRaisesRegex(UnicodeDecodeError, 'cp932', + codecs.code_page_decode, 932, b'\x81\x00') + self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8', + codecs.code_page_decode, self.CP_UTF8, b'\xff') + + def check_decode(self, cp, tests): + for raw, errors, expected in tests: + if expected is not None: + try: + decoded = codecs.code_page_decode(cp, raw, errors) + except UnicodeDecodeError as err: + self.fail('Unable to decode %a from "cp%s" with ' + 'errors=%r: %s' % (raw, cp, errors, err)) + self.assertEqual(decoded[0], expected, + '%a.decode("cp%s", %r)=%a != %a' + % (raw, cp, errors, decoded[0], expected)) + # assert 0 <= decoded[1] <= len(raw) + self.assertGreaterEqual(decoded[1], 0) + self.assertLessEqual(decoded[1], len(raw)) + else: + self.assertRaises(UnicodeDecodeError, + codecs.code_page_decode, cp, raw, errors) + + def check_encode(self, cp, tests): + for text, errors, expected in tests: + if expected is not None: + try: + encoded = codecs.code_page_encode(cp, text, errors) + except UnicodeEncodeError as err: + self.fail('Unable to encode %a to "cp%s" with ' + 'errors=%r: %s' % (text, cp, errors, err)) + self.assertEqual(encoded[0], expected, + '%a.encode("cp%s", %r)=%a != %a' + % (text, cp, errors, encoded[0], expected)) + self.assertEqual(encoded[1], len(text)) + else: + self.assertRaises(UnicodeEncodeError, + codecs.code_page_encode, cp, text, errors) + + def test_cp932(self): + self.check_encode(932, ( + ('abc', 'strict', b'abc'), + ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'), + # not encodable + ('\xff', 'strict', None), + ('[\xff]', 'ignore', b'[]'), + ('[\xff]', 'replace', b'[y]'), + ('[\u20ac]', 'replace', b'[?]'), + )) + tests = [ + (b'abc', 'strict', 'abc'), + (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'), + # invalid bytes + (b'\xff', 'strict', None), + (b'\xff', 'ignore', ''), + (b'\xff', 'replace', '\ufffd'), + 
(b'\x81\x00abc', 'strict', None), + (b'\x81\x00abc', 'ignore', '\x00abc'), + ] + if self.vista_or_later: + tests.append((b'\x81\x00abc', 'replace', '\ufffd\x00abc')) + else: + tests.append((b'\x81\x00abc', 'replace', '\x00\x00abc')) + self.check_decode(932, tests) + + def test_cp1252(self): + self.check_encode(1252, ( + ('abc', 'strict', b'abc'), + ('\xe9\u20ac', 'strict', b'\xe9\x80'), + ('\xff', 'strict', b'\xff'), + ('\u0141', 'strict', None), + ('\u0141', 'ignore', b''), + ('\u0141', 'replace', b'L'), + )) + self.check_decode(1252, ( + (b'abc', 'strict', 'abc'), + (b'\xe9\x80', 'strict', '\xe9\u20ac'), + (b'\xff', 'strict', '\xff'), + )) + + def test_cp_utf7(self): + cp = 65000 + self.check_encode(cp, ( + ('abc', 'strict', b'abc'), + ('\xe9\u20ac', 'strict', b'+AOkgrA-'), + ('\U0010ffff', 'strict', b'+2//f/w-'), + ('\udc80', 'strict', b'+3IA-'), + ('\ufffd', 'strict', b'+//0-'), + )) + self.check_decode(cp, ( + (b'abc', 'strict', 'abc'), + (b'+AOkgrA-', 'strict', '\xe9\u20ac'), + (b'+2//f/w-', 'strict', '\U0010ffff'), + (b'+3IA-', 'strict', '\udc80'), + (b'+//0-', 'strict', '\ufffd'), + # invalid bytes + (b'[+/]', 'strict', '[]'), + (b'[\xff]', 'strict', '[\xff]'), + )) + + def test_cp_utf8(self): + cp = self.CP_UTF8 + + tests = [ + ('abc', 'strict', b'abc'), + ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'), + ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'), + ] + if self.vista_or_later: + tests.append(('\udc80', 'strict', None)) + tests.append(('\udc80', 'ignore', b'')) + tests.append(('\udc80', 'replace', b'?')) + else: + tests.append(('\udc80', 'strict', b'\xed\xb2\x80')) + self.check_encode(cp, tests) + + tests = [ + (b'abc', 'strict', 'abc'), + (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'), + (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'), + (b'\xef\xbf\xbd', 'strict', '\ufffd'), + (b'[\xc3\xa9]', 'strict', '[\xe9]'), + # invalid bytes + (b'[\xff]', 'strict', None), + (b'[\xff]', 'ignore', '[]'), + (b'[\xff]', 'replace', '[\ufffd]'), + ] + if 
self.vista_or_later: + tests.extend(( + (b'[\xed\xb2\x80]', 'strict', None), + (b'[\xed\xb2\x80]', 'ignore', '[]'), + (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'), + )) + else: + tests.extend(( + (b'[\xed\xb2\x80]', 'strict', '[\udc80]'), + )) + self.check_decode(cp, tests) + + def test_error_handlers(self): + self.check_encode(932, ( + ('\xff', 'backslashreplace', b'\\xff'), + ('\xff', 'xmlcharrefreplace', b'&#255;'), + )) + self.check_decode(932, ( + (b'\xff', 'surrogateescape', '\udcff'), + )) + if self.vista_or_later: + self.check_encode(self.CP_UTF8, ( + ('\udc80', 'surrogatepass', b'\xed\xb2\x80'), + )) + + def test_multibyte_encoding(self): + self.check_decode(932, ( + (b'\x84\xe9\x80', 'ignore', '\u9a3e'), + (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'), + )) + self.check_decode(self.CP_UTF8, ( + (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'), + (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'), + )) + if self.vista_or_later: + self.check_encode(self.CP_UTF8, ( + ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'), + ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'), + )) + + def test_incremental(self): + decoded = codecs.code_page_decode(932, + b'\xe9\x80\xe9', 'strict', + False) + self.assertEqual(decoded, ('\u9a3e', 2)) + + decoded = codecs.code_page_decode(932, + b'\xe9\x80\xe9\x80', 'strict', + False) + self.assertEqual(decoded, ('\u9a3e\u9a3e', 4)) + + decoded = codecs.code_page_decode(932, + b'abc', 'strict', + False) + self.assertEqual(decoded, ('abc', 3)) + + def test_main(): support.run_unittest( UTF32Test, @@ -1772,6 +1969,7 @@ def test_main(): SurrogateEscapeTest, BomTest, TransformCodecTest, + CodePageTest, ) diff --git a/Misc/NEWS b/Misc/NEWS index 3d77bbfee9f..ef69bf23c83 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,10 @@ What's New in Python 3.3 Alpha 1? Core and Builtins ----------------- +- Issue #12281: Rewrite the MBCS codec to handle correctly replace and ignore + error handlers on all Windows versions. 
The MBCS codec is now supporting all + error handlers, instead of only replace to encode and ignore to decode. + - Issue #13188: When called without an explicit traceback argument, generator.throw() now gets the traceback from the passed exception's ``__traceback__`` attribute. Patch by Petri Lehtinen. diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index 26c87880bc2..be31fd23b09 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -612,6 +612,31 @@ mbcs_decode(PyObject *self, return codec_tuple(decoded, consumed); } +static PyObject * +code_page_decode(PyObject *self, + PyObject *args) +{ + Py_buffer pbuf; + const char *errors = NULL; + int final = 0; + Py_ssize_t consumed; + PyObject *decoded = NULL; + int code_page; + + if (!PyArg_ParseTuple(args, "iy*|zi:code_page_decode", + &code_page, &pbuf, &errors, &final)) + return NULL; + consumed = pbuf.len; + + decoded = PyUnicode_DecodeCodePageStateful(code_page, + pbuf.buf, pbuf.len, errors, + final ? NULL : &consumed); + PyBuffer_Release(&pbuf); + if (decoded == NULL) + return NULL; + return codec_tuple(decoded, consumed); +} + #endif /* HAVE_MBCS */ /* --- Encoder ------------------------------------------------------------ */ @@ -1011,6 +1036,29 @@ mbcs_encode(PyObject *self, return v; } +static PyObject * +code_page_encode(PyObject *self, + PyObject *args) +{ + PyObject *str, *v; + const char *errors = NULL; + int code_page; + + if (!PyArg_ParseTuple(args, "iO|z:code_page_encode", + &code_page, &str, &errors)) + return NULL; + + str = PyUnicode_FromObject(str); + if (str == NULL) + return NULL; + v = codec_tuple(PyUnicode_EncodeCodePage(code_page, + str, + errors), + PyUnicode_GET_LENGTH(str)); + Py_DECREF(str); + return v; +} + #endif /* HAVE_MBCS */ /* --- Error handler registry --------------------------------------------- */ @@ -1101,6 +1149,8 @@ static PyMethodDef _codecs_functions[] = { #ifdef HAVE_MBCS {"mbcs_encode", mbcs_encode, METH_VARARGS}, {"mbcs_decode", mbcs_decode, 
METH_VARARGS}, + {"code_page_encode", code_page_encode, METH_VARARGS}, + {"code_page_decode", code_page_decode, METH_VARARGS}, #endif {"register_error", register_error, METH_VARARGS, register_error__doc__}, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5f56cf7db0e..9d11546cb33 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -429,6 +429,10 @@ _PyUnicode_CheckConsistency(void *op, int check_content) } #endif +#ifdef HAVE_MBCS +static OSVERSIONINFOEX winver; +#endif + /* --- Bloom Filters ----------------------------------------------------- */ /* stuff to implement simple "bloom filters" for Unicode characters. @@ -6896,130 +6900,307 @@ PyUnicode_AsASCIIString(PyObject *unicode) #define NEED_RETRY #endif -/* XXX This code is limited to "true" double-byte encodings, as - a) it assumes an incomplete character consists of a single byte, and - b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte - encodings, see IsDBCSLeadByteEx documentation. 
*/ +#ifndef WC_ERR_INVALID_CHARS +# define WC_ERR_INVALID_CHARS 0x0080 +#endif + +static char* +code_page_name(UINT code_page, PyObject **obj) +{ + *obj = NULL; + if (code_page == CP_ACP) + return "mbcs"; + if (code_page == CP_UTF7) + return "CP_UTF7"; + if (code_page == CP_UTF8) + return "CP_UTF8"; + + *obj = PyBytes_FromFormat("cp%u", code_page); + if (*obj == NULL) + return NULL; + return PyBytes_AS_STRING(*obj); +} static int -is_dbcs_lead_byte(const char *s, int offset) +is_dbcs_lead_byte(UINT code_page, const char *s, int offset) { const char *curr = s + offset; + const char *prev; - if (IsDBCSLeadByte(*curr)) { - const char *prev = CharPrev(s, curr); - return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); - } + if (!IsDBCSLeadByteEx(code_page, *curr)) + return 0; + + prev = CharPrevExA(code_page, s, curr, 0); + if (prev == curr) + return 1; + /* FIXME: This code is limited to "true" double-byte encodings, + as it assumes an incomplete character consists of a single + byte. */ + if (curr - prev == 2) + return 1; + if (!IsDBCSLeadByteEx(code_page, *prev)) + return 1; return 0; } +static DWORD +decode_code_page_flags(UINT code_page) +{ + if (code_page == CP_UTF7) { + /* The CP_UTF7 decoder only supports flags=0 */ + return 0; + } + else + return MB_ERR_INVALID_CHARS; +} + /* - * Decode MBCS string into unicode object. If 'final' is set, converts - * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. + * Decode a byte string from a Windows code page into unicode object in strict + * mode. + * + * Returns consumed size if succeed, returns -2 on decode error, or raise a + * WindowsError and returns -1 on other error. 
*/ static int -decode_mbcs(PyUnicodeObject **v, - const char *s, /* MBCS string */ - int size, /* sizeof MBCS string */ - int final, - const char *errors) +decode_code_page_strict(UINT code_page, + PyUnicodeObject **v, + const char *in, + int insize) { - Py_UNICODE *p; - Py_ssize_t n; - DWORD usize; - DWORD flags; - - assert(size >= 0); - - /* check and handle 'errors' arg */ - if (errors==NULL || strcmp(errors, "strict")==0) - flags = MB_ERR_INVALID_CHARS; - else if (strcmp(errors, "ignore")==0) - flags = 0; - else { - PyErr_Format(PyExc_ValueError, - "mbcs encoding does not support errors='%s'", - errors); - return -1; - } - - /* Skip trailing lead-byte unless 'final' is set */ - if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) - --size; + const DWORD flags = decode_code_page_flags(code_page); + Py_UNICODE *out; + DWORD outsize; /* First get the size of the result */ - if (size > 0) { - usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0); - if (usize==0) - goto mbcs_decode_error; - } else - usize = 0; + assert(insize > 0); + outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); + if (outsize <= 0) + goto error; if (*v == NULL) { /* Create unicode object */ - *v = _PyUnicode_New(usize); + *v = _PyUnicode_New(outsize); if (*v == NULL) return -1; - n = 0; + out = PyUnicode_AS_UNICODE(*v); } else { /* Extend unicode object */ - n = PyUnicode_GET_SIZE(*v); - if (PyUnicode_Resize((PyObject**)v, n + usize) < 0) + Py_ssize_t n = PyUnicode_GET_SIZE(*v); + if (PyUnicode_Resize((PyObject**)v, n + outsize) < 0) return -1; + out = PyUnicode_AS_UNICODE(*v) + n; } /* Do the conversion */ - if (usize > 0) { - p = PyUnicode_AS_UNICODE(*v) + n; - if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) { - goto mbcs_decode_error; - } - } - return size; + outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); + if (outsize <= 0) + goto error; + return insize; -mbcs_decode_error: - /* If the last error was 
ERROR_NO_UNICODE_TRANSLATION, then - we raise a UnicodeDecodeError - else it is a 'generic' - windows error - */ - if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) { - /* Ideally, we should get reason from FormatMessage - this - is the Windows 2000 English version of the message - */ - PyObject *exc = NULL; - const char *reason = "No mapping for the Unicode character exists " - "in the target multi-byte code page."; - make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason); - if (exc != NULL) { - PyCodec_StrictErrors(exc); - Py_DECREF(exc); - } - } else { - PyErr_SetFromWindowsErrWithFilename(0, NULL); - } +error: + if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) + return -2; + PyErr_SetFromWindowsErr(0); return -1; } -PyObject * -PyUnicode_DecodeMBCSStateful(const char *s, - Py_ssize_t size, - const char *errors, - Py_ssize_t *consumed) +/* + * Decode a byte string from a code page into unicode object with an error + * handler. + * + * Returns consumed size if succeed, or raise a WindowsError or + * UnicodeDecodeError exception and returns -1 on error. + */ +static int +decode_code_page_errors(UINT code_page, + PyUnicodeObject **v, + const char *in, + int size, + const char *errors) +{ + const char *startin = in; + const char *endin = in + size; + const DWORD flags = decode_code_page_flags(code_page); + /* Ideally, we should get reason from FormatMessage. This is the Windows + 2000 English version of the message. 
*/ + const char *reason = "No mapping for the Unicode character exists " + "in the target code page."; + /* each step cannot decode more than 1 character, but a character can be + represented as a surrogate pair */ + wchar_t buffer[2], *startout, *out; + int insize, outsize; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + PyObject *encoding_obj = NULL; + char *encoding; + DWORD err; + int ret = -1; + + assert(size > 0); + + encoding = code_page_name(code_page, &encoding_obj); + if (encoding == NULL) + return -1; + + if (errors == NULL || strcmp(errors, "strict") == 0) { + /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a + UnicodeDecodeError. */ + make_decode_exception(&exc, encoding, in, size, 0, 0, reason); + if (exc != NULL) { + PyCodec_StrictErrors(exc); + Py_CLEAR(exc); + } + goto error; + } + + if (*v == NULL) { + /* Create unicode object */ + if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { + PyErr_NoMemory(); + goto error; + } + *v = _PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); + if (*v == NULL) + goto error; + startout = PyUnicode_AS_UNICODE(*v); + } + else { + /* Extend unicode object */ + Py_ssize_t n = PyUnicode_GET_SIZE(*v); + if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { + PyErr_NoMemory(); + goto error; + } + if (PyUnicode_Resize((PyObject**)v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) + goto error; + startout = PyUnicode_AS_UNICODE(*v) + n; + } + + /* Decode the byte string character per character */ + out = startout; + while (in < endin) + { + /* Decode a character */ + insize = 1; + do + { + outsize = MultiByteToWideChar(code_page, flags, + in, insize, + buffer, Py_ARRAY_LENGTH(buffer)); + if (outsize > 0) + break; + err = GetLastError(); + if (err != ERROR_NO_UNICODE_TRANSLATION + && err != ERROR_INSUFFICIENT_BUFFER) + { + PyErr_SetFromWindowsErr(0); + goto error; + } + insize++; + } + /* 4=maximum length of a UTF-8 sequence */ + while (insize <= 4 && (in + insize) <= 
endin); + + if (outsize <= 0) { + Py_ssize_t startinpos, endinpos, outpos; + + startinpos = in - startin; + endinpos = startinpos + 1; + outpos = out - PyUnicode_AS_UNICODE(*v); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + encoding, reason, + &startin, &endin, &startinpos, &endinpos, &exc, &in, + v, &outpos, &out)) + { + goto error; + } + } + else { + in += insize; + memcpy(out, buffer, outsize * sizeof(wchar_t)); + out += outsize; + } + } + + /* write a NUL character at the end */ + *out = 0; + + /* Extend unicode object */ + outsize = out - startout; + assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); + if (PyUnicode_Resize((PyObject**)v, outsize) < 0) + goto error; + ret = 0; + +error: + Py_XDECREF(encoding_obj); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + return ret; +} + +/* + * Decode a byte string from a Windows code page into unicode object. If + * 'final' is set, converts trailing lead-byte too. + * + * Returns consumed size if succeed, or raise a WindowsError or + * UnicodeDecodeError exception and returns -1 on error. 
+ */ +static int +decode_code_page(UINT code_page, + PyUnicodeObject **v, + const char *s, int size, + int final, const char *errors) +{ + int done; + + /* Skip trailing lead-byte unless 'final' is set */ + if (size == 0) { + if (*v == NULL) { + Py_INCREF(unicode_empty); + *v = (PyUnicodeObject*)unicode_empty; + if (*v == NULL) + return -1; + } + return 0; + } + + if (!final && is_dbcs_lead_byte(code_page, s, size - 1)) + --size; + + done = decode_code_page_strict(code_page, v, s, size); + if (done == -2) + done = decode_code_page_errors(code_page, v, s, size, errors); + return done; +} + +static PyObject * +decode_code_page_stateful(int code_page, + const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) { PyUnicodeObject *v = NULL; int done; + if (code_page < 0) { + PyErr_SetString(PyExc_ValueError, "invalid code page number"); + return NULL; + } + if (consumed) *consumed = 0; #ifdef NEED_RETRY retry: if (size > INT_MAX) - done = decode_mbcs(&v, s, INT_MAX, 0, errors); + done = decode_code_page(code_page, &v, s, INT_MAX, 0, errors); else #endif - done = decode_mbcs(&v, s, (int)size, !consumed, errors); + done = decode_code_page(code_page, &v, s, (int)size, !consumed, errors); if (done < 0) { Py_XDECREF(v); @@ -7036,6 +7217,7 @@ PyUnicode_DecodeMBCSStateful(const char *s, goto retry; } #endif + #ifndef DONT_MAKE_RESULT_READY if (_PyUnicode_READY_REPLACE(&v)) { Py_DECREF(v); @@ -7046,6 +7228,25 @@ PyUnicode_DecodeMBCSStateful(const char *s, return (PyObject *)v; } +PyObject * +PyUnicode_DecodeCodePageStateful(int code_page, + const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) +{ + return decode_code_page_stateful(code_page, s, size, errors, consumed); +} + +PyObject * +PyUnicode_DecodeMBCSStateful(const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) +{ + return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); +} + PyObject * PyUnicode_DecodeMBCS(const char *s, Py_ssize_t 
size, @@ -7054,105 +7255,342 @@ PyUnicode_DecodeMBCS(const char *s, return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); } +static DWORD +encode_code_page_flags(UINT code_page, const char *errors) +{ + if (code_page == CP_UTF8) { + if (winver.dwMajorVersion >= 6) + /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista + and later */ + return WC_ERR_INVALID_CHARS; + else + /* CP_UTF8 only supports flags=0 on Windows older than Vista */ + return 0; + } + else if (code_page == CP_UTF7) { + /* CP_UTF7 only supports flags=0 */ + return 0; + } + else { + if (errors != NULL && strcmp(errors, "replace") == 0) + return 0; + else + return WC_NO_BEST_FIT_CHARS; + } +} + /* - * Convert unicode into string object (MBCS). - * Returns 0 if succeed, -1 otherwise. + * Encode a Unicode string to a Windows code page into a byte string in strict + * mode. + * + * Returns consumed characters if succeed, returns -2 on encode error, or raise + * a WindowsError and returns -1 on other error. */ static int -encode_mbcs(PyObject **repr, - const Py_UNICODE *p, /* unicode */ - int size, /* size of unicode */ - const char* errors) +encode_code_page_strict(UINT code_page, PyObject **outbytes, + const Py_UNICODE *p, const int size, + const char* errors) { BOOL usedDefaultChar = FALSE; - BOOL *pusedDefaultChar; - int mbcssize; - Py_ssize_t n; + BOOL *pusedDefaultChar = &usedDefaultChar; + int outsize; PyObject *exc = NULL; - DWORD flags; + const DWORD flags = encode_code_page_flags(code_page, NULL); + char *out; - assert(size >= 0); + assert(size > 0); - /* check and handle 'errors' arg */ - if (errors==NULL || strcmp(errors, "strict")==0) { - flags = WC_NO_BEST_FIT_CHARS; + if (code_page != CP_UTF8 && code_page != CP_UTF7) pusedDefaultChar = &usedDefaultChar; - } else if (strcmp(errors, "replace")==0) { - flags = 0; + else pusedDefaultChar = NULL; - } else { - PyErr_Format(PyExc_ValueError, - "mbcs encoding does not support errors='%s'", - errors); - return -1; - } /* First get the 
size of the result */ - if (size > 0) { - mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0, - NULL, pusedDefaultChar); - if (mbcssize == 0) { - PyErr_SetFromWindowsErrWithFilename(0, NULL); - return -1; - } - /* If we used a default char, then we failed! */ - if (pusedDefaultChar && *pusedDefaultChar) - goto mbcs_encode_error; - } else { - mbcssize = 0; - } + outsize = WideCharToMultiByte(code_page, flags, + p, size, + NULL, 0, + NULL, pusedDefaultChar); + if (outsize <= 0) + goto error; + /* If we used a default char, then we failed! */ + if (pusedDefaultChar && *pusedDefaultChar) + return -2; - if (*repr == NULL) { + if (*outbytes == NULL) { /* Create string object */ - *repr = PyBytes_FromStringAndSize(NULL, mbcssize); - if (*repr == NULL) + *outbytes = PyBytes_FromStringAndSize(NULL, outsize); + if (*outbytes == NULL) return -1; - n = 0; + out = PyBytes_AS_STRING(*outbytes); } else { /* Extend string object */ - n = PyBytes_Size(*repr); - if (_PyBytes_Resize(repr, n + mbcssize) < 0) + const Py_ssize_t n = PyBytes_Size(*outbytes); + if (outsize > PY_SSIZE_T_MAX - n) { + PyErr_NoMemory(); return -1; + } + if (_PyBytes_Resize(outbytes, n + outsize) < 0) + return -1; + out = PyBytes_AS_STRING(*outbytes) + n; } /* Do the conversion */ - if (size > 0) { - char *s = PyBytes_AS_STRING(*repr) + n; - if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize, - NULL, pusedDefaultChar)) { - PyErr_SetFromWindowsErrWithFilename(0, NULL); - return -1; - } - if (pusedDefaultChar && *pusedDefaultChar) - goto mbcs_encode_error; - } + outsize = WideCharToMultiByte(code_page, flags, + p, size, + out, outsize, + NULL, pusedDefaultChar); + if (outsize <= 0) + goto error; + if (pusedDefaultChar && *pusedDefaultChar) + return -2; return 0; -mbcs_encode_error: - raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character"); - Py_XDECREF(exc); +error: + if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) + return -2; + PyErr_SetFromWindowsErr(0); return 
-1; } -PyObject * -PyUnicode_EncodeMBCS(const Py_UNICODE *p, - Py_ssize_t size, - const char *errors) +/* + * Encode a Unicode string to a Windows code page into a byte string using a + * error handler. + * + * Returns consumed characters if succeed, or raise a WindowsError and returns + * -1 on other error. + */ +static int +encode_code_page_errors(UINT code_page, PyObject **outbytes, + const Py_UNICODE *in, const int insize, + const char* errors) { - PyObject *repr = NULL; + const DWORD flags = encode_code_page_flags(code_page, errors); + const Py_UNICODE *startin = in; + const Py_UNICODE *endin = in + insize; + /* Ideally, we should get reason from FormatMessage. This is the Windows + 2000 English version of the message. */ + const char *reason = "invalid character"; + /* 4=maximum length of a UTF-8 sequence */ + char buffer[4]; + BOOL usedDefaultChar = FALSE, *pusedDefaultChar; + Py_ssize_t outsize; + char *out; + int charsize; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + PyObject *encoding_obj = NULL; + char *encoding; + int err; + Py_ssize_t startpos, newpos, newoutsize; + PyObject *rep; + int ret = -1; + + assert(insize > 0); + + encoding = code_page_name(code_page, &encoding_obj); + if (encoding == NULL) + return -1; + + if (errors == NULL || strcmp(errors, "strict") == 0) { + /* The last error was ERROR_NO_UNICODE_TRANSLATION, + then we raise a UnicodeEncodeError. 
*/ + make_encode_exception(&exc, encoding, in, insize, 0, 0, reason); + if (exc != NULL) { + PyCodec_StrictErrors(exc); + Py_DECREF(exc); + } + Py_XDECREF(encoding_obj); + return -1; + } + + if (code_page != CP_UTF8 && code_page != CP_UTF7) + pusedDefaultChar = &usedDefaultChar; + else + pusedDefaultChar = NULL; + + if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { + PyErr_NoMemory(); + goto error; + } + outsize = insize * Py_ARRAY_LENGTH(buffer); + + if (*outbytes == NULL) { + /* Create string object */ + *outbytes = PyBytes_FromStringAndSize(NULL, outsize); + if (*outbytes == NULL) + goto error; + out = PyBytes_AS_STRING(*outbytes); + } + else { + /* Extend string object */ + Py_ssize_t n = PyBytes_Size(*outbytes); + if (n > PY_SSIZE_T_MAX - outsize) { + PyErr_NoMemory(); + goto error; + } + if (_PyBytes_Resize(outbytes, n + outsize) < 0) + goto error; + out = PyBytes_AS_STRING(*outbytes) + n; + } + + /* Encode the string character per character */ + while (in < endin) + { + if ((in + 2) <= endin + && 0xD800 <= in[0] && in[0] <= 0xDBFF + && 0xDC00 <= in[1] && in[1] <= 0xDFFF) + charsize = 2; + else + charsize = 1; + + outsize = WideCharToMultiByte(code_page, flags, + in, charsize, + buffer, Py_ARRAY_LENGTH(buffer), + NULL, pusedDefaultChar); + if (outsize > 0) { + if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) + { + in += charsize; + memcpy(out, buffer, outsize); + out += outsize; + continue; + } + } + else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { + PyErr_SetFromWindowsErr(0); + goto error; + } + + charsize = Py_MAX(charsize - 1, 1); + startpos = in - startin; + rep = unicode_encode_call_errorhandler( + errors, &errorHandler, encoding, reason, + startin, insize, &exc, + startpos, startpos + charsize, &newpos); + if (rep == NULL) + goto error; + in = startin + newpos; + + if (PyBytes_Check(rep)) { + outsize = PyBytes_GET_SIZE(rep); + if (outsize != 1) { + Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); + newoutsize = 
PyBytes_GET_SIZE(*outbytes) + (outsize - 1); + if (_PyBytes_Resize(outbytes, newoutsize) < 0) { + Py_DECREF(rep); + goto error; + } + out = PyBytes_AS_STRING(*outbytes) + offset; + } + memcpy(out, PyBytes_AS_STRING(rep), outsize); + out += outsize; + } + else { + Py_ssize_t i; + enum PyUnicode_Kind kind; + void *data; + + if (PyUnicode_READY(rep) < 0) { + Py_DECREF(rep); + goto error; + } + + outsize = PyUnicode_GET_LENGTH(rep); + if (outsize != 1) { + Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); + newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); + if (_PyBytes_Resize(outbytes, newoutsize) < 0) { + Py_DECREF(rep); + goto error; + } + out = PyBytes_AS_STRING(*outbytes) + offset; + } + kind = PyUnicode_KIND(rep); + data = PyUnicode_DATA(rep); + for (i=0; i < outsize; i++) { + Py_UCS4 ch = PyUnicode_READ(kind, data, i); + if (ch > 127) { + raise_encode_exception(&exc, + encoding, + startin, insize, + startpos, startpos + charsize, + "unable to encode error handler result to ASCII"); + Py_DECREF(rep); + goto error; + } + *out = (unsigned char)ch; + out++; + } + } + Py_DECREF(rep); + } + /* write a NUL byte */ + *out = 0; + outsize = out - PyBytes_AS_STRING(*outbytes); + assert(outsize <= PyBytes_GET_SIZE(*outbytes)); + if (_PyBytes_Resize(outbytes, outsize) < 0) + goto error; + ret = 0; + +error: + Py_XDECREF(encoding_obj); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + return ret; +} + +/* + * Encode a Unicode string to a Windows code page into a byte string. + * + * Returns consumed characters if succeed, or raise a WindowsError and returns + * -1 on other error. 
+ */ +static int +encode_code_page_chunk(UINT code_page, PyObject **outbytes, + const Py_UNICODE *p, int size, + const char* errors) +{ + int done; + + if (size == 0) { + if (*outbytes == NULL) { + *outbytes = PyBytes_FromStringAndSize(NULL, 0); + if (*outbytes == NULL) + return -1; + } + return 0; + } + + done = encode_code_page_strict(code_page, outbytes, p, size, errors); + if (done == -2) + done = encode_code_page_errors(code_page, outbytes, p, size, errors); + return done; +} + +static PyObject * +encode_code_page(int code_page, + const Py_UNICODE *p, Py_ssize_t size, + const char *errors) +{ + PyObject *outbytes = NULL; int ret; + if (code_page < 0) { + PyErr_SetString(PyExc_ValueError, "invalid code page number"); + return NULL; + } + #ifdef NEED_RETRY retry: if (size > INT_MAX) - ret = encode_mbcs(&repr, p, INT_MAX, errors); + ret = encode_code_page_chunk(code_page, &outbytes, p, INT_MAX, errors); else #endif - ret = encode_mbcs(&repr, p, (int)size, errors); + ret = encode_code_page_chunk(code_page, &outbytes, p, (int)size, errors); if (ret < 0) { - Py_XDECREF(repr); + Py_XDECREF(outbytes); return NULL; } @@ -7164,7 +7602,28 @@ PyUnicode_EncodeMBCS(const Py_UNICODE *p, } #endif - return repr; + return outbytes; +} + +PyObject * +PyUnicode_EncodeMBCS(const Py_UNICODE *p, + Py_ssize_t size, + const char *errors) +{ + return encode_code_page(CP_ACP, p, size, errors); +} + +PyObject * +PyUnicode_EncodeCodePage(int code_page, + PyObject *unicode, + const char *errors) +{ + const Py_UNICODE *p; + Py_ssize_t size; + p = PyUnicode_AsUnicodeAndSize(unicode, &size); + if (p == NULL) + return NULL; + return encode_code_page(code_page, p, size, errors); } PyObject * @@ -13434,7 +13893,7 @@ PyTypeObject PyUnicode_Type = { /* Initialize the Unicode implementation */ -void _PyUnicode_Init(void) +int _PyUnicode_Init(void) { int i; @@ -13467,6 +13926,15 @@ void _PyUnicode_Init(void) Py_ARRAY_LENGTH(linebreak)); PyType_Ready(&EncodingMapType); + +#ifdef HAVE_MBCS + 
winver.dwOSVersionInfoSize = sizeof(winver); + if (!GetVersionEx((OSVERSIONINFO*)&winver)) { + PyErr_SetFromWindowsErr(0); + return -1; + } +#endif + return 0; } /* Finalize the Unicode implementation */ diff --git a/Python/pythonrun.c b/Python/pythonrun.c index a6e7c465681..0f2f0501cde 100644 --- a/Python/pythonrun.c +++ b/Python/pythonrun.c @@ -67,7 +67,7 @@ static void initsigs(void); static void call_py_exitfuncs(void); static void wait_for_thread_shutdown(void); static void call_ll_exitfuncs(void); -extern void _PyUnicode_Init(void); +extern int _PyUnicode_Init(void); extern void _PyUnicode_Fini(void); extern int _PyLong_Init(void); extern void PyLong_Fini(void); @@ -261,7 +261,8 @@ Py_InitializeEx(int install_sigs) Py_FatalError("Py_Initialize: can't make modules_reloading dictionary"); /* Init Unicode implementation; relies on the codec registry */ - _PyUnicode_Init(); + if (_PyUnicode_Init() < 0) + Py_FatalError("Py_Initialize: can't initialize unicode"); bimod = _PyBuiltin_Init(); if (bimod == NULL)