Issue #12281: Rewrite the MBCS codec to handle correctly replace and ignore

error handlers on all Windows versions. The MBCS codec is now supporting all
error handlers, instead of only replace to encode and ignore to decode.
This commit is contained in:
Victor Stinner 2011-10-18 21:21:00 +02:00
parent 1e73a2467f
commit 3a50e7056e
8 changed files with 893 additions and 152 deletions

View File

@ -1280,12 +1280,13 @@ functions can be used directly if desired.
.. module:: encodings.mbcs
:synopsis: Windows ANSI codepage
Encode operand according to the ANSI codepage (CP_ACP). This codec only
supports ``'strict'`` and ``'replace'`` error handlers to encode, and
``'strict'`` and ``'ignore'`` error handlers to decode.
Encode operand according to the ANSI codepage (CP_ACP).
Availability: Windows only.
.. versionchanged:: 3.3
Support any error handler.
.. versionchanged:: 3.2
Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used
to encode, and ``'ignore'`` to decode.

View File

@ -197,6 +197,11 @@ The :mod:`array` module supports the :c:type:`long long` type using ``q`` and
codecs
------
The :mod:`~encodings.mbcs` codec has been rewritten to correctly handle the
``replace`` and ``ignore`` error handlers on all Windows versions. The
:mod:`~encodings.mbcs` codec now supports all error handlers, instead of
only ``replace`` to encode and ``ignore`` to decode.
Multibyte CJK decoders now resynchronize faster. They only ignore the first
byte of an invalid byte sequence. For example, ``b'\xff\n'.decode('gb2312',
'replace')`` now returns a ``\n`` after the replacement character.

View File

@ -1466,6 +1466,14 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
Py_ssize_t *consumed /* bytes consumed */
);
PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
int code_page, /* code page number */
const char *string, /* encoded string */
Py_ssize_t length, /* size of string */
const char *errors, /* error handling */
Py_ssize_t *consumed /* bytes consumed */
);
PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
PyObject *unicode /* Unicode object */
);
@ -1473,11 +1481,17 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
const Py_UNICODE *data, /* Unicode char buffer */
Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Py_ssize_t length, /* number of Py_UNICODE chars to encode */
const char *errors /* error handling */
);
#endif
PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
int code_page, /* code page number */
PyObject *unicode, /* Unicode object */
const char *errors /* error handling */
);
#endif /* HAVE_MBCS */
/* --- Decimal Encoder ---------------------------------------------------- */

View File

@ -1744,6 +1744,203 @@ class TransformCodecTest(unittest.TestCase):
self.assertEqual(sout, b"\x80")
# The code page codecs wrap Windows APIs, so the whole class is skipped on
# other platforms instead of failing when the module is imported.
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # Windows code page number of UTF-8.
    CP_UTF8 = 65001
    # sys.getwindowsversion() only exists on Windows; guard the call so that
    # merely defining this class does not raise AttributeError elsewhere.
    vista_or_later = (sys.platform == 'win32'
                      and sys.getwindowsversion().major >= 6)

    def test_invalid_code_page(self):
        # Negative code pages are rejected by the codec itself, unknown
        # positive ones are rejected by Windows.
        self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
        self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
        self.assertRaises(WindowsError, codecs.code_page_encode, 123, 'a')
        self.assertRaises(WindowsError, codecs.code_page_decode, 123, b'a')

    def test_code_page_name(self):
        # The encoding name reported in the exception identifies the code page.
        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
            codecs.code_page_encode, 932, '\xff')
        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
            codecs.code_page_decode, 932, b'\x81\x00')
        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
            codecs.code_page_decode, self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        # tests is an iterable of (raw, errors, expected); expected=None means
        # that decoding must raise UnicodeDecodeError.
        for raw, errors, expected in tests:
            if expected is not None:
                try:
                    decoded = codecs.code_page_decode(cp, raw, errors)
                except UnicodeDecodeError as err:
                    self.fail('Unable to decode %a from "cp%s" with '
                              'errors=%r: %s' % (raw, cp, errors, err))
                self.assertEqual(decoded[0], expected,
                    '%a.decode("cp%s", %r)=%a != %a'
                    % (raw, cp, errors, decoded[0], expected))
                # assert 0 <= decoded[1] <= len(raw)
                self.assertGreaterEqual(decoded[1], 0)
                self.assertLessEqual(decoded[1], len(raw))
            else:
                self.assertRaises(UnicodeDecodeError,
                    codecs.code_page_decode, cp, raw, errors)

    def check_encode(self, cp, tests):
        # tests is an iterable of (text, errors, expected); expected=None means
        # that encoding must raise UnicodeEncodeError.
        for text, errors, expected in tests:
            if expected is not None:
                try:
                    encoded = codecs.code_page_encode(cp, text, errors)
                except UnicodeEncodeError as err:
                    self.fail('Unable to encode %a to "cp%s" with '
                              'errors=%r: %s' % (text, cp, errors, err))
                self.assertEqual(encoded[0], expected,
                    '%a.encode("cp%s", %r)=%a != %a'
                    % (text, cp, errors, encoded[0], expected))
                self.assertEqual(encoded[1], len(text))
            else:
                self.assertRaises(UnicodeEncodeError,
                    codecs.code_page_encode, cp, text, errors)

    def test_cp932(self):
        self.check_encode(932, (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # not encodable
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
        ))
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'\xff', 'strict', None),
            (b'\xff', 'ignore', ''),
            (b'\xff', 'replace', '\ufffd'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
        ]
        # The replacement produced for an invalid lead byte differs between
        # Windows versions.
        if self.vista_or_later:
            tests.append((b'\x81\x00abc', 'replace', '\ufffd\x00abc'))
        else:
            tests.append((b'\x81\x00abc', 'replace', '\x00\x00abc'))
        self.check_decode(932, tests)

    def test_cp1252(self):
        self.check_encode(1252, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        ))
        self.check_decode(1252, (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        ))

    def test_cp_utf7(self):
        cp = 65000
        self.check_encode(cp, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        ))
        self.check_decode(cp, (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        ))

    def test_cp_utf8(self):
        cp = self.CP_UTF8

        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        # Lone surrogates are only rejected on Windows Vista and later.
        if self.vista_or_later:
            tests.append(('\udc80', 'strict', None))
            tests.append(('\udc80', 'ignore', b''))
            tests.append(('\udc80', 'replace', b'?'))
        else:
            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        self.check_encode(cp, tests)

        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
        ]
        if self.vista_or_later:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ))
        else:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ))
        self.check_decode(cp, tests)

    def test_error_handlers(self):
        # Error handlers other than strict/ignore/replace are supported too.
        self.check_encode(932, (
            ('\xff', 'backslashreplace', b'\\xff'),
            ('\xff', 'xmlcharrefreplace', b'&#255;'),
        ))
        self.check_decode(932, (
            (b'\xff', 'surrogateescape', '\udcff'),
        ))
        if self.vista_or_later:
            self.check_encode(self.CP_UTF8, (
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ))

    def test_multibyte_encoding(self):
        # An invalid byte must not swallow the valid multibyte sequence that
        # follows it.
        self.check_decode(932, (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ))
        self.check_decode(self.CP_UTF8, (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
        if self.vista_or_later:
            self.check_encode(self.CP_UTF8, (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            ))

    def test_incremental(self):
        # With final=False a trailing lead byte is left unconsumed.
        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e', 2))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9\x80', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))

        decoded = codecs.code_page_decode(932,
                                          b'abc', 'strict',
                                          False)
        self.assertEqual(decoded, ('abc', 3))
def test_main():
support.run_unittest(
UTF32Test,
@ -1772,6 +1969,7 @@ def test_main():
SurrogateEscapeTest,
BomTest,
TransformCodecTest,
CodePageTest,
)

View File

@ -10,6 +10,10 @@ What's New in Python 3.3 Alpha 1?
Core and Builtins
-----------------
- Issue #12281: Rewrite the MBCS codec to correctly handle the replace and
ignore error handlers on all Windows versions. The MBCS codec now supports all
error handlers, instead of only replace to encode and ignore to decode.
- Issue #13188: When called without an explicit traceback argument,
generator.throw() now gets the traceback from the passed exception's
``__traceback__`` attribute. Patch by Petri Lehtinen.

View File

@ -612,6 +612,31 @@ mbcs_decode(PyObject *self,
return codec_tuple(decoded, consumed);
}
/* code_page_decode(code_page, data[, errors[, final]])

   Decode a bytes-like object with the given Windows code page and hand the
   decoded text plus a byte count to codec_tuple().  When 'final' is true the
   decoder is given no 'consumed' pointer and the full input length is
   reported. */
static PyObject *
code_page_decode(PyObject *self,
                 PyObject *args)
{
    int code_page;
    Py_buffer view;
    const char *errors = NULL;
    int final = 0;
    Py_ssize_t nconsumed;
    Py_ssize_t *pconsumed;
    PyObject *text;

    if (!PyArg_ParseTuple(args, "iy*|zi:code_page_decode",
                          &code_page, &view, &errors, &final)) {
        return NULL;
    }

    /* Default to "everything consumed"; the decoder only overwrites this
       when it is given the pointer (non-final call). */
    nconsumed = view.len;
    pconsumed = final ? NULL : &nconsumed;

    text = PyUnicode_DecodeCodePageStateful(code_page,
                                            view.buf, view.len, errors,
                                            pconsumed);
    /* The buffer is no longer needed whether or not decoding succeeded. */
    PyBuffer_Release(&view);

    if (text != NULL)
        return codec_tuple(text, nconsumed);
    return NULL;
}
#endif /* HAVE_MBCS */
/* --- Encoder ------------------------------------------------------------ */
@ -1011,6 +1036,29 @@ mbcs_encode(PyObject *self,
return v;
}
/* code_page_encode(code_page, str[, errors])

   Encode a string with the given Windows code page and hand the encoded
   bytes plus the number of characters of the input to codec_tuple(). */
static PyObject *
code_page_encode(PyObject *self,
                 PyObject *args)
{
    PyObject *str, *v;
    const char *errors = NULL;
    int code_page;
    Py_ssize_t length;

    if (!PyArg_ParseTuple(args, "iO|z:code_page_encode",
                          &code_page, &str, &errors))
        return NULL;

    str = PyUnicode_FromObject(str);
    /* PyUnicode_GET_LENGTH() below is only valid on a ready string, so make
       the string ready first and bail out if that fails. */
    if (str == NULL || PyUnicode_READY(str) < 0) {
        Py_XDECREF(str);
        return NULL;
    }

    /* Read the length before encoding instead of inside the same argument
       list, so the result does not depend on argument evaluation order. */
    length = PyUnicode_GET_LENGTH(str);
    v = codec_tuple(PyUnicode_EncodeCodePage(code_page,
                                             str,
                                             errors),
                    length);
    Py_DECREF(str);
    return v;
}
#endif /* HAVE_MBCS */
/* --- Error handler registry --------------------------------------------- */
@ -1101,6 +1149,8 @@ static PyMethodDef _codecs_functions[] = {
#ifdef HAVE_MBCS
{"mbcs_encode", mbcs_encode, METH_VARARGS},
{"mbcs_decode", mbcs_decode, METH_VARARGS},
{"code_page_encode", code_page_encode, METH_VARARGS},
{"code_page_decode", code_page_decode, METH_VARARGS},
#endif
{"register_error", register_error, METH_VARARGS,
register_error__doc__},

View File

@ -429,6 +429,10 @@ _PyUnicode_CheckConsistency(void *op, int check_content)
}
#endif
#ifdef HAVE_MBCS
static OSVERSIONINFOEX winver;
#endif
/* --- Bloom Filters ----------------------------------------------------- */
/* stuff to implement simple "bloom filters" for Unicode characters.
@ -6896,113 +6900,285 @@ PyUnicode_AsASCIIString(PyObject *unicode)
#define NEED_RETRY
#endif
/* XXX This code is limited to "true" double-byte encodings, as
a) it assumes an incomplete character consists of a single byte, and
b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
encodings, see IsDBCSLeadByteEx documentation. */
#ifndef WC_ERR_INVALID_CHARS
# define WC_ERR_INVALID_CHARS 0x0080
#endif
/* Return the encoding name used for a code page in codec exceptions.

   CP_ACP, CP_UTF7 and CP_UTF8 map to string literals and leave *obj set to
   NULL.  Any other code page is formatted as "cp<number>" into a new bytes
   object stored in *obj; the returned pointer is that object's buffer, so
   the caller must keep *obj alive while using the name and release it
   afterwards.  Returns NULL (with *obj == NULL) if the bytes object cannot
   be created. */
static char*
code_page_name(UINT code_page, PyObject **obj)
{
    PyObject *name_bytes;

    *obj = NULL;

    switch (code_page) {
    case CP_ACP:
        return "mbcs";
    case CP_UTF7:
        return "CP_UTF7";
    case CP_UTF8:
        return "CP_UTF8";
    default:
        break;
    }

    name_bytes = PyBytes_FromFormat("cp%u", code_page);
    *obj = name_bytes;
    if (name_bytes == NULL)
        return NULL;
    return PyBytes_AS_STRING(name_bytes);
}
static int
is_dbcs_lead_byte(const char *s, int offset)
is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
{
const char *curr = s + offset;
const char *prev;
if (IsDBCSLeadByte(*curr)) {
const char *prev = CharPrev(s, curr);
return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
if (!IsDBCSLeadByteEx(code_page, *curr))
return 0;
prev = CharPrevExA(code_page, s, curr, 0);
if (prev == curr)
return 1;
/* FIXME: This code is limited to "true" double-byte encodings,
as it assumes an incomplete character consists of a single
byte. */
if (curr - prev == 2)
return 1;
if (!IsDBCSLeadByteEx(code_page, *prev))
return 1;
return 0;
}
/* Return the MultiByteToWideChar() flags to use for a code page:
   MB_ERR_INVALID_CHARS everywhere except for CP_UTF7. */
static DWORD
decode_code_page_flags(UINT code_page)
{
    /* The CP_UTF7 decoder only supports flags=0 */
    return (code_page == CP_UTF7) ? 0 : MB_ERR_INVALID_CHARS;
}
/*
 * Decode a byte string from a Windows code page into unicode object in strict
 * mode.
 *
 * If *v is NULL a new unicode object is created, otherwise the decoded
 * characters are appended to the existing object.
 *
 * Returns consumed size if succeed, returns -2 on decode error, or raise a
 * WindowsError (or MemoryError) and returns -1 on other error.
 */
static int
decode_code_page_strict(UINT code_page,
                        PyUnicodeObject **v,
                        const char *in,
                        int insize)
{
    const DWORD flags = decode_code_page_flags(code_page);
    Py_UNICODE *out;
    /* MultiByteToWideChar() returns an int; keep it signed so that the
       "<= 0" failure tests below mean what they say. */
    int outsize;

    /* First get the size of the result */
    assert(insize > 0);
    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
    if (outsize <= 0)
        goto error;

    if (*v == NULL) {
        /* Create unicode object */
        *v = _PyUnicode_New(outsize);
        if (*v == NULL)
            return -1;
        out = PyUnicode_AS_UNICODE(*v);
    }
    else {
        /* Extend unicode object */
        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
        /* Same guard as the encoder: refuse a length that cannot be
           represented instead of overflowing n + outsize. */
        if (outsize > PY_SSIZE_T_MAX - n) {
            PyErr_NoMemory();
            return -1;
        }
        if (PyUnicode_Resize((PyObject**)v, n + outsize) < 0)
            return -1;
        out = PyUnicode_AS_UNICODE(*v) + n;
    }

    /* Do the conversion */
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
    if (outsize <= 0)
        goto error;
    return insize;

error:
    /* An undecodable byte sequence is reported as -2 so that the caller can
       retry with an error handler; anything else is a real Windows error. */
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
        return -2;
    PyErr_SetFromWindowsErr(0);
    return -1;
}
/*
 * Decode a byte string from a code page into unicode object with an error
 * handler.
 *
 * The input is decoded one character at a time; every byte sequence that
 * MultiByteToWideChar() rejects is passed to the error handler named by
 * 'errors'.  If *v is NULL a new unicode object is created, otherwise the
 * decoded characters are appended to the existing object.
 *
 * Returns consumed size (always 'size') if succeed, or raise a WindowsError
 * or UnicodeDecodeError exception and returns -1 on error.
 */
static int
decode_code_page_errors(UINT code_page,
                        PyUnicodeObject **v,
                        const char *in,
                        int size,
                        const char *errors)
{
    const char *startin = in;
    const char *endin = in + size;
    const DWORD flags = decode_code_page_flags(code_page);
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "No mapping for the Unicode character exists "
                         "in the target code page.";
    /* each step cannot decode more than 1 character, but a character can be
       represented as a surrogate pair */
    wchar_t buffer[2], *out;
    int insize, outsize;
    Py_ssize_t outlen;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    char *encoding;
    DWORD err;
    int ret = -1;

    assert(size > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
           UnicodeDecodeError. */
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_CLEAR(exc);
        }
        goto error;
    }

    /* Reserve room for the worst case: Py_ARRAY_LENGTH(buffer) code units
       per input byte.  The object is shrunk to the real length at the end. */
    if (*v == NULL) {
        /* Create unicode object */
        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
            PyErr_NoMemory();
            goto error;
        }
        *v = _PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
        if (*v == NULL)
            goto error;
        out = PyUnicode_AS_UNICODE(*v);
    }
    else {
        /* Extend unicode object */
        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
            PyErr_NoMemory();
            goto error;
        }
        if (PyUnicode_Resize((PyObject**)v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
            goto error;
        out = PyUnicode_AS_UNICODE(*v) + n;
    }

    /* Decode the byte string character per character */
    while (in < endin)
    {
        /* Decode a character: try 1 byte, then grow the input window until
           the conversion succeeds or the window cannot grow any more. */
        insize = 1;
        do
        {
            outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize,
                                          buffer, Py_ARRAY_LENGTH(buffer));
            if (outsize > 0)
                break;
            err = GetLastError();
            if (err != ERROR_NO_UNICODE_TRANSLATION
                && err != ERROR_INSUFFICIENT_BUFFER)
            {
                PyErr_SetFromWindowsErr(0);
                goto error;
            }
            insize++;
        }
        /* 4=maximum length of a UTF-8 sequence */
        while (insize <= 4 && (in + insize) <= endin);

        if (outsize <= 0) {
            /* No prefix of the remaining input decodes: report the first
               byte as invalid and let the error handler decide. */
            Py_ssize_t startinpos, endinpos, outpos;

            startinpos = in - startin;
            endinpos = startinpos + 1;
            outpos = out - PyUnicode_AS_UNICODE(*v);
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    encoding, reason,
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
                    v, &outpos, &out))
            {
                goto error;
            }
        }
        else {
            in += insize;
            memcpy(out, buffer, outsize * sizeof(wchar_t));
            out += outsize;
        }
    }

    /* write a NUL character at the end */
    *out = 0;

    /* Shrink the unicode object to the characters actually written.  The
       length is measured from the start of the object, not from where this
       call started writing, so that characters already present in *v are
       kept and the result stays correct even if the error handler moved the
       buffer. */
    outlen = out - PyUnicode_AS_UNICODE(*v);
    assert(outlen <= PyUnicode_WSTR_LENGTH(*v));
    if (PyUnicode_Resize((PyObject**)v, outlen) < 0)
        goto error;
    /* The whole input was processed: report it as consumed, as promised by
       the contract above and as decode_code_page_strict() does. */
    ret = size;

error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
/*
* Decode a byte string from a Windows code page into unicode object. If
* 'final' is set, converts trailing lead-byte too.
*
* Returns consumed size if succeed, or raise a WindowsError or
* UnicodeDecodeError exception and returns -1 on error.
*/
static int
decode_code_page(UINT code_page,
PyUnicodeObject **v,
const char *s, int size,
int final, const char *errors)
{
int done;
/* Skip trailing lead-byte unless 'final' is set */
if (size == 0) {
if (*v == NULL) {
Py_INCREF(unicode_empty);
*v = (PyUnicodeObject*)unicode_empty;
if (*v == NULL)
return -1;
}
return 0;
}
/*
* Decode MBCS string into unicode object. If 'final' is set, converts
* trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
*/
static int
decode_mbcs(PyUnicodeObject **v,
const char *s, /* MBCS string */
int size, /* sizeof MBCS string */
int final,
const char *errors)
{
Py_UNICODE *p;
Py_ssize_t n;
DWORD usize;
DWORD flags;
assert(size >= 0);
/* check and handle 'errors' arg */
if (errors==NULL || strcmp(errors, "strict")==0)
flags = MB_ERR_INVALID_CHARS;
else if (strcmp(errors, "ignore")==0)
flags = 0;
else {
PyErr_Format(PyExc_ValueError,
"mbcs encoding does not support errors='%s'",
errors);
return -1;
}
/* Skip trailing lead-byte unless 'final' is set */
if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
if (!final && is_dbcs_lead_byte(code_page, s, size - 1))
--size;
/* First get the size of the result */
if (size > 0) {
usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
if (usize==0)
goto mbcs_decode_error;
} else
usize = 0;
if (*v == NULL) {
/* Create unicode object */
*v = _PyUnicode_New(usize);
if (*v == NULL)
return -1;
n = 0;
}
else {
/* Extend unicode object */
n = PyUnicode_GET_SIZE(*v);
if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
return -1;
done = decode_code_page_strict(code_page, v, s, size);
if (done == -2)
done = decode_code_page_errors(code_page, v, s, size, errors);
return done;
}
/* Do the conversion */
if (usize > 0) {
p = PyUnicode_AS_UNICODE(*v) + n;
if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
goto mbcs_decode_error;
}
}
return size;
mbcs_decode_error:
/* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
we raise a UnicodeDecodeError - else it is a 'generic'
windows error
*/
if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
/* Ideally, we should get reason from FormatMessage - this
is the Windows 2000 English version of the message
*/
PyObject *exc = NULL;
const char *reason = "No mapping for the Unicode character exists "
"in the target multi-byte code page.";
make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
if (exc != NULL) {
PyCodec_StrictErrors(exc);
Py_DECREF(exc);
}
} else {
PyErr_SetFromWindowsErrWithFilename(0, NULL);
}
return -1;
}
PyObject *
PyUnicode_DecodeMBCSStateful(const char *s,
static PyObject *
decode_code_page_stateful(int code_page,
const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed)
@ -7010,16 +7186,21 @@ PyUnicode_DecodeMBCSStateful(const char *s,
PyUnicodeObject *v = NULL;
int done;
if (code_page < 0) {
PyErr_SetString(PyExc_ValueError, "invalid code page number");
return NULL;
}
if (consumed)
*consumed = 0;
#ifdef NEED_RETRY
retry:
if (size > INT_MAX)
done = decode_mbcs(&v, s, INT_MAX, 0, errors);
done = decode_code_page(code_page, &v, s, INT_MAX, 0, errors);
else
#endif
done = decode_mbcs(&v, s, (int)size, !consumed, errors);
done = decode_code_page(code_page, &v, s, (int)size, !consumed, errors);
if (done < 0) {
Py_XDECREF(v);
@ -7036,6 +7217,7 @@ PyUnicode_DecodeMBCSStateful(const char *s,
goto retry;
}
#endif
#ifndef DONT_MAKE_RESULT_READY
if (_PyUnicode_READY_REPLACE(&v)) {
Py_DECREF(v);
@ -7046,6 +7228,25 @@ PyUnicode_DecodeMBCSStateful(const char *s,
return (PyObject *)v;
}
/* Public entry point: decode 'size' bytes of 's' using the Windows code page
   'code_page'.  All the work is delegated to decode_code_page_stateful();
   'errors' and 'consumed' are forwarded unchanged. */
PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,
                                 const char *s,
                                 Py_ssize_t size,
                                 const char *errors,
                                 Py_ssize_t *consumed)
{
    PyObject *decoded;

    decoded = decode_code_page_stateful(code_page, s, size, errors, consumed);
    return decoded;
}
/* Decode using the ANSI code page: same as decode_code_page_stateful() with
   the code page fixed to CP_ACP. */
PyObject *
PyUnicode_DecodeMBCSStateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    PyObject *decoded;

    decoded = decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
    return decoded;
}
PyObject *
PyUnicode_DecodeMBCS(const char *s,
Py_ssize_t size,
@ -7054,105 +7255,342 @@ PyUnicode_DecodeMBCS(const char *s,
return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
}
/* Return the WideCharToMultiByte() flags to use for a code page and an error
   handler name ('errors' may be NULL). */
static DWORD
encode_code_page_flags(UINT code_page, const char *errors)
{
    int replacing;

    /* CP_UTF7 only supports flags=0 */
    if (code_page == CP_UTF7)
        return 0;

    /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista and later, and
       only flags=0 on older versions */
    if (code_page == CP_UTF8)
        return (winver.dwMajorVersion >= 6) ? WC_ERR_INVALID_CHARS : 0;

    /* Other code pages: no flag for the "replace" handler,
       WC_NO_BEST_FIT_CHARS for every other handler. */
    replacing = (errors != NULL && strcmp(errors, "replace") == 0);
    return replacing ? 0 : WC_NO_BEST_FIT_CHARS;
}
/*
* Convert unicode into string object (MBCS).
* Returns 0 if succeed, -1 otherwise.
* Encode a Unicode string to a Windows code page into a byte string in strict
* mode.
*
* Returns consumed characters if succeed, returns -2 on encode error, or raise
* a WindowsError and returns -1 on other error.
*/
static int
encode_mbcs(PyObject **repr,
const Py_UNICODE *p, /* unicode */
int size, /* size of unicode */
encode_code_page_strict(UINT code_page, PyObject **outbytes,
const Py_UNICODE *p, const int size,
const char* errors)
{
BOOL usedDefaultChar = FALSE;
BOOL *pusedDefaultChar;
int mbcssize;
Py_ssize_t n;
BOOL *pusedDefaultChar = &usedDefaultChar;
int outsize;
PyObject *exc = NULL;
DWORD flags;
const DWORD flags = encode_code_page_flags(code_page, NULL);
char *out;
assert(size >= 0);
assert(size > 0);
/* check and handle 'errors' arg */
if (errors==NULL || strcmp(errors, "strict")==0) {
flags = WC_NO_BEST_FIT_CHARS;
if (code_page != CP_UTF8 && code_page != CP_UTF7)
pusedDefaultChar = &usedDefaultChar;
} else if (strcmp(errors, "replace")==0) {
flags = 0;
else
pusedDefaultChar = NULL;
} else {
PyErr_Format(PyExc_ValueError,
"mbcs encoding does not support errors='%s'",
errors);
return -1;
}
/* First get the size of the result */
if (size > 0) {
mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
outsize = WideCharToMultiByte(code_page, flags,
p, size,
NULL, 0,
NULL, pusedDefaultChar);
if (mbcssize == 0) {
PyErr_SetFromWindowsErrWithFilename(0, NULL);
return -1;
}
if (outsize <= 0)
goto error;
/* If we used a default char, then we failed! */
if (pusedDefaultChar && *pusedDefaultChar)
goto mbcs_encode_error;
} else {
mbcssize = 0;
}
return -2;
if (*repr == NULL) {
if (*outbytes == NULL) {
/* Create string object */
*repr = PyBytes_FromStringAndSize(NULL, mbcssize);
if (*repr == NULL)
*outbytes = PyBytes_FromStringAndSize(NULL, outsize);
if (*outbytes == NULL)
return -1;
n = 0;
out = PyBytes_AS_STRING(*outbytes);
}
else {
/* Extend string object */
n = PyBytes_Size(*repr);
if (_PyBytes_Resize(repr, n + mbcssize) < 0)
const Py_ssize_t n = PyBytes_Size(*outbytes);
if (outsize > PY_SSIZE_T_MAX - n) {
PyErr_NoMemory();
return -1;
}
if (_PyBytes_Resize(outbytes, n + outsize) < 0)
return -1;
out = PyBytes_AS_STRING(*outbytes) + n;
}
/* Do the conversion */
if (size > 0) {
char *s = PyBytes_AS_STRING(*repr) + n;
if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
NULL, pusedDefaultChar)) {
PyErr_SetFromWindowsErrWithFilename(0, NULL);
return -1;
}
outsize = WideCharToMultiByte(code_page, flags,
p, size,
out, outsize,
NULL, pusedDefaultChar);
if (outsize <= 0)
goto error;
if (pusedDefaultChar && *pusedDefaultChar)
goto mbcs_encode_error;
}
return -2;
return 0;
mbcs_encode_error:
raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
Py_XDECREF(exc);
error:
if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
return -2;
PyErr_SetFromWindowsErr(0);
return -1;
}
PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Py_ssize_t size,
/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * The input is encoded one character (or one surrogate pair) at a time;
 * every character that cannot be encoded is passed to the error handler
 * named by 'errors'.  If *outbytes is NULL a new bytes object is created,
 * otherwise the encoded bytes are appended to the existing object.
 *
 * Returns 0 if succeed, or raise a WindowsError or UnicodeEncodeError
 * exception and returns -1 on error.
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        const Py_UNICODE *in, const int insize,
                        const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    const Py_UNICODE *startin = in;
    const Py_UNICODE *endin = in + insize;
    /* Reason reported in UnicodeEncodeError */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    int charsize;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    char *encoding;
    Py_ssize_t startpos, newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        make_encode_exception(&exc, encoding, in, insize, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* No "used default char" pointer is passed for CP_UTF8 and CP_UTF7 */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Reserve Py_ARRAY_LENGTH(buffer) bytes per input code unit; the object
       is resized to the real length at the end. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (in < endin)
    {
        /* A high surrogate followed by a low surrogate is encoded as one
           character. */
        if ((in + 2) <= endin
            && 0xD800 <= in[0] && in[0] <= 0xDBFF
            && 0xDC00 <= in[1] && in[1] <= 0xDFFF)
            charsize = 2;
        else
            charsize = 1;

        outsize = WideCharToMultiByte(code_page, flags,
                                      in, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            /* Success, unless the default character had to be substituted */
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                in += charsize;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* The character is not encodable: call the error handler */
        charsize = Py_MAX(charsize - 1, 1);
        startpos = in - startin;
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  startin, insize, &exc,
                  startpos, startpos + charsize, &newpos);
        if (rep == NULL)
            goto error;
        /* Resume where the error handler asked to */
        in = startin + newpos;

        if (PyBytes_Check(rep)) {
            /* Bytes replacement: copied verbatim */
            outsize = PyBytes_GET_SIZE(rep);
            if (outsize != 1) {
                /* Adjust the output size by the difference between the
                   replacement length and one byte; 'out' must be
                   recomputed because the resize may move the buffer. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            /* str replacement: only ASCII characters are accepted, each one
               is written as a single byte */
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            void *data;

            if (PyUnicode_READY(rep) < 0) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            if (outsize != 1) {
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    raise_encode_exception(&exc,
                        encoding,
                        startin, insize,
                        startpos, startpos + charsize,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Resize the bytes object to the bytes actually written */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
/*
 * Encode one chunk of a Unicode string to a Windows code page, appending to
 * *outbytes (created if it is NULL).
 *
 * The strict encoder is tried first; only when it reports an unencodable
 * character (-2) is the chunk encoded again with the error handler.  An
 * empty chunk just makes sure *outbytes exists.
 *
 * Returns the status of the encoder that handled the chunk (0 for an empty
 * chunk), or -1 with an exception set by the failing call.
 */
static int
encode_code_page_chunk(UINT code_page, PyObject **outbytes,
                       const Py_UNICODE *p, int size,
                       const char* errors)
{
    int status;

    if (size != 0) {
        status = encode_code_page_strict(code_page, outbytes, p, size, errors);
        if (status != -2)
            return status;
        return encode_code_page_errors(code_page, outbytes, p, size, errors);
    }

    /* Nothing to encode: an existing result is left untouched */
    if (*outbytes != NULL)
        return 0;
    *outbytes = PyBytes_FromStringAndSize(NULL, 0);
    return (*outbytes == NULL) ? -1 : 0;
}
static PyObject *
encode_code_page(int code_page,
const Py_UNICODE *p, Py_ssize_t size,
const char *errors)
{
PyObject *outbytes = NULL;
int ret;
if (code_page < 0) {
PyErr_SetString(PyExc_ValueError, "invalid code page number");
return NULL;
}
#ifdef NEED_RETRY
retry:
if (size > INT_MAX)
ret = encode_mbcs(&repr, p, INT_MAX, errors);
ret = encode_code_page_chunk(code_page, &outbytes, p, INT_MAX, errors);
else
#endif
ret = encode_mbcs(&repr, p, (int)size, errors);
ret = encode_code_page_chunk(code_page, &outbytes, p, (int)size, errors);
if (ret < 0) {
Py_XDECREF(repr);
Py_XDECREF(outbytes);
return NULL;
}
@ -7164,7 +7602,28 @@ PyUnicode_EncodeMBCS(const Py_UNICODE *p,
}
#endif
return repr;
return outbytes;
}
/* Encode 'size' Py_UNICODE characters using the ANSI code page: same as
   encode_code_page() with the code page fixed to CP_ACP. */
PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE *p,
                     Py_ssize_t size,
                     const char *errors)
{
    PyObject *encoded;

    encoded = encode_code_page(CP_ACP, p, size, errors);
    return encoded;
}
/* Encode a Unicode object using the Windows code page 'code_page'.  The
   object's Py_UNICODE buffer and length are fetched with
   PyUnicode_AsUnicodeAndSize() and passed to encode_code_page(); NULL is
   returned if the buffer cannot be obtained. */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    Py_ssize_t length;
    const Py_UNICODE *chars = PyUnicode_AsUnicodeAndSize(unicode, &length);

    if (chars != NULL)
        return encode_code_page(code_page, chars, length, errors);
    return NULL;
}
PyObject *
@ -13434,7 +13893,7 @@ PyTypeObject PyUnicode_Type = {
/* Initialize the Unicode implementation */
void _PyUnicode_Init(void)
int _PyUnicode_Init(void)
{
int i;
@ -13467,6 +13926,15 @@ void _PyUnicode_Init(void)
Py_ARRAY_LENGTH(linebreak));
PyType_Ready(&EncodingMapType);
#ifdef HAVE_MBCS
winver.dwOSVersionInfoSize = sizeof(winver);
if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
PyErr_SetFromWindowsErr(0);
return -1;
}
#endif
return 0;
}
/* Finalize the Unicode implementation */

View File

@ -67,7 +67,7 @@ static void initsigs(void);
static void call_py_exitfuncs(void);
static void wait_for_thread_shutdown(void);
static void call_ll_exitfuncs(void);
extern void _PyUnicode_Init(void);
extern int _PyUnicode_Init(void);
extern void _PyUnicode_Fini(void);
extern int _PyLong_Init(void);
extern void PyLong_Fini(void);
@ -261,7 +261,8 @@ Py_InitializeEx(int install_sigs)
Py_FatalError("Py_Initialize: can't make modules_reloading dictionary");
/* Init Unicode implementation; relies on the codec registry */
_PyUnicode_Init();
if (_PyUnicode_Init() < 0)
Py_FatalError("Py_Initialize: can't initialize unicode");
bimod = _PyBuiltin_Init();
if (bimod == NULL)