mirror of https://github.com/python/cpython
Issue #850997: the mbcs encoding (Windows only) now honours the errors argument: strict
mode raises UnicodeEncodeError/UnicodeDecodeError. The encoder only supports "strict" and "replace" error handlers; the decoder only supports "strict" and "ignore" error handlers.
This commit is contained in:
parent
79ee19f3db
commit
554f3f0081
|
@ -1223,6 +1223,23 @@ functions can be used directly if desired.
|
||||||
Convert a label to Unicode, as specified in :rfc:`3490`.
|
Convert a label to Unicode, as specified in :rfc:`3490`.
|
||||||
|
|
||||||
|
|
||||||
|
:mod:`encodings.mbcs` --- Windows ANSI codepage
|
||||||
|
-----------------------------------------------
|
||||||
|
|
||||||
|
.. module:: encodings.mbcs
|
||||||
|
:synopsis: Windows ANSI codepage
|
||||||
|
|
||||||
|
Encode or decode the operand according to the ANSI codepage (CP_ACP). This codec only
|
||||||
|
supports ``'strict'`` and ``'replace'`` error handlers to encode, and
|
||||||
|
``'strict'`` and ``'ignore'`` error handlers to decode.
|
||||||
|
|
||||||
|
Availability: Windows only.
|
||||||
|
|
||||||
|
.. versionchanged:: 3.2
|
||||||
|
Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used
|
||||||
|
to encode, and ``'ignore'`` to decode.
|
||||||
|
|
||||||
|
|
||||||
:mod:`encodings.utf_8_sig` --- UTF-8 codec with BOM signature
|
:mod:`encodings.utf_8_sig` --- UTF-8 codec with BOM signature
|
||||||
-------------------------------------------------------------
|
-------------------------------------------------------------
|
||||||
|
|
||||||
|
|
|
@ -265,7 +265,7 @@ except ImportError:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
if _os.name in ("nt", "ce"):
|
if _os.name in ("nt", "ce"):
|
||||||
set_conversion_mode("mbcs", "ignore")
|
set_conversion_mode("mbcs", "strict")
|
||||||
else:
|
else:
|
||||||
set_conversion_mode("ascii", "strict")
|
set_conversion_mode("ascii", "strict")
|
||||||
|
|
||||||
|
|
|
@ -1358,11 +1358,6 @@ broken_incremental_coders = broken_unicode_with_streams + [
|
||||||
"idna",
|
"idna",
|
||||||
]
|
]
|
||||||
|
|
||||||
# The following encodings only support "strict" mode
|
|
||||||
only_strict_mode = [
|
|
||||||
"idna",
|
|
||||||
]
|
|
||||||
|
|
||||||
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
|
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
|
||||||
def test_basics(self):
|
def test_basics(self):
|
||||||
s = "abc123" # all codecs should be able to encode these
|
s = "abc123" # all codecs should be able to encode these
|
||||||
|
@ -1437,7 +1432,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
|
||||||
result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
|
result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
|
||||||
self.assertEqual(result, "")
|
self.assertEqual(result, "")
|
||||||
|
|
||||||
if encoding not in only_strict_mode:
|
if encoding not in ("idna", "mbcs"):
|
||||||
# check incremental decoder/encoder with errors argument
|
# check incremental decoder/encoder with errors argument
|
||||||
try:
|
try:
|
||||||
encoder = codecs.getincrementalencoder(encoding)("ignore")
|
encoder = codecs.getincrementalencoder(encoding)("ignore")
|
||||||
|
|
|
@ -12,6 +12,11 @@ What's New in Python 3.2 Alpha 1?
|
||||||
Core and Builtins
|
Core and Builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- Issue #850997: mbcs encoding (Windows only) handles errors argument: strict
|
||||||
|
mode raises unicode errors. The encoder only supports "strict" and "replace"
|
||||||
|
error handlers, the decoder only supports "strict" and "ignore" error
|
||||||
|
handlers.
|
||||||
|
|
||||||
- Issue #8592: PyArg_Parse*() functions raise a TypeError for "y", "u" and "Z"
|
- Issue #8592: PyArg_Parse*() functions raise a TypeError for "y", "u" and "Z"
|
||||||
formats if the string contains a null byte/character. Write unit tests for
|
formats if the string contains a null byte/character. Write unit tests for
|
||||||
string formats.
|
string formats.
|
||||||
|
|
|
@ -1767,6 +1767,33 @@ int PyUnicode_SetDefaultEncoding(const char *encoding)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* create or adjust a UnicodeDecodeError */
|
||||||
|
static void
|
||||||
|
make_decode_exception(PyObject **exceptionObject,
|
||||||
|
const char *encoding,
|
||||||
|
const char *input, Py_ssize_t length,
|
||||||
|
Py_ssize_t startpos, Py_ssize_t endpos,
|
||||||
|
const char *reason)
|
||||||
|
{
|
||||||
|
if (*exceptionObject == NULL) {
|
||||||
|
*exceptionObject = PyUnicodeDecodeError_Create(
|
||||||
|
encoding, input, length, startpos, endpos, reason);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
|
||||||
|
goto onError;
|
||||||
|
if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
|
||||||
|
goto onError;
|
||||||
|
if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
|
||||||
|
goto onError;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
|
||||||
|
onError:
|
||||||
|
Py_DECREF(*exceptionObject);
|
||||||
|
*exceptionObject = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
/* error handling callback helper:
|
/* error handling callback helper:
|
||||||
build arguments, call the callback and check the arguments,
|
build arguments, call the callback and check the arguments,
|
||||||
if no exception occurred, copy the replacement to the output
|
if no exception occurred, copy the replacement to the output
|
||||||
|
@ -1800,20 +1827,13 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
|
||||||
goto onError;
|
goto onError;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (*exceptionObject == NULL) {
|
make_decode_exception(exceptionObject,
|
||||||
*exceptionObject = PyUnicodeDecodeError_Create(
|
encoding,
|
||||||
encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
|
*input, *inend - *input,
|
||||||
if (*exceptionObject == NULL)
|
*startinpos, *endinpos,
|
||||||
goto onError;
|
reason);
|
||||||
}
|
if (*exceptionObject == NULL)
|
||||||
else {
|
goto onError;
|
||||||
if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
|
|
||||||
goto onError;
|
|
||||||
if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
|
|
||||||
goto onError;
|
|
||||||
if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
|
|
||||||
goto onError;
|
|
||||||
}
|
|
||||||
|
|
||||||
restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
|
restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
|
||||||
if (restuple == NULL)
|
if (restuple == NULL)
|
||||||
|
@ -4552,32 +4572,46 @@ static int is_dbcs_lead_byte(const char *s, int offset)
|
||||||
static int decode_mbcs(PyUnicodeObject **v,
|
static int decode_mbcs(PyUnicodeObject **v,
|
||||||
const char *s, /* MBCS string */
|
const char *s, /* MBCS string */
|
||||||
int size, /* sizeof MBCS string */
|
int size, /* sizeof MBCS string */
|
||||||
int final)
|
int final,
|
||||||
|
const char *errors)
|
||||||
{
|
{
|
||||||
Py_UNICODE *p;
|
Py_UNICODE *p;
|
||||||
Py_ssize_t n = 0;
|
Py_ssize_t n;
|
||||||
int usize = 0;
|
DWORD usize;
|
||||||
|
DWORD flags;
|
||||||
|
|
||||||
assert(size >= 0);
|
assert(size >= 0);
|
||||||
|
|
||||||
|
/* check and handle 'errors' arg */
|
||||||
|
if (errors==NULL || strcmp(errors, "strict")==0)
|
||||||
|
flags = MB_ERR_INVALID_CHARS;
|
||||||
|
else if (strcmp(errors, "ignore")==0)
|
||||||
|
flags = 0;
|
||||||
|
else {
|
||||||
|
PyErr_Format(PyExc_ValueError,
|
||||||
|
"mbcs encoding does not support errors='%s'",
|
||||||
|
errors);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
/* Skip trailing lead-byte unless 'final' is set */
|
/* Skip trailing lead-byte unless 'final' is set */
|
||||||
if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
|
if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
|
||||||
--size;
|
--size;
|
||||||
|
|
||||||
/* First get the size of the result */
|
/* First get the size of the result */
|
||||||
if (size > 0) {
|
if (size > 0) {
|
||||||
usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
|
usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
|
||||||
if (usize == 0) {
|
if (usize==0)
|
||||||
PyErr_SetFromWindowsErrWithFilename(0, NULL);
|
goto mbcs_decode_error;
|
||||||
return -1;
|
} else
|
||||||
}
|
usize = 0;
|
||||||
}
|
|
||||||
|
|
||||||
if (*v == NULL) {
|
if (*v == NULL) {
|
||||||
/* Create unicode object */
|
/* Create unicode object */
|
||||||
*v = _PyUnicode_New(usize);
|
*v = _PyUnicode_New(usize);
|
||||||
if (*v == NULL)
|
if (*v == NULL)
|
||||||
return -1;
|
return -1;
|
||||||
|
n = 0;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
/* Extend unicode object */
|
/* Extend unicode object */
|
||||||
|
@ -4587,15 +4621,35 @@ static int decode_mbcs(PyUnicodeObject **v,
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Do the conversion */
|
/* Do the conversion */
|
||||||
if (size > 0) {
|
if (usize > 0) {
|
||||||
p = PyUnicode_AS_UNICODE(*v) + n;
|
p = PyUnicode_AS_UNICODE(*v) + n;
|
||||||
if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
|
if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
|
||||||
PyErr_SetFromWindowsErrWithFilename(0, NULL);
|
goto mbcs_decode_error;
|
||||||
return -1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return size;
|
return size;
|
||||||
|
|
||||||
|
mbcs_decode_error:
|
||||||
|
/* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
|
||||||
|
we raise a UnicodeDecodeError - else it is a 'generic'
|
||||||
|
windows error
|
||||||
|
*/
|
||||||
|
if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
|
||||||
|
/* Ideally, we should get reason from FormatMessage - this
|
||||||
|
is the Windows 2000 English version of the message
|
||||||
|
*/
|
||||||
|
PyObject *exc = NULL;
|
||||||
|
const char *reason = "No mapping for the Unicode character exists "
|
||||||
|
"in the target multi-byte code page.";
|
||||||
|
make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
|
||||||
|
if (exc != NULL) {
|
||||||
|
PyCodec_StrictErrors(exc);
|
||||||
|
Py_DECREF(exc);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
PyErr_SetFromWindowsErrWithFilename(0, NULL);
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
|
PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
|
||||||
|
@ -4612,10 +4666,10 @@ PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
|
||||||
#ifdef NEED_RETRY
|
#ifdef NEED_RETRY
|
||||||
retry:
|
retry:
|
||||||
if (size > INT_MAX)
|
if (size > INT_MAX)
|
||||||
done = decode_mbcs(&v, s, INT_MAX, 0);
|
done = decode_mbcs(&v, s, INT_MAX, 0, errors);
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
done = decode_mbcs(&v, s, (int)size, !consumed);
|
done = decode_mbcs(&v, s, (int)size, !consumed, errors);
|
||||||
|
|
||||||
if (done < 0) {
|
if (done < 0) {
|
||||||
Py_XDECREF(v);
|
Py_XDECREF(v);
|
||||||
|
@ -4649,20 +4703,45 @@ PyObject *PyUnicode_DecodeMBCS(const char *s,
|
||||||
*/
|
*/
|
||||||
static int encode_mbcs(PyObject **repr,
|
static int encode_mbcs(PyObject **repr,
|
||||||
const Py_UNICODE *p, /* unicode */
|
const Py_UNICODE *p, /* unicode */
|
||||||
int size) /* size of unicode */
|
int size, /* size of unicode */
|
||||||
|
const char* errors)
|
||||||
{
|
{
|
||||||
int mbcssize = 0;
|
BOOL usedDefaultChar = FALSE;
|
||||||
Py_ssize_t n = 0;
|
BOOL *pusedDefaultChar;
|
||||||
|
int mbcssize;
|
||||||
|
Py_ssize_t n;
|
||||||
|
PyObject *exc = NULL;
|
||||||
|
DWORD flags;
|
||||||
|
|
||||||
assert(size >= 0);
|
assert(size >= 0);
|
||||||
|
|
||||||
|
/* check and handle 'errors' arg */
|
||||||
|
if (errors==NULL || strcmp(errors, "strict")==0) {
|
||||||
|
flags = WC_NO_BEST_FIT_CHARS;
|
||||||
|
pusedDefaultChar = &usedDefaultChar;
|
||||||
|
} else if (strcmp(errors, "replace")==0) {
|
||||||
|
flags = 0;
|
||||||
|
pusedDefaultChar = NULL;
|
||||||
|
} else {
|
||||||
|
PyErr_Format(PyExc_ValueError,
|
||||||
|
"mbcs encoding does not support errors='%s'",
|
||||||
|
errors);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
/* First get the size of the result */
|
/* First get the size of the result */
|
||||||
if (size > 0) {
|
if (size > 0) {
|
||||||
mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
|
mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
|
||||||
|
NULL, pusedDefaultChar);
|
||||||
if (mbcssize == 0) {
|
if (mbcssize == 0) {
|
||||||
PyErr_SetFromWindowsErrWithFilename(0, NULL);
|
PyErr_SetFromWindowsErrWithFilename(0, NULL);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
/* If we used a default char, then we failed! */
|
||||||
|
if (pusedDefaultChar && *pusedDefaultChar)
|
||||||
|
goto mbcs_encode_error;
|
||||||
|
} else {
|
||||||
|
mbcssize = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (*repr == NULL) {
|
if (*repr == NULL) {
|
||||||
|
@ -4670,6 +4749,7 @@ static int encode_mbcs(PyObject **repr,
|
||||||
*repr = PyBytes_FromStringAndSize(NULL, mbcssize);
|
*repr = PyBytes_FromStringAndSize(NULL, mbcssize);
|
||||||
if (*repr == NULL)
|
if (*repr == NULL)
|
||||||
return -1;
|
return -1;
|
||||||
|
n = 0;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
/* Extend string object */
|
/* Extend string object */
|
||||||
|
@ -4681,13 +4761,20 @@ static int encode_mbcs(PyObject **repr,
|
||||||
/* Do the conversion */
|
/* Do the conversion */
|
||||||
if (size > 0) {
|
if (size > 0) {
|
||||||
char *s = PyBytes_AS_STRING(*repr) + n;
|
char *s = PyBytes_AS_STRING(*repr) + n;
|
||||||
if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
|
if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
|
||||||
|
NULL, pusedDefaultChar)) {
|
||||||
PyErr_SetFromWindowsErrWithFilename(0, NULL);
|
PyErr_SetFromWindowsErrWithFilename(0, NULL);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
if (pusedDefaultChar && *pusedDefaultChar)
|
||||||
|
goto mbcs_encode_error;
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
mbcs_encode_error:
|
||||||
|
raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
|
||||||
|
Py_XDECREF(exc);
|
||||||
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
|
PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
|
||||||
|
@ -4700,10 +4787,10 @@ PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
|
||||||
#ifdef NEED_RETRY
|
#ifdef NEED_RETRY
|
||||||
retry:
|
retry:
|
||||||
if (size > INT_MAX)
|
if (size > INT_MAX)
|
||||||
ret = encode_mbcs(&repr, p, INT_MAX);
|
ret = encode_mbcs(&repr, p, INT_MAX, errors);
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
ret = encode_mbcs(&repr, p, (int)size);
|
ret = encode_mbcs(&repr, p, (int)size, errors);
|
||||||
|
|
||||||
if (ret < 0) {
|
if (ret < 0) {
|
||||||
Py_XDECREF(repr);
|
Py_XDECREF(repr);
|
||||||
|
|
Loading…
Reference in New Issue