Issue #850997: mbcs encoding (Windows only) handles errors argument

In strict mode, the mbcs codec now raises Unicode errors. The encoder only
supports the "strict" and "replace" error handlers; the decoder only supports
the "strict" and "ignore" error handlers.
This commit is contained in:
Victor Stinner 2010-06-16 23:33:54 +00:00
parent 79ee19f3db
commit 554f3f0081
5 changed files with 149 additions and 45 deletions

View File

@ -1223,6 +1223,23 @@ functions can be used directly if desired.
Convert a label to Unicode, as specified in :rfc:`3490`. Convert a label to Unicode, as specified in :rfc:`3490`.
:mod:`encodings.mbcs` --- Windows ANSI codepage
-----------------------------------------------
.. module:: encodings.mbcs
:synopsis: Windows ANSI codepage
Encode operand according to the ANSI codepage (CP_ACP). This codec only
supports ``'strict'`` and ``'replace'`` error handlers to encode, and
``'strict'`` and ``'ignore'`` error handlers to decode.
Availability: Windows only.
.. versionchanged:: 3.2
Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used
to encode, and ``'ignore'`` to decode.
:mod:`encodings.utf_8_sig` --- UTF-8 codec with BOM signature :mod:`encodings.utf_8_sig` --- UTF-8 codec with BOM signature
------------------------------------------------------------- -------------------------------------------------------------

View File

@ -265,7 +265,7 @@ except ImportError:
pass pass
else: else:
if _os.name in ("nt", "ce"): if _os.name in ("nt", "ce"):
set_conversion_mode("mbcs", "ignore") set_conversion_mode("mbcs", "strict")
else: else:
set_conversion_mode("ascii", "strict") set_conversion_mode("ascii", "strict")

View File

@ -1358,11 +1358,6 @@ broken_incremental_coders = broken_unicode_with_streams + [
"idna", "idna",
] ]
# The following encodings only support "strict" mode
only_strict_mode = [
"idna",
]
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling): class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
def test_basics(self): def test_basics(self):
s = "abc123" # all codecs should be able to encode these s = "abc123" # all codecs should be able to encode these
@ -1437,7 +1432,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding)) result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
self.assertEqual(result, "") self.assertEqual(result, "")
if encoding not in only_strict_mode: if encoding not in ("idna", "mbcs"):
# check incremental decoder/encoder with errors argument # check incremental decoder/encoder with errors argument
try: try:
encoder = codecs.getincrementalencoder(encoding)("ignore") encoder = codecs.getincrementalencoder(encoding)("ignore")

View File

@ -12,6 +12,11 @@ What's New in Python 3.2 Alpha 1?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #850997: mbcs encoding (Windows only) handles errors argument: strict
mode raises unicode errors. The encoder only supports "strict" and "replace"
error handlers, the decoder only supports "strict" and "ignore" error
handlers.
- Issue #8592: PyArg_Parse*() functions raise a TypeError for "y", "u" and "Z" - Issue #8592: PyArg_Parse*() functions raise a TypeError for "y", "u" and "Z"
formats if the string contains a null byte/character. Write unit tests for formats if the string contains a null byte/character. Write unit tests for
string formats. string formats.

View File

@ -1767,6 +1767,33 @@ int PyUnicode_SetDefaultEncoding(const char *encoding)
return 0; return 0;
} }
/* Create or adjust a UnicodeDecodeError.

   When *exceptionObject is NULL, a new exception is built from encoding,
   input/length, startpos/endpos and reason, and the result of
   PyUnicodeDecodeError_Create() is stored in *exceptionObject.

   Otherwise the existing exception is reused: its start, end and reason
   are overwritten (encoding and input are left untouched).  If any of
   those updates fails, the exception is released and *exceptionObject is
   reset to NULL.

   The function returns nothing; callers detect failure by checking
   *exceptionObject for NULL afterwards. */
static void
make_decode_exception(PyObject **exceptionObject,
                      const char *encoding,
                      const char *input, Py_ssize_t length,
                      Py_ssize_t startpos, Py_ssize_t endpos,
                      const char *reason)
{
    if (*exceptionObject != NULL) {
        /* Reuse path: the setters run in order start, end, reason and
           the chain stops at the first one that reports failure. */
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos) ||
            PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos) ||
            PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) {
            Py_DECREF(*exceptionObject);
            *exceptionObject = NULL;
        }
        return;
    }

    /* Creation path: no exception yet, build one from scratch. */
    *exceptionObject = PyUnicodeDecodeError_Create(
        encoding, input, length, startpos, endpos, reason);
}
/* error handling callback helper: /* error handling callback helper:
build arguments, call the callback and check the arguments, build arguments, call the callback and check the arguments,
if no exception occurred, copy the replacement to the output if no exception occurred, copy the replacement to the output
@ -1800,20 +1827,13 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
goto onError; goto onError;
} }
if (*exceptionObject == NULL) { make_decode_exception(exceptionObject,
*exceptionObject = PyUnicodeDecodeError_Create( encoding,
encoding, *input, *inend-*input, *startinpos, *endinpos, reason); *input, *inend - *input,
*startinpos, *endinpos,
reason);
if (*exceptionObject == NULL) if (*exceptionObject == NULL)
goto onError; goto onError;
}
else {
if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
goto onError;
if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
goto onError;
if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
goto onError;
}
restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
if (restuple == NULL) if (restuple == NULL)
@ -4552,32 +4572,46 @@ static int is_dbcs_lead_byte(const char *s, int offset)
static int decode_mbcs(PyUnicodeObject **v, static int decode_mbcs(PyUnicodeObject **v,
const char *s, /* MBCS string */ const char *s, /* MBCS string */
int size, /* sizeof MBCS string */ int size, /* sizeof MBCS string */
int final) int final,
const char *errors)
{ {
Py_UNICODE *p; Py_UNICODE *p;
Py_ssize_t n = 0; Py_ssize_t n;
int usize = 0; DWORD usize;
DWORD flags;
assert(size >= 0); assert(size >= 0);
/* check and handle 'errors' arg */
if (errors==NULL || strcmp(errors, "strict")==0)
flags = MB_ERR_INVALID_CHARS;
else if (strcmp(errors, "ignore")==0)
flags = 0;
else {
PyErr_Format(PyExc_ValueError,
"mbcs encoding does not support errors='%s'",
errors);
return -1;
}
/* Skip trailing lead-byte unless 'final' is set */ /* Skip trailing lead-byte unless 'final' is set */
if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
--size; --size;
/* First get the size of the result */ /* First get the size of the result */
if (size > 0) { if (size > 0) {
usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
if (usize == 0) { if (usize==0)
PyErr_SetFromWindowsErrWithFilename(0, NULL); goto mbcs_decode_error;
return -1; } else
} usize = 0;
}
if (*v == NULL) { if (*v == NULL) {
/* Create unicode object */ /* Create unicode object */
*v = _PyUnicode_New(usize); *v = _PyUnicode_New(usize);
if (*v == NULL) if (*v == NULL)
return -1; return -1;
n = 0;
} }
else { else {
/* Extend unicode object */ /* Extend unicode object */
@ -4587,15 +4621,35 @@ static int decode_mbcs(PyUnicodeObject **v,
} }
/* Do the conversion */ /* Do the conversion */
if (size > 0) { if (usize > 0) {
p = PyUnicode_AS_UNICODE(*v) + n; p = PyUnicode_AS_UNICODE(*v) + n;
if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
PyErr_SetFromWindowsErrWithFilename(0, NULL); goto mbcs_decode_error;
return -1;
} }
} }
return size; return size;
mbcs_decode_error:
/* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
we raise a UnicodeDecodeError - else it is a 'generic'
windows error
*/
if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
/* Ideally, we should get reason from FormatMessage - this
is the Windows 2000 English version of the message
*/
PyObject *exc = NULL;
const char *reason = "No mapping for the Unicode character exists "
"in the target multi-byte code page.";
make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
if (exc != NULL) {
PyCodec_StrictErrors(exc);
Py_DECREF(exc);
}
} else {
PyErr_SetFromWindowsErrWithFilename(0, NULL);
}
return -1;
} }
PyObject *PyUnicode_DecodeMBCSStateful(const char *s, PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
@ -4612,10 +4666,10 @@ PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
#ifdef NEED_RETRY #ifdef NEED_RETRY
retry: retry:
if (size > INT_MAX) if (size > INT_MAX)
done = decode_mbcs(&v, s, INT_MAX, 0); done = decode_mbcs(&v, s, INT_MAX, 0, errors);
else else
#endif #endif
done = decode_mbcs(&v, s, (int)size, !consumed); done = decode_mbcs(&v, s, (int)size, !consumed, errors);
if (done < 0) { if (done < 0) {
Py_XDECREF(v); Py_XDECREF(v);
@ -4649,20 +4703,45 @@ PyObject *PyUnicode_DecodeMBCS(const char *s,
*/ */
static int encode_mbcs(PyObject **repr, static int encode_mbcs(PyObject **repr,
const Py_UNICODE *p, /* unicode */ const Py_UNICODE *p, /* unicode */
int size) /* size of unicode */ int size, /* size of unicode */
const char* errors)
{ {
int mbcssize = 0; BOOL usedDefaultChar = FALSE;
Py_ssize_t n = 0; BOOL *pusedDefaultChar;
int mbcssize;
Py_ssize_t n;
PyObject *exc = NULL;
DWORD flags;
assert(size >= 0); assert(size >= 0);
/* check and handle 'errors' arg */
if (errors==NULL || strcmp(errors, "strict")==0) {
flags = WC_NO_BEST_FIT_CHARS;
pusedDefaultChar = &usedDefaultChar;
} else if (strcmp(errors, "replace")==0) {
flags = 0;
pusedDefaultChar = NULL;
} else {
PyErr_Format(PyExc_ValueError,
"mbcs encoding does not support errors='%s'",
errors);
return -1;
}
/* First get the size of the result */ /* First get the size of the result */
if (size > 0) { if (size > 0) {
mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
NULL, pusedDefaultChar);
if (mbcssize == 0) { if (mbcssize == 0) {
PyErr_SetFromWindowsErrWithFilename(0, NULL); PyErr_SetFromWindowsErrWithFilename(0, NULL);
return -1; return -1;
} }
/* If we used a default char, then we failed! */
if (pusedDefaultChar && *pusedDefaultChar)
goto mbcs_encode_error;
} else {
mbcssize = 0;
} }
if (*repr == NULL) { if (*repr == NULL) {
@ -4670,6 +4749,7 @@ static int encode_mbcs(PyObject **repr,
*repr = PyBytes_FromStringAndSize(NULL, mbcssize); *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
if (*repr == NULL) if (*repr == NULL)
return -1; return -1;
n = 0;
} }
else { else {
/* Extend string object */ /* Extend string object */
@ -4681,13 +4761,20 @@ static int encode_mbcs(PyObject **repr,
/* Do the conversion */ /* Do the conversion */
if (size > 0) { if (size > 0) {
char *s = PyBytes_AS_STRING(*repr) + n; char *s = PyBytes_AS_STRING(*repr) + n;
if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
NULL, pusedDefaultChar)) {
PyErr_SetFromWindowsErrWithFilename(0, NULL); PyErr_SetFromWindowsErrWithFilename(0, NULL);
return -1; return -1;
} }
if (pusedDefaultChar && *pusedDefaultChar)
goto mbcs_encode_error;
} }
return 0; return 0;
mbcs_encode_error:
raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
Py_XDECREF(exc);
return -1;
} }
PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
@ -4700,10 +4787,10 @@ PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
#ifdef NEED_RETRY #ifdef NEED_RETRY
retry: retry:
if (size > INT_MAX) if (size > INT_MAX)
ret = encode_mbcs(&repr, p, INT_MAX); ret = encode_mbcs(&repr, p, INT_MAX, errors);
else else
#endif #endif
ret = encode_mbcs(&repr, p, (int)size); ret = encode_mbcs(&repr, p, (int)size, errors);
if (ret < 0) { if (ret < 0) {
Py_XDECREF(repr); Py_XDECREF(repr);