Issue #850997: mbcs encoding (Windows only) handles errors argument: strict mode raises unicode errors.

The encoder only supports "strict" and "replace" error handlers; the decoder only supports "strict" and "ignore" error handlers.
2010-06-16 23:33:54 +00:00 · 2010-06-16 23:33:54 +00:00 · 554f3f0081
parent 79ee19f3db
commit 554f3f0081
5 changed files with 149 additions and 45 deletions
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@ -1223,6 +1223,23 @@ functions can be used directly if desired.
   Convert a label to Unicode, as specified in :rfc:`3490`.
 :mod:`encodings.mbcs` --- Windows ANSI codepage
 -----------------------------------------------
 .. module:: encodings.mbcs
   :synopsis: Windows ANSI codepage
 Encode operand according to the ANSI codepage (CP_ACP). This codec only
 supports ``'strict'`` and ``'replace'`` error handlers to encode, and
 ``'strict'`` and ``'ignore'`` error handlers to decode.
 Availability: Windows only.
 .. versionchanged:: 3.2
   Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used
   to encode, and ``'ignore'`` to decode.
 :mod:`encodings.utf_8_sig` --- UTF-8 codec with BOM signature
 -------------------------------------------------------------
--- a/Lib/ctypes/__init__.py
+++ b/Lib/ctypes/__init__.py
@ -265,7 +265,7 @@ except ImportError:
    pass
 else:
    if _os.name in ("nt", "ce"):
-        set_conversion_mode("mbcs", "ignore")
+        set_conversion_mode("mbcs", "strict")
    else:
        set_conversion_mode("ascii", "strict")
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@ -1358,11 +1358,6 @@ broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
 ]
 # The following encodings only support "strict" mode
 only_strict_mode = [
    "idna",
 ]
 class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    def test_basics(self):
        s = "abc123" # all codecs should be able to encode these
@ -1437,7 +1432,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")
-                if encoding not in only_strict_mode:
+                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -12,6 +12,11 @@ What's New in Python 3.2 Alpha 1?
 Core and Builtins
 -----------------
 - Issue #850997: mbcs encoding (Windows only) handles errors argument: strict
  mode raises unicode errors. The encoder only supports "strict" and "replace"
  error handlers, the decoder only supports "strict" and "ignore" error
  handlers.
 - Issue #8592: PyArg_Parse*() functions raise a TypeError for "y", "u" and "Z"
  formats if the string contains a null byte/character. Write unit tests for
  string formats.
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -1767,6 +1767,33 @@ int PyUnicode_SetDefaultEncoding(const char *encoding)
    return 0;
 }
 /* Make *exceptionObject describe a decoding failure.

    If *exceptionObject is NULL, a new UnicodeDecodeError is created from
    encoding, input/length, the [startpos, endpos) range and reason; the
    result of that creation (possibly NULL) is stored in *exceptionObject.

    Otherwise the existing exception object is updated in place with the
    new start, end and reason.  If any of those updates fails, the
    reference is dropped and *exceptionObject is reset to NULL.

    Callers detect failure by checking *exceptionObject for NULL. */
 static void
 make_decode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       const char *input, Py_ssize_t length,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason)
 {
    PyObject *current = *exceptionObject;
    /* No exception yet: build a fresh one and hand it back as-is. */
    if (current == NULL) {
        *exceptionObject = PyUnicodeDecodeError_Create(
            encoding, input, length, startpos, endpos, reason);
        return;
    }
    /* Reuse the existing exception; each setter returns non-zero on
       failure, and the chain stops at the first one that fails. */
    if (PyUnicodeDecodeError_SetStart(current, startpos) == 0
        && PyUnicodeDecodeError_SetEnd(current, endpos) == 0
        && PyUnicodeDecodeError_SetReason(current, reason) == 0)
        return;
    /* An update failed: release the object and signal the error. */
    Py_DECREF(current);
    *exceptionObject = NULL;
 }
 /* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
@ -1800,20 +1827,13 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
            goto onError;
    }
-    if (*exceptionObject == NULL) {
+    make_decode_exception(exceptionObject,
-        *exceptionObject = PyUnicodeDecodeError_Create(
+        encoding,
-            encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
+        *input, *inend - *input,
-        if (*exceptionObject == NULL)
+        *startinpos, *endinpos,
-            goto onError;
+        reason);
-    }
+    if (*exceptionObject == NULL)
-    else {
+        goto onError;
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
            goto onError;
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
            goto onError;
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
            goto onError;
    }
    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
@ -4552,32 +4572,46 @@ static int is_dbcs_lead_byte(const char *s, int offset)
 static int decode_mbcs(PyUnicodeObject **v,
                       const char *s, /* MBCS string */
                       int size, /* sizeof MBCS string */
-                       int final)
+                       int final,
                       const char *errors)
 {
    Py_UNICODE *p;
-    Py_ssize_t n = 0;
+    Py_ssize_t n;
-    int usize = 0;
+    DWORD usize;
    DWORD flags;
    assert(size >= 0);
    /* check and handle 'errors' arg */
    if (errors==NULL || strcmp(errors, "strict")==0)
        flags = MB_ERR_INVALID_CHARS;
    else if (strcmp(errors, "ignore")==0)
        flags = 0;
    else {
        PyErr_Format(PyExc_ValueError,
                     "mbcs encoding does not support errors='%s'",
                     errors);
        return -1;
    }
    /* Skip trailing lead-byte unless 'final' is set */
    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
        --size;
    /* First get the size of the result */
    if (size > 0) {
-        usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
+        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
-        if (usize == 0) {
+        if (usize==0)
-            PyErr_SetFromWindowsErrWithFilename(0, NULL);
+            goto mbcs_decode_error;
-            return -1;
+    } else
-        }
+        usize = 0;
    }
    if (*v == NULL) {
        /* Create unicode object */
        *v = _PyUnicode_New(usize);
        if (*v == NULL)
            return -1;
        n = 0;
    }
    else {
        /* Extend unicode object */
@ -4587,15 +4621,35 @@ static int decode_mbcs(PyUnicodeObject **v,
    }
    /* Do the conversion */
-    if (size > 0) {
+    if (usize > 0) {
        p = PyUnicode_AS_UNICODE(*v) + n;
-        if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
+        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
-            PyErr_SetFromWindowsErrWithFilename(0, NULL);
+            goto mbcs_decode_error;
            return -1;
        }
    }
    return size;
 mbcs_decode_error:
    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
       we raise a UnicodeDecodeError - else it is a 'generic'
       windows error
     */
    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
        /* Ideally, we should get reason from FormatMessage - this
           is the Windows 2000 English version of the message
        */
        PyObject *exc = NULL;
        const char *reason = "No mapping for the Unicode character exists "
                             "in the target multi-byte code page.";
        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
    } else {
        PyErr_SetFromWindowsErrWithFilename(0, NULL);
    }
    return -1;
 }
 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
@ -4612,10 +4666,10 @@ PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
 #ifdef NEED_RETRY
  retry:
    if (size > INT_MAX)
-        done = decode_mbcs(&v, s, INT_MAX, 0);
+        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
    else
 #endif
-        done = decode_mbcs(&v, s, (int)size, !consumed);
+        done = decode_mbcs(&v, s, (int)size, !consumed, errors);
    if (done < 0) {
        Py_XDECREF(v);
@ -4649,20 +4703,45 @@ PyObject *PyUnicode_DecodeMBCS(const char *s,
 */
 static int encode_mbcs(PyObject **repr,
                       const Py_UNICODE *p, /* unicode */
-                       int size) /* size of unicode */
+                       int size, /* size of unicode */
                       const char* errors)
 {
-    int mbcssize = 0;
+    BOOL usedDefaultChar = FALSE;
-    Py_ssize_t n = 0;
+    BOOL *pusedDefaultChar;
    int mbcssize;
    Py_ssize_t n;
    PyObject *exc = NULL;
    DWORD flags;
    assert(size >= 0);
    /* check and handle 'errors' arg */
    if (errors==NULL || strcmp(errors, "strict")==0) {
        flags = WC_NO_BEST_FIT_CHARS;
        pusedDefaultChar = &usedDefaultChar;
    } else if (strcmp(errors, "replace")==0) {
        flags = 0;
        pusedDefaultChar = NULL;
    } else {
         PyErr_Format(PyExc_ValueError,
                      "mbcs encoding does not support errors='%s'",
                      errors);
         return -1;
    }
    /* First get the size of the result */
    if (size > 0) {
-        mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
+        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
                                       NULL, pusedDefaultChar);
        if (mbcssize == 0) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
        /* If we used a default char, then we failed! */
        if (pusedDefaultChar && *pusedDefaultChar)
            goto mbcs_encode_error;
    } else {
        mbcssize = 0;
    }
    if (*repr == NULL) {
@ -4670,6 +4749,7 @@ static int encode_mbcs(PyObject **repr,
        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
        if (*repr == NULL)
            return -1;
        n = 0;
    }
    else {
        /* Extend string object */
@ -4681,13 +4761,20 @@ static int encode_mbcs(PyObject **repr,
    /* Do the conversion */
    if (size > 0) {
        char *s = PyBytes_AS_STRING(*repr) + n;
-        if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
+        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
                                     NULL, pusedDefaultChar)) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
        if (pusedDefaultChar && *pusedDefaultChar)
            goto mbcs_encode_error;
    }
    return 0;
 mbcs_encode_error:
    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
    Py_XDECREF(exc);
    return -1;
 }
 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
@ -4700,10 +4787,10 @@ PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
 #ifdef NEED_RETRY
  retry:
    if (size > INT_MAX)
-        ret = encode_mbcs(&repr, p, INT_MAX);
+        ret = encode_mbcs(&repr, p, INT_MAX, errors);
    else
 #endif
-        ret = encode_mbcs(&repr, p, (int)size);
+        ret = encode_mbcs(&repr, p, (int)size, errors);
    if (ret < 0) {
        Py_XDECREF(repr);