Issue #5915: Implement PEP 383, Non-decodable Bytes in

System Character Interfaces.
2009-05-05 04:43:17 +00:00 · 2009-05-05 04:43:17 +00:00 · 011e842033
parent 93f65a177b
commit 011e842033
15 changed files with 726 additions and 289 deletions
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@ -322,6 +322,8 @@ and implemented by all standard Python codecs:
 | ``'backslashreplace'``  | Replace with backslashed escape sequences     |
 |                         | (only for encoding).                          |
 +-------------------------+-----------------------------------------------+
+| ``'utf8b'``             | Replace byte with surrogate U+DCxx.           |
+-------------------------+-----------------------------------------------+

 In addition, the following error handlers are specific to a single codec:

@ -333,7 +335,7 @@ In addition, the following error handlers are specific to a single codec:
 +------------------+---------+--------------------------------------------+

 .. versionadded:: 3.1
-   The ``'surrogates'`` error handler.
+   The ``'utf8b'`` and ``'surrogates'`` error handlers.

 The set of allowed values can be extended via :meth:`register_error`.

--- a/Doc/library/os.rst
+++ b/Doc/library/os.rst
@ -51,6 +51,30 @@ the :mod:`os` module, but using them is of course a threat to portability!
   ``'ce'``, ``'java'``.


+.. _os-filenames:
+
+File Names, Command Line Arguments, and Environment Variables
+-------------------------------------------------------------
+
+In Python, file names, command line arguments, and environment
+variables are represented using the string type. On some systems,
+decoding these strings to and from bytes is necessary before passing
+them to the operating system. Python uses the file system encoding to
+perform this conversion (see :func:`sys.getfilesystemencoding`).
+
+.. versionchanged:: 3.1
+   On some systems, conversion using the file system encoding may
+   fail. In this case, Python uses the ``utf8b`` encoding error
+   handler, which means that undecodable bytes are replaced by a
+   Unicode character U+DCxx on decoding, and these are again
+   translated to the original byte on encoding.
+
+
+The file system encoding must guarantee to successfully decode all
+bytes below 128. If the file system encoding fails to provide this
+guarantee, API functions may raise UnicodeErrors.
+
+
 .. _os-procinfo:

 Process Parameters
@ -688,12 +712,8 @@ Files and Directories

 .. function:: getcwd()

-   Return a string representing the current working directory.  On Unix
-   platforms, this function may raise :exc:`UnicodeDecodeError` if the name of
-   the current directory is not decodable in the file system encoding.  Use
-   :func:`getcwdb` if you need the call to never fail. Availability: Unix,
-   Windows.
-
+   Return a string representing the current working directory.
+   Availability: Unix, Windows.

 .. function:: getcwdb()

@ -800,10 +820,8 @@ Files and Directories
   entries ``'.'`` and ``'..'`` even if they are present in the directory.
   Availability: Unix, Windows.

-   This function can be called with a bytes or string argument.  In the bytes
-   case, all filenames will be listed as returned by the underlying API.  In the
-   string case, filenames will be decoded using the file system encoding, and
-   skipped if a decoding error occurs.
+   This function can be called with a bytes or string argument, and returns
+   filenames of the same datatype.


 .. function:: lstat(path)
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@ -198,6 +198,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize
 # define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
 # define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
+# define PyUnicode_FSConverter PyUnicodeUCS2_FSConverter
 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
 # define PyUnicode_GetMax PyUnicodeUCS2_GetMax
 # define PyUnicode_GetSize PyUnicodeUCS2_GetSize
@ -296,6 +297,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize
 # define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
 # define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
+# define PyUnicode_FSConverter PyUnicodeUCS4_FSConverter
 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
 # define PyUnicode_GetMax PyUnicodeUCS4_GetMax
 # define PyUnicode_GetSize PyUnicodeUCS4_GetSize
@ -693,25 +695,6 @@ PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
    PyObject *unicode,
    const char *errors);

-/* Decode a null-terminated string using Py_FileSystemDefaultEncoding.
-
-   If the encoding is supported by one of the built-in codecs (i.e., UTF-8,
-   UTF-16, UTF-32, Latin-1 or MBCS), otherwise fallback to UTF-8 and replace
-   invalid characters with '?'.
-
-   The function is intended to be used for paths and file names only
-   during bootstrapping process where the codecs are not set up.
-*/
-
-PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
-    const char *s               /* encoded string */
-    );
-
-PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
-    const char *s,               /* encoded string */
-    Py_ssize_t size              /* size */
-    );
-
 /* Returns a pointer to the default encoding (normally, UTF-8) of the
   Unicode object unicode and the size of the encoded representation
   in bytes stored in *size.
@ -1252,6 +1235,33 @@ PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
    const char *errors		/* error handling */
    );

+/* --- File system encoding ---------------------------------------------- */
+
+/* ParseTuple converter which converts a Unicode object into the file
+   system encoding, using the PEP 383 error handler; bytes objects are
+   output as-is. */
+
+PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
+
+/* Decode a null-terminated string using Py_FileSystemDefaultEncoding.
+
+   If the encoding is supported by one of the built-in codecs (i.e., UTF-8,
+   UTF-16, UTF-32, Latin-1 or MBCS), otherwise fallback to UTF-8 and replace
+   invalid characters with '?'.
+
+   The function is intended to be used for paths and file names only
+   during bootstrapping process where the codecs are not set up.
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
+    const char *s               /* encoded string */
+    );
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
+    const char *s,               /* encoded string */
+    Py_ssize_t size              /* size */
+    );
+
 /* --- Methods & Slots ----------------------------------------------------

   These are capable of handling Unicode objects and strings on input
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@ -1516,6 +1516,34 @@ class TypesTest(unittest.TestCase):
        self.assertEquals(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEquals(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))

+class Utf8bTest(unittest.TestCase):
+
+    def test_utf8(self):
+        # Bad byte
+        self.assertEqual(b"foo\x80bar".decode("utf-8", "utf8b"),
+                         "foo\udc80bar")
+        self.assertEqual("foo\udc80bar".encode("utf-8", "utf8b"),
+                         b"foo\x80bar")
+        # bad-utf-8 encoded surrogate
+        self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "utf8b"),
+                         "\udced\udcb0\udc80")
+        self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "utf8b"),
+                         b"\xed\xb0\x80")
+
+    def test_ascii(self):
+        # bad byte
+        self.assertEqual(b"foo\x80bar".decode("ascii", "utf8b"),
+                         "foo\udc80bar")
+        self.assertEqual("foo\udc80bar".encode("ascii", "utf8b"),
+                         b"foo\x80bar")
+
+    def test_charmap(self):
+        # bad byte: \xa5 is unmapped in iso-8859-3
+        self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "utf8b"),
+                         "foo\udca5bar")
+        self.assertEqual("foo\udca5bar".encode("iso-8859-3", "utf8b"),
+                         b"foo\xa5bar")
+

 def test_main():
    support.run_unittest(
@ -1543,6 +1571,7 @@ def test_main():
        CharmapTest,
        WithStmtTest,
        TypesTest,
+        Utf8bTest,
    )


--- a/Lib/test/test_os.py
+++ b/Lib/test/test_os.py
@ -7,6 +7,7 @@ import errno
 import unittest
 import warnings
 import sys
+import shutil
 from test import support

 # Tests creating TESTFN
@ -698,9 +699,44 @@ if sys.platform != 'win32':
                    self.assertRaises(os.error, os.setregid, 0, 0)
                self.assertRaises(OverflowError, os.setregid, 1<<32, 0)
                self.assertRaises(OverflowError, os.setregid, 0, 1<<32)
+
+    class Pep383Tests(unittest.TestCase):
+        filenames = [b'foo\xf6bar', 'foo\xf6bar'.encode("utf-8")]
+
+        def setUp(self):
+            self.fsencoding = sys.getfilesystemencoding()
+            sys.setfilesystemencoding("utf-8")
+            self.dir = support.TESTFN
+            self.bdir = self.dir.encode("utf-8", "utf8b")
+            os.mkdir(self.dir)
+            self.unicodefn = []
+            for fn in self.filenames:
+                f = open(os.path.join(self.bdir, fn), "w")
+                f.close()
+                self.unicodefn.append(fn.decode("utf-8", "utf8b"))
+
+        def tearDown(self):
+            shutil.rmtree(self.dir)
+            sys.setfilesystemencoding(self.fsencoding)
+
+        def test_listdir(self):
+            expected = set(self.unicodefn)
+            found = set(os.listdir(support.TESTFN))
+            self.assertEquals(found, expected)
+
+        def test_open(self):
+            for fn in self.unicodefn:
+                f = open(os.path.join(self.dir, fn))
+                f.close()
+
+        def test_stat(self):
+            for fn in self.unicodefn:
+                os.stat(os.path.join(self.dir, fn))
 else:
    class PosixUidGidTests(unittest.TestCase):
        pass
+    class Pep383Tests(unittest.TestCase):
+        pass

 def test_main():
    support.run_unittest(
@ -714,7 +750,8 @@ def test_main():
        ExecTests,
        Win32ErrorTests,
        TestInvalidFD,
-        PosixUidGidTests
+        PosixUidGidTests,
+        Pep383Tests
    )

 if __name__ == "__main__":
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -12,6 +12,8 @@ What's New in Python 3.1 beta 1?
 Core and Builtins
 -----------------

+- Implement PEP 383, Non-decodable Bytes in System Character Interfaces.
+
 - Issue #5890: in subclasses of 'property' the __doc__ attribute was
  shadowed by classtype's, even if it was None.  property now
  inserts the __doc__ into the subclass instance __dict__.
--- a/Modules/_io/fileio.c
+++ b/Modules/_io/fileio.c
@ -245,7 +245,7 @@ fileio_init(PyObject *oself, PyObject *args, PyObject *kwds)
 				return -1;

 			stringobj = PyUnicode_AsEncodedString(
-				u, Py_FileSystemDefaultEncoding, NULL);
+				u, Py_FileSystemDefaultEncoding, "utf8b");
 			Py_DECREF(u);
 			if (stringobj == NULL)
 				return -1;
--- a/Modules/posixmodule.c
+++ b/Modules/posixmodule.c
--- a/Modules/python.c
+++ b/Modules/python.c
@ -14,6 +14,93 @@ wmain(int argc, wchar_t **argv)
 	return Py_Main(argc, argv);
 }
 #else
+static wchar_t*
+char2wchar(char* arg)
+{
+	wchar_t *res;
+#ifdef HAVE_BROKEN_MBSTOWCS
+	/* Some platforms have a broken implementation of
+	 * mbstowcs which does not count the characters that
+	 * would result from conversion.  Use an upper bound.
+	 */
+	size_t argsize = strlen(arg);
+#else
+	size_t argsize = mbstowcs(NULL, arg, 0);
+#endif
+	size_t count;
+	unsigned char *in;
+	wchar_t *out;
+#ifdef HAVE_MBRTOWC
+	mbstate_t mbs;
+#endif
+	if (argsize != (size_t)-1) {
+		res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
+		if (!res)
+			goto oom;
+		count = mbstowcs(res, arg, argsize+1);
+		if (count != (size_t)-1)
+			return res;
+		PyMem_Free(res);
+	}
+	/* Conversion failed. Fall back to escaping with utf8b. */
+#ifdef HAVE_MBRTOWC
+	/* Try conversion with mbrtwoc (C99), and escape non-decodable bytes. */
+	
+	/* Overallocate; as multi-byte characters are in the argument, the
+	   actual output could use less memory. */
+	argsize = strlen(arg) + 1;
+	res = PyMem_Malloc(argsize*sizeof(wchar_t));
+	if (!res) goto oom;
+	in = (unsigned char*)arg;
+	out = res;
+	memset(&mbs, 0, sizeof mbs);
+	while (argsize) {
+		size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
+		if (converted == 0)
+			/* Reached end of string; null char stored. */
+			break;
+		if (converted == (size_t)-2) {
+			/* Incomplete character. This should never happen,
+			   since we provide everything that we have -
+			   unless there is a bug in the C library, or I 
+			   misunderstood how mbrtowc works. */
+			fprintf(stderr, "unexpected mbrtowc result -2\n");
+			return NULL;
+		}
+		if (converted == (size_t)-1) {
+			/* Conversion error. Escape as UTF-8b, and start over
+			   in the initial shift state. */
+			*out++ = 0xdc00 + *in++;
+			argsize--;
+			memset(&mbs, 0, sizeof mbs);
+			continue;
+		}
+		/* successfully converted some bytes */
+		in += converted;
+		argsize -= converted;
+		out++;
+	}
+#else
+	/* Cannot use C locale for escaping; manually escape as if charset
+	   is ASCII (i.e. escape all bytes > 128. This will still roundtrip
+	   correctly in the locale's charset, which must be an ASCII superset. */
+	res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
+	if (!res) goto oom;
+	in = (unsigned char*)arg;
+	out = res;
+	while(*in)
+		if(*in < 128)
+			*out++ = *in++;
+		else
+			*out++ = 0xdc00 + *in++;
+	*out = 0;
+#endif
+	return res;
+oom:
+	fprintf(stderr, "out of memory\n");
+	return NULL;
+}
+
 int
 main(int argc, char **argv)
 {
@ -40,32 +127,10 @@ main(int argc, char **argv)
 	oldloc = strdup(setlocale(LC_ALL, NULL));
 	setlocale(LC_ALL, "");
 	for (i = 0; i < argc; i++) {
-#ifdef HAVE_BROKEN_MBSTOWCS
-		/* Some platforms have a broken implementation of
-		 * mbstowcs which does not count the characters that
-		 * would result from conversion.  Use an upper bound.
-		 */
-		size_t argsize = strlen(argv[i]);
-#else
-		size_t argsize = mbstowcs(NULL, argv[i], 0);
-#endif
-		size_t count;
-		if (argsize == (size_t)-1) {
-			fprintf(stderr, "Could not convert argument %d to string\n", i);
+		argv_copy2[i] = argv_copy[i] = char2wchar(argv[i]);
+		if (!argv_copy[i])
 			return 1;
 	}
-		argv_copy[i] = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
-		argv_copy2[i] = argv_copy[i];
-		if (!argv_copy[i]) {
-			fprintf(stderr, "out of memory\n");
-			return 1;
-		}
-		count = mbstowcs(argv_copy[i], argv[i], argsize+1);
-		if (count == (size_t)-1) {
-			fprintf(stderr, "Could not convert argument %d to string\n", i);
-			return 1;
-		}
-	}
 	setlocale(LC_ALL, oldloc);
 	free(oldloc);
 	res = Py_Main(argc, argv_copy);
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -1530,6 +1530,53 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
    }
 }

+/* Convert the argument to a bytes object, according to the file
+   system encoding */
+
+int
+PyUnicode_FSConverter(PyObject* arg, void* addr)
+{
+    PyObject *output = NULL;
+    Py_ssize_t size;
+    void *data;
+    if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
+        output = arg;
+        Py_INCREF(output);
+    }
+    else {
+        arg = PyUnicode_FromObject(arg);
+        if (!arg)
+            return 0;
+        output = PyUnicode_AsEncodedObject(arg, 
+                                           Py_FileSystemDefaultEncoding,
+                                           "utf8b");
+        Py_DECREF(arg);
+        if (!output)
+            return 0;
+        if (!PyBytes_Check(output)) {
+            Py_DECREF(output);
+            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
+            return 0;
+        }
+    }
+    if (PyBytes_Check(output)) {
+         size = PyBytes_GET_SIZE(output);
+         data = PyBytes_AS_STRING(output);
+    } 
+    else {
+         size = PyByteArray_GET_SIZE(output);
+         data = PyByteArray_AS_STRING(output);
+    }
+    if (size != strlen(data)) {
+        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
+        Py_DECREF(output);
+        return 0;
+    }
+    *(PyObject**)addr = output;
+    return 1;
+}
+
+
 char*
 _PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
 {
@ -4154,12 +4201,23 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
                                                              collstart-startp, collend-startp, &newpos);
                if (repunicode == NULL)
                    goto onError;
-                if (!PyUnicode_Check(repunicode)) {
-                    /* Implementation limitation: byte results not supported yet. */
-                    PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
+                if (PyBytes_Check(repunicode)) {
+                    /* Directly copy bytes result to output. */
+                    repsize = PyBytes_Size(repunicode);
+                    if (repsize > 1) {
+                        /* Make room for all additional bytes. */
+                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
                            Py_DECREF(repunicode);
                            goto onError;
                        }
+                        ressize += repsize-1;
+                    }
+                    memcpy(str, PyBytes_AsString(repunicode), repsize);
+                    str += repsize;
+                    p = startp + newpos;
+                    Py_DECREF(repunicode);
+                    break;
+                }
                /* need more space? (at least enough for what we
                   have+the replacement+the rest of the string, so
                   we won't have to check space for encodable characters) */
@ -5123,12 +5181,25 @@ int charmap_encoding_error(
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
-        if (!PyUnicode_Check(repunicode)) {
-            /* Implementation limitation: byte results not supported yet. */
-            PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
+        if (PyBytes_Check(repunicode)) {
+            /* Directly copy bytes result to output. */
+            Py_ssize_t outsize = PyBytes_Size(*res);
+            Py_ssize_t requiredsize;
+            repsize = PyBytes_Size(repunicode);
+            requiredsize = *respos + repsize;
+            if (requiredsize > outsize)
+                /* Make room for all additional bytes. */
+                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
+            memcpy(PyBytes_AsString(*res) + *respos,
+                   PyBytes_AsString(repunicode),  repsize);
+            *respos += repsize;
+            *inpos = newpos;
+            Py_DECREF(repunicode);
+            break;
+        }
        /* generate replacement  */
        repsize = PyUnicode_GET_SIZE(repunicode);
        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
@ -5691,7 +5762,7 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
            if (repunicode == NULL)
                goto onError;
            if (!PyUnicode_Check(repunicode)) {
-                /* Implementation limitation: byte results not supported yet. */
+                /* Byte results not supported, since they have no decimal property. */
                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
                Py_DECREF(repunicode);
                goto onError;
--- a/Python/codecs.c
+++ b/Python/codecs.c
@ -829,6 +829,82 @@ PyCodec_SurrogateErrors(PyObject *exc)
    }
 }

+static PyObject *
+PyCodec_UTF8bErrors(PyObject *exc)
+{
+    PyObject *restuple;
+    PyObject *object;
+    Py_ssize_t start;
+    Py_ssize_t end;
+    PyObject *res;
+    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
+	Py_UNICODE *p;
+	Py_UNICODE *startp;
+	char *outp;
+	if (PyUnicodeEncodeError_GetStart(exc, &start))
+	    return NULL;
+	if (PyUnicodeEncodeError_GetEnd(exc, &end))
+	    return NULL;
+	if (!(object = PyUnicodeEncodeError_GetObject(exc)))
+	    return NULL;
+	startp = PyUnicode_AS_UNICODE(object);
+	res = PyBytes_FromStringAndSize(NULL, end-start);
+	if (!res) {
+	    Py_DECREF(object);
+	    return NULL;
+	}
+	outp = PyBytes_AsString(res);
+	for (p = startp+start; p < startp+end; p++) {
+	    Py_UNICODE ch = *p;
+	    if (ch < 0xdc80 || ch > 0xdcff) {
+		/* Not a UTF-8b surrogate, fail with original exception */
+		PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
+		Py_DECREF(res);
+		Py_DECREF(object);
+		return NULL;
+	    }
+	    *outp++ = ch - 0xdc00;
+	}
+	restuple = Py_BuildValue("(On)", res, end);
+	Py_DECREF(res);
+	Py_DECREF(object);
+	return restuple;
+    }
+    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
+	unsigned char *p;
+	Py_UNICODE ch[4]; /* decode up to 4 bad bytes. */
+	int consumed = 0;
+	if (PyUnicodeDecodeError_GetStart(exc, &start))
+	    return NULL;
+	if (PyUnicodeDecodeError_GetEnd(exc, &end))
+	    return NULL;
+	if (!(object = PyUnicodeDecodeError_GetObject(exc)))
+	    return NULL;
+	if (!(p = (unsigned char*)PyBytes_AsString(object))) {
+	    Py_DECREF(object);
+	    return NULL;
+	}
+	while (consumed < 4 && consumed < end-start) {
+	    /* Refuse to escape ASCII bytes. */
+	    if (p[start+consumed] < 128)
+		break;
+	    ch[consumed] = 0xdc00 + p[start+consumed];
+	    consumed++;
+	}
+	Py_DECREF(object);
+	if (!consumed) {
+	    /* codec complained about ASCII byte. */
+	    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
+	    return NULL;
+	}	    
+	return Py_BuildValue("(u#n)", ch, consumed, start+consumed);
+    }
+    else {
+	wrong_exception_type(exc);
+	return NULL;
+    }
+}
+
 	
 static PyObject *strict_errors(PyObject *self, PyObject *exc)
 {
@ -864,6 +940,11 @@ static PyObject *surrogates_errors(PyObject *self, PyObject *exc)
    return PyCodec_SurrogateErrors(exc);
 }

+static PyObject *utf8b_errors(PyObject *self, PyObject *exc)
+{
+    return PyCodec_UTF8bErrors(exc);
+}
+
 static int _PyCodecRegistry_Init(void)
 {
    static struct {
@ -918,6 +999,14 @@ static int _PyCodecRegistry_Init(void)
 		surrogates_errors,
 		METH_O
 	    }
+	},
+	{
+	    "utf8b",
+	    {
+		"utf8b",
+		utf8b_errors,
+		METH_O
+	    }
 	}
    };

--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c
@ -262,6 +262,22 @@ Py_InitializeEx(int install_sigs)

 	_PyImportHooks_Init();

+#if defined(HAVE_LANGINFO_H) && defined(CODESET)
+	/* On Unix, set the file system encoding according to the
+	   user's preference, if the CODESET names a well-known
+	   Python codec, and Py_FileSystemDefaultEncoding isn't
+	   initialized by other means. Also set the encoding of
+	   stdin and stdout if these are terminals.  */
+
+	codeset = get_codeset();
+	if (codeset) {
+		if (!Py_FileSystemDefaultEncoding)
+			Py_FileSystemDefaultEncoding = codeset;
+		else
+			free(codeset);
+	}
+#endif
+
 	if (install_sigs)
 		initsigs(); /* Signal handling stuff, including initintr() */
 		
@ -285,22 +301,6 @@ Py_InitializeEx(int install_sigs)
 #ifdef WITH_THREAD
 	_PyGILState_Init(interp, tstate);
 #endif /* WITH_THREAD */
-
-#if defined(HAVE_LANGINFO_H) && defined(CODESET)
-	/* On Unix, set the file system encoding according to the
-	   user's preference, if the CODESET names a well-known
-	   Python codec, and Py_FileSystemDefaultEncoding isn't
-	   initialized by other means. Also set the encoding of
-	   stdin and stdout if these are terminals.  */
-
-	codeset = get_codeset();
-	if (codeset) {
-		if (!Py_FileSystemDefaultEncoding)
-			Py_FileSystemDefaultEncoding = codeset;
-		else
-			free(codeset);
-	}
-#endif
 }

 void
--- a/5
+++ b/5
@ -1,5 +1,5 @@
 #! /bin/sh
-# From configure.in Revision: 71731 .
+# From configure.in Revision: 72144 .
 # Guess values for system-dependent variables and create Makefiles.
 # Generated by GNU Autoconf 2.61 for python 3.1.
 #
@ -16297,13 +16297,14 @@ echo "${ECHO_T}MACHDEP_OBJS" >&6; }



+


 for ac_func in alarm setitimer getitimer bind_textdomain_codeset chown \
 clock confstr ctermid execv fchmod fchown fork fpathconf ftime ftruncate \
 gai_strerror getgroups getlogin getloadavg getpeername getpgid getpid \
 getpriority getpwent getspnam getspent getsid getwd \
- kill killpg lchmod lchown lstat mkfifo mknod mktime \
+ kill killpg lchmod lchown lstat mbrtowc mkfifo mknod mktime \
 mremap nice pathconf pause plock poll pthread_init \
 putenv readlink realpath \
 select sem_open sem_timedwait sem_getvalue sem_unlink setegid seteuid \
--- a/configure.in
+++ b/configure.in
@ -2403,7 +2403,7 @@ AC_CHECK_FUNCS(alarm setitimer getitimer bind_textdomain_codeset chown \
 clock confstr ctermid execv fchmod fchown fork fpathconf ftime ftruncate \
 gai_strerror getgroups getlogin getloadavg getpeername getpgid getpid \
 getpriority getpwent getspnam getspent getsid getwd \
- kill killpg lchmod lchown lstat mkfifo mknod mktime \
+ kill killpg lchmod lchown lstat mbrtowc mkfifo mknod mktime \
 mremap nice pathconf pause plock poll pthread_init \
 putenv readlink realpath \
 select sem_open sem_timedwait sem_getvalue sem_unlink setegid seteuid \
--- a/pyconfig.h.in
+++ b/pyconfig.h.in
@ -419,6 +419,9 @@
 /* Define this if you have the makedev macro. */
 #undef HAVE_MAKEDEV

+/* Define to 1 if you have the `mbrtowc' function. */
+#undef HAVE_MBRTOWC
+
 /* Define to 1 if you have the `memmove' function. */
 #undef HAVE_MEMMOVE