Issue #5915: Implement PEP 383, Non-decodable Bytes in

System Character Interfaces.
2009-05-05 04:43:17 +00:00 · 2009-05-05 04:43:17 +00:00 · 011e842033
parent 93f65a177b
commit 011e842033
15 changed files with 726 additions and 289 deletions
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@ -322,6 +322,8 @@ and implemented by all standard Python codecs:
 | ``'backslashreplace'``  | Replace with backslashed escape sequences     |
 |                         | (only for encoding).                          |
 +-------------------------+-----------------------------------------------+
 | ``'utf8b'``             | Replace byte with surrogate U+DCxx.           |
 +-------------------------+-----------------------------------------------+
 In addition, the following error handlers are specific to a single codec:
@ -333,7 +335,7 @@ In addition, the following error handlers are specific to a single codec:
 +------------------+---------+--------------------------------------------+
 .. versionadded:: 3.1
-   The ``'surrogates'`` error handler.
+   The ``'utf8b'`` and ``'surrogates'`` error handlers.
 The set of allowed values can be extended via :meth:`register_error`.
--- a/Doc/library/os.rst
+++ b/Doc/library/os.rst
@ -51,6 +51,30 @@ the :mod:`os` module, but using them is of course a threat to portability!
   ``'ce'``, ``'java'``.
 .. _os-filenames:
 File Names, Command Line Arguments, and Environment Variables
 -------------------------------------------------------------
 In Python, file names, command line arguments, and environment
 variables are represented using the string type. On some systems,
 decoding these strings to and from bytes is necessary before passing
 them to the operating system. Python uses the file system encoding to
 perform this conversion (see :func:`sys.getfilesystemencoding`).
 .. versionchanged:: 3.1
   On some systems, conversion using the file system encoding may
   fail. In this case, Python uses the ``utf8b`` encoding error
   handler, which means that undecodable bytes are replaced by a
   Unicode character U+DCxx on decoding, and these are again
   translated to the original byte on encoding.
 The file system encoding must guarantee to successfully decode all
 bytes below 128. If the file system encoding fails to provide this
 guarantee, API functions may raise UnicodeErrors.
 .. _os-procinfo:
 Process Parameters
@ -688,12 +712,8 @@ Files and Directories
 .. function:: getcwd()
-   Return a string representing the current working directory.  On Unix
+   Return a string representing the current working directory.
-   platforms, this function may raise :exc:`UnicodeDecodeError` if the name of
+   Availability: Unix, Windows.
   the current directory is not decodable in the file system encoding.  Use
   :func:`getcwdb` if you need the call to never fail. Availability: Unix,
   Windows.
 .. function:: getcwdb()
@ -800,10 +820,8 @@ Files and Directories
   entries ``'.'`` and ``'..'`` even if they are present in the directory.
   Availability: Unix, Windows.
-   This function can be called with a bytes or string argument.  In the bytes
+   This function can be called with a bytes or string argument, and returns
-   case, all filenames will be listed as returned by the underlying API.  In the
+   filenames of the same datatype.
   string case, filenames will be decoded using the file system encoding, and
   skipped if a decoding error occurs.
 .. function:: lstat(path)
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@ -198,6 +198,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize
 # define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
 # define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
 # define PyUnicode_FSConverter PyUnicodeUCS2_FSConverter
 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
 # define PyUnicode_GetMax PyUnicodeUCS2_GetMax
 # define PyUnicode_GetSize PyUnicodeUCS2_GetSize
@ -296,6 +297,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize
 # define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
 # define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
 # define PyUnicode_FSConverter PyUnicodeUCS4_FSConverter
 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
 # define PyUnicode_GetMax PyUnicodeUCS4_GetMax
 # define PyUnicode_GetSize PyUnicodeUCS4_GetSize
@ -693,25 +695,6 @@ PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
    PyObject *unicode,
    const char *errors);
 /* Decode a null-terminated string using Py_FileSystemDefaultEncoding.
   If the encoding is supported by one of the built-in codecs (i.e., UTF-8,
   UTF-16, UTF-32, Latin-1 or MBCS), otherwise fallback to UTF-8 and replace
   invalid characters with '?'.
   The function is intended to be used for paths and file names only
   during bootstrapping process where the codecs are not set up.
 */
 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
    const char *s               /* encoded string */
    );
 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
    const char *s,               /* encoded string */
    Py_ssize_t size              /* size */
    );
 /* Returns a pointer to the default encoding (normally, UTF-8) of the
   Unicode object unicode and the size of the encoded representation
   in bytes stored in *size.
@ -1252,6 +1235,33 @@ PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
    const char *errors		/* error handling */
    );
 /* --- File system encoding ---------------------------------------------- */
 /* ParseTuple converter which converts a Unicode object into the file
   system encoding, using the PEP 383 error handler; bytes objects are
   output as-is. */
 PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
 /* Decode a null-terminated string using Py_FileSystemDefaultEncoding
    if the encoding is supported by one of the built-in codecs (i.e., UTF-8,
    UTF-16, UTF-32, Latin-1 or MBCS); otherwise fall back to UTF-8 and replace
    invalid characters with '?'.
   The function is intended to be used for paths and file names only
   during bootstrapping process where the codecs are not set up.
 */
 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
    const char *s               /* encoded string */
    );
 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
    const char *s,               /* encoded string */
    Py_ssize_t size              /* size */
    );
 /* --- Methods & Slots ----------------------------------------------------
   These are capable of handling Unicode objects and strings on input
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@ -1516,6 +1516,34 @@ class TypesTest(unittest.TestCase):
        self.assertEquals(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEquals(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
 class Utf8bTest(unittest.TestCase):
    def test_utf8(self):
        # Bad byte
        self.assertEqual(b"foo\x80bar".decode("utf-8", "utf8b"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("utf-8", "utf8b"),
                         b"foo\x80bar")
        # bad-utf-8 encoded surrogate
        self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "utf8b"),
                         "\udced\udcb0\udc80")
        self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "utf8b"),
                         b"\xed\xb0\x80")
    def test_ascii(self):
        # bad byte
        self.assertEqual(b"foo\x80bar".decode("ascii", "utf8b"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("ascii", "utf8b"),
                         b"foo\x80bar")
    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "utf8b"),
                         "foo\udca5bar")
        self.assertEqual("foo\udca5bar".encode("iso-8859-3", "utf8b"),
                         b"foo\xa5bar")
 def test_main():
    support.run_unittest(
@ -1543,6 +1571,7 @@ def test_main():
        CharmapTest,
        WithStmtTest,
        TypesTest,
        Utf8bTest,
    )
--- a/Lib/test/test_os.py
+++ b/Lib/test/test_os.py
@ -7,6 +7,7 @@ import errno
 import unittest
 import warnings
 import sys
 import shutil
 from test import support
 # Tests creating TESTFN
@ -698,9 +699,44 @@ if sys.platform != 'win32':
                    self.assertRaises(os.error, os.setregid, 0, 0)
                self.assertRaises(OverflowError, os.setregid, 1<<32, 0)
                self.assertRaises(OverflowError, os.setregid, 0, 1<<32)
    class Pep383Tests(unittest.TestCase):
        """Exercise os/file APIs on file names that need the utf8b handler.

        The file system encoding is switched to UTF-8 for the duration of
        each test and restored afterwards.
        """
        # The first name is not valid UTF-8; the second is the UTF-8
        # encoding of the same text.
        filenames = [b'foo\xf6bar', 'foo\xf6bar'.encode("utf-8")]
        def setUp(self):
            # Remember the current encoding so tearDown can put it back.
            self.fsencoding = sys.getfilesystemencoding()
            sys.setfilesystemencoding("utf-8")
            self.dir = support.TESTFN
            self.bdir = self.dir.encode("utf-8", "utf8b")
            os.mkdir(self.dir)
            self.unicodefn = []
            for fn in self.filenames:
                # Create the file through the bytes API so the raw name
                # reaches the file system unchanged.
                f = open(os.path.join(self.bdir, fn), "w")
                f.close()
                self.unicodefn.append(fn.decode("utf-8", "utf8b"))
        def tearDown(self):
            # Restore the process-wide encoding even if removing the test
            # directory fails, so later tests are not affected.
            try:
                shutil.rmtree(self.dir)
            finally:
                sys.setfilesystemencoding(self.fsencoding)
        def test_listdir(self):
            expected = set(self.unicodefn)
            found = set(os.listdir(support.TESTFN))
            self.assertEquals(found, expected)
        def test_open(self):
            for fn in self.unicodefn:
                f = open(os.path.join(self.dir, fn))
                f.close()
        def test_stat(self):
            for fn in self.unicodefn:
                os.stat(os.path.join(self.dir, fn))
 else:
    class PosixUidGidTests(unittest.TestCase):
        pass
    class Pep383Tests(unittest.TestCase):
        pass
 def test_main():
    support.run_unittest(
@ -714,7 +750,8 @@ def test_main():
        ExecTests,
        Win32ErrorTests,
        TestInvalidFD,
-        PosixUidGidTests
+        PosixUidGidTests,
        Pep383Tests
    )
 if __name__ == "__main__":
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -12,6 +12,8 @@ What's New in Python 3.1 beta 1?
 Core and Builtins
 -----------------
 - Implement PEP 383, Non-decodable Bytes in System Character Interfaces.
 - Issue #5890: in subclasses of 'property' the __doc__ attribute was
  shadowed by classtype's, even if it was None.  property now
  inserts the __doc__ into the subclass instance __dict__.
--- a/Modules/_io/fileio.c
+++ b/Modules/_io/fileio.c
@ -245,7 +245,7 @@ fileio_init(PyObject *oself, PyObject *args, PyObject *kwds)
 				return -1;
 			stringobj = PyUnicode_AsEncodedString(
-				u, Py_FileSystemDefaultEncoding, NULL);
+				u, Py_FileSystemDefaultEncoding, "utf8b");
 			Py_DECREF(u);
 			if (stringobj == NULL)
 				return -1;
--- a/Modules/posixmodule.c
+++ b/Modules/posixmodule.c
--- a/Modules/python.c
+++ b/Modules/python.c
@ -14,6 +14,93 @@ wmain(int argc, wchar_t **argv)
 	return Py_Main(argc, argv);
 }
 #else
 /* Convert the NUL-terminated multibyte string `arg` to a wide-character
    string held in a buffer obtained from PyMem_Malloc.

    mbstowcs() is tried first.  If it fails, the string is converted again
    and every byte that cannot be decoded is escaped as 0xdc00 + byte
    (UTF-8b), so the conversion itself never rejects the input.

    Returns the new buffer, or NULL after printing a message to stderr
    (out of memory, or an unexpected mbrtowc() result). */
 static wchar_t*
 char2wchar(char* arg)
 {
 	wchar_t *res;
 #ifdef HAVE_BROKEN_MBSTOWCS
 	/* Some platforms have a broken implementation of
 	 * mbstowcs which does not count the characters that
 	 * would result from conversion.  Use an upper bound.
 	 */
 	size_t argsize = strlen(arg);
 #else
 	size_t argsize = mbstowcs(NULL, arg, 0);
 #endif
 	size_t count;
 	unsigned char *in;
 	wchar_t *out;
 #ifdef HAVE_MBRTOWC
 	mbstate_t mbs;
 #endif
 	if (argsize != (size_t)-1) {
 		res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
 		if (!res)
 			goto oom;
 		count = mbstowcs(res, arg, argsize+1);
 		if (count != (size_t)-1)
 			return res;
 		PyMem_Free(res);
 	}
 	/* Conversion failed. Fall back to escaping with utf8b. */
 #ifdef HAVE_MBRTOWC
 	/* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */
 	/* Overallocate; as multi-byte characters are in the argument, the
 	   actual output could use less memory. */
 	argsize = strlen(arg) + 1;
 	res = PyMem_Malloc(argsize*sizeof(wchar_t));
 	if (!res) goto oom;
 	in = (unsigned char*)arg;
 	out = res;
 	memset(&mbs, 0, sizeof mbs);
 	while (argsize) {
 		size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
 		if (converted == 0)
 			/* Reached end of string; null char stored. */
 			break;
 		if (converted == (size_t)-2) {
 			/* Incomplete character. This should never happen,
 			   since we provide everything that we have -
 			   unless there is a bug in the C library, or I
 			   misunderstood how mbrtowc works. */
 			fprintf(stderr, "unexpected mbrtowc result -2\n");
 			/* Do not leak the partially filled buffer. */
 			PyMem_Free(res);
 			return NULL;
 		}
 		if (converted == (size_t)-1) {
 			/* Conversion error. Escape as UTF-8b, and start over
 			   in the initial shift state. */
 			*out++ = 0xdc00 + *in++;
 			argsize--;
 			memset(&mbs, 0, sizeof mbs);
 			continue;
 		}
 		/* successfully converted some bytes */
 		in += converted;
 		argsize -= converted;
 		out++;
 	}
 #else
 	/* Cannot use C locale for escaping; manually escape as if charset
 	   is ASCII (i.e. escape all bytes >= 128). This will still roundtrip
 	   correctly in the locale's charset, which must be an ASCII superset. */
 	res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
 	if (!res) goto oom;
 	in = (unsigned char*)arg;
 	out = res;
 	while(*in)
 		if(*in < 128)
 			*out++ = *in++;
 		else
 			*out++ = 0xdc00 + *in++;
 	*out = 0;
 #endif
 	return res;
 oom:
 	fprintf(stderr, "out of memory\n");
 	return NULL;
 }
 int
 main(int argc, char **argv)
 {
@ -40,31 +127,9 @@ main(int argc, char **argv)
 	oldloc = strdup(setlocale(LC_ALL, NULL));
 	setlocale(LC_ALL, "");
 	for (i = 0; i < argc; i++) {
-#ifdef HAVE_BROKEN_MBSTOWCS
+		argv_copy2[i] = argv_copy[i] = char2wchar(argv[i]);
-		/* Some platforms have a broken implementation of
+		if (!argv_copy[i])
 		 * mbstowcs which does not count the characters that
 		 * would result from conversion.  Use an upper bound.
 		 */
 		size_t argsize = strlen(argv[i]);
 #else
 		size_t argsize = mbstowcs(NULL, argv[i], 0);
 #endif
 		size_t count;
 		if (argsize == (size_t)-1) {
 			fprintf(stderr, "Could not convert argument %d to string\n", i);
 			return 1;
 		}
 		argv_copy[i] = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
 		argv_copy2[i] = argv_copy[i];
 		if (!argv_copy[i]) {
 			fprintf(stderr, "out of memory\n");
 			return 1;
 		}
 		count = mbstowcs(argv_copy[i], argv[i], argsize+1);
 		if (count == (size_t)-1) {
 			fprintf(stderr, "Could not convert argument %d to string\n", i);
 			return 1;
 		}
 	}
 	setlocale(LC_ALL, oldloc);
 	free(oldloc);
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -1530,6 +1530,53 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
    }
 }
 /* ParseTuple-style converter producing a byte string for the file system.

    bytes and bytearray arguments are passed through unchanged (with a new
    reference); anything else is converted to unicode and encoded with
    Py_FileSystemDefaultEncoding using the "utf8b" error handler.

    On success stores the resulting object in *(PyObject**)addr and
    returns 1.  Returns 0 on failure; a TypeError is set here when the
    encoder does not return bytes or when the data contains an embedded
    NUL character. */
 int
 PyUnicode_FSConverter(PyObject* arg, void* addr)
 {
    PyObject *result;
    PyObject *text;
    const char *buffer;
    Py_ssize_t length;
    int is_bytes;

    if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
        /* Already a byte string: hand it back as-is. */
        Py_INCREF(arg);
        result = arg;
    }
    else {
        text = PyUnicode_FromObject(arg);
        if (text == NULL)
            return 0;
        result = PyUnicode_AsEncodedObject(text,
                                           Py_FileSystemDefaultEncoding,
                                           "utf8b");
        Py_DECREF(text);
        if (result == NULL)
            return 0;
        if (!PyBytes_Check(result)) {
            Py_DECREF(result);
            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
            return 0;
        }
    }

    /* result is either bytes or bytearray at this point. */
    is_bytes = PyBytes_Check(result);
    length = is_bytes ? PyBytes_GET_SIZE(result)
                      : PyByteArray_GET_SIZE(result);
    buffer = is_bytes ? PyBytes_AS_STRING(result)
                      : PyByteArray_AS_STRING(result);

    /* A C-string length shorter than the object size means a NUL byte
       sits somewhere inside the data. */
    if ((size_t)length != strlen(buffer)) {
        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
        Py_DECREF(result);
        return 0;
    }

    *(PyObject**)addr = result;
    return 1;
 }
 char*
 _PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
 {
@ -4154,11 +4201,22 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
                                                              collstart-startp, collend-startp, &newpos);
                if (repunicode == NULL)
                    goto onError;
-                if (!PyUnicode_Check(repunicode)) {
+                if (PyBytes_Check(repunicode)) {
-                    /* Implementation limitation: byte results not supported yet. */
+                    /* Directly copy bytes result to output. */
-                    PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
+                    repsize = PyBytes_Size(repunicode);
                    if (repsize > 1) {
                        /* Make room for all additional bytes. */
                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
                            Py_DECREF(repunicode);
                            goto onError;
                        }
                        ressize += repsize-1;
                    }
                    memcpy(str, PyBytes_AsString(repunicode), repsize);
                    str += repsize;
                    p = startp + newpos;
                    Py_DECREF(repunicode);
-                    goto onError;
+                    break;
                }
                /* need more space? (at least enough for what we
                   have+the replacement+the rest of the string, so
@ -5123,11 +5181,24 @@ int charmap_encoding_error(
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
-        if (!PyUnicode_Check(repunicode)) {
+        if (PyBytes_Check(repunicode)) {
-            /* Implementation limitation: byte results not supported yet. */
+            /* Directly copy bytes result to output. */
-            PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
+            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode),  repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
-            return -1;
+            break;
        }
        /* generate replacement  */
        repsize = PyUnicode_GET_SIZE(repunicode);
@ -5691,7 +5762,7 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
            if (repunicode == NULL)
                goto onError;
            if (!PyUnicode_Check(repunicode)) {
-                /* Implementation limitation: byte results not supported yet. */
+                /* Byte results not supported, since they have no decimal property. */
                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
                Py_DECREF(repunicode);
                goto onError;
--- a/Python/codecs.c
+++ b/Python/codecs.c
@ -829,6 +829,82 @@ PyCodec_SurrogateErrors(PyObject *exc)
    }
 }
 /* Error handler implementing the "utf8b" escaping.

    Encoding errors: every character in [start, end) must lie in
    U+DC80..U+DCFF; each one is turned back into the byte (ch - 0xdc00) and
    the tuple (bytes, end) is returned.  Any other character re-raises the
    original exception.

    Decoding errors: up to four offending bytes >= 128 starting at `start`
    are mapped to 0xdc00 + byte and (unicode, start + consumed) is
    returned.  If the first offending byte is ASCII the original exception
    is re-raised.

    Any other exception type is reported through wrong_exception_type(). */
 static PyObject *
 PyCodec_UTF8bErrors(PyObject *exc)
 {
    PyObject *object;
    Py_ssize_t start;
    Py_ssize_t end;

    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
        PyObject *encoded;
        PyObject *result = NULL;
        Py_UNICODE *text;
        char *dest;
        Py_ssize_t i;

        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        object = PyUnicodeEncodeError_GetObject(exc);
        if (object == NULL)
            return NULL;
        text = PyUnicode_AS_UNICODE(object);
        encoded = PyBytes_FromStringAndSize(NULL, end-start);
        if (encoded == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        dest = PyBytes_AsString(encoded);
        for (i = start; i < end; i++) {
            Py_UNICODE ch = text[i];
            if (!(0xdc80 <= ch && ch <= 0xdcff)) {
                /* Not a UTF-8b surrogate, fail with original exception */
                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
                goto encode_done;
            }
            *dest++ = ch - 0xdc00;
        }
        result = Py_BuildValue("(On)", encoded, end);
      encode_done:
        /* result stays NULL when the loop bailed out early. */
        Py_DECREF(encoded);
        Py_DECREF(object);
        return result;
    }

    if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
        unsigned char *bytes;
        Py_UNICODE ch[4]; /* decode up to 4 bad bytes. */
        int consumed;

        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        object = PyUnicodeDecodeError_GetObject(exc);
        if (object == NULL)
            return NULL;
        bytes = (unsigned char*)PyBytes_AsString(object);
        if (bytes == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        for (consumed = 0; consumed < 4 && consumed < end-start; consumed++) {
            unsigned char byte = bytes[start+consumed];
            /* Refuse to escape ASCII bytes. */
            if (byte < 128)
                break;
            ch[consumed] = 0xdc00 + byte;
        }
        Py_DECREF(object);
        if (consumed == 0) {
            /* codec complained about ASCII byte. */
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }
        return Py_BuildValue("(u#n)", ch, consumed, start+consumed);
    }

    wrong_exception_type(exc);
    return NULL;
 }
 static PyObject *strict_errors(PyObject *self, PyObject *exc)
 {
@ -864,6 +940,11 @@ static PyObject *surrogates_errors(PyObject *self, PyObject *exc)
    return PyCodec_SurrogateErrors(exc);
 }
 /* Callable wrapper for the "utf8b" error handler: ignores `self` and
    passes the exception object `exc` straight to PyCodec_UTF8bErrors(),
    returning whatever that returns. */
 static PyObject *utf8b_errors(PyObject *self, PyObject *exc)
 {
    return PyCodec_UTF8bErrors(exc);
 }
 static int _PyCodecRegistry_Init(void)
 {
    static struct {
@ -918,6 +999,14 @@ static int _PyCodecRegistry_Init(void)
 		surrogates_errors,
 		METH_O
 	    }
 	},
 	{
 	    "utf8b",
 	    {
 		"utf8b",
 		utf8b_errors,
 		METH_O
 	    }
 	}
    };
--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c
@ -262,6 +262,22 @@ Py_InitializeEx(int install_sigs)
 	_PyImportHooks_Init();
 #if defined(HAVE_LANGINFO_H) && defined(CODESET)
 	/* On Unix, set the file system encoding according to the
 	   user's preference, if the CODESET names a well-known
 	   Python codec, and Py_FileSystemDefaultEncoding isn't
 	   initialized by other means. Also set the encoding of
 	   stdin and stdout if these are terminals.  */
 	codeset = get_codeset();
 	if (codeset) {
 		if (!Py_FileSystemDefaultEncoding)
 			Py_FileSystemDefaultEncoding = codeset;
 		else
 			free(codeset);
 	}
 #endif
 	if (install_sigs)
 		initsigs(); /* Signal handling stuff, including initintr() */
@ -285,22 +301,6 @@ Py_InitializeEx(int install_sigs)
 #ifdef WITH_THREAD
 	_PyGILState_Init(interp, tstate);
 #endif /* WITH_THREAD */
 #if defined(HAVE_LANGINFO_H) && defined(CODESET)
 	/* On Unix, set the file system encoding according to the
 	   user's preference, if the CODESET names a well-known
 	   Python codec, and Py_FileSystemDefaultEncoding isn't
 	   initialized by other means. Also set the encoding of
 	   stdin and stdout if these are terminals.  */
 	codeset = get_codeset();
 	if (codeset) {
 		if (!Py_FileSystemDefaultEncoding)
 			Py_FileSystemDefaultEncoding = codeset;
 		else
 			free(codeset);
 	}
 #endif
 }
 void
--- a/5
+++ b/5
@ -1,5 +1,5 @@
 #! /bin/sh
-# From configure.in Revision: 71731 .
+# From configure.in Revision: 72144 .
 # Guess values for system-dependent variables and create Makefiles.
 # Generated by GNU Autoconf 2.61 for python 3.1.
 #
@ -16297,13 +16297,14 @@ echo "${ECHO_T}MACHDEP_OBJS" >&6; }
 for ac_func in alarm setitimer getitimer bind_textdomain_codeset chown \
 clock confstr ctermid execv fchmod fchown fork fpathconf ftime ftruncate \
 gai_strerror getgroups getlogin getloadavg getpeername getpgid getpid \
 getpriority getpwent getspnam getspent getsid getwd \
- kill killpg lchmod lchown lstat mkfifo mknod mktime \
+ kill killpg lchmod lchown lstat mbrtowc mkfifo mknod mktime \
 mremap nice pathconf pause plock poll pthread_init \
 putenv readlink realpath \
 select sem_open sem_timedwait sem_getvalue sem_unlink setegid seteuid \
--- a/configure.in
+++ b/configure.in
@ -2403,7 +2403,7 @@ AC_CHECK_FUNCS(alarm setitimer getitimer bind_textdomain_codeset chown \
 clock confstr ctermid execv fchmod fchown fork fpathconf ftime ftruncate \
 gai_strerror getgroups getlogin getloadavg getpeername getpgid getpid \
 getpriority getpwent getspnam getspent getsid getwd \
- kill killpg lchmod lchown lstat mkfifo mknod mktime \
+ kill killpg lchmod lchown lstat mbrtowc mkfifo mknod mktime \
 mremap nice pathconf pause plock poll pthread_init \
 putenv readlink realpath \
 select sem_open sem_timedwait sem_getvalue sem_unlink setegid seteuid \
--- a/pyconfig.h.in
+++ b/pyconfig.h.in
@ -419,6 +419,9 @@
 /* Define this if you have the makedev macro. */
 #undef HAVE_MAKEDEV
 /* Define to 1 if you have the `mbrtowc' function. */
 #undef HAVE_MBRTOWC
 /* Define to 1 if you have the `memmove' function. */
 #undef HAVE_MEMMOVE