diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst index b7bd125401d..e3f98efc686 100644 --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -322,7 +322,7 @@ and implemented by all standard Python codecs: | ``'backslashreplace'`` | Replace with backslashed escape sequences | | | (only for encoding). | +-------------------------+-----------------------------------------------+ -| ``'utf8b'`` | Replace byte with surrogate U+DCxx. | +| ``'surrogateescape'`` | Replace byte with surrogate U+DCxx. | +-------------------------+-----------------------------------------------+ In addition, the following error handlers are specific to a single codec: @@ -335,7 +335,7 @@ In addition, the following error handlers are specific to a single codec: +-------------------+---------+-------------------------------------------+ .. versionadded:: 3.1 - The ``'utf8b'`` and ``'surrogatepass'`` error handlers. + The ``'surrogateescape'`` and ``'surrogatepass'`` error handlers. The set of allowed values can be extended via :meth:`register_error`. diff --git a/Doc/library/os.rst b/Doc/library/os.rst index 83f5ee9dc0e..221374048cf 100644 --- a/Doc/library/os.rst +++ b/Doc/library/os.rst @@ -64,8 +64,8 @@ perform this conversion (see :func:`sys.getfilesystemencoding`). .. versionchanged:: 3.1 On some systems, conversion using the file system encoding may - fail. In this case, Python uses the ``utf8b`` encoding error - handler, which means that undecodable bytes are replaced by a + fail. In this case, Python uses the ``surrogateescape`` encoding + error handler, which means that undecodable bytes are replaced by a Unicode character U+DCxx on decoding, and these are again translated to the original byte on encoding. diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 9ca769910be..4ec7b5865cc 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1521,32 +1521,32 @@ class TypesTest(unittest.TestCase): self.assertEquals(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6)) self.assertEquals(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6)) -class Utf8bTest(unittest.TestCase): +class SurrogateEscapeTest(unittest.TestCase): def test_utf8(self): # Bad byte - self.assertEqual(b"foo\x80bar".decode("utf-8", "utf8b"), + self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"), "foo\udc80bar") - self.assertEqual("foo\udc80bar".encode("utf-8", "utf8b"), + self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"), b"foo\x80bar") # bad-utf-8 encoded surrogate - self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "utf8b"), + self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"), "\udced\udcb0\udc80") - self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "utf8b"), + self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"), b"\xed\xb0\x80") def test_ascii(self): # bad byte - self.assertEqual(b"foo\x80bar".decode("ascii", "utf8b"), + self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"), "foo\udc80bar") - self.assertEqual("foo\udc80bar".encode("ascii", "utf8b"), + self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"), b"foo\x80bar") def test_charmap(self): # bad byte: \xa5 is unmapped in iso-8859-3 - self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "utf8b"), + self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"), "foo\udca5bar") - self.assertEqual("foo\udca5bar".encode("iso-8859-3", "utf8b"), + self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"), b"foo\xa5bar") @@ -1576,7 +1576,7 @@ def test_main(): CharmapTest, WithStmtTest, TypesTest, - Utf8bTest, + SurrogateEscapeTest, ) diff --git a/Lib/test/test_os.py b/Lib/test/test_os.py index 014d874f4d3..c680d8d77a3 100644 --- a/Lib/test/test_os.py +++ b/Lib/test/test_os.py @@ -708,13 +708,13 @@ if sys.platform != 'win32': self.fsencoding = sys.getfilesystemencoding() sys.setfilesystemencoding("utf-8") self.dir = support.TESTFN - self.bdir = self.dir.encode("utf-8", "utf8b") + self.bdir = self.dir.encode("utf-8", "surrogateescape") os.mkdir(self.dir) self.unicodefn = [] for fn in self.filenames: f = open(os.path.join(self.bdir, fn), "w") f.close() - self.unicodefn.append(fn.decode("utf-8", "utf8b")) + self.unicodefn.append(fn.decode("utf-8", "surrogateescape")) def tearDown(self): shutil.rmtree(self.dir) diff --git a/Modules/_io/fileio.c b/Modules/_io/fileio.c index 164f7e46d1b..555dc12c69d 100644 --- a/Modules/_io/fileio.c +++ b/Modules/_io/fileio.c @@ -245,7 +245,7 @@ fileio_init(PyObject *oself, PyObject *args, PyObject *kwds) return -1; stringobj = PyUnicode_AsEncodedString( - u, Py_FileSystemDefaultEncoding, "utf8b"); + u, Py_FileSystemDefaultEncoding, "surrogateescape"); Py_DECREF(u); if (stringobj == NULL) return -1; diff --git a/Modules/posixmodule.c b/Modules/posixmodule.c index 2050d5a1a78..21dcb4d9638 100644 --- a/Modules/posixmodule.c +++ b/Modules/posixmodule.c @@ -494,13 +494,13 @@ convertenviron(void) if (p == NULL) continue; k = PyUnicode_Decode(*e, (int)(p-*e), - Py_FileSystemDefaultEncoding, "utf8b"); + Py_FileSystemDefaultEncoding, "surrogateescape"); if (k == NULL) { PyErr_Clear(); continue; } v = PyUnicode_Decode(p+1, strlen(p+1), - Py_FileSystemDefaultEncoding, "utf8b"); + Py_FileSystemDefaultEncoding, "surrogateescape"); if (v == NULL) { PyErr_Clear(); Py_DECREF(k); @@ -2167,7 +2167,7 @@ posix_getcwd(int use_bytes) return posix_error(); if (use_bytes) return PyBytes_FromStringAndSize(buf, strlen(buf)); - return PyUnicode_Decode(buf, strlen(buf), Py_FileSystemDefaultEncoding,"utf8b"); + return PyUnicode_Decode(buf, strlen(buf), Py_FileSystemDefaultEncoding,"surrogateescape"); } PyDoc_STRVAR(posix_getcwd__doc__, @@ -2513,7 +2513,7 @@ posix_listdir(PyObject *self, PyObject *args) w = PyUnicode_FromEncodedObject(v, Py_FileSystemDefaultEncoding, - "utf8b"); + "surrogateescape"); Py_DECREF(v); if (w != NULL) v = w; @@ -4695,7 +4695,7 @@ posix_readlink(PyObject *self, PyObject *args) w = PyUnicode_FromEncodedObject(v, Py_FileSystemDefaultEncoding, - "utf8b"); + "surrogateescape"); if (w != NULL) { Py_DECREF(v); v = w; diff --git a/Modules/python.c b/Modules/python.c index 4c0a55bb1fa..13c6d5b82a4 100644 --- a/Modules/python.c +++ b/Modules/python.c @@ -42,7 +42,7 @@ char2wchar(char* arg) return res; PyMem_Free(res); } - /* Conversion failed. Fall back to escaping with utf8b. */ + /* Conversion failed. Fall back to escaping with surrogateescape. */ #ifdef HAVE_MBRTOWC /* Try conversion with mbrtwoc (C99), and escape non-decodable bytes. */ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 3740892e675..3bd1efd9392 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1549,7 +1549,7 @@ PyUnicode_FSConverter(PyObject* arg, void* addr) return 0; output = PyUnicode_AsEncodedObject(arg, Py_FileSystemDefaultEncoding, - "utf8b"); + "surrogateescape"); Py_DECREF(arg); if (!output) return 0; diff --git a/Python/codecs.c b/Python/codecs.c index cd6b7f0f60c..d1915f181d1 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -830,7 +830,7 @@ PyCodec_SurrogatePassErrors(PyObject *exc) } static PyObject * -PyCodec_UTF8bErrors(PyObject *exc) +PyCodec_SurrogateEscapeErrors(PyObject *exc) { PyObject *restuple; PyObject *object; @@ -940,9 +940,9 @@ static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc) return PyCodec_SurrogatePassErrors(exc); } -static PyObject *utf8b_errors(PyObject *self, PyObject *exc) +static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc) { - return PyCodec_UTF8bErrors(exc); + return PyCodec_SurrogateEscapeErrors(exc); } static int _PyCodecRegistry_Init(void) @@ -1001,10 +1001,10 @@ static int _PyCodecRegistry_Init(void) } }, { - "utf8b", + "surrogateescape", { - "utf8b", - utf8b_errors, + "surrogateescape", + surrogateescape_errors, METH_O } }