mirror of https://github.com/python/cpython
Rename utf8b error handler to surrogateescape.
This commit is contained in:
parent
e0a2b72e61
commit
43c57785d3
|
@ -322,7 +322,7 @@ and implemented by all standard Python codecs:
|
||||||
| ``'backslashreplace'`` | Replace with backslashed escape sequences |
|
| ``'backslashreplace'`` | Replace with backslashed escape sequences |
|
||||||
| | (only for encoding). |
|
| | (only for encoding). |
|
||||||
+-------------------------+-----------------------------------------------+
|
+-------------------------+-----------------------------------------------+
|
||||||
| ``'utf8b'`` | Replace byte with surrogate U+DCxx. |
|
| ``'surrogateescape'`` | Replace byte with surrogate U+DCxx. |
|
||||||
+-------------------------+-----------------------------------------------+
|
+-------------------------+-----------------------------------------------+
|
||||||
|
|
||||||
In addition, the following error handlers are specific to a single codec:
|
In addition, the following error handlers are specific to a single codec:
|
||||||
|
@ -335,7 +335,7 @@ In addition, the following error handlers are specific to a single codec:
|
||||||
+-------------------+---------+-------------------------------------------+
|
+-------------------+---------+-------------------------------------------+
|
||||||
|
|
||||||
.. versionadded:: 3.1
|
.. versionadded:: 3.1
|
||||||
The ``'utf8b'`` and ``'surrogatepass'`` error handlers.
|
The ``'surrogateescape'`` and ``'surrogatepass'`` error handlers.
|
||||||
|
|
||||||
The set of allowed values can be extended via :meth:`register_error`.
|
The set of allowed values can be extended via :meth:`register_error`.
|
||||||
|
|
||||||
|
|
|
@ -64,8 +64,8 @@ perform this conversion (see :func:`sys.getfilesystemencoding`).
|
||||||
|
|
||||||
.. versionchanged:: 3.1
|
.. versionchanged:: 3.1
|
||||||
On some systems, conversion using the file system encoding may
|
On some systems, conversion using the file system encoding may
|
||||||
fail. In this case, Python uses the ``utf8b`` encoding error
|
fail. In this case, Python uses the ``surrogateescape`` encoding
|
||||||
handler, which means that undecodable bytes are replaced by a
|
error handler, which means that undecodable bytes are replaced by a
|
||||||
Unicode character U+DCxx on decoding, and these are again
|
Unicode character U+DCxx on decoding, and these are again
|
||||||
translated to the original byte on encoding.
|
translated to the original byte on encoding.
|
||||||
|
|
||||||
|
|
|
@ -1521,32 +1521,32 @@ class TypesTest(unittest.TestCase):
|
||||||
self.assertEquals(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
|
self.assertEquals(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
|
||||||
self.assertEquals(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
|
self.assertEquals(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
|
||||||
|
|
||||||
class Utf8bTest(unittest.TestCase):
|
class SurrogateEscapeTest(unittest.TestCase):
|
||||||
|
|
||||||
def test_utf8(self):
|
def test_utf8(self):
|
||||||
# Bad byte
|
# Bad byte
|
||||||
self.assertEqual(b"foo\x80bar".decode("utf-8", "utf8b"),
|
self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
|
||||||
"foo\udc80bar")
|
"foo\udc80bar")
|
||||||
self.assertEqual("foo\udc80bar".encode("utf-8", "utf8b"),
|
self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"),
|
||||||
b"foo\x80bar")
|
b"foo\x80bar")
|
||||||
# bad-utf-8 encoded surrogate
|
# bad-utf-8 encoded surrogate
|
||||||
self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "utf8b"),
|
self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"),
|
||||||
"\udced\udcb0\udc80")
|
"\udced\udcb0\udc80")
|
||||||
self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "utf8b"),
|
self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"),
|
||||||
b"\xed\xb0\x80")
|
b"\xed\xb0\x80")
|
||||||
|
|
||||||
def test_ascii(self):
|
def test_ascii(self):
|
||||||
# bad byte
|
# bad byte
|
||||||
self.assertEqual(b"foo\x80bar".decode("ascii", "utf8b"),
|
self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"),
|
||||||
"foo\udc80bar")
|
"foo\udc80bar")
|
||||||
self.assertEqual("foo\udc80bar".encode("ascii", "utf8b"),
|
self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"),
|
||||||
b"foo\x80bar")
|
b"foo\x80bar")
|
||||||
|
|
||||||
def test_charmap(self):
|
def test_charmap(self):
|
||||||
# bad byte: \xa5 is unmapped in iso-8859-3
|
# bad byte: \xa5 is unmapped in iso-8859-3
|
||||||
self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "utf8b"),
|
self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"),
|
||||||
"foo\udca5bar")
|
"foo\udca5bar")
|
||||||
self.assertEqual("foo\udca5bar".encode("iso-8859-3", "utf8b"),
|
self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"),
|
||||||
b"foo\xa5bar")
|
b"foo\xa5bar")
|
||||||
|
|
||||||
|
|
||||||
|
@ -1576,7 +1576,7 @@ def test_main():
|
||||||
CharmapTest,
|
CharmapTest,
|
||||||
WithStmtTest,
|
WithStmtTest,
|
||||||
TypesTest,
|
TypesTest,
|
||||||
Utf8bTest,
|
SurrogateEscapeTest,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -708,13 +708,13 @@ if sys.platform != 'win32':
|
||||||
self.fsencoding = sys.getfilesystemencoding()
|
self.fsencoding = sys.getfilesystemencoding()
|
||||||
sys.setfilesystemencoding("utf-8")
|
sys.setfilesystemencoding("utf-8")
|
||||||
self.dir = support.TESTFN
|
self.dir = support.TESTFN
|
||||||
self.bdir = self.dir.encode("utf-8", "utf8b")
|
self.bdir = self.dir.encode("utf-8", "surrogateescape")
|
||||||
os.mkdir(self.dir)
|
os.mkdir(self.dir)
|
||||||
self.unicodefn = []
|
self.unicodefn = []
|
||||||
for fn in self.filenames:
|
for fn in self.filenames:
|
||||||
f = open(os.path.join(self.bdir, fn), "w")
|
f = open(os.path.join(self.bdir, fn), "w")
|
||||||
f.close()
|
f.close()
|
||||||
self.unicodefn.append(fn.decode("utf-8", "utf8b"))
|
self.unicodefn.append(fn.decode("utf-8", "surrogateescape"))
|
||||||
|
|
||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
shutil.rmtree(self.dir)
|
shutil.rmtree(self.dir)
|
||||||
|
|
|
@ -245,7 +245,7 @@ fileio_init(PyObject *oself, PyObject *args, PyObject *kwds)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
stringobj = PyUnicode_AsEncodedString(
|
stringobj = PyUnicode_AsEncodedString(
|
||||||
u, Py_FileSystemDefaultEncoding, "utf8b");
|
u, Py_FileSystemDefaultEncoding, "surrogateescape");
|
||||||
Py_DECREF(u);
|
Py_DECREF(u);
|
||||||
if (stringobj == NULL)
|
if (stringobj == NULL)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
|
@ -494,13 +494,13 @@ convertenviron(void)
|
||||||
if (p == NULL)
|
if (p == NULL)
|
||||||
continue;
|
continue;
|
||||||
k = PyUnicode_Decode(*e, (int)(p-*e),
|
k = PyUnicode_Decode(*e, (int)(p-*e),
|
||||||
Py_FileSystemDefaultEncoding, "utf8b");
|
Py_FileSystemDefaultEncoding, "surrogateescape");
|
||||||
if (k == NULL) {
|
if (k == NULL) {
|
||||||
PyErr_Clear();
|
PyErr_Clear();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
v = PyUnicode_Decode(p+1, strlen(p+1),
|
v = PyUnicode_Decode(p+1, strlen(p+1),
|
||||||
Py_FileSystemDefaultEncoding, "utf8b");
|
Py_FileSystemDefaultEncoding, "surrogateescape");
|
||||||
if (v == NULL) {
|
if (v == NULL) {
|
||||||
PyErr_Clear();
|
PyErr_Clear();
|
||||||
Py_DECREF(k);
|
Py_DECREF(k);
|
||||||
|
@ -2167,7 +2167,7 @@ posix_getcwd(int use_bytes)
|
||||||
return posix_error();
|
return posix_error();
|
||||||
if (use_bytes)
|
if (use_bytes)
|
||||||
return PyBytes_FromStringAndSize(buf, strlen(buf));
|
return PyBytes_FromStringAndSize(buf, strlen(buf));
|
||||||
return PyUnicode_Decode(buf, strlen(buf), Py_FileSystemDefaultEncoding,"utf8b");
|
return PyUnicode_Decode(buf, strlen(buf), Py_FileSystemDefaultEncoding,"surrogateescape");
|
||||||
}
|
}
|
||||||
|
|
||||||
PyDoc_STRVAR(posix_getcwd__doc__,
|
PyDoc_STRVAR(posix_getcwd__doc__,
|
||||||
|
@ -2513,7 +2513,7 @@ posix_listdir(PyObject *self, PyObject *args)
|
||||||
|
|
||||||
w = PyUnicode_FromEncodedObject(v,
|
w = PyUnicode_FromEncodedObject(v,
|
||||||
Py_FileSystemDefaultEncoding,
|
Py_FileSystemDefaultEncoding,
|
||||||
"utf8b");
|
"surrogateescape");
|
||||||
Py_DECREF(v);
|
Py_DECREF(v);
|
||||||
if (w != NULL)
|
if (w != NULL)
|
||||||
v = w;
|
v = w;
|
||||||
|
@ -4695,7 +4695,7 @@ posix_readlink(PyObject *self, PyObject *args)
|
||||||
|
|
||||||
w = PyUnicode_FromEncodedObject(v,
|
w = PyUnicode_FromEncodedObject(v,
|
||||||
Py_FileSystemDefaultEncoding,
|
Py_FileSystemDefaultEncoding,
|
||||||
"utf8b");
|
"surrogateescape");
|
||||||
if (w != NULL) {
|
if (w != NULL) {
|
||||||
Py_DECREF(v);
|
Py_DECREF(v);
|
||||||
v = w;
|
v = w;
|
||||||
|
|
|
@ -42,7 +42,7 @@ char2wchar(char* arg)
|
||||||
return res;
|
return res;
|
||||||
PyMem_Free(res);
|
PyMem_Free(res);
|
||||||
}
|
}
|
||||||
/* Conversion failed. Fall back to escaping with utf8b. */
|
/* Conversion failed. Fall back to escaping with surrogateescape. */
|
||||||
#ifdef HAVE_MBRTOWC
|
#ifdef HAVE_MBRTOWC
|
||||||
/* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */
|
/* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */
|
||||||
|
|
||||||
|
|
|
@ -1549,7 +1549,7 @@ PyUnicode_FSConverter(PyObject* arg, void* addr)
|
||||||
return 0;
|
return 0;
|
||||||
output = PyUnicode_AsEncodedObject(arg,
|
output = PyUnicode_AsEncodedObject(arg,
|
||||||
Py_FileSystemDefaultEncoding,
|
Py_FileSystemDefaultEncoding,
|
||||||
"utf8b");
|
"surrogateescape");
|
||||||
Py_DECREF(arg);
|
Py_DECREF(arg);
|
||||||
if (!output)
|
if (!output)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -830,7 +830,7 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
|
||||||
}
|
}
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
PyCodec_UTF8bErrors(PyObject *exc)
|
PyCodec_SurrogateEscapeErrors(PyObject *exc)
|
||||||
{
|
{
|
||||||
PyObject *restuple;
|
PyObject *restuple;
|
||||||
PyObject *object;
|
PyObject *object;
|
||||||
|
@ -940,9 +940,9 @@ static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
|
||||||
return PyCodec_SurrogatePassErrors(exc);
|
return PyCodec_SurrogatePassErrors(exc);
|
||||||
}
|
}
|
||||||
|
|
||||||
static PyObject *utf8b_errors(PyObject *self, PyObject *exc)
|
static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
|
||||||
{
|
{
|
||||||
return PyCodec_UTF8bErrors(exc);
|
return PyCodec_SurrogateEscapeErrors(exc);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int _PyCodecRegistry_Init(void)
|
static int _PyCodecRegistry_Init(void)
|
||||||
|
@ -1001,10 +1001,10 @@ static int _PyCodecRegistry_Init(void)
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"utf8b",
|
"surrogateescape",
|
||||||
{
|
{
|
||||||
"utf8b",
|
"surrogateescape",
|
||||||
utf8b_errors,
|
surrogateescape_errors,
|
||||||
METH_O
|
METH_O
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue