diff --git a/Doc/c-api/codec.rst b/Doc/c-api/codec.rst index 83252afbb7f..5bb56e37ec5 100644 --- a/Doc/c-api/codec.rst +++ b/Doc/c-api/codec.rst @@ -116,3 +116,8 @@ Registry API for Unicode encoding error handlers Replace the unicode encode error with backslash escapes (``\x``, ``\u`` and ``\U``). +.. c:function:: PyObject* PyCodec_NameReplaceErrors(PyObject *exc) + + Replace the unicode encode error with `\N{...}` escapes. + + .. versionadded: 3.4 diff --git a/Doc/howto/unicode.rst b/Doc/howto/unicode.rst index 50bca5a0c5a..aac2373c0e3 100644 --- a/Doc/howto/unicode.rst +++ b/Doc/howto/unicode.rst @@ -325,8 +325,9 @@ The *errors* parameter is the same as the parameter of the :meth:`~bytes.decode` method but supports a few more possible handlers. As well as ``'strict'``, ``'ignore'``, and ``'replace'`` (which in this case inserts a question mark instead of the unencodable character), there is -also ``'xmlcharrefreplace'`` (inserts an XML character reference) and -``backslashreplace`` (inserts a ``\uNNNN`` escape sequence). +also ``'xmlcharrefreplace'`` (inserts an XML character reference), +``backslashreplace`` (inserts a ``\uNNNN`` escape sequence) and +``namereplace`` (inserts a ``\N{...}`` escape sequence). The following example shows the different results:: @@ -346,6 +347,8 @@ The following example shows the different results:: b'ꀀabcd޴' >>> u.encode('ascii', 'backslashreplace') b'\\ua000abcd\\u07b4' + >>> u.encode('ascii', 'namereplace') + b'\\N{YI SYLLABLE IT}abcd\\u07b4' The low-level routines for registering and accessing the available encodings are found in the :mod:`codecs` module. Implementing new diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst index 4c2a0235707..ea4c450de78 100644 --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -98,6 +98,8 @@ It defines the following functions: reference (for encoding only) * ``'backslashreplace'``: replace with backslashed escape sequences (for encoding only) + * ``'namereplace'``: replace with ``\N{...}`` escape sequences (for + encoding only) * ``'surrogateescape'``: on decoding, replace with code points in the Unicode Private Use Area ranging from U+DC80 to U+DCFF. These private code points will then be turned back into the same bytes when the @@ -232,6 +234,11 @@ functions which use :func:`lookup` for the codec lookup: Implements the ``backslashreplace`` error handling (for encoding only): the unencodable character is replaced by a backslashed escape sequence. +.. function:: namereplace_errors(exception) + + Implements the ``namereplace`` error handling (for encoding only): the + unencodable character is replaced by a ``\N{...}`` escape sequence. + To simplify working with encoded files or stream, the module also defines these utility functions: @@ -363,6 +370,9 @@ and implemented by all standard Python codecs: | ``'backslashreplace'`` | Replace with backslashed escape sequences | | | (only for encoding). | +-------------------------+-----------------------------------------------+ +| ``'namereplace'`` | Replace with ``\N{...}`` escape sequences | +| | (only for encoding). | ++-------------------------+-----------------------------------------------+ | ``'surrogateescape'`` | Replace byte with surrogate U+DCxx, as defined| | | in :pep:`383`. | +-------------------------+-----------------------------------------------+ @@ -384,6 +394,9 @@ schemes: .. versionchanged:: 3.4 The ``'surrogatepass'`` error handlers now works with utf-16\* and utf-32\* codecs. +.. versionadded:: 3.4 + The ``'namereplace'`` error handler. + The set of allowed values can be extended via :meth:`register_error`. @@ -477,6 +490,8 @@ define in order to be compatible with the Python codec registry. * ``'backslashreplace'`` Replace with backslashed escape sequences. + * ``'namereplace'`` Replace with ``\N{...}`` escape sequences. + The *errors* argument will be assigned to an attribute of the same name. Assigning to this attribute makes it possible to switch between different error handling strategies during the lifetime of the :class:`IncrementalEncoder` @@ -625,6 +640,8 @@ compatible with the Python codec registry. * ``'backslashreplace'`` Replace with backslashed escape sequences. + * ``'namereplace'`` Replace with ``\N{...}`` escape sequences. + The *errors* argument will be assigned to an attribute of the same name. Assigning to this attribute makes it possible to switch between different error handling strategies during the lifetime of the :class:`StreamWriter` object. diff --git a/Doc/library/functions.rst b/Doc/library/functions.rst index 9e38d6f7a1a..d1e3407565c 100644 --- a/Doc/library/functions.rst +++ b/Doc/library/functions.rst @@ -975,6 +975,9 @@ are always available. They are listed here in alphabetical order. replaces unsupported characters with Python's backslashed escape sequences. + * ``'namereplace'`` (also only supported when writing) + replaces unsupported characters with ``\N{...}`` escape sequences. + .. index:: single: universal newlines; open() built-in function diff --git a/Doc/library/io.rst b/Doc/library/io.rst index 005428659d3..c77db906810 100644 --- a/Doc/library/io.rst +++ b/Doc/library/io.rst @@ -827,9 +827,10 @@ Text I/O errors can lead to data loss.) ``'replace'`` causes a replacement marker (such as ``'?'``) to be inserted where there is malformed data. When writing, ``'xmlcharrefreplace'`` (replace with the appropriate XML character - reference) or ``'backslashreplace'`` (replace with backslashed escape - sequences) can be used. Any other error handling name that has been - registered with :func:`codecs.register_error` is also valid. + reference), ``'backslashreplace'`` (replace with backslashed escape + sequences) or ``'namereplace'`` (replace with ``\N{...}`` escape sequences) + can be used. Any other error handling name that has been registered with + :func:`codecs.register_error` is also valid. .. index:: single: universal newlines; io.TextIOWrapper class diff --git a/Include/codecs.h b/Include/codecs.h index b3088e4902d..466913540fd 100644 --- a/Include/codecs.h +++ b/Include/codecs.h @@ -225,6 +225,9 @@ PyAPI_FUNC(PyObject *) PyCodec_XMLCharRefReplaceErrors(PyObject *exc); /* replace the unicode encode error with backslash escapes (\x, \u and \U) */ PyAPI_FUNC(PyObject *) PyCodec_BackslashReplaceErrors(PyObject *exc); +/* replace the unicode encode error with backslash escapes (\N, \x, \u and \U) */ +PyAPI_FUNC(PyObject *) PyCodec_NameReplaceErrors(PyObject *exc); + PyAPI_DATA(const char *) Py_hexdigits; #ifdef __cplusplus diff --git a/Lib/codecs.py b/Lib/codecs.py index 993451711af..85df89af63d 100644 --- a/Lib/codecs.py +++ b/Lib/codecs.py @@ -22,6 +22,7 @@ __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE", "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE", "strict_errors", "ignore_errors", "replace_errors", "xmlcharrefreplace_errors", + "backslashreplace_errors", "namereplace_errors", "register_error", "lookup_error"] ### Constants @@ -1085,6 +1086,7 @@ try: replace_errors = lookup_error("replace") xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace") backslashreplace_errors = lookup_error("backslashreplace") + namereplace_errors = lookup_error("namereplace") except LookupError: # In --disable-unicode builds, these error handler are missing strict_errors = None @@ -1092,6 +1094,7 @@ except LookupError: replace_errors = None xmlcharrefreplace_errors = None backslashreplace_errors = None + namereplace_errors = None # Tell modulefinder that using codecs probably needs the encodings # package diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index a1ce9cf78ad..9743791c9b2 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -158,6 +158,22 @@ class CodecCallbackTest(unittest.TestCase): sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff" self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout) + def test_nameescape(self): + # Does the same as backslashescape, but prefers ``\N{...}`` escape + # sequences. + sin = "a\xac\u1234\u20ac\u8000\U0010ffff" + sout = (b'a\\N{NOT SIGN}\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}' + b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff') + self.assertEqual(sin.encode("ascii", "namereplace"), sout) + + sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}' + b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff') + self.assertEqual(sin.encode("latin-1", "namereplace"), sout) + + sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\xa4' + b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff') + self.assertEqual(sin.encode("iso-8859-15", "namereplace"), sout) + def test_decoding_callbacks(self): # This is a test for a decoding callback handler # that allows the decoding of the invalid sequence @@ -297,7 +313,7 @@ class CodecCallbackTest(unittest.TestCase): def test_longstrings(self): # test long strings to check for memory overflow problems errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", - "backslashreplace"] + "backslashreplace", "namereplace"] # register the handlers under different names, # to prevent the codec from recognizing the name for err in errors: @@ -611,6 +627,81 @@ class CodecCallbackTest(unittest.TestCase): ("\\udfff", 1) ) + def test_badandgoodnamereplaceexceptions(self): + # "namereplace" complains about a non-exception passed in + self.assertRaises( + TypeError, + codecs.namereplace_errors, + 42 + ) + # "namereplace" complains about the wrong exception types + self.assertRaises( + TypeError, + codecs.namereplace_errors, + UnicodeError("ouch") + ) + # "namereplace" can only be used for encoding + self.assertRaises( + TypeError, + codecs.namereplace_errors, + UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch") + ) + self.assertRaises( + TypeError, + codecs.namereplace_errors, + UnicodeTranslateError("\u3042", 0, 1, "ouch") + ) + # Use the correct exception + self.assertEqual( + codecs.namereplace_errors( + UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")), + ("\\N{HIRAGANA LETTER A}", 1) + ) + self.assertEqual( + codecs.namereplace_errors( + UnicodeEncodeError("ascii", "\x00", 0, 1, "ouch")), + ("\\x00", 1) + ) + self.assertEqual( + codecs.namereplace_errors( + UnicodeEncodeError("ascii", "\xff", 0, 1, "ouch")), + ("\\N{LATIN SMALL LETTER Y WITH DIAERESIS}", 1) + ) + self.assertEqual( + codecs.namereplace_errors( + UnicodeEncodeError("ascii", "\u0100", 0, 1, "ouch")), + ("\\N{LATIN CAPITAL LETTER A WITH MACRON}", 1) + ) + self.assertEqual( + codecs.namereplace_errors( + UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")), + ("\\uffff", 1) + ) + if SIZEOF_WCHAR_T > 0: + self.assertEqual( + codecs.namereplace_errors( + UnicodeEncodeError("ascii", "\U00010000", + 0, 1, "ouch")), + ("\\N{LINEAR B SYLLABLE B008 A}", 1) + ) + self.assertEqual( + codecs.namereplace_errors( + UnicodeEncodeError("ascii", "\U0010ffff", + 0, 1, "ouch")), + ("\\U0010ffff", 1) + ) + # Lone surrogates (regardless of unicode width) + self.assertEqual( + codecs.namereplace_errors( + UnicodeEncodeError("ascii", "\ud800", 0, 1, "ouch")), + ("\\ud800", 1) + ) + self.assertEqual( + codecs.namereplace_errors( + UnicodeEncodeError("ascii", "\udfff", 0, 1, "ouch")), + ("\\udfff", 1) + ) + def test_badhandlerresults(self): results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) ) encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15") @@ -651,6 +742,10 @@ class CodecCallbackTest(unittest.TestCase): codecs.backslashreplace_errors, codecs.lookup_error("backslashreplace") ) + self.assertEqual( + codecs.namereplace_errors, + codecs.lookup_error("namereplace") + ) def test_unencodablereplacement(self): def unencrepl(exc): @@ -804,7 +899,8 @@ class CodecCallbackTest(unittest.TestCase): class D(dict): def __getitem__(self, key): raise ValueError - for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.posreturn"): + for err in ("strict", "replace", "xmlcharrefreplace", + "backslashreplace", "namereplace", "test.posreturn"): self.assertRaises(UnicodeError, codecs.charmap_encode, "\xff", err, {0xff: None}) self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D()) self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300}) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 7d0eeb66fc2..e03a1db325a 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -349,6 +349,8 @@ class ReadTest(MixInCheckStateHandling): self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding) self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"), "[\\udc80]".encode(self.encoding)) + self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"), + "[\\udc80]".encode(self.encoding)) self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"), "[�]".encode(self.encoding)) self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"), @@ -808,6 +810,7 @@ class CP65001Test(ReadTest, unittest.TestCase): ('\udc80', 'ignore', b''), ('\udc80', 'replace', b'?'), ('\udc80', 'backslashreplace', b'\\udc80'), + ('\udc80', 'namereplace', b'\\udc80'), ('\udc80', 'surrogatepass', b'\xed\xb2\x80'), )) else: @@ -869,6 +872,8 @@ class CP65001Test(ReadTest, unittest.TestCase): self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001") self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"), b'[\\udc80]') + self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"), + b'[\\udc80]') self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"), b'[�]') self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"), @@ -2824,6 +2829,8 @@ class CodePageTest(unittest.TestCase): ('[\xff]', 'replace', b'[y]'), ('[\u20ac]', 'replace', b'[?]'), ('[\xff]', 'backslashreplace', b'[\\xff]'), + ('[\xff]', 'namereplace', + b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'), ('[\xff]', 'xmlcharrefreplace', b'[ÿ]'), ('\udcff', 'strict', None), ('[\udcff]', 'surrogateescape', b'[\xff]'), diff --git a/Misc/NEWS b/Misc/NEWS index bf117c21e1f..2662e6019ff 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -191,6 +191,8 @@ Core and Builtins Library ------- +- Issue #19676: Added the "namereplace" error handler. + - Issue #22788: Add *context* parameter to logging.handlers.HTTPHandler. - Issue #22921: Allow SSLContext to take the *hostname* parameter even if diff --git a/Python/codecs.c b/Python/codecs.c index 151fea7d498..b09ea3a28fe 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -9,6 +9,7 @@ Copyright (c) Corporation for National Research Initiatives. ------------------------------------------------------------------------ */ #include "Python.h" +#include "ucnhash.h" #include const char *Py_hexdigits = "0123456789abcdef"; @@ -933,6 +934,97 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) } } +static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; +static int ucnhash_initialized = 0; + +PyObject *PyCodec_NameReplaceErrors(PyObject *exc) +{ + if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { + PyObject *restuple; + PyObject *object; + Py_ssize_t i; + Py_ssize_t start; + Py_ssize_t end; + PyObject *res; + unsigned char *outp; + int ressize; + Py_UCS4 c; + char buffer[256]; /* NAME_MAXLEN */ + if (PyUnicodeEncodeError_GetStart(exc, &start)) + return NULL; + if (PyUnicodeEncodeError_GetEnd(exc, &end)) + return NULL; + if (!(object = PyUnicodeEncodeError_GetObject(exc))) + return NULL; + if (!ucnhash_initialized) { + /* load the unicode data module */ + ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( + PyUnicodeData_CAPSULE_NAME, 1); + ucnhash_initialized = 1; + } + for (i = start, ressize = 0; i < end; ++i) { + /* object is guaranteed to be "ready" */ + c = PyUnicode_READ_CHAR(object, i); + if (ucnhash_CAPI && + ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { + ressize += 1+1+1+strlen(buffer)+1; + } + else if (c >= 0x10000) { + ressize += 1+1+8; + } + else if (c >= 0x100) { + ressize += 1+1+4; + } + else + ressize += 1+1+2; + } + res = PyUnicode_New(ressize, 127); + if (res==NULL) + return NULL; + for (i = start, outp = PyUnicode_1BYTE_DATA(res); + i < end; ++i) { + c = PyUnicode_READ_CHAR(object, i); + *outp++ = '\\'; + if (ucnhash_CAPI && + ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { + *outp++ = 'N'; + *outp++ = '{'; + strcpy((char *)outp, buffer); + outp += strlen(buffer); + *outp++ = '}'; + continue; + } + if (c >= 0x00010000) { + *outp++ = 'U'; + *outp++ = Py_hexdigits[(c>>28)&0xf]; + *outp++ = Py_hexdigits[(c>>24)&0xf]; + *outp++ = Py_hexdigits[(c>>20)&0xf]; + *outp++ = Py_hexdigits[(c>>16)&0xf]; + *outp++ = Py_hexdigits[(c>>12)&0xf]; + *outp++ = Py_hexdigits[(c>>8)&0xf]; + } + else if (c >= 0x100) { + *outp++ = 'u'; + *outp++ = Py_hexdigits[(c>>12)&0xf]; + *outp++ = Py_hexdigits[(c>>8)&0xf]; + } + else + *outp++ = 'x'; + *outp++ = Py_hexdigits[(c>>4)&0xf]; + *outp++ = Py_hexdigits[c&0xf]; + } + + assert(_PyUnicode_CheckConsistency(res, 1)); + restuple = Py_BuildValue("(Nn)", res, end); + Py_DECREF(object); + return restuple; + } + else { + wrong_exception_type(exc); + return NULL; + } +} + #define ENC_UNKNOWN -1 #define ENC_UTF8 0 #define ENC_UTF16BE 1 @@ -1276,6 +1368,11 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc) return PyCodec_BackslashReplaceErrors(exc); } +static PyObject *namereplace_errors(PyObject *self, PyObject *exc) +{ + return PyCodec_NameReplaceErrors(exc); +} + static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc) { return PyCodec_SurrogatePassErrors(exc); @@ -1345,6 +1442,17 @@ static int _PyCodecRegistry_Init(void) "backslashed escape sequence.") } }, + { + "namereplace", + { + "namereplace_errors", + namereplace_errors, + METH_O, + PyDoc_STR("Implements the 'namereplace' error handling, " + "which replaces an unencodable character with a " + "\\N{...} escape sequence.") + } + }, { "surrogatepass", {