Issue #19676: Added the "namereplace" error handler.
This commit is contained in:
parent
6cecf68c7b
commit
166ebc4e5d
|
@ -116,3 +116,8 @@ Registry API for Unicode encoding error handlers
|
||||||
Replace the unicode encode error with backslash escapes (``\x``, ``\u`` and
|
Replace the unicode encode error with backslash escapes (``\x``, ``\u`` and
|
||||||
``\U``).
|
``\U``).
|
||||||
|
|
||||||
|
.. c:function:: PyObject* PyCodec_NameReplaceErrors(PyObject *exc)
|
||||||
|
|
||||||
|
Replace the unicode encode error with ``\N{...}`` escapes.
|
||||||
|
|
||||||
|
.. versionadded:: 3.5
|
||||||
|
|
|
@ -325,8 +325,9 @@ The *errors* parameter is the same as the parameter of the
|
||||||
:meth:`~bytes.decode` method but supports a few more possible handlers. As well as
|
:meth:`~bytes.decode` method but supports a few more possible handlers. As well as
|
||||||
``'strict'``, ``'ignore'``, and ``'replace'`` (which in this case
|
``'strict'``, ``'ignore'``, and ``'replace'`` (which in this case
|
||||||
inserts a question mark instead of the unencodable character), there is
|
inserts a question mark instead of the unencodable character), there is
|
||||||
also ``'xmlcharrefreplace'`` (inserts an XML character reference) and
|
also ``'xmlcharrefreplace'`` (inserts an XML character reference),
|
||||||
``backslashreplace`` (inserts a ``\uNNNN`` escape sequence).
|
``backslashreplace`` (inserts a ``\uNNNN`` escape sequence) and
|
||||||
|
``namereplace`` (inserts a ``\N{...}`` escape sequence).
|
||||||
|
|
||||||
The following example shows the different results::
|
The following example shows the different results::
|
||||||
|
|
||||||
|
@ -346,6 +347,8 @@ The following example shows the different results::
|
||||||
b'ꀀabcd޴'
|
b'ꀀabcd޴'
|
||||||
>>> u.encode('ascii', 'backslashreplace')
|
>>> u.encode('ascii', 'backslashreplace')
|
||||||
b'\\ua000abcd\\u07b4'
|
b'\\ua000abcd\\u07b4'
|
||||||
|
>>> u.encode('ascii', 'namereplace')
|
||||||
|
b'\\N{YI SYLLABLE IT}abcd\\u07b4'
|
||||||
|
|
||||||
The low-level routines for registering and accessing the available
|
The low-level routines for registering and accessing the available
|
||||||
encodings are found in the :mod:`codecs` module. Implementing new
|
encodings are found in the :mod:`codecs` module. Implementing new
|
||||||
|
|
|
@ -98,6 +98,8 @@ It defines the following functions:
|
||||||
reference (for encoding only)
|
reference (for encoding only)
|
||||||
* ``'backslashreplace'``: replace with backslashed escape sequences (for
|
* ``'backslashreplace'``: replace with backslashed escape sequences (for
|
||||||
encoding only)
|
encoding only)
|
||||||
|
* ``'namereplace'``: replace with ``\N{...}`` escape sequences (for
|
||||||
|
encoding only)
|
||||||
* ``'surrogateescape'``: on decoding, replace with code points in the Unicode
|
* ``'surrogateescape'``: on decoding, replace with code points in the Unicode
|
||||||
Private Use Area ranging from U+DC80 to U+DCFF. These private code
|
Private Use Area ranging from U+DC80 to U+DCFF. These private code
|
||||||
points will then be turned back into the same bytes when the
|
points will then be turned back into the same bytes when the
|
||||||
|
@ -232,6 +234,11 @@ functions which use :func:`lookup` for the codec lookup:
|
||||||
Implements the ``backslashreplace`` error handling (for encoding only): the
|
Implements the ``backslashreplace`` error handling (for encoding only): the
|
||||||
unencodable character is replaced by a backslashed escape sequence.
|
unencodable character is replaced by a backslashed escape sequence.
|
||||||
|
|
||||||
|
.. function:: namereplace_errors(exception)
|
||||||
|
|
||||||
|
Implements the ``namereplace`` error handling (for encoding only): the
|
||||||
|
unencodable character is replaced by a ``\N{...}`` escape sequence.
|
||||||
|
|
||||||
To simplify working with encoded files or streams, the module also defines these
|
To simplify working with encoded files or streams, the module also defines these
|
||||||
utility functions:
|
utility functions:
|
||||||
|
|
||||||
|
@ -363,6 +370,9 @@ and implemented by all standard Python codecs:
|
||||||
| ``'backslashreplace'`` | Replace with backslashed escape sequences |
|
| ``'backslashreplace'`` | Replace with backslashed escape sequences |
|
||||||
| | (only for encoding). |
|
| | (only for encoding). |
|
||||||
+-------------------------+-----------------------------------------------+
|
+-------------------------+-----------------------------------------------+
|
||||||
|
| ``'namereplace'`` | Replace with ``\N{...}`` escape sequences |
|
||||||
|
| | (only for encoding). |
|
||||||
|
+-------------------------+-----------------------------------------------+
|
||||||
| ``'surrogateescape'`` | Replace byte with surrogate U+DCxx, as defined|
|
| ``'surrogateescape'`` | Replace byte with surrogate U+DCxx, as defined|
|
||||||
| | in :pep:`383`. |
|
| | in :pep:`383`. |
|
||||||
+-------------------------+-----------------------------------------------+
|
+-------------------------+-----------------------------------------------+
|
||||||
|
@ -384,6 +394,9 @@ schemes:
|
||||||
.. versionchanged:: 3.4
|
.. versionchanged:: 3.4
|
||||||
The ``'surrogatepass'`` error handler now works with utf-16\* and utf-32\* codecs.
|
The ``'surrogatepass'`` error handler now works with utf-16\* and utf-32\* codecs.
|
||||||
|
|
||||||
|
.. versionadded:: 3.5
|
||||||
|
The ``'namereplace'`` error handler.
|
||||||
|
|
||||||
The set of allowed values can be extended via :meth:`register_error`.
|
The set of allowed values can be extended via :meth:`register_error`.
|
||||||
|
|
||||||
|
|
||||||
|
@ -477,6 +490,8 @@ define in order to be compatible with the Python codec registry.
|
||||||
|
|
||||||
* ``'backslashreplace'`` Replace with backslashed escape sequences.
|
* ``'backslashreplace'`` Replace with backslashed escape sequences.
|
||||||
|
|
||||||
|
* ``'namereplace'`` Replace with ``\N{...}`` escape sequences.
|
||||||
|
|
||||||
The *errors* argument will be assigned to an attribute of the same name.
|
The *errors* argument will be assigned to an attribute of the same name.
|
||||||
Assigning to this attribute makes it possible to switch between different error
|
Assigning to this attribute makes it possible to switch between different error
|
||||||
handling strategies during the lifetime of the :class:`IncrementalEncoder`
|
handling strategies during the lifetime of the :class:`IncrementalEncoder`
|
||||||
|
@ -625,6 +640,8 @@ compatible with the Python codec registry.
|
||||||
|
|
||||||
* ``'backslashreplace'`` Replace with backslashed escape sequences.
|
* ``'backslashreplace'`` Replace with backslashed escape sequences.
|
||||||
|
|
||||||
|
* ``'namereplace'`` Replace with ``\N{...}`` escape sequences.
|
||||||
|
|
||||||
The *errors* argument will be assigned to an attribute of the same name.
|
The *errors* argument will be assigned to an attribute of the same name.
|
||||||
Assigning to this attribute makes it possible to switch between different error
|
Assigning to this attribute makes it possible to switch between different error
|
||||||
handling strategies during the lifetime of the :class:`StreamWriter` object.
|
handling strategies during the lifetime of the :class:`StreamWriter` object.
|
||||||
|
|
|
@ -975,6 +975,9 @@ are always available. They are listed here in alphabetical order.
|
||||||
replaces unsupported characters with Python's backslashed escape
|
replaces unsupported characters with Python's backslashed escape
|
||||||
sequences.
|
sequences.
|
||||||
|
|
||||||
|
* ``'namereplace'`` (also only supported when writing)
|
||||||
|
replaces unsupported characters with ``\N{...}`` escape sequences.
|
||||||
|
|
||||||
.. index::
|
.. index::
|
||||||
single: universal newlines; open() built-in function
|
single: universal newlines; open() built-in function
|
||||||
|
|
||||||
|
|
|
@ -827,9 +827,10 @@ Text I/O
|
||||||
errors can lead to data loss.) ``'replace'`` causes a replacement marker
|
errors can lead to data loss.) ``'replace'`` causes a replacement marker
|
||||||
(such as ``'?'``) to be inserted where there is malformed data. When
|
(such as ``'?'``) to be inserted where there is malformed data. When
|
||||||
writing, ``'xmlcharrefreplace'`` (replace with the appropriate XML character
|
writing, ``'xmlcharrefreplace'`` (replace with the appropriate XML character
|
||||||
reference) or ``'backslashreplace'`` (replace with backslashed escape
|
reference), ``'backslashreplace'`` (replace with backslashed escape
|
||||||
sequences) can be used. Any other error handling name that has been
|
sequences) or ``'namereplace'`` (replace with ``\N{...}`` escape sequences)
|
||||||
registered with :func:`codecs.register_error` is also valid.
|
can be used. Any other error handling name that has been registered with
|
||||||
|
:func:`codecs.register_error` is also valid.
|
||||||
|
|
||||||
.. index::
|
.. index::
|
||||||
single: universal newlines; io.TextIOWrapper class
|
single: universal newlines; io.TextIOWrapper class
|
||||||
|
|
|
@ -225,6 +225,9 @@ PyAPI_FUNC(PyObject *) PyCodec_XMLCharRefReplaceErrors(PyObject *exc);
|
||||||
/* replace the unicode encode error with backslash escapes (\x, \u and \U) */
|
/* replace the unicode encode error with backslash escapes (\x, \u and \U) */
|
||||||
PyAPI_FUNC(PyObject *) PyCodec_BackslashReplaceErrors(PyObject *exc);
|
PyAPI_FUNC(PyObject *) PyCodec_BackslashReplaceErrors(PyObject *exc);
|
||||||
|
|
||||||
|
/* replace the unicode encode error with backslash escapes (\N, \x, \u and \U) */
|
||||||
|
PyAPI_FUNC(PyObject *) PyCodec_NameReplaceErrors(PyObject *exc);
|
||||||
|
|
||||||
PyAPI_DATA(const char *) Py_hexdigits;
|
PyAPI_DATA(const char *) Py_hexdigits;
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|
|
@ -22,6 +22,7 @@ __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
|
||||||
"BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
|
"BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
|
||||||
"strict_errors", "ignore_errors", "replace_errors",
|
"strict_errors", "ignore_errors", "replace_errors",
|
||||||
"xmlcharrefreplace_errors",
|
"xmlcharrefreplace_errors",
|
||||||
|
"backslashreplace_errors", "namereplace_errors",
|
||||||
"register_error", "lookup_error"]
|
"register_error", "lookup_error"]
|
||||||
|
|
||||||
### Constants
|
### Constants
|
||||||
|
@ -1085,6 +1086,7 @@ try:
|
||||||
replace_errors = lookup_error("replace")
|
replace_errors = lookup_error("replace")
|
||||||
xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
|
xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
|
||||||
backslashreplace_errors = lookup_error("backslashreplace")
|
backslashreplace_errors = lookup_error("backslashreplace")
|
||||||
|
namereplace_errors = lookup_error("namereplace")
|
||||||
except LookupError:
|
except LookupError:
|
||||||
# In --disable-unicode builds, these error handlers are missing
|
# In --disable-unicode builds, these error handlers are missing
|
||||||
strict_errors = None
|
strict_errors = None
|
||||||
|
@ -1092,6 +1094,7 @@ except LookupError:
|
||||||
replace_errors = None
|
replace_errors = None
|
||||||
xmlcharrefreplace_errors = None
|
xmlcharrefreplace_errors = None
|
||||||
backslashreplace_errors = None
|
backslashreplace_errors = None
|
||||||
|
namereplace_errors = None
|
||||||
|
|
||||||
# Tell modulefinder that using codecs probably needs the encodings
|
# Tell modulefinder that using codecs probably needs the encodings
|
||||||
# package
|
# package
|
||||||
|
|
|
@ -158,6 +158,22 @@ class CodecCallbackTest(unittest.TestCase):
|
||||||
sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff"
|
sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff"
|
||||||
self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
|
self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
|
||||||
|
|
||||||
|
def test_nameescape(self):
|
||||||
|
# Does the same as backslashescape, but prefers ``\N{...}`` escape
|
||||||
|
# sequences.
|
||||||
|
sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
|
||||||
|
sout = (b'a\\N{NOT SIGN}\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
|
||||||
|
b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
|
||||||
|
self.assertEqual(sin.encode("ascii", "namereplace"), sout)
|
||||||
|
|
||||||
|
sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
|
||||||
|
b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
|
||||||
|
self.assertEqual(sin.encode("latin-1", "namereplace"), sout)
|
||||||
|
|
||||||
|
sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\xa4'
|
||||||
|
b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
|
||||||
|
self.assertEqual(sin.encode("iso-8859-15", "namereplace"), sout)
|
||||||
|
|
||||||
def test_decoding_callbacks(self):
|
def test_decoding_callbacks(self):
|
||||||
# This is a test for a decoding callback handler
|
# This is a test for a decoding callback handler
|
||||||
# that allows the decoding of the invalid sequence
|
# that allows the decoding of the invalid sequence
|
||||||
|
@ -297,7 +313,7 @@ class CodecCallbackTest(unittest.TestCase):
|
||||||
def test_longstrings(self):
|
def test_longstrings(self):
|
||||||
# test long strings to check for memory overflow problems
|
# test long strings to check for memory overflow problems
|
||||||
errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
|
errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
|
||||||
"backslashreplace"]
|
"backslashreplace", "namereplace"]
|
||||||
# register the handlers under different names,
|
# register the handlers under different names,
|
||||||
# to prevent the codec from recognizing the name
|
# to prevent the codec from recognizing the name
|
||||||
for err in errors:
|
for err in errors:
|
||||||
|
@ -611,6 +627,81 @@ class CodecCallbackTest(unittest.TestCase):
|
||||||
("\\udfff", 1)
|
("\\udfff", 1)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_badandgoodnamereplaceexceptions(self):
    handler = codecs.namereplace_errors

    # Arguments the handler must reject with TypeError:
    #  - a non-exception object,
    #  - an exception of the wrong type,
    #  - decode/translate errors ("namereplace" is for encoding only).
    rejected_arguments = (
        42,
        UnicodeError("ouch"),
        UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch"),
        UnicodeTranslateError("\u3042", 0, 1, "ouch"),
    )
    for bad in rejected_arguments:
        self.assertRaises(TypeError, handler, bad)

    def replacement_for(char):
        # Run the handler on a one-character UnicodeEncodeError.
        return handler(UnicodeEncodeError("ascii", char, 0, 1, "ouch"))

    # Use the correct exception
    accepted = (
        ("\u3042", ("\\N{HIRAGANA LETTER A}", 1)),
        ("\x00", ("\\x00", 1)),
        ("\xff", ("\\N{LATIN SMALL LETTER Y WITH DIAERESIS}", 1)),
        ("\u0100", ("\\N{LATIN CAPITAL LETTER A WITH MACRON}", 1)),
        ("\uffff", ("\\uffff", 1)),
    )
    for char, expected in accepted:
        self.assertEqual(replacement_for(char), expected)

    if SIZEOF_WCHAR_T > 0:
        non_bmp = (
            ("\U00010000", ("\\N{LINEAR B SYLLABLE B008 A}", 1)),
            ("\U0010ffff", ("\\U0010ffff", 1)),
        )
        for char, expected in non_bmp:
            self.assertEqual(replacement_for(char), expected)

    # Lone surrogates (regardless of unicode width)
    lone_surrogates = (
        ("\ud800", ("\\ud800", 1)),
        ("\udfff", ("\\udfff", 1)),
    )
    for char, expected in lone_surrogates:
        self.assertEqual(replacement_for(char), expected)
|
||||||
|
|
||||||
def test_badhandlerresults(self):
|
def test_badhandlerresults(self):
|
||||||
results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
|
results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
|
||||||
encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
|
encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
|
||||||
|
@ -651,6 +742,10 @@ class CodecCallbackTest(unittest.TestCase):
|
||||||
codecs.backslashreplace_errors,
|
codecs.backslashreplace_errors,
|
||||||
codecs.lookup_error("backslashreplace")
|
codecs.lookup_error("backslashreplace")
|
||||||
)
|
)
|
||||||
|
self.assertEqual(
|
||||||
|
codecs.namereplace_errors,
|
||||||
|
codecs.lookup_error("namereplace")
|
||||||
|
)
|
||||||
|
|
||||||
def test_unencodablereplacement(self):
|
def test_unencodablereplacement(self):
|
||||||
def unencrepl(exc):
|
def unencrepl(exc):
|
||||||
|
@ -804,7 +899,8 @@ class CodecCallbackTest(unittest.TestCase):
|
||||||
class D(dict):
|
class D(dict):
|
||||||
def __getitem__(self, key):
|
def __getitem__(self, key):
|
||||||
raise ValueError
|
raise ValueError
|
||||||
for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.posreturn"):
|
for err in ("strict", "replace", "xmlcharrefreplace",
|
||||||
|
"backslashreplace", "namereplace", "test.posreturn"):
|
||||||
self.assertRaises(UnicodeError, codecs.charmap_encode, "\xff", err, {0xff: None})
|
self.assertRaises(UnicodeError, codecs.charmap_encode, "\xff", err, {0xff: None})
|
||||||
self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D())
|
self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D())
|
||||||
self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300})
|
self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300})
|
||||||
|
|
|
@ -349,6 +349,8 @@ class ReadTest(MixInCheckStateHandling):
|
||||||
self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
|
self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
|
||||||
self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
|
self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
|
||||||
"[\\udc80]".encode(self.encoding))
|
"[\\udc80]".encode(self.encoding))
|
||||||
|
self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
|
||||||
|
"[\\udc80]".encode(self.encoding))
|
||||||
self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
|
self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
|
||||||
"[�]".encode(self.encoding))
|
"[�]".encode(self.encoding))
|
||||||
self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
|
self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
|
||||||
|
@ -808,6 +810,7 @@ class CP65001Test(ReadTest, unittest.TestCase):
|
||||||
('\udc80', 'ignore', b''),
|
('\udc80', 'ignore', b''),
|
||||||
('\udc80', 'replace', b'?'),
|
('\udc80', 'replace', b'?'),
|
||||||
('\udc80', 'backslashreplace', b'\\udc80'),
|
('\udc80', 'backslashreplace', b'\\udc80'),
|
||||||
|
('\udc80', 'namereplace', b'\\udc80'),
|
||||||
('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
|
('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
|
||||||
))
|
))
|
||||||
else:
|
else:
|
||||||
|
@ -869,6 +872,8 @@ class CP65001Test(ReadTest, unittest.TestCase):
|
||||||
self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
|
self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
|
||||||
self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
|
self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
|
||||||
b'[\\udc80]')
|
b'[\\udc80]')
|
||||||
|
self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
|
||||||
|
b'[\\udc80]')
|
||||||
self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
|
self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
|
||||||
b'[�]')
|
b'[�]')
|
||||||
self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
|
self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
|
||||||
|
@ -2824,6 +2829,8 @@ class CodePageTest(unittest.TestCase):
|
||||||
('[\xff]', 'replace', b'[y]'),
|
('[\xff]', 'replace', b'[y]'),
|
||||||
('[\u20ac]', 'replace', b'[?]'),
|
('[\u20ac]', 'replace', b'[?]'),
|
||||||
('[\xff]', 'backslashreplace', b'[\\xff]'),
|
('[\xff]', 'backslashreplace', b'[\\xff]'),
|
||||||
|
('[\xff]', 'namereplace',
|
||||||
|
b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
|
||||||
('[\xff]', 'xmlcharrefreplace', b'[ÿ]'),
|
('[\xff]', 'xmlcharrefreplace', b'[ÿ]'),
|
||||||
('\udcff', 'strict', None),
|
('\udcff', 'strict', None),
|
||||||
('[\udcff]', 'surrogateescape', b'[\xff]'),
|
('[\udcff]', 'surrogateescape', b'[\xff]'),
|
||||||
|
|
|
@ -191,6 +191,8 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #19676: Added the "namereplace" error handler.
|
||||||
|
|
||||||
- Issue #22788: Add *context* parameter to logging.handlers.HTTPHandler.
|
- Issue #22788: Add *context* parameter to logging.handlers.HTTPHandler.
|
||||||
|
|
||||||
- Issue #22921: Allow SSLContext to take the *hostname* parameter even if
|
- Issue #22921: Allow SSLContext to take the *hostname* parameter even if
|
||||||
|
|
108
Python/codecs.c
108
Python/codecs.c
|
@ -9,6 +9,7 @@ Copyright (c) Corporation for National Research Initiatives.
|
||||||
------------------------------------------------------------------------ */
|
------------------------------------------------------------------------ */
|
||||||
|
|
||||||
#include "Python.h"
|
#include "Python.h"
|
||||||
|
#include "ucnhash.h"
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
|
|
||||||
const char *Py_hexdigits = "0123456789abcdef";
|
const char *Py_hexdigits = "0123456789abcdef";
|
||||||
|
@ -933,6 +934,97 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
|
||||||
|
static int ucnhash_initialized = 0;
|
||||||
|
|
||||||
|
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
|
||||||
|
{
|
||||||
|
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
|
||||||
|
PyObject *restuple;
|
||||||
|
PyObject *object;
|
||||||
|
Py_ssize_t i;
|
||||||
|
Py_ssize_t start;
|
||||||
|
Py_ssize_t end;
|
||||||
|
PyObject *res;
|
||||||
|
unsigned char *outp;
|
||||||
|
int ressize;
|
||||||
|
Py_UCS4 c;
|
||||||
|
char buffer[256]; /* NAME_MAXLEN */
|
||||||
|
if (PyUnicodeEncodeError_GetStart(exc, &start))
|
||||||
|
return NULL;
|
||||||
|
if (PyUnicodeEncodeError_GetEnd(exc, &end))
|
||||||
|
return NULL;
|
||||||
|
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
|
||||||
|
return NULL;
|
||||||
|
if (!ucnhash_initialized) {
|
||||||
|
/* load the unicode data module */
|
||||||
|
ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
|
||||||
|
PyUnicodeData_CAPSULE_NAME, 1);
|
||||||
|
ucnhash_initialized = 1;
|
||||||
|
}
|
||||||
|
for (i = start, ressize = 0; i < end; ++i) {
|
||||||
|
/* object is guaranteed to be "ready" */
|
||||||
|
c = PyUnicode_READ_CHAR(object, i);
|
||||||
|
if (ucnhash_CAPI &&
|
||||||
|
ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
|
||||||
|
ressize += 1+1+1+strlen(buffer)+1;
|
||||||
|
}
|
||||||
|
else if (c >= 0x10000) {
|
||||||
|
ressize += 1+1+8;
|
||||||
|
}
|
||||||
|
else if (c >= 0x100) {
|
||||||
|
ressize += 1+1+4;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
ressize += 1+1+2;
|
||||||
|
}
|
||||||
|
res = PyUnicode_New(ressize, 127);
|
||||||
|
if (res==NULL)
|
||||||
|
return NULL;
|
||||||
|
for (i = start, outp = PyUnicode_1BYTE_DATA(res);
|
||||||
|
i < end; ++i) {
|
||||||
|
c = PyUnicode_READ_CHAR(object, i);
|
||||||
|
*outp++ = '\\';
|
||||||
|
if (ucnhash_CAPI &&
|
||||||
|
ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
|
||||||
|
*outp++ = 'N';
|
||||||
|
*outp++ = '{';
|
||||||
|
strcpy((char *)outp, buffer);
|
||||||
|
outp += strlen(buffer);
|
||||||
|
*outp++ = '}';
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (c >= 0x00010000) {
|
||||||
|
*outp++ = 'U';
|
||||||
|
*outp++ = Py_hexdigits[(c>>28)&0xf];
|
||||||
|
*outp++ = Py_hexdigits[(c>>24)&0xf];
|
||||||
|
*outp++ = Py_hexdigits[(c>>20)&0xf];
|
||||||
|
*outp++ = Py_hexdigits[(c>>16)&0xf];
|
||||||
|
*outp++ = Py_hexdigits[(c>>12)&0xf];
|
||||||
|
*outp++ = Py_hexdigits[(c>>8)&0xf];
|
||||||
|
}
|
||||||
|
else if (c >= 0x100) {
|
||||||
|
*outp++ = 'u';
|
||||||
|
*outp++ = Py_hexdigits[(c>>12)&0xf];
|
||||||
|
*outp++ = Py_hexdigits[(c>>8)&0xf];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
*outp++ = 'x';
|
||||||
|
*outp++ = Py_hexdigits[(c>>4)&0xf];
|
||||||
|
*outp++ = Py_hexdigits[c&0xf];
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(_PyUnicode_CheckConsistency(res, 1));
|
||||||
|
restuple = Py_BuildValue("(Nn)", res, end);
|
||||||
|
Py_DECREF(object);
|
||||||
|
return restuple;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
wrong_exception_type(exc);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#define ENC_UNKNOWN -1
|
#define ENC_UNKNOWN -1
|
||||||
#define ENC_UTF8 0
|
#define ENC_UTF8 0
|
||||||
#define ENC_UTF16BE 1
|
#define ENC_UTF16BE 1
|
||||||
|
@ -1276,6 +1368,11 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
|
||||||
return PyCodec_BackslashReplaceErrors(exc);
|
return PyCodec_BackslashReplaceErrors(exc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Method-table wrapper for the "namereplace" handler: forwards the exception
   argument to PyCodec_NameReplaceErrors(); self is not used. */
static PyObject *
namereplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_NameReplaceErrors(exc);

    return result;
}
|
||||||
|
|
||||||
static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
|
static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
|
||||||
{
|
{
|
||||||
return PyCodec_SurrogatePassErrors(exc);
|
return PyCodec_SurrogatePassErrors(exc);
|
||||||
|
@ -1345,6 +1442,17 @@ static int _PyCodecRegistry_Init(void)
|
||||||
"backslashed escape sequence.")
|
"backslashed escape sequence.")
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"namereplace",
|
||||||
|
{
|
||||||
|
"namereplace_errors",
|
||||||
|
namereplace_errors,
|
||||||
|
METH_O,
|
||||||
|
PyDoc_STR("Implements the 'namereplace' error handling, "
|
||||||
|
"which replaces an unencodable character with a "
|
||||||
|
"\\N{...} escape sequence.")
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"surrogatepass",
|
"surrogatepass",
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in New Issue