Issue #12892: The utf-16* and utf-32* codecs now reject (lone) surrogates.
The utf-16* and utf-32* encoders no longer allow surrogate code points (U+D800-U+DFFF) to be encoded. The utf-32* decoders no longer decode byte sequences that correspond to surrogate code points. The surrogatepass error handler now works with the utf-16* and utf-32* codecs. Based on patches by Victor Stinner and Kang-Hao (Kenny) Lu.
This commit is contained in:
parent
a938bcfe95
commit
58cf607d13
|
@ -365,18 +365,23 @@ and implemented by all standard Python codecs:
|
||||||
| | in :pep:`383`. |
|
| | in :pep:`383`. |
|
||||||
+-------------------------+-----------------------------------------------+
|
+-------------------------+-----------------------------------------------+
|
||||||
|
|
||||||
In addition, the following error handlers are specific to a single codec:
|
In addition, the following error handlers are specific to Unicode encoding
|
||||||
|
schemes:
|
||||||
|
|
||||||
+-------------------+---------+-------------------------------------------+
|
+-------------------+------------------------+-------------------------------------------+
|
||||||
| Value | Codec | Meaning |
|
| Value | Codec | Meaning |
|
||||||
+===================+=========+===========================================+
|
+===================+========================+===========================================+
|
||||||
|``'surrogatepass'``| utf-8 | Allow encoding and decoding of surrogate |
|
|``'surrogatepass'``| utf-8, utf-16, utf-32, | Allow encoding and decoding of surrogate |
|
||||||
| | | codes in UTF-8. |
|
| | utf-16-be, utf-16-le, | codes in all the Unicode encoding schemes.|
|
||||||
+-------------------+---------+-------------------------------------------+
|
| | utf-32-be, utf-32-le | |
|
||||||
|
+-------------------+------------------------+-------------------------------------------+
|
||||||
|
|
||||||
.. versionadded:: 3.1
|
.. versionadded:: 3.1
|
||||||
The ``'surrogateescape'`` and ``'surrogatepass'`` error handlers.
|
The ``'surrogateescape'`` and ``'surrogatepass'`` error handlers.
|
||||||
|
|
||||||
|
.. versionchanged:: 3.4
|
||||||
|
The ``'surrogatepass'`` error handler now works with utf-16\* and utf-32\* codecs.
|
||||||
|
|
||||||
The set of allowed values can be extended via :meth:`register_error`.
|
The set of allowed values can be extended via :meth:`register_error`.
|
||||||
|
|
||||||
|
|
||||||
|
@ -1167,6 +1172,12 @@ particular, the following variants typically exist:
|
||||||
| utf_8_sig | | all languages |
|
| utf_8_sig | | all languages |
|
||||||
+-----------------+--------------------------------+--------------------------------+
|
+-----------------+--------------------------------+--------------------------------+
|
||||||
|
|
||||||
|
.. versionchanged:: 3.4
|
||||||
|
The utf-16\* and utf-32\* encoders no longer allow surrogate code points
|
||||||
|
(U+D800--U+DFFF) to be encoded. The utf-32\* decoders no longer decode
|
||||||
|
byte sequences that correspond to surrogate code points.
|
||||||
|
|
||||||
|
|
||||||
Python Specific Encodings
|
Python Specific Encodings
|
||||||
-------------------------
|
-------------------------
|
||||||
|
|
||||||
|
|
|
@ -253,6 +253,13 @@ Some smaller changes made to the core Python language are:
|
||||||
``__main__.__file__`` when a script has been executed directly using
|
``__main__.__file__`` when a script has been executed directly using
|
||||||
a relative path (Contributed by Brett Cannon in :issue:`18416`).
|
a relative path (Contributed by Brett Cannon in :issue:`18416`).
|
||||||
|
|
||||||
|
* Now all the UTF-\* codecs (except UTF-7) reject surrogates during both
|
||||||
|
encoding and decoding unless the ``surrogatepass`` error handler is used,
|
||||||
|
with the exception of the UTF-16 decoder that accepts valid surrogate pairs,
|
||||||
|
and the UTF-16 encoder that produces them while encoding non-BMP characters.
|
||||||
|
Contributed by Victor Stinner, Kang-Hao (Kenny) Lu and Serhiy Storchaka in
|
||||||
|
:issue:`12892`.
|
||||||
|
|
||||||
|
|
||||||
New Modules
|
New Modules
|
||||||
===========
|
===========
|
||||||
|
|
|
@ -300,8 +300,46 @@ class ReadTest(MixInCheckStateHandling):
|
||||||
self.assertEqual(reader.readline(), s5)
|
self.assertEqual(reader.readline(), s5)
|
||||||
self.assertEqual(reader.readline(), "")
|
self.assertEqual(reader.readline(), "")
|
||||||
|
|
||||||
|
ill_formed_sequence_replace = "\ufffd"
|
||||||
|
|
||||||
|
def test_lone_surrogates(self):
|
||||||
|
self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
|
||||||
|
self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
|
||||||
|
"[\\udc80]".encode(self.encoding))
|
||||||
|
self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
|
||||||
|
"[&#56448;]".encode(self.encoding))
|
||||||
|
self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
|
||||||
|
"[]".encode(self.encoding))
|
||||||
|
self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
|
||||||
|
"[?]".encode(self.encoding))
|
||||||
|
|
||||||
|
bom = "".encode(self.encoding)
|
||||||
|
for before, after in [("\U00010fff", "A"), ("[", "]"),
|
||||||
|
("A", "\U00010fff")]:
|
||||||
|
before_sequence = before.encode(self.encoding)[len(bom):]
|
||||||
|
after_sequence = after.encode(self.encoding)[len(bom):]
|
||||||
|
test_string = before + "\uDC80" + after
|
||||||
|
test_sequence = (bom + before_sequence +
|
||||||
|
self.ill_formed_sequence + after_sequence)
|
||||||
|
self.assertRaises(UnicodeDecodeError, test_sequence.decode,
|
||||||
|
self.encoding)
|
||||||
|
self.assertEqual(test_string.encode(self.encoding,
|
||||||
|
"surrogatepass"),
|
||||||
|
test_sequence)
|
||||||
|
self.assertEqual(test_sequence.decode(self.encoding,
|
||||||
|
"surrogatepass"),
|
||||||
|
test_string)
|
||||||
|
self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
|
||||||
|
before + after)
|
||||||
|
self.assertEqual(test_sequence.decode(self.encoding, "replace"),
|
||||||
|
before + self.ill_formed_sequence_replace + after)
|
||||||
|
|
||||||
class UTF32Test(ReadTest, unittest.TestCase):
|
class UTF32Test(ReadTest, unittest.TestCase):
|
||||||
encoding = "utf-32"
|
encoding = "utf-32"
|
||||||
|
if sys.byteorder == 'little':
|
||||||
|
ill_formed_sequence = b"\x80\xdc\x00\x00"
|
||||||
|
else:
|
||||||
|
ill_formed_sequence = b"\x00\x00\xdc\x80"
|
||||||
|
|
||||||
spamle = (b'\xff\xfe\x00\x00'
|
spamle = (b'\xff\xfe\x00\x00'
|
||||||
b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
|
b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
|
||||||
|
@ -393,6 +431,7 @@ class UTF32Test(ReadTest, unittest.TestCase):
|
||||||
|
|
||||||
class UTF32LETest(ReadTest, unittest.TestCase):
|
class UTF32LETest(ReadTest, unittest.TestCase):
|
||||||
encoding = "utf-32-le"
|
encoding = "utf-32-le"
|
||||||
|
ill_formed_sequence = b"\x80\xdc\x00\x00"
|
||||||
|
|
||||||
def test_partial(self):
|
def test_partial(self):
|
||||||
self.check_partial(
|
self.check_partial(
|
||||||
|
@ -437,6 +476,7 @@ class UTF32LETest(ReadTest, unittest.TestCase):
|
||||||
|
|
||||||
class UTF32BETest(ReadTest, unittest.TestCase):
|
class UTF32BETest(ReadTest, unittest.TestCase):
|
||||||
encoding = "utf-32-be"
|
encoding = "utf-32-be"
|
||||||
|
ill_formed_sequence = b"\x00\x00\xdc\x80"
|
||||||
|
|
||||||
def test_partial(self):
|
def test_partial(self):
|
||||||
self.check_partial(
|
self.check_partial(
|
||||||
|
@ -482,6 +522,10 @@ class UTF32BETest(ReadTest, unittest.TestCase):
|
||||||
|
|
||||||
class UTF16Test(ReadTest, unittest.TestCase):
|
class UTF16Test(ReadTest, unittest.TestCase):
|
||||||
encoding = "utf-16"
|
encoding = "utf-16"
|
||||||
|
if sys.byteorder == 'little':
|
||||||
|
ill_formed_sequence = b"\x80\xdc"
|
||||||
|
else:
|
||||||
|
ill_formed_sequence = b"\xdc\x80"
|
||||||
|
|
||||||
spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
|
spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
|
||||||
spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
|
spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
|
||||||
|
@ -562,6 +606,7 @@ class UTF16Test(ReadTest, unittest.TestCase):
|
||||||
|
|
||||||
class UTF16LETest(ReadTest, unittest.TestCase):
|
class UTF16LETest(ReadTest, unittest.TestCase):
|
||||||
encoding = "utf-16-le"
|
encoding = "utf-16-le"
|
||||||
|
ill_formed_sequence = b"\x80\xdc"
|
||||||
|
|
||||||
def test_partial(self):
|
def test_partial(self):
|
||||||
self.check_partial(
|
self.check_partial(
|
||||||
|
@ -605,6 +650,7 @@ class UTF16LETest(ReadTest, unittest.TestCase):
|
||||||
|
|
||||||
class UTF16BETest(ReadTest, unittest.TestCase):
|
class UTF16BETest(ReadTest, unittest.TestCase):
|
||||||
encoding = "utf-16-be"
|
encoding = "utf-16-be"
|
||||||
|
ill_formed_sequence = b"\xdc\x80"
|
||||||
|
|
||||||
def test_partial(self):
|
def test_partial(self):
|
||||||
self.check_partial(
|
self.check_partial(
|
||||||
|
@ -648,6 +694,8 @@ class UTF16BETest(ReadTest, unittest.TestCase):
|
||||||
|
|
||||||
class UTF8Test(ReadTest, unittest.TestCase):
|
class UTF8Test(ReadTest, unittest.TestCase):
|
||||||
encoding = "utf-8"
|
encoding = "utf-8"
|
||||||
|
ill_formed_sequence = b"\xed\xb2\x80"
|
||||||
|
ill_formed_sequence_replace = "\ufffd" * 3
|
||||||
|
|
||||||
def test_partial(self):
|
def test_partial(self):
|
||||||
self.check_partial(
|
self.check_partial(
|
||||||
|
@ -677,18 +725,11 @@ class UTF8Test(ReadTest, unittest.TestCase):
|
||||||
u, u.encode(self.encoding))
|
u, u.encode(self.encoding))
|
||||||
|
|
||||||
def test_lone_surrogates(self):
|
def test_lone_surrogates(self):
|
||||||
self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
|
super().test_lone_surrogates()
|
||||||
self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
|
# not sure if this is making sense for
|
||||||
self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
|
# UTF-16 and UTF-32
|
||||||
b'[\\udc80]')
|
self.assertEqual("[\uDC80]".encode('utf-8', "surrogateescape"),
|
||||||
self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
|
|
||||||
b'[&#56448;]')
|
|
||||||
self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
|
|
||||||
b'[\x80]')
|
b'[\x80]')
|
||||||
self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
|
|
||||||
b'[]')
|
|
||||||
self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
|
|
||||||
b'[?]')
|
|
||||||
|
|
||||||
def test_surrogatepass_handler(self):
|
def test_surrogatepass_handler(self):
|
||||||
self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
|
self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
|
||||||
|
@ -851,6 +892,9 @@ class UTF7Test(ReadTest, unittest.TestCase):
|
||||||
self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
|
self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
|
||||||
self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
|
self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
|
||||||
|
|
||||||
|
test_lone_surrogates = None
|
||||||
|
|
||||||
|
|
||||||
class UTF16ExTest(unittest.TestCase):
|
class UTF16ExTest(unittest.TestCase):
|
||||||
|
|
||||||
def test_errors(self):
|
def test_errors(self):
|
||||||
|
@ -875,7 +919,7 @@ class ReadBufferTest(unittest.TestCase):
|
||||||
self.assertRaises(TypeError, codecs.readbuffer_encode)
|
self.assertRaises(TypeError, codecs.readbuffer_encode)
|
||||||
self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
|
self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
|
||||||
|
|
||||||
class UTF8SigTest(ReadTest, unittest.TestCase):
|
class UTF8SigTest(UTF8Test, unittest.TestCase):
|
||||||
encoding = "utf-8-sig"
|
encoding = "utf-8-sig"
|
||||||
|
|
||||||
def test_partial(self):
|
def test_partial(self):
|
||||||
|
|
|
@ -783,6 +783,7 @@ Ned Jackson Lovely
|
||||||
Jason Lowe
|
Jason Lowe
|
||||||
Tony Lownds
|
Tony Lownds
|
||||||
Ray Loyzaga
|
Ray Loyzaga
|
||||||
|
Kang-Hao (Kenny) Lu
|
||||||
Lukas Lueg
|
Lukas Lueg
|
||||||
Loren Luke
|
Loren Luke
|
||||||
Fredrik Lundh
|
Fredrik Lundh
|
||||||
|
|
|
@ -10,6 +10,12 @@ Projected release date: 2013-11-24
|
||||||
Core and Builtins
|
Core and Builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- Issue #12892: The utf-16* and utf-32* encoders no longer allow surrogate code
|
||||||
|
points (U+D800-U+DFFF) to be encoded. The utf-32* decoders no longer decode
|
||||||
|
byte sequences that correspond to surrogate code points. The surrogatepass
|
||||||
|
error handler now works with the utf-16* and utf-32* codecs. Based on
|
||||||
|
patches by Victor Stinner and Kang-Hao (Kenny) Lu.
|
||||||
|
|
||||||
- Issue #17806: Added keyword-argument support for "tabsize" to
|
- Issue #17806: Added keyword-argument support for "tabsize" to
|
||||||
str/bytes.expandtabs().
|
str/bytes.expandtabs().
|
||||||
|
|
||||||
|
|
|
@ -596,26 +596,134 @@ IllegalSurrogate:
|
||||||
#undef SWAB
|
#undef SWAB
|
||||||
|
|
||||||
|
|
||||||
Py_LOCAL_INLINE(void)
|
#if STRINGLIB_MAX_CHAR >= 0x80
|
||||||
STRINGLIB(utf16_encode)(unsigned short *out,
|
Py_LOCAL_INLINE(Py_ssize_t)
|
||||||
const STRINGLIB_CHAR *in,
|
STRINGLIB(utf16_encode_)(const STRINGLIB_CHAR *in,
|
||||||
Py_ssize_t len,
|
Py_ssize_t len,
|
||||||
|
unsigned short **outptr,
|
||||||
int native_ordering)
|
int native_ordering)
|
||||||
{
|
{
|
||||||
|
unsigned short *out = *outptr;
|
||||||
const STRINGLIB_CHAR *end = in + len;
|
const STRINGLIB_CHAR *end = in + len;
|
||||||
#if STRINGLIB_SIZEOF_CHAR == 1
|
#if STRINGLIB_SIZEOF_CHAR == 1
|
||||||
# define SWAB2(CH) ((CH) << 8)
|
# define SWAB2(CH) ((CH) << 8)
|
||||||
#else
|
#else
|
||||||
# define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
|
# define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
|
||||||
#endif
|
#endif
|
||||||
#if STRINGLIB_MAX_CHAR < 0x10000
|
|
||||||
if (native_ordering) {
|
if (native_ordering) {
|
||||||
# if STRINGLIB_SIZEOF_CHAR == 2
|
#if STRINGLIB_MAX_CHAR < 0x10000
|
||||||
Py_MEMCPY(out, in, 2 * len);
|
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
|
||||||
# else
|
while (in < unrolled_end) {
|
||||||
_PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, unsigned short, in, end, out);
|
# if STRINGLIB_MAX_CHAR >= 0xd800
|
||||||
|
if (((in[0] ^ 0xd800) &
|
||||||
|
(in[1] ^ 0xd800) &
|
||||||
|
(in[2] ^ 0xd800) &
|
||||||
|
(in[3] ^ 0xd800) & 0xf800) == 0)
|
||||||
|
break;
|
||||||
# endif
|
# endif
|
||||||
|
out[0] = in[0];
|
||||||
|
out[1] = in[1];
|
||||||
|
out[2] = in[2];
|
||||||
|
out[3] = in[3];
|
||||||
|
in += 4; out += 4;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
while (in < end) {
|
||||||
|
Py_UCS4 ch;
|
||||||
|
ch = *in++;
|
||||||
|
#if STRINGLIB_MAX_CHAR >= 0xd800
|
||||||
|
if (ch < 0xd800)
|
||||||
|
*out++ = ch;
|
||||||
|
else if (ch < 0xe000)
|
||||||
|
/* reject surrogate characters (U+D800-U+DFFF) */
|
||||||
|
goto fail;
|
||||||
|
# if STRINGLIB_MAX_CHAR >= 0x10000
|
||||||
|
else if (ch >= 0x10000) {
|
||||||
|
out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
|
||||||
|
out[1] = Py_UNICODE_LOW_SURROGATE(ch);
|
||||||
|
out += 2;
|
||||||
|
}
|
||||||
|
# endif
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
*out++ = ch;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
|
#if STRINGLIB_MAX_CHAR < 0x10000
|
||||||
|
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
|
||||||
|
while (in < unrolled_end) {
|
||||||
|
# if STRINGLIB_MAX_CHAR >= 0xd800
|
||||||
|
if (((in[0] ^ 0xd800) &
|
||||||
|
(in[1] ^ 0xd800) &
|
||||||
|
(in[2] ^ 0xd800) &
|
||||||
|
(in[3] ^ 0xd800) & 0xf800) == 0)
|
||||||
|
break;
|
||||||
|
# endif
|
||||||
|
out[0] = SWAB2(in[0]);
|
||||||
|
out[1] = SWAB2(in[1]);
|
||||||
|
out[2] = SWAB2(in[2]);
|
||||||
|
out[3] = SWAB2(in[3]);
|
||||||
|
in += 4; out += 4;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
while (in < end) {
|
||||||
|
Py_UCS4 ch = *in++;
|
||||||
|
#if STRINGLIB_MAX_CHAR >= 0xd800
|
||||||
|
if (ch < 0xd800)
|
||||||
|
*out++ = SWAB2((Py_UCS2)ch);
|
||||||
|
else if (ch < 0xe000)
|
||||||
|
/* reject surrogate characters (U+D800-U+DFFF) */
|
||||||
|
goto fail;
|
||||||
|
# if STRINGLIB_MAX_CHAR >= 0x10000
|
||||||
|
else if (ch >= 0x10000) {
|
||||||
|
Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
|
||||||
|
Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
|
||||||
|
out[0] = SWAB2(ch1);
|
||||||
|
out[1] = SWAB2(ch2);
|
||||||
|
out += 2;
|
||||||
|
}
|
||||||
|
# endif
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
*out++ = SWAB2((Py_UCS2)ch);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*outptr = out;
|
||||||
|
return len;
|
||||||
|
#if STRINGLIB_MAX_CHAR >= 0xd800
|
||||||
|
fail:
|
||||||
|
#endif
|
||||||
|
*outptr = out;
|
||||||
|
return len - (end - in + 1);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#undef SWAB2
|
||||||
|
|
||||||
|
#if STRINGLIB_MAX_CHAR >= 0x80
|
||||||
|
Py_LOCAL_INLINE(Py_ssize_t)
|
||||||
|
STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
|
||||||
|
Py_ssize_t len,
|
||||||
|
unsigned short **outptr,
|
||||||
|
int native_ordering)
|
||||||
|
{
|
||||||
|
unsigned short *out = *outptr;
|
||||||
|
const STRINGLIB_CHAR *end = in + len;
|
||||||
|
#if STRINGLIB_SIZEOF_CHAR == 1
|
||||||
|
if (native_ordering) {
|
||||||
|
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
|
||||||
|
while (in < unrolled_end) {
|
||||||
|
out[0] = in[0];
|
||||||
|
out[1] = in[1];
|
||||||
|
out[2] = in[2];
|
||||||
|
out[3] = in[3];
|
||||||
|
in += 4; out += 4;
|
||||||
|
}
|
||||||
|
while (in < end) {
|
||||||
|
*out++ = *in++;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
# define SWAB2(CH) ((CH) << 8) /* high byte is zero */
|
||||||
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
|
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
|
||||||
while (in < unrolled_end) {
|
while (in < unrolled_end) {
|
||||||
out[0] = SWAB2(in[0]);
|
out[0] = SWAB2(in[0]);
|
||||||
|
@ -625,37 +733,95 @@ STRINGLIB(utf16_encode)(unsigned short *out,
|
||||||
in += 4; out += 4;
|
in += 4; out += 4;
|
||||||
}
|
}
|
||||||
while (in < end) {
|
while (in < end) {
|
||||||
*out++ = SWAB2(*in);
|
Py_UCS4 ch = *in++;
|
||||||
++in;
|
*out++ = SWAB2((Py_UCS2)ch);
|
||||||
}
|
}
|
||||||
|
#undef SWAB2
|
||||||
}
|
}
|
||||||
|
*outptr = out;
|
||||||
|
return len;
|
||||||
#else
|
#else
|
||||||
if (native_ordering) {
|
if (native_ordering) {
|
||||||
|
#if STRINGLIB_MAX_CHAR < 0x10000
|
||||||
|
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
|
||||||
|
while (in < unrolled_end) {
|
||||||
|
/* check if any character is a surrogate character */
|
||||||
|
if (((in[0] ^ 0xd800) &
|
||||||
|
(in[1] ^ 0xd800) &
|
||||||
|
(in[2] ^ 0xd800) &
|
||||||
|
(in[3] ^ 0xd800) & 0xf800) == 0)
|
||||||
|
break;
|
||||||
|
out[0] = in[0];
|
||||||
|
out[1] = in[1];
|
||||||
|
out[2] = in[2];
|
||||||
|
out[3] = in[3];
|
||||||
|
in += 4; out += 4;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
while (in < end) {
|
while (in < end) {
|
||||||
Py_UCS4 ch = *in++;
|
Py_UCS4 ch;
|
||||||
if (ch < 0x10000)
|
ch = *in++;
|
||||||
|
if (ch < 0xd800)
|
||||||
*out++ = ch;
|
*out++ = ch;
|
||||||
else {
|
else if (ch < 0xe000)
|
||||||
|
/* reject surrogate characters (U+D800-U+DFFF) */
|
||||||
|
goto fail;
|
||||||
|
#if STRINGLIB_MAX_CHAR >= 0x10000
|
||||||
|
else if (ch >= 0x10000) {
|
||||||
out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
|
out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
|
||||||
out[1] = Py_UNICODE_LOW_SURROGATE(ch);
|
out[1] = Py_UNICODE_LOW_SURROGATE(ch);
|
||||||
out += 2;
|
out += 2;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
else
|
||||||
|
*out++ = ch;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
#define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
|
||||||
|
#if STRINGLIB_MAX_CHAR < 0x10000
|
||||||
|
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
|
||||||
|
while (in < unrolled_end) {
|
||||||
|
/* check if any character is a surrogate character */
|
||||||
|
if (((in[0] ^ 0xd800) &
|
||||||
|
(in[1] ^ 0xd800) &
|
||||||
|
(in[2] ^ 0xd800) &
|
||||||
|
(in[3] ^ 0xd800) & 0xf800) == 0)
|
||||||
|
break;
|
||||||
|
out[0] = SWAB2(in[0]);
|
||||||
|
out[1] = SWAB2(in[1]);
|
||||||
|
out[2] = SWAB2(in[2]);
|
||||||
|
out[3] = SWAB2(in[3]);
|
||||||
|
in += 4; out += 4;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
while (in < end) {
|
while (in < end) {
|
||||||
Py_UCS4 ch = *in++;
|
Py_UCS4 ch = *in++;
|
||||||
if (ch < 0x10000)
|
if (ch < 0xd800)
|
||||||
*out++ = SWAB2((Py_UCS2)ch);
|
*out++ = SWAB2((Py_UCS2)ch);
|
||||||
else {
|
else if (ch < 0xe000)
|
||||||
|
/* reject surrogate characters (U+D800-U+DFFF) */
|
||||||
|
goto fail;
|
||||||
|
#if STRINGLIB_MAX_CHAR >= 0x10000
|
||||||
|
else if (ch >= 0x10000) {
|
||||||
Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
|
Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
|
||||||
Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
|
Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
|
||||||
out[0] = SWAB2(ch1);
|
out[0] = SWAB2(ch1);
|
||||||
out[1] = SWAB2(ch2);
|
out[1] = SWAB2(ch2);
|
||||||
out += 2;
|
out += 2;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
|
else
|
||||||
|
*out++ = SWAB2((Py_UCS2)ch);
|
||||||
|
}
|
||||||
#undef SWAB2
|
#undef SWAB2
|
||||||
|
}
|
||||||
|
*outptr = out;
|
||||||
|
return len;
|
||||||
|
fail:
|
||||||
|
*outptr = out;
|
||||||
|
return len - (end - in + 1);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif /* STRINGLIB_IS_UNICODE */
|
#endif /* STRINGLIB_IS_UNICODE */
|
||||||
|
|
|
@ -4963,6 +4963,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
|
||||||
_PyUnicodeWriter writer;
|
_PyUnicodeWriter writer;
|
||||||
const unsigned char *q, *e;
|
const unsigned char *q, *e;
|
||||||
int le, bo = 0; /* assume native ordering by default */
|
int le, bo = 0; /* assume native ordering by default */
|
||||||
|
const char *encoding;
|
||||||
const char *errmsg = "";
|
const char *errmsg = "";
|
||||||
PyObject *errorHandler = NULL;
|
PyObject *errorHandler = NULL;
|
||||||
PyObject *exc = NULL;
|
PyObject *exc = NULL;
|
||||||
|
@ -5002,6 +5003,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
|
||||||
#else
|
#else
|
||||||
le = bo <= 0;
|
le = bo <= 0;
|
||||||
#endif
|
#endif
|
||||||
|
encoding = le ? "utf-32-le" : "utf-32-be";
|
||||||
|
|
||||||
_PyUnicodeWriter_Init(&writer);
|
_PyUnicodeWriter_Init(&writer);
|
||||||
writer.min_length = (e - q + 3) / 4;
|
writer.min_length = (e - q + 3) / 4;
|
||||||
|
@ -5022,6 +5024,9 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
|
||||||
ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
|
ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
|
||||||
if (ch > maxch)
|
if (ch > maxch)
|
||||||
break;
|
break;
|
||||||
|
if (kind != PyUnicode_1BYTE_KIND &&
|
||||||
|
Py_UNICODE_IS_SURROGATE(ch))
|
||||||
|
break;
|
||||||
PyUnicode_WRITE(kind, data, pos++, ch);
|
PyUnicode_WRITE(kind, data, pos++, ch);
|
||||||
q += 4;
|
q += 4;
|
||||||
} while (q <= last);
|
} while (q <= last);
|
||||||
|
@ -5031,6 +5036,9 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
|
||||||
ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
|
ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
|
||||||
if (ch > maxch)
|
if (ch > maxch)
|
||||||
break;
|
break;
|
||||||
|
if (kind != PyUnicode_1BYTE_KIND &&
|
||||||
|
Py_UNICODE_IS_SURROGATE(ch))
|
||||||
|
break;
|
||||||
PyUnicode_WRITE(kind, data, pos++, ch);
|
PyUnicode_WRITE(kind, data, pos++, ch);
|
||||||
q += 4;
|
q += 4;
|
||||||
} while (q <= last);
|
} while (q <= last);
|
||||||
|
@ -5038,7 +5046,12 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
|
||||||
writer.pos = pos;
|
writer.pos = pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ch <= maxch) {
|
if (Py_UNICODE_IS_SURROGATE(ch)) {
|
||||||
|
errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)";
|
||||||
|
startinpos = ((const char *)q) - starts;
|
||||||
|
endinpos = startinpos + 4;
|
||||||
|
}
|
||||||
|
else if (ch <= maxch) {
|
||||||
if (q == e || consumed)
|
if (q == e || consumed)
|
||||||
break;
|
break;
|
||||||
/* remaining bytes at the end? (size should be divisible by 4) */
|
/* remaining bytes at the end? (size should be divisible by 4) */
|
||||||
|
@ -5062,7 +5075,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
|
||||||
chooses to skip the input */
|
chooses to skip the input */
|
||||||
if (unicode_decode_call_errorhandler_writer(
|
if (unicode_decode_call_errorhandler_writer(
|
||||||
errors, &errorHandler,
|
errors, &errorHandler,
|
||||||
"utf32", errmsg,
|
encoding, errmsg,
|
||||||
&starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
|
&starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
|
||||||
&writer))
|
&writer))
|
||||||
goto onError;
|
goto onError;
|
||||||
|
@ -5099,6 +5112,10 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
||||||
#else
|
#else
|
||||||
int iorder[] = {3, 2, 1, 0};
|
int iorder[] = {3, 2, 1, 0};
|
||||||
#endif
|
#endif
|
||||||
|
const char *encoding;
|
||||||
|
PyObject *errorHandler = NULL;
|
||||||
|
PyObject *exc = NULL;
|
||||||
|
PyObject *rep = NULL;
|
||||||
|
|
||||||
#define STORECHAR(CH) \
|
#define STORECHAR(CH) \
|
||||||
do { \
|
do { \
|
||||||
|
@ -5130,7 +5147,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
||||||
if (byteorder == 0)
|
if (byteorder == 0)
|
||||||
STORECHAR(0xFEFF);
|
STORECHAR(0xFEFF);
|
||||||
if (len == 0)
|
if (len == 0)
|
||||||
goto done;
|
return v;
|
||||||
|
|
||||||
if (byteorder == -1) {
|
if (byteorder == -1) {
|
||||||
/* force LE */
|
/* force LE */
|
||||||
|
@ -5138,6 +5155,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
||||||
iorder[1] = 1;
|
iorder[1] = 1;
|
||||||
iorder[2] = 2;
|
iorder[2] = 2;
|
||||||
iorder[3] = 3;
|
iorder[3] = 3;
|
||||||
|
encoding = "utf-32-le";
|
||||||
}
|
}
|
||||||
else if (byteorder == 1) {
|
else if (byteorder == 1) {
|
||||||
/* force BE */
|
/* force BE */
|
||||||
|
@ -5145,13 +5163,103 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
||||||
iorder[1] = 2;
|
iorder[1] = 2;
|
||||||
iorder[2] = 1;
|
iorder[2] = 1;
|
||||||
iorder[3] = 0;
|
iorder[3] = 0;
|
||||||
|
encoding = "utf-32-be";
|
||||||
|
}
|
||||||
|
else
|
||||||
|
encoding = "utf-32";
|
||||||
|
|
||||||
|
if (kind == PyUnicode_1BYTE_KIND) {
|
||||||
|
for (i = 0; i < len; i++)
|
||||||
|
STORECHAR(PyUnicode_READ(kind, data, i));
|
||||||
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = 0; i < len; i++)
|
for (i = 0; i < len;) {
|
||||||
STORECHAR(PyUnicode_READ(kind, data, i));
|
Py_ssize_t repsize, moreunits;
|
||||||
|
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
||||||
|
i++;
|
||||||
|
assert(ch <= MAX_UNICODE);
|
||||||
|
if (!Py_UNICODE_IS_SURROGATE(ch)) {
|
||||||
|
STORECHAR(ch);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
done:
|
rep = unicode_encode_call_errorhandler(
|
||||||
|
errors, &errorHandler,
|
||||||
|
encoding, "surrogates not allowed",
|
||||||
|
str, &exc, i-1, i, &i);
|
||||||
|
|
||||||
|
if (!rep)
|
||||||
|
goto error;
|
||||||
|
|
||||||
|
if (PyBytes_Check(rep)) {
|
||||||
|
repsize = PyBytes_GET_SIZE(rep);
|
||||||
|
if (repsize & 3) {
|
||||||
|
raise_encode_exception(&exc, encoding,
|
||||||
|
str, i - 1, i,
|
||||||
|
"surrogates not allowed");
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
moreunits = repsize / 4;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
assert(PyUnicode_Check(rep));
|
||||||
|
if (PyUnicode_READY(rep) < 0)
|
||||||
|
goto error;
|
||||||
|
moreunits = repsize = PyUnicode_GET_LENGTH(rep);
|
||||||
|
if (!PyUnicode_IS_ASCII(rep)) {
|
||||||
|
raise_encode_exception(&exc, encoding,
|
||||||
|
str, i - 1, i,
|
||||||
|
"surrogates not allowed");
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* four bytes are reserved for each surrogate */
|
||||||
|
if (moreunits > 1) {
|
||||||
|
Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
|
||||||
|
Py_ssize_t morebytes = 4 * (moreunits - 1);
|
||||||
|
if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
|
||||||
|
/* integer overflow */
|
||||||
|
PyErr_NoMemory();
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
|
||||||
|
goto error;
|
||||||
|
p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (PyBytes_Check(rep)) {
|
||||||
|
Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
|
||||||
|
p += repsize;
|
||||||
|
} else /* rep is unicode */ {
|
||||||
|
const Py_UCS1 *repdata;
|
||||||
|
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
|
||||||
|
repdata = PyUnicode_1BYTE_DATA(rep);
|
||||||
|
while (repsize--) {
|
||||||
|
Py_UCS4 ch = *repdata++;
|
||||||
|
STORECHAR(ch);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Py_CLEAR(rep);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Cut back to size actually needed. This is necessary when, for example,
|
||||||
|
encoding a string containing isolated surrogates while the 'ignore'
|
||||||
|
handler is used. */
|
||||||
|
nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
|
||||||
|
if (nsize != PyBytes_GET_SIZE(v))
|
||||||
|
_PyBytes_Resize(&v, nsize);
|
||||||
|
Py_XDECREF(errorHandler);
|
||||||
|
Py_XDECREF(exc);
|
||||||
return v;
|
return v;
|
||||||
|
error:
|
||||||
|
Py_XDECREF(rep);
|
||||||
|
Py_XDECREF(errorHandler);
|
||||||
|
Py_XDECREF(exc);
|
||||||
|
Py_XDECREF(v);
|
||||||
|
return NULL;
|
||||||
#undef STORECHAR
|
#undef STORECHAR
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5204,6 +5312,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
|
||||||
const char *errmsg = "";
|
const char *errmsg = "";
|
||||||
PyObject *errorHandler = NULL;
|
PyObject *errorHandler = NULL;
|
||||||
PyObject *exc = NULL;
|
PyObject *exc = NULL;
|
||||||
|
const char *encoding;
|
||||||
|
|
||||||
q = (unsigned char *)s;
|
q = (unsigned char *)s;
|
||||||
e = q + size;
|
e = q + size;
|
||||||
|
@ -5237,8 +5346,10 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
|
||||||
|
|
||||||
#if PY_LITTLE_ENDIAN
|
#if PY_LITTLE_ENDIAN
|
||||||
native_ordering = bo <= 0;
|
native_ordering = bo <= 0;
|
||||||
|
encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
|
||||||
#else
|
#else
|
||||||
native_ordering = bo >= 0;
|
native_ordering = bo >= 0;
|
||||||
|
encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Note: size will always be longer than the resulting Unicode
|
/* Note: size will always be longer than the resulting Unicode
|
||||||
|
@ -5312,7 +5423,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
|
||||||
if (unicode_decode_call_errorhandler_writer(
|
if (unicode_decode_call_errorhandler_writer(
|
||||||
errors,
|
errors,
|
||||||
&errorHandler,
|
&errorHandler,
|
||||||
"utf16", errmsg,
|
encoding, errmsg,
|
||||||
&starts,
|
&starts,
|
||||||
(const char **)&e,
|
(const char **)&e,
|
||||||
&startinpos,
|
&startinpos,
|
||||||
|
@ -5348,13 +5459,17 @@ _PyUnicode_EncodeUTF16(PyObject *str,
|
||||||
Py_ssize_t len;
|
Py_ssize_t len;
|
||||||
PyObject *v;
|
PyObject *v;
|
||||||
unsigned short *out;
|
unsigned short *out;
|
||||||
Py_ssize_t bytesize;
|
|
||||||
Py_ssize_t pairs;
|
Py_ssize_t pairs;
|
||||||
#if PY_BIG_ENDIAN
|
#if PY_BIG_ENDIAN
|
||||||
int native_ordering = byteorder >= 0;
|
int native_ordering = byteorder >= 0;
|
||||||
#else
|
#else
|
||||||
int native_ordering = byteorder <= 0;
|
int native_ordering = byteorder <= 0;
|
||||||
#endif
|
#endif
|
||||||
|
const char *encoding;
|
||||||
|
Py_ssize_t nsize, pos;
|
||||||
|
PyObject *errorHandler = NULL;
|
||||||
|
PyObject *exc = NULL;
|
||||||
|
PyObject *rep = NULL;
|
||||||
|
|
||||||
if (!PyUnicode_Check(str)) {
|
if (!PyUnicode_Check(str)) {
|
||||||
PyErr_BadArgument();
|
PyErr_BadArgument();
|
||||||
|
@ -5376,8 +5491,8 @@ _PyUnicode_EncodeUTF16(PyObject *str,
|
||||||
}
|
}
|
||||||
if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
|
if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
|
||||||
return PyErr_NoMemory();
|
return PyErr_NoMemory();
|
||||||
bytesize = (len + pairs + (byteorder == 0)) * 2;
|
nsize = len + pairs + (byteorder == 0);
|
||||||
v = PyBytes_FromStringAndSize(NULL, bytesize);
|
v = PyBytes_FromStringAndSize(NULL, nsize * 2);
|
||||||
if (v == NULL)
|
if (v == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
@ -5389,25 +5504,107 @@ _PyUnicode_EncodeUTF16(PyObject *str,
|
||||||
if (len == 0)
|
if (len == 0)
|
||||||
goto done;
|
goto done;
|
||||||
|
|
||||||
switch (kind) {
|
if (kind == PyUnicode_1BYTE_KIND) {
|
||||||
case PyUnicode_1BYTE_KIND: {
|
ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
|
||||||
ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
|
goto done;
|
||||||
break;
|
|
||||||
}
|
|
||||||
case PyUnicode_2BYTE_KIND: {
|
|
||||||
ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case PyUnicode_4BYTE_KIND: {
|
|
||||||
ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
assert(0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (byteorder < 0)
|
||||||
|
encoding = "utf-16-le";
|
||||||
|
else if (byteorder > 0)
|
||||||
|
encoding = "utf-16-be";
|
||||||
|
else
|
||||||
|
encoding = "utf-16";
|
||||||
|
|
||||||
|
pos = 0;
|
||||||
|
while (pos < len) {
|
||||||
|
Py_ssize_t repsize, moreunits;
|
||||||
|
|
||||||
|
if (kind == PyUnicode_2BYTE_KIND) {
|
||||||
|
pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
|
||||||
|
&out, native_ordering);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
assert(kind == PyUnicode_4BYTE_KIND);
|
||||||
|
pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
|
||||||
|
&out, native_ordering);
|
||||||
|
}
|
||||||
|
if (pos == len)
|
||||||
|
break;
|
||||||
|
|
||||||
|
rep = unicode_encode_call_errorhandler(
|
||||||
|
errors, &errorHandler,
|
||||||
|
encoding, "surrogates not allowed",
|
||||||
|
str, &exc, pos, pos + 1, &pos);
|
||||||
|
if (!rep)
|
||||||
|
goto error;
|
||||||
|
|
||||||
|
if (PyBytes_Check(rep)) {
|
||||||
|
repsize = PyBytes_GET_SIZE(rep);
|
||||||
|
if (repsize & 1) {
|
||||||
|
raise_encode_exception(&exc, encoding,
|
||||||
|
str, pos - 1, pos,
|
||||||
|
"surrogates not allowed");
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
moreunits = repsize / 2;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
assert(PyUnicode_Check(rep));
|
||||||
|
if (PyUnicode_READY(rep) < 0)
|
||||||
|
goto error;
|
||||||
|
moreunits = repsize = PyUnicode_GET_LENGTH(rep);
|
||||||
|
if (!PyUnicode_IS_ASCII(rep)) {
|
||||||
|
raise_encode_exception(&exc, encoding,
|
||||||
|
str, pos - 1, pos,
|
||||||
|
"surrogates not allowed");
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* two bytes are reserved for each surrogate */
|
||||||
|
if (moreunits > 1) {
|
||||||
|
Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
|
||||||
|
Py_ssize_t morebytes = 2 * (moreunits - 1);
|
||||||
|
if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
|
||||||
|
/* integer overflow */
|
||||||
|
PyErr_NoMemory();
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
|
||||||
|
goto error;
|
||||||
|
out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (PyBytes_Check(rep)) {
|
||||||
|
Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
|
||||||
|
out += moreunits;
|
||||||
|
} else /* rep is unicode */ {
|
||||||
|
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
|
||||||
|
ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
|
||||||
|
&out, native_ordering);
|
||||||
|
}
|
||||||
|
|
||||||
|
Py_CLEAR(rep);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Cut back to size actually needed. This is necessary for, for example,
|
||||||
|
encoding of a string containing isolated surrogates and the 'ignore' handler
|
||||||
|
is used. */
|
||||||
|
nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
|
||||||
|
if (nsize != PyBytes_GET_SIZE(v))
|
||||||
|
_PyBytes_Resize(&v, nsize);
|
||||||
|
Py_XDECREF(errorHandler);
|
||||||
|
Py_XDECREF(exc);
|
||||||
done:
|
done:
|
||||||
return v;
|
return v;
|
||||||
|
error:
|
||||||
|
Py_XDECREF(rep);
|
||||||
|
Py_XDECREF(errorHandler);
|
||||||
|
Py_XDECREF(exc);
|
||||||
|
Py_XDECREF(v);
|
||||||
|
return NULL;
|
||||||
|
#undef STORECHAR
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject *
|
PyObject *
|
||||||
|
|
163
Python/codecs.c
163
Python/codecs.c
|
@ -753,6 +753,65 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define ENC_UTF8 0
|
||||||
|
#define ENC_UTF16BE 1
|
||||||
|
#define ENC_UTF16LE 2
|
||||||
|
#define ENC_UTF32BE 3
|
||||||
|
#define ENC_UTF32LE 4
|
||||||
|
|
||||||
|
static int
|
||||||
|
get_standard_encoding(const char *encoding, int *bytelength)
|
||||||
|
{
|
||||||
|
if (Py_TOLOWER(encoding[0]) == 'u' &&
|
||||||
|
Py_TOLOWER(encoding[1]) == 't' &&
|
||||||
|
Py_TOLOWER(encoding[2]) == 'f') {
|
||||||
|
encoding += 3;
|
||||||
|
if (*encoding == '-' || *encoding == '_' )
|
||||||
|
encoding++;
|
||||||
|
if (encoding[0] == '1' && encoding[1] == '6') {
|
||||||
|
encoding += 2;
|
||||||
|
*bytelength = 2;
|
||||||
|
if (*encoding == '\0') {
|
||||||
|
#ifdef WORDS_BIGENDIAN
|
||||||
|
return ENC_UTF16BE;
|
||||||
|
#else
|
||||||
|
return ENC_UTF16LE;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
if (*encoding == '-' || *encoding == '_' )
|
||||||
|
encoding++;
|
||||||
|
if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
|
||||||
|
if (Py_TOLOWER(encoding[0]) == 'b')
|
||||||
|
return ENC_UTF16BE;
|
||||||
|
if (Py_TOLOWER(encoding[0]) == 'l')
|
||||||
|
return ENC_UTF16LE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (encoding[0] == '3' && encoding[1] == '2') {
|
||||||
|
encoding += 2;
|
||||||
|
*bytelength = 4;
|
||||||
|
if (*encoding == '\0') {
|
||||||
|
#ifdef WORDS_BIGENDIAN
|
||||||
|
return ENC_UTF32BE;
|
||||||
|
#else
|
||||||
|
return ENC_UTF32LE;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
if (*encoding == '-' || *encoding == '_' )
|
||||||
|
encoding++;
|
||||||
|
if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
|
||||||
|
if (Py_TOLOWER(encoding[0]) == 'b')
|
||||||
|
return ENC_UTF32BE;
|
||||||
|
if (Py_TOLOWER(encoding[0]) == 'l')
|
||||||
|
return ENC_UTF32LE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* utf-8 */
|
||||||
|
*bytelength = 3;
|
||||||
|
return ENC_UTF8;
|
||||||
|
}
|
||||||
|
|
||||||
/* This handler is declared static until someone demonstrates
|
/* This handler is declared static until someone demonstrates
|
||||||
a need to call it directly. */
|
a need to call it directly. */
|
||||||
static PyObject *
|
static PyObject *
|
||||||
|
@ -760,24 +819,40 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
|
||||||
{
|
{
|
||||||
PyObject *restuple;
|
PyObject *restuple;
|
||||||
PyObject *object;
|
PyObject *object;
|
||||||
|
PyObject *encode;
|
||||||
|
char *encoding;
|
||||||
|
int code;
|
||||||
|
int bytelength;
|
||||||
Py_ssize_t i;
|
Py_ssize_t i;
|
||||||
Py_ssize_t start;
|
Py_ssize_t start;
|
||||||
Py_ssize_t end;
|
Py_ssize_t end;
|
||||||
PyObject *res;
|
PyObject *res;
|
||||||
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
|
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
|
||||||
char *outp;
|
unsigned char *outp;
|
||||||
if (PyUnicodeEncodeError_GetStart(exc, &start))
|
if (PyUnicodeEncodeError_GetStart(exc, &start))
|
||||||
return NULL;
|
return NULL;
|
||||||
if (PyUnicodeEncodeError_GetEnd(exc, &end))
|
if (PyUnicodeEncodeError_GetEnd(exc, &end))
|
||||||
return NULL;
|
return NULL;
|
||||||
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
|
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
|
||||||
return NULL;
|
return NULL;
|
||||||
res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
|
if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
|
||||||
|
Py_DECREF(object);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
if (!(encoding = PyUnicode_AsUTF8(encode))) {
|
||||||
|
Py_DECREF(object);
|
||||||
|
Py_DECREF(encode);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
code = get_standard_encoding(encoding, &bytelength);
|
||||||
|
Py_DECREF(encode);
|
||||||
|
|
||||||
|
res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
|
||||||
if (!res) {
|
if (!res) {
|
||||||
Py_DECREF(object);
|
Py_DECREF(object);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
outp = PyBytes_AsString(res);
|
outp = (unsigned char*)PyBytes_AsString(res);
|
||||||
for (i = start; i < end; i++) {
|
for (i = start; i < end; i++) {
|
||||||
/* object is guaranteed to be "ready" */
|
/* object is guaranteed to be "ready" */
|
||||||
Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
|
Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
|
||||||
|
@ -788,9 +863,33 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
|
||||||
Py_DECREF(object);
|
Py_DECREF(object);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
*outp++ = (char)(0xe0 | (ch >> 12));
|
switch (code) {
|
||||||
*outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
case ENC_UTF8:
|
||||||
*outp++ = (char)(0x80 | (ch & 0x3f));
|
*outp++ = (unsigned char)(0xe0 | (ch >> 12));
|
||||||
|
*outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
|
||||||
|
*outp++ = (unsigned char)(0x80 | (ch & 0x3f));
|
||||||
|
break;
|
||||||
|
case ENC_UTF16LE:
|
||||||
|
*outp++ = (unsigned char) ch;
|
||||||
|
*outp++ = (unsigned char)(ch >> 8);
|
||||||
|
break;
|
||||||
|
case ENC_UTF16BE:
|
||||||
|
*outp++ = (unsigned char)(ch >> 8);
|
||||||
|
*outp++ = (unsigned char) ch;
|
||||||
|
break;
|
||||||
|
case ENC_UTF32LE:
|
||||||
|
*outp++ = (unsigned char) ch;
|
||||||
|
*outp++ = (unsigned char)(ch >> 8);
|
||||||
|
*outp++ = (unsigned char)(ch >> 16);
|
||||||
|
*outp++ = (unsigned char)(ch >> 24);
|
||||||
|
break;
|
||||||
|
case ENC_UTF32BE:
|
||||||
|
*outp++ = (unsigned char)(ch >> 24);
|
||||||
|
*outp++ = (unsigned char)(ch >> 16);
|
||||||
|
*outp++ = (unsigned char)(ch >> 8);
|
||||||
|
*outp++ = (unsigned char) ch;
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
restuple = Py_BuildValue("(On)", res, end);
|
restuple = Py_BuildValue("(On)", res, end);
|
||||||
Py_DECREF(res);
|
Py_DECREF(res);
|
||||||
|
@ -802,34 +901,64 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
|
||||||
Py_UCS4 ch = 0;
|
Py_UCS4 ch = 0;
|
||||||
if (PyUnicodeDecodeError_GetStart(exc, &start))
|
if (PyUnicodeDecodeError_GetStart(exc, &start))
|
||||||
return NULL;
|
return NULL;
|
||||||
|
if (PyUnicodeDecodeError_GetEnd(exc, &end))
|
||||||
|
return NULL;
|
||||||
if (!(object = PyUnicodeDecodeError_GetObject(exc)))
|
if (!(object = PyUnicodeDecodeError_GetObject(exc)))
|
||||||
return NULL;
|
return NULL;
|
||||||
if (!(p = (unsigned char*)PyBytes_AsString(object))) {
|
if (!(p = (unsigned char*)PyBytes_AsString(object))) {
|
||||||
Py_DECREF(object);
|
Py_DECREF(object);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
|
||||||
|
Py_DECREF(object);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
if (!(encoding = PyUnicode_AsUTF8(encode))) {
|
||||||
|
Py_DECREF(object);
|
||||||
|
Py_DECREF(encode);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
code = get_standard_encoding(encoding, &bytelength);
|
||||||
|
Py_DECREF(encode);
|
||||||
|
|
||||||
/* Try decoding a single surrogate character. If
|
/* Try decoding a single surrogate character. If
|
||||||
there are more, let the codec call us again. */
|
there are more, let the codec call us again. */
|
||||||
p += start;
|
p += start;
|
||||||
if (PyBytes_GET_SIZE(object) - start >= 3 &&
|
if (PyBytes_GET_SIZE(object) - start >= bytelength) {
|
||||||
(p[0] & 0xf0) == 0xe0 &&
|
switch (code) {
|
||||||
(p[1] & 0xc0) == 0x80 &&
|
case ENC_UTF8:
|
||||||
(p[2] & 0xc0) == 0x80) {
|
if ((p[0] & 0xf0) == 0xe0 &&
|
||||||
/* it's a three-byte code */
|
(p[1] & 0xc0) == 0x80 &&
|
||||||
ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
|
(p[2] & 0xc0) == 0x80) {
|
||||||
if (!Py_UNICODE_IS_SURROGATE(ch))
|
/* it's a three-byte code */
|
||||||
/* it's not a surrogate - fail */
|
ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
|
||||||
ch = 0;
|
}
|
||||||
|
break;
|
||||||
|
case ENC_UTF16LE:
|
||||||
|
ch = p[1] << 8 | p[0];
|
||||||
|
break;
|
||||||
|
case ENC_UTF16BE:
|
||||||
|
ch = p[0] << 8 | p[1];
|
||||||
|
break;
|
||||||
|
case ENC_UTF32LE:
|
||||||
|
ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
|
||||||
|
break;
|
||||||
|
case ENC_UTF32BE:
|
||||||
|
ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Py_DECREF(object);
|
Py_DECREF(object);
|
||||||
if (ch == 0) {
|
if (!Py_UNICODE_IS_SURROGATE(ch)) {
|
||||||
|
/* it's not a surrogate - fail */
|
||||||
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
|
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
res = PyUnicode_FromOrdinal(ch);
|
res = PyUnicode_FromOrdinal(ch);
|
||||||
if (res == NULL)
|
if (res == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
return Py_BuildValue("(Nn)", res, start+3);
|
return Py_BuildValue("(Nn)", res, start + bytelength);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
wrong_exception_type(exc);
|
wrong_exception_type(exc);
|
||||||
|
|
Loading…
Reference in New Issue