bpo-24214: Fixed the UTF-8 incremental decoder. (GH-12603) (GH-12627)
The bug occurred when the encoded surrogate character was passed
to the incremental decoder in two chunks.
(cherry picked from commit 7a465cb5ee)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
parent
4724ba9b57
commit
bd48280cb6
|
@ -401,6 +401,15 @@ class ReadTest(MixInCheckStateHandling):
|
|||
self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
|
||||
before + backslashreplace + after)
|
||||
|
||||
def test_incremental_surrogatepass(self):
|
||||
# Test incremental decoder for surrogatepass handler:
|
||||
# see issue #24214
|
||||
data = '\uD901'.encode(self.encoding, 'surrogatepass')
|
||||
for i in range(1, len(data)):
|
||||
dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
|
||||
self.assertEqual(dec.decode(data[:i]), '')
|
||||
self.assertEqual(dec.decode(data[i:], True), '\uD901')
|
||||
|
||||
|
||||
class UTF32Test(ReadTest, unittest.TestCase):
|
||||
encoding = "utf-32"
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
Fixed support of the surrogatepass error handler in the UTF-8 incremental
|
||||
decoder.
|
|
@ -4890,6 +4890,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
|
|||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
if (s == end || consumed) {
|
||||
goto End;
|
||||
}
|
||||
errmsg = "invalid continuation byte";
|
||||
startinpos = s - starts;
|
||||
endinpos = startinpos + ch - 1;
|
||||
|
|
Loading…
Reference in New Issue