bpo-24214: Fixed the UTF-8 incremental decoder. (GH-12603)
The bug occurred when an encoded surrogate character was passed to the incremental decoder in two chunks.
This commit is contained in:
parent
38f4e468d4
commit
7a465cb5ee
|
@ -406,6 +406,15 @@ class ReadTest(MixInCheckStateHandling):
|
|||
self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
|
||||
before + backslashreplace + after)
|
||||
|
||||
def test_incremental_surrogatepass(self):
|
||||
# Test incremental decoder for surrogatepass handler:
|
||||
# see issue #24214
|
||||
data = '\uD901'.encode(self.encoding, 'surrogatepass')
|
||||
for i in range(1, len(data)):
|
||||
dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
|
||||
self.assertEqual(dec.decode(data[:i]), '')
|
||||
self.assertEqual(dec.decode(data[i:], True), '\uD901')
|
||||
|
||||
|
||||
class UTF32Test(ReadTest, unittest.TestCase):
|
||||
encoding = "utf-32"
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
Fixed support of the surrogatepass error handler in the UTF-8 incremental
|
||||
decoder.
|
|
@ -4883,6 +4883,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
|
|||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
if (s == end || consumed) {
|
||||
goto End;
|
||||
}
|
||||
errmsg = "invalid continuation byte";
|
||||
startinpos = s - starts;
|
||||
endinpos = startinpos + ch - 1;
|
||||
|
|
Loading…
Reference in New Issue