Backport r54786:

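Fix the utf-8-sig incremental decoder, which failed to recognise a BOM when the
first chunk fed to the decoder started with a BOM but was longer than 3 bytes
(the old check, codecs.BOM_UTF8.startswith(input), only matched chunks that
were themselves a prefix of the BOM).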
This commit is contained in:
Walter Dörwald 2007-04-21 10:31:43 +00:00
parent 552ba11085
commit 93a3603c67
3 changed files with 19 additions and 7 deletions

View File

@ -44,14 +44,19 @@ class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
self.first = True
def _buffer_decode(self, input, errors, final):
if self.first and codecs.BOM_UTF8.startswith(input): # might be a BOM
if self.first:
if len(input) < 3:
# not enough data to decide if this really is a BOM
# => try again on the next call
return (u"", 0)
(output, consumed) = codecs.utf_8_decode(input[3:], errors, final)
self.first = False
return (output, consumed+3)
if codecs.BOM_UTF8.startswith(input):
# not enough data to decide if this really is a BOM
# => try again on the next call
return (u"", 0)
else:
self.first = None
else:
self.first = None
if input[:3] == codecs.BOM_UTF8:
(output, consumed) = codecs.utf_8_decode(input[3:], errors, final)
return (output, consumed+3)
return codecs.utf_8_decode(input, errors, final)
def reset(self):

View File

@ -430,6 +430,11 @@ class UTF8SigTest(ReadTest):
# SF bug #1601501: check that the codec works with a buffer
unicode("\xef\xbb\xbf", "utf-8-sig")
def test_bom(self):
d = codecs.getincrementaldecoder("utf-8-sig")()
s = u"spam"
self.assertEqual(d.decode(s.encode("utf-8-sig")), s)
class EscapeDecodeTest(unittest.TestCase):
def test_empty(self):
self.assertEquals(codecs.escape_decode(""), ("", 0))

View File

@ -602,6 +602,8 @@ Tests
- Fix bsddb test_basics.test06_Transactions to check the version
number properly.
- Fix utf-8-sig incremental decoder, which didn't recognise a BOM when the
first chunk fed to the decoder started with a BOM, but was longer than 3 bytes.
Documentation
-------------