Issue #5006: Better handling of Unicode byte-order marks (BOM) in the io
library. This means, for example, that opening a UTF-16 text file in append mode doesn't add a BOM at the end of the file if the file isn't empty.
This commit is contained in:
parent
082a65ab1f
commit
8243ddb6ca
20
Lib/io.py
20
Lib/io.py
|
@ -1440,6 +1440,15 @@ class TextIOWrapper(TextIOBase):
|
|||
self._snapshot = None # info for reconstructing decoder state
|
||||
self._seekable = self._telling = self.buffer.seekable()
|
||||
|
||||
if self._seekable and self.writable():
|
||||
position = self.buffer.tell()
|
||||
if position != 0:
|
||||
try:
|
||||
self._get_encoder().setstate(0)
|
||||
except LookupError:
|
||||
# Sometimes the encoder doesn't exist
|
||||
pass
|
||||
|
||||
# self._snapshot is either None, or a tuple (dec_flags, next_input)
|
||||
# where dec_flags is the second (integer) item of the decoder state
|
||||
# and next_input is the chunk of input bytes that comes next after the
|
||||
|
@ -1726,6 +1735,17 @@ class TextIOWrapper(TextIOBase):
|
|||
raise IOError("can't restore logical file position")
|
||||
self._decoded_chars_used = chars_to_skip
|
||||
|
||||
# Finally, reset the encoder (merely useful for proper BOM handling)
|
||||
try:
|
||||
encoder = self._encoder or self._get_encoder()
|
||||
except LookupError:
|
||||
# Sometimes the encoder doesn't exist
|
||||
pass
|
||||
else:
|
||||
if cookie != 0:
|
||||
encoder.setstate(0)
|
||||
else:
|
||||
encoder.reset()
|
||||
return cookie
|
||||
|
||||
def read(self, n=None):
|
||||
|
|
|
@ -799,6 +799,37 @@ class StatefulIncrementalDecoderTest(unittest.TestCase):
|
|||
self.assertEquals(d.decode(b'oiabcd'), '')
|
||||
self.assertEquals(d.decode(b'', 1), 'abcd.')
|
||||
|
||||
def test_append_bom(self):
    """Check that appending to a non-empty file does not write a second BOM.

    For each BOM-emitting codec, 'aaa' is written to a fresh file, the file
    is reopened in append mode to add 'xxx', and the raw bytes on disk are
    compared with a single encode() of the concatenated text.
    """
    filename = test_support.TESTFN
    for charset in ('utf-8-sig', 'utf-16', 'utf-32'):
        # Create the file from scratch and verify its raw contents.
        with io.open(filename, 'w', encoding=charset) as f:
            f.write('aaa')
        with io.open(filename, 'rb') as f:
            self.assertEquals(f.read(), 'aaa'.encode(charset))

        # Append to the now non-empty file; the result must equal the
        # one-shot encoding of the whole text.
        with io.open(filename, 'a', encoding=charset) as f:
            f.write('xxx')
        with io.open(filename, 'rb') as f:
            self.assertEquals(f.read(), 'aaaxxx'.encode(charset))
|
||||
|
||||
def test_seek_bom(self):
    """Same check as the append test, but positioning with seek() by hand.

    After the initial write, the file is reopened in 'r+' mode; text is
    written at the saved end position and then again at position 0. The
    raw bytes must equal a single encode() of the final text.
    """
    path = test_support.TESTFN
    bom_codecs = ('utf-8-sig', 'utf-16', 'utf-32')
    for codec in bom_codecs:
        # Fresh file: remember the position just past the initial text.
        with io.open(path, 'w', encoding=codec) as writer:
            writer.write('aaa')
            end_of_text = writer.tell()

        # First write at the saved (non-zero) position, then rewind to
        # the very beginning and overwrite the initial text.
        with io.open(path, 'r+', encoding=codec) as stream:
            stream.seek(end_of_text)
            stream.write('zzz')
            stream.seek(0)
            stream.write('bbb')

        with io.open(path, 'rb') as raw:
            on_disk = raw.read()
            self.assertEquals(on_disk, 'bbbzzz'.encode(codec))
|
||||
|
||||
|
||||
class TextIOWrapperTest(unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
|
|
|
@ -84,6 +84,10 @@ C-API
|
|||
Library
|
||||
-------
|
||||
|
||||
- Issue #5006: Better handling of Unicode byte-order marks (BOM) in the io
|
||||
library. This means, for example, that opening a UTF-16 text file in append
|
||||
mode doesn't add a BOM at the end of the file if the file isn't empty.
|
||||
|
||||
- Issue #3704: cookielib was not properly handling URLs with a / in the
|
||||
parameters.
|
||||
|
||||
|
|
Loading…
Reference in New Issue