cpython/Lib/encodings/utf_8_sig.py

58 lines
1.6 KiB
Python

""" Python 'utf-8-sig' Codec
This work similar to UTF-8 with the following changes:
* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
first three bytes.
* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
bytes will be skipped.
"""
import codecs
### Codec APIs
def encode(input, errors='strict'):
return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
def decode(input, errors='strict'):
prefix = 0
if input.startswith(codecs.BOM_UTF8):
input = input[3:]
prefix = 3
(output, consumed) = codecs.utf_8_decode(input, errors, True)
return (output, consumed+prefix)
class StreamWriter(codecs.StreamWriter):
def reset(self):
codecs.StreamWriter.reset(self)
try:
del self.encode
except AttributeError:
pass
def encode(self, input, errors='strict'):
self.encode = codecs.utf_8_encode
return encode(input, errors)
class StreamReader(codecs.StreamReader):
def reset(self):
codecs.StreamReader.reset(self)
try:
del self.decode
except AttributeError:
pass
def decode(self, input, errors='strict'):
if len(input) < 3 and codecs.BOM_UTF8.startswith(input):
# not enough data to decide if this is a BOM
# => try again on the next call
return (u"", 0)
self.decode = codecs.utf_8_decode
return decode(input, errors)
### encodings module API
def getregentry():
return (encode,decode,StreamReader,StreamWriter)