From 412ed3b8a7388da4850d2a832679fe21804a1591 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20v=2E=20L=C3=B6wis?= Date: Sun, 8 Jan 2006 10:45:39 +0000 Subject: [PATCH] Patch #1177307: UTF-8-Sig codec. --- Doc/lib/libcodecs.tex | 125 +++++++++++++++++++++++++++++++++++++ Lib/encodings/utf_8_sig.py | 57 +++++++++++++++++ Lib/test/test_codecs.py | 28 +++++++++ Misc/NEWS | 3 +- 4 files changed, 212 insertions(+), 1 deletion(-) create mode 100644 Lib/encodings/utf_8_sig.py diff --git a/Doc/lib/libcodecs.tex b/Doc/lib/libcodecs.tex index be2385e79ed..71d6fe87c80 100644 --- a/Doc/lib/libcodecs.tex +++ b/Doc/lib/libcodecs.tex @@ -522,6 +522,113 @@ the \function{lookup()} function to construct the instance. \class{StreamReader} and \class{StreamWriter} classes. They inherit all other methods and attribute from the underlying stream. +\subsection{Encodings and Unicode\label{encodings-overview}} + +Unicode strings are stored internally as sequences of codepoints (to +be precise as Py_UNICODE arrays). Depending on the way Python is +compiled (either via --enable-unicode=ucs2 or --enable-unicode=ucs4, +with the former being the default) Py_UNICODE is either a 16-bit or +32-bit data type. Once a Unicode object is used outside of CPU and +memory, CPU endianness and how these arrays are stored as bytes become +an issue. Transforming a unicode object into a sequence of bytes is +called encoding and recreating the unicode object from the sequence of +bytes is known as decoding. There are many different methods how this +transformation can be done (these methods are also called encodings). +The simplest method is to map the codepoints 0-255 to the bytes +0x0-0xff. This means that a unicode object that contains codepoints +above U+00FF can't be encoded with this method (which is called +'latin-1' or 'iso-8859-1'). 
unicode.encode() will raise a +UnicodeEncodeError that looks like this: UnicodeEncodeError: 'latin-1' +codec can't encode character u'\u1234' in position 3: ordinal not in +range(256) + +There's another group of encodings (the so called charmap encodings) +that choose a different subset of all unicode code points and how +these codepoints are mapped to the bytes 0x0-0xff. To see how this is +done simply open e.g. encodings/cp1252.py (which is an encoding that +is used primarily on Windows). There's a string constant with 256 +characters that shows you which character is mapped to which byte +value. + +All of these encodings can only encode 256 of the 65536 (or 1114111) +codepoints defined in unicode. A simple and straightforward way that +can store each Unicode code point, is to store each codepoint as two +consecutive bytes. There are two possibilities: Store the bytes in big +endian or in little endian order. These two encodings are called +UTF-16-BE and UTF-16-LE respectively. Their disadvantage is that if +e.g. you use UTF-16-BE on a little endian machine you will always have +to swap bytes on encoding and decoding. UTF-16 avoids this problem: +Bytes will always be in natural endianness. When these bytes are read +by a CPU with a different endianness, then bytes have to be swapped +though. To be able to detect the endianness of a UTF-16 byte sequence, +there's the so called BOM (the "Byte Order Mark"). This is the Unicode +character U+FEFF. This character will be prepended to every UTF-16 +byte sequence. The byte swapped version of this character (0xFFFE) is +an illegal character that may not appear in a Unicode text. So when +the first character in an UTF-16 byte sequence appears to be a U+FFFE +the bytes have to be swapped on decoding. Unfortunately up to Unicode +4.0 the character U+FEFF had a second purpose as a "ZERO WIDTH +NO-BREAK SPACE": A character that has no width and doesn't allow a +word to be split. It can e.g. 
be used to give hints to a ligature +algorithm. With Unicode 4.0 using U+FEFF as a ZERO WIDTH NO-BREAK +SPACE has been deprecated (with U+2060 (WORD JOINER) assuming this +role). Nevertheless Unicode software still must be able to handle +U+FEFF in both roles: As a BOM it's a device to determine the storage +layout of the encoded bytes, and vanishes once the byte sequence has +been decoded into a Unicode string; as a ZERO WIDTH NO-BREAK SPACE +it's a normal character that will be decoded like any other. + +There's another encoding that is able to encode the full range of +Unicode characters: UTF-8. UTF-8 is an 8bit encoding, which means +there are no issues with byte order in UTF-8. Each byte in a UTF-8 +byte sequence consists of two parts: Marker bits (the most significant +bits) and payload bits. The marker bits are a sequence of zero to six +1 bits followed by a 0 bit. Unicode characters are encoded like this +(with x being a payload bit, which when concatenated give the Unicode +character): + +\begin{tableii}{l|l}{textrm}{}{Range}{Encoding} +\lineii{U-00000000 ... U-0000007F}{0xxxxxxx} +\lineii{U-00000080 ... U-000007FF}{110xxxxx 10xxxxxx} +\lineii{U-00000800 ... U-0000FFFF}{1110xxxx 10xxxxxx 10xxxxxx} +\lineii{U-00010000 ... U-001FFFFF}{11110xxx 10xxxxxx 10xxxxxx 10xxxxxx} +\lineii{U-00200000 ... U-03FFFFFF}{111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx} +\lineii{U-04000000 ... U-7FFFFFFF}{1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx} +\end{tableii} + +The least significant bit of the Unicode character is the rightmost x +bit. + +As UTF-8 is an 8bit encoding no BOM is required and any U+FEFF +character in the decoded Unicode string (even if it's the first +character) is treated as a ZERO WIDTH NO-BREAK SPACE. + +Without external information it's impossible to reliably determine +which encoding was used for encoding a Unicode string. Each charmap +encoding can decode any random byte sequence. 
However that's not +possible with UTF-8, as UTF-8 byte sequences have a structure that +doesn't allow arbitrary byte sequences. To increase the reliability +with which an UTF-8 encoding can be detected, Microsoft invented a +variant of UTF-8 (that Python 2.5 calls "utf-8-sig") for its Notepad +program: Before any of the Unicode characters is written to the file, +a UTF-8 encoded BOM (which looks like this as a byte sequence: 0xef, +0xbb, 0xbf) is written. As it's rather improbable that any charmap +encoded file starts with these byte values (which would e.g. map to + + LATIN SMALL LETTER I WITH DIAERESIS + RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + INVERTED QUESTION MARK + +in iso-8859-1), this increases the probability that a utf-8-sig +encoding can be correctly guessed from the byte sequence. So here the +BOM is not used to be able to determine the byte order used for +generating the byte sequence, but as a signature that helps in +guessing the encoding. On encoding the utf-8-sig codec will write +0xef, 0xbb, 0xbf as the first three bytes to the file. On decoding +utf-8-sig will skip those three bytes if they appear as the first +three bytes in the file. + + \subsection{Standard Encodings\label{standard-encodings}} Python comes with a number of codecs builtin, either implemented as C @@ -890,6 +997,10 @@ exist: {U8, UTF, utf8} {all languages} +\lineiii{utf_8_sig} + {} + {all languages} + \end{longtableiii} A number of codecs are specific to Python, so their codec names have @@ -1058,3 +1169,17 @@ Convert a label to \ASCII, as specified in \rfc{3490}. \begin{funcdesc}{ToUnicode}{label} Convert a label to Unicode, as specified in \rfc{3490}. 
\end{funcdesc} + + \subsection{\module{encodings.utf_8_sig} --- + UTF-8 codec with BOM signature} +\declaremodule{standard}{encodings.utf-8-sig} % XXX utf_8_sig gives TeX errors +\modulesynopsis{UTF-8 codec with BOM signature} +\moduleauthor{Walter D\"orwald} + +\versionadded{2.5} + +This module implements a variant of the UTF-8 codec: On encoding a +UTF-8 encoded BOM will be prepended to the UTF-8 encoded bytes. For +the stateful encoder this is only done once (on the first write to the +byte stream). For decoding an optional UTF-8 encoded BOM at the start +of the data will be skipped. diff --git a/Lib/encodings/utf_8_sig.py b/Lib/encodings/utf_8_sig.py new file mode 100644 index 00000000000..fa437e69290 --- /dev/null +++ b/Lib/encodings/utf_8_sig.py @@ -0,0 +1,57 @@ +""" Python 'utf-8-sig' Codec +This works similar to UTF-8 with the following changes: + +* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the + first three bytes. + +* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these + bytes will be skipped. 
+""" +import codecs + +### Codec APIs + +def encode(input, errors='strict'): + return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input)) + +def decode(input, errors='strict'): + prefix = 0 + if input.startswith(codecs.BOM_UTF8): + input = input[3:] + prefix = 3 + (output, consumed) = codecs.utf_8_decode(input, errors, True) + return (output, consumed+prefix) + +class StreamWriter(codecs.StreamWriter): + def reset(self): + codecs.StreamWriter.reset(self) + try: + del self.encode + except AttributeError: + pass + + def encode(self, input, errors='strict'): + self.encode = codecs.utf_8_encode + return encode(input, errors) + +class StreamReader(codecs.StreamReader): + def reset(self): + codecs.StreamReader.reset(self) + try: + del self.decode + except AttributeError: + pass + + def decode(self, input, errors='strict'): + if len(input) < 3 and codecs.BOM_UTF8.startswith(input): + # not enough data to decide if this is a BOM + # => try again on the next call + return (u"", 0) + self.decode = codecs.utf_8_decode + return decode(input, errors) + +### encodings module API + +def getregentry(): + + return (encode,decode,StreamReader,StreamWriter) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index b344f9a6ba3..ded5d1917a6 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -367,6 +367,33 @@ class CharBufferTest(unittest.TestCase): self.assertRaises(TypeError, codecs.charbuffer_encode) self.assertRaises(TypeError, codecs.charbuffer_encode, 42) +class UTF8SigTest(ReadTest): + encoding = "utf-8-sig" + + def test_partial(self): + self.check_partial( + u"\ufeff\x00\xff\u07ff\u0800\uffff", + [ + u"", + u"", + u"", # First BOM has been read and skipped + u"", + u"", + u"\ufeff", # Second BOM has been read and emitted + u"\ufeff\x00", # "\x00" read and emitted + u"\ufeff\x00", # First byte of encoded u"\xff" read + u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read + u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" 
read + u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read + u"\ufeff\x00\xff\u07ff", + u"\ufeff\x00\xff\u07ff", + u"\ufeff\x00\xff\u07ff\u0800", + u"\ufeff\x00\xff\u07ff\u0800", + u"\ufeff\x00\xff\u07ff\u0800", + u"\ufeff\x00\xff\u07ff\u0800\uffff", + ] + ) + class EscapeDecodeTest(unittest.TestCase): def test_empty(self): self.assertEquals(codecs.escape_decode(""), ("", 0)) @@ -1044,6 +1071,7 @@ def test_main(): UTF16LETest, UTF16BETest, UTF8Test, + UTF8SigTest, UTF7Test, UTF16ExTest, ReadBufferTest, diff --git a/Misc/NEWS b/Misc/NEWS index 77dbacdb687..3860fd332e4 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -319,6 +319,8 @@ Extension Modules Library ------- +- Patch #1177307: Added a new codec utf_8_sig for UTF-8 with a BOM signature. + - Patch #1157027: cookielib mishandles RFC 2109 cookies in Netscape mode - Patch #1117398: cookielib.LWPCookieJar and .MozillaCookieJar now raise @@ -674,7 +676,6 @@ Build Tests for sanity in tzname when HAVE_TZNAME defined were also defined. Closes bug #1096244. Thanks Gregory Bond. - C API -----