Apply SF patch #1775604: This adds three new codecs (utf-32, utf-32-le and

ut-32-be). On narrow builds the codecs combine surrogate pairs in the unicode
object into one codepoint on encoding and create surrogate pairs for
codepoints outside the BMP on decoding. Lone surrogates are passed through
unchanged in all cases.

Backport to the trunk will follow.
This commit is contained in:
Walter Dörwald 2007-08-16 21:55:45 +00:00
parent 066100909a
commit 41980caf64
12 changed files with 1001 additions and 2 deletions

View File

@ -1405,6 +1405,74 @@ These are the UTF-8 codec APIs:
object. Error handling is "strict". Return *NULL* if an exception was raised
by the codec.
These are the UTF-32 codec APIs:
.. % --- UTF-32 Codecs ------------------------------------------------------ */
.. cfunction:: PyObject* PyUnicode_DecodeUTF32(const char *s, Py_ssize_t size, const char *errors, int *byteorder)
Decode *length* bytes from a UTF-32 encoded buffer string and return the
corresponding Unicode object. *errors* (if non-*NULL*) defines the error
handling. It defaults to "strict".
If *byteorder* is non-*NULL*, the decoder starts decoding using the given byte
order::
*byteorder == -1: little endian
*byteorder == 0: native order
*byteorder == 1: big endian
and then switches if the first four bytes of the input data are a byte order mark
(BOM) and the specified byte order is native order. This BOM is not copied into
the resulting Unicode string. After completion, *\*byteorder* is set to the
current byte order at the end of input data.
In a narrow build codepoints outside the BMP will be decoded as surrogate pairs.
If *byteorder* is *NULL*, the codec starts in native order mode.
Return *NULL* if an exception was raised by the codec.
.. versionadded:: 3.0
.. cfunction:: PyObject* PyUnicode_DecodeUTF32Stateful(const char *s, Py_ssize_t size, const char *errors, int *byteorder, Py_ssize_t *consumed)
If *consumed* is *NULL*, behave like :cfunc:`PyUnicode_DecodeUTF32`. If
*consumed* is not *NULL*, :cfunc:`PyUnicode_DecodeUTF32Stateful` will not treat
trailing incomplete UTF-32 byte sequences (such as a number of bytes not divisible
by four) as an error. Those bytes will not be decoded and the number of bytes
that have been decoded will be stored in *consumed*.
.. versionadded:: 3.0
.. cfunction:: PyObject* PyUnicode_EncodeUTF32(const Py_UNICODE *s, Py_ssize_t size, const char *errors, int byteorder)
Return a Python bytes object holding the UTF-32 encoded value of the Unicode
data in *s*. If *byteorder* is not ``0``, output is written according to the
following byte order::
byteorder == -1: little endian
byteorder == 0: native byte order (writes a BOM mark)
byteorder == 1: big endian
If byteorder is ``0``, the output string will always start with the Unicode BOM
mark (U+FEFF). In the other two modes, no BOM mark is prepended.
If *Py_UNICODE_WIDE* is not defined, surrogate pairs will be output
as a single codepoint.
Return *NULL* if an exception was raised by the codec.
.. cfunction:: PyObject* PyUnicode_AsUTF32String(PyObject *unicode)
Return a Python string using the UTF-32 encoding in native byte order. The
string always starts with a BOM mark. Error handling is "strict". Return
*NULL* if an exception was raised by the codec.
These are the UTF-16 codec APIs:
.. % --- UTF-16 Codecs ------------------------------------------------------ */

View File

@ -1089,6 +1089,12 @@ particular, the following variants typically exist:
| shift_jisx0213 | shiftjisx0213, sjisx0213, | Japanese |
| | s_jisx0213 | |
+-----------------+--------------------------------+--------------------------------+
| utf_32 | U32, utf32 | all languages |
+-----------------+--------------------------------+--------------------------------+
| utf_32_be | UTF-32BE | all languages |
+-----------------+--------------------------------+--------------------------------+
| utf_32_le | UTF-32LE | all languages |
+-----------------+--------------------------------+--------------------------------+
| utf_16 | U16, utf16 | all languages |
+-----------------+--------------------------------+--------------------------------+
| utf_16_be | UTF-16BE | all languages (BMP only) |

View File

@ -138,6 +138,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
@ -154,6 +155,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
@ -165,6 +168,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
@ -225,6 +229,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
@ -241,6 +246,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
@ -252,6 +259,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
@ -749,6 +757,80 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
const char *errors /* error handling */
);
/* --- UTF-32 Codecs ------------------------------------------------------ */
/* Decodes length bytes from a UTF-32 encoded buffer string and returns
the corresponding Unicode object.
errors (if non-NULL) defines the error handling. It defaults
to "strict".
If byteorder is non-NULL, the decoder starts decoding using the
given byte order:
*byteorder == -1: little endian
*byteorder == 0: native order
*byteorder == 1: big endian
In native mode, the first four bytes of the stream are checked for a
BOM mark. If found, the BOM mark is analysed, the byte order
adjusted and the BOM skipped. In the other modes, no BOM mark
interpretation is done. After completion, *byteorder is set to the
current byte order at the end of input data.
If byteorder is NULL, the codec starts in native order mode.
*/
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
const char *string, /* UTF-32 encoded string */
Py_ssize_t length, /* size of string */
const char *errors, /* error handling */
int *byteorder /* pointer to byteorder to use
0=native;-1=LE,1=BE; updated on
exit */
);
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
const char *string, /* UTF-32 encoded string */
Py_ssize_t length, /* size of string */
const char *errors, /* error handling */
int *byteorder, /* pointer to byteorder to use
0=native;-1=LE,1=BE; updated on
exit */
Py_ssize_t *consumed /* bytes consumed */
);
/* Returns a Python string using the UTF-32 encoding in native byte
order. The string always starts with a BOM mark. */
PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
PyObject *unicode /* Unicode object */
);
/* Returns a Python string object holding the UTF-32 encoded value of
the Unicode data.
If byteorder is not 0, output is written according to the following
byte order:
byteorder == -1: little endian
byteorder == 0: native byte order (writes a BOM mark)
byteorder == 1: big endian
If byteorder is 0, the output string will always start with the
Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
prepended.
*/
PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
const Py_UNICODE *data, /* Unicode char buffer */
Py_ssize_t length, /* number of Py_UNICODE chars to encode */
const char *errors, /* error handling */
int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
);
/* --- UTF-16 Codecs ------------------------------------------------------ */
/* Decodes length bytes from a UTF-16 encoded buffer string and returns

View File

@ -490,6 +490,16 @@ aliases = {
'unicodelittleunmarked' : 'utf_16_le',
'utf_16le' : 'utf_16_le',
# utf_32 codec
'u32' : 'utf_32',
'utf32' : 'utf_32',
# utf_32_be codec
'utf_32be' : 'utf_32_be',
# utf_32_le codec
'utf_32le' : 'utf_32_le',
# utf_7 codec
'u7' : 'utf_7',
'utf7' : 'utf_7',

144
Lib/encodings/utf_32.py Normal file
View File

@ -0,0 +1,144 @@
"""
Python 'utf-32' Codec
"""
import codecs, sys
### Codec APIs
encode = codecs.utf_32_encode
def decode(input, errors='strict'):
return codecs.utf_32_decode(input, errors, True)
class IncrementalEncoder(codecs.IncrementalEncoder):
def __init__(self, errors='strict'):
codecs.IncrementalEncoder.__init__(self, errors)
self.encoder = None
def encode(self, input, final=False):
if self.encoder is None:
result = codecs.utf_32_encode(input, self.errors)[0]
if sys.byteorder == 'little':
self.encoder = codecs.utf_32_le_encode
else:
self.encoder = codecs.utf_32_be_encode
return result
return self.encoder(input, self.errors)[0]
def reset(self):
codecs.IncrementalEncoder.reset(self)
self.encoder = None
def getstate(self):
# state info we return to the caller:
# 0: stream is in natural order for this platform
# 2: endianness hasn't been determined yet
# (we're never writing in unnatural order)
return (2 if self.encoder is None else 0)
def setstate(self, state):
if state:
self.encoder = None
else:
if sys.byteorder == 'little':
self.encoder = codecs.utf_32_le_encode
else:
self.encoder = codecs.utf_32_be_encode
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
def __init__(self, errors='strict'):
codecs.BufferedIncrementalDecoder.__init__(self, errors)
self.decoder = None
def _buffer_decode(self, input, errors, final):
if self.decoder is None:
(output, consumed, byteorder) = \
codecs.utf_32_ex_decode(input, errors, 0, final)
if byteorder == -1:
self.decoder = codecs.utf_32_le_decode
elif byteorder == 1:
self.decoder = codecs.utf_32_be_decode
elif consumed >= 4:
raise UnicodeError("UTF-32 stream does not start with BOM")
return (output, consumed)
return self.decoder(input, self.errors, final)
def reset(self):
codecs.BufferedIncrementalDecoder.reset(self)
self.decoder = None
def getstate(self):
# additonal state info from the base class must be None here,
# as it isn't passed along to the caller
state = codecs.BufferedIncrementalDecoder.getstate(self)[0]
# additional state info we pass to the caller:
# 0: stream is in natural order for this platform
# 1: stream is in unnatural order
# 2: endianness hasn't been determined yet
if self.decoder is None:
return (state, 2)
addstate = int((sys.byteorder == "big") !=
(self.decoder is codecs.utf_32_be_decode))
return (state, addstate)
def setstate(self, state):
# state[1] will be ignored by BufferedIncrementalDecoder.setstate()
codecs.BufferedIncrementalDecoder.setstate(self, state)
state = state[1]
if state == 0:
self.decoder = (codecs.utf_32_be_decode
if sys.byteorder == "big"
else codecs.utf_32_le_decode)
elif state == 1:
self.decoder = (codecs.utf_32_le_decode
if sys.byteorder == "big"
else codecs.utf_32_be_decode)
else:
self.decoder = None
class StreamWriter(codecs.StreamWriter):
def __init__(self, stream, errors='strict'):
self.bom_written = False
codecs.StreamWriter.__init__(self, stream, errors)
def encode(self, input, errors='strict'):
self.bom_written = True
result = codecs.utf_32_encode(input, errors)
if sys.byteorder == 'little':
self.encode = codecs.utf_32_le_encode
else:
self.encode = codecs.utf_32_be_encode
return result
class StreamReader(codecs.StreamReader):
def reset(self):
codecs.StreamReader.reset(self)
try:
del self.decode
except AttributeError:
pass
def decode(self, input, errors='strict'):
(object, consumed, byteorder) = \
codecs.utf_32_ex_decode(input, errors, 0, False)
if byteorder == -1:
self.decode = codecs.utf_32_le_decode
elif byteorder == 1:
self.decode = codecs.utf_32_le_decode
elif consumed>=4:
raise UnicodeError,"UTF-32 stream does not start with BOM"
return (object, consumed)
### encodings module API
def getregentry():
return codecs.CodecInfo(
name='utf-32',
encode=encode,
decode=decode,
incrementalencoder=IncrementalEncoder,
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
)

View File

@ -0,0 +1,37 @@
"""
Python 'utf-32-be' Codec
"""
import codecs
### Codec APIs
encode = codecs.utf_32_be_encode
def decode(input, errors='strict'):
return codecs.utf_32_be_decode(input, errors, True)
class IncrementalEncoder(codecs.IncrementalEncoder):
def encode(self, input, final=False):
return codecs.utf_32_be_encode(input, self.errors)[0]
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
_buffer_decode = codecs.utf_32_be_decode
class StreamWriter(codecs.StreamWriter):
encode = codecs.utf_32_be_encode
class StreamReader(codecs.StreamReader):
decode = codecs.utf_32_be_decode
### encodings module API
def getregentry():
return codecs.CodecInfo(
name='utf-32-be',
encode=encode,
decode=decode,
incrementalencoder=IncrementalEncoder,
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
)

View File

@ -0,0 +1,37 @@
"""
Python 'utf-32-le' Codec
"""
import codecs
### Codec APIs
encode = codecs.utf_32_le_encode
def decode(input, errors='strict'):
return codecs.utf_32_le_decode(input, errors, True)
class IncrementalEncoder(codecs.IncrementalEncoder):
def encode(self, input, final=False):
return codecs.utf_32_le_encode(input, self.errors)[0]
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
_buffer_decode = codecs.utf_32_le_decode
class StreamWriter(codecs.StreamWriter):
encode = codecs.utf_32_le_encode
class StreamReader(codecs.StreamReader):
decode = codecs.utf_32_le_decode
### encodings module API
def getregentry():
return codecs.CodecInfo(
name='utf-32-le',
encode=encode,
decode=decode,
incrementalencoder=IncrementalEncoder,
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
)

View File

@ -285,7 +285,8 @@ class CodecCallbackTest(unittest.TestCase):
def test_longstrings(self):
# test long strings to check for memory overflow problems
errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"]
errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
"backslashreplace"]
# register the handlers under different names,
# to prevent the codec from recognizing the name
for err in errors:
@ -293,7 +294,8 @@ class CodecCallbackTest(unittest.TestCase):
l = 1000
errors += [ "test." + err for err in errors ]
for uni in [ s*l for s in ("x", "\u3042", "a\xe4") ]:
for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"):
for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
"utf-8", "utf-7", "utf-16", "utf-32"):
for err in errors:
try:
uni.encode(enc, err)
@ -812,6 +814,7 @@ class CodecCallbackTest(unittest.TestCase):
("utf-7", b"++"),
("utf-8", b"\xff"),
("utf-16", b"\xff"),
("utf-32", b"\xff"),
("unicode-escape", b"\\u123g"),
("raw-unicode-escape", b"\\u123g"),
("unicode-internal", b"\xff"),

View File

@ -277,6 +277,143 @@ class ReadTest(unittest.TestCase, MixInCheckStateHandling):
self.assertEqual(reader.readline(), s5)
self.assertEqual(reader.readline(), "")
class UTF32Test(ReadTest):
encoding = "utf-32"
spamle = (b'\xff\xfe\x00\x00'
b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
spambe = (b'\x00\x00\xfe\xff'
b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')
def test_only_one_bom(self):
_,_,reader,writer = codecs.lookup(self.encoding)
# encode some stream
s = io.BytesIO()
f = writer(s)
f.write("spam")
f.write("spam")
d = s.getvalue()
# check whether there is exactly one BOM in it
self.assert_(d == self.spamle or d == self.spambe)
# try to read it back
s = io.BytesIO(d)
f = reader(s)
self.assertEquals(f.read(), "spamspam")
def test_badbom(self):
s = io.BytesIO(4*b"\xff")
f = codecs.getreader(self.encoding)(s)
self.assertRaises(UnicodeError, f.read)
s = io.BytesIO(8*b"\xff")
f = codecs.getreader(self.encoding)(s)
self.assertRaises(UnicodeError, f.read)
def test_partial(self):
self.check_partial(
"\x00\xff\u0100\uffff",
[
"", # first byte of BOM read
"", # second byte of BOM read
"", # third byte of BOM read
"", # fourth byte of BOM read => byteorder known
"",
"",
"",
"\x00",
"\x00",
"\x00",
"\x00",
"\x00\xff",
"\x00\xff",
"\x00\xff",
"\x00\xff",
"\x00\xff\u0100",
"\x00\xff\u0100",
"\x00\xff\u0100",
"\x00\xff\u0100",
"\x00\xff\u0100\uffff",
]
)
def test_errors(self):
self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
b"\xff", "strict", True)
def test_decoder_state(self):
self.check_state_handling_decode(self.encoding,
"spamspam", self.spamle)
self.check_state_handling_decode(self.encoding,
"spamspam", self.spambe)
class UTF32LETest(ReadTest):
encoding = "utf-32-le"
def test_partial(self):
self.check_partial(
"\x00\xff\u0100\uffff",
[
"",
"",
"",
"\x00",
"\x00",
"\x00",
"\x00",
"\x00\xff",
"\x00\xff",
"\x00\xff",
"\x00\xff",
"\x00\xff\u0100",
"\x00\xff\u0100",
"\x00\xff\u0100",
"\x00\xff\u0100",
"\x00\xff\u0100\uffff",
]
)
def test_simple(self):
self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")
def test_errors(self):
self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
b"\xff", "strict", True)
class UTF32BETest(ReadTest):
encoding = "utf-32-be"
def test_partial(self):
self.check_partial(
"\x00\xff\u0100\uffff",
[
"",
"",
"",
"\x00",
"\x00",
"\x00",
"\x00",
"\x00\xff",
"\x00\xff",
"\x00\xff",
"\x00\xff",
"\x00\xff\u0100",
"\x00\xff\u0100",
"\x00\xff\u0100",
"\x00\xff\u0100",
"\x00\xff\u0100\uffff",
]
)
def test_simple(self):
self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")
def test_errors(self):
self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
b"\xff", "strict", True)
class UTF16Test(ReadTest):
encoding = "utf-16"
@ -1284,6 +1421,9 @@ class WithStmtTest(unittest.TestCase):
def test_main():
test_support.run_unittest(
UTF32Test,
UTF32LETest,
UTF32BETest,
UTF16Test,
UTF16LETest,
UTF16BETest,

View File

@ -213,6 +213,8 @@ Library
- Patch #1680961: atexit has been reimplemented in C.
- Add new codecs for UTF-32, UTF-32-LE and UTF-32-BE.
Build
-----

View File

@ -412,6 +412,126 @@ utf_16_ex_decode(PyObject *self,
return tuple;
}
static PyObject *
utf_32_decode(PyObject *self,
PyObject *args)
{
const char *data;
Py_ssize_t size;
const char *errors = NULL;
int byteorder = 0;
int final = 0;
Py_ssize_t consumed;
PyObject *decoded;
if (!PyArg_ParseTuple(args, "t#|zi:utf_32_decode",
&data, &size, &errors, &final))
return NULL;
if (size < 0) {
PyErr_SetString(PyExc_ValueError, "negative argument");
return 0;
}
consumed = size; /* This is overwritten unless final is true. */
decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder,
final ? NULL : &consumed);
if (decoded == NULL)
return NULL;
return codec_tuple(decoded, consumed);
}
static PyObject *
utf_32_le_decode(PyObject *self,
PyObject *args)
{
const char *data;
Py_ssize_t size;
const char *errors = NULL;
int byteorder = -1;
int final = 0;
Py_ssize_t consumed;
PyObject *decoded = NULL;
if (!PyArg_ParseTuple(args, "t#|zi:utf_32_le_decode",
&data, &size, &errors, &final))
return NULL;
if (size < 0) {
PyErr_SetString(PyExc_ValueError, "negative argument");
return 0;
}
consumed = size; /* This is overwritten unless final is true. */
decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors,
&byteorder, final ? NULL : &consumed);
if (decoded == NULL)
return NULL;
return codec_tuple(decoded, consumed);
}
static PyObject *
utf_32_be_decode(PyObject *self,
PyObject *args)
{
const char *data;
Py_ssize_t size;
const char *errors = NULL;
int byteorder = 1;
int final = 0;
Py_ssize_t consumed;
PyObject *decoded = NULL;
if (!PyArg_ParseTuple(args, "t#|zi:utf_32_be_decode",
&data, &size, &errors, &final))
return NULL;
if (size < 0) {
PyErr_SetString(PyExc_ValueError, "negative argument");
return 0;
}
consumed = size; /* This is overwritten unless final is true. */
decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors,
&byteorder, final ? NULL : &consumed);
if (decoded == NULL)
return NULL;
return codec_tuple(decoded, consumed);
}
/* This non-standard version also provides access to the byteorder
parameter of the builtin UTF-32 codec.
It returns a tuple (unicode, bytesread, byteorder) with byteorder
being the value in effect at the end of data.
*/
static PyObject *
utf_32_ex_decode(PyObject *self,
PyObject *args)
{
const char *data;
Py_ssize_t size;
const char *errors = NULL;
int byteorder = 0;
PyObject *unicode, *tuple;
int final = 0;
Py_ssize_t consumed;
if (!PyArg_ParseTuple(args, "t#|zii:utf_32_ex_decode",
&data, &size, &errors, &byteorder, &final))
return NULL;
if (size < 0) {
PyErr_SetString(PyExc_ValueError, "negative argument");
return 0;
}
consumed = size; /* This is overwritten unless final is true. */
unicode = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder,
final ? NULL : &consumed);
if (unicode == NULL)
return NULL;
tuple = Py_BuildValue("Oni", unicode, consumed, byteorder);
Py_DECREF(unicode);
return tuple;
}
static PyObject *
unicode_escape_decode(PyObject *self,
PyObject *args)
@ -700,6 +820,83 @@ utf_16_be_encode(PyObject *self,
return v;
}
/* This version provides access to the byteorder parameter of the
builtin UTF-32 codecs as optional third argument. It defaults to 0
which means: use the native byte order and prepend the data with a
BOM mark.
*/
static PyObject *
utf_32_encode(PyObject *self,
PyObject *args)
{
PyObject *str, *v;
const char *errors = NULL;
int byteorder = 0;
if (!PyArg_ParseTuple(args, "O|zi:utf_32_encode",
&str, &errors, &byteorder))
return NULL;
str = PyUnicode_FromObject(str);
if (str == NULL)
return NULL;
v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
PyUnicode_GET_SIZE(str),
errors,
byteorder),
PyUnicode_GET_SIZE(str));
Py_DECREF(str);
return v;
}
static PyObject *
utf_32_le_encode(PyObject *self,
PyObject *args)
{
PyObject *str, *v;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "O|z:utf_32_le_encode",
&str, &errors))
return NULL;
str = PyUnicode_FromObject(str);
if (str == NULL)
return NULL;
v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
PyUnicode_GET_SIZE(str),
errors,
-1),
PyUnicode_GET_SIZE(str));
Py_DECREF(str);
return v;
}
static PyObject *
utf_32_be_encode(PyObject *self,
PyObject *args)
{
PyObject *str, *v;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "O|z:utf_32_be_encode",
&str, &errors))
return NULL;
str = PyUnicode_FromObject(str);
if (str == NULL)
return NULL;
v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
PyUnicode_GET_SIZE(str),
errors,
+1),
PyUnicode_GET_SIZE(str));
Py_DECREF(str);
return v;
}
static PyObject *
unicode_escape_encode(PyObject *self,
PyObject *args)
@ -916,6 +1113,13 @@ static PyMethodDef _codecs_functions[] = {
{"utf_16_le_decode", utf_16_le_decode, METH_VARARGS},
{"utf_16_be_decode", utf_16_be_decode, METH_VARARGS},
{"utf_16_ex_decode", utf_16_ex_decode, METH_VARARGS},
{"utf_32_encode", utf_32_encode, METH_VARARGS},
{"utf_32_le_encode", utf_32_le_encode, METH_VARARGS},
{"utf_32_be_encode", utf_32_be_encode, METH_VARARGS},
{"utf_32_decode", utf_32_decode, METH_VARARGS},
{"utf_32_le_decode", utf_32_le_decode, METH_VARARGS},
{"utf_32_be_decode", utf_32_be_decode, METH_VARARGS},
{"utf_32_ex_decode", utf_32_ex_decode, METH_VARARGS},
{"unicode_escape_encode", unicode_escape_encode, METH_VARARGS},
{"unicode_escape_decode", unicode_escape_decode, METH_VARARGS},
{"unicode_internal_encode", unicode_internal_encode, METH_VARARGS},

View File

@ -1992,6 +1992,272 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
NULL);
}
/* --- UTF-32 Codec ------------------------------------------------------- */
PyObject *
PyUnicode_DecodeUTF32(const char *s,
Py_ssize_t size,
const char *errors,
int *byteorder)
{
return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
}
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
Py_ssize_t size,
const char *errors,
int *byteorder,
Py_ssize_t *consumed)
{
const char *starts = s;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
Py_ssize_t outpos;
PyUnicodeObject *unicode;
Py_UNICODE *p;
#ifndef Py_UNICODE_WIDE
int i, pairs;
#else
const int pairs = 0;
#endif
const unsigned char *q, *e;
int bo = 0; /* assume native ordering by default */
const char *errmsg = "";
/* On narrow builds we split characters outside the BMP into two
codepoints => count how much extra space we need. */
#ifndef Py_UNICODE_WIDE
for (i = pairs = 0; i < size/4; i++)
if (((Py_UCS4 *)s)[i] >= 0x10000)
pairs++;
#endif
/* Offsets from q for retrieving bytes in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
int iorder[] = {0, 1, 2, 3};
#else
int iorder[] = {3, 2, 1, 0};
#endif
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
/* This might be one to much, because of a BOM */
unicode = _PyUnicode_New((size+3)/4+pairs);
if (!unicode)
return NULL;
if (size == 0)
return (PyObject *)unicode;
/* Unpack UTF-32 encoded data */
p = unicode->str;
q = (unsigned char *)s;
e = q + size;
if (byteorder)
bo = *byteorder;
/* Check for BOM marks (U+FEFF) in the input and adjust current
byte order setting accordingly. In native mode, the leading BOM
mark is skipped, in all other modes, it is copied to the output
stream as-is (giving a ZWNBSP character). */
if (bo == 0) {
if (size >= 4) {
const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
(q[iorder[1]] << 8) | q[iorder[0]];
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
if (bom == 0x0000FEFF) {
q += 4;
bo = -1;
}
else if (bom == 0xFFFE0000) {
q += 4;
bo = 1;
}
#else
if (bom == 0x0000FEFF) {
q += 4;
bo = 1;
}
else if (bom == 0xFFFE0000) {
q += 4;
bo = -1;
}
#endif
}
}
if (bo == -1) {
/* force LE */
iorder[0] = 0;
iorder[1] = 1;
iorder[2] = 2;
iorder[3] = 3;
}
else if (bo == 1) {
/* force BE */
iorder[0] = 3;
iorder[1] = 2;
iorder[2] = 1;
iorder[3] = 0;
}
while (q < e) {
Py_UCS4 ch;
/* remaining bytes at the end? (size should be divisible by 4) */
if (e-q<4) {
if (consumed)
break;
errmsg = "truncated data";
startinpos = ((const char *)q)-starts;
endinpos = ((const char *)e)-starts;
goto utf32Error;
/* The remaining input chars are ignored if the callback
chooses to skip the input */
}
ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
(q[iorder[1]] << 8) | q[iorder[0]];
if (ch >= 0x110000)
{
errmsg = "codepoint not in range(0x110000)";
startinpos = ((const char *)q)-starts;
endinpos = startinpos+4;
goto utf32Error;
}
#ifndef Py_UNICODE_WIDE
if (ch >= 0x10000)
{
*p++ = 0xD800 | ((ch-0x10000) >> 10);
*p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
}
else
#endif
*p++ = ch;
q += 4;
continue;
utf32Error:
outpos = p-PyUnicode_AS_UNICODE(unicode);
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
"utf32", errmsg,
&starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
(PyObject **)&unicode, &outpos, &p))
goto onError;
}
if (byteorder)
*byteorder = bo;
if (consumed)
*consumed = (const char *)q-starts;
/* Adjust length */
if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
goto onError;
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
return (PyObject *)unicode;
onError:
Py_DECREF(unicode);
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
return NULL;
}
PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Py_ssize_t size,
const char *errors,
int byteorder)
{
PyObject *v;
unsigned char *p;
#ifndef Py_UNICODE_WIDE
int i, pairs;
#else
const int pairs = 0;
#endif
/* Offsets from p for storing byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
int iorder[] = {0, 1, 2, 3};
#else
int iorder[] = {3, 2, 1, 0};
#endif
#define STORECHAR(CH) \
do { \
p[iorder[3]] = ((CH) >> 24) & 0xff; \
p[iorder[2]] = ((CH) >> 16) & 0xff; \
p[iorder[1]] = ((CH) >> 8) & 0xff; \
p[iorder[0]] = (CH) & 0xff; \
p += 4; \
} while(0)
/* In narrow builds we can output surrogate pairs as one codepoint,
so we need less space. */
#ifndef Py_UNICODE_WIDE
for (i = pairs = 0; i < size-1; i++)
if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
pairs++;
#endif
v = PyBytes_FromStringAndSize(NULL,
4 * (size - pairs + (byteorder == 0)));
if (v == NULL)
return NULL;
p = (unsigned char *)PyBytes_AS_STRING(v);
if (byteorder == 0)
STORECHAR(0xFEFF);
if (size == 0)
return v;
if (byteorder == -1) {
/* force LE */
iorder[0] = 0;
iorder[1] = 1;
iorder[2] = 2;
iorder[3] = 3;
}
else if (byteorder == 1) {
/* force BE */
iorder[0] = 3;
iorder[1] = 2;
iorder[2] = 1;
iorder[3] = 0;
}
while (size-- > 0) {
Py_UCS4 ch = *s++;
#ifndef Py_UNICODE_WIDE
if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
Py_UCS4 ch2 = *s;
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
s++;
size--;
}
}
#endif
STORECHAR(ch);
}
return v;
#undef STORECHAR
}
PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
{
if (!PyUnicode_Check(unicode)) {
PyErr_BadArgument();
return NULL;
}
return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
PyUnicode_GET_SIZE(unicode),
NULL,
0);
}
/* --- UTF-16 Codec ------------------------------------------------------- */
PyObject *