mirror of https://github.com/python/cpython
Issue #10783: struct.pack() doesn't encode implicitly unicode to UTF-8
* Replace "bytes" by "bytes object" in struct error messages * Document the API change in What's new in Python 3.2 * Fix test_wave * Remove also ugly implicit conversions in test_struct
This commit is contained in:
parent
e398da9ad0
commit
da9ec995f6
|
@ -164,58 +164,53 @@ platform-dependent.
|
|||
+--------+--------------------------+--------------------+----------------+------------+
|
||||
| ``c`` | :c:type:`char` | bytes of length 1 | 1 | |
|
||||
+--------+--------------------------+--------------------+----------------+------------+
|
||||
| ``b`` | :c:type:`signed char` | integer | 1 | \(1),\(4) |
|
||||
| ``b`` | :c:type:`signed char` | integer | 1 | \(1),\(3) |
|
||||
+--------+--------------------------+--------------------+----------------+------------+
|
||||
| ``B`` | :c:type:`unsigned char` | integer | 1 | \(4) |
|
||||
| ``B`` | :c:type:`unsigned char` | integer | 1 | \(3) |
|
||||
+--------+--------------------------+--------------------+----------------+------------+
|
||||
| ``?`` | :c:type:`_Bool` | bool | 1 | \(2) |
|
||||
| ``?`` | :c:type:`_Bool` | bool | 1 | \(1) |
|
||||
+--------+--------------------------+--------------------+----------------+------------+
|
||||
| ``h`` | :c:type:`short` | integer | 2 | \(4) |
|
||||
| ``h`` | :c:type:`short` | integer | 2 | \(3) |
|
||||
+--------+--------------------------+--------------------+----------------+------------+
|
||||
| ``H`` | :c:type:`unsigned short` | integer | 2 | \(4) |
|
||||
| ``H`` | :c:type:`unsigned short` | integer | 2 | \(3) |
|
||||
+--------+--------------------------+--------------------+----------------+------------+
|
||||
| ``i`` | :c:type:`int` | integer | 4 | \(4) |
|
||||
| ``i`` | :c:type:`int` | integer | 4 | \(3) |
|
||||
+--------+--------------------------+--------------------+----------------+------------+
|
||||
| ``I`` | :c:type:`unsigned int` | integer | 4 | \(4) |
|
||||
| ``I`` | :c:type:`unsigned int` | integer | 4 | \(3) |
|
||||
+--------+--------------------------+--------------------+----------------+------------+
|
||||
| ``l`` | :c:type:`long` | integer | 4 | \(4) |
|
||||
| ``l`` | :c:type:`long` | integer | 4 | \(3) |
|
||||
+--------+--------------------------+--------------------+----------------+------------+
|
||||
| ``L`` | :c:type:`unsigned long` | integer | 4 | \(4) |
|
||||
| ``L`` | :c:type:`unsigned long` | integer | 4 | \(3) |
|
||||
+--------+--------------------------+--------------------+----------------+------------+
|
||||
| ``q`` | :c:type:`long long` | integer | 8 | \(3), \(4) |
|
||||
| ``q`` | :c:type:`long long` | integer | 8 | \(2), \(3) |
|
||||
+--------+--------------------------+--------------------+----------------+------------+
|
||||
| ``Q`` | :c:type:`unsigned long | integer | 8 | \(3), \(4) |
|
||||
| ``Q`` | :c:type:`unsigned long | integer | 8 | \(2), \(3) |
|
||||
| | long` | | | |
|
||||
+--------+--------------------------+--------------------+----------------+------------+
|
||||
| ``f`` | :c:type:`float` | float | 4 | \(5) |
|
||||
| ``f`` | :c:type:`float` | float | 4 | \(4) |
|
||||
+--------+--------------------------+--------------------+----------------+------------+
|
||||
| ``d`` | :c:type:`double` | float | 8 | \(5) |
|
||||
| ``d`` | :c:type:`double` | float | 8 | \(4) |
|
||||
+--------+--------------------------+--------------------+----------------+------------+
|
||||
| ``s`` | :c:type:`char[]` | bytes | | \(1) |
|
||||
| ``s`` | :c:type:`char[]` | bytes | | |
|
||||
+--------+--------------------------+--------------------+----------------+------------+
|
||||
| ``p`` | :c:type:`char[]` | bytes | | \(1) |
|
||||
| ``p`` | :c:type:`char[]` | bytes | | |
|
||||
+--------+--------------------------+--------------------+----------------+------------+
|
||||
| ``P`` | :c:type:`void \*` | integer | | \(6) |
|
||||
| ``P`` | :c:type:`void \*` | integer | | \(5) |
|
||||
+--------+--------------------------+--------------------+----------------+------------+
|
||||
|
||||
Notes:
|
||||
|
||||
(1)
|
||||
The ``c``, ``s`` and ``p`` conversion codes operate on :class:`bytes`
|
||||
objects, but packing with such codes also supports :class:`str` objects,
|
||||
which are encoded using UTF-8.
|
||||
|
||||
(2)
|
||||
The ``'?'`` conversion code corresponds to the :c:type:`_Bool` type defined by
|
||||
C99. If this type is not available, it is simulated using a :c:type:`char`. In
|
||||
standard mode, it is always represented by one byte.
|
||||
|
||||
(3)
|
||||
(2)
|
||||
The ``'q'`` and ``'Q'`` conversion codes are available in native mode only if
|
||||
the platform C compiler supports C :c:type:`long long`, or, on Windows,
|
||||
:c:type:`__int64`. They are always available in standard modes.
|
||||
|
||||
(4)
|
||||
(3)
|
||||
When attempting to pack a non-integer using any of the integer conversion
|
||||
codes, if the non-integer has a :meth:`__index__` method then that method is
|
||||
called to convert the argument to an integer before packing.
|
||||
|
@ -223,12 +218,12 @@ Notes:
|
|||
.. versionchanged:: 3.2
|
||||
Use of the :meth:`__index__` method for non-integers is new in 3.2.
|
||||
|
||||
(5)
|
||||
(4)
|
||||
For the ``'f'`` and ``'d'`` conversion codes, the packed representation uses
|
||||
the IEEE 754 binary32 (for ``'f'``) or binary64 (for ``'d'``) format,
|
||||
regardless of the floating-point format used by the platform.
|
||||
|
||||
(6)
|
||||
(5)
|
||||
The ``'P'`` format character is only available for the native byte ordering
|
||||
(selected as the default or with the ``'@'`` byte order character). The byte
|
||||
order character ``'='`` chooses to use little- or big-endian ordering based
|
||||
|
@ -310,9 +305,9 @@ the result in a named tuple::
|
|||
The ordering of format characters may have an impact on size since the padding
|
||||
needed to satisfy alignment requirements is different::
|
||||
|
||||
>>> pack('ci', '*', 0x12131415)
|
||||
>>> pack('ci', b'*', 0x12131415)
|
||||
b'*\x00\x00\x00\x12\x13\x14\x15'
|
||||
>>> pack('ic', 0x12131415, '*')
|
||||
>>> pack('ic', 0x12131415, b'*')
|
||||
b'\x12\x13\x14\x15*'
|
||||
>>> calcsize('ci')
|
||||
8
|
||||
|
|
|
@ -1705,3 +1705,7 @@ require changes to your code:
|
|||
|
||||
(Contributed by Georg Brandl and Mattias Brändström;
|
||||
`appspot issue 53094 <http://codereview.appspot.com/53094>`_.)
|
||||
|
||||
* :func:`struct.pack` doesn't encode implicitly unicode to UTF-8 anymore: use
|
||||
explicit conversion instead and replace unicode literals by bytes literals.
|
||||
|
||||
|
|
|
@ -82,58 +82,52 @@ class StructTest(unittest.TestCase):
|
|||
# Test some of the new features in detail
|
||||
# (format, argument, big-endian result, little-endian result, asymmetric)
|
||||
tests = [
|
||||
('c', 'a', 'a', 'a', 0),
|
||||
('xc', 'a', '\0a', '\0a', 0),
|
||||
('cx', 'a', 'a\0', 'a\0', 0),
|
||||
('s', 'a', 'a', 'a', 0),
|
||||
('0s', 'helloworld', '', '', 1),
|
||||
('1s', 'helloworld', 'h', 'h', 1),
|
||||
('9s', 'helloworld', 'helloworl', 'helloworl', 1),
|
||||
('10s', 'helloworld', 'helloworld', 'helloworld', 0),
|
||||
('11s', 'helloworld', 'helloworld\0', 'helloworld\0', 1),
|
||||
('20s', 'helloworld', 'helloworld'+10*'\0', 'helloworld'+10*'\0', 1),
|
||||
('b', 7, '\7', '\7', 0),
|
||||
('b', -7, '\371', '\371', 0),
|
||||
('B', 7, '\7', '\7', 0),
|
||||
('B', 249, '\371', '\371', 0),
|
||||
('h', 700, '\002\274', '\274\002', 0),
|
||||
('h', -700, '\375D', 'D\375', 0),
|
||||
('H', 700, '\002\274', '\274\002', 0),
|
||||
('H', 0x10000-700, '\375D', 'D\375', 0),
|
||||
('i', 70000000, '\004,\035\200', '\200\035,\004', 0),
|
||||
('i', -70000000, '\373\323\342\200', '\200\342\323\373', 0),
|
||||
('I', 70000000, '\004,\035\200', '\200\035,\004', 0),
|
||||
('I', 0x100000000-70000000, '\373\323\342\200', '\200\342\323\373', 0),
|
||||
('l', 70000000, '\004,\035\200', '\200\035,\004', 0),
|
||||
('l', -70000000, '\373\323\342\200', '\200\342\323\373', 0),
|
||||
('L', 70000000, '\004,\035\200', '\200\035,\004', 0),
|
||||
('L', 0x100000000-70000000, '\373\323\342\200', '\200\342\323\373', 0),
|
||||
('f', 2.0, '@\000\000\000', '\000\000\000@', 0),
|
||||
('d', 2.0, '@\000\000\000\000\000\000\000',
|
||||
'\000\000\000\000\000\000\000@', 0),
|
||||
('f', -2.0, '\300\000\000\000', '\000\000\000\300', 0),
|
||||
('d', -2.0, '\300\000\000\000\000\000\000\000',
|
||||
'\000\000\000\000\000\000\000\300', 0),
|
||||
('?', 0, '\0', '\0', 0),
|
||||
('?', 3, '\1', '\1', 1),
|
||||
('?', True, '\1', '\1', 0),
|
||||
('?', [], '\0', '\0', 1),
|
||||
('?', (1,), '\1', '\1', 1),
|
||||
('c', b'a', b'a', b'a', 0),
|
||||
('xc', b'a', b'\0a', b'\0a', 0),
|
||||
('cx', b'a', b'a\0', b'a\0', 0),
|
||||
('s', b'a', b'a', b'a', 0),
|
||||
('0s', b'helloworld', b'', b'', 1),
|
||||
('1s', b'helloworld', b'h', b'h', 1),
|
||||
('9s', b'helloworld', b'helloworl', b'helloworl', 1),
|
||||
('10s', b'helloworld', b'helloworld', b'helloworld', 0),
|
||||
('11s', b'helloworld', b'helloworld\0', b'helloworld\0', 1),
|
||||
('20s', b'helloworld', b'helloworld'+10*b'\0', b'helloworld'+10*b'\0', 1),
|
||||
('b', 7, b'\7', b'\7', 0),
|
||||
('b', -7, b'\371', b'\371', 0),
|
||||
('B', 7, b'\7', b'\7', 0),
|
||||
('B', 249, b'\371', b'\371', 0),
|
||||
('h', 700, b'\002\274', b'\274\002', 0),
|
||||
('h', -700, b'\375D', b'D\375', 0),
|
||||
('H', 700, b'\002\274', b'\274\002', 0),
|
||||
('H', 0x10000-700, b'\375D', b'D\375', 0),
|
||||
('i', 70000000, b'\004,\035\200', b'\200\035,\004', 0),
|
||||
('i', -70000000, b'\373\323\342\200', b'\200\342\323\373', 0),
|
||||
('I', 70000000, b'\004,\035\200', b'\200\035,\004', 0),
|
||||
('I', 0x100000000-70000000, b'\373\323\342\200', b'\200\342\323\373', 0),
|
||||
('l', 70000000, b'\004,\035\200', b'\200\035,\004', 0),
|
||||
('l', -70000000, b'\373\323\342\200', b'\200\342\323\373', 0),
|
||||
('L', 70000000, b'\004,\035\200', b'\200\035,\004', 0),
|
||||
('L', 0x100000000-70000000, b'\373\323\342\200', b'\200\342\323\373', 0),
|
||||
('f', 2.0, b'@\000\000\000', b'\000\000\000@', 0),
|
||||
('d', 2.0, b'@\000\000\000\000\000\000\000',
|
||||
b'\000\000\000\000\000\000\000@', 0),
|
||||
('f', -2.0, b'\300\000\000\000', b'\000\000\000\300', 0),
|
||||
('d', -2.0, b'\300\000\000\000\000\000\000\000',
|
||||
b'\000\000\000\000\000\000\000\300', 0),
|
||||
('?', 0, b'\0', b'\0', 0),
|
||||
('?', 3, b'\1', b'\1', 1),
|
||||
('?', True, b'\1', b'\1', 0),
|
||||
('?', [], b'\0', b'\0', 1),
|
||||
('?', (1,), b'\1', b'\1', 1),
|
||||
]
|
||||
|
||||
for fmt, arg, big, lil, asy in tests:
|
||||
big = bytes(big, "latin-1")
|
||||
lil = bytes(lil, "latin-1")
|
||||
for (xfmt, exp) in [('>'+fmt, big), ('!'+fmt, big), ('<'+fmt, lil),
|
||||
('='+fmt, ISBIGENDIAN and big or lil)]:
|
||||
res = struct.pack(xfmt, arg)
|
||||
self.assertEqual(res, exp)
|
||||
self.assertEqual(struct.calcsize(xfmt), len(res))
|
||||
rev = struct.unpack(xfmt, res)[0]
|
||||
if isinstance(arg, str):
|
||||
# Strings are returned as bytes since you can't know the
|
||||
# encoding of the string when packed.
|
||||
arg = bytes(arg, 'latin1')
|
||||
if rev != arg:
|
||||
self.assertTrue(asy)
|
||||
|
||||
|
@ -334,15 +328,14 @@ class StructTest(unittest.TestCase):
|
|||
def test_p_code(self):
|
||||
# Test p ("Pascal string") code.
|
||||
for code, input, expected, expectedback in [
|
||||
('p','abc', '\x00', b''),
|
||||
('1p', 'abc', '\x00', b''),
|
||||
('2p', 'abc', '\x01a', b'a'),
|
||||
('3p', 'abc', '\x02ab', b'ab'),
|
||||
('4p', 'abc', '\x03abc', b'abc'),
|
||||
('5p', 'abc', '\x03abc\x00', b'abc'),
|
||||
('6p', 'abc', '\x03abc\x00\x00', b'abc'),
|
||||
('1000p', 'x'*1000, '\xff' + 'x'*999, b'x'*255)]:
|
||||
expected = bytes(expected, "latin-1")
|
||||
('p', b'abc', b'\x00', b''),
|
||||
('1p', b'abc', b'\x00', b''),
|
||||
('2p', b'abc', b'\x01a', b'a'),
|
||||
('3p', b'abc', b'\x02ab', b'ab'),
|
||||
('4p', b'abc', b'\x03abc', b'abc'),
|
||||
('5p', b'abc', b'\x03abc\x00', b'abc'),
|
||||
('6p', b'abc', b'\x03abc\x00\x00', b'abc'),
|
||||
('1000p', b'x'*1000, b'\xff' + b'x'*999, b'x'*255)]:
|
||||
got = struct.pack(code, input)
|
||||
self.assertEqual(got, expected)
|
||||
(got,) = struct.unpack(code, got)
|
||||
|
@ -401,15 +394,11 @@ class StructTest(unittest.TestCase):
|
|||
s = struct.Struct(fmt)
|
||||
for cls in (bytes, bytearray):
|
||||
data = cls(test_string)
|
||||
if not isinstance(data, (bytes, bytearray)):
|
||||
bytes_data = bytes(data, 'latin1')
|
||||
else:
|
||||
bytes_data = data
|
||||
self.assertEqual(s.unpack_from(data), (b'abcd',))
|
||||
self.assertEqual(s.unpack_from(data, 2), (b'cd01',))
|
||||
self.assertEqual(s.unpack_from(data, 4), (b'0123',))
|
||||
for i in range(6):
|
||||
self.assertEqual(s.unpack_from(data, i), (bytes_data[i:i+4],))
|
||||
self.assertEqual(s.unpack_from(data, i), (data[i:i+4],))
|
||||
for i in range(6, len(test_string) + 1):
|
||||
self.assertRaises(struct.error, s.unpack_from, data, i)
|
||||
for cls in (bytes, bytearray):
|
||||
|
|
|
@ -467,11 +467,11 @@ class Wave_write:
|
|||
self._datalength = self._nframes * self._nchannels * self._sampwidth
|
||||
self._form_length_pos = self._file.tell()
|
||||
self._file.write(struct.pack('<l4s4slhhllhh4s',
|
||||
36 + self._datalength, 'WAVE', 'fmt ', 16,
|
||||
36 + self._datalength, b'WAVE', b'fmt ', 16,
|
||||
WAVE_FORMAT_PCM, self._nchannels, self._framerate,
|
||||
self._nchannels * self._framerate * self._sampwidth,
|
||||
self._nchannels * self._sampwidth,
|
||||
self._sampwidth * 8, 'data'))
|
||||
self._sampwidth * 8, b'data'))
|
||||
self._data_length_pos = self._file.tell()
|
||||
self._file.write(struct.pack('<l', self._datalength))
|
||||
self._headerwritten = True
|
||||
|
|
|
@ -18,6 +18,9 @@ Core and Builtins
|
|||
Library
|
||||
-------
|
||||
|
||||
- Issue #10783: struct.pack() doesn't encode implicitly unicode to UTF-8
|
||||
anymore.
|
||||
|
||||
- Issue #10730: Add SVG mime types to mimetypes module.
|
||||
|
||||
- Issue #10768: Make the Tkinter ScrolledText widget work again.
|
||||
|
|
|
@ -462,14 +462,9 @@ np_ubyte(char *p, PyObject *v, const formatdef *f)
|
|||
static int
|
||||
np_char(char *p, PyObject *v, const formatdef *f)
|
||||
{
|
||||
if (PyUnicode_Check(v)) {
|
||||
v = _PyUnicode_AsDefaultEncodedString(v, NULL);
|
||||
if (v == NULL)
|
||||
return -1;
|
||||
}
|
||||
if (!PyBytes_Check(v) || PyBytes_Size(v) != 1) {
|
||||
PyErr_SetString(StructError,
|
||||
"char format requires bytes or string of length 1");
|
||||
"char format requires a bytes object of length 1");
|
||||
return -1;
|
||||
}
|
||||
*p = *PyBytes_AsString(v);
|
||||
|
@ -1345,7 +1340,7 @@ s_init(PyObject *self, PyObject *args, PyObject *kwds)
|
|||
if (!PyBytes_Check(o_format)) {
|
||||
Py_DECREF(o_format);
|
||||
PyErr_Format(PyExc_TypeError,
|
||||
"Struct() argument 1 must be bytes, not %.200s",
|
||||
"Struct() argument 1 must be a bytes object, not %.200s",
|
||||
Py_TYPE(o_format)->tp_name);
|
||||
return -1;
|
||||
}
|
||||
|
@ -1423,7 +1418,7 @@ s_unpack(PyObject *self, PyObject *input)
|
|||
return NULL;
|
||||
if (vbuf.len != soself->s_size) {
|
||||
PyErr_Format(StructError,
|
||||
"unpack requires a bytes argument of length %zd",
|
||||
"unpack requires a bytes object of length %zd",
|
||||
soself->s_size);
|
||||
PyBuffer_Release(&vbuf);
|
||||
return NULL;
|
||||
|
@ -1503,15 +1498,10 @@ s_pack_internal(PyStructObject *soself, PyObject *args, int offset, char* buf)
|
|||
if (e->format == 's') {
|
||||
int isstring;
|
||||
void *p;
|
||||
if (PyUnicode_Check(v)) {
|
||||
v = _PyUnicode_AsDefaultEncodedString(v, NULL);
|
||||
if (v == NULL)
|
||||
return -1;
|
||||
}
|
||||
isstring = PyBytes_Check(v);
|
||||
if (!isstring && !PyByteArray_Check(v)) {
|
||||
PyErr_SetString(StructError,
|
||||
"argument for 's' must be a bytes or string");
|
||||
"argument for 's' must be a bytes object");
|
||||
return -1;
|
||||
}
|
||||
if (isstring) {
|
||||
|
@ -1529,15 +1519,10 @@ s_pack_internal(PyStructObject *soself, PyObject *args, int offset, char* buf)
|
|||
} else if (e->format == 'p') {
|
||||
int isstring;
|
||||
void *p;
|
||||
if (PyUnicode_Check(v)) {
|
||||
v = _PyUnicode_AsDefaultEncodedString(v, NULL);
|
||||
if (v == NULL)
|
||||
return -1;
|
||||
}
|
||||
isstring = PyBytes_Check(v);
|
||||
if (!isstring && !PyByteArray_Check(v)) {
|
||||
PyErr_SetString(StructError,
|
||||
"argument for 'p' must be a bytes or string");
|
||||
"argument for 'p' must be a bytes object");
|
||||
return -1;
|
||||
}
|
||||
if (isstring) {
|
||||
|
@ -1691,7 +1676,7 @@ static struct PyMethodDef s_methods[] = {
|
|||
{NULL, NULL} /* sentinel */
|
||||
};
|
||||
|
||||
PyDoc_STRVAR(s__doc__,
|
||||
PyDoc_STRVAR(s__doc__,
|
||||
"Struct(fmt) --> compiled struct object\n"
|
||||
"\n"
|
||||
"Return a new Struct object which writes and reads binary data according to\n"
|
||||
|
|
Loading…
Reference in New Issue