From ac22f6aa989f18c33c12615af1c66c73cf75d5e7 Mon Sep 17 00:00:00 2001 From: Christopher Thorne Date: Thu, 1 Nov 2018 10:48:49 +0000 Subject: [PATCH] bpo-33578: Add getstate/setstate for CJK codec (GH-6984) This implements getstate and setstate for the cjkcodecs multibyte incremental encoders/decoders, primarily to fix issues with seek/tell. The encoder getstate/setstate is slightly tricky as the "state" is pending bytes + MultibyteCodec_State but only an integer can be returned. The approach I've taken is to encode this data into a long, similar to how .tell() encodes a "cookie_type" as a long. https://bugs.python.org/issue33578 --- Lib/test/test_io.py | 28 ++++ Lib/test/test_multibytecodec.py | 113 +++++++++++++ Misc/ACKS | 1 + .../2018-06-08-23-55-34.bpo-33578.7oSsjG.rst | 1 + Modules/cjkcodecs/_codecs_cn.c | 38 +++-- Modules/cjkcodecs/clinic/multibytecodec.c.h | 90 +++++++++- Modules/cjkcodecs/multibytecodec.c | 154 ++++++++++++++++++ Modules/cjkcodecs/multibytecodec.h | 13 +- 8 files changed, 416 insertions(+), 22 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2018-06-08-23-55-34.bpo-33578.7oSsjG.rst diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py index d927bb96ceb..14352ff84ff 100644 --- a/Lib/test/test_io.py +++ b/Lib/test/test_io.py @@ -2971,6 +2971,34 @@ class TextIOWrapperTest(unittest.TestCase): finally: StatefulIncrementalDecoder.codecEnabled = 0 + def test_multibyte_seek_and_tell(self): + f = self.open(support.TESTFN, "w", encoding="euc_jp") + f.write("AB\n\u3046\u3048\n") + f.close() + + f = self.open(support.TESTFN, "r", encoding="euc_jp") + self.assertEqual(f.readline(), "AB\n") + p0 = f.tell() + self.assertEqual(f.readline(), "\u3046\u3048\n") + p1 = f.tell() + f.seek(p0) + self.assertEqual(f.readline(), "\u3046\u3048\n") + self.assertEqual(f.tell(), p1) + f.close() + + def test_seek_with_encoder_state(self): + f = self.open(support.TESTFN, "w", encoding="euc_jis_2004") + f.write("\u00e6\u0300") + p0 = f.tell() + f.write("\u00e6") 
+ f.seek(p0) + f.write("\u0300") + f.close() + + f = self.open(support.TESTFN, "r", encoding="euc_jis_2004") + self.assertEqual(f.readline(), "\u00e6\u0300\u0300") + f.close() + def test_encoded_writes(self): data = "1234567890" tests = ("utf-16", diff --git a/Lib/test/test_multibytecodec.py b/Lib/test/test_multibytecodec.py index 01a1cd3c693..8e8362b70fd 100644 --- a/Lib/test/test_multibytecodec.py +++ b/Lib/test/test_multibytecodec.py @@ -117,6 +117,88 @@ class Test_IncrementalEncoder(unittest.TestCase): self.assertRaises(UnicodeEncodeError, encoder.encode, '\u0123') self.assertEqual(encoder.encode('', True), b'\xa9\xdc') + def test_state_methods_with_buffer_state(self): + # euc_jis_2004 stores state as a buffer of pending bytes + encoder = codecs.getincrementalencoder('euc_jis_2004')() + + initial_state = encoder.getstate() + self.assertEqual(encoder.encode('\u00e6\u0300'), b'\xab\xc4') + encoder.setstate(initial_state) + self.assertEqual(encoder.encode('\u00e6\u0300'), b'\xab\xc4') + + self.assertEqual(encoder.encode('\u00e6'), b'') + partial_state = encoder.getstate() + self.assertEqual(encoder.encode('\u0300'), b'\xab\xc4') + encoder.setstate(partial_state) + self.assertEqual(encoder.encode('\u0300'), b'\xab\xc4') + + def test_state_methods_with_non_buffer_state(self): + # iso2022_jp stores state without using a buffer + encoder = codecs.getincrementalencoder('iso2022_jp')() + + self.assertEqual(encoder.encode('z'), b'z') + en_state = encoder.getstate() + + self.assertEqual(encoder.encode('\u3042'), b'\x1b\x24\x42\x24\x22') + jp_state = encoder.getstate() + self.assertEqual(encoder.encode('z'), b'\x1b\x28\x42z') + + encoder.setstate(jp_state) + self.assertEqual(encoder.encode('\u3042'), b'\x24\x22') + + encoder.setstate(en_state) + self.assertEqual(encoder.encode('z'), b'z') + + def test_getstate_returns_expected_value(self): + # Note: getstate is implemented such that these state values + # are expected to be the same across all builds of Python, + # 
regardless of x32/64 bit, endianness and compiler. + + # euc_jis_2004 stores state as a buffer of pending bytes + buffer_state_encoder = codecs.getincrementalencoder('euc_jis_2004')() + self.assertEqual(buffer_state_encoder.getstate(), 0) + buffer_state_encoder.encode('\u00e6') + self.assertEqual(buffer_state_encoder.getstate(), + int.from_bytes( + b"\x02" + b"\xc3\xa6" + b"\x00\x00\x00\x00\x00\x00\x00\x00", + 'little')) + buffer_state_encoder.encode('\u0300') + self.assertEqual(buffer_state_encoder.getstate(), 0) + + # iso2022_jp stores state without using a buffer + non_buffer_state_encoder = codecs.getincrementalencoder('iso2022_jp')() + self.assertEqual(non_buffer_state_encoder.getstate(), + int.from_bytes( + b"\x00" + b"\x42\x42\x00\x00\x00\x00\x00\x00", + 'little')) + non_buffer_state_encoder.encode('\u3042') + self.assertEqual(non_buffer_state_encoder.getstate(), + int.from_bytes( + b"\x00" + b"\xc2\x42\x00\x00\x00\x00\x00\x00", + 'little')) + + def test_setstate_validates_input_size(self): + encoder = codecs.getincrementalencoder('euc_jp')() + pending_size_nine = int.from_bytes( + b"\x09" + b"\x00\x00\x00\x00\x00\x00\x00\x00" + b"\x00\x00\x00\x00\x00\x00\x00\x00", + 'little') + self.assertRaises(UnicodeError, encoder.setstate, pending_size_nine) + + def test_setstate_validates_input_bytes(self): + encoder = codecs.getincrementalencoder('euc_jp')() + invalid_utf8 = int.from_bytes( + b"\x01" + b"\xff" + b"\x00\x00\x00\x00\x00\x00\x00\x00", + 'little') + self.assertRaises(UnicodeDecodeError, encoder.setstate, invalid_utf8) + def test_issue5640(self): encoder = codecs.getincrementalencoder('shift-jis')('backslashreplace') self.assertEqual(encoder.encode('\xff'), b'\\xff') @@ -165,6 +247,37 @@ class Test_IncrementalDecoder(unittest.TestCase): decoder = codecs.getincrementaldecoder(enc)() self.assertRaises(TypeError, decoder.decode, "") + def test_state_methods(self): + decoder = codecs.getincrementaldecoder('euc_jp')() + + # Decode a complete input sequence + 
self.assertEqual(decoder.decode(b'\xa4\xa6'), '\u3046') + pending1, _ = decoder.getstate() + self.assertEqual(pending1, b'') + + # Decode first half of a partial input sequence + self.assertEqual(decoder.decode(b'\xa4'), '') + pending2, flags2 = decoder.getstate() + self.assertEqual(pending2, b'\xa4') + + # Decode second half of a partial input sequence + self.assertEqual(decoder.decode(b'\xa6'), '\u3046') + pending3, _ = decoder.getstate() + self.assertEqual(pending3, b'') + + # Jump back and decode second half of partial input sequence again + decoder.setstate((pending2, flags2)) + self.assertEqual(decoder.decode(b'\xa6'), '\u3046') + pending4, _ = decoder.getstate() + self.assertEqual(pending4, b'') + + def test_setstate_validates_input(self): + decoder = codecs.getincrementaldecoder('euc_jp')() + self.assertRaises(TypeError, decoder.setstate, 123) + self.assertRaises(TypeError, decoder.setstate, ("invalid", 0)) + self.assertRaises(TypeError, decoder.setstate, (b"1234", "invalid")) + self.assertRaises(UnicodeError, decoder.setstate, (b"123456789", 0)) + class Test_StreamReader(unittest.TestCase): def test_bug1728403(self): try: diff --git a/Misc/ACKS b/Misc/ACKS index 043d604a3f9..08ff6d11fdd 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -1626,6 +1626,7 @@ Nicolas M. 
Thiéry James Thomas Robin Thomas Brian Thorne +Christopher Thorne Stephen Thorne Jeremy Thurgood Eric Tiedemann diff --git a/Misc/NEWS.d/next/Library/2018-06-08-23-55-34.bpo-33578.7oSsjG.rst b/Misc/NEWS.d/next/Library/2018-06-08-23-55-34.bpo-33578.7oSsjG.rst new file mode 100644 index 00000000000..4e2e4627dc5 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2018-06-08-23-55-34.bpo-33578.7oSsjG.rst @@ -0,0 +1 @@ +Implement multibyte encoder/decoder state methods diff --git a/Modules/cjkcodecs/_codecs_cn.c b/Modules/cjkcodecs/_codecs_cn.c index 1fcc220b8db..8a62f7e257c 100644 --- a/Modules/cjkcodecs/_codecs_cn.c +++ b/Modules/cjkcodecs/_codecs_cn.c @@ -51,6 +51,12 @@ ; \ } +/* + * codecs in this file use the first byte of MultibyteCodec_State.c[8] + * to store a 0 or 1 state value + */ +#define CN_STATE_OFFSET 0 + /* * GB2312 codec */ @@ -329,15 +335,15 @@ DECODER(gb18030) ENCODER_INIT(hz) { - state->i = 0; + state->c[CN_STATE_OFFSET] = 0; return 0; } ENCODER_RESET(hz) { - if (state->i != 0) { + if (state->c[CN_STATE_OFFSET] != 0) { WRITEBYTE2('~', '}'); - state->i = 0; + state->c[CN_STATE_OFFSET] = 0; NEXT_OUT(2); } return 0; @@ -350,10 +356,10 @@ ENCODER(hz) DBCHAR code; if (c < 0x80) { - if (state->i) { + if (state->c[CN_STATE_OFFSET]) { WRITEBYTE2('~', '}'); NEXT_OUT(2); - state->i = 0; + state->c[CN_STATE_OFFSET] = 0; } WRITEBYTE1((unsigned char)c); NEXT(1, 1); @@ -375,10 +381,10 @@ ENCODER(hz) if (code & 0x8000) /* MSB set: GBK */ return 1; - if (state->i == 0) { + if (state->c[CN_STATE_OFFSET] == 0) { WRITEBYTE4('~', '{', code >> 8, code & 0xff); NEXT(1, 4); - state->i = 1; + state->c[CN_STATE_OFFSET] = 1; } else { WRITEBYTE2(code >> 8, code & 0xff); @@ -391,13 +397,13 @@ ENCODER(hz) DECODER_INIT(hz) { - state->i = 0; + state->c[CN_STATE_OFFSET] = 0; return 0; } DECODER_RESET(hz) { - state->i = 0; + state->c[CN_STATE_OFFSET] = 0; return 0; } @@ -411,14 +417,14 @@ DECODER(hz) unsigned char c2 = INBYTE2; REQUIRE_INBUF(2); - if (c2 
== '~' && state->c[CN_STATE_OFFSET] == 0) OUTCHAR('~'); - else if (c2 == '{' && state->i == 0) - state->i = 1; /* set GB */ - else if (c2 == '\n' && state->i == 0) + else if (c2 == '{' && state->c[CN_STATE_OFFSET] == 0) + state->c[CN_STATE_OFFSET] = 1; /* set GB */ + else if (c2 == '\n' && state->c[CN_STATE_OFFSET] == 0) ; /* line-continuation */ - else if (c2 == '}' && state->i == 1) - state->i = 0; /* set ASCII */ + else if (c2 == '}' && state->c[CN_STATE_OFFSET] == 1) + state->c[CN_STATE_OFFSET] = 0; /* set ASCII */ else return 1; NEXT_IN(2); @@ -428,7 +434,7 @@ DECODER(hz) if (c & 0x80) return 1; - if (state->i == 0) { /* ASCII mode */ + if (state->c[CN_STATE_OFFSET] == 0) { /* ASCII mode */ OUTCHAR(c); NEXT_IN(1); } diff --git a/Modules/cjkcodecs/clinic/multibytecodec.c.h b/Modules/cjkcodecs/clinic/multibytecodec.c.h index 25857fc6d6f..a58bb646a41 100644 --- a/Modules/cjkcodecs/clinic/multibytecodec.c.h +++ b/Modules/cjkcodecs/clinic/multibytecodec.c.h @@ -115,6 +115,50 @@ exit: return return_value; } +PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalEncoder_getstate__doc__, +"getstate($self, /)\n" +"--\n" +"\n"); + +#define _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_GETSTATE_METHODDEF \ + {"getstate", (PyCFunction)_multibytecodec_MultibyteIncrementalEncoder_getstate, METH_NOARGS, _multibytecodec_MultibyteIncrementalEncoder_getstate__doc__}, + +static PyObject * +_multibytecodec_MultibyteIncrementalEncoder_getstate_impl(MultibyteIncrementalEncoderObject *self); + +static PyObject * +_multibytecodec_MultibyteIncrementalEncoder_getstate(MultibyteIncrementalEncoderObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _multibytecodec_MultibyteIncrementalEncoder_getstate_impl(self); +} + +PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalEncoder_setstate__doc__, +"setstate($self, state, /)\n" +"--\n" +"\n"); + +#define _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_SETSTATE_METHODDEF \ + {"setstate", (PyCFunction)_multibytecodec_MultibyteIncrementalEncoder_setstate, 
METH_O, _multibytecodec_MultibyteIncrementalEncoder_setstate__doc__}, + +static PyObject * +_multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEncoderObject *self, + PyLongObject *statelong); + +static PyObject * +_multibytecodec_MultibyteIncrementalEncoder_setstate(MultibyteIncrementalEncoderObject *self, PyObject *arg) +{ + PyObject *return_value = NULL; + PyLongObject *statelong; + + if (!PyArg_Parse(arg, "O!:setstate", &PyLong_Type, &statelong)) { + goto exit; + } + return_value = _multibytecodec_MultibyteIncrementalEncoder_setstate_impl(self, statelong); + +exit: + return return_value; +} + PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalEncoder_reset__doc__, "reset($self, /)\n" "--\n" @@ -169,6 +213,50 @@ exit: return return_value; } +PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalDecoder_getstate__doc__, +"getstate($self, /)\n" +"--\n" +"\n"); + +#define _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_GETSTATE_METHODDEF \ + {"getstate", (PyCFunction)_multibytecodec_MultibyteIncrementalDecoder_getstate, METH_NOARGS, _multibytecodec_MultibyteIncrementalDecoder_getstate__doc__}, + +static PyObject * +_multibytecodec_MultibyteIncrementalDecoder_getstate_impl(MultibyteIncrementalDecoderObject *self); + +static PyObject * +_multibytecodec_MultibyteIncrementalDecoder_getstate(MultibyteIncrementalDecoderObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _multibytecodec_MultibyteIncrementalDecoder_getstate_impl(self); +} + +PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalDecoder_setstate__doc__, +"setstate($self, state, /)\n" +"--\n" +"\n"); + +#define _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_SETSTATE_METHODDEF \ + {"setstate", (PyCFunction)_multibytecodec_MultibyteIncrementalDecoder_setstate, METH_O, _multibytecodec_MultibyteIncrementalDecoder_setstate__doc__}, + +static PyObject * +_multibytecodec_MultibyteIncrementalDecoder_setstate_impl(MultibyteIncrementalDecoderObject *self, + PyObject *state); + +static PyObject * 
+_multibytecodec_MultibyteIncrementalDecoder_setstate(MultibyteIncrementalDecoderObject *self, PyObject *arg) +{ + PyObject *return_value = NULL; + PyObject *state; + + if (!PyArg_Parse(arg, "O!:setstate", &PyTuple_Type, &state)) { + goto exit; + } + return_value = _multibytecodec_MultibyteIncrementalDecoder_setstate_impl(self, state); + +exit: + return return_value; +} + PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalDecoder_reset__doc__, "reset($self, /)\n" "--\n" @@ -330,4 +418,4 @@ PyDoc_STRVAR(_multibytecodec___create_codec__doc__, #define _MULTIBYTECODEC___CREATE_CODEC_METHODDEF \ {"__create_codec", (PyCFunction)_multibytecodec___create_codec, METH_O, _multibytecodec___create_codec__doc__}, -/*[clinic end generated code: output=680f59f4cfe63c25 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=2fa0a38494716b97 input=a9049054013a1b77]*/ diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c index 22172b043bc..4633499a8ab 100644 --- a/Modules/cjkcodecs/multibytecodec.c +++ b/Modules/cjkcodecs/multibytecodec.c @@ -895,6 +895,93 @@ _multibytecodec_MultibyteIncrementalEncoder_encode_impl(MultibyteIncrementalEnco return encoder_encode_stateful(STATEFUL_ECTX(self), input, final); } +/*[clinic input] +_multibytecodec.MultibyteIncrementalEncoder.getstate +[clinic start generated code]*/ + +static PyObject * +_multibytecodec_MultibyteIncrementalEncoder_getstate_impl(MultibyteIncrementalEncoderObject *self) +/*[clinic end generated code: output=9794a5ace70d7048 input=4a2a82874ffa40bb]*/ +{ + /* state made up of 1 byte for buffer size, up to MAXENCPENDING*4 bytes + for UTF-8 encoded buffer (each character can use up to 4 + bytes), and required bytes for MultibyteCodec_State.c. A byte + array is used to avoid different compilers generating different + values for the same state, e.g. as a result of struct padding. 
+ */ + unsigned char statebytes[1 + MAXENCPENDING*4 + sizeof(self->state.c)]; + Py_ssize_t statesize; + const char *pendingbuffer = NULL; + Py_ssize_t pendingsize; + + if (self->pending != NULL) { + pendingbuffer = PyUnicode_AsUTF8AndSize(self->pending, &pendingsize); + if (pendingbuffer == NULL) { + return NULL; + } + if (pendingsize > MAXENCPENDING*4) { + PyErr_SetString(PyExc_UnicodeError, "pending buffer too large"); + return NULL; + } + statebytes[0] = pendingsize; + memcpy(statebytes+1, pendingbuffer, pendingsize); + statesize = 1 + pendingsize; + } else { + statebytes[0] = 0; + statesize = 1; + } + memcpy(statebytes+statesize, self->state.c, + sizeof(self->state.c)); + statesize += sizeof(self->state.c); + + return (PyObject *)_PyLong_FromByteArray(statebytes, statesize, + 1 /* little-endian */ , + 0 /* unsigned */ ); +} + +/*[clinic input] +_multibytecodec.MultibyteIncrementalEncoder.setstate + state as statelong: object(type='PyLongObject *', subclass_of='&PyLong_Type') + / +[clinic start generated code]*/ + +static PyObject * +_multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEncoderObject *self, + PyLongObject *statelong) +/*[clinic end generated code: output=4e5e98ac1f4039ca input=c80fb5830d4d2f76]*/ +{ + PyObject *pending = NULL; + unsigned char statebytes[1 + MAXENCPENDING*4 + sizeof(self->state.c)]; + + if (_PyLong_AsByteArray(statelong, statebytes, sizeof(statebytes), + 1 /* little-endian */ , + 0 /* unsigned */ ) < 0) { + goto errorexit; + } + + if (statebytes[0] > MAXENCPENDING*4) { + PyErr_SetString(PyExc_UnicodeError, "pending buffer too large"); + return NULL; + } + + pending = PyUnicode_DecodeUTF8((const char *)statebytes+1, + statebytes[0], "strict"); + if (pending == NULL) { + goto errorexit; + } + + Py_CLEAR(self->pending); + self->pending = pending; + memcpy(self->state.c, statebytes+1+statebytes[0], + sizeof(self->state.c)); + + Py_RETURN_NONE; + +errorexit: + Py_XDECREF(pending); + return NULL; +} + /*[clinic 
input] _multibytecodec.MultibyteIncrementalEncoder.reset [clinic start generated code]*/ @@ -919,6 +1006,8 @@ _multibytecodec_MultibyteIncrementalEncoder_reset_impl(MultibyteIncrementalEncod static struct PyMethodDef mbiencoder_methods[] = { _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_ENCODE_METHODDEF + _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_GETSTATE_METHODDEF + _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_SETSTATE_METHODDEF _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_RESET_METHODDEF {NULL, NULL}, }; @@ -984,6 +1073,7 @@ mbiencoder_dealloc(MultibyteIncrementalEncoderObject *self) { PyObject_GC_UnTrack(self); ERROR_DECREF(self->errors); + Py_CLEAR(self->pending); Py_TYPE(self)->tp_free(self); } @@ -1119,6 +1209,68 @@ errorexit: return NULL; } +/*[clinic input] +_multibytecodec.MultibyteIncrementalDecoder.getstate +[clinic start generated code]*/ + +static PyObject * +_multibytecodec_MultibyteIncrementalDecoder_getstate_impl(MultibyteIncrementalDecoderObject *self) +/*[clinic end generated code: output=255009c4713b7f82 input=4006aa49bddbaa75]*/ +{ + PyObject *buffer; + + buffer = PyBytes_FromStringAndSize((const char *)self->pending, + self->pendingsize); + if (buffer == NULL) { + return NULL; + } + + return make_tuple(buffer, (Py_ssize_t)*self->state.c); +} + +/*[clinic input] +_multibytecodec.MultibyteIncrementalDecoder.setstate + state: object(subclass_of='&PyTuple_Type') + / +[clinic start generated code]*/ + +static PyObject * +_multibytecodec_MultibyteIncrementalDecoder_setstate_impl(MultibyteIncrementalDecoderObject *self, + PyObject *state) +/*[clinic end generated code: output=106b2fbca3e2dcc2 input=e5d794e8baba1a47]*/ +{ + PyObject *buffer; + Py_ssize_t buffersize; + char *bufferstr; + unsigned long long flag; + + if (!PyArg_ParseTuple(state, "SK;setstate(): illegal state argument", + &buffer, &flag)) + { + return NULL; + } + + buffersize = PyBytes_Size(buffer); + if (buffersize == -1) { + return NULL; + } + + if (buffersize > MAXDECPENDING) { + 
PyErr_SetString(PyExc_UnicodeError, "pending buffer too large"); + return NULL; + } + + bufferstr = PyBytes_AsString(buffer); + if (bufferstr == NULL) { + return NULL; + } + self->pendingsize = buffersize; + memcpy(self->pending, bufferstr, self->pendingsize); + memcpy(self->state.c, (unsigned char *)&flag, sizeof(flag)); + + Py_RETURN_NONE; +} + /*[clinic input] _multibytecodec.MultibyteIncrementalDecoder.reset [clinic start generated code]*/ @@ -1137,6 +1289,8 @@ _multibytecodec_MultibyteIncrementalDecoder_reset_impl(MultibyteIncrementalDecod static struct PyMethodDef mbidecoder_methods[] = { _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_DECODE_METHODDEF + _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_GETSTATE_METHODDEF + _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_SETSTATE_METHODDEF _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_RESET_METHODDEF {NULL, NULL}, }; diff --git a/Modules/cjkcodecs/multibytecodec.h b/Modules/cjkcodecs/multibytecodec.h index 5b8c22276b4..6d34534ee68 100644 --- a/Modules/cjkcodecs/multibytecodec.h +++ b/Modules/cjkcodecs/multibytecodec.h @@ -16,12 +16,15 @@ typedef uint16_t ucs2_t, DBCHAR; typedef unsigned short ucs2_t, DBCHAR; #endif -typedef union { - void *p; - int i; +/* + * A struct that provides 8 bytes of state for multibyte + * codecs. Codecs are free to use this how they want. Note: if you + * need to add a new field to this struct, ensure that its byte order + * is independent of CPU endianness so that the return value of + * getstate doesn't differ between little and big endian CPUs. + */ +typedef struct { unsigned char c[8]; - ucs2_t u2[4]; - Py_UCS4 u4[2]; } MultibyteCodec_State; typedef int (*mbcodec_init)(const void *config);