bpo-33578: Add getstate/setstate for CJK codec (GH-6984)

This implements getstate and setstate for the cjkcodecs multibyte incremental encoders/decoders, primarily to fix issues with seek/tell.

The encoder getstate/setstate is slightly tricky because the "state" consists of the pending buffer plus MultibyteCodec_State, but only an integer can be returned. The approach I've taken is to pack this data into a single integer (a C "long" object), similar to how .tell() encodes a "cookie_type" as a long.


https://bugs.python.org/issue33578
This commit is contained in:
Christopher Thorne 2018-11-01 10:48:49 +00:00 committed by Miss Islington (bot)
parent 4b5e62dbb2
commit ac22f6aa98
8 changed files with 416 additions and 22 deletions

View File

@ -2971,6 +2971,34 @@ class TextIOWrapperTest(unittest.TestCase):
finally:
StatefulIncrementalDecoder.codecEnabled = 0
def test_multibyte_seek_and_tell(self):
    """tell()/seek() must round-trip on a text file read via a multibyte codec.

    Two lines are written with the euc_jp codec.  After reading the first
    line the position is recorded, the second line is read, and then the
    stream is rewound to the recorded position: the second line and the
    tell() value that follows it must both be reproduced.
    """
    # Context managers guarantee the file is closed even when an
    # assertion below fails, so no open handle outlives the test.
    with self.open(support.TESTFN, "w", encoding="euc_jp") as f:
        f.write("AB\n\u3046\u3048\n")

    with self.open(support.TESTFN, "r", encoding="euc_jp") as f:
        self.assertEqual(f.readline(), "AB\n")
        p0 = f.tell()
        self.assertEqual(f.readline(), "\u3046\u3048\n")
        p1 = f.tell()
        f.seek(p0)
        self.assertEqual(f.readline(), "\u3046\u3048\n")
        self.assertEqual(f.tell(), p1)
def test_seek_with_encoder_state(self):
    """seek() on a writer must restore the encoder state saved by tell().

    After the position p0 is taken, a lone "\\u00e6" is written and then
    abandoned by seeking back to p0.  The "\\u0300" written afterwards must
    be encoded as if that abandoned character had never been written, so
    the file reads back as "\\u00e6\\u0300\\u0300".
    """
    # The with-block closes (and therefore flushes) the file exactly where
    # the explicit close() used to, but also does so if a call raises.
    with self.open(support.TESTFN, "w", encoding="euc_jis_2004") as f:
        f.write("\u00e6\u0300")
        p0 = f.tell()
        f.write("\u00e6")
        f.seek(p0)
        f.write("\u0300")

    with self.open(support.TESTFN, "r", encoding="euc_jis_2004") as f:
        self.assertEqual(f.readline(), "\u00e6\u0300\u0300")
def test_encoded_writes(self):
data = "1234567890"
tests = ("utf-16",

View File

@ -117,6 +117,88 @@ class Test_IncrementalEncoder(unittest.TestCase):
self.assertRaises(UnicodeEncodeError, encoder.encode, '\u0123')
self.assertEqual(encoder.encode('', True), b'\xa9\xdc')
def test_state_methods_with_buffer_state(self):
# euc_jis_2004 stores state as a buffer of pending bytes
encoder = codecs.getincrementalencoder('euc_jis_2004')()
initial_state = encoder.getstate()
self.assertEqual(encoder.encode('\u00e6\u0300'), b'\xab\xc4')
encoder.setstate(initial_state)
self.assertEqual(encoder.encode('\u00e6\u0300'), b'\xab\xc4')
self.assertEqual(encoder.encode('\u00e6'), b'')
partial_state = encoder.getstate()
self.assertEqual(encoder.encode('\u0300'), b'\xab\xc4')
encoder.setstate(partial_state)
self.assertEqual(encoder.encode('\u0300'), b'\xab\xc4')
def test_state_methods_with_non_buffer_state(self):
# iso2022_jp stores state without using a buffer
encoder = codecs.getincrementalencoder('iso2022_jp')()
self.assertEqual(encoder.encode('z'), b'z')
en_state = encoder.getstate()
self.assertEqual(encoder.encode('\u3042'), b'\x1b\x24\x42\x24\x22')
jp_state = encoder.getstate()
self.assertEqual(encoder.encode('z'), b'\x1b\x28\x42z')
encoder.setstate(jp_state)
self.assertEqual(encoder.encode('\u3042'), b'\x24\x22')
encoder.setstate(en_state)
self.assertEqual(encoder.encode('z'), b'z')
def test_getstate_returns_expected_value(self):
# Note: getstate is implemented such that these state values
# are expected to be the same across all builds of Python,
# regardless of x32/64 bit, endianness and compiler.
# euc_jis_2004 stores state as a buffer of pending bytes
buffer_state_encoder = codecs.getincrementalencoder('euc_jis_2004')()
self.assertEqual(buffer_state_encoder.getstate(), 0)
buffer_state_encoder.encode('\u00e6')
self.assertEqual(buffer_state_encoder.getstate(),
int.from_bytes(
b"\x02"
b"\xc3\xa6"
b"\x00\x00\x00\x00\x00\x00\x00\x00",
'little'))
buffer_state_encoder.encode('\u0300')
self.assertEqual(buffer_state_encoder.getstate(), 0)
# iso2022_jp stores state without using a buffer
non_buffer_state_encoder = codecs.getincrementalencoder('iso2022_jp')()
self.assertEqual(non_buffer_state_encoder.getstate(),
int.from_bytes(
b"\x00"
b"\x42\x42\x00\x00\x00\x00\x00\x00",
'little'))
non_buffer_state_encoder.encode('\u3042')
self.assertEqual(non_buffer_state_encoder.getstate(),
int.from_bytes(
b"\x00"
b"\xc2\x42\x00\x00\x00\x00\x00\x00",
'little'))
def test_setstate_validates_input_size(self):
encoder = codecs.getincrementalencoder('euc_jp')()
pending_size_nine = int.from_bytes(
b"\x09"
b"\x00\x00\x00\x00\x00\x00\x00\x00"
b"\x00\x00\x00\x00\x00\x00\x00\x00",
'little')
self.assertRaises(UnicodeError, encoder.setstate, pending_size_nine)
def test_setstate_validates_input_bytes(self):
encoder = codecs.getincrementalencoder('euc_jp')()
invalid_utf8 = int.from_bytes(
b"\x01"
b"\xff"
b"\x00\x00\x00\x00\x00\x00\x00\x00",
'little')
self.assertRaises(UnicodeDecodeError, encoder.setstate, invalid_utf8)
def test_issue5640(self):
encoder = codecs.getincrementalencoder('shift-jis')('backslashreplace')
self.assertEqual(encoder.encode('\xff'), b'\\xff')
@ -165,6 +247,37 @@ class Test_IncrementalDecoder(unittest.TestCase):
decoder = codecs.getincrementaldecoder(enc)()
self.assertRaises(TypeError, decoder.decode, "")
def test_state_methods(self):
decoder = codecs.getincrementaldecoder('euc_jp')()
# Decode a complete input sequence
self.assertEqual(decoder.decode(b'\xa4\xa6'), '\u3046')
pending1, _ = decoder.getstate()
self.assertEqual(pending1, b'')
# Decode first half of a partial input sequence
self.assertEqual(decoder.decode(b'\xa4'), '')
pending2, flags2 = decoder.getstate()
self.assertEqual(pending2, b'\xa4')
# Decode second half of a partial input sequence
self.assertEqual(decoder.decode(b'\xa6'), '\u3046')
pending3, _ = decoder.getstate()
self.assertEqual(pending3, b'')
# Jump back and decode second half of partial input sequence again
decoder.setstate((pending2, flags2))
self.assertEqual(decoder.decode(b'\xa6'), '\u3046')
pending4, _ = decoder.getstate()
self.assertEqual(pending4, b'')
def test_setstate_validates_input(self):
decoder = codecs.getincrementaldecoder('euc_jp')()
self.assertRaises(TypeError, decoder.setstate, 123)
self.assertRaises(TypeError, decoder.setstate, ("invalid", 0))
self.assertRaises(TypeError, decoder.setstate, (b"1234", "invalid"))
self.assertRaises(UnicodeError, decoder.setstate, (b"123456789", 0))
class Test_StreamReader(unittest.TestCase):
def test_bug1728403(self):
try:

View File

@ -1626,6 +1626,7 @@ Nicolas M. Thiéry
James Thomas
Robin Thomas
Brian Thorne
Christopher Thorne
Stephen Thorne
Jeremy Thurgood
Eric Tiedemann

View File

@ -0,0 +1 @@
Implement multibyte encoder/decoder state methods

View File

@ -51,6 +51,12 @@
; \
}
/*
* codecs in this file use the first byte of the 8-byte
* MultibyteCodec_State.c array to store a 0 or 1 state value
* (for hz: 0 = ASCII mode, 1 = GB mode)
*/
#define CN_STATE_OFFSET 0
/*
* GB2312 codec
*/
@ -329,15 +335,15 @@ DECODER(gb18030)
ENCODER_INIT(hz)
{
state->i = 0;
state->c[CN_STATE_OFFSET] = 0;
return 0;
}
ENCODER_RESET(hz)
{
if (state->i != 0) {
if (state->c[CN_STATE_OFFSET] != 0) {
WRITEBYTE2('~', '}');
state->i = 0;
state->c[CN_STATE_OFFSET] = 0;
NEXT_OUT(2);
}
return 0;
@ -350,10 +356,10 @@ ENCODER(hz)
DBCHAR code;
if (c < 0x80) {
if (state->i) {
if (state->c[CN_STATE_OFFSET]) {
WRITEBYTE2('~', '}');
NEXT_OUT(2);
state->i = 0;
state->c[CN_STATE_OFFSET] = 0;
}
WRITEBYTE1((unsigned char)c);
NEXT(1, 1);
@ -375,10 +381,10 @@ ENCODER(hz)
if (code & 0x8000) /* MSB set: GBK */
return 1;
if (state->i == 0) {
if (state->c[CN_STATE_OFFSET] == 0) {
WRITEBYTE4('~', '{', code >> 8, code & 0xff);
NEXT(1, 4);
state->i = 1;
state->c[CN_STATE_OFFSET] = 1;
}
else {
WRITEBYTE2(code >> 8, code & 0xff);
@ -391,13 +397,13 @@ ENCODER(hz)
DECODER_INIT(hz)
{
state->i = 0;
state->c[CN_STATE_OFFSET] = 0;
return 0;
}
DECODER_RESET(hz)
{
state->i = 0;
state->c[CN_STATE_OFFSET] = 0;
return 0;
}
@ -411,14 +417,14 @@ DECODER(hz)
unsigned char c2 = INBYTE2;
REQUIRE_INBUF(2);
if (c2 == '~' && state->i == 0)
if (c2 == '~' && state->c[CN_STATE_OFFSET] == 0)
OUTCHAR('~');
else if (c2 == '{' && state->i == 0)
state->i = 1; /* set GB */
else if (c2 == '\n' && state->i == 0)
else if (c2 == '{' && state->c[CN_STATE_OFFSET] == 0)
state->c[CN_STATE_OFFSET] = 1; /* set GB */
else if (c2 == '\n' && state->c[CN_STATE_OFFSET] == 0)
; /* line-continuation */
else if (c2 == '}' && state->i == 1)
state->i = 0; /* set ASCII */
else if (c2 == '}' && state->c[CN_STATE_OFFSET] == 1)
state->c[CN_STATE_OFFSET] = 0; /* set ASCII */
else
return 1;
NEXT_IN(2);
@ -428,7 +434,7 @@ DECODER(hz)
if (c & 0x80)
return 1;
if (state->i == 0) { /* ASCII mode */
if (state->c[CN_STATE_OFFSET] == 0) { /* ASCII mode */
OUTCHAR(c);
NEXT_IN(1);
}

View File

@ -115,6 +115,50 @@ exit:
return return_value;
}
PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalEncoder_getstate__doc__,
"getstate($self, /)\n"
"--\n"
"\n");
#define _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_GETSTATE_METHODDEF \
{"getstate", (PyCFunction)_multibytecodec_MultibyteIncrementalEncoder_getstate, METH_NOARGS, _multibytecodec_MultibyteIncrementalEncoder_getstate__doc__},
static PyObject *
_multibytecodec_MultibyteIncrementalEncoder_getstate_impl(MultibyteIncrementalEncoderObject *self);
static PyObject *
_multibytecodec_MultibyteIncrementalEncoder_getstate(MultibyteIncrementalEncoderObject *self, PyObject *Py_UNUSED(ignored))
{
return _multibytecodec_MultibyteIncrementalEncoder_getstate_impl(self);
}
PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalEncoder_setstate__doc__,
"setstate($self, state, /)\n"
"--\n"
"\n");
#define _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_SETSTATE_METHODDEF \
{"setstate", (PyCFunction)_multibytecodec_MultibyteIncrementalEncoder_setstate, METH_O, _multibytecodec_MultibyteIncrementalEncoder_setstate__doc__},
static PyObject *
_multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEncoderObject *self,
PyLongObject *statelong);
static PyObject *
_multibytecodec_MultibyteIncrementalEncoder_setstate(MultibyteIncrementalEncoderObject *self, PyObject *arg)
{
PyObject *return_value = NULL;
PyLongObject *statelong;
if (!PyArg_Parse(arg, "O!:setstate", &PyLong_Type, &statelong)) {
goto exit;
}
return_value = _multibytecodec_MultibyteIncrementalEncoder_setstate_impl(self, statelong);
exit:
return return_value;
}
PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalEncoder_reset__doc__,
"reset($self, /)\n"
"--\n"
@ -169,6 +213,50 @@ exit:
return return_value;
}
PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalDecoder_getstate__doc__,
"getstate($self, /)\n"
"--\n"
"\n");
#define _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_GETSTATE_METHODDEF \
{"getstate", (PyCFunction)_multibytecodec_MultibyteIncrementalDecoder_getstate, METH_NOARGS, _multibytecodec_MultibyteIncrementalDecoder_getstate__doc__},
static PyObject *
_multibytecodec_MultibyteIncrementalDecoder_getstate_impl(MultibyteIncrementalDecoderObject *self);
static PyObject *
_multibytecodec_MultibyteIncrementalDecoder_getstate(MultibyteIncrementalDecoderObject *self, PyObject *Py_UNUSED(ignored))
{
return _multibytecodec_MultibyteIncrementalDecoder_getstate_impl(self);
}
PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalDecoder_setstate__doc__,
"setstate($self, state, /)\n"
"--\n"
"\n");
#define _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_SETSTATE_METHODDEF \
{"setstate", (PyCFunction)_multibytecodec_MultibyteIncrementalDecoder_setstate, METH_O, _multibytecodec_MultibyteIncrementalDecoder_setstate__doc__},
static PyObject *
_multibytecodec_MultibyteIncrementalDecoder_setstate_impl(MultibyteIncrementalDecoderObject *self,
PyObject *state);
static PyObject *
_multibytecodec_MultibyteIncrementalDecoder_setstate(MultibyteIncrementalDecoderObject *self, PyObject *arg)
{
PyObject *return_value = NULL;
PyObject *state;
if (!PyArg_Parse(arg, "O!:setstate", &PyTuple_Type, &state)) {
goto exit;
}
return_value = _multibytecodec_MultibyteIncrementalDecoder_setstate_impl(self, state);
exit:
return return_value;
}
PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalDecoder_reset__doc__,
"reset($self, /)\n"
"--\n"
@ -330,4 +418,4 @@ PyDoc_STRVAR(_multibytecodec___create_codec__doc__,
#define _MULTIBYTECODEC___CREATE_CODEC_METHODDEF \
{"__create_codec", (PyCFunction)_multibytecodec___create_codec, METH_O, _multibytecodec___create_codec__doc__},
/*[clinic end generated code: output=680f59f4cfe63c25 input=a9049054013a1b77]*/
/*[clinic end generated code: output=2fa0a38494716b97 input=a9049054013a1b77]*/

View File

@ -895,6 +895,93 @@ _multibytecodec_MultibyteIncrementalEncoder_encode_impl(MultibyteIncrementalEnco
return encoder_encode_stateful(STATEFUL_ECTX(self), input, final);
}
/*[clinic input]
_multibytecodec.MultibyteIncrementalEncoder.getstate
[clinic start generated code]*/
static PyObject *
_multibytecodec_MultibyteIncrementalEncoder_getstate_impl(MultibyteIncrementalEncoderObject *self)
/*[clinic end generated code: output=9794a5ace70d7048 input=4a2a82874ffa40bb]*/
{
    /* The state is serialised into a byte array and returned as a
     * little-endian unsigned integer.  Layout:
     *
     *   byte 0       length, in bytes, of the UTF-8 encoded pending text
     *   next bytes   the pending text, UTF-8 encoded (at most
     *                MAXENCPENDING characters of up to 4 bytes each)
     *   last bytes   a raw copy of MultibyteCodec_State.c
     *
     * Going through a byte array keeps the value the same regardless of
     * compiler, e.g. it cannot be affected by struct padding.
     */
    unsigned char packed[1 + MAXENCPENDING*4 + sizeof(self->state.c)];
    Py_ssize_t used = 1;        /* the length prefix is always present */
    Py_ssize_t utf8len = 0;     /* stays 0 when nothing is pending */

    if (self->pending != NULL) {
        const char *utf8 = PyUnicode_AsUTF8AndSize(self->pending, &utf8len);
        if (utf8 == NULL) {
            return NULL;
        }
        if (utf8len > MAXENCPENDING*4) {
            PyErr_SetString(PyExc_UnicodeError, "pending buffer too large");
            return NULL;
        }
        memcpy(packed + 1, utf8, utf8len);
        used += utf8len;
    }
    packed[0] = (unsigned char)utf8len;

    /* The codec state bytes follow directly after the pending text. */
    memcpy(packed + used, self->state.c, sizeof(self->state.c));
    used += sizeof(self->state.c);

    return (PyObject *)_PyLong_FromByteArray(packed, used,
                                             1 /* little-endian */ ,
                                             0 /* unsigned */ );
}
/*[clinic input]
_multibytecodec.MultibyteIncrementalEncoder.setstate
state as statelong: object(type='PyLongObject *', subclass_of='&PyLong_Type')
/
[clinic start generated code]*/
static PyObject *
_multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEncoderObject *self,
                                                          PyLongObject *statelong)
/*[clinic end generated code: output=4e5e98ac1f4039ca input=c80fb5830d4d2f76]*/
{
    /* Inverse of getstate(): unpack the integer into a byte array laid
     * out as [pending length][pending text as UTF-8][codec state bytes],
     * validate it, and only then overwrite the encoder's state.
     * Returns None, or NULL with an exception set.
     */
    unsigned char raw[1 + MAXENCPENDING*4 + sizeof(self->state.c)];
    const unsigned char *utf8;
    unsigned char utf8len;
    PyObject *restored;

    /* Fails if the integer does not fit the array as an unsigned value. */
    if (_PyLong_AsByteArray(statelong, raw, sizeof(raw),
                            1 /* little-endian */ ,
                            0 /* unsigned */ ) < 0) {
        return NULL;
    }

    utf8len = raw[0];
    utf8 = raw + 1;
    if (utf8len > MAXENCPENDING*4) {
        PyErr_SetString(PyExc_UnicodeError, "pending buffer too large");
        return NULL;
    }

    /* Strict decoding rejects pending bytes that are not valid UTF-8. */
    restored = PyUnicode_DecodeUTF8((const char *)utf8, utf8len, "strict");
    if (restored == NULL) {
        return NULL;
    }

    /* Everything validated: replace the pending text and codec state. */
    Py_CLEAR(self->pending);
    self->pending = restored;
    memcpy(self->state.c, utf8 + utf8len, sizeof(self->state.c));

    Py_RETURN_NONE;
}
/*[clinic input]
_multibytecodec.MultibyteIncrementalEncoder.reset
[clinic start generated code]*/
@ -919,6 +1006,8 @@ _multibytecodec_MultibyteIncrementalEncoder_reset_impl(MultibyteIncrementalEncod
/* Method table for the incremental encoder: encode, getstate, setstate
 * and reset, each entry supplied by its Argument Clinic METHODDEF macro.
 * The all-NULL entry terminates the table.
 */
static struct PyMethodDef mbiencoder_methods[] = {
_MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_ENCODE_METHODDEF
_MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_GETSTATE_METHODDEF
_MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_SETSTATE_METHODDEF
_MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_RESET_METHODDEF
{NULL, NULL},
};
@ -984,6 +1073,7 @@ mbiencoder_dealloc(MultibyteIncrementalEncoderObject *self)
{
PyObject_GC_UnTrack(self);
ERROR_DECREF(self->errors);
Py_CLEAR(self->pending);
Py_TYPE(self)->tp_free(self);
}
@ -1119,6 +1209,68 @@ errorexit:
return NULL;
}
/*[clinic input]
_multibytecodec.MultibyteIncrementalDecoder.getstate
[clinic start generated code]*/
static PyObject *
_multibytecodec_MultibyteIncrementalDecoder_getstate_impl(MultibyteIncrementalDecoderObject *self)
/*[clinic end generated code: output=255009c4713b7f82 input=4006aa49bddbaa75]*/
{
    /* Return the decoder state as a 2-tuple (pending, codec_state):
     *
     *   pending      bytes object holding the input bytes that have been
     *                consumed but not yet decoded
     *   codec_state  int built from ALL bytes of MultibyteCodec_State.c,
     *                read as a little-endian unsigned value
     *
     * The whole state array is reported, not just its first byte, so
     * codecs that keep state beyond c[0] round-trip correctly.  Building
     * the integer from the byte array with a fixed byte order keeps the
     * value independent of CPU endianness.
     *
     * Returns NULL with an exception set on allocation failure.
     */
    PyObject *buffer;
    PyObject *statelong;

    buffer = PyBytes_FromStringAndSize((const char *)self->pending,
                                       self->pendingsize);
    if (buffer == NULL) {
        return NULL;
    }

    statelong = (PyObject *)_PyLong_FromByteArray(self->state.c,
                                                  sizeof(self->state.c),
                                                  1 /* little-endian */ ,
                                                  0 /* unsigned */ );
    if (statelong == NULL) {
        Py_DECREF(buffer);
        return NULL;
    }

    /* "N" hands both references over to the new tuple. */
    return Py_BuildValue("NN", buffer, statelong);
}
/*[clinic input]
_multibytecodec.MultibyteIncrementalDecoder.setstate
state: object(subclass_of='&PyTuple_Type')
/
[clinic start generated code]*/
static PyObject *
_multibytecodec_MultibyteIncrementalDecoder_setstate_impl(MultibyteIncrementalDecoderObject *self,
                                                          PyObject *state)
/*[clinic end generated code: output=106b2fbca3e2dcc2 input=e5d794e8baba1a47]*/
{
    /* Restore the decoder from a (pending, codec_state) tuple:
     *
     *   pending      bytes object of at most MAXDECPENDING bytes
     *   codec_state  non-negative int; its little-endian byte
     *                representation is copied into MultibyteCodec_State.c
     *
     * The integer is unpacked with an explicit byte order instead of
     * copying the in-memory representation of a C integer, so the same
     * state value means the same thing on little- and big-endian CPUs.
     * All validation happens before the decoder is modified, so a
     * rejected argument leaves the decoder untouched.
     *
     * Raises TypeError for a malformed tuple, OverflowError if
     * codec_state is negative or does not fit in the state array, and
     * UnicodeError if pending is too long.  Returns None on success.
     */
    PyObject *buffer;
    PyLongObject *statelong;
    Py_ssize_t buffersize;
    const char *bufferstr;
    unsigned char statebytes[sizeof(self->state.c)];

    if (!PyArg_ParseTuple(state, "SO!;setstate(): illegal state argument",
                          &buffer, &PyLong_Type, &statelong))
    {
        return NULL;
    }

    if (_PyLong_AsByteArray(statelong, statebytes, sizeof(statebytes),
                            1 /* little-endian */ ,
                            0 /* unsigned */ ) < 0) {
        return NULL;
    }

    buffersize = PyBytes_Size(buffer);
    if (buffersize == -1) {
        return NULL;
    }

    if (buffersize > MAXDECPENDING) {
        PyErr_SetString(PyExc_UnicodeError, "pending buffer too large");
        return NULL;
    }

    bufferstr = PyBytes_AsString(buffer);
    if (bufferstr == NULL) {
        return NULL;
    }

    /* Nothing can fail past this point: commit the new state. */
    self->pendingsize = buffersize;
    memcpy(self->pending, bufferstr, self->pendingsize);
    memcpy(self->state.c, statebytes, sizeof(statebytes));

    Py_RETURN_NONE;
}
/*[clinic input]
_multibytecodec.MultibyteIncrementalDecoder.reset
[clinic start generated code]*/
@ -1137,6 +1289,8 @@ _multibytecodec_MultibyteIncrementalDecoder_reset_impl(MultibyteIncrementalDecod
/* Method table for the incremental decoder: decode, getstate, setstate
 * and reset, each entry supplied by its Argument Clinic METHODDEF macro.
 * The all-NULL entry terminates the table.
 */
static struct PyMethodDef mbidecoder_methods[] = {
_MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_DECODE_METHODDEF
_MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_GETSTATE_METHODDEF
_MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_SETSTATE_METHODDEF
_MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_RESET_METHODDEF
{NULL, NULL},
};

View File

@ -16,12 +16,15 @@ typedef uint16_t ucs2_t, DBCHAR;
typedef unsigned short ucs2_t, DBCHAR;
#endif
typedef union {
void *p;
int i;
/*
* A struct that provides 8 bytes of state for multibyte
* codecs. Codecs are free to use this how they want. Note: if you
* need to add a new field to this struct, ensure that its byte order
* is independent of CPU endianness so that the return value of
* getstate doesn't differ between little and big endian CPUs.
*/
typedef struct {
unsigned char c[8];
ucs2_t u2[4];
Py_UCS4 u4[2];
} MultibyteCodec_State;
typedef int (*mbcodec_init)(const void *config);