Close #17694: Add minimum length to _PyUnicodeWriter

 * Also add a min_char attribute to the _PyUnicodeWriter structure (currently unused)
 * _PyUnicodeWriter_Init() no longer takes any argument other than the writer
   itself: min_length and overallocate must be set explicitly
 * In error handlers, only enable overallocation if the replacement string
   is longer than 1 character
 * CJK decoders don't use overallocation anymore
 * Set min_length, instead of preallocating memory using
   _PyUnicodeWriter_Prepare(), in many decoders
 * _PyUnicode_DecodeUnicodeInternal() checks for integer overflow
This commit is contained in:
Victor Stinner 2013-04-17 23:02:17 +02:00
parent e84a51c38e
commit 8f674ccd64
7 changed files with 81 additions and 71 deletions

View File

@ -898,22 +898,28 @@ typedef struct {
Py_UCS4 maxchar; Py_UCS4 maxchar;
Py_ssize_t size; Py_ssize_t size;
Py_ssize_t pos; Py_ssize_t pos;
/* minimum length of the buffer when overallocation is enabled,
see _PyUnicodeWriter_Init() */ /* minimum number of allocated characters (default: 0) */
Py_ssize_t min_length; Py_ssize_t min_length;
/* minimum character (default: 127, ASCII) */
Py_UCS4 min_char;
/* If non-zero, overallocate the buffer by 25% (default: 0). */
unsigned char overallocate; unsigned char overallocate;
/* If readonly is 1, buffer is a shared string (cannot be modified) /* If readonly is 1, buffer is a shared string (cannot be modified)
and size is set to 0. */ and size is set to 0. */
unsigned char readonly; unsigned char readonly;
} _PyUnicodeWriter ; } _PyUnicodeWriter ;
/* Initialize a Unicode writer. /* Initialize a Unicode writer.
*
If min_length is greater than zero, _PyUnicodeWriter_Prepare() * By default, the minimum buffer size is 0 character and overallocation is
overallocates the buffer and min_length is the minimum length in characters * disabled. Set min_length, min_char and overallocate attributes to control
of the buffer. */ * the allocation of the buffer. */
PyAPI_FUNC(void) PyAPI_FUNC(void)
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length); _PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
/* Prepare the buffer to write 'length' characters /* Prepare the buffer to write 'length' characters
with the specified maximum character. with the specified maximum character.

View File

@ -633,7 +633,8 @@ MultibyteCodec_Decode(MultibyteCodecObject *self,
return make_tuple(PyUnicode_New(0, 0), 0); return make_tuple(PyUnicode_New(0, 0), 0);
} }
_PyUnicodeWriter_Init(&buf.writer, datalen); _PyUnicodeWriter_Init(&buf.writer);
buf.writer.min_length = datalen;
buf.excobj = NULL; buf.excobj = NULL;
buf.inbuf = buf.inbuf_top = (unsigned char *)data; buf.inbuf = buf.inbuf_top = (unsigned char *)data;
buf.inbuf_end = buf.inbuf_top + datalen; buf.inbuf_end = buf.inbuf_top + datalen;
@ -839,7 +840,7 @@ decoder_prepare_buffer(MultibyteDecodeBuffer *buf, const char *data,
{ {
buf->inbuf = buf->inbuf_top = (const unsigned char *)data; buf->inbuf = buf->inbuf_top = (const unsigned char *)data;
buf->inbuf_end = buf->inbuf_top + size; buf->inbuf_end = buf->inbuf_top + size;
_PyUnicodeWriter_Init(&buf->writer, size); buf->writer.min_length += size;
return 0; return 0;
} }
@ -1037,7 +1038,7 @@ mbidecoder_decode(MultibyteIncrementalDecoderObject *self,
data = pdata.buf; data = pdata.buf;
size = pdata.len; size = pdata.len;
_PyUnicodeWriter_Init(&buf.writer, 1); _PyUnicodeWriter_Init(&buf.writer);
buf.excobj = NULL; buf.excobj = NULL;
origpending = self->pendingsize; origpending = self->pendingsize;
@ -1241,7 +1242,7 @@ mbstreamreader_iread(MultibyteStreamReaderObject *self,
if (sizehint == 0) if (sizehint == 0)
return PyUnicode_New(0, 0); return PyUnicode_New(0, 0);
_PyUnicodeWriter_Init(&buf.writer, 1); _PyUnicodeWriter_Init(&buf.writer);
buf.excobj = NULL; buf.excobj = NULL;
cres = NULL; cres = NULL;

View File

@ -705,7 +705,7 @@ complex__format__(PyObject* self, PyObject* args)
if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
return NULL; return NULL;
_PyUnicodeWriter_Init(&writer, 0); _PyUnicodeWriter_Init(&writer);
ret = _PyComplex_FormatAdvancedWriter( ret = _PyComplex_FormatAdvancedWriter(
&writer, &writer,
self, self,

View File

@ -1711,7 +1711,7 @@ float__format__(PyObject *self, PyObject *args)
if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
return NULL; return NULL;
_PyUnicodeWriter_Init(&writer, 0); _PyUnicodeWriter_Init(&writer);
ret = _PyFloat_FormatAdvancedWriter( ret = _PyFloat_FormatAdvancedWriter(
&writer, &writer,
self, self,

View File

@ -4379,7 +4379,7 @@ long__format__(PyObject *self, PyObject *args)
if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
return NULL; return NULL;
_PyUnicodeWriter_Init(&writer, 0); _PyUnicodeWriter_Init(&writer);
ret = _PyLong_FormatAdvancedWriter( ret = _PyLong_FormatAdvancedWriter(
&writer, &writer,
self, self,

View File

@ -906,7 +906,6 @@ build_string(SubString *input, PyObject *args, PyObject *kwargs,
int recursion_depth, AutoNumber *auto_number) int recursion_depth, AutoNumber *auto_number)
{ {
_PyUnicodeWriter writer; _PyUnicodeWriter writer;
Py_ssize_t minlen;
/* check the recursion level */ /* check the recursion level */
if (recursion_depth <= 0) { if (recursion_depth <= 0) {
@ -915,8 +914,9 @@ build_string(SubString *input, PyObject *args, PyObject *kwargs,
return NULL; return NULL;
} }
minlen = PyUnicode_GET_LENGTH(input->str) + 100; _PyUnicodeWriter_Init(&writer);
_PyUnicodeWriter_Init(&writer, minlen); writer.overallocate = 1;
writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100;
if (!do_markup(input, args, kwargs, &writer, recursion_depth, if (!do_markup(input, args, kwargs, &writer, recursion_depth,
auto_number)) { auto_number)) {

View File

@ -2665,7 +2665,9 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
const char *f; const char *f;
_PyUnicodeWriter writer; _PyUnicodeWriter writer;
_PyUnicodeWriter_Init(&writer, strlen(format) + 100); _PyUnicodeWriter_Init(&writer);
writer.min_length = strlen(format) + 100;
writer.overallocate = 1;
/* va_list may be an array (of 1 item) on some platforms (ex: AMD64). /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
Copy it to be able to pass a reference to a subfunction. */ Copy it to be able to pass a reference to a subfunction. */
@ -4117,6 +4119,9 @@ unicode_decode_call_errorhandler_writer(
goto onError; goto onError;
} }
if (PyUnicode_READY(repunicode) < 0)
goto onError;
if (PyUnicode_GET_LENGTH(repunicode) > 1)
writer->overallocate = 1; writer->overallocate = 1;
if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1) if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
return return
@ -4256,9 +4261,8 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
} }
/* Start off assuming it's all ASCII. Widen later as necessary. */ /* Start off assuming it's all ASCII. Widen later as necessary. */
_PyUnicodeWriter_Init(&writer, 0); _PyUnicodeWriter_Init(&writer);
if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) writer.min_length = size;
goto onError;
shiftOutStart = 0; shiftOutStart = 0;
e = s + size; e = s + size;
@ -4655,7 +4659,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
return get_latin1_char((unsigned char)s[0]); return get_latin1_char((unsigned char)s[0]);
} }
_PyUnicodeWriter_Init(&writer, 0); _PyUnicodeWriter_Init(&writer);
if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
goto onError; goto onError;
@ -4910,7 +4914,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
le = bo <= 0; le = bo <= 0;
#endif #endif
_PyUnicodeWriter_Init(&writer, 0); _PyUnicodeWriter_Init(&writer);
if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1) if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1)
goto onError; goto onError;
@ -5149,7 +5153,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
/* Note: size will always be longer than the resulting Unicode /* Note: size will always be longer than the resulting Unicode
character count */ character count */
_PyUnicodeWriter_Init(&writer, 0); _PyUnicodeWriter_Init(&writer);
if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1) if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1)
goto onError; goto onError;
@ -5420,11 +5424,9 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
and we determined it's exact size (common case) and we determined it's exact size (common case)
or it contains \x, \u, ... escape sequences. then we create a or it contains \x, \u, ... escape sequences. then we create a
legacy wchar string and resize it at the end of this function. */ legacy wchar string and resize it at the end of this function. */
_PyUnicodeWriter_Init(&writer, 0); _PyUnicodeWriter_Init(&writer);
if (len > 0) { if (len > 0) {
if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1) writer.min_length = len;
goto onError;
assert(writer.kind == PyUnicode_1BYTE_KIND);
} }
else { else {
/* Escaped strings will always be longer than the resulting /* Escaped strings will always be longer than the resulting
@ -5432,8 +5434,7 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
length after conversion to the true value. length after conversion to the true value.
(but if the error callback returns a long replacement string (but if the error callback returns a long replacement string
we'll have to allocate more space) */ we'll have to allocate more space) */
if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) writer.min_length = size;
goto onError;
} }
if (size == 0) if (size == 0)
@ -5461,10 +5462,6 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
if (s > end) if (s > end)
c = '\0'; /* Invalid after \ */ c = '\0'; /* Invalid after \ */
/* The only case in which i == ascii_length is a backslash
followed by a newline. */
assert(writer.pos < writer.size || (writer.pos == writer.size && c == '\n'));
switch (c) { switch (c) {
/* \x escapes */ /* \x escapes */
@ -5787,9 +5784,8 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
Unicode string, so we start with size here and then reduce the Unicode string, so we start with size here and then reduce the
length after conversion to the true value. (But decoding error length after conversion to the true value. (But decoding error
handler might have to resize the string) */ handler might have to resize the string) */
_PyUnicodeWriter_Init(&writer, 1); _PyUnicodeWriter_Init(&writer);
if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) writer.min_length = size;
goto onError;
end = s + size; end = s + size;
while (s < end) { while (s < end) {
@ -5982,12 +5978,14 @@ _PyUnicode_DecodeUnicodeInternal(const char *s,
if (size == 0) if (size == 0)
_Py_RETURN_UNICODE_EMPTY(); _Py_RETURN_UNICODE_EMPTY();
/* XXX overflow detection missing */ _PyUnicodeWriter_Init(&writer);
_PyUnicodeWriter_Init(&writer, 0); if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1) PyErr_NoMemory();
goto onError; goto onError;
end = s + size; }
writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
end = s + size;
while (s < end) { while (s < end) {
Py_UNICODE uch; Py_UNICODE uch;
Py_UCS4 ch; Py_UCS4 ch;
@ -6429,9 +6427,9 @@ PyUnicode_DecodeASCII(const char *s,
if (size == 1 && (unsigned char)s[0] < 128) if (size == 1 && (unsigned char)s[0] < 128)
return get_latin1_char((unsigned char)s[0]); return get_latin1_char((unsigned char)s[0]);
_PyUnicodeWriter_Init(&writer, 0); _PyUnicodeWriter_Init(&writer);
if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0)
goto onError; return NULL;
e = s + size; e = s + size;
data = writer.data; data = writer.data;
@ -7280,7 +7278,7 @@ PyUnicode_DecodeCharmap(const char *s,
if (size == 0) if (size == 0)
_Py_RETURN_UNICODE_EMPTY(); _Py_RETURN_UNICODE_EMPTY();
_PyUnicodeWriter_Init(&writer, 0); _PyUnicodeWriter_Init(&writer);
if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
goto onError; goto onError;
@ -7312,7 +7310,7 @@ PyUnicode_DecodeCharmap(const char *s,
ch = *s; ch = *s;
x = mapdata_ucs1[ch]; x = mapdata_ucs1[ch];
if (x > maxchar) { if (x > maxchar) {
if (_PyUnicodeWriter_PrepareInternal(&writer, 1, 0xff) == -1) if (_PyUnicodeWriter_Prepare(&writer, 1, 0xff) == -1)
goto onError; goto onError;
maxchar = writer.maxchar; maxchar = writer.maxchar;
outdata = (Py_UCS1 *)writer.data; outdata = (Py_UCS1 *)writer.data;
@ -12841,21 +12839,27 @@ unicode_endswith(PyObject *self,
Py_LOCAL_INLINE(void) Py_LOCAL_INLINE(void)
_PyUnicodeWriter_Update(_PyUnicodeWriter *writer) _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
{ {
if (!writer->readonly)
writer->size = PyUnicode_GET_LENGTH(writer->buffer); writer->size = PyUnicode_GET_LENGTH(writer->buffer);
else {
/* Copy-on-write mode: set buffer size to 0 so
* _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
* next write. */
writer->size = 0;
}
writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
writer->data = PyUnicode_DATA(writer->buffer); writer->data = PyUnicode_DATA(writer->buffer);
writer->kind = PyUnicode_KIND(writer->buffer); writer->kind = PyUnicode_KIND(writer->buffer);
} }
void void
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length) _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
{ {
memset(writer, 0, sizeof(*writer)); memset(writer, 0, sizeof(*writer));
#ifdef Py_DEBUG #ifdef Py_DEBUG
writer->kind = 5; /* invalid kind */ writer->kind = 5; /* invalid kind */
#endif #endif
writer->min_length = Py_MAX(min_length, 100); writer->min_char = 127;
writer->overallocate = (min_length > 0);
} }
int int
@ -12873,29 +12877,28 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
} }
newlen = writer->pos + length; newlen = writer->pos + length;
maxchar = MAX_MAXCHAR(maxchar, writer->min_char);
if (writer->buffer == NULL) { if (writer->buffer == NULL) {
if (writer->overallocate) { assert(!writer->readonly);
if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
/* overallocate 25% to limit the number of resize */ /* overallocate 25% to limit the number of resize */
if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
newlen += newlen / 4; newlen += newlen / 4;
}
if (newlen < writer->min_length) if (newlen < writer->min_length)
newlen = writer->min_length; newlen = writer->min_length;
}
writer->buffer = PyUnicode_New(newlen, maxchar); writer->buffer = PyUnicode_New(newlen, maxchar);
if (writer->buffer == NULL) if (writer->buffer == NULL)
return -1; return -1;
_PyUnicodeWriter_Update(writer);
return 0;
} }
else if (newlen > writer->size) {
if (newlen > writer->size) { if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
if (writer->overallocate) {
/* overallocate 25% to limit the number of resize */ /* overallocate 25% to limit the number of resize */
if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
newlen += newlen / 4; newlen += newlen / 4;
}
if (newlen < writer->min_length) if (newlen < writer->min_length)
newlen = writer->min_length; newlen = writer->min_length;
}
if (maxchar > writer->maxchar || writer->readonly) { if (maxchar > writer->maxchar || writer->readonly) {
/* resize + widen */ /* resize + widen */
@ -12913,7 +12916,6 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
return -1; return -1;
} }
writer->buffer = newbuffer; writer->buffer = newbuffer;
_PyUnicodeWriter_Update(writer);
} }
else if (maxchar > writer->maxchar) { else if (maxchar > writer->maxchar) {
assert(!writer->readonly); assert(!writer->readonly);
@ -12924,8 +12926,8 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
writer->buffer, 0, writer->pos); writer->buffer, 0, writer->pos);
Py_DECREF(writer->buffer); Py_DECREF(writer->buffer);
writer->buffer = newbuffer; writer->buffer = newbuffer;
_PyUnicodeWriter_Update(writer);
} }
_PyUnicodeWriter_Update(writer);
return 0; return 0;
} }
@ -12959,11 +12961,10 @@ _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
maxchar = PyUnicode_MAX_CHAR_VALUE(str); maxchar = PyUnicode_MAX_CHAR_VALUE(str);
if (maxchar > writer->maxchar || len > writer->size - writer->pos) { if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
if (writer->buffer == NULL && !writer->overallocate) { if (writer->buffer == NULL && !writer->overallocate) {
writer->readonly = 1;
Py_INCREF(str); Py_INCREF(str);
writer->buffer = str; writer->buffer = str;
_PyUnicodeWriter_Update(writer); _PyUnicodeWriter_Update(writer);
writer->readonly = 1;
writer->size = 0;
writer->pos += len; writer->pos += len;
return 0; return 0;
} }
@ -13080,7 +13081,7 @@ unicode__format__(PyObject* self, PyObject* args)
if (PyUnicode_READY(self) == -1) if (PyUnicode_READY(self) == -1)
return NULL; return NULL;
_PyUnicodeWriter_Init(&writer, 0); _PyUnicodeWriter_Init(&writer);
ret = _PyUnicode_FormatAdvancedWriter(&writer, ret = _PyUnicode_FormatAdvancedWriter(&writer,
self, format_spec, 0, self, format_spec, 0,
PyUnicode_GET_LENGTH(format_spec)); PyUnicode_GET_LENGTH(format_spec));
@ -14164,7 +14165,9 @@ PyUnicode_Format(PyObject *format, PyObject *args)
ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
ctx.fmtpos = 0; ctx.fmtpos = 0;
_PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100); _PyUnicodeWriter_Init(&ctx.writer);
ctx.writer.min_length = ctx.fmtcnt + 100;
ctx.writer.overallocate = 1;
if (PyTuple_Check(args)) { if (PyTuple_Check(args)) {
ctx.arglen = PyTuple_Size(args); ctx.arglen = PyTuple_Size(args);