From b6cd014d75394ca48e6113d614d36ed16fbe7b13 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 3 May 2012 02:17:04 +0200 Subject: [PATCH] Unicode: optimize creating of 1-character strings --- Objects/unicodeobject.c | 58 +++++++++++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index e22fcfd02b1..f427fd3d11c 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1919,8 +1919,18 @@ _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) return unicode_empty; } assert(size > 0); - if (size == 1 && u[0] < 256) - return get_latin1_char((unsigned char)u[0]); + if (size == 1) { + Py_UCS4 ch = u[0]; + if (ch < 256) + return get_latin1_char((unsigned char)ch); + + res = PyUnicode_New(1, ch); + if (res == NULL) + return NULL; + PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch); + assert(_PyUnicode_CheckConsistency(res, 1)); + return res; + } max_char = ucs2lib_find_max_char(u, u + size); res = PyUnicode_New(size, max_char); @@ -1947,8 +1957,18 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) return unicode_empty; } assert(size > 0); - if (size == 1 && u[0] < 256) - return get_latin1_char((unsigned char)u[0]); + if (size == 1) { + Py_UCS4 ch = u[0]; + if (ch < 256) + return get_latin1_char((unsigned char)ch); + + res = PyUnicode_New(1, ch); + if (res == NULL) + return NULL; + PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch); + assert(_PyUnicode_CheckConsistency(res, 1)); + return res; + } max_char = ucs4lib_find_max_char(u, u + size); res = PyUnicode_New(size, max_char); @@ -11368,10 +11388,33 @@ unicode_find(PyObject *self, PyObject *args) static PyObject * unicode_getitem(PyObject *self, Py_ssize_t index) { - Py_UCS4 ch = PyUnicode_ReadChar(self, index); - if (ch == (Py_UCS4)-1) + void *data; + enum PyUnicode_Kind kind; + Py_UCS4 ch; + PyObject *res; + + if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) { + PyErr_BadArgument(); return NULL; - return PyUnicode_FromOrdinal(ch); + } + if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) { + PyErr_SetString(PyExc_IndexError, "string index out of range"); + return NULL; + } + kind = PyUnicode_KIND(self); + data = PyUnicode_DATA(self); + ch = PyUnicode_READ(kind, data, index); + if (ch < 256) + return get_latin1_char(ch); + + res = PyUnicode_New(1, ch); + if (res == NULL) + return NULL; + kind = PyUnicode_KIND(res); + data = PyUnicode_DATA(res); + PyUnicode_WRITE(kind, data, 0, ch); + assert(_PyUnicode_CheckConsistency(res, 1)); + return res; } /* Believe it or not, this produces the same value for ASCII strings @@ -12039,7 +12082,6 @@ PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) } if (PyUnicode_IS_ASCII(self)) { - kind = PyUnicode_KIND(self); data = PyUnicode_1BYTE_DATA(self); return unicode_fromascii(data + start, length); }