Issue #10156: In the interpreter's initialization phase, unicode globals
are now initialized dynamically as needed.
This commit is contained in:
commit
ed3c4128c0
|
@ -10,6 +10,9 @@ What's New in Python 3.4.0 Alpha 1?
|
|||
Core and Builtins
|
||||
-----------------
|
||||
|
||||
- Issue #10156: In the interpreter's initialization phase, unicode globals
|
||||
are now initialized dynamically as needed.
|
||||
|
||||
- Issue #16980: Fix processing of escaped non-ascii bytes in the
|
||||
unicode-escape-decode decoder.
|
||||
|
||||
|
|
|
@ -49,8 +49,9 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|||
|
||||
/* --- Globals ------------------------------------------------------------
|
||||
|
||||
The globals are initialized by the _PyUnicode_Init() API and should
|
||||
not be used before calling that API.
|
||||
NOTE: In the interpreter's initialization phase, some globals are currently
|
||||
initialized dynamically as needed. In the process Unicode objects may
|
||||
be created before the Unicode type is ready.
|
||||
|
||||
*/
|
||||
|
||||
|
@ -171,17 +172,36 @@ extern "C" {
|
|||
Another way to look at this is to say that the actual reference
|
||||
count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
|
||||
*/
|
||||
static PyObject *interned;
|
||||
static PyObject *interned = NULL;
|
||||
|
||||
/* The empty Unicode object is shared to improve performance. */
|
||||
static PyObject *unicode_empty;
|
||||
static PyObject *unicode_empty = NULL;
|
||||
|
||||
/* Take a new reference to the shared empty string, creating it on first use.

   If unicode_empty is already set, it is simply Py_INCREF'ed.  Otherwise it
   is allocated with PyUnicode_New(0, 0) and, when that succeeds, Py_INCREF'ed
   once for the caller (presumably the reference obtained from PyUnicode_New
   itself stays with the global -- confirm against the PyUnicode_New contract).

   If the allocation fails, unicode_empty is left NULL and no reference is
   taken, so code using this macro must test unicode_empty afterwards. */
#define _Py_INCREF_UNICODE_EMPTY() \
|
||||
do { \
|
||||
if (unicode_empty != NULL) \
|
||||
Py_INCREF(unicode_empty); \
|
||||
else { \
|
||||
unicode_empty = PyUnicode_New(0, 0); \
|
||||
if (unicode_empty != NULL) { \
|
||||
Py_INCREF(unicode_empty); \
|
||||
assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
|
||||
} \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
/* Return the shared empty string from the enclosing function.

   Expands to a `return` statement, so it may only be used inside a function
   whose return type accepts a PyObject *.  A reference is taken through
   _Py_INCREF_UNICODE_EMPTY() first; if the empty string could not be created,
   unicode_empty is still NULL and NULL is what gets returned. */
#define _Py_RETURN_UNICODE_EMPTY() \
|
||||
do { \
|
||||
_Py_INCREF_UNICODE_EMPTY(); \
|
||||
return unicode_empty; \
|
||||
} while (0)
|
||||
|
||||
/* List of static strings. */
|
||||
static _Py_Identifier *static_strings;
|
||||
static _Py_Identifier *static_strings = NULL;
|
||||
|
||||
/* Single character Unicode strings in the Latin-1 range are being
|
||||
shared as well. */
|
||||
static PyObject *unicode_latin1[256];
|
||||
static PyObject *unicode_latin1[256] = {NULL};
|
||||
|
||||
/* Fast detection of the most frequent whitespace characters */
|
||||
const unsigned char _Py_ascii_whitespace[] = {
|
||||
|
@ -406,9 +426,8 @@ unicode_result_wchar(PyObject *unicode)
|
|||
|
||||
len = _PyUnicode_WSTR_LENGTH(unicode);
|
||||
if (len == 0) {
|
||||
Py_INCREF(unicode_empty);
|
||||
Py_DECREF(unicode);
|
||||
return unicode_empty;
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
}
|
||||
|
||||
if (len == 1) {
|
||||
|
@ -442,8 +461,8 @@ unicode_result_ready(PyObject *unicode)
|
|||
length = PyUnicode_GET_LENGTH(unicode);
|
||||
if (length == 0) {
|
||||
if (unicode != unicode_empty) {
|
||||
Py_INCREF(unicode_empty);
|
||||
Py_DECREF(unicode);
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
}
|
||||
return unicode_empty;
|
||||
}
|
||||
|
@ -520,7 +539,7 @@ static OSVERSIONINFOEX winver;
|
|||
|
||||
#define BLOOM_MASK unsigned long
|
||||
|
||||
static BLOOM_MASK bloom_linebreak;
|
||||
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
|
||||
|
||||
#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
|
||||
#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
|
||||
|
@ -1602,9 +1621,11 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length)
|
|||
return 0;
|
||||
|
||||
if (length == 0) {
|
||||
_Py_INCREF_UNICODE_EMPTY();
|
||||
if (!unicode_empty)
|
||||
return -1;
|
||||
Py_DECREF(*p_unicode);
|
||||
*p_unicode = unicode_empty;
|
||||
Py_INCREF(*p_unicode);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -1727,10 +1748,8 @@ PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
|
|||
some optimizations which share commonly used objects. */
|
||||
|
||||
/* Optimization for empty strings */
|
||||
if (size == 0 && unicode_empty != NULL) {
|
||||
Py_INCREF(unicode_empty);
|
||||
return unicode_empty;
|
||||
}
|
||||
if (size == 0)
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
|
||||
/* Single character Unicode objects in the Latin-1 range are
|
||||
shared when using this constructor */
|
||||
|
@ -1889,10 +1908,8 @@ _PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
|
|||
PyObject *res;
|
||||
unsigned char max_char;
|
||||
|
||||
if (size == 0) {
|
||||
Py_INCREF(unicode_empty);
|
||||
return unicode_empty;
|
||||
}
|
||||
if (size == 0)
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
assert(size > 0);
|
||||
if (size == 1)
|
||||
return get_latin1_char(u[0]);
|
||||
|
@ -1912,10 +1929,8 @@ _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
|
|||
PyObject *res;
|
||||
Py_UCS2 max_char;
|
||||
|
||||
if (size == 0) {
|
||||
Py_INCREF(unicode_empty);
|
||||
return unicode_empty;
|
||||
}
|
||||
if (size == 0)
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
assert(size > 0);
|
||||
if (size == 1) {
|
||||
Py_UCS4 ch = u[0];
|
||||
|
@ -1950,10 +1965,8 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
|
|||
PyObject *res;
|
||||
Py_UCS4 max_char;
|
||||
|
||||
if (size == 0) {
|
||||
Py_INCREF(unicode_empty);
|
||||
return unicode_empty;
|
||||
}
|
||||
if (size == 0)
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
assert(size > 0);
|
||||
if (size == 1) {
|
||||
Py_UCS4 ch = u[0];
|
||||
|
@ -2245,10 +2258,8 @@ PyObject *
|
|||
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
|
||||
{
|
||||
if (w == NULL) {
|
||||
if (size == 0) {
|
||||
Py_INCREF(unicode_empty);
|
||||
return unicode_empty;
|
||||
}
|
||||
if (size == 0)
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
PyErr_BadInternalCall();
|
||||
return NULL;
|
||||
}
|
||||
|
@ -2825,15 +2836,11 @@ PyUnicode_FromEncodedObject(register PyObject *obj,
|
|||
|
||||
/* Decoding bytes objects is the most common case and should be fast */
|
||||
if (PyBytes_Check(obj)) {
|
||||
if (PyBytes_GET_SIZE(obj) == 0) {
|
||||
Py_INCREF(unicode_empty);
|
||||
v = unicode_empty;
|
||||
}
|
||||
else {
|
||||
v = PyUnicode_Decode(
|
||||
PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
|
||||
encoding, errors);
|
||||
}
|
||||
if (PyBytes_GET_SIZE(obj) == 0)
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
v = PyUnicode_Decode(
|
||||
PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
|
||||
encoding, errors);
|
||||
return v;
|
||||
}
|
||||
|
||||
|
@ -2853,12 +2860,11 @@ PyUnicode_FromEncodedObject(register PyObject *obj,
|
|||
}
|
||||
|
||||
if (buffer.len == 0) {
|
||||
Py_INCREF(unicode_empty);
|
||||
v = unicode_empty;
|
||||
PyBuffer_Release(&buffer);
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
}
|
||||
else
|
||||
v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
|
||||
|
||||
v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
|
||||
PyBuffer_Release(&buffer);
|
||||
return v;
|
||||
}
|
||||
|
@ -4201,8 +4207,7 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
|
|||
if (size == 0) {
|
||||
if (consumed)
|
||||
*consumed = 0;
|
||||
Py_INCREF(unicode_empty);
|
||||
return unicode_empty;
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
}
|
||||
|
||||
/* Start off assuming it's all ASCII. Widen later as necessary. */
|
||||
|
@ -4609,8 +4614,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
|
|||
if (size == 0) {
|
||||
if (consumed)
|
||||
*consumed = 0;
|
||||
Py_INCREF(unicode_empty);
|
||||
return unicode_empty;
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
}
|
||||
|
||||
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
|
||||
|
@ -4868,8 +4872,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
|
|||
if (q == e) {
|
||||
if (consumed)
|
||||
*consumed = size;
|
||||
Py_INCREF(unicode_empty);
|
||||
return unicode_empty;
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
}
|
||||
|
||||
#ifdef WORDS_BIGENDIAN
|
||||
|
@ -5108,8 +5111,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
|
|||
if (q == e) {
|
||||
if (consumed)
|
||||
*consumed = size;
|
||||
Py_INCREF(unicode_empty);
|
||||
return unicode_empty;
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
}
|
||||
|
||||
#if PY_LITTLE_ENDIAN
|
||||
|
@ -5386,10 +5388,8 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
|
|||
Py_ssize_t len;
|
||||
|
||||
len = length_of_escaped_ascii_string(s, size);
|
||||
if (len == 0) {
|
||||
Py_INCREF(unicode_empty);
|
||||
return unicode_empty;
|
||||
}
|
||||
if (len == 0)
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
|
||||
/* After length_of_escaped_ascii_string() there are two alternatives,
|
||||
either the string is pure ASCII with named escapes like \n, etc.
|
||||
|
@ -5781,10 +5781,8 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
|
|||
PyObject *errorHandler = NULL;
|
||||
PyObject *exc = NULL;
|
||||
|
||||
if (size == 0) {
|
||||
Py_INCREF(unicode_empty);
|
||||
return unicode_empty;
|
||||
}
|
||||
if (size == 0)
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
|
||||
/* Escaped strings will always be longer than the resulting
|
||||
Unicode string, so we start with size here and then reduce the
|
||||
|
@ -5988,10 +5986,8 @@ _PyUnicode_DecodeUnicodeInternal(const char *s,
|
|||
1))
|
||||
return NULL;
|
||||
|
||||
if (size == 0) {
|
||||
Py_INCREF(unicode_empty);
|
||||
return unicode_empty;
|
||||
}
|
||||
if (size == 0)
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
|
||||
/* XXX overflow detection missing */
|
||||
_PyUnicodeWriter_Init(&writer, 0);
|
||||
|
@ -6439,10 +6435,8 @@ PyUnicode_DecodeASCII(const char *s,
|
|||
PyObject *errorHandler = NULL;
|
||||
PyObject *exc = NULL;
|
||||
|
||||
if (size == 0) {
|
||||
Py_INCREF(unicode_empty);
|
||||
return unicode_empty;
|
||||
}
|
||||
if (size == 0)
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
|
||||
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
|
||||
if (size == 1 && (unsigned char)s[0] < 128)
|
||||
|
@ -6820,8 +6814,7 @@ decode_code_page_stateful(int code_page,
|
|||
if (chunk_size == 0 && done) {
|
||||
if (v != NULL)
|
||||
break;
|
||||
Py_INCREF(unicode_empty);
|
||||
return unicode_empty;
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
}
|
||||
|
||||
|
||||
|
@ -7298,10 +7291,8 @@ PyUnicode_DecodeCharmap(const char *s,
|
|||
if (mapping == NULL)
|
||||
return PyUnicode_DecodeLatin1(s, size, errors);
|
||||
|
||||
if (size == 0) {
|
||||
Py_INCREF(unicode_empty);
|
||||
return unicode_empty;
|
||||
}
|
||||
if (size == 0)
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
_PyUnicodeWriter_Init(&writer, 0);
|
||||
if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
|
||||
goto onError;
|
||||
|
@ -9354,9 +9345,7 @@ PyUnicode_Join(PyObject *separator, PyObject *seq)
|
|||
/* If empty sequence, return u"". */
|
||||
if (seqlen == 0) {
|
||||
Py_DECREF(fseq);
|
||||
Py_INCREF(unicode_empty);
|
||||
res = unicode_empty;
|
||||
return res;
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
}
|
||||
|
||||
/* If singleton sequence with an exact Unicode, return that. */
|
||||
|
@ -10056,7 +10045,9 @@ replace(PyObject *self, PyObject *str1,
|
|||
}
|
||||
new_size = slen + n * (len2 - len1);
|
||||
if (new_size == 0) {
|
||||
Py_INCREF(unicode_empty);
|
||||
_Py_INCREF_UNICODE_EMPTY();
|
||||
if (!unicode_empty)
|
||||
goto error;
|
||||
u = unicode_empty;
|
||||
goto done;
|
||||
}
|
||||
|
@ -11559,10 +11550,8 @@ PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
|
|||
PyErr_SetString(PyExc_IndexError, "string index out of range");
|
||||
return NULL;
|
||||
}
|
||||
if (start >= length || end < start) {
|
||||
Py_INCREF(unicode_empty);
|
||||
return unicode_empty;
|
||||
}
|
||||
if (start >= length || end < start)
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
|
||||
length = end - start;
|
||||
if (PyUnicode_IS_ASCII(self)) {
|
||||
|
@ -11689,10 +11678,8 @@ unicode_repeat(PyObject *str, Py_ssize_t len)
|
|||
PyObject *u;
|
||||
Py_ssize_t nchars, n;
|
||||
|
||||
if (len < 1) {
|
||||
Py_INCREF(unicode_empty);
|
||||
return unicode_empty;
|
||||
}
|
||||
if (len < 1)
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
|
||||
/* no repeat, return original string */
|
||||
if (len == 1)
|
||||
|
@ -12832,8 +12819,7 @@ _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
|
|||
{
|
||||
if (writer->pos == 0) {
|
||||
Py_XDECREF(writer->buffer);
|
||||
Py_INCREF(unicode_empty);
|
||||
return unicode_empty;
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
}
|
||||
if (writer->readonly) {
|
||||
assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
|
||||
|
@ -13051,8 +13037,7 @@ unicode_subscript(PyObject* self, PyObject* item)
|
|||
}
|
||||
|
||||
if (slicelength <= 0) {
|
||||
Py_INCREF(unicode_empty);
|
||||
return unicode_empty;
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
} else if (start == 0 && step == 1 &&
|
||||
slicelength == PyUnicode_GET_LENGTH(self)) {
|
||||
return unicode_result_unchanged(self);
|
||||
|
@ -14056,10 +14041,8 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
|
|||
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
|
||||
kwlist, &x, &encoding, &errors))
|
||||
return NULL;
|
||||
if (x == NULL) {
|
||||
Py_INCREF(unicode_empty);
|
||||
return unicode_empty;
|
||||
}
|
||||
if (x == NULL)
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
if (encoding == NULL && errors == NULL)
|
||||
return PyObject_Str(x);
|
||||
else
|
||||
|
@ -14228,8 +14211,6 @@ PyTypeObject PyUnicode_Type = {
|
|||
|
||||
int _PyUnicode_Init(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
/* XXX - move this array to unicodectype.c ? */
|
||||
Py_UCS2 linebreak[] = {
|
||||
0x000A, /* LINE FEED */
|
||||
|
@ -14243,13 +14224,11 @@ int _PyUnicode_Init(void)
|
|||
};
|
||||
|
||||
/* Init the implementation */
|
||||
unicode_empty = PyUnicode_New(0, 0);
|
||||
_Py_INCREF_UNICODE_EMPTY();
|
||||
if (!unicode_empty)
|
||||
Py_FatalError("Can't create empty string");
|
||||
assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
|
||||
Py_DECREF(unicode_empty);
|
||||
|
||||
for (i = 0; i < 256; i++)
|
||||
unicode_latin1[i] = NULL;
|
||||
if (PyType_Ready(&PyUnicode_Type) < 0)
|
||||
Py_FatalError("Can't initialize 'unicode'");
|
||||
|
||||
|
@ -14289,15 +14268,10 @@ _PyUnicode_Fini(void)
|
|||
{
|
||||
int i;
|
||||
|
||||
Py_XDECREF(unicode_empty);
|
||||
unicode_empty = NULL;
|
||||
Py_CLEAR(unicode_empty);
|
||||
|
||||
for (i = 0; i < 256; i++) {
|
||||
if (unicode_latin1[i]) {
|
||||
Py_DECREF(unicode_latin1[i]);
|
||||
unicode_latin1[i] = NULL;
|
||||
}
|
||||
}
|
||||
for (i = 0; i < 256; i++)
|
||||
Py_CLEAR(unicode_latin1[i]);
|
||||
_PyUnicode_ClearStaticStrings();
|
||||
(void)PyUnicode_ClearFreeList();
|
||||
}
|
||||
|
@ -14426,8 +14400,7 @@ _Py_ReleaseInternedUnicodeStrings(void)
|
|||
"mortal/immortal\n", mortal_size, immortal_size);
|
||||
Py_DECREF(keys);
|
||||
PyDict_Clear(interned);
|
||||
Py_DECREF(interned);
|
||||
interned = NULL;
|
||||
Py_CLEAR(interned);
|
||||
}
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue