diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index 435a72a5220..d8947e700f8 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -71,6 +71,8 @@ struct _Py_bytes_state { }; struct _Py_unicode_state { + // The empty Unicode object is a singleton to improve performance. + PyObject *empty; struct _Py_unicode_fs_codec fs_codec; }; diff --git a/Include/internal/pycore_pylifecycle.h b/Include/internal/pycore_pylifecycle.h index cd470441817..f29c7cb9f39 100644 --- a/Include/internal/pycore_pylifecycle.h +++ b/Include/internal/pycore_pylifecycle.h @@ -31,7 +31,7 @@ PyAPI_FUNC(int) _Py_IsLocaleCoercionTarget(const char *ctype_loc); /* Various one-time initializers */ -extern PyStatus _PyUnicode_Init(void); +extern PyStatus _PyUnicode_Init(PyThreadState *tstate); extern int _PyStructSequence_Init(void); extern int _PyLong_Init(PyThreadState *tstate); extern PyStatus _PyFaulthandler_Init(int enable); diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst index 9b94bcc0169..e970551f531 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst @@ -2,7 +2,7 @@ Each interpreter now its has own free lists, singletons and caches: * Free lists: float, tuple, list, dict, frame, context, asynchronous generator, MemoryError. -* Singletons: empty tuple, empty bytes string, +* Singletons: empty tuple, empty bytes string, empty Unicode string, single byte character. * Slice cache. diff --git a/Objects/stringlib/asciilib.h b/Objects/stringlib/asciilib.h index 8599d38a5a7..7749e8fb339 100644 --- a/Objects/stringlib/asciilib.h +++ b/Objects/stringlib/asciilib.h @@ -11,7 +11,6 @@ #define STRINGLIB_CHAR Py_UCS1 #define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_PARSE_CODE "U" -#define STRINGLIB_GET_EMPTY() unicode_empty #define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE #define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL diff --git a/Objects/stringlib/partition.h b/Objects/stringlib/partition.h index 3731df56987..bcc217697b2 100644 --- a/Objects/stringlib/partition.h +++ b/Objects/stringlib/partition.h @@ -1,9 +1,14 @@ /* stringlib: partition implementation */ #ifndef STRINGLIB_FASTSEARCH_H -#error must include "stringlib/fastsearch.h" before including this module +# error must include "stringlib/fastsearch.h" before including this module #endif +#if !STRINGLIB_MUTABLE && !defined(STRINGLIB_GET_EMPTY) +# error "STRINGLIB_GET_EMPTY must be defined if STRINGLIB_MUTABLE is zero" +#endif + + Py_LOCAL_INLINE(PyObject*) STRINGLIB(partition)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, diff --git a/Objects/stringlib/stringdefs.h b/Objects/stringlib/stringdefs.h index c12ecc59e5c..88641b25d47 100644 --- a/Objects/stringlib/stringdefs.h +++ b/Objects/stringlib/stringdefs.h @@ -1,10 +1,6 @@ #ifndef STRINGLIB_STRINGDEFS_H #define STRINGLIB_STRINGDEFS_H -#ifndef STRINGLIB_GET_EMPTY -# error "STRINGLIB_GET_EMPTY macro must be defined" -#endif - /* this is sort of a hack. there's at least one place (formatting floats) where some stringlib code takes a different path if it's compiled as unicode. */ diff --git a/Objects/stringlib/ucs1lib.h b/Objects/stringlib/ucs1lib.h index bdf30356b84..5b0b8a025e8 100644 --- a/Objects/stringlib/ucs1lib.h +++ b/Objects/stringlib/ucs1lib.h @@ -11,7 +11,6 @@ #define STRINGLIB_CHAR Py_UCS1 #define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_PARSE_CODE "U" -#define STRINGLIB_GET_EMPTY() unicode_empty #define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE #define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL diff --git a/Objects/stringlib/ucs2lib.h b/Objects/stringlib/ucs2lib.h index 9d688880186..6af01511c5f 100644 --- a/Objects/stringlib/ucs2lib.h +++ b/Objects/stringlib/ucs2lib.h @@ -11,7 +11,6 @@ #define STRINGLIB_CHAR Py_UCS2 #define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_PARSE_CODE "U" -#define STRINGLIB_GET_EMPTY() unicode_empty #define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE #define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL diff --git a/Objects/stringlib/ucs4lib.h b/Objects/stringlib/ucs4lib.h index c7dfa527433..39071a0cdf0 100644 --- a/Objects/stringlib/ucs4lib.h +++ b/Objects/stringlib/ucs4lib.h @@ -11,7 +11,6 @@ #define STRINGLIB_CHAR Py_UCS4 #define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_PARSE_CODE "U" -#define STRINGLIB_GET_EMPTY() unicode_empty #define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE #define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL diff --git a/Objects/stringlib/unicodedefs.h b/Objects/stringlib/unicodedefs.h index e4d4163afc2..5ea79cd4f50 100644 --- a/Objects/stringlib/unicodedefs.h +++ b/Objects/stringlib/unicodedefs.h @@ -13,7 +13,6 @@ #define STRINGLIB_CHAR Py_UNICODE #define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_PARSE_CODE "U" -#define STRINGLIB_GET_EMPTY() unicode_empty #define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE #define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 1433848c81f..06ca7a5751d 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -222,26 +222,43 @@ extern "C" { static PyObject *interned = NULL; #endif -/* The empty Unicode object is shared to improve performance. */ -static PyObject *unicode_empty = NULL; +static struct _Py_unicode_state* +get_unicode_state(void) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + return &interp->unicode; +} -#define _Py_INCREF_UNICODE_EMPTY() \ - do { \ - if (unicode_empty != NULL) \ - Py_INCREF(unicode_empty); \ - else { \ - unicode_empty = PyUnicode_New(0, 0); \ - if (unicode_empty != NULL) { \ - Py_INCREF(unicode_empty); \ - assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \ - } \ - } \ - } while (0) -#define _Py_RETURN_UNICODE_EMPTY() \ - do { \ - _Py_INCREF_UNICODE_EMPTY(); \ - return unicode_empty; \ +// Return a borrowed reference to the empty string singleton. +// Return NULL if the singleton was not created yet. +static inline PyObject* unicode_get_empty(void) +{ + struct _Py_unicode_state *state = get_unicode_state(); + return state->empty; +} + +static inline PyObject* unicode_new_empty(void) +{ + struct _Py_unicode_state *state = get_unicode_state(); + PyObject *empty = state->empty; + if (empty != NULL) { + Py_INCREF(empty); + } + else { + empty = PyUnicode_New(0, 0); + if (empty != NULL) { + Py_INCREF(empty); + assert(_PyUnicode_CheckConsistency(empty, 1)); + state->empty = empty; + } + } + return empty; +} + +#define _Py_RETURN_UNICODE_EMPTY() \ + do { \ + return unicode_new_empty(); \ } while (0) static inline void @@ -676,11 +693,15 @@ unicode_result_ready(PyObject *unicode) length = PyUnicode_GET_LENGTH(unicode); if (length == 0) { - if (unicode != unicode_empty) { + PyObject *empty = unicode_get_empty(); + if (unicode != empty) { Py_DECREF(unicode); - _Py_RETURN_UNICODE_EMPTY(); + + Py_INCREF(empty); + return empty; } - return unicode_empty; + // unicode is the empty string singleton + return unicode; } #ifdef LATIN1_SINGLETONS @@ -864,7 +885,7 @@ xmlcharrefreplace(_PyBytesWriter *writer, char *str, to keep things simple, we use a single bitmask, using the least 5 bits from each unicode characters as the bit index. */ -/* the linebreak mask is set up by Unicode_Init below */ +/* the linebreak mask is set up by _PyUnicode_Init() below */ #if LONG_BIT >= 128 #define BLOOM_WIDTH 128 @@ -938,6 +959,8 @@ ensure_unicode(PyObject *obj) /* Compilation of templated routines */ +#define STRINGLIB_GET_EMPTY() unicode_get_empty() + #include "stringlib/asciilib.h" #include "stringlib/fastsearch.h" #include "stringlib/partition.h" @@ -986,6 +1009,8 @@ _Py_COMP_DIAG_IGNORE_DEPR_DECLS #include "stringlib/undef.h" _Py_COMP_DIAG_POP +#undef STRINGLIB_GET_EMPTY + /* --- Unicode Object ----------------------------------------------------- */ static inline Py_ssize_t @@ -1234,9 +1259,12 @@ _PyUnicode_New(Py_ssize_t length) size_t new_size; /* Optimization for empty strings */ - if (length == 0 && unicode_empty != NULL) { - Py_INCREF(unicode_empty); - return (PyUnicodeObject*)unicode_empty; + if (length == 0) { + PyObject *empty = unicode_get_empty(); + if (empty != NULL) { + Py_INCREF(empty); + return (PyUnicodeObject *)empty; + } } /* Ensure we won't overflow the size. */ @@ -1386,6 +1414,15 @@ _PyUnicode_Dump(PyObject *op) PyObject * PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) { + /* Optimization for empty strings */ + if (size == 0) { + PyObject *empty = unicode_get_empty(); + if (empty != NULL) { + Py_INCREF(empty); + return empty; + } + } + PyObject *obj; PyCompactUnicodeObject *unicode; void *data; @@ -1394,12 +1431,6 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) Py_ssize_t char_size; Py_ssize_t struct_size; - /* Optimization for empty strings */ - if (size == 0 && unicode_empty != NULL) { - Py_INCREF(unicode_empty); - return unicode_empty; - } - is_ascii = 0; is_sharing = 0; struct_size = sizeof(PyCompactUnicodeObject); @@ -1970,7 +2001,8 @@ unicode_dealloc(PyObject *unicode) static int unicode_is_singleton(PyObject *unicode) { - if (unicode == unicode_empty) { + struct _Py_unicode_state *state = get_unicode_state(); + if (unicode == state->empty) { return 1; } #ifdef LATIN1_SINGLETONS @@ -2026,10 +2058,10 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length) return 0; if (length == 0) { - _Py_INCREF_UNICODE_EMPTY(); - if (!unicode_empty) + PyObject *empty = unicode_new_empty(); + if (!empty) return -1; - Py_SETREF(*p_unicode, unicode_empty); + Py_SETREF(*p_unicode, empty); return 0; } @@ -10836,10 +10868,10 @@ replace(PyObject *self, PyObject *str1, } new_size = slen + n * (len2 - len1); if (new_size == 0) { - _Py_INCREF_UNICODE_EMPTY(); - if (!unicode_empty) + PyObject *empty = unicode_new_empty(); + if (!empty) goto error; - u = unicode_empty; + u = empty; goto done; } if (new_size > (PY_SSIZE_T_MAX / rkind)) { @@ -11497,10 +11529,13 @@ PyUnicode_Concat(PyObject *left, PyObject *right) return NULL; /* Shortcuts */ - if (left == unicode_empty) + PyObject *empty = unicode_get_empty(); // Borrowed reference + if (left == empty) { return PyUnicode_FromObject(right); - if (right == unicode_empty) + } + if (right == empty) { return PyUnicode_FromObject(left); + } left_len = PyUnicode_GET_LENGTH(left); right_len = PyUnicode_GET_LENGTH(right); @@ -11551,14 +11586,16 @@ PyUnicode_Append(PyObject **p_left, PyObject *right) goto error; /* Shortcuts */ - if (left == unicode_empty) { + PyObject *empty = unicode_get_empty(); // Borrowed reference + if (left == empty) { Py_DECREF(left); Py_INCREF(right); *p_left = right; return; } - if (right == unicode_empty) + if (right == empty) { return; + } left_len = PyUnicode_GET_LENGTH(left); right_len = PyUnicode_GET_LENGTH(right); @@ -13255,12 +13292,12 @@ PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj) len1 = PyUnicode_GET_LENGTH(str_obj); len2 = PyUnicode_GET_LENGTH(sep_obj); if (kind1 < kind2 || len1 < len2) { - _Py_INCREF_UNICODE_EMPTY(); - if (!unicode_empty) + PyObject *empty = unicode_get_empty(); // Borrowed reference + if (!empty) { out = NULL; + } else { - out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty); - Py_DECREF(unicode_empty); + out = PyTuple_Pack(3, str_obj, empty, empty); } return out; } @@ -13313,12 +13350,12 @@ PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj) len1 = PyUnicode_GET_LENGTH(str_obj); len2 = PyUnicode_GET_LENGTH(sep_obj); if (kind1 < kind2 || len1 < len2) { - _Py_INCREF_UNICODE_EMPTY(); - if (!unicode_empty) + PyObject *empty = unicode_get_empty(); // Borrowed reference + if (!empty) { out = NULL; + } else { - out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj); - Py_DECREF(unicode_empty); + out = PyTuple_Pack(3, empty, empty, str_obj); } return out; } @@ -15538,10 +15575,10 @@ PyTypeObject PyUnicode_Type = { /* Initialize the Unicode implementation */ PyStatus -_PyUnicode_Init(void) +_PyUnicode_Init(PyThreadState *tstate) { /* XXX - move this array to unicodectype.c ? */ - Py_UCS2 linebreak[] = { + const Py_UCS2 linebreak[] = { 0x000A, /* LINE FEED */ 0x000D, /* CARRIAGE RETURN */ 0x001C, /* FILE SEPARATOR */ @@ -15553,29 +15590,31 @@ _PyUnicode_Init(void) }; /* Init the implementation */ - _Py_INCREF_UNICODE_EMPTY(); - if (!unicode_empty) { - return _PyStatus_ERR("Can't create empty string"); + PyObject *empty = unicode_new_empty(); + if (!empty) { + return _PyStatus_NO_MEMORY(); } - Py_DECREF(unicode_empty); + Py_DECREF(empty); - if (PyType_Ready(&PyUnicode_Type) < 0) { - return _PyStatus_ERR("Can't initialize unicode type"); - } + if (_Py_IsMainInterpreter(tstate)) { + /* initialize the linebreak bloom filter */ + bloom_linebreak = make_bloom_mask( + PyUnicode_2BYTE_KIND, linebreak, + Py_ARRAY_LENGTH(linebreak)); - /* initialize the linebreak bloom filter */ - bloom_linebreak = make_bloom_mask( - PyUnicode_2BYTE_KIND, linebreak, - Py_ARRAY_LENGTH(linebreak)); + if (PyType_Ready(&PyUnicode_Type) < 0) { + return _PyStatus_ERR("Can't initialize unicode type"); + } - if (PyType_Ready(&EncodingMapType) < 0) { - return _PyStatus_ERR("Can't initialize encoding map type"); - } - if (PyType_Ready(&PyFieldNameIter_Type) < 0) { - return _PyStatus_ERR("Can't initialize field name iterator type"); - } - if (PyType_Ready(&PyFormatterIter_Type) < 0) { - return _PyStatus_ERR("Can't initialize formatter iter type"); + if (PyType_Ready(&EncodingMapType) < 0) { + return _PyStatus_ERR("Can't initialize encoding map type"); + } + if (PyType_Ready(&PyFieldNameIter_Type) < 0) { + return _PyStatus_ERR("Can't initialize field name iterator type"); + } + if (PyType_Ready(&PyFormatterIter_Type) < 0) { + return _PyStatus_ERR("Can't initialize formatter iter type"); + } } return _PyStatus_OK(); } @@ -16205,7 +16244,10 @@ _PyUnicode_EnableLegacyWindowsFSEncoding(void) void _PyUnicode_Fini(PyThreadState *tstate) { - if (_Py_IsMainInterpreter(tstate)) { + struct _Py_unicode_state *state = &tstate->interp->unicode; + + int is_main_interp = _Py_IsMainInterpreter(tstate); + if (is_main_interp) { #if defined(WITH_VALGRIND) || defined(__INSURE__) /* Insure++ is a memory analysis tool that aids in discovering * memory leaks and other memory problems. On Python exit, the @@ -16218,9 +16260,11 @@ _PyUnicode_Fini(PyThreadState *tstate) */ unicode_release_interned(); #endif /* __INSURE__ */ + } - Py_CLEAR(unicode_empty); + Py_CLEAR(state->empty); + if (is_main_interp) { #ifdef LATIN1_SINGLETONS for (Py_ssize_t i = 0; i < 256; i++) { Py_CLEAR(unicode_latin1[i]); diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index f0b40b3aa68..eda4c6ad7e4 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -595,11 +595,9 @@ pycore_init_types(PyThreadState *tstate) return _PyStatus_ERR("can't init longs"); } - if (is_main_interp) { - status = _PyUnicode_Init(); - if (_PyStatus_EXCEPTION(status)) { - return status; - } + status = _PyUnicode_Init(tstate); + if (_PyStatus_EXCEPTION(status)) { + return status; } status = _PyExc_Init(tstate);