bpo-40521: Make empty Unicode string per interpreter (GH-21096)

Each interpreter now has its own empty Unicode string singleton.
2020-06-24 00:10:40 +02:00 · 2020-06-24 00:10:40 +02:00 · f363d0a6e9
parent d051801052
commit f363d0a6e9
12 changed files with 130 additions and 90 deletions
--- a/Include/internal/pycore_interp.h
+++ b/Include/internal/pycore_interp.h
@ -71,6 +71,8 @@ struct _Py_bytes_state {
 };

 struct _Py_unicode_state {
+    // The empty Unicode object is a singleton to improve performance.
+    PyObject *empty;
    struct _Py_unicode_fs_codec fs_codec;
 };

--- a/Include/internal/pycore_pylifecycle.h
+++ b/Include/internal/pycore_pylifecycle.h
@ -31,7 +31,7 @@ PyAPI_FUNC(int) _Py_IsLocaleCoercionTarget(const char *ctype_loc);

 /* Various one-time initializers */

-extern PyStatus _PyUnicode_Init(void);
+extern PyStatus _PyUnicode_Init(PyThreadState *tstate);
 extern int _PyStructSequence_Init(void);
 extern int _PyLong_Init(PyThreadState *tstate);
 extern PyStatus _PyFaulthandler_Init(int enable);
--- a/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst
+++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst
@ -2,7 +2,7 @@ Each interpreter now has its own free lists, singletons and caches:

 * Free lists: float, tuple, list, dict, frame, context,
  asynchronous generator, MemoryError.
-* Singletons: empty tuple, empty bytes string,
+* Singletons: empty tuple, empty bytes string, empty Unicode string,
  single byte character.
 * Slice cache.

--- a/Objects/stringlib/asciilib.h
+++ b/Objects/stringlib/asciilib.h
@ -11,7 +11,6 @@
 #define STRINGLIB_CHAR           Py_UCS1
 #define STRINGLIB_TYPE_NAME      "unicode"
 #define STRINGLIB_PARSE_CODE     "U"
-#define STRINGLIB_GET_EMPTY()    unicode_empty
 #define STRINGLIB_ISSPACE        Py_UNICODE_ISSPACE
 #define STRINGLIB_ISLINEBREAK    BLOOM_LINEBREAK
 #define STRINGLIB_ISDECIMAL      Py_UNICODE_ISDECIMAL
--- a/Objects/stringlib/partition.h
+++ b/Objects/stringlib/partition.h
@ -1,9 +1,14 @@
 /* stringlib: partition implementation */

 #ifndef STRINGLIB_FASTSEARCH_H
-#error must include "stringlib/fastsearch.h" before including this module
+#  error must include "stringlib/fastsearch.h" before including this module
 #endif

+#if !STRINGLIB_MUTABLE && !defined(STRINGLIB_GET_EMPTY)
+#  error "STRINGLIB_GET_EMPTY must be defined if STRINGLIB_MUTABLE is zero"
+#endif
+
+
 Py_LOCAL_INLINE(PyObject*)
 STRINGLIB(partition)(PyObject* str_obj,
                    const STRINGLIB_CHAR* str, Py_ssize_t str_len,
--- a/Objects/stringlib/stringdefs.h
+++ b/Objects/stringlib/stringdefs.h
@ -1,10 +1,6 @@
 #ifndef STRINGLIB_STRINGDEFS_H
 #define STRINGLIB_STRINGDEFS_H

-#ifndef STRINGLIB_GET_EMPTY
-#  error "STRINGLIB_GET_EMPTY macro must be defined"
-#endif
-
 /* this is sort of a hack.  there's at least one place (formatting
   floats) where some stringlib code takes a different path if it's
   compiled as unicode. */
--- a/Objects/stringlib/ucs1lib.h
+++ b/Objects/stringlib/ucs1lib.h
@ -11,7 +11,6 @@
 #define STRINGLIB_CHAR           Py_UCS1
 #define STRINGLIB_TYPE_NAME      "unicode"
 #define STRINGLIB_PARSE_CODE     "U"
-#define STRINGLIB_GET_EMPTY()    unicode_empty
 #define STRINGLIB_ISSPACE        Py_UNICODE_ISSPACE
 #define STRINGLIB_ISLINEBREAK    BLOOM_LINEBREAK
 #define STRINGLIB_ISDECIMAL      Py_UNICODE_ISDECIMAL
--- a/Objects/stringlib/ucs2lib.h
+++ b/Objects/stringlib/ucs2lib.h
@ -11,7 +11,6 @@
 #define STRINGLIB_CHAR           Py_UCS2
 #define STRINGLIB_TYPE_NAME      "unicode"
 #define STRINGLIB_PARSE_CODE     "U"
-#define STRINGLIB_GET_EMPTY()    unicode_empty
 #define STRINGLIB_ISSPACE        Py_UNICODE_ISSPACE
 #define STRINGLIB_ISLINEBREAK    BLOOM_LINEBREAK
 #define STRINGLIB_ISDECIMAL      Py_UNICODE_ISDECIMAL
--- a/Objects/stringlib/ucs4lib.h
+++ b/Objects/stringlib/ucs4lib.h
@ -11,7 +11,6 @@
 #define STRINGLIB_CHAR           Py_UCS4
 #define STRINGLIB_TYPE_NAME      "unicode"
 #define STRINGLIB_PARSE_CODE     "U"
-#define STRINGLIB_GET_EMPTY()    unicode_empty
 #define STRINGLIB_ISSPACE        Py_UNICODE_ISSPACE
 #define STRINGLIB_ISLINEBREAK    BLOOM_LINEBREAK
 #define STRINGLIB_ISDECIMAL      Py_UNICODE_ISDECIMAL
--- a/Objects/stringlib/unicodedefs.h
+++ b/Objects/stringlib/unicodedefs.h
@ -13,7 +13,6 @@
 #define STRINGLIB_CHAR           Py_UNICODE
 #define STRINGLIB_TYPE_NAME      "unicode"
 #define STRINGLIB_PARSE_CODE     "U"
-#define STRINGLIB_GET_EMPTY()    unicode_empty
 #define STRINGLIB_ISSPACE        Py_UNICODE_ISSPACE
 #define STRINGLIB_ISLINEBREAK    BLOOM_LINEBREAK
 #define STRINGLIB_ISDECIMAL      Py_UNICODE_ISDECIMAL
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -222,26 +222,43 @@ extern "C" {
 static PyObject *interned = NULL;
 #endif

-/* The empty Unicode object is shared to improve performance. */
-static PyObject *unicode_empty = NULL;
+static struct _Py_unicode_state*
+get_unicode_state(void)
+{
+    PyInterpreterState *interp = _PyInterpreterState_GET();
+    return &interp->unicode;
+}

-#define _Py_INCREF_UNICODE_EMPTY()                      \
-    do {                                                \
-        if (unicode_empty != NULL)                      \
-            Py_INCREF(unicode_empty);                   \
-        else {                                          \
-            unicode_empty = PyUnicode_New(0, 0);        \
-            if (unicode_empty != NULL) {                \
-                Py_INCREF(unicode_empty);               \
-                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
-            }                                           \
-        }                                               \
-    } while (0)
+
+// Return a borrowed reference to the empty string singleton.
+// Return NULL if the singleton was not created yet.
+static inline PyObject* unicode_get_empty(void)
+{
+    struct _Py_unicode_state *state = get_unicode_state();
+    return state->empty;
+}
+
+static inline PyObject* unicode_new_empty(void)
+{
+    struct _Py_unicode_state *state = get_unicode_state();
+    PyObject *empty = state->empty;
+    if (empty != NULL) {
+        Py_INCREF(empty);
+    }
+    else {
+        empty = PyUnicode_New(0, 0);
+        if (empty != NULL) {
+            Py_INCREF(empty);
+            assert(_PyUnicode_CheckConsistency(empty, 1));
+            state->empty = empty;
+        }
+    }
+    return empty;
+}

 #define _Py_RETURN_UNICODE_EMPTY()   \
    do {                             \
-        _Py_INCREF_UNICODE_EMPTY();                     \
-        return unicode_empty;                           \
+        return unicode_new_empty();  \
    } while (0)

 static inline void
@ -676,11 +693,15 @@ unicode_result_ready(PyObject *unicode)

    length = PyUnicode_GET_LENGTH(unicode);
    if (length == 0) {
-        if (unicode != unicode_empty) {
+        PyObject *empty = unicode_get_empty();
+        if (unicode != empty) {
            Py_DECREF(unicode);
-            _Py_RETURN_UNICODE_EMPTY();
+
+            Py_INCREF(empty);
+            return empty;
        }
-        return unicode_empty;
+        // unicode is the empty string singleton
+        return unicode;
    }

 #ifdef LATIN1_SINGLETONS
@ -864,7 +885,7 @@ xmlcharrefreplace(_PyBytesWriter *writer, char *str,
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode characters as the bit index. */

-/* the linebreak mask is set up by Unicode_Init below */
+/* the linebreak mask is set up by _PyUnicode_Init() below */

 #if LONG_BIT >= 128
 #define BLOOM_WIDTH 128
@ -938,6 +959,8 @@ ensure_unicode(PyObject *obj)

 /* Compilation of templated routines */

+#define STRINGLIB_GET_EMPTY()    unicode_get_empty()
+
 #include "stringlib/asciilib.h"
 #include "stringlib/fastsearch.h"
 #include "stringlib/partition.h"
@ -986,6 +1009,8 @@ _Py_COMP_DIAG_IGNORE_DEPR_DECLS
 #include "stringlib/undef.h"
 _Py_COMP_DIAG_POP

+#undef STRINGLIB_GET_EMPTY
+
 /* --- Unicode Object ----------------------------------------------------- */

 static inline Py_ssize_t
@ -1234,9 +1259,12 @@ _PyUnicode_New(Py_ssize_t length)
    size_t new_size;

    /* Optimization for empty strings */
-    if (length == 0 && unicode_empty != NULL) {
-        Py_INCREF(unicode_empty);
-        return (PyUnicodeObject*)unicode_empty;
+    if (length == 0) {
+        PyObject *empty = unicode_get_empty();
+        if (empty != NULL) {
+            Py_INCREF(empty);
+            return (PyUnicodeObject *)empty;
+        }
    }

    /* Ensure we won't overflow the size. */
@ -1386,6 +1414,15 @@ _PyUnicode_Dump(PyObject *op)
 PyObject *
 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
 {
+    /* Optimization for empty strings */
+    if (size == 0) {
+        PyObject *empty = unicode_get_empty();
+        if (empty != NULL) {
+            Py_INCREF(empty);
+            return empty;
+        }
+    }
+
    PyObject *obj;
    PyCompactUnicodeObject *unicode;
    void *data;
@ -1394,12 +1431,6 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
    Py_ssize_t char_size;
    Py_ssize_t struct_size;

-    /* Optimization for empty strings */
-    if (size == 0 && unicode_empty != NULL) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
-
    is_ascii = 0;
    is_sharing = 0;
    struct_size = sizeof(PyCompactUnicodeObject);
@ -1970,7 +2001,8 @@ unicode_dealloc(PyObject *unicode)
 static int
 unicode_is_singleton(PyObject *unicode)
 {
-    if (unicode == unicode_empty) {
+    struct _Py_unicode_state *state = get_unicode_state();
+    if (unicode == state->empty) {
        return 1;
    }
 #ifdef LATIN1_SINGLETONS
@ -2026,10 +2058,10 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length)
        return 0;

    if (length == 0) {
-        _Py_INCREF_UNICODE_EMPTY();
-        if (!unicode_empty)
+        PyObject *empty = unicode_new_empty();
+        if (!empty)
            return -1;
-        Py_SETREF(*p_unicode, unicode_empty);
+        Py_SETREF(*p_unicode, empty);
        return 0;
    }

@ -10836,10 +10868,10 @@ replace(PyObject *self, PyObject *str1,
        }
        new_size = slen + n * (len2 - len1);
        if (new_size == 0) {
-            _Py_INCREF_UNICODE_EMPTY();
-            if (!unicode_empty)
+            PyObject *empty = unicode_new_empty();
+            if (!empty)
                goto error;
-            u = unicode_empty;
+            u = empty;
            goto done;
        }
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
@ -11497,10 +11529,13 @@ PyUnicode_Concat(PyObject *left, PyObject *right)
        return NULL;

    /* Shortcuts */
-    if (left == unicode_empty)
+    PyObject *empty = unicode_get_empty();  // Borrowed reference
+    if (left == empty) {
        return PyUnicode_FromObject(right);
-    if (right == unicode_empty)
+    }
+    if (right == empty) {
        return PyUnicode_FromObject(left);
+    }

    left_len = PyUnicode_GET_LENGTH(left);
    right_len = PyUnicode_GET_LENGTH(right);
@ -11551,14 +11586,16 @@ PyUnicode_Append(PyObject **p_left, PyObject *right)
        goto error;

    /* Shortcuts */
-    if (left == unicode_empty) {
+    PyObject *empty = unicode_get_empty();  // Borrowed reference
+    if (left == empty) {
        Py_DECREF(left);
        Py_INCREF(right);
        *p_left = right;
        return;
    }
-    if (right == unicode_empty)
+    if (right == empty) {
        return;
+    }

    left_len = PyUnicode_GET_LENGTH(left);
    right_len = PyUnicode_GET_LENGTH(right);
@ -13255,12 +13292,12 @@ PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
    len1 = PyUnicode_GET_LENGTH(str_obj);
    len2 = PyUnicode_GET_LENGTH(sep_obj);
    if (kind1 < kind2 || len1 < len2) {
-        _Py_INCREF_UNICODE_EMPTY();
-        if (!unicode_empty)
+        PyObject *empty = unicode_get_empty();  // Borrowed reference
+        if (!empty) {
            out = NULL;
+        }
        else {
-            out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
-            Py_DECREF(unicode_empty);
+            out = PyTuple_Pack(3, str_obj, empty, empty);
        }
        return out;
    }
@ -13313,12 +13350,12 @@ PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
    len1 = PyUnicode_GET_LENGTH(str_obj);
    len2 = PyUnicode_GET_LENGTH(sep_obj);
    if (kind1 < kind2 || len1 < len2) {
-        _Py_INCREF_UNICODE_EMPTY();
-        if (!unicode_empty)
+        PyObject *empty = unicode_get_empty();  // Borrowed reference
+        if (!empty) {
            out = NULL;
+        }
        else {
-            out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
-            Py_DECREF(unicode_empty);
+            out = PyTuple_Pack(3, empty, empty, str_obj);
        }
        return out;
    }
@ -15538,10 +15575,10 @@ PyTypeObject PyUnicode_Type = {
 /* Initialize the Unicode implementation */

 PyStatus
-_PyUnicode_Init(void)
+_PyUnicode_Init(PyThreadState *tstate)
 {
    /* XXX - move this array to unicodectype.c ? */
-    Py_UCS2 linebreak[] = {
+    const Py_UCS2 linebreak[] = {
        0x000A, /* LINE FEED */
        0x000D, /* CARRIAGE RETURN */
        0x001C, /* FILE SEPARATOR */
@ -15553,21 +15590,22 @@ _PyUnicode_Init(void)
    };

    /* Init the implementation */
-    _Py_INCREF_UNICODE_EMPTY();
-    if (!unicode_empty) {
-        return _PyStatus_ERR("Can't create empty string");
-    }
-    Py_DECREF(unicode_empty);
-
-    if (PyType_Ready(&PyUnicode_Type) < 0) {
-        return _PyStatus_ERR("Can't initialize unicode type");
+    PyObject *empty = unicode_new_empty();
+    if (!empty) {
+        return _PyStatus_NO_MEMORY();
    }
+    Py_DECREF(empty);

+    if (_Py_IsMainInterpreter(tstate)) {
        /* initialize the linebreak bloom filter */
        bloom_linebreak = make_bloom_mask(
            PyUnicode_2BYTE_KIND, linebreak,
            Py_ARRAY_LENGTH(linebreak));

+        if (PyType_Ready(&PyUnicode_Type) < 0) {
+            return _PyStatus_ERR("Can't initialize unicode type");
+        }
+
        if (PyType_Ready(&EncodingMapType) < 0) {
             return _PyStatus_ERR("Can't initialize encoding map type");
        }
@ -15577,6 +15615,7 @@ _PyUnicode_Init(void)
        if (PyType_Ready(&PyFormatterIter_Type) < 0) {
            return _PyStatus_ERR("Can't initialize formatter iter type");
        }
+    }
    return _PyStatus_OK();
 }

@ -16205,7 +16244,10 @@ _PyUnicode_EnableLegacyWindowsFSEncoding(void)
 void
 _PyUnicode_Fini(PyThreadState *tstate)
 {
-    if (_Py_IsMainInterpreter(tstate)) {
+    struct _Py_unicode_state *state = &tstate->interp->unicode;
+
+    int is_main_interp = _Py_IsMainInterpreter(tstate);
+    if (is_main_interp) {
 #if defined(WITH_VALGRIND) || defined(__INSURE__)
        /* Insure++ is a memory analysis tool that aids in discovering
         * memory leaks and other memory problems.  On Python exit, the
@ -16218,9 +16260,11 @@ _PyUnicode_Fini(PyThreadState *tstate)
         */
        unicode_release_interned();
 #endif /* __INSURE__ */
+    }

-        Py_CLEAR(unicode_empty);
+    Py_CLEAR(state->empty);

+    if (is_main_interp) {
 #ifdef LATIN1_SINGLETONS
        for (Py_ssize_t i = 0; i < 256; i++) {
            Py_CLEAR(unicode_latin1[i]);
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@ -595,12 +595,10 @@ pycore_init_types(PyThreadState *tstate)
        return _PyStatus_ERR("can't init longs");
    }

-    if (is_main_interp) {
-        status = _PyUnicode_Init();
+    status = _PyUnicode_Init(tstate);
    if (_PyStatus_EXCEPTION(status)) {
        return status;
    }
-    }

    status = _PyExc_Init(tstate);
    if (_PyStatus_EXCEPTION(status)) {