bpo-40521: Optimize PyUnicode_New(0, maxchar) (GH-21099)

Functions of unicodeobject.c, like PyUnicode_New(), no longer check if the empty Unicode singleton has been initialized or not. Consider that it is always initialized. The Unicode API must not be used before _PyUnicode_Init() or after _PyUnicode_Fini().
2020-06-24 00:34:07 +02:00 · 2020-06-24 00:34:07 +02:00 · 90ed8a6d71
parent f363d0a6e9
commit 90ed8a6d71
1 changed files with 25 additions and 55 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -231,28 +231,19 @@ get_unicode_state(void)


 // Return a borrowed reference to the empty string singleton.
-// Return NULL if the singleton was not created yet.
 static inline PyObject* unicode_get_empty(void)
 {
    struct _Py_unicode_state *state = get_unicode_state();
+    // unicode_get_empty() must not be called before _PyUnicode_Init()
+    // or after _PyUnicode_Fini()
+    assert(state->empty != NULL);
    return state->empty;
 }

 static inline PyObject* unicode_new_empty(void)
 {
-    struct _Py_unicode_state *state = get_unicode_state();
-    PyObject *empty = state->empty;
-    if (empty != NULL) {
+    PyObject *empty = unicode_get_empty();
    Py_INCREF(empty);
-    }
-    else {
-        empty = PyUnicode_New(0, 0);
-        if (empty != NULL) {
-            Py_INCREF(empty);
-            assert(_PyUnicode_CheckConsistency(empty, 1));
-            state->empty = empty;
-        }
-    }
    return empty;
 }

@ -696,12 +687,9 @@ unicode_result_ready(PyObject *unicode)
        PyObject *empty = unicode_get_empty();
        if (unicode != empty) {
            Py_DECREF(unicode);
-
            Py_INCREF(empty);
-            return empty;
        }
-        // unicode is the empty string singleton
-        return unicode;
+        return empty;
    }

 #ifdef LATIN1_SINGLETONS
@ -1260,11 +1248,7 @@ _PyUnicode_New(Py_ssize_t length)

    /* Optimization for empty strings */
    if (length == 0) {
-        PyObject *empty = unicode_get_empty();
-        if (empty != NULL) {
-            Py_INCREF(empty);
-            return (PyUnicodeObject *)empty;
-        }
+        return (PyUnicodeObject *)unicode_new_empty();
    }

    /* Ensure we won't overflow the size. */
@ -1416,11 +1400,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
 {
    /* Optimization for empty strings */
    if (size == 0) {
-        PyObject *empty = unicode_get_empty();
-        if (empty != NULL) {
-            Py_INCREF(empty);
-            return empty;
-        }
+        return unicode_new_empty();
    }

    PyObject *obj;
@ -2001,8 +1981,7 @@ unicode_dealloc(PyObject *unicode)
 static int
 unicode_is_singleton(PyObject *unicode)
 {
-    struct _Py_unicode_state *state = get_unicode_state();
-    if (unicode == state->empty) {
+    if (unicode == unicode_get_empty()) {
        return 1;
    }
 #ifdef LATIN1_SINGLETONS
@ -2059,8 +2038,6 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length)

    if (length == 0) {
        PyObject *empty = unicode_new_empty();
-        if (!empty)
-            return -1;
        Py_SETREF(*p_unicode, empty);
        return 0;
    }
@ -10868,10 +10845,7 @@ replace(PyObject *self, PyObject *str1,
        }
        new_size = slen + n * (len2 - len1);
        if (new_size == 0) {
-            PyObject *empty = unicode_new_empty();
-            if (!empty)
-                goto error;
-            u = empty;
+            u = unicode_new_empty();
            goto done;
        }
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
@ -13293,13 +13267,7 @@ PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
    len2 = PyUnicode_GET_LENGTH(sep_obj);
    if (kind1 < kind2 || len1 < len2) {
        PyObject *empty = unicode_get_empty();  // Borrowed reference
-        if (!empty) {
-            out = NULL;
-        }
-        else {
-            out = PyTuple_Pack(3, str_obj, empty, empty);
-        }
-        return out;
+        return PyTuple_Pack(3, str_obj, empty, empty);
    }
    buf1 = PyUnicode_DATA(str_obj);
    buf2 = PyUnicode_DATA(sep_obj);
@ -13351,13 +13319,7 @@ PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
    len2 = PyUnicode_GET_LENGTH(sep_obj);
    if (kind1 < kind2 || len1 < len2) {
        PyObject *empty = unicode_get_empty();  // Borrowed reference
-        if (!empty) {
-            out = NULL;
-        }
-        else {
-            out = PyTuple_Pack(3, empty, empty, str_obj);
-        }
-        return out;
+        return PyTuple_Pack(3, empty, empty, str_obj);
    }
    buf1 = PyUnicode_DATA(str_obj);
    buf2 = PyUnicode_DATA(sep_obj);
@ -15589,12 +15551,20 @@ _PyUnicode_Init(PyThreadState *tstate)
        0x2029, /* PARAGRAPH SEPARATOR */
    };

-    /* Init the implementation */
-    PyObject *empty = unicode_new_empty();
-    if (!empty) {
+    // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
+    // optimized to always use state->empty without having to check if it is
+    // NULL or not.
+    PyObject *empty = PyUnicode_New(1, 0);
+    if (empty == NULL) {
        return _PyStatus_NO_MEMORY();
    }
-    Py_DECREF(empty);
+    PyUnicode_1BYTE_DATA(empty)[0] = 0;
+    _PyUnicode_LENGTH(empty) = 0;
+    assert(_PyUnicode_CheckConsistency(empty, 1));
+
+    struct _Py_unicode_state *state = &tstate->interp->unicode;
+    assert(state->empty == NULL);
+    state->empty = empty;

    if (_Py_IsMainInterpreter(tstate)) {
        /* initialize the linebreak bloom filter */