bpo-40521: Make empty Unicode string per interpreter (GH-21096)

Each interpreter now has its own empty Unicode string singleton.
This commit is contained in:
Victor Stinner 2020-06-24 00:10:40 +02:00 committed by GitHub
parent d051801052
commit f363d0a6e9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 130 additions and 90 deletions

View File

@ -71,6 +71,8 @@ struct _Py_bytes_state {
}; };
struct _Py_unicode_state { struct _Py_unicode_state {
// The empty Unicode object is a singleton to improve performance.
PyObject *empty;
struct _Py_unicode_fs_codec fs_codec; struct _Py_unicode_fs_codec fs_codec;
}; };

View File

@ -31,7 +31,7 @@ PyAPI_FUNC(int) _Py_IsLocaleCoercionTarget(const char *ctype_loc);
/* Various one-time initializers */ /* Various one-time initializers */
extern PyStatus _PyUnicode_Init(void); extern PyStatus _PyUnicode_Init(PyThreadState *tstate);
extern int _PyStructSequence_Init(void); extern int _PyStructSequence_Init(void);
extern int _PyLong_Init(PyThreadState *tstate); extern int _PyLong_Init(PyThreadState *tstate);
extern PyStatus _PyFaulthandler_Init(int enable); extern PyStatus _PyFaulthandler_Init(int enable);

View File

@ -2,7 +2,7 @@ Each interpreter now has its own free lists, singletons and caches:
* Free lists: float, tuple, list, dict, frame, context, * Free lists: float, tuple, list, dict, frame, context,
asynchronous generator, MemoryError. asynchronous generator, MemoryError.
* Singletons: empty tuple, empty bytes string, * Singletons: empty tuple, empty bytes string, empty Unicode string,
single byte character. single byte character.
* Slice cache. * Slice cache.

View File

@ -11,7 +11,6 @@
#define STRINGLIB_CHAR Py_UCS1 #define STRINGLIB_CHAR Py_UCS1
#define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U" #define STRINGLIB_PARSE_CODE "U"
#define STRINGLIB_GET_EMPTY() unicode_empty
#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE #define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK #define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL

View File

@ -1,9 +1,14 @@
/* stringlib: partition implementation */ /* stringlib: partition implementation */
#ifndef STRINGLIB_FASTSEARCH_H #ifndef STRINGLIB_FASTSEARCH_H
#error must include "stringlib/fastsearch.h" before including this module # error must include "stringlib/fastsearch.h" before including this module
#endif #endif
#if !STRINGLIB_MUTABLE && !defined(STRINGLIB_GET_EMPTY)
# error "STRINGLIB_GET_EMPTY must be defined if STRINGLIB_MUTABLE is zero"
#endif
Py_LOCAL_INLINE(PyObject*) Py_LOCAL_INLINE(PyObject*)
STRINGLIB(partition)(PyObject* str_obj, STRINGLIB(partition)(PyObject* str_obj,
const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* str, Py_ssize_t str_len,

View File

@ -1,10 +1,6 @@
#ifndef STRINGLIB_STRINGDEFS_H #ifndef STRINGLIB_STRINGDEFS_H
#define STRINGLIB_STRINGDEFS_H #define STRINGLIB_STRINGDEFS_H
#ifndef STRINGLIB_GET_EMPTY
# error "STRINGLIB_GET_EMPTY macro must be defined"
#endif
/* this is sort of a hack. there's at least one place (formatting /* this is sort of a hack. there's at least one place (formatting
floats) where some stringlib code takes a different path if it's floats) where some stringlib code takes a different path if it's
compiled as unicode. */ compiled as unicode. */

View File

@ -11,7 +11,6 @@
#define STRINGLIB_CHAR Py_UCS1 #define STRINGLIB_CHAR Py_UCS1
#define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U" #define STRINGLIB_PARSE_CODE "U"
#define STRINGLIB_GET_EMPTY() unicode_empty
#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE #define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK #define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL

View File

@ -11,7 +11,6 @@
#define STRINGLIB_CHAR Py_UCS2 #define STRINGLIB_CHAR Py_UCS2
#define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U" #define STRINGLIB_PARSE_CODE "U"
#define STRINGLIB_GET_EMPTY() unicode_empty
#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE #define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK #define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL

View File

@ -11,7 +11,6 @@
#define STRINGLIB_CHAR Py_UCS4 #define STRINGLIB_CHAR Py_UCS4
#define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U" #define STRINGLIB_PARSE_CODE "U"
#define STRINGLIB_GET_EMPTY() unicode_empty
#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE #define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK #define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL

View File

@ -13,7 +13,6 @@
#define STRINGLIB_CHAR Py_UNICODE #define STRINGLIB_CHAR Py_UNICODE
#define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U" #define STRINGLIB_PARSE_CODE "U"
#define STRINGLIB_GET_EMPTY() unicode_empty
#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE #define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK #define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL

View File

@ -222,26 +222,43 @@ extern "C" {
static PyObject *interned = NULL; static PyObject *interned = NULL;
#endif #endif
/* The empty Unicode object is shared to improve performance. */ static struct _Py_unicode_state*
static PyObject *unicode_empty = NULL; get_unicode_state(void)
{
PyInterpreterState *interp = _PyInterpreterState_GET();
return &interp->unicode;
}
#define _Py_INCREF_UNICODE_EMPTY() \
do { \
if (unicode_empty != NULL) \
Py_INCREF(unicode_empty); \
else { \
unicode_empty = PyUnicode_New(0, 0); \
if (unicode_empty != NULL) { \
Py_INCREF(unicode_empty); \
assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
} \
} \
} while (0)
#define _Py_RETURN_UNICODE_EMPTY() \ // Return a borrowed reference to the empty string singleton.
do { \ // Return NULL if the singleton was not created yet.
_Py_INCREF_UNICODE_EMPTY(); \ static inline PyObject* unicode_get_empty(void)
return unicode_empty; \ {
struct _Py_unicode_state *state = get_unicode_state();
return state->empty;
}
static inline PyObject* unicode_new_empty(void)
{
struct _Py_unicode_state *state = get_unicode_state();
PyObject *empty = state->empty;
if (empty != NULL) {
Py_INCREF(empty);
}
else {
empty = PyUnicode_New(0, 0);
if (empty != NULL) {
Py_INCREF(empty);
assert(_PyUnicode_CheckConsistency(empty, 1));
state->empty = empty;
}
}
return empty;
}
#define _Py_RETURN_UNICODE_EMPTY() \
do { \
return unicode_new_empty(); \
} while (0) } while (0)
static inline void static inline void
@ -676,11 +693,15 @@ unicode_result_ready(PyObject *unicode)
length = PyUnicode_GET_LENGTH(unicode); length = PyUnicode_GET_LENGTH(unicode);
if (length == 0) { if (length == 0) {
if (unicode != unicode_empty) { PyObject *empty = unicode_get_empty();
if (unicode != empty) {
Py_DECREF(unicode); Py_DECREF(unicode);
_Py_RETURN_UNICODE_EMPTY();
Py_INCREF(empty);
return empty;
} }
return unicode_empty; // unicode is the empty string singleton
return unicode;
} }
#ifdef LATIN1_SINGLETONS #ifdef LATIN1_SINGLETONS
@ -864,7 +885,7 @@ xmlcharrefreplace(_PyBytesWriter *writer, char *str,
to keep things simple, we use a single bitmask, using the least 5 to keep things simple, we use a single bitmask, using the least 5
bits from each Unicode character as the bit index. */ bits from each Unicode character as the bit index. */
/* the linebreak mask is set up by Unicode_Init below */ /* the linebreak mask is set up by _PyUnicode_Init() below */
#if LONG_BIT >= 128 #if LONG_BIT >= 128
#define BLOOM_WIDTH 128 #define BLOOM_WIDTH 128
@ -938,6 +959,8 @@ ensure_unicode(PyObject *obj)
/* Compilation of templated routines */ /* Compilation of templated routines */
#define STRINGLIB_GET_EMPTY() unicode_get_empty()
#include "stringlib/asciilib.h" #include "stringlib/asciilib.h"
#include "stringlib/fastsearch.h" #include "stringlib/fastsearch.h"
#include "stringlib/partition.h" #include "stringlib/partition.h"
@ -986,6 +1009,8 @@ _Py_COMP_DIAG_IGNORE_DEPR_DECLS
#include "stringlib/undef.h" #include "stringlib/undef.h"
_Py_COMP_DIAG_POP _Py_COMP_DIAG_POP
#undef STRINGLIB_GET_EMPTY
/* --- Unicode Object ----------------------------------------------------- */ /* --- Unicode Object ----------------------------------------------------- */
static inline Py_ssize_t static inline Py_ssize_t
@ -1234,9 +1259,12 @@ _PyUnicode_New(Py_ssize_t length)
size_t new_size; size_t new_size;
/* Optimization for empty strings */ /* Optimization for empty strings */
if (length == 0 && unicode_empty != NULL) { if (length == 0) {
Py_INCREF(unicode_empty); PyObject *empty = unicode_get_empty();
return (PyUnicodeObject*)unicode_empty; if (empty != NULL) {
Py_INCREF(empty);
return (PyUnicodeObject *)empty;
}
} }
/* Ensure we won't overflow the size. */ /* Ensure we won't overflow the size. */
@ -1386,6 +1414,15 @@ _PyUnicode_Dump(PyObject *op)
PyObject * PyObject *
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
{ {
/* Optimization for empty strings */
if (size == 0) {
PyObject *empty = unicode_get_empty();
if (empty != NULL) {
Py_INCREF(empty);
return empty;
}
}
PyObject *obj; PyObject *obj;
PyCompactUnicodeObject *unicode; PyCompactUnicodeObject *unicode;
void *data; void *data;
@ -1394,12 +1431,6 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
Py_ssize_t char_size; Py_ssize_t char_size;
Py_ssize_t struct_size; Py_ssize_t struct_size;
/* Optimization for empty strings */
if (size == 0 && unicode_empty != NULL) {
Py_INCREF(unicode_empty);
return unicode_empty;
}
is_ascii = 0; is_ascii = 0;
is_sharing = 0; is_sharing = 0;
struct_size = sizeof(PyCompactUnicodeObject); struct_size = sizeof(PyCompactUnicodeObject);
@ -1970,7 +2001,8 @@ unicode_dealloc(PyObject *unicode)
static int static int
unicode_is_singleton(PyObject *unicode) unicode_is_singleton(PyObject *unicode)
{ {
if (unicode == unicode_empty) { struct _Py_unicode_state *state = get_unicode_state();
if (unicode == state->empty) {
return 1; return 1;
} }
#ifdef LATIN1_SINGLETONS #ifdef LATIN1_SINGLETONS
@ -2026,10 +2058,10 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length)
return 0; return 0;
if (length == 0) { if (length == 0) {
_Py_INCREF_UNICODE_EMPTY(); PyObject *empty = unicode_new_empty();
if (!unicode_empty) if (!empty)
return -1; return -1;
Py_SETREF(*p_unicode, unicode_empty); Py_SETREF(*p_unicode, empty);
return 0; return 0;
} }
@ -10836,10 +10868,10 @@ replace(PyObject *self, PyObject *str1,
} }
new_size = slen + n * (len2 - len1); new_size = slen + n * (len2 - len1);
if (new_size == 0) { if (new_size == 0) {
_Py_INCREF_UNICODE_EMPTY(); PyObject *empty = unicode_new_empty();
if (!unicode_empty) if (!empty)
goto error; goto error;
u = unicode_empty; u = empty;
goto done; goto done;
} }
if (new_size > (PY_SSIZE_T_MAX / rkind)) { if (new_size > (PY_SSIZE_T_MAX / rkind)) {
@ -11497,10 +11529,13 @@ PyUnicode_Concat(PyObject *left, PyObject *right)
return NULL; return NULL;
/* Shortcuts */ /* Shortcuts */
if (left == unicode_empty) PyObject *empty = unicode_get_empty(); // Borrowed reference
if (left == empty) {
return PyUnicode_FromObject(right); return PyUnicode_FromObject(right);
if (right == unicode_empty) }
if (right == empty) {
return PyUnicode_FromObject(left); return PyUnicode_FromObject(left);
}
left_len = PyUnicode_GET_LENGTH(left); left_len = PyUnicode_GET_LENGTH(left);
right_len = PyUnicode_GET_LENGTH(right); right_len = PyUnicode_GET_LENGTH(right);
@ -11551,14 +11586,16 @@ PyUnicode_Append(PyObject **p_left, PyObject *right)
goto error; goto error;
/* Shortcuts */ /* Shortcuts */
if (left == unicode_empty) { PyObject *empty = unicode_get_empty(); // Borrowed reference
if (left == empty) {
Py_DECREF(left); Py_DECREF(left);
Py_INCREF(right); Py_INCREF(right);
*p_left = right; *p_left = right;
return; return;
} }
if (right == unicode_empty) if (right == empty) {
return; return;
}
left_len = PyUnicode_GET_LENGTH(left); left_len = PyUnicode_GET_LENGTH(left);
right_len = PyUnicode_GET_LENGTH(right); right_len = PyUnicode_GET_LENGTH(right);
@ -13255,12 +13292,12 @@ PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
len1 = PyUnicode_GET_LENGTH(str_obj); len1 = PyUnicode_GET_LENGTH(str_obj);
len2 = PyUnicode_GET_LENGTH(sep_obj); len2 = PyUnicode_GET_LENGTH(sep_obj);
if (kind1 < kind2 || len1 < len2) { if (kind1 < kind2 || len1 < len2) {
_Py_INCREF_UNICODE_EMPTY(); PyObject *empty = unicode_get_empty(); // Borrowed reference
if (!unicode_empty) if (!empty) {
out = NULL; out = NULL;
}
else { else {
out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty); out = PyTuple_Pack(3, str_obj, empty, empty);
Py_DECREF(unicode_empty);
} }
return out; return out;
} }
@ -13313,12 +13350,12 @@ PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
len1 = PyUnicode_GET_LENGTH(str_obj); len1 = PyUnicode_GET_LENGTH(str_obj);
len2 = PyUnicode_GET_LENGTH(sep_obj); len2 = PyUnicode_GET_LENGTH(sep_obj);
if (kind1 < kind2 || len1 < len2) { if (kind1 < kind2 || len1 < len2) {
_Py_INCREF_UNICODE_EMPTY(); PyObject *empty = unicode_get_empty(); // Borrowed reference
if (!unicode_empty) if (!empty) {
out = NULL; out = NULL;
}
else { else {
out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj); out = PyTuple_Pack(3, empty, empty, str_obj);
Py_DECREF(unicode_empty);
} }
return out; return out;
} }
@ -15538,10 +15575,10 @@ PyTypeObject PyUnicode_Type = {
/* Initialize the Unicode implementation */ /* Initialize the Unicode implementation */
PyStatus PyStatus
_PyUnicode_Init(void) _PyUnicode_Init(PyThreadState *tstate)
{ {
/* XXX - move this array to unicodectype.c ? */ /* XXX - move this array to unicodectype.c ? */
Py_UCS2 linebreak[] = { const Py_UCS2 linebreak[] = {
0x000A, /* LINE FEED */ 0x000A, /* LINE FEED */
0x000D, /* CARRIAGE RETURN */ 0x000D, /* CARRIAGE RETURN */
0x001C, /* FILE SEPARATOR */ 0x001C, /* FILE SEPARATOR */
@ -15553,29 +15590,31 @@ _PyUnicode_Init(void)
}; };
/* Init the implementation */ /* Init the implementation */
_Py_INCREF_UNICODE_EMPTY(); PyObject *empty = unicode_new_empty();
if (!unicode_empty) { if (!empty) {
return _PyStatus_ERR("Can't create empty string"); return _PyStatus_NO_MEMORY();
} }
Py_DECREF(unicode_empty); Py_DECREF(empty);
if (PyType_Ready(&PyUnicode_Type) < 0) { if (_Py_IsMainInterpreter(tstate)) {
return _PyStatus_ERR("Can't initialize unicode type"); /* initialize the linebreak bloom filter */
} bloom_linebreak = make_bloom_mask(
PyUnicode_2BYTE_KIND, linebreak,
Py_ARRAY_LENGTH(linebreak));
/* initialize the linebreak bloom filter */ if (PyType_Ready(&PyUnicode_Type) < 0) {
bloom_linebreak = make_bloom_mask( return _PyStatus_ERR("Can't initialize unicode type");
PyUnicode_2BYTE_KIND, linebreak, }
Py_ARRAY_LENGTH(linebreak));
if (PyType_Ready(&EncodingMapType) < 0) { if (PyType_Ready(&EncodingMapType) < 0) {
return _PyStatus_ERR("Can't initialize encoding map type"); return _PyStatus_ERR("Can't initialize encoding map type");
} }
if (PyType_Ready(&PyFieldNameIter_Type) < 0) { if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
return _PyStatus_ERR("Can't initialize field name iterator type"); return _PyStatus_ERR("Can't initialize field name iterator type");
} }
if (PyType_Ready(&PyFormatterIter_Type) < 0) { if (PyType_Ready(&PyFormatterIter_Type) < 0) {
return _PyStatus_ERR("Can't initialize formatter iter type"); return _PyStatus_ERR("Can't initialize formatter iter type");
}
} }
return _PyStatus_OK(); return _PyStatus_OK();
} }
@ -16205,7 +16244,10 @@ _PyUnicode_EnableLegacyWindowsFSEncoding(void)
void void
_PyUnicode_Fini(PyThreadState *tstate) _PyUnicode_Fini(PyThreadState *tstate)
{ {
if (_Py_IsMainInterpreter(tstate)) { struct _Py_unicode_state *state = &tstate->interp->unicode;
int is_main_interp = _Py_IsMainInterpreter(tstate);
if (is_main_interp) {
#if defined(WITH_VALGRIND) || defined(__INSURE__) #if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Insure++ is a memory analysis tool that aids in discovering /* Insure++ is a memory analysis tool that aids in discovering
* memory leaks and other memory problems. On Python exit, the * memory leaks and other memory problems. On Python exit, the
@ -16218,9 +16260,11 @@ _PyUnicode_Fini(PyThreadState *tstate)
*/ */
unicode_release_interned(); unicode_release_interned();
#endif /* __INSURE__ */ #endif /* __INSURE__ */
}
Py_CLEAR(unicode_empty); Py_CLEAR(state->empty);
if (is_main_interp) {
#ifdef LATIN1_SINGLETONS #ifdef LATIN1_SINGLETONS
for (Py_ssize_t i = 0; i < 256; i++) { for (Py_ssize_t i = 0; i < 256; i++) {
Py_CLEAR(unicode_latin1[i]); Py_CLEAR(unicode_latin1[i]);

View File

@ -595,11 +595,9 @@ pycore_init_types(PyThreadState *tstate)
return _PyStatus_ERR("can't init longs"); return _PyStatus_ERR("can't init longs");
} }
if (is_main_interp) { status = _PyUnicode_Init(tstate);
status = _PyUnicode_Init(); if (_PyStatus_EXCEPTION(status)) {
if (_PyStatus_EXCEPTION(status)) { return status;
return status;
}
} }
status = _PyExc_Init(tstate); status = _PyExc_Init(tstate);