From 2f9ada96e0d420fed0d09a032b37197f08ef167a Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 24 Jun 2020 02:22:21 +0200 Subject: [PATCH] bpo-40521: Make Unicode latin1 singletons per interpreter (GH-21101) Each interpreter now has its own Unicode latin1 singletons. Remove "ifdef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS" and "ifdef LATIN1_SINGLETONS": always enable latin1 singletons. Optimize unicode_result_ready(): only attempt to get a latin1 singleton for PyUnicode_1BYTE_KIND. --- Include/internal/pycore_interp.h | 3 + .../2020-05-20-01-17-34.bpo-40521.wvAehI.rst | 2 +- Objects/unicodeobject.c | 74 ++++++++----------- 3 files changed, 36 insertions(+), 43 deletions(-) diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index d8947e700f8..bf1769e5ce2 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -73,6 +73,9 @@ struct _Py_bytes_state { struct _Py_unicode_state { // The empty Unicode object is a singleton to improve performance. PyObject *empty; + /* Single character Unicode strings in the Latin-1 range are being + shared as well. */ + PyObject *latin1[256]; struct _Py_unicode_fs_codec fs_codec; }; diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst index e970551f531..43226931ccc 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst @@ -3,7 +3,7 @@ Each interpreter now its has own free lists, singletons and caches: * Free lists: float, tuple, list, dict, frame, context, asynchronous generator, MemoryError. * Singletons: empty tuple, empty bytes string, empty Unicode string, - single byte character. + single byte character, single Unicode (latin1) character. * Slice cache. They are no longer shared by all interpreters. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index e4235b1aca3..5ba99514d29 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -303,17 +303,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, /* List of static strings. */ static _Py_Identifier *static_strings = NULL; -/* bpo-40521: Latin1 singletons are shared by all interpreters. */ -#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS -# define LATIN1_SINGLETONS -#endif - -#ifdef LATIN1_SINGLETONS -/* Single character Unicode strings in the Latin-1 range are being - shared as well. */ -static PyObject *unicode_latin1[256] = {NULL}; -#endif - /* Fast detection of the most frequent whitespace characters */ const unsigned char _Py_ascii_whitespace[] = { 0, 0, 0, 0, 0, 0, 0, 0, @@ -657,9 +646,8 @@ unicode_result_wchar(PyObject *unicode) if (len == 1) { wchar_t ch = _PyUnicode_WSTR(unicode)[0]; if ((Py_UCS4)ch < 256) { - PyObject *latin1_char = get_latin1_char((unsigned char)ch); Py_DECREF(unicode); - return latin1_char; + return get_latin1_char((unsigned char)ch); } } @@ -692,13 +680,13 @@ unicode_result_ready(PyObject *unicode) return empty; } -#ifdef LATIN1_SINGLETONS if (length == 1) { - const void *data = PyUnicode_DATA(unicode); int kind = PyUnicode_KIND(unicode); - Py_UCS4 ch = PyUnicode_READ(kind, data, 0); - if (ch < 256) { - PyObject *latin1_char = unicode_latin1[ch]; + if (kind == PyUnicode_1BYTE_KIND) { + Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode); + Py_UCS1 ch = data[0]; + struct _Py_unicode_state *state = get_unicode_state(); + PyObject *latin1_char = state->latin1[ch]; if (latin1_char != NULL) { if (unicode != latin1_char) { Py_INCREF(latin1_char); @@ -709,12 +697,14 @@ unicode_result_ready(PyObject *unicode) else { assert(_PyUnicode_CheckConsistency(unicode, 1)); Py_INCREF(unicode); - unicode_latin1[ch] = unicode; + state->latin1[ch] = unicode; return unicode; } } + else { + assert(PyUnicode_READ_CHAR(unicode, 0) >= 256); + } } -#endif assert(_PyUnicode_CheckConsistency(unicode, 1)); return unicode; @@ -1981,18 +1971,18 @@ unicode_dealloc(PyObject *unicode) static int unicode_is_singleton(PyObject *unicode) { - if (unicode == unicode_get_empty()) { + struct _Py_unicode_state *state = get_unicode_state(); + if (unicode == state->empty) { return 1; } -#ifdef LATIN1_SINGLETONS PyASCIIObject *ascii = (PyASCIIObject *)unicode; if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) { Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); - if (ch < 256 && unicode_latin1[ch] == unicode) + if (ch < 256 && state->latin1[ch] == unicode) { return 1; + } } -#endif return 0; } #endif @@ -2130,17 +2120,15 @@ unicode_write_cstr(PyObject *unicode, Py_ssize_t index, } static PyObject* -get_latin1_char(unsigned char ch) +get_latin1_char(Py_UCS1 ch) { - PyObject *unicode; + struct _Py_unicode_state *state = get_unicode_state(); -#ifdef LATIN1_SINGLETONS - unicode = unicode_latin1[ch]; + PyObject *unicode = state->latin1[ch]; if (unicode) { Py_INCREF(unicode); return unicode; } -#endif unicode = PyUnicode_New(1, ch); if (!unicode) { @@ -2150,10 +2138,8 @@ get_latin1_char(unsigned char ch) PyUnicode_1BYTE_DATA(unicode)[0] = ch; assert(_PyUnicode_CheckConsistency(unicode, 1)); -#ifdef LATIN1_SINGLETONS Py_INCREF(unicode); - unicode_latin1[ch] = unicode; -#endif + state->latin1[ch] = unicode; return unicode; } @@ -2164,8 +2150,9 @@ unicode_char(Py_UCS4 ch) assert(ch <= MAX_UNICODE); - if (ch < 256) + if (ch < 256) { return get_latin1_char(ch); + } unicode = PyUnicode_New(1, ch); if (unicode == NULL) @@ -2367,11 +2354,13 @@ _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size) PyObject *res; unsigned char max_char; - if (size == 0) + if (size == 0) { _Py_RETURN_UNICODE_EMPTY(); + } assert(size > 0); - if (size == 1) + if (size == 1) { return get_latin1_char(u[0]); + } max_char = ucs1lib_find_max_char(u, u + size); res = PyUnicode_New(size, max_char); @@ -5008,8 +4997,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, /* ASCII is equivalent to the first 128 ordinals in Unicode. */ if (size == 1 && (unsigned char)s[0] < 128) { - if (consumed) + if (consumed) { *consumed = 1; + } return get_latin1_char((unsigned char)s[0]); } @@ -7176,8 +7166,9 @@ PyUnicode_DecodeASCII(const char *s, _Py_RETURN_UNICODE_EMPTY(); /* ASCII is equivalent to the first 128 ordinals in Unicode. */ - if (size == 1 && (unsigned char)s[0] < 128) + if (size == 1 && (unsigned char)s[0] < 128) { return get_latin1_char((unsigned char)s[0]); + } // Shortcut for simple case PyObject *u = PyUnicode_New(size, 127); @@ -16234,12 +16225,11 @@ _PyUnicode_Fini(PyThreadState *tstate) Py_CLEAR(state->empty); + for (Py_ssize_t i = 0; i < 256; i++) { + Py_CLEAR(state->latin1[i]); + } + if (is_main_interp) { -#ifdef LATIN1_SINGLETONS - for (Py_ssize_t i = 0; i < 256; i++) { - Py_CLEAR(unicode_latin1[i]); - } -#endif unicode_clear_static_strings(); }