From 1680713e524016d93a94114c4a874ad71a090b95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Walter=20D=C3=B6rwald?= Date: Fri, 25 May 2007 13:52:07 +0000 Subject: [PATCH] Add interning of unicode strings by copying the functionality from stringobject.c. Intern "True" and "False" in bool_repr() again as it was in the 8bit string era. --- Include/stringobject.h | 4 -- Include/unicodeobject.h | 15 +++++ Modules/main.c | 3 +- Objects/boolobject.c | 4 +- Objects/unicodeobject.c | 139 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 158 insertions(+), 7 deletions(-) diff --git a/Include/stringobject.h b/Include/stringobject.h index 815619345e3..2b8cc2fdefc 100644 --- a/Include/stringobject.h +++ b/Include/stringobject.h @@ -48,10 +48,6 @@ typedef struct { */ } PyStringObject; -#define SSTATE_NOT_INTERNED 0 -#define SSTATE_INTERNED_MORTAL 1 -#define SSTATE_INTERNED_IMMORTAL 2 - PyAPI_DATA(PyTypeObject) PyBaseString_Type; PyAPI_DATA(PyTypeObject) PyString_Type; diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 131278d190a..2a27dbc0c83 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -390,6 +390,9 @@ typedef struct { Py_ssize_t length; /* Length of raw Unicode data in buffer */ Py_UNICODE *str; /* Raw Unicode buffer */ long hash; /* Hash value; -1 if not set */ + int state; /* != 0 if interned. In this case the two + * references from the dictionary to this object + * are *not* counted in ob_refcnt. 
*/ PyObject *defenc; /* (Default) Encoded version as Python string, or NULL; this is used for implementing the buffer protocol */ @@ -397,6 +400,10 @@ typedef struct { PyAPI_DATA(PyTypeObject) PyUnicode_Type; +#define SSTATE_NOT_INTERNED 0 +#define SSTATE_INTERNED_MORTAL 1 +#define SSTATE_INTERNED_IMMORTAL 2 + #define PyUnicode_Check(op) \ PyType_FastSubclass((op)->ob_type, Py_TPFLAGS_UNICODE_SUBCLASS) #define PyUnicode_CheckExact(op) ((op)->ob_type == &PyUnicode_Type) @@ -529,6 +536,14 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromObject( PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list); PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...); +PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); +PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); +PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *); +PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void); + +/* Use only if you know it's a string */ +#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state) + /* --- wchar_t support for platforms which support it --------------------- */ #ifdef HAVE_WCHAR_H diff --git a/Modules/main.c b/Modules/main.c index 66dec8d9ee0..0bcd0f4fa64 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -521,7 +521,7 @@ Py_Main(int argc, char **argv) #ifdef __INSURE__ /* Insure++ is a memory analysis tool that aids in discovering * memory leaks and other memory problems. On Python exit, the - * interned string dictionary is flagged as being in use at exit + * interned string dictionaries are flagged as being in use at exit * (which it is). Under normal circumstances, this is fine because * the memory will be automatically reclaimed by the system. Under * memory debugging, it's a huge source of useless noise, so we @@ -529,6 +529,7 @@ Py_Main(int argc, char **argv) * reports. 
-baw */ _Py_ReleaseInternedStrings(); + _Py_ReleaseInternedUnicodeStrings(); #endif /* __INSURE__ */ return sts; diff --git a/Objects/boolobject.c b/Objects/boolobject.c index 0a9f958ff66..b0170f60855 100644 --- a/Objects/boolobject.c +++ b/Objects/boolobject.c @@ -24,10 +24,10 @@ bool_repr(PyObject *self) if (self == Py_True) s = true_str ? true_str : - (true_str = PyUnicode_FromString("True")); + (true_str = PyUnicode_InternFromString("True")); else s = false_str ? false_str : - (false_str = PyUnicode_FromString("False")); + (false_str = PyUnicode_InternFromString("False")); Py_XINCREF(s); return s; } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 999b1661eb2..854310b4720 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -92,6 +92,16 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. extern "C" { #endif +/* This dictionary holds all interned unicode strings. Note that references + to strings in this dictionary are *not* counted in the string's ob_refcnt. + When the interned string reaches a refcnt of 0 the string deallocation + function will delete the reference from this dictionary. 
+ + Another way to look at this is to say that the actual reference + count of a string is: s->ob_refcnt + (s->state?2:0) +*/ +static PyObject *interned; + /* Free list for Unicode objects */ static PyUnicodeObject *unicode_freelist; static int unicode_freelist_size; @@ -276,6 +286,7 @@ PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) unicode->str[length] = 0; unicode->length = length; unicode->hash = -1; + unicode->state = 0; unicode->defenc = NULL; return unicode; @@ -288,6 +299,25 @@ PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) static void unicode_dealloc(register PyUnicodeObject *unicode) { + switch (PyUnicode_CHECK_INTERNED(unicode)) { + case SSTATE_NOT_INTERNED: + break; + + case SSTATE_INTERNED_MORTAL: + /* revive dead object temporarily for DelItem */ + unicode->ob_refcnt = 3; + if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) + Py_FatalError( + "deletion of interned unicode string failed"); + break; + + case SSTATE_INTERNED_IMMORTAL: + Py_FatalError("Immortal interned unicode string died."); + + default: + Py_FatalError("Inconsistent interned unicode string state."); + } + if (PyUnicode_CheckExact(unicode) && unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) { /* Keep-Alive optimization */ @@ -8564,6 +8594,115 @@ _PyUnicode_Fini(void) unicode_freelist_size = 0; } +void +PyUnicode_InternInPlace(PyObject **p) +{ + register PyUnicodeObject *s = (PyUnicodeObject *)(*p); + PyObject *t; + if (s == NULL || !PyUnicode_Check(s)) + Py_FatalError( + "PyUnicode_InternInPlace: unicode strings only please!"); + /* If it's a subclass, we don't really know what putting + it in the interned dict might do. 
*/ + if (!PyUnicode_CheckExact(s)) + return; + if (PyUnicode_CHECK_INTERNED(s)) + return; + if (interned == NULL) { + interned = PyDict_New(); + if (interned == NULL) { + PyErr_Clear(); /* Don't leave an exception */ + return; + } + } + t = PyDict_GetItem(interned, (PyObject *)s); + if (t) { + Py_INCREF(t); + Py_DECREF(*p); + *p = t; + return; + } + + if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { + PyErr_Clear(); + return; + } + /* The two references in interned are not counted by refcnt. + The deallocator will take care of this */ + s->ob_refcnt -= 2; + PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL; +} + +void +PyUnicode_InternImmortal(PyObject **p) +{ + PyUnicode_InternInPlace(p); + if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { + PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL; + Py_INCREF(*p); + } +} + +PyObject * +PyUnicode_InternFromString(const char *cp) +{ + PyObject *s = PyUnicode_FromString(cp); + if (s == NULL) + return NULL; + PyUnicode_InternInPlace(&s); + return s; +} + +void _Py_ReleaseInternedUnicodeStrings(void) +{ + PyObject *keys; + PyUnicodeObject *s; + Py_ssize_t i, n; + Py_ssize_t immortal_size = 0, mortal_size = 0; + + if (interned == NULL || !PyDict_Check(interned)) + return; + keys = PyDict_Keys(interned); + if (keys == NULL || !PyList_Check(keys)) { + PyErr_Clear(); + return; + } + + /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak + detector, interned unicode strings are not forcibly deallocated; + rather, we give them their stolen references back, and then clear + and DECREF the interned dict. 
*/ + + n = PyList_GET_SIZE(keys); + fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", + n); + for (i = 0; i < n; i++) { + s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); + switch (s->state) { + case SSTATE_NOT_INTERNED: + /* XXX Shouldn't happen */ + break; + case SSTATE_INTERNED_IMMORTAL: + s->ob_refcnt += 1; + immortal_size += s->length; + break; + case SSTATE_INTERNED_MORTAL: + s->ob_refcnt += 2; + mortal_size += s->length; + break; + default: + Py_FatalError("Inconsistent interned string state."); + } + s->state = SSTATE_NOT_INTERNED; + } + fprintf(stderr, "total size of all interned strings: " + "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " + "mortal/immortal\n", mortal_size, immortal_size); + Py_DECREF(keys); + PyDict_Clear(interned); + Py_DECREF(interned); + interned = NULL; +} /********************* Unicode Iterator **************************/