bpo-46845: Reduce dict size when all keys are Unicode (GH-31564)

This commit is contained in:
Inada Naoki 2022-03-02 08:09:28 +09:00 committed by GitHub
parent 21099fc064
commit 9833bb91e4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 884 additions and 491 deletions

View File

@ -404,6 +404,11 @@ Optimizations
larger *k*).
(Contributed by Serhiy Storchaka in :issue:`37295`.)
* Dicts don't store hash values when all inserted keys are Unicode objects.
This reduces dict size. For example, ``sys.getsizeof(dict.fromkeys("abcdefg"))``
shrinks from 352 bytes to 272 bytes on 64-bit platforms.
(Contributed by Inada Naoki in :issue:`46845`.)
CPython bytecode changes
========================

View File

@ -43,6 +43,11 @@ typedef struct {
PyObject *me_value; /* This field is only meaningful for combined tables */
} PyDictKeyEntry;
/* Entry layout for key tables whose keys are all Unicode objects.
   Each entry holds only the key and value pointers; no hash field is
   stored in the entry itself. */
typedef struct {
PyObject *me_key; /* The key must be Unicode and have hash. */
PyObject *me_value; /* This field is only meaningful for combined tables */
} PyDictUnicodeEntry;
extern PyDictKeysObject *_PyDict_NewKeysForClass(void);
extern PyObject *_PyDict_FromKeys(PyObject *, PyObject *, PyObject *);
@ -70,6 +75,7 @@ extern PyObject *_PyDict_Pop_KnownHash(PyObject *, PyObject *, Py_hash_t, PyObje
#define DKIX_EMPTY (-1)
#define DKIX_DUMMY (-2) /* Used internally */
#define DKIX_ERROR (-3)
#define DKIX_KEY_CHANGED (-4) /* Used internally */
typedef enum {
DICT_KEYS_GENERAL = 0,
@ -114,7 +120,7 @@ struct _dictkeysobject {
Dynamically sized, SIZEOF_VOID_P is minimum. */
char dk_indices[]; /* char is required to avoid strict aliasing. */
/* "PyDictKeyEntry dk_entries[dk_usable];" array follows:
/* "PyDictKeyEntry or PyDictUnicodeEntry dk_entries[USABLE_FRACTION(DK_SIZE(dk))];" array follows:
see the DK_ENTRIES() macro */
};
@ -148,13 +154,20 @@ struct _dictvalues {
2 : sizeof(int32_t))
#endif
#define DK_ENTRIES(dk) \
((PyDictKeyEntry*)(&((int8_t*)((dk)->dk_indices))[(size_t)1 << (dk)->dk_log2_index_bytes]))
(assert(dk->dk_kind == DICT_KEYS_GENERAL), (PyDictKeyEntry*)(&((int8_t*)((dk)->dk_indices))[(size_t)1 << (dk)->dk_log2_index_bytes]))
/* Return a PyDictUnicodeEntry* to the entries array that follows the
   dk_indices area of a keys object. Asserts that the keys object is not
   DICT_KEYS_GENERAL. The argument is parenthesized at every use,
   including inside the assert, so that expressions such as `&keys` or
   `*pkeys` expand correctly. Note: `dk` is evaluated more than once. */
#define DK_UNICODE_ENTRIES(dk) \
    (assert((dk)->dk_kind != DICT_KEYS_GENERAL), \
     (PyDictUnicodeEntry*)(&((int8_t*)((dk)->dk_indices))[(size_t)1 << (dk)->dk_log2_index_bytes]))
#define DK_IS_UNICODE(dk) ((dk)->dk_kind != DICT_KEYS_GENERAL)
extern uint64_t _pydict_global_version;
#define DICT_NEXT_VERSION() (++_pydict_global_version)
extern PyObject *_PyObject_MakeDictFromInstanceAttributes(PyObject *obj, PyDictValues *values);
extern PyObject *_PyDict_FromItems(
PyObject *const *keys, Py_ssize_t keys_offset,
PyObject *const *values, Py_ssize_t values_offset,
Py_ssize_t length);
static inline void
_PyDictValues_AddToInsertionOrder(PyDictValues *values, Py_ssize_t ix)

View File

@ -1346,8 +1346,12 @@ class SizeofTest(unittest.TestCase):
check({}.__iter__, size('2P'))
# empty dict
check({}, size('nQ2P'))
# dict
check({"a": 1}, size('nQ2P') + calcsize(DICT_KEY_STRUCT_FORMAT) + 8 + (8*2//3)*calcsize('n2P'))
# dict (string key)
check({"a": 1}, size('nQ2P') + calcsize(DICT_KEY_STRUCT_FORMAT) + 8 + (8*2//3)*calcsize('2P'))
longdict = {str(i): i for i in range(8)}
check(longdict, size('nQ2P') + calcsize(DICT_KEY_STRUCT_FORMAT) + 16 + (16*2//3)*calcsize('2P'))
# dict (non-string key)
check({1: 1}, size('nQ2P') + calcsize(DICT_KEY_STRUCT_FORMAT) + 8 + (8*2//3)*calcsize('n2P'))
longdict = {1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7, 8:8}
check(longdict, size('nQ2P') + calcsize(DICT_KEY_STRUCT_FORMAT) + 16 + (16*2//3)*calcsize('n2P'))
# dictionary-keyview
@ -1506,14 +1510,14 @@ class SizeofTest(unittest.TestCase):
)
class newstyleclass(object): pass
# Separate block for PyDictKeysObject with 8 keys and 5 entries
check(newstyleclass, s + calcsize(DICT_KEY_STRUCT_FORMAT) + 64 + 42*calcsize("n2P"))
check(newstyleclass, s + calcsize(DICT_KEY_STRUCT_FORMAT) + 64 + 42*calcsize("2P"))
# dict with shared keys
[newstyleclass() for _ in range(100)]
check(newstyleclass().__dict__, size('nQ2P') + self.P)
o = newstyleclass()
o.a = o.b = o.c = o.d = o.e = o.f = o.g = o.h = 1
# Separate block for PyDictKeysObject with 16 keys and 10 entries
check(newstyleclass, s + calcsize(DICT_KEY_STRUCT_FORMAT) + 64 + 42*calcsize("n2P"))
check(newstyleclass, s + calcsize(DICT_KEY_STRUCT_FORMAT) + 64 + 42*calcsize("2P"))
# dict with shared keys
check(newstyleclass().__dict__, size('nQ2P') + self.P)
# unicode

View File

@ -0,0 +1,3 @@
Reduce dict size by removing hash values from the hash table when all inserted
keys are Unicode. For example, ``sys.getsizeof(dict.fromkeys("abcdefg"))``
shrinks from 352 bytes to 272 bytes on 64-bit platforms.

View File

@ -934,26 +934,11 @@ PyObject *
_PyStack_AsDict(PyObject *const *values, PyObject *kwnames)
{
Py_ssize_t nkwargs;
PyObject *kwdict;
Py_ssize_t i;
assert(kwnames != NULL);
nkwargs = PyTuple_GET_SIZE(kwnames);
kwdict = _PyDict_NewPresized(nkwargs);
if (kwdict == NULL) {
return NULL;
}
for (i = 0; i < nkwargs; i++) {
PyObject *key = PyTuple_GET_ITEM(kwnames, i);
PyObject *value = *values++;
/* If key already exists, replace it with the new value */
if (PyDict_SetItem(kwdict, key, value)) {
Py_DECREF(kwdict);
return NULL;
}
}
return kwdict;
return _PyDict_FromItems(&PyTuple_GET_ITEM(kwnames, 0), 1,
values, 1, nkwargs);
}

View File

@ -70,8 +70,8 @@ A values array
Tunable Dictionary Parameters
-----------------------------
See comments for PyDict_MINSIZE_SPLIT, PyDict_MINSIZE_COMBINED,
USABLE_FRACTION and GROWTH_RATE in dictobject.c
See comments for PyDict_MINSIZE, USABLE_FRACTION and GROWTH_RATE in
dictobject.c
Tune-ups should be measured across a broad range of applications and
use cases. A change to any parameter will help in some situations and

File diff suppressed because it is too large Load Diff

View File

@ -1457,7 +1457,7 @@ eval_frame_handle_pending(PyThreadState *tstate)
LOAD_##attr_or_method); \
assert(dict->ma_keys->dk_kind == DICT_KEYS_UNICODE); \
assert(cache0->index < dict->ma_keys->dk_nentries); \
PyDictKeyEntry *ep = DK_ENTRIES(dict->ma_keys) + cache0->index; \
PyDictUnicodeEntry *ep = DK_UNICODE_ENTRIES(dict->ma_keys) + cache0->index; \
res = ep->me_value; \
DEOPT_IF(res == NULL, LOAD_##attr_or_method); \
STAT_INC(LOAD_##attr_or_method, hit); \
@ -1595,6 +1595,19 @@ is_method(PyObject **stack_pointer, int args) {
return PEEK(args+2) != NULL;
}
/* Fetch the me_value slot of entry number `index` in keys object `dk`.
   The entry layout (PyDictUnicodeEntry vs. PyDictKeyEntry) is chosen
   from DK_IS_UNICODE(dk). `index` is not range-checked here, and the
   stored value is returned as-is (callers in this file test it for NULL). */
static PyObject*
dictkeys_get_value_by_index(PyDictKeysObject *dk, int index)
{
    if (!DK_IS_UNICODE(dk)) {
        /* General table: entries are PyDictKeyEntry. */
        return DK_ENTRIES(dk)[index].me_value;
    }
    /* Unicode-only table: entries are PyDictUnicodeEntry. */
    return DK_UNICODE_ENTRIES(dk)[index].me_value;
}
#define KWNAMES_LEN() \
(call_shape.kwnames == NULL ? 0 : ((int)PyTuple_GET_SIZE(call_shape.kwnames)))
@ -3030,8 +3043,7 @@ handle_eval_breaker:
_PyLoadGlobalCache *cache = (_PyLoadGlobalCache *)next_instr;
uint32_t version = read32(&cache->module_keys_version);
DEOPT_IF(dict->ma_keys->dk_version != version, LOAD_GLOBAL);
PyDictKeyEntry *ep = DK_ENTRIES(dict->ma_keys) + cache->index;
PyObject *res = ep->me_value;
PyObject *res = dictkeys_get_value_by_index(dict->ma_keys, cache->index);
DEOPT_IF(res == NULL, LOAD_GLOBAL);
JUMPBY(INLINE_CACHE_ENTRIES_LOAD_GLOBAL);
STAT_INC(LOAD_GLOBAL, hit);
@ -3051,8 +3063,7 @@ handle_eval_breaker:
uint16_t bltn_version = cache->builtin_keys_version;
DEOPT_IF(mdict->ma_keys->dk_version != mod_version, LOAD_GLOBAL);
DEOPT_IF(bdict->ma_keys->dk_version != bltn_version, LOAD_GLOBAL);
PyDictKeyEntry *ep = DK_ENTRIES(bdict->ma_keys) + cache->index;
PyObject *res = ep->me_value;
PyObject *res = dictkeys_get_value_by_index(bdict->ma_keys, cache->index);
DEOPT_IF(res == NULL, LOAD_GLOBAL);
JUMPBY(INLINE_CACHE_ENTRIES_LOAD_GLOBAL);
STAT_INC(LOAD_GLOBAL, hit);
@ -3272,20 +3283,12 @@ handle_eval_breaker:
}
TARGET(BUILD_MAP) {
Py_ssize_t i;
PyObject *map = _PyDict_NewPresized((Py_ssize_t)oparg);
PyObject *map = _PyDict_FromItems(
&PEEK(2*oparg), 2,
&PEEK(2*oparg - 1), 2,
oparg);
if (map == NULL)
goto error;
for (i = oparg; i > 0; i--) {
int err;
PyObject *key = PEEK(2*i);
PyObject *value = PEEK(2*i - 1);
err = PyDict_SetItem(map, key, value);
if (err != 0) {
Py_DECREF(map);
goto error;
}
}
while (oparg--) {
Py_DECREF(POP());
@ -3351,7 +3354,6 @@ handle_eval_breaker:
}
TARGET(BUILD_CONST_KEY_MAP) {
Py_ssize_t i;
PyObject *map;
PyObject *keys = TOP();
if (!PyTuple_CheckExact(keys) ||
@ -3360,20 +3362,12 @@ handle_eval_breaker:
"bad BUILD_CONST_KEY_MAP keys argument");
goto error;
}
map = _PyDict_NewPresized((Py_ssize_t)oparg);
map = _PyDict_FromItems(
&PyTuple_GET_ITEM(keys, 0), 1,
&PEEK(oparg + 1), 1, oparg);
if (map == NULL) {
goto error;
}
for (i = oparg; i > 0; i--) {
int err;
PyObject *key = PyTuple_GET_ITEM(keys, oparg - i);
PyObject *value = PEEK(i + 1);
err = PyDict_SetItem(map, key, value);
if (err != 0) {
Py_DECREF(map);
goto error;
}
}
Py_DECREF(POP());
while (oparg--) {
@ -3538,9 +3532,16 @@ handle_eval_breaker:
PyObject *name = GETITEM(names, cache0->original_oparg);
uint16_t hint = cache0->index;
DEOPT_IF(hint >= (size_t)dict->ma_keys->dk_nentries, LOAD_ATTR);
if (DK_IS_UNICODE(dict->ma_keys)) {
PyDictUnicodeEntry *ep = DK_UNICODE_ENTRIES(dict->ma_keys) + hint;
DEOPT_IF(ep->me_key != name, LOAD_ATTR);
res = ep->me_value;
}
else {
PyDictKeyEntry *ep = DK_ENTRIES(dict->ma_keys) + hint;
DEOPT_IF(ep->me_key != name, LOAD_ATTR);
res = ep->me_value;
}
DEOPT_IF(res == NULL, LOAD_ATTR);
STAT_INC(LOAD_ATTR, hit);
Py_INCREF(res);
@ -3630,15 +3631,27 @@ handle_eval_breaker:
PyObject *name = GETITEM(names, cache0->original_oparg);
uint16_t hint = cache0->index;
DEOPT_IF(hint >= (size_t)dict->ma_keys->dk_nentries, STORE_ATTR);
PyObject *value, *old_value;
if (DK_IS_UNICODE(dict->ma_keys)) {
PyDictUnicodeEntry *ep = DK_UNICODE_ENTRIES(dict->ma_keys) + hint;
DEOPT_IF(ep->me_key != name, STORE_ATTR);
old_value = ep->me_value;
DEOPT_IF(old_value == NULL, STORE_ATTR);
STACK_SHRINK(1);
value = POP();
ep->me_value = value;
}
else {
PyDictKeyEntry *ep = DK_ENTRIES(dict->ma_keys) + hint;
DEOPT_IF(ep->me_key != name, STORE_ATTR);
PyObject *old_value = ep->me_value;
old_value = ep->me_value;
DEOPT_IF(old_value == NULL, STORE_ATTR);
STAT_INC(STORE_ATTR, hit);
STACK_SHRINK(1);
PyObject *value = POP();
value = POP();
ep->me_value = value;
}
Py_DECREF(old_value);
STAT_INC(STORE_ATTR, hit);
/* Ensure dict is GC tracked if it needs to be */
if (!_PyObject_GC_IS_TRACKED(dict) && _PyObject_GC_MAY_BE_TRACKED(value)) {
_PyObject_GC_TRACK(dict);

View File

@ -787,12 +787,6 @@ class PyDictObjectPtr(PyObjectPtr):
def _get_entries(keys):
dk_nentries = int(keys['dk_nentries'])
dk_size = 1<<int(keys['dk_log2_size'])
try:
# <= Python 3.5
return keys['dk_entries'], dk_size
except RuntimeError:
# >= Python 3.6
pass
if dk_size <= 0xFF:
offset = dk_size
@ -805,7 +799,10 @@ class PyDictObjectPtr(PyObjectPtr):
ent_addr = keys['dk_indices'].address
ent_addr = ent_addr.cast(_type_unsigned_char_ptr()) + offset
if int(keys['dk_kind']) == 0: # DICT_KEYS_GENERAL
ent_ptr_t = gdb.lookup_type('PyDictKeyEntry').pointer()
else:
ent_ptr_t = gdb.lookup_type('PyDictUnicodeEntry').pointer()
ent_addr = ent_addr.cast(ent_ptr_t)
return ent_addr, dk_nentries