Implement PEP 412: Key-sharing dictionaries (closes #13903)
Patch from Mark Shannon.
This commit is contained in:
parent
80d07f8251
commit
7d95e40721
|
@ -13,78 +13,20 @@ extern "C" {
|
||||||
tuning dictionaries, and several ideas for possible optimizations.
|
tuning dictionaries, and several ideas for possible optimizations.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/*
|
|
||||||
There are three kinds of slots in the table:
|
|
||||||
|
|
||||||
1. Unused. me_key == me_value == NULL
|
|
||||||
Does not hold an active (key, value) pair now and never did. Unused can
|
|
||||||
transition to Active upon key insertion. This is the only case in which
|
|
||||||
me_key is NULL, and is each slot's initial state.
|
|
||||||
|
|
||||||
2. Active. me_key != NULL and me_key != dummy and me_value != NULL
|
|
||||||
Holds an active (key, value) pair. Active can transition to Dummy upon
|
|
||||||
key deletion. This is the only case in which me_value != NULL.
|
|
||||||
|
|
||||||
3. Dummy. me_key == dummy and me_value == NULL
|
|
||||||
Previously held an active (key, value) pair, but that was deleted and an
|
|
||||||
active pair has not yet overwritten the slot. Dummy can transition to
|
|
||||||
Active upon key insertion. Dummy slots cannot be made Unused again
|
|
||||||
(cannot have me_key set to NULL), else the probe sequence in case of
|
|
||||||
collision would have no way to know they were once active.
|
|
||||||
|
|
||||||
Note: .popitem() abuses the me_hash field of an Unused or Dummy slot to
|
|
||||||
hold a search finger. The me_hash field of Unused or Dummy slots has no
|
|
||||||
meaning otherwise.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* PyDict_MINSIZE is the minimum size of a dictionary. This many slots are
|
|
||||||
* allocated directly in the dict object (in the ma_smalltable member).
|
|
||||||
* It must be a power of 2, and at least 4. 8 allows dicts with no more
|
|
||||||
* than 5 active entries to live in ma_smalltable (and so avoid an
|
|
||||||
* additional malloc); instrumentation suggested this suffices for the
|
|
||||||
* majority of dicts (consisting mostly of usually-small instance dicts and
|
|
||||||
* usually-small dicts created to pass keyword arguments).
|
|
||||||
*/
|
|
||||||
#ifndef Py_LIMITED_API
|
#ifndef Py_LIMITED_API
|
||||||
#define PyDict_MINSIZE 8
|
|
||||||
|
|
||||||
|
typedef struct _dictkeysobject PyDictKeysObject;
|
||||||
|
|
||||||
|
/* The ma_values pointer is NULL for a combined table
|
||||||
|
* or points to an array of PyObject* for a split table
|
||||||
|
*/
|
||||||
typedef struct {
|
typedef struct {
|
||||||
/* Cached hash code of me_key. */
|
|
||||||
Py_hash_t me_hash;
|
|
||||||
PyObject *me_key;
|
|
||||||
PyObject *me_value;
|
|
||||||
} PyDictEntry;
|
|
||||||
|
|
||||||
/*
|
|
||||||
To ensure the lookup algorithm terminates, there must be at least one Unused
|
|
||||||
slot (NULL key) in the table.
|
|
||||||
The value ma_fill is the number of non-NULL keys (sum of Active and Dummy);
|
|
||||||
ma_used is the number of non-NULL, non-dummy keys (== the number of non-NULL
|
|
||||||
values == the number of Active items).
|
|
||||||
To avoid slowing down lookups on a near-full table, we resize the table when
|
|
||||||
it's two-thirds full.
|
|
||||||
*/
|
|
||||||
typedef struct _dictobject PyDictObject;
|
|
||||||
struct _dictobject {
|
|
||||||
PyObject_HEAD
|
PyObject_HEAD
|
||||||
Py_ssize_t ma_fill; /* # Active + # Dummy */
|
Py_ssize_t ma_used;
|
||||||
Py_ssize_t ma_used; /* # Active */
|
PyDictKeysObject *ma_keys;
|
||||||
|
PyObject **ma_values;
|
||||||
|
} PyDictObject;
|
||||||
|
|
||||||
/* The table contains ma_mask + 1 slots, and that's a power of 2.
|
|
||||||
* We store the mask instead of the size because the mask is more
|
|
||||||
* frequently needed.
|
|
||||||
*/
|
|
||||||
Py_ssize_t ma_mask;
|
|
||||||
|
|
||||||
/* ma_table points to ma_smalltable for small tables, else to
|
|
||||||
* additional malloc'ed memory. ma_table is never NULL! This rule
|
|
||||||
* saves repeated runtime null-tests in the workhorse getitem and
|
|
||||||
* setitem calls.
|
|
||||||
*/
|
|
||||||
PyDictEntry *ma_table;
|
|
||||||
PyDictEntry *(*ma_lookup)(PyDictObject *mp, PyObject *key, Py_hash_t hash);
|
|
||||||
PyDictEntry ma_smalltable[PyDict_MINSIZE];
|
|
||||||
};
|
|
||||||
#endif /* Py_LIMITED_API */
|
#endif /* Py_LIMITED_API */
|
||||||
|
|
||||||
PyAPI_DATA(PyTypeObject) PyDict_Type;
|
PyAPI_DATA(PyTypeObject) PyDict_Type;
|
||||||
|
@ -117,6 +59,8 @@ PyAPI_FUNC(void) PyDict_Clear(PyObject *mp);
|
||||||
PyAPI_FUNC(int) PyDict_Next(
|
PyAPI_FUNC(int) PyDict_Next(
|
||||||
PyObject *mp, Py_ssize_t *pos, PyObject **key, PyObject **value);
|
PyObject *mp, Py_ssize_t *pos, PyObject **key, PyObject **value);
|
||||||
#ifndef Py_LIMITED_API
|
#ifndef Py_LIMITED_API
|
||||||
|
PyDictKeysObject *_PyDict_NewKeysForClass(void);
|
||||||
|
PyAPI_FUNC(PyObject *) PyObject_GenericGetDict(PyObject *, void *);
|
||||||
PyAPI_FUNC(int) _PyDict_Next(
|
PyAPI_FUNC(int) _PyDict_Next(
|
||||||
PyObject *mp, Py_ssize_t *pos, PyObject **key, PyObject **value, Py_hash_t *hash);
|
PyObject *mp, Py_ssize_t *pos, PyObject **key, PyObject **value, Py_hash_t *hash);
|
||||||
#endif
|
#endif
|
||||||
|
@ -131,6 +75,7 @@ PyAPI_FUNC(int) _PyDict_Contains(PyObject *mp, PyObject *key, Py_hash_t hash);
|
||||||
PyAPI_FUNC(PyObject *) _PyDict_NewPresized(Py_ssize_t minused);
|
PyAPI_FUNC(PyObject *) _PyDict_NewPresized(Py_ssize_t minused);
|
||||||
PyAPI_FUNC(void) _PyDict_MaybeUntrack(PyObject *mp);
|
PyAPI_FUNC(void) _PyDict_MaybeUntrack(PyObject *mp);
|
||||||
PyAPI_FUNC(int) _PyDict_HasOnlyStringKeys(PyObject *mp);
|
PyAPI_FUNC(int) _PyDict_HasOnlyStringKeys(PyObject *mp);
|
||||||
|
#define _PyDict_HasSplitTable(d) ((d)->ma_values != NULL)
|
||||||
|
|
||||||
PyAPI_FUNC(int) PyDict_ClearFreeList(void);
|
PyAPI_FUNC(int) PyDict_ClearFreeList(void);
|
||||||
#endif
|
#endif
|
||||||
|
@ -162,6 +107,11 @@ PyAPI_FUNC(int) PyDict_SetItemString(PyObject *dp, const char *key, PyObject *it
|
||||||
PyAPI_FUNC(int) _PyDict_SetItemId(PyObject *dp, struct _Py_Identifier *key, PyObject *item);
|
PyAPI_FUNC(int) _PyDict_SetItemId(PyObject *dp, struct _Py_Identifier *key, PyObject *item);
|
||||||
PyAPI_FUNC(int) PyDict_DelItemString(PyObject *dp, const char *key);
|
PyAPI_FUNC(int) PyDict_DelItemString(PyObject *dp, const char *key);
|
||||||
|
|
||||||
|
#ifndef Py_LIMITED_API
|
||||||
|
int _PyObjectDict_SetItem(PyTypeObject *tp, PyObject **dictptr, PyObject *name, PyObject *value);
|
||||||
|
PyObject *_PyDict_LoadGlobal(PyDictObject *, PyDictObject *, PyObject *);
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -449,6 +449,7 @@ typedef struct _heaptypeobject {
|
||||||
see add_operators() in typeobject.c . */
|
see add_operators() in typeobject.c . */
|
||||||
PyBufferProcs as_buffer;
|
PyBufferProcs as_buffer;
|
||||||
PyObject *ht_name, *ht_slots, *ht_qualname;
|
PyObject *ht_name, *ht_slots, *ht_qualname;
|
||||||
|
struct _dictkeysobject *ht_cached_keys;
|
||||||
/* here are optional user slots, followed by the members. */
|
/* here are optional user slots, followed by the members. */
|
||||||
} PyHeapTypeObject;
|
} PyHeapTypeObject;
|
||||||
|
|
||||||
|
@ -517,7 +518,6 @@ PyAPI_FUNC(PyObject *) _PyObject_NextNotImplemented(PyObject *);
|
||||||
PyAPI_FUNC(PyObject *) PyObject_GenericGetAttr(PyObject *, PyObject *);
|
PyAPI_FUNC(PyObject *) PyObject_GenericGetAttr(PyObject *, PyObject *);
|
||||||
PyAPI_FUNC(int) PyObject_GenericSetAttr(PyObject *,
|
PyAPI_FUNC(int) PyObject_GenericSetAttr(PyObject *,
|
||||||
PyObject *, PyObject *);
|
PyObject *, PyObject *);
|
||||||
PyAPI_FUNC(PyObject *) PyObject_GenericGetDict(PyObject *, void *);
|
|
||||||
PyAPI_FUNC(int) PyObject_GenericSetDict(PyObject *, PyObject *, void *);
|
PyAPI_FUNC(int) PyObject_GenericSetDict(PyObject *, PyObject *, void *);
|
||||||
PyAPI_FUNC(Py_hash_t) PyObject_Hash(PyObject *);
|
PyAPI_FUNC(Py_hash_t) PyObject_Hash(PyObject *);
|
||||||
PyAPI_FUNC(Py_hash_t) PyObject_HashNotImplemented(PyObject *);
|
PyAPI_FUNC(Py_hash_t) PyObject_HashNotImplemented(PyObject *);
|
||||||
|
|
|
@ -321,6 +321,27 @@ class DictTest(unittest.TestCase):
|
||||||
self.assertEqual(hashed2.hash_count, 1)
|
self.assertEqual(hashed2.hash_count, 1)
|
||||||
self.assertEqual(hashed1.eq_count + hashed2.eq_count, 1)
|
self.assertEqual(hashed1.eq_count + hashed2.eq_count, 1)
|
||||||
|
|
||||||
|
def test_setitem_atomic_at_resize(self):
|
||||||
|
class Hashed(object):
|
||||||
|
def __init__(self):
|
||||||
|
self.hash_count = 0
|
||||||
|
self.eq_count = 0
|
||||||
|
def __hash__(self):
|
||||||
|
self.hash_count += 1
|
||||||
|
return 42
|
||||||
|
def __eq__(self, other):
|
||||||
|
self.eq_count += 1
|
||||||
|
return id(self) == id(other)
|
||||||
|
hashed1 = Hashed()
|
||||||
|
# 5 items
|
||||||
|
y = {hashed1: 5, 0: 0, 1: 1, 2: 2, 3: 3}
|
||||||
|
hashed2 = Hashed()
|
||||||
|
# 6th item forces a resize
|
||||||
|
y[hashed2] = []
|
||||||
|
self.assertEqual(hashed1.hash_count, 1)
|
||||||
|
self.assertEqual(hashed2.hash_count, 1)
|
||||||
|
self.assertEqual(hashed1.eq_count + hashed2.eq_count, 1)
|
||||||
|
|
||||||
def test_popitem(self):
|
def test_popitem(self):
|
||||||
# dict.popitem()
|
# dict.popitem()
|
||||||
for copymode in -1, +1:
|
for copymode in -1, +1:
|
||||||
|
|
|
@ -219,6 +219,8 @@ class QueryTestCase(unittest.TestCase):
|
||||||
others.should.not.be: like.this}"""
|
others.should.not.be: like.this}"""
|
||||||
self.assertEqual(DottedPrettyPrinter().pformat(o), exp)
|
self.assertEqual(DottedPrettyPrinter().pformat(o), exp)
|
||||||
|
|
||||||
|
@unittest.expectedFailure
|
||||||
|
#See http://bugs.python.org/issue13907
|
||||||
@test.support.cpython_only
|
@test.support.cpython_only
|
||||||
def test_set_reprs(self):
|
def test_set_reprs(self):
|
||||||
# This test creates a complex arrangement of frozensets and
|
# This test creates a complex arrangement of frozensets and
|
||||||
|
@ -241,10 +243,12 @@ class QueryTestCase(unittest.TestCase):
|
||||||
# Consequently, this test is fragile and
|
# Consequently, this test is fragile and
|
||||||
# implementation-dependent. Small changes to Python's sort
|
# implementation-dependent. Small changes to Python's sort
|
||||||
# algorithm cause the test to fail when it should pass.
|
# algorithm cause the test to fail when it should pass.
|
||||||
|
# XXX Or changes to the dictionary implementation...
|
||||||
|
|
||||||
self.assertEqual(pprint.pformat(set()), 'set()')
|
self.assertEqual(pprint.pformat(set()), 'set()')
|
||||||
self.assertEqual(pprint.pformat(set(range(3))), '{0, 1, 2}')
|
self.assertEqual(pprint.pformat(set(range(3))), '{0, 1, 2}')
|
||||||
self.assertEqual(pprint.pformat(frozenset()), 'frozenset()')
|
self.assertEqual(pprint.pformat(frozenset()), 'frozenset()')
|
||||||
|
|
||||||
self.assertEqual(pprint.pformat(frozenset(range(3))), 'frozenset({0, 1, 2})')
|
self.assertEqual(pprint.pformat(frozenset(range(3))), 'frozenset({0, 1, 2})')
|
||||||
cube_repr_tgt = """\
|
cube_repr_tgt = """\
|
||||||
{frozenset(): frozenset({frozenset({2}), frozenset({0}), frozenset({1})}),
|
{frozenset(): frozenset({frozenset({2}), frozenset({0}), frozenset({1})}),
|
||||||
|
|
|
@ -687,9 +687,9 @@ class SizeofTest(unittest.TestCase):
|
||||||
# method-wrapper (descriptor object)
|
# method-wrapper (descriptor object)
|
||||||
check({}.__iter__, size(h + '2P'))
|
check({}.__iter__, size(h + '2P'))
|
||||||
# dict
|
# dict
|
||||||
check({}, size(h + '3P2P' + 8*'P2P'))
|
check({}, size(h + '3P' + '4P' + 8*'P2P'))
|
||||||
longdict = {1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7, 8:8}
|
longdict = {1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7, 8:8}
|
||||||
check(longdict, size(h + '3P2P' + 8*'P2P') + 16*size('P2P'))
|
check(longdict, size(h + '3P' + '4P') + 16*size('P2P'))
|
||||||
# dictionary-keyiterator
|
# dictionary-keyiterator
|
||||||
check({}.keys(), size(h + 'P'))
|
check({}.keys(), size(h + 'P'))
|
||||||
# dictionary-valueiterator
|
# dictionary-valueiterator
|
||||||
|
@ -831,7 +831,7 @@ class SizeofTest(unittest.TestCase):
|
||||||
# type
|
# type
|
||||||
# (PyTypeObject + PyNumberMethods + PyMappingMethods +
|
# (PyTypeObject + PyNumberMethods + PyMappingMethods +
|
||||||
# PySequenceMethods + PyBufferProcs)
|
# PySequenceMethods + PyBufferProcs)
|
||||||
s = size(vh + 'P2P15Pl4PP9PP11PI') + size('16Pi17P 3P 10P 2P 3P')
|
s = size(vh + 'P2P15Pl4PP9PP11PIP') + size('16Pi17P 3P 10P 2P 3P')
|
||||||
check(int, s)
|
check(int, s)
|
||||||
# class
|
# class
|
||||||
class newstyleclass(object): pass
|
class newstyleclass(object): pass
|
||||||
|
|
|
@ -10,6 +10,10 @@ What's New in Python 3.3.0 Alpha 3?
|
||||||
Core and Builtins
|
Core and Builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- Issue #13903: Implement PEP 412. Individual dictionary instances can now share
|
||||||
|
their keys with other dictionaries. Classes take advantage of this to share
|
||||||
|
their instance dictionary keys for improved memory and performance.
|
||||||
|
|
||||||
- Issue #14630: Fix a memory access bug for instances of a subclass of int
|
- Issue #14630: Fix a memory access bug for instances of a subclass of int
|
||||||
with value 0.
|
with value 0.
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
NOTES ON OPTIMIZING DICTIONARIES
|
NOTES ON DICTIONARIES
|
||||||
================================
|
================================
|
||||||
|
|
||||||
|
|
||||||
Principal Use Cases for Dictionaries
|
Principal Use Cases for Dictionaries
|
||||||
------------------------------------
|
------------------------------------
|
||||||
|
|
||||||
|
@ -21,7 +20,7 @@ Instance attribute lookup and Global variables
|
||||||
|
|
||||||
Builtins
|
Builtins
|
||||||
Frequent reads. Almost never written.
|
Frequent reads. Almost never written.
|
||||||
Size 126 interned strings (as of Py2.3b1).
|
About 150 interned strings (as of Py3.3).
|
||||||
A few keys are accessed much more frequently than others.
|
A few keys are accessed much more frequently than others.
|
||||||
|
|
||||||
Uniquification
|
Uniquification
|
||||||
|
@ -59,44 +58,43 @@ Dynamic Mappings
|
||||||
Characterized by deletions interspersed with adds and replacements.
|
Characterized by deletions interspersed with adds and replacements.
|
||||||
Performance benefits greatly from the re-use of dummy entries.
|
Performance benefits greatly from the re-use of dummy entries.
|
||||||
|
|
||||||
|
Data Layout
|
||||||
|
-----------
|
||||||
|
|
||||||
Data Layout (assuming a 32-bit box with 64 bytes per cache line)
|
Dictionaries are composed of 3 components:
|
||||||
----------------------------------------------------------------
|
The dictobject struct itself
|
||||||
|
A dict-keys object (keys & hashes)
|
||||||
Smalldicts (8 entries) are attached to the dictobject structure
|
A values array
|
||||||
and the whole group nearly fills two consecutive cache lines.
|
|
||||||
|
|
||||||
Larger dicts use the first half of the dictobject structure (one cache
|
|
||||||
line) and a separate, continuous block of entries (at 12 bytes each
|
|
||||||
for a total of 5.333 entries per cache line).
|
|
||||||
|
|
||||||
|
|
||||||
Tunable Dictionary Parameters
|
Tunable Dictionary Parameters
|
||||||
-----------------------------
|
-----------------------------
|
||||||
|
|
||||||
* PyDict_MINSIZE. Currently set to 8.
|
* PyDict_STARTSIZE. Starting size of dict (unless an instance dict).
|
||||||
Must be a power of two. New dicts have to zero-out every cell.
|
Currently set to 8. Must be a power of two.
|
||||||
Each additional 8 consumes 1.5 cache lines. Increasing improves
|
New dicts have to zero-out every cell.
|
||||||
the sparseness of small dictionaries but costs time to read in
|
Increasing improves the sparseness of small dictionaries but costs
|
||||||
the additional cache lines if they are not already in cache.
|
time to read in the additional cache lines if they are not already
|
||||||
That case is common when keyword arguments are passed.
|
in cache. That case is common when keyword arguments are passed.
|
||||||
|
Prior to version 3.3, PyDict_MINSIZE was used as the starting size
|
||||||
|
of a new dict.
|
||||||
|
|
||||||
* Maximum dictionary load in PyDict_SetItem. Currently set to 2/3.
|
* PyDict_MINSIZE. Minimum size of a dict.
|
||||||
Increasing this ratio makes dictionaries more dense resulting
|
Currently set to 4 (to keep instance dicts small).
|
||||||
in more collisions. Decreasing it improves sparseness at the
|
Must be a power of two. Prior to version 3.3, PyDict_MINSIZE was
|
||||||
expense of spreading entries over more cache lines and at the
|
set to 8.
|
||||||
|
|
||||||
|
* USABLE_FRACTION. Maximum dictionary load in PyDict_SetItem.
|
||||||
|
Currently set to 2/3. Increasing this ratio makes dictionaries more
|
||||||
|
dense resulting in more collisions. Decreasing it improves sparseness
|
||||||
|
at the expense of spreading entries over more cache lines and at the
|
||||||
cost of total memory consumed.
|
cost of total memory consumed.
|
||||||
|
|
||||||
The load test occurs in highly time sensitive code. Efforts
|
|
||||||
to make the test more complex (for example, varying the load
|
|
||||||
for different sizes) have degraded performance.
|
|
||||||
|
|
||||||
* Growth rate upon hitting maximum load. Currently set to *2.
|
* Growth rate upon hitting maximum load. Currently set to *2.
|
||||||
Raising this to *4 results in half the number of resizes,
|
Raising this to *4 results in half the number of resizes, less
|
||||||
less effort to resize, better sparseness for some (but not
|
effort to resize, better sparseness for some (but not all) dict sizes,
|
||||||
all dict sizes), and potentially doubles memory consumption
|
and potentially doubles memory consumption depending on the size of
|
||||||
depending on the size of the dictionary. Setting to *4
|
the dictionary. Setting to *4 eliminates every other resize step.
|
||||||
eliminates every other resize step.
|
|
||||||
|
|
||||||
* Maximum sparseness (minimum dictionary load). What percentage
|
* Maximum sparseness (minimum dictionary load). What percentage
|
||||||
of entries can be unused before the dictionary shrinks to
|
of entries can be unused before the dictionary shrinks to
|
||||||
|
@ -135,136 +133,51 @@ by repeatedly invoking .pop will see no resizing, which might
|
||||||
not be necessary at all because the dictionary is eventually
|
not be necessary at all because the dictionary is eventually
|
||||||
discarded entirely.
|
discarded entirely.
|
||||||
|
|
||||||
|
The key differences between this implementation and earlier versions are:
|
||||||
|
1. The table can be split into two parts, the keys and the values.
|
||||||
|
|
||||||
|
2. There is an additional key-value combination: (key, NULL).
|
||||||
|
Unlike (<dummy>, NULL) which represents a deleted value, (key, NULL)
|
||||||
|
represents a yet-to-be-inserted value. This combination can only occur
|
||||||
|
when the table is split.
|
||||||
|
|
||||||
|
3. No small table embedded in the dict,
|
||||||
|
as this would make sharing of key-tables impossible.
|
||||||
|
|
||||||
|
|
||||||
|
These changes have the following consequences.
|
||||||
|
1. General dictionaries are slightly larger.
|
||||||
|
|
||||||
|
2. All object dictionaries of a single class can share a single key-table,
|
||||||
|
saving about 60% memory for such cases.
|
||||||
|
|
||||||
Results of Cache Locality Experiments
|
Results of Cache Locality Experiments
|
||||||
-------------------------------------
|
--------------------------------------
|
||||||
|
|
||||||
When an entry is retrieved from memory, 4.333 adjacent entries are also
|
Experiments on an earlier design of dictionary, in which all tables were
|
||||||
retrieved into a cache line. Since accessing items in cache is *much*
|
combined, showed the following:
|
||||||
cheaper than a cache miss, an enticing idea is to probe the adjacent
|
|
||||||
entries as a first step in collision resolution. Unfortunately, the
|
|
||||||
introduction of any regularity into collision searches results in more
|
|
||||||
collisions than the current random chaining approach.
|
|
||||||
|
|
||||||
Exploiting cache locality at the expense of additional collisions fails
|
When an entry is retrieved from memory, several adjacent entries are also
|
||||||
to pay off when the entries are already loaded in cache (the expense
|
retrieved into a cache line. Since accessing items in cache is *much*
|
||||||
is paid with no compensating benefit). This occurs in small dictionaries
|
cheaper than a cache miss, an enticing idea is to probe the adjacent
|
||||||
where the whole dictionary fits into a pair of cache lines. It also
|
entries as a first step in collision resolution. Unfortunately, the
|
||||||
occurs frequently in large dictionaries which have a common access pattern
|
introduction of any regularity into collision searches results in more
|
||||||
where some keys are accessed much more frequently than others. The
|
collisions than the current random chaining approach.
|
||||||
more popular entries *and* their collision chains tend to remain in cache.
|
|
||||||
|
|
||||||
To exploit cache locality, change the collision resolution section
|
Exploiting cache locality at the expense of additional collisions fails
|
||||||
in lookdict() and lookdict_string(). Set i^=1 at the top of the
|
to pay off when the entries are already loaded in cache (the expense
|
||||||
loop and move the i = (i << 2) + i + perturb + 1 to an unrolled
|
is paid with no compensating benefit). This occurs in small dictionaries
|
||||||
version of the loop.
|
where the whole dictionary fits into a pair of cache lines. It also
|
||||||
|
occurs frequently in large dictionaries which have a common access pattern
|
||||||
|
where some keys are accessed much more frequently than others. The
|
||||||
|
more popular entries *and* their collision chains tend to remain in cache.
|
||||||
|
|
||||||
This optimization strategy can be leveraged in several ways:
|
To exploit cache locality, change the collision resolution section
|
||||||
|
in lookdict() and lookdict_string(). Set i^=1 at the top of the
|
||||||
|
loop and move the i = (i << 2) + i + perturb + 1 to an unrolled
|
||||||
|
version of the loop.
|
||||||
|
|
||||||
* If the dictionary is kept sparse (through the tunable parameters),
|
For split tables, the above will apply to the keys, but the value will
|
||||||
then the occurrence of additional collisions is lessened.
|
always be in a different cache line from the key.
|
||||||
|
|
||||||
* If lookdict() and lookdict_string() are specialized for small dicts
|
|
||||||
and for largedicts, then the versions for large_dicts can be given
|
|
||||||
an alternate search strategy without increasing collisions in small dicts
|
|
||||||
which already have the maximum benefit of cache locality.
|
|
||||||
|
|
||||||
* If the use case for a dictionary is known to have a random key
|
|
||||||
access pattern (as opposed to a more common pattern with a Zipf's law
|
|
||||||
distribution), then there will be more benefit for large dictionaries
|
|
||||||
because any given key is no more likely than another to already be
|
|
||||||
in cache.
|
|
||||||
|
|
||||||
* In use cases with paired accesses to the same key, the second access
|
|
||||||
is always in cache and gets no benefit from efforts to further improve
|
|
||||||
cache locality.
|
|
||||||
|
|
||||||
Optimizing the Search of Small Dictionaries
|
|
||||||
-------------------------------------------
|
|
||||||
|
|
||||||
If lookdict() and lookdict_string() are specialized for smaller dictionaries,
|
|
||||||
then a custom search approach can be implemented that exploits the small
|
|
||||||
search space and cache locality.
|
|
||||||
|
|
||||||
* The simplest example is a linear search of contiguous entries. This is
|
|
||||||
simple to implement, guaranteed to terminate rapidly, never searches
|
|
||||||
the same entry twice, and precludes the need to check for dummy entries.
|
|
||||||
|
|
||||||
* A more advanced example is a self-organizing search so that the most
|
|
||||||
frequently accessed entries get probed first. The organization
|
|
||||||
adapts if the access pattern changes over time. Treaps are ideally
|
|
||||||
suited for self-organization with the most common entries at the
|
|
||||||
top of the heap and a rapid binary search pattern. Most probes and
|
|
||||||
results are all located at the top of the tree allowing them all to
|
|
||||||
be located in one or two cache lines.
|
|
||||||
|
|
||||||
* Also, small dictionaries may be made more dense, perhaps filling all
|
|
||||||
eight cells to take the maximum advantage of two cache lines.
|
|
||||||
|
|
||||||
|
|
||||||
Strategy Pattern
|
|
||||||
----------------
|
|
||||||
|
|
||||||
Consider allowing the user to set the tunable parameters or to select a
|
|
||||||
particular search method. Since some dictionary use cases have known
|
|
||||||
sizes and access patterns, the user may be able to provide useful hints.
|
|
||||||
|
|
||||||
1) For example, if membership testing or lookups dominate runtime and memory
|
|
||||||
is not at a premium, the user may benefit from setting the maximum load
|
|
||||||
ratio at 5% or 10% instead of the usual 66.7%. This will sharply
|
|
||||||
curtail the number of collisions but will increase iteration time.
|
|
||||||
The builtin namespace is a prime example of a dictionary that can
|
|
||||||
benefit from being highly sparse.
|
|
||||||
|
|
||||||
2) Dictionary creation time can be shortened in cases where the ultimate
|
|
||||||
size of the dictionary is known in advance. The dictionary can be
|
|
||||||
pre-sized so that no resize operations are required during creation.
|
|
||||||
Not only does this save resizes, but the key insertion will go
|
|
||||||
more quickly because the first half of the keys will be inserted into
|
|
||||||
a more sparse environment than before. The preconditions for this
|
|
||||||
strategy arise whenever a dictionary is created from a key or item
|
|
||||||
sequence and the number of *unique* keys is known.
|
|
||||||
|
|
||||||
3) If the key space is large and the access pattern is known to be random,
|
|
||||||
then search strategies exploiting cache locality can be fruitful.
|
|
||||||
The preconditions for this strategy arise in simulations and
|
|
||||||
numerical analysis.
|
|
||||||
|
|
||||||
4) If the keys are fixed and the access pattern strongly favors some of
|
|
||||||
the keys, then the entries can be stored contiguously and accessed
|
|
||||||
with a linear search or treap. This exploits knowledge of the data,
|
|
||||||
cache locality, and a simplified search routine. It also eliminates
|
|
||||||
the need to test for dummy entries on each probe. The preconditions
|
|
||||||
for this strategy arise in symbol tables and in the builtin dictionary.
|
|
||||||
|
|
||||||
|
|
||||||
Readonly Dictionaries
|
|
||||||
---------------------
|
|
||||||
Some dictionary use cases pass through a build stage and then move to a
|
|
||||||
more heavily exercised lookup stage with no further changes to the
|
|
||||||
dictionary.
|
|
||||||
|
|
||||||
An idea that emerged on python-dev is to be able to convert a dictionary
|
|
||||||
to a read-only state. This can help prevent programming errors and also
|
|
||||||
provide knowledge that can be exploited for lookup optimization.
|
|
||||||
|
|
||||||
The dictionary can be immediately rebuilt (eliminating dummy entries),
|
|
||||||
resized (to an appropriate level of sparseness), and the keys can be
|
|
||||||
jostled (to minimize collisions). The lookdict() routine can then
|
|
||||||
eliminate the test for dummy entries (saving about 1/4 of the time
|
|
||||||
spent in the collision resolution loop).
|
|
||||||
|
|
||||||
An additional possibility is to insert links into the empty spaces
|
|
||||||
so that dictionary iteration can proceed in len(d) steps instead of
|
|
||||||
(mp->mask + 1) steps. Alternatively, a separate tuple of keys can be
|
|
||||||
kept just for iteration.
|
|
||||||
|
|
||||||
|
|
||||||
Caching Lookups
|
|
||||||
---------------
|
|
||||||
The idea is to exploit key access patterns by anticipating future lookups
|
|
||||||
based on previous lookups.
|
|
||||||
|
|
||||||
The simplest incarnation is to save the most recently accessed entry.
|
|
||||||
This gives optimal performance for use cases where every get is followed
|
|
||||||
by a set or del to the same key.
|
|
||||||
|
|
1771
Objects/dictobject.c
1771
Objects/dictobject.c
File diff suppressed because it is too large
Load Diff
|
@ -1188,13 +1188,10 @@ _PyObject_GenericSetAttrWithDict(PyObject *obj, PyObject *name,
|
||||||
if (dict == NULL) {
|
if (dict == NULL) {
|
||||||
dictptr = _PyObject_GetDictPtr(obj);
|
dictptr = _PyObject_GetDictPtr(obj);
|
||||||
if (dictptr != NULL) {
|
if (dictptr != NULL) {
|
||||||
dict = *dictptr;
|
res = _PyObjectDict_SetItem(Py_TYPE(obj), dictptr, name, value);
|
||||||
if (dict == NULL && value != NULL) {
|
if (res < 0 && PyErr_ExceptionMatches(PyExc_KeyError))
|
||||||
dict = PyDict_New();
|
PyErr_SetObject(PyExc_AttributeError, name);
|
||||||
if (dict == NULL)
|
goto done;
|
||||||
goto done;
|
|
||||||
*dictptr = dict;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (dict != NULL) {
|
if (dict != NULL) {
|
||||||
|
@ -1236,22 +1233,6 @@ PyObject_GenericSetAttr(PyObject *obj, PyObject *name, PyObject *value)
|
||||||
return _PyObject_GenericSetAttrWithDict(obj, name, value, NULL);
|
return _PyObject_GenericSetAttrWithDict(obj, name, value, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject *
|
|
||||||
PyObject_GenericGetDict(PyObject *obj, void *context)
|
|
||||||
{
|
|
||||||
PyObject *dict, **dictptr = _PyObject_GetDictPtr(obj);
|
|
||||||
if (dictptr == NULL) {
|
|
||||||
PyErr_SetString(PyExc_AttributeError,
|
|
||||||
"This object has no __dict__");
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
dict = *dictptr;
|
|
||||||
if (dict == NULL)
|
|
||||||
*dictptr = dict = PyDict_New();
|
|
||||||
Py_XINCREF(dict);
|
|
||||||
return dict;
|
|
||||||
}
|
|
||||||
|
|
||||||
int
|
int
|
||||||
PyObject_GenericSetDict(PyObject *obj, PyObject *value, void *context)
|
PyObject_GenericSetDict(PyObject *obj, PyObject *value, void *context)
|
||||||
{
|
{
|
||||||
|
|
|
@ -14,7 +14,7 @@
|
||||||
MCACHE_MAX_ATTR_SIZE, since it might be a problem if very large
|
MCACHE_MAX_ATTR_SIZE, since it might be a problem if very large
|
||||||
strings are used as attribute names. */
|
strings are used as attribute names. */
|
||||||
#define MCACHE_MAX_ATTR_SIZE 100
|
#define MCACHE_MAX_ATTR_SIZE 100
|
||||||
#define MCACHE_SIZE_EXP 10
|
#define MCACHE_SIZE_EXP 9
|
||||||
#define MCACHE_HASH(version, name_hash) \
|
#define MCACHE_HASH(version, name_hash) \
|
||||||
(((unsigned int)(version) * (unsigned int)(name_hash)) \
|
(((unsigned int)(version) * (unsigned int)(name_hash)) \
|
||||||
>> (8*sizeof(unsigned int) - MCACHE_SIZE_EXP))
|
>> (8*sizeof(unsigned int) - MCACHE_SIZE_EXP))
|
||||||
|
@ -2306,6 +2306,9 @@ type_new(PyTypeObject *metatype, PyObject *args, PyObject *kwds)
|
||||||
type->tp_dictoffset = slotoffset;
|
type->tp_dictoffset = slotoffset;
|
||||||
slotoffset += sizeof(PyObject *);
|
slotoffset += sizeof(PyObject *);
|
||||||
}
|
}
|
||||||
|
if (type->tp_dictoffset) {
|
||||||
|
et->ht_cached_keys = _PyDict_NewKeysForClass();
|
||||||
|
}
|
||||||
if (add_weak) {
|
if (add_weak) {
|
||||||
assert(!base->tp_itemsize);
|
assert(!base->tp_itemsize);
|
||||||
type->tp_weaklistoffset = slotoffset;
|
type->tp_weaklistoffset = slotoffset;
|
||||||
|
@ -2411,6 +2414,9 @@ PyType_FromSpec(PyType_Spec *spec)
|
||||||
res->ht_type.tp_doc = tp_doc;
|
res->ht_type.tp_doc = tp_doc;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (res->ht_type.tp_dictoffset) {
|
||||||
|
res->ht_cached_keys = _PyDict_NewKeysForClass();
|
||||||
|
}
|
||||||
|
|
||||||
if (PyType_Ready(&res->ht_type) < 0)
|
if (PyType_Ready(&res->ht_type) < 0)
|
||||||
goto fail;
|
goto fail;
|
||||||
|
@ -2767,9 +2773,13 @@ type_traverse(PyTypeObject *type, visitproc visit, void *arg)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern void
|
||||||
|
_PyDictKeys_DecRef(PyDictKeysObject *keys);
|
||||||
|
|
||||||
static int
|
static int
|
||||||
type_clear(PyTypeObject *type)
|
type_clear(PyTypeObject *type)
|
||||||
{
|
{
|
||||||
|
PyDictKeysObject *cached_keys;
|
||||||
/* Because of type_is_gc(), the collector only calls this
|
/* Because of type_is_gc(), the collector only calls this
|
||||||
for heaptypes. */
|
for heaptypes. */
|
||||||
assert(type->tp_flags & Py_TPFLAGS_HEAPTYPE);
|
assert(type->tp_flags & Py_TPFLAGS_HEAPTYPE);
|
||||||
|
@ -2801,6 +2811,11 @@ type_clear(PyTypeObject *type)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
PyType_Modified(type);
|
PyType_Modified(type);
|
||||||
|
cached_keys = ((PyHeapTypeObject *)type)->ht_cached_keys;
|
||||||
|
if (cached_keys != NULL) {
|
||||||
|
((PyHeapTypeObject *)type)->ht_cached_keys = NULL;
|
||||||
|
_PyDictKeys_DecRef(cached_keys);
|
||||||
|
}
|
||||||
if (type->tp_dict)
|
if (type->tp_dict)
|
||||||
PyDict_Clear(type->tp_dict);
|
PyDict_Clear(type->tp_dict);
|
||||||
Py_CLEAR(type->tp_mro);
|
Py_CLEAR(type->tp_mro);
|
||||||
|
|
|
@ -2123,70 +2123,31 @@ PyEval_EvalFrameEx(PyFrameObject *f, int throwflag)
|
||||||
w = GETITEM(names, oparg);
|
w = GETITEM(names, oparg);
|
||||||
if (PyDict_CheckExact(f->f_globals)
|
if (PyDict_CheckExact(f->f_globals)
|
||||||
&& PyDict_CheckExact(f->f_builtins)) {
|
&& PyDict_CheckExact(f->f_builtins)) {
|
||||||
if (PyUnicode_CheckExact(w)) {
|
x = _PyDict_LoadGlobal((PyDictObject *)f->f_globals,
|
||||||
/* Inline the PyDict_GetItem() calls.
|
(PyDictObject *)f->f_builtins,
|
||||||
WARNING: this is an extreme speed hack.
|
w);
|
||||||
Do not try this at home. */
|
|
||||||
Py_hash_t hash = ((PyASCIIObject *)w)->hash;
|
|
||||||
if (hash != -1) {
|
|
||||||
PyDictObject *d;
|
|
||||||
PyDictEntry *e;
|
|
||||||
d = (PyDictObject *)(f->f_globals);
|
|
||||||
e = d->ma_lookup(d, w, hash);
|
|
||||||
if (e == NULL) {
|
|
||||||
x = NULL;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
x = e->me_value;
|
|
||||||
if (x != NULL) {
|
|
||||||
Py_INCREF(x);
|
|
||||||
PUSH(x);
|
|
||||||
DISPATCH();
|
|
||||||
}
|
|
||||||
d = (PyDictObject *)(f->f_builtins);
|
|
||||||
e = d->ma_lookup(d, w, hash);
|
|
||||||
if (e == NULL) {
|
|
||||||
x = NULL;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
x = e->me_value;
|
|
||||||
if (x != NULL) {
|
|
||||||
Py_INCREF(x);
|
|
||||||
PUSH(x);
|
|
||||||
DISPATCH();
|
|
||||||
}
|
|
||||||
goto load_global_error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/* This is the un-inlined version of the code above */
|
|
||||||
x = PyDict_GetItem(f->f_globals, w);
|
|
||||||
if (x == NULL) {
|
if (x == NULL) {
|
||||||
x = PyDict_GetItem(f->f_builtins, w);
|
if (!PyErr_Occurred())
|
||||||
if (x == NULL) {
|
format_exc_check_arg(PyExc_NameError,
|
||||||
load_global_error:
|
GLOBAL_NAME_ERROR_MSG, w);
|
||||||
format_exc_check_arg(
|
|
||||||
PyExc_NameError,
|
|
||||||
GLOBAL_NAME_ERROR_MSG, w);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Py_INCREF(x);
|
|
||||||
PUSH(x);
|
|
||||||
DISPATCH();
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Slow-path if globals or builtins is not a dict */
|
|
||||||
x = PyObject_GetItem(f->f_globals, w);
|
|
||||||
if (x == NULL) {
|
|
||||||
x = PyObject_GetItem(f->f_builtins, w);
|
|
||||||
if (x == NULL) {
|
|
||||||
if (PyErr_ExceptionMatches(PyExc_KeyError))
|
|
||||||
format_exc_check_arg(
|
|
||||||
PyExc_NameError,
|
|
||||||
GLOBAL_NAME_ERROR_MSG, w);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
/* Slow-path if globals or builtins is not a dict */
|
||||||
|
x = PyObject_GetItem(f->f_globals, w);
|
||||||
|
if (x == NULL) {
|
||||||
|
x = PyObject_GetItem(f->f_builtins, w);
|
||||||
|
if (x == NULL) {
|
||||||
|
if (PyErr_ExceptionMatches(PyExc_KeyError))
|
||||||
|
format_exc_check_arg(
|
||||||
|
PyExc_NameError,
|
||||||
|
GLOBAL_NAME_ERROR_MSG, w);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Py_INCREF(x);
|
||||||
PUSH(x);
|
PUSH(x);
|
||||||
DISPATCH();
|
DISPATCH();
|
||||||
|
|
||||||
|
|
|
@ -634,9 +634,14 @@ class PyDictObjectPtr(PyObjectPtr):
|
||||||
Yields a sequence of (PyObjectPtr key, PyObjectPtr value) pairs,
|
Yields a sequence of (PyObjectPtr key, PyObjectPtr value) pairs,
|
||||||
analagous to dict.iteritems()
|
analagous to dict.iteritems()
|
||||||
'''
|
'''
|
||||||
for i in safe_range(self.field('ma_mask') + 1):
|
keys = self.field('ma_keys')
|
||||||
ep = self.field('ma_table') + i
|
values = self.field('ma_values')
|
||||||
pyop_value = PyObjectPtr.from_pyobject_ptr(ep['me_value'])
|
for i in safe_range(keys['dk_size']):
|
||||||
|
ep = keys['dk_entries'].address + i
|
||||||
|
if long(values):
|
||||||
|
pyop_value = PyObjectPtr.from_pyobject_ptr(values[i])
|
||||||
|
else:
|
||||||
|
pyop_value = PyObjectPtr.from_pyobject_ptr(ep['me_value'])
|
||||||
if not pyop_value.is_null():
|
if not pyop_value.is_null():
|
||||||
pyop_key = PyObjectPtr.from_pyobject_ptr(ep['me_key'])
|
pyop_key = PyObjectPtr.from_pyobject_ptr(ep['me_key'])
|
||||||
yield (pyop_key, pyop_value)
|
yield (pyop_key, pyop_value)
|
||||||
|
|
Loading…
Reference in New Issue