mirror of https://github.com/python/cpython

Implement PEP 412: Key-sharing dictionaries (closes #13903)
Patch from Mark Shannon.

parent 80d07f8251
commit 7d95e40721
Include/dictobject.h

@@ -13,78 +13,20 @@ extern "C" {
    tuning dictionaries, and several ideas for possible optimizations.
 */
 
-/*
-There are three kinds of slots in the table:
-
-1. Unused.  me_key == me_value == NULL
-   Does not hold an active (key, value) pair now and never did.  Unused can
-   transition to Active upon key insertion.  This is the only case in which
-   me_key is NULL, and is each slot's initial state.
-
-2. Active.  me_key != NULL and me_key != dummy and me_value != NULL
-   Holds an active (key, value) pair.  Active can transition to Dummy upon
-   key deletion.  This is the only case in which me_value != NULL.
-
-3. Dummy.  me_key == dummy and me_value == NULL
-   Previously held an active (key, value) pair, but that was deleted and an
-   active pair has not yet overwritten the slot.  Dummy can transition to
-   Active upon key insertion.  Dummy slots cannot be made Unused again
-   (cannot have me_key set to NULL), else the probe sequence in case of
-   collision would have no way to know they were once active.
-
-Note: .popitem() abuses the me_hash field of an Unused or Dummy slot to
-hold a search finger.  The me_hash field of Unused or Dummy slots has no
-meaning otherwise.
-*/
-
-/* PyDict_MINSIZE is the minimum size of a dictionary.  This many slots are
- * allocated directly in the dict object (in the ma_smalltable member).
- * It must be a power of 2, and at least 4.  8 allows dicts with no more
- * than 5 active entries to live in ma_smalltable (and so avoid an
- * additional malloc); instrumentation suggested this suffices for the
- * majority of dicts (consisting mostly of usually-small instance dicts and
- * usually-small dicts created to pass keyword arguments).
- */
 #ifndef Py_LIMITED_API
-#define PyDict_MINSIZE 8
 
-typedef struct {
-    /* Cached hash code of me_key. */
-    Py_hash_t me_hash;
-    PyObject *me_key;
-    PyObject *me_value;
-} PyDictEntry;
-
-/*
-To ensure the lookup algorithm terminates, there must be at least one Unused
-slot (NULL key) in the table.
-The value ma_fill is the number of non-NULL keys (sum of Active and Dummy);
-ma_used is the number of non-NULL, non-dummy keys (== the number of non-NULL
-values == the number of Active items).
-To avoid slowing down lookups on a near-full table, we resize the table when
-it's two-thirds full.
-*/
-typedef struct _dictobject PyDictObject;
-struct _dictobject {
-    PyObject_HEAD
-    Py_ssize_t ma_fill;  /* # Active + # Dummy */
-    Py_ssize_t ma_used;  /* # Active */
-
-    /* The table contains ma_mask + 1 slots, and that's a power of 2.
-     * We store the mask instead of the size because the mask is more
-     * frequently needed.
-     */
-    Py_ssize_t ma_mask;
-
-    /* ma_table points to ma_smalltable for small tables, else to
-     * additional malloc'ed memory.  ma_table is never NULL!  This rule
-     * saves repeated runtime null-tests in the workhorse getitem and
-     * setitem calls.
-     */
-    PyDictEntry *ma_table;
-    PyDictEntry *(*ma_lookup)(PyDictObject *mp, PyObject *key, Py_hash_t hash);
-    PyDictEntry ma_smalltable[PyDict_MINSIZE];
-};
+typedef struct _dictkeysobject PyDictKeysObject;
+
+/* The ma_values pointer is NULL for a combined table
+ * or points to an array of PyObject* for a split table
+ */
+typedef struct {
+    PyObject_HEAD
+    Py_ssize_t ma_used;
+    PyDictKeysObject *ma_keys;
+    PyObject **ma_values;
+} PyDictObject;
 #endif /* Py_LIMITED_API */
 
 PyAPI_DATA(PyTypeObject) PyDict_Type;
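The comment block removed above describes a three-state slot machine (Unused, Active, Dummy) plus the invariant that at least one Unused slot always exists so probing terminates. As a rough illustration only — not CPython's implementation — a toy Python model of those rules, with the probe sequence simplified to linear probing, might look like this:

    UNUSED, DUMMY = object(), object()   # sentinel slot states

    class ToyTable:
        """Open-addressing table with Unused/Active/Dummy slots."""
        def __init__(self, size=8):
            self.keys = [UNUSED] * size    # UNUSED, DUMMY, or an active key
            self.values = [None] * size

        def _probe(self, key):
            # Relies on the invariant above: at least one UNUSED slot,
            # so this loop always terminates.
            mask = len(self.keys) - 1
            i = hash(key) & mask
            freeslot = None
            while True:
                k = self.keys[i]
                if k is UNUSED:            # end of the probe chain
                    return i if freeslot is None else freeslot
                if k is DUMMY:             # deleted: reusable, but keep probing
                    if freeslot is None:
                        freeslot = i
                elif k == key:             # Active slot holding this key
                    return i
                i = (i + 1) & mask         # simplified: CPython perturbs instead

        def __setitem__(self, key, value):
            i = self._probe(key)
            self.keys[i], self.values[i] = key, value

        def __delitem__(self, key):
            i = self._probe(key)
            if self.keys[i] is UNUSED or self.keys[i] is DUMMY:
                raise KeyError(key)
            # A slot never goes back to UNUSED, or probe chains would break.
            self.keys[i], self.values[i] = DUMMY, None

Marking a deleted slot Dummy rather than Unused is what keeps lookups of colliding keys correct after a deletion.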
@@ -117,6 +59,8 @@ PyAPI_FUNC(void) PyDict_Clear(PyObject *mp);
 PyAPI_FUNC(int) PyDict_Next(
     PyObject *mp, Py_ssize_t *pos, PyObject **key, PyObject **value);
 #ifndef Py_LIMITED_API
+PyDictKeysObject *_PyDict_NewKeysForClass(void);
+PyAPI_FUNC(PyObject *) PyObject_GenericGetDict(PyObject *, void *);
 PyAPI_FUNC(int) _PyDict_Next(
     PyObject *mp, Py_ssize_t *pos, PyObject **key, PyObject **value, Py_hash_t *hash);
 #endif
@@ -131,6 +75,7 @@ PyAPI_FUNC(int) _PyDict_Contains(PyObject *mp, PyObject *key, Py_hash_t hash);
 PyAPI_FUNC(PyObject *) _PyDict_NewPresized(Py_ssize_t minused);
 PyAPI_FUNC(void) _PyDict_MaybeUntrack(PyObject *mp);
 PyAPI_FUNC(int) _PyDict_HasOnlyStringKeys(PyObject *mp);
+#define _PyDict_HasSplitTable(d) ((d)->ma_values != NULL)
 
 PyAPI_FUNC(int) PyDict_ClearFreeList(void);
 #endif
@@ -162,6 +107,11 @@ PyAPI_FUNC(int) PyDict_SetItemString(PyObject *dp, const char *key, PyObject *it
 PyAPI_FUNC(int) _PyDict_SetItemId(PyObject *dp, struct _Py_Identifier *key, PyObject *item);
 PyAPI_FUNC(int) PyDict_DelItemString(PyObject *dp, const char *key);
 
+#ifndef Py_LIMITED_API
+int _PyObjectDict_SetItem(PyTypeObject *tp, PyObject **dictptr, PyObject *name, PyObject *value);
+PyObject *_PyDict_LoadGlobal(PyDictObject *, PyDictObject *, PyObject *);
+#endif
+
 #ifdef __cplusplus
 }
 #endif
Include/object.h

@@ -449,6 +449,7 @@ typedef struct _heaptypeobject {
        see add_operators() in typeobject.c . */
     PyBufferProcs as_buffer;
     PyObject *ht_name, *ht_slots, *ht_qualname;
+    struct _dictkeysobject *ht_cached_keys;
     /* here are optional user slots, followed by the members. */
 } PyHeapTypeObject;
@@ -517,7 +518,6 @@ PyAPI_FUNC(PyObject *) _PyObject_NextNotImplemented(PyObject *);
 PyAPI_FUNC(PyObject *) PyObject_GenericGetAttr(PyObject *, PyObject *);
 PyAPI_FUNC(int) PyObject_GenericSetAttr(PyObject *,
                                               PyObject *, PyObject *);
-PyAPI_FUNC(PyObject *) PyObject_GenericGetDict(PyObject *, void *);
 PyAPI_FUNC(int) PyObject_GenericSetDict(PyObject *, PyObject *, void *);
 PyAPI_FUNC(Py_hash_t) PyObject_Hash(PyObject *);
 PyAPI_FUNC(Py_hash_t) PyObject_HashNotImplemented(PyObject *);
Lib/test/test_dict.py

@@ -321,6 +321,27 @@ class DictTest(unittest.TestCase):
         self.assertEqual(hashed2.hash_count, 1)
         self.assertEqual(hashed1.eq_count + hashed2.eq_count, 1)
 
+    def test_setitem_atomic_at_resize(self):
+        class Hashed(object):
+            def __init__(self):
+                self.hash_count = 0
+                self.eq_count = 0
+            def __hash__(self):
+                self.hash_count += 1
+                return 42
+            def __eq__(self, other):
+                self.eq_count += 1
+                return id(self) == id(other)
+        hashed1 = Hashed()
+        # 5 items
+        y = {hashed1: 5, 0: 0, 1: 1, 2: 2, 3: 3}
+        hashed2 = Hashed()
+        # 6th item forces a resize
+        y[hashed2] = []
+        self.assertEqual(hashed1.hash_count, 1)
+        self.assertEqual(hashed2.hash_count, 1)
+        self.assertEqual(hashed1.eq_count + hashed2.eq_count, 1)
+
     def test_popitem(self):
         # dict.popitem()
         for copymode in -1, +1:
Lib/test/test_pprint.py

@@ -219,6 +219,8 @@ class QueryTestCase(unittest.TestCase):
                 others.should.not.be: like.this}"""
         self.assertEqual(DottedPrettyPrinter().pformat(o), exp)
 
+    @unittest.expectedFailure
+    #See http://bugs.python.org/issue13907
     @test.support.cpython_only
     def test_set_reprs(self):
         # This test creates a complex arrangement of frozensets and
@@ -241,10 +243,12 @@ class QueryTestCase(unittest.TestCase):
         # Consequently, this test is fragile and
         # implementation-dependent.  Small changes to Python's sort
         # algorithm cause the test to fail when it should pass.
+        # XXX Or changes to the dictionary implementation...
 
         self.assertEqual(pprint.pformat(set()), 'set()')
         self.assertEqual(pprint.pformat(set(range(3))), '{0, 1, 2}')
         self.assertEqual(pprint.pformat(frozenset()), 'frozenset()')
+        self.assertEqual(pprint.pformat(frozenset(range(3))), 'frozenset({0, 1, 2})')
+
         cube_repr_tgt = """\
 {frozenset(): frozenset({frozenset({2}), frozenset({0}), frozenset({1})}),
Lib/test/test_sys.py

@@ -687,9 +687,9 @@ class SizeofTest(unittest.TestCase):
         # method-wrapper (descriptor object)
         check({}.__iter__, size(h + '2P'))
         # dict
-        check({}, size(h + '3P2P' + 8*'P2P'))
+        check({}, size(h + '3P' + '4P' + 8*'P2P'))
         longdict = {1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7, 8:8}
-        check(longdict, size(h + '3P2P' + 8*'P2P') + 16*size('P2P'))
+        check(longdict, size(h + '3P' + '4P') + 16*size('P2P'))
         # dictionary-keyiterator
         check({}.keys(), size(h + 'P'))
         # dictionary-valueiterator

@@ -831,7 +831,7 @@ class SizeofTest(unittest.TestCase):
         # type
         # (PyTypeObject + PyNumberMethods + PyMappingMethods +
         #  PySequenceMethods + PyBufferProcs)
-        s = size(vh + 'P2P15Pl4PP9PP11PI') + size('16Pi17P 3P 10P 2P 3P')
+        s = size(vh + 'P2P15Pl4PP9PP11PIP') + size('16Pi17P 3P 10P 2P 3P')
         check(int, s)
         # class
         class newstyleclass(object): pass
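The size() helper in this test is essentially struct.calcsize plus the object header, so the changed format strings mirror the new layout: three pointer-sized fields in PyDictObject, plus a keys object with roughly four header fields and eight hash/key/value entries. A standalone sketch of the arithmetic, with an assumed two-pointer object header (the real header format depends on the build):

    import struct

    def size(fmt):
        # struct.calcsize of the C layout described by fmt
        return struct.calcsize(fmt)

    h = '2P'                                    # assumption: 2-pointer header
    old = size(h + '3P2P' + 8 * 'P2P')          # pre-PEP 412 combined layout
    new = size(h + '3P' + '4P' + 8 * 'P2P')     # dict struct + keys object
    print(old, new)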
Misc/NEWS

@@ -10,6 +10,10 @@ What's New in Python 3.3.0 Alpha 3?
 Core and Builtins
 -----------------
 
+- Issue #13903: Implement PEP 412. Individual dictionary instances can now share
+  their keys with other dictionaries. Classes take advantage of this to share
+  their instance dictionary keys for improved memory and performance.
+
 - Issue #14630: Fix a memory access bug for instances of a subclass of int
   with value 0.
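The effect described in this entry can be observed from pure Python. A rough check — exact numbers vary by version, build, and platform, and the class and attribute names here are made up for illustration:

    import sys

    class Point:
        def __init__(self, x, y):
            self.x = x
            self.y = y

    a, b = Point(1, 2), Point(3, 4)
    # Both instance dicts can share a single keys table ('x', 'y'),
    # so each one pays only for its own values array.
    print(sys.getsizeof(a.__dict__), sys.getsizeof(b.__dict__))
    print(sys.getsizeof({'x': 1, 'y': 2}))   # combined dict, usually larger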
Objects/dictnotes.txt

@@ -1,7 +1,6 @@
-NOTES ON OPTIMIZING DICTIONARIES
+NOTES ON DICTIONARIES
 ================================
 
 
 Principal Use Cases for Dictionaries
 ------------------------------------
@@ -21,7 +20,7 @@ Instance attribute lookup and Global variables
 
 Builtins
     Frequent reads.  Almost never written.
-    Size 126 interned strings (as of Py2.3b1).
+    About 150 interned strings (as of Py3.3).
     A few keys are accessed much more frequently than others.
 
 Uniquification
@@ -59,44 +58,43 @@ Dynamic Mappings
     Characterized by deletions interspersed with adds and replacements.
     Performance benefits greatly from the re-use of dummy entries.
 
-Data Layout (assuming a 32-bit box with 64 bytes per cache line)
-----------------------------------------------------------------
-
-Smalldicts (8 entries) are attached to the dictobject structure
-and the whole group nearly fills two consecutive cache lines.
-
-Larger dicts use the first half of the dictobject structure (one cache
-line) and a separate, continuous block of entries (at 12 bytes each
-for a total of 5.333 entries per cache line).
+Data Layout
+-----------
+
+Dictionaries are composed of 3 components:
+The dictobject struct itself
+A dict-keys object (keys & hashes)
+A values array
 
 
 Tunable Dictionary Parameters
 -----------------------------
 
-* PyDict_MINSIZE.  Currently set to 8.
-    Must be a power of two.  New dicts have to zero-out every cell.
-    Each additional 8 consumes 1.5 cache lines.  Increasing improves
-    the sparseness of small dictionaries but costs time to read in
-    the additional cache lines if they are not already in cache.
-    That case is common when keyword arguments are passed.
+* PyDict_STARTSIZE.  Starting size of dict (unless an instance dict).
+    Currently set to 8.  Must be a power of two.
+    New dicts have to zero-out every cell.
+    Increasing improves the sparseness of small dictionaries but costs
+    time to read in the additional cache lines if they are not already
+    in cache.  That case is common when keyword arguments are passed.
+    Prior to version 3.3, PyDict_MINSIZE was used as the starting size
+    of a new dict.
 
-* Maximum dictionary load in PyDict_SetItem.  Currently set to 2/3.
-    Increasing this ratio makes dictionaries more dense resulting
-    in more collisions.  Decreasing it improves sparseness at the
-    expense of spreading entries over more cache lines and at the
+* PyDict_MINSIZE.  Minimum size of a dict.
+    Currently set to 4 (to keep instance dicts small).
+    Must be a power of two.  Prior to version 3.3, PyDict_MINSIZE was
+    set to 8.
+
+* USABLE_FRACTION.  Maximum dictionary load in PyDict_SetItem.
+    Currently set to 2/3.  Increasing this ratio makes dictionaries more
+    dense resulting in more collisions.  Decreasing it improves sparseness
+    at the expense of spreading entries over more cache lines and at the
     cost of total memory consumed.
 
     The load test occurs in highly time sensitive code.  Efforts
     to make the test more complex (for example, varying the load
     for different sizes) have degraded performance.
 
 * Growth rate upon hitting maximum load.  Currently set to *2.
-    Raising this to *4 results in half the number of resizes,
-    less effort to resize, better sparseness for some (but not
-    all dict sizes), and potentially doubles memory consumption
-    depending on the size of the dictionary.  Setting to *4
-    eliminates every other resize step.
+    Raising this to *4 results in half the number of resizes, less
+    effort to resize, better sparseness for some (but not all dict sizes),
+    and potentially doubles memory consumption depending on the size of
+    the dictionary.  Setting to *4 eliminates every other resize step.
 
 * Maximum sparseness (minimum dictionary load).  What percentage
     of entries can be unused before the dictionary shrinks to
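To make the interaction of the load limit and the growth rate concrete, here is a small sketch. It is a simplification: the real resize policy is keyed to the number of used slots and has changed across CPython versions.

    def resize_points(n, start=8, load=2/3, growth=2):
        """Insert counts at which a toy table grows, given that an
        insert resizes once used slots would exceed load * size."""
        size, points = start, []
        for used in range(1, n + 1):
            if used > load * size:
                size *= growth
                points.append((used, size))
        return points

    # With the defaults this gives [(6, 16), (11, 32), (22, 64), ...]:
    # the 6th insert into a size-8 table triggers the first resize,
    # matching the "6th item forces a resize" comment in test_dict above.
    print(resize_points(100))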
@@ -135,136 +133,51 @@ by repeatedly invoking .pop will see no resizing, which might
 not be necessary at all because the dictionary is eventually
 discarded entirely.
 
+The key differences between this implementation and earlier versions are:
+    1. The table can be split into two parts, the keys and the values.
+
+    2. There is an additional key-value combination: (key, NULL).
+       Unlike (<dummy>, NULL) which represents a deleted value, (key, NULL)
+       represents a yet to be inserted value.  This combination can only occur
+       when the table is split.
+
+    3. No small table embedded in the dict,
+       as this would make sharing of key-tables impossible.
+
+
+These changes have the following consequences.
+    1. General dictionaries are slightly larger.
+
+    2. All object dictionaries of a single class can share a single key-table,
+       saving about 60% memory for such cases.
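Point 2 above, the (key, NULL) combination, can be modelled in a few lines of Python. This is a toy illustration, not the C layout: a plain list stands in for the shared keys table and None for NULL.

    class SplitDict:
        """Toy split table: keys are shared, values are per-instance.
        A shared key whose value slot is None models (key, NULL):
        present in the keys table, not yet inserted in this dict."""
        def __init__(self, shared_keys):
            self.shared_keys = shared_keys        # shared between instances
            self.values = []                      # private values array

        def __setitem__(self, key, value):
            if key not in self.shared_keys:
                self.shared_keys.append(key)      # grows for *all* sharers
            i = self.shared_keys.index(key)
            while len(self.values) <= i:
                self.values.append(None)          # (key, NULL) slots
            self.values[i] = value

        def __getitem__(self, key):
            i = self.shared_keys.index(key)       # ValueError if key unknown
            if i >= len(self.values) or self.values[i] is None:
                raise KeyError(key)               # key shared, value pending
            return self.values[i]

    keys = []
    d1, d2 = SplitDict(keys), SplitDict(keys)
    d1['x'] = 1          # 'x' enters the shared keys table
    d2['y'] = 2          # d2 now holds ('x', NULL): shared key, no value yet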
 Results of Cache Locality Experiments
--------------------------------------
+--------------------------------------
 
-When an entry is retrieved from memory, 4.333 adjacent entries are also
-retrieved into a cache line.  Since accessing items in cache is *much*
-cheaper than a cache miss, an enticing idea is to probe the adjacent
-entries as a first step in collision resolution.  Unfortunately, the
-introduction of any regularity into collision searches results in more
-collisions than the current random chaining approach.
+Experiments on an earlier design of dictionary, in which all tables were
+combined, showed the following:
 
-Exploiting cache locality at the expense of additional collisions fails
-to payoff when the entries are already loaded in cache (the expense
-is paid with no compensating benefit).  This occurs in small dictionaries
-where the whole dictionary fits into a pair of cache lines.  It also
-occurs frequently in large dictionaries which have a common access pattern
-where some keys are accessed much more frequently than others.  The
-more popular entries *and* their collision chains tend to remain in cache.
+  When an entry is retrieved from memory, several adjacent entries are also
+  retrieved into a cache line.  Since accessing items in cache is *much*
+  cheaper than a cache miss, an enticing idea is to probe the adjacent
+  entries as a first step in collision resolution.  Unfortunately, the
+  introduction of any regularity into collision searches results in more
+  collisions than the current random chaining approach.
 
-To exploit cache locality, change the collision resolution section
-in lookdict() and lookdict_string().  Set i^=1 at the top of the
-loop and move the  i = (i << 2) + i + perturb + 1 to an unrolled
-version of the loop.
+  Exploiting cache locality at the expense of additional collisions fails
+  to pay off when the entries are already loaded in cache (the expense
+  is paid with no compensating benefit).  This occurs in small dictionaries
+  where the whole dictionary fits into a pair of cache lines.  It also
+  occurs frequently in large dictionaries which have a common access pattern
+  where some keys are accessed much more frequently than others.  The
+  more popular entries *and* their collision chains tend to remain in cache.
 
-This optimization strategy can be leveraged in several ways:
+  To exploit cache locality, change the collision resolution section
+  in lookdict() and lookdict_string().  Set i^=1 at the top of the
+  loop and move the  i = (i << 2) + i + perturb + 1 to an unrolled
+  version of the loop.
 
-* If the dictionary is kept sparse (through the tunable parameters),
-  then the occurrence of additional collisions is lessened.
-
-* If lookdict() and lookdict_string() are specialized for small dicts
-  and for large dicts, then the versions for large dicts can be given
-  an alternate search strategy without increasing collisions in small dicts
-  which already have the maximum benefit of cache locality.
-
-* If the use case for a dictionary is known to have a random key
-  access pattern (as opposed to a more common pattern with a Zipf's law
-  distribution), then there will be more benefit for large dictionaries
-  because any given key is no more likely than another to already be
-  in cache.
-
-* In use cases with paired accesses to the same key, the second access
-  is always in cache and gets no benefit from efforts to further improve
-  cache locality.
-
-Optimizing the Search of Small Dictionaries
--------------------------------------------
-
-If lookdict() and lookdict_string() are specialized for smaller dictionaries,
-then a custom search approach can be implemented that exploits the small
-search space and cache locality.
-
-* The simplest example is a linear search of contiguous entries.  This is
-  simple to implement, guaranteed to terminate rapidly, never searches
-  the same entry twice, and precludes the need to check for dummy entries.
-
-* A more advanced example is a self-organizing search so that the most
-  frequently accessed entries get probed first.  The organization
-  adapts if the access pattern changes over time.  Treaps are ideally
-  suited for self-organization with the most common entries at the
-  top of the heap and a rapid binary search pattern.  Most probes and
-  results are all located at the top of the tree allowing them all to
-  be located in one or two cache lines.
-
-* Also, small dictionaries may be made more dense, perhaps filling all
-  eight cells to take the maximum advantage of two cache lines.
+
+For split tables, the above will apply to the keys, but the value will
+always be in a different cache line from the key.
 
-Strategy Pattern
-----------------
-
-Consider allowing the user to set the tunable parameters or to select a
-particular search method.  Since some dictionary use cases have known
-sizes and access patterns, the user may be able to provide useful hints.
-
-1) For example, if membership testing or lookups dominate runtime and memory
-   is not at a premium, the user may benefit from setting the maximum load
-   ratio at 5% or 10% instead of the usual 66.7%.  This will sharply
-   curtail the number of collisions but will increase iteration time.
-   The builtin namespace is a prime example of a dictionary that can
-   benefit from being highly sparse.
-
-2) Dictionary creation time can be shortened in cases where the ultimate
-   size of the dictionary is known in advance.  The dictionary can be
-   pre-sized so that no resize operations are required during creation.
-   Not only does this save resizes, but the key insertion will go
-   more quickly because the first half of the keys will be inserted into
-   a more sparse environment than before.  The preconditions for this
-   strategy arise whenever a dictionary is created from a key or item
-   sequence and the number of *unique* keys is known.
-
-3) If the key space is large and the access pattern is known to be random,
-   then search strategies exploiting cache locality can be fruitful.
-   The preconditions for this strategy arise in simulations and
-   numerical analysis.
-
-4) If the keys are fixed and the access pattern strongly favors some of
-   the keys, then the entries can be stored contiguously and accessed
-   with a linear search or treap.  This exploits knowledge of the data,
-   cache locality, and a simplified search routine.  It also eliminates
-   the need to test for dummy entries on each probe.  The preconditions
-   for this strategy arise in symbol tables and in the builtin dictionary.
-
-Readonly Dictionaries
----------------------
-Some dictionary use cases pass through a build stage and then move to a
-more heavily exercised lookup stage with no further changes to the
-dictionary.
-
-An idea that emerged on python-dev is to be able to convert a dictionary
-to a read-only state.  This can help prevent programming errors and also
-provide knowledge that can be exploited for lookup optimization.
-
-The dictionary can be immediately rebuilt (eliminating dummy entries),
-resized (to an appropriate level of sparseness), and the keys can be
-jostled (to minimize collisions).  The lookdict() routine can then
-eliminate the test for dummy entries (saving about 1/4 of the time
-spent in the collision resolution loop).
-
-An additional possibility is to insert links into the empty spaces
-so that dictionary iteration can proceed in len(d) steps instead of
-(mp->mask + 1) steps.  Alternatively, a separate tuple of keys can be
-kept just for iteration.
-
-Caching Lookups
----------------
-The idea is to exploit key access patterns by anticipating future lookups
-based on previous lookups.
-
-The simplest incarnation is to save the most recently accessed entry.
-This gives optimal performance for use cases where every get is followed
-by a set or del to the same key.
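The collision-resolution recurrence mentioned above — i = (i << 2) + i + perturb + 1, i.e. i*5 + perturb + 1 — can be traced from Python. This sketches one common variant of the probe sequence; details such as exactly where perturb is shifted differ between CPython versions.

    PERTURB_SHIFT = 5

    def probe_sequence(h, mask, steps=6):
        """First few slots visited for hash h in a table of size mask+1."""
        perturb = h & (2**64 - 1)        # treat the hash as unsigned
        i = perturb & mask
        slots = [i]
        for _ in range(steps - 1):
            i = (i * 5 + perturb + 1) & mask
            perturb >>= PERTURB_SHIFT    # mixes in high bits of the hash
            slots.append(i)
        return slots

    print(probe_sequence(hash('key'), mask=7))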
Objects/dictobject.c (1771 lines changed)

File diff suppressed because it is too large.
Objects/object.c

@@ -1188,13 +1188,10 @@ _PyObject_GenericSetAttrWithDict(PyObject *obj, PyObject *name,
     if (dict == NULL) {
         dictptr = _PyObject_GetDictPtr(obj);
         if (dictptr != NULL) {
-            dict = *dictptr;
-            if (dict == NULL && value != NULL) {
-                dict = PyDict_New();
-                if (dict == NULL)
-                    goto done;
-                *dictptr = dict;
-            }
+            res = _PyObjectDict_SetItem(Py_TYPE(obj), dictptr, name, value);
+            if (res < 0 && PyErr_ExceptionMatches(PyExc_KeyError))
+                PyErr_SetObject(PyExc_AttributeError, name);
+            goto done;
         }
     }
     if (dict != NULL) {
@@ -1236,22 +1233,6 @@ PyObject_GenericSetAttr(PyObject *obj, PyObject *name, PyObject *value)
     return _PyObject_GenericSetAttrWithDict(obj, name, value, NULL);
 }
 
-PyObject *
-PyObject_GenericGetDict(PyObject *obj, void *context)
-{
-    PyObject *dict, **dictptr = _PyObject_GetDictPtr(obj);
-    if (dictptr == NULL) {
-        PyErr_SetString(PyExc_AttributeError,
-                        "This object has no __dict__");
-        return NULL;
-    }
-    dict = *dictptr;
-    if (dict == NULL)
-        *dictptr = dict = PyDict_New();
-    Py_XINCREF(dict);
-    return dict;
-}
-
 int
 PyObject_GenericSetDict(PyObject *obj, PyObject *value, void *context)
 {
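Attribute setting on instances now routes through _PyObjectDict_SetItem, which can write into a split table directly; the caller above translates a KeyError from a failed deletion into AttributeError. A rough Python analog of that error translation — the sentinel stands in for C's value == NULL meaning "delete":

    _DELETE = object()

    def object_dict_set(obj, name, value=_DELETE):
        """Set or delete obj.<name> via its instance dict."""
        d = obj.__dict__
        try:
            if value is _DELETE:
                del d[name]          # value == NULL in C means deletion
            else:
                d[name] = value
        except KeyError:
            # Deleting a missing attribute surfaces as AttributeError.
            raise AttributeError(name) from None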
Objects/typeobject.c

@@ -14,7 +14,7 @@
    MCACHE_MAX_ATTR_SIZE, since it might be a problem if very large
    strings are used as attribute names. */
 #define MCACHE_MAX_ATTR_SIZE    100
-#define MCACHE_SIZE_EXP         10
+#define MCACHE_SIZE_EXP         9
 #define MCACHE_HASH(version, name_hash)                                 \
         (((unsigned int)(version) * (unsigned int)(name_hash))          \
           >> (8*sizeof(unsigned int) - MCACHE_SIZE_EXP))
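Shrinking MCACHE_SIZE_EXP from 10 to 9 halves the method cache (2**9 = 512 entries instead of 1024). The MCACHE_HASH macro keeps the top bits of an unsigned 32-bit product; in Python terms, assuming a 32-bit unsigned int:

    MCACHE_SIZE_EXP = 9
    UINT_BITS = 32                      # assumption: 32-bit unsigned int

    def mcache_index(version_tag, name_hash):
        """Mirror of MCACHE_HASH: top MCACHE_SIZE_EXP bits of the product."""
        product = (version_tag * name_hash) & ((1 << UINT_BITS) - 1)
        return product >> (UINT_BITS - MCACHE_SIZE_EXP)    # 0 .. 511

    print(mcache_index(0x1234, 0xBEEF))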
@@ -2306,6 +2306,9 @@ type_new(PyTypeObject *metatype, PyObject *args, PyObject *kwds)
         type->tp_dictoffset = slotoffset;
         slotoffset += sizeof(PyObject *);
     }
+    if (type->tp_dictoffset) {
+        et->ht_cached_keys = _PyDict_NewKeysForClass();
+    }
     if (add_weak) {
         assert(!base->tp_itemsize);
         type->tp_weaklistoffset = slotoffset;

@@ -2411,6 +2414,9 @@ PyType_FromSpec(PyType_Spec *spec)
             res->ht_type.tp_doc = tp_doc;
         }
     }
+    if (res->ht_type.tp_dictoffset) {
+        res->ht_cached_keys = _PyDict_NewKeysForClass();
+    }
 
     if (PyType_Ready(&res->ht_type) < 0)
         goto fail;

@@ -2767,9 +2773,13 @@ type_traverse(PyTypeObject *type, visitproc visit, void *arg)
     return 0;
 }
 
+extern void
+_PyDictKeys_DecRef(PyDictKeysObject *keys);
+
 static int
 type_clear(PyTypeObject *type)
 {
+    PyDictKeysObject *cached_keys;
     /* Because of type_is_gc(), the collector only calls this
        for heaptypes. */
     assert(type->tp_flags & Py_TPFLAGS_HEAPTYPE);

@@ -2801,6 +2811,11 @@ type_clear(PyTypeObject *type)
      */
 
     PyType_Modified(type);
+    cached_keys = ((PyHeapTypeObject *)type)->ht_cached_keys;
+    if (cached_keys != NULL) {
+        ((PyHeapTypeObject *)type)->ht_cached_keys = NULL;
+        _PyDictKeys_DecRef(cached_keys);
+    }
     if (type->tp_dict)
         PyDict_Clear(type->tp_dict);
     Py_CLEAR(type->tp_mro);
Python/ceval.c

@@ -2123,70 +2123,31 @@ PyEval_EvalFrameEx(PyFrameObject *f, int throwflag)
             w = GETITEM(names, oparg);
             if (PyDict_CheckExact(f->f_globals)
                 && PyDict_CheckExact(f->f_builtins)) {
-                if (PyUnicode_CheckExact(w)) {
-                    /* Inline the PyDict_GetItem() calls.
-                       WARNING: this is an extreme speed hack.
-                       Do not try this at home. */
-                    Py_hash_t hash = ((PyASCIIObject *)w)->hash;
-                    if (hash != -1) {
-                        PyDictObject *d;
-                        PyDictEntry *e;
-                        d = (PyDictObject *)(f->f_globals);
-                        e = d->ma_lookup(d, w, hash);
-                        if (e == NULL) {
-                            x = NULL;
-                            break;
-                        }
-                        x = e->me_value;
-                        if (x != NULL) {
-                            Py_INCREF(x);
-                            PUSH(x);
-                            DISPATCH();
-                        }
-                        d = (PyDictObject *)(f->f_builtins);
-                        e = d->ma_lookup(d, w, hash);
-                        if (e == NULL) {
-                            x = NULL;
-                            break;
-                        }
-                        x = e->me_value;
-                        if (x != NULL) {
-                            Py_INCREF(x);
-                            PUSH(x);
-                            DISPATCH();
-                        }
-                        goto load_global_error;
-                    }
-                }
-                /* This is the un-inlined version of the code above */
-                x = PyDict_GetItem(f->f_globals, w);
+                x = _PyDict_LoadGlobal((PyDictObject *)f->f_globals,
+                                       (PyDictObject *)f->f_builtins,
+                                       w);
                 if (x == NULL) {
-                    x = PyDict_GetItem(f->f_builtins, w);
-                    if (x == NULL) {
-                      load_global_error:
-                        format_exc_check_arg(
-                            PyExc_NameError,
-                            GLOBAL_NAME_ERROR_MSG, w);
-                        break;
-                    }
+                    if (!PyErr_Occurred())
+                        format_exc_check_arg(PyExc_NameError,
+                                             GLOBAL_NAME_ERROR_MSG, w);
+                    break;
                 }
                 Py_INCREF(x);
                 PUSH(x);
                 DISPATCH();
             }
-
-            /* Slow-path if globals or builtins is not a dict */
-            x = PyObject_GetItem(f->f_globals, w);
-            if (x == NULL) {
-                x = PyObject_GetItem(f->f_builtins, w);
-                if (x == NULL) {
-                    if (PyErr_ExceptionMatches(PyExc_KeyError))
-                        format_exc_check_arg(
-                            PyExc_NameError,
-                            GLOBAL_NAME_ERROR_MSG, w);
-                    break;
-                }
-            }
+            else {
+                /* Slow-path if globals or builtins is not a dict */
+                x = PyObject_GetItem(f->f_globals, w);
+                if (x == NULL) {
+                    x = PyObject_GetItem(f->f_builtins, w);
+                    if (x == NULL) {
+                        if (PyErr_ExceptionMatches(PyExc_KeyError))
+                            format_exc_check_arg(
+                                PyExc_NameError,
+                                GLOBAL_NAME_ERROR_MSG, w);
+                        break;
+                    }
+                }
+            }
             Py_INCREF(x);
             PUSH(x);
             DISPATCH();
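The fast path now funnels both lookups through _PyDict_LoadGlobal, which probes globals and then builtins with a single hash computation. The visible semantics are unchanged; as a plain-Python sketch of the lookup order:

    import builtins

    def load_global(frame_globals, name):
        """LOAD_GLOBAL order: module globals first, then builtins."""
        try:
            return frame_globals[name]
        except KeyError:
            try:
                return getattr(builtins, name)
            except AttributeError:
                raise NameError("name %r is not defined" % name) from None

    print(load_global(globals(), 'len'))    # found via the builtins fallback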
Tools/gdb/libpython.py

@@ -634,9 +634,14 @@ class PyDictObjectPtr(PyObjectPtr):
         Yields a sequence of (PyObjectPtr key, PyObjectPtr value) pairs,
         analogous to dict.iteritems()
         '''
-        for i in safe_range(self.field('ma_mask') + 1):
-            ep = self.field('ma_table') + i
-            pyop_value = PyObjectPtr.from_pyobject_ptr(ep['me_value'])
+        keys = self.field('ma_keys')
+        values = self.field('ma_values')
+        for i in safe_range(keys['dk_size']):
+            ep = keys['dk_entries'].address + i
+            if long(values):
+                pyop_value = PyObjectPtr.from_pyobject_ptr(values[i])
+            else:
+                pyop_value = PyObjectPtr.from_pyobject_ptr(ep['me_value'])
             if not pyop_value.is_null():
                 pyop_key = PyObjectPtr.from_pyobject_ptr(ep['me_key'])
                 yield (pyop_key, pyop_value)
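The updated helper has to handle both layouts: in a split table the value lives in a separate array indexed by the entry's slot position, while in a combined table it sits in the entry itself. The same rule in standalone form, with a made-up (hash, key, value) tuple layout for illustration:

    def iter_items(dk_entries, values=None):
        """Yield (key, value) pairs from (hash, key, value) entry tuples.
        If a separate values array is given (split table), read the value
        from it by slot index instead of from the entry."""
        for i, (_hash, key, entry_value) in enumerate(dk_entries):
            value = values[i] if values is not None else entry_value
            if key is not None and value is not None:
                yield key, value

    entries = [(8313, 'x', None), (2542, 'y', None)]
    print(list(iter_items(entries, values=[1, None])))   # [('x', 1)]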