Issue #4688: Add a heuristic so that tuples and dicts containing only

untrackable objects are not tracked by the garbage collector. This can
reduce the size of collections and therefore the garbage collection overhead
on long-running programs, depending on their particular use of datatypes.

(trivia: this makes the "binary_trees" benchmark from the Computer Language
Shootout 40% faster)
This commit is contained in:
Antoine Pitrou 2009-03-23 18:41:45 +00:00
parent e5b78563b6
commit f8387af262
11 changed files with 401 additions and 2 deletions

View File

@ -140,6 +140,31 @@ The :mod:`gc` module provides the following functions:
.. versionadded:: 2.3
.. function:: is_tracked(obj)
Returns True if the object is currently tracked by the garbage collector,
False otherwise. As a general rule, instances of atomic types aren't
tracked and instances of non-atomic types (containers, user-defined
objects...) are. However, some type-specific optimizations can be present
in order to suppress the garbage collector footprint of simple instances
(e.g. dicts containing only atomic keys and values)::
>>> gc.is_tracked(0)
False
>>> gc.is_tracked("a")
False
>>> gc.is_tracked([])
True
>>> gc.is_tracked({})
False
>>> gc.is_tracked({"a": 1})
False
>>> gc.is_tracked({"a": []})
True
.. versionadded:: 2.7
The following variable is provided for read-only access (you can mutate its
value but should not rebind it):

View File

@ -111,6 +111,7 @@ PyAPI_FUNC(PyObject *) PyDict_Copy(PyObject *mp);
PyAPI_FUNC(int) PyDict_Contains(PyObject *mp, PyObject *key);
PyAPI_FUNC(int) _PyDict_Contains(PyObject *mp, PyObject *key, long hash);
PyAPI_FUNC(PyObject *) _PyDict_NewPresized(Py_ssize_t minused);
PyAPI_FUNC(void) _PyDict_MaybeUntrack(PyObject *mp);
/* PyDict_Update(mp, other) is equivalent to PyDict_Merge(mp, other, 1). */
PyAPI_FUNC(int) PyDict_Update(PyObject *mp, PyObject *other);

View File

@ -285,6 +285,17 @@ extern PyGC_Head *_PyGC_generation0;
g->gc.gc_next = NULL; \
} while (0);
/* True if the object is currently tracked by the GC.
   The test compares the gc_refs field of the object's GC header (reached
   through _Py_AS_GC) with the _PyGC_REFS_UNTRACKED marker.  The macro does
   not check PyObject_IS_GC(o) itself. */
#define _PyObject_GC_IS_TRACKED(o) \
((_Py_AS_GC(o))->gc.gc_refs != _PyGC_REFS_UNTRACKED)
/* True if the object may be tracked by the GC in the future, or already is.
   This can be useful to implement some optimizations.
   The result is false for objects for which PyObject_IS_GC(obj) is false,
   and for exact tuples that are currently untracked; it is true for every
   other GC-capable object, whatever its current tracking state.
   Note that obj appears several times in the expansion, so the argument
   is evaluated more than once and must not have side effects. */
#define _PyObject_GC_MAY_BE_TRACKED(obj) \
(PyObject_IS_GC(obj) && \
(!PyTuple_CheckExact(obj) || _PyObject_GC_IS_TRACKED(obj)))
PyAPI_FUNC(PyObject *) _PyObject_GC_Malloc(size_t);
PyAPI_FUNC(PyObject *) _PyObject_GC_New(PyTypeObject *);
PyAPI_FUNC(PyVarObject *) _PyObject_GC_NewVar(PyTypeObject *, Py_ssize_t);

View File

@ -44,6 +44,7 @@ PyAPI_FUNC(int) PyTuple_SetItem(PyObject *, Py_ssize_t, PyObject *);
PyAPI_FUNC(PyObject *) PyTuple_GetSlice(PyObject *, Py_ssize_t, Py_ssize_t);
PyAPI_FUNC(int) _PyTuple_Resize(PyObject **, Py_ssize_t);
PyAPI_FUNC(PyObject *) PyTuple_Pack(Py_ssize_t, ...);
PyAPI_FUNC(void) _PyTuple_MaybeUntrack(PyObject *);
/* Macro, trading safety for speed */
#define PyTuple_GET_ITEM(op, i) (((PyTupleObject *)(op))->ob_item[i])

View File

@ -569,6 +569,104 @@ class DictTest(unittest.TestCase):
gc.collect()
self.assert_(ref() is None, "Cycle was not collected")
def _not_tracked(self, t):
# Helper: assert that t is not tracked by the GC once two collections
# have been forced.  t is also passed as the assertion failure message.
# Nested containers can take several collections to untrack
gc.collect()
gc.collect()
self.assertFalse(gc.is_tracked(t), t)
def _tracked(self, t):
# Helper: assert that t is tracked by the GC right now, and that it is
# still tracked after two forced collections (i.e. the collector did
# not untrack it).  t is also passed as the assertion failure message.
self.assertTrue(gc.is_tracked(t), t)
gc.collect()
gc.collect()
self.assertTrue(gc.is_tracked(t), t)
def test_track_literals(self):
# Test GC-optimization of dict literals
# NOTE: w is bound here but not referenced by any check below.
x, y, z, w = 1.5, "a", (1, None), []
# Dict literals whose keys and values are numbers, strings, type
# objects, object() instances or tuples of such objects are expected
# to be untracked after collection.
self._not_tracked({})
self._not_tracked({x:(), y:x, z:1})
self._not_tracked({1: "a", "b": 2})
self._not_tracked({1: 2, (None, True, False, ()): int})
self._not_tracked({1: object()})
# Dicts with mutable elements are always tracked, even if those
# elements are not tracked right now.
self._tracked({1: []})
self._tracked({1: ([],)})
self._tracked({1: {}})
self._tracked({1: set()})
def test_track_dynamic(self):
# Test GC-optimization of dynamically-created dicts
class MyObject(object):
pass
# x, y: atomic objects; z: tuple of untrackable objects; w: a list;
# o: an instance of a user-defined class.
x, y, z, w, o = 1.5, "a", (1, object()), [], MyObject()
# Item assignment: the dict stays untracked while only atomic objects
# and the tuple z are stored, and so do its copies.
d = dict()
self._not_tracked(d)
d[1] = "a"
self._not_tracked(d)
d[y] = 2
self._not_tracked(d)
d[z] = 3
self._not_tracked(d)
self._not_tracked(d.copy())
# Storing the list w makes the dict (and its copies) tracked.
d[4] = w
self._tracked(d)
self._tracked(d.copy())
# Once the list is replaced by None, the dict is expected to be
# untracked again after collection.
d[4] = None
self._not_tracked(d)
self._not_tracked(d.copy())
# dd isn't tracked right now, but it may mutate and therefore d
# which contains it must be tracked.
d = dict()
dd = dict()
d[1] = dd
self._not_tracked(dd)
self._tracked(d)
# Closing the cycle stores a dict into dd, which must now be tracked.
dd[1] = d
self._tracked(dd)
# dict.fromkeys() and update() from another dict.
d = dict.fromkeys([x, y, z])
self._not_tracked(d)
dd = dict()
dd.update(d)
self._not_tracked(dd)
# Same, but with the user-defined instance o among the keys.
d = dict.fromkeys([x, y, z, o])
self._tracked(d)
dd = dict()
dd.update(d)
self._tracked(dd)
# Construction and update from keyword arguments.
d = dict(x=x, y=y, z=z)
self._not_tracked(d)
d = dict(x=x, y=y, z=z, w=w)
self._tracked(d)
d = dict()
d.update(x=x, y=y, z=z)
self._not_tracked(d)
d.update(w=w)
self._tracked(d)
# Construction and update from a sequence of (key, value) pairs.
d = dict([(x, y), (z, 1)])
self._not_tracked(d)
d = dict([(x, y), (z, w)])
self._tracked(d)
d = dict()
d.update([(x, y), (z, 1)])
self._not_tracked(d)
d.update([(x, y), (z, w)])
self._tracked(d)
def test_track_subtypes(self):
# Dict subtypes are always tracked
# (even an empty instance, unlike the empty exact dict checked in
# test_track_literals).
class MyDict(dict):
pass
self._tracked(MyDict())
from test import mapping_tests

View File

@ -415,6 +415,37 @@ class GCTests(unittest.TestCase):
self.assertEqual(gc.get_referents(1, 'a', 4j), [])
def test_is_tracked(self):
    # Scalars, strings, bytearrays, built-in type objects and plain
    # object() instances must be reported as untracked; modules, classes,
    # class instances and mutable containers must be reported as tracked.
    # NOTE: types with special optimizations (e.g. tuple) have tests
    # in their own test files instead.
    atomic_objects = (
        None, 1, 1.0, 1.0 + 5.0j, True, False, "a", u"a",
        bytearray("a"), type, int, object, object(),
    )
    for obj in atomic_objects:
        self.assertFalse(gc.is_tracked(obj))

    class OldStyle:
        pass
    class NewStyle(object):
        pass

    tracked_objects = (
        gc, OldStyle, OldStyle(), NewStyle, NewStyle(), [], set(),
    )
    for obj in tracked_objects:
        self.assertTrue(gc.is_tracked(obj))
def test_bug1055820b(self):
# Corresponds to temp2b.py in the bug report.

View File

@ -1,5 +1,7 @@
from test import test_support, seq_tests
import gc
class TupleTest(seq_tests.CommonTest):
type2test = tuple
@ -82,6 +84,69 @@ class TupleTest(seq_tests.CommonTest):
self.assertEqual(repr(a0), "()")
self.assertEqual(repr(a2), "(0, 1, 2)")
def _not_tracked(self, t):
# Helper: assert that t is not tracked by the GC once two collections
# have been forced.  t is also passed as the assertion failure message.
# Nested tuples can take several collections to untrack
gc.collect()
gc.collect()
self.assertFalse(gc.is_tracked(t), t)
def _tracked(self, t):
# Helper: assert that t is tracked by the GC right now, and that it is
# still tracked after two forced collections (i.e. the collector did
# not untrack it).  t is also passed as the assertion failure message.
self.assertTrue(gc.is_tracked(t), t)
gc.collect()
gc.collect()
self.assertTrue(gc.is_tracked(t), t)
def test_track_literals(self):
# Test GC-optimization of tuple literals
x, y, z = 1.5, "a", []
# Tuples holding only numbers, strings, type objects, object()
# instances or other such tuples (at any nesting depth) are expected
# to be untracked after collection.
self._not_tracked(())
self._not_tracked((1,))
self._not_tracked((1, 2))
self._not_tracked((1, 2, "a"))
self._not_tracked((1, 2, (None, True, False, ()), int))
self._not_tracked((object(),))
self._not_tracked(((1, x), y, (2, 3)))
# Tuples with mutable elements are always tracked, even if those
# elements are not tracked right now.
self._tracked(([],))
self._tracked(([1],))
self._tracked(({},))
self._tracked((set(),))
# z is a list: one mutable element among atomic ones is enough.
self._tracked((x, y, z))
def check_track_dynamic(self, tp, always_track):
# Shared checks for tuples built at runtime by calling tp.
# tp is the tuple type under test (the callers in this class pass
# tuple or a tuple subclass).  When always_track is true, even
# instances holding only atomic objects are expected to stay tracked;
# otherwise they are expected to end up untracked.
x, y, z = 1.5, "a", []
check = self._tracked if always_track else self._not_tracked
# Instances built from empty or all-atomic iterables, including
# one level of nesting.
check(tp())
check(tp([]))
check(tp(set()))
check(tp([1, x, y]))
check(tp(obj for obj in [1, x, y]))
check(tp(set([1, x, y])))
check(tp(tuple([obj]) for obj in [1, x, y]))
check(tuple(tp([obj]) for obj in [1, x, y]))
# Instances that reference a list or dict, directly or through a
# nested tuple, must be tracked whatever always_track says.
self._tracked(tp([z]))
self._tracked(tp([[x, y]]))
self._tracked(tp([{x: y}]))
self._tracked(tp(obj for obj in [x, y, z]))
self._tracked(tp(tuple([obj]) for obj in [x, y, z]))
self._tracked(tuple(tp([obj]) for obj in [x, y, z]))
def test_track_dynamic(self):
    # GC-optimization of tuples constructed at runtime: exact tuples
    # holding only atomic objects are expected to end up untracked.
    self.check_track_dynamic(tuple, always_track=False)
def test_track_subtypes(self):
    # Instances of tuple subclasses must stay tracked whatever they hold.
    class MyTuple(tuple):
        pass

    self.check_track_dynamic(MyTuple, always_track=True)
def test_main():
# Entry point: hand the TupleTest case to test_support.run_unittest.
test_support.run_unittest(TupleTest)

View File

@ -12,6 +12,11 @@ What's New in Python 2.7 alpha 1
Core and Builtins
-----------------
- Issue #4688: Add a heuristic so that tuples and dicts containing only
untrackable objects are not tracked by the garbage collector. This can
reduce the size of collections and therefore the garbage collection overhead
on long-running programs, depending on their particular use of datatypes.
- Issue #5512: Rewrite PyLong long division algorithm (x_divrem) to
improve its performance. Long divisions and remainder operations
are now between 50% and 150% faster.

View File

@ -432,7 +432,13 @@ move_unreachable(PyGC_Head *young, PyGC_Head *unreachable)
(void) traverse(op,
(visitproc)visit_reachable,
(void *)young);
next = gc->gc.gc_next;
next = gc->gc.gc_next;
if (PyTuple_CheckExact(op)) {
_PyTuple_MaybeUntrack(op);
}
else if (PyDict_CheckExact(op)) {
_PyDict_MaybeUntrack(op);
}
}
else {
/* This *may* be unreachable. To make progress,
@ -1264,6 +1270,26 @@ gc_get_objects(PyObject *self, PyObject *noargs)
return result;
}
PyDoc_STRVAR(gc_is_tracked__doc__,
"is_tracked(obj) -> bool\n"
"\n"
"Returns true if the object is tracked by the garbage collector.\n"
"Simple atomic objects will return false.\n"
);

/* Implementation of gc.is_tracked(obj).

   Returns a new reference to Py_True when obj passes PyObject_IS_GC and
   IS_TRACKED reports it as tracked, and a new reference to Py_False
   otherwise.  IS_TRACKED is only evaluated for objects that pass
   PyObject_IS_GC.  The self argument is not used. */
static PyObject *
gc_is_tracked(PyObject *self, PyObject *obj)
{
    PyObject *answer = Py_False;

    if (PyObject_IS_GC(obj) && IS_TRACKED(obj))
        answer = Py_True;
    Py_INCREF(answer);
    return answer;
}
PyDoc_STRVAR(gc__doc__,
"This module provides access to the garbage collector for reference cycles.\n"
@ -1278,6 +1304,7 @@ PyDoc_STRVAR(gc__doc__,
"set_threshold() -- Set the collection thresholds.\n"
"get_threshold() -- Return the current the collection thresholds.\n"
"get_objects() -- Return a list of all objects tracked by the collector.\n"
"is_tracked() -- Returns true if a given object is tracked.\n"
"get_referrers() -- Return the list of objects that refer to an object.\n"
"get_referents() -- Return the list of objects that an object refers to.\n");
@ -1293,6 +1320,7 @@ static PyMethodDef GcMethods[] = {
{"collect", (PyCFunction)gc_collect,
METH_VARARGS | METH_KEYWORDS, gc_collect__doc__},
{"get_objects", gc_get_objects,METH_NOARGS, gc_get_objects__doc__},
{"is_tracked", gc_is_tracked, METH_O, gc_is_tracked__doc__},
{"get_referrers", gc_get_referrers, METH_VARARGS,
gc_get_referrers__doc__},
{"get_referents", gc_get_referents, METH_VARARGS,

View File

@ -180,6 +180,24 @@ show_alloc(void)
}
#endif
/* Debug-only statistics on GC tracking of dicts, compiled in when
   SHOW_TRACK_COUNT is defined. */
#ifdef SHOW_TRACK_COUNT
static Py_ssize_t count_untracked = 0;
static Py_ssize_t count_tracked = 0;

/* Print to stderr the number of dicts created, the number of those
   currently counted as tracked, and the resulting tracking rate. */
static void
show_track(void)
{
    Py_ssize_t total = count_tracked + count_untracked;

    fprintf(stderr, "Dicts created: %" PY_FORMAT_SIZE_T "d\n", total);
    fprintf(stderr, "Dicts tracked by the GC: %" PY_FORMAT_SIZE_T "d\n",
            count_tracked);
    fprintf(stderr, "%.2f%% dict tracking rate\n\n",
            100.0 * count_tracked / total);
}
#endif
/* Initialization macros.
There are two ways to create a dict: PyDict_New() is the main C API
function, and the tp_new slot maps to dict_new(). In the latter case we
@ -232,6 +250,9 @@ PyDict_New(void)
#endif
#ifdef SHOW_ALLOC_COUNT
Py_AtExit(show_alloc);
#endif
#ifdef SHOW_TRACK_COUNT
Py_AtExit(show_track);
#endif
}
if (numfree) {
@ -262,10 +283,12 @@ PyDict_New(void)
#endif
}
mp->ma_lookup = lookdict_string;
#ifdef SHOW_TRACK_COUNT
count_untracked++;
#endif
#ifdef SHOW_CONVERSION_COUNTS
++created;
#endif
_PyObject_GC_TRACK(mp);
return (PyObject *)mp;
}
@ -433,6 +456,52 @@ lookdict_string(PyDictObject *mp, PyObject *key, register long hash)
return 0;
}
/* Bookkeeping for the SHOW_TRACK_COUNT debug statistic: move one dict from
   the "untracked" tally to the "tracked" tally (INCREASE) or back again
   (DECREASE).  Both macros expand to nothing when SHOW_TRACK_COUNT is not
   defined.  When it is defined the expansion already ends with a
   semicolon, so the macros are used without a trailing one. */
#ifdef SHOW_TRACK_COUNT
#define INCREASE_TRACK_COUNT \
    (count_tracked++, count_untracked--);
#define DECREASE_TRACK_COUNT \
    (count_tracked--, count_untracked++);
#else
#define INCREASE_TRACK_COUNT
#define DECREASE_TRACK_COUNT
#endif

/* Start GC tracking of dict mp if it is currently untracked and either the
   key or the value about to be stored may itself be tracked by the GC
   (see _PyObject_GC_MAY_BE_TRACKED).  Does nothing for a dict that is
   already tracked. */
#define MAINTAIN_TRACKING(mp, key, value) \
    do { \
        if (!_PyObject_GC_IS_TRACKED(mp)) { \
            if (_PyObject_GC_MAY_BE_TRACKED(key) || \
                _PyObject_GC_MAY_BE_TRACKED(value)) { \
                _PyObject_GC_TRACK(mp); \
                INCREASE_TRACK_COUNT \
            } \
        } \
    } while(0)

/* Stop GC tracking of op if it is an exact dict, is currently tracked, and
   none of its keys or values may be tracked by the GC.

   Objects that are not exact dicts, and dicts that are already untracked,
   are left alone.  Every slot of the hash table (indices 0..ma_mask) is
   examined; slots whose value is NULL are skipped.  As soon as one key or
   value satisfies _PyObject_GC_MAY_BE_TRACKED the dict is left tracked. */
void
_PyDict_MaybeUntrack(PyObject *op)
{
    PyDictObject *mp;
    PyObject *value;
    Py_ssize_t mask, i;
    PyDictEntry *ep;

    if (!PyDict_CheckExact(op) || !_PyObject_GC_IS_TRACKED(op))
        return;

    mp = (PyDictObject *) op;
    ep = mp->ma_table;
    mask = mp->ma_mask;
    for (i = 0; i <= mask; i++) {
        if ((value = ep[i].me_value) == NULL)
            continue;
        if (_PyObject_GC_MAY_BE_TRACKED(value) ||
            _PyObject_GC_MAY_BE_TRACKED(ep[i].me_key))
            return;
    }
    /* Mirror the INCREASE_TRACK_COUNT done by MAINTAIN_TRACKING so that
       the SHOW_TRACK_COUNT statistic follows the real tracking state. */
    DECREASE_TRACK_COUNT
    _PyObject_GC_UNTRACK(op);
}
/*
Internal routine to insert a new item into the table.
Used both by the internal resize routine and by the public insert routine.
@ -453,6 +522,7 @@ insertdict(register PyDictObject *mp, PyObject *key, long hash, PyObject *value)
Py_DECREF(value);
return -1;
}
MAINTAIN_TRACKING(mp, key, value);
if (ep->me_value != NULL) {
old_value = ep->me_value;
ep->me_value = value;
@ -492,6 +562,7 @@ insertdict_clean(register PyDictObject *mp, PyObject *key, long hash,
PyDictEntry *ep0 = mp->ma_table;
register PyDictEntry *ep;
MAINTAIN_TRACKING(mp, key, value);
i = hash & mask;
ep = &ep0[i];
for (perturb = hash; ep->me_key != NULL; perturb >>= PERTURB_SHIFT) {
@ -2202,8 +2273,17 @@ dict_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
assert(d->ma_table == NULL && d->ma_fill == 0 && d->ma_used == 0);
INIT_NONZERO_DICT_SLOTS(d);
d->ma_lookup = lookdict_string;
/* The object has been implicitly tracked by tp_alloc */
if (type == &PyDict_Type)
_PyObject_GC_UNTRACK(d);
#ifdef SHOW_CONVERSION_COUNTS
++created;
#endif
#ifdef SHOW_TRACK_COUNT
if (_PyObject_GC_IS_TRACKED(d))
count_tracked++;
else
count_untracked++;
#endif
}
return self;

View File

@ -23,11 +23,36 @@ Py_ssize_t fast_tuple_allocs;
Py_ssize_t tuple_zero_allocs;
#endif
/* Debug-only statistics on GC tracking of tuples, compiled in when
   SHOW_TRACK_COUNT is defined.
   Please note that tuples are only untracked when the GC examines them, and
   many of them die before that happens.  A tracking rate close to 100%
   therefore does not necessarily prove that the heuristic is inefficient.
*/
#ifdef SHOW_TRACK_COUNT
static Py_ssize_t count_untracked = 0;
static Py_ssize_t count_tracked = 0;

/* Print to stderr the number of tuples created, the number of those
   currently counted as tracked, and the resulting tracking rate. */
static void
show_track(void)
{
    const Py_ssize_t created = count_tracked + count_untracked;

    fprintf(stderr, "Tuples created: %" PY_FORMAT_SIZE_T "d\n", created);
    fprintf(stderr, "Tuples tracked by the GC: %" PY_FORMAT_SIZE_T "d\n",
            count_tracked);
    fprintf(stderr, "%.2f%% tuple tracking rate\n\n",
            100.0 * count_tracked / created);
}
#endif
PyObject *
PyTuple_New(register Py_ssize_t size)
{
register PyTupleObject *op;
Py_ssize_t i;
#ifdef SHOW_TRACK_COUNT
count_tracked++;
#endif
if (size < 0) {
PyErr_BadInternalCall();
return NULL;
@ -131,6 +156,32 @@ PyTuple_SetItem(register PyObject *op, register Py_ssize_t i, PyObject *newitem)
return 0;
}
/* Stop GC tracking of op if it is an exact tuple, is currently tracked, and
   every item is non-NULL and fails _PyObject_GC_MAY_BE_TRACKED.

   Objects that are not exact tuples, and tuples that are already
   untracked, are left alone.  When SHOW_TRACK_COUNT is defined, the debug
   counters are adjusted just before the tuple is untracked. */
void
_PyTuple_MaybeUntrack(PyObject *op)
{
    PyTupleObject *tuple;
    Py_ssize_t idx, size;

    if (!PyTuple_CheckExact(op))
        return;
    if (!_PyObject_GC_IS_TRACKED(op))
        return;

    tuple = (PyTupleObject *) op;
    size = Py_SIZE(tuple);
    for (idx = 0; idx < size; idx++) {
        PyObject *item = PyTuple_GET_ITEM(tuple, idx);

        /* Tuples with NULL items aren't fully constructed yet;
           don't untrack them. */
        if (item == NULL)
            return;
        /* An item that may be tracked keeps the tuple tracked. */
        if (_PyObject_GC_MAY_BE_TRACKED(item))
            return;
    }
#ifdef SHOW_TRACK_COUNT
    count_tracked--;
    count_untracked++;
#endif
    _PyObject_GC_UNTRACK(op);
}
PyObject *
PyTuple_Pack(Py_ssize_t n, ...)
{
@ -880,6 +931,9 @@ PyTuple_Fini(void)
(void)PyTuple_ClearFreeList();
#endif
#ifdef SHOW_TRACK_COUNT
show_track();
#endif
}
/*********************** Tuple Iterator **************************/