From 14b78f5fc54fa96a084b26597ede891e88b9fc05 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Fri, 9 Jan 2009 22:27:08 +0000 Subject: [PATCH] Merged revisions 68462 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ........ r68462 | antoine.pitrou | 2009-01-09 22:40:55 +0100 (ven., 09 janv. 2009) | 6 lines Issue #4074: Change the criteria for doing a full garbage collection (i.e. collecting the oldest generation) so that allocating lots of objects without destroying them does not show quadratic performance. Based on a proposal by Martin von Löwis at http://mail.python.org/pipermail/python-dev/2008-June/080579.html. ........ --- Misc/NEWS | 6 +++++ Modules/gcmodule.c | 66 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/Misc/NEWS b/Misc/NEWS index 570f647f936..7832505816e 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -12,6 +12,12 @@ What's New in Python 3.1 alpha 0 Core and Builtins ----------------- +- Issue #4074: Change the criteria for doing a full garbage collection (i.e. + collecting the oldest generation) so that allocating lots of objects without + destroying them does not show quadratic performance. Based on a proposal by + Martin von Löwis at + http://mail.python.org/pipermail/python-dev/2008-June/080579.html. + - Issue #4604: Some objects of the I/O library could still be used after having been closed (for instance, a read() call could return some previously buffered data). Patch by Dmitry Vasiliev. diff --git a/Modules/gcmodule.c b/Modules/gcmodule.c index 8462ee93faa..2474721eeb6 100644 --- a/Modules/gcmodule.c +++ b/Modules/gcmodule.c @@ -68,6 +68,55 @@ static PyObject *gc_str = NULL; /* Python string used to look for __del__ attribute. */ static PyObject *delstr = NULL; +/* This is the number of objects who survived the last full collection. It + approximates the number of long lived objects tracked by the GC. + + (by "full collection", we mean a collection of the oldest generation). +*/ +static Py_ssize_t long_lived_total = 0; + +/* This is the number of objects who survived all "non-full" collections, + and are awaiting to undergo a full collection for the first time. + +*/ +static Py_ssize_t long_lived_pending = 0; + +/* + NOTE: about the counting of long-lived objects. + + To limit the cost of garbage collection, there are two strategies; + - make each collection faster, e.g. by scanning fewer objects + - do less collections + This heuristic is about the latter strategy. + + In addition to the various configurable thresholds, we only trigger a + full collection if the ratio + long_lived_pending / long_lived_total + is above a given value (hardwired to 25%). + + The reason is that, while "non-full" collections (i.e., collections of + the young and middle generations) will always examine roughly the same + number of objects -- determined by the aforementioned thresholds --, + the cost of a full collection is proportional to the total number of + long-lived objects, which is virtually unbounded. + + Indeed, it has been remarked that doing a full collection every + of object creations entails a dramatic performance + degradation in workloads which consist in creating and storing lots of + long-lived objects (e.g. building a large list of GC-tracked objects would + show quadratic performance, instead of linear as expected: see issue #4074). + + Using the above ratio, instead, yields amortized linear performance in + the total number of objects (the effect of which can be summarized + thusly: "each full garbage collection is more and more costly as the + number of objects grows, but we do fewer and fewer of them"). + + This heuristic was suggested by Martin von Löwis on python-dev in + June 2008. His original analysis and proposal can be found at: + http://mail.python.org/pipermail/python-dev/2008-June/080579.html +*/ + + /* set for debugging information */ #define DEBUG_STATS (1<<0) /* print collection statistics */ #define DEBUG_COLLECTABLE (1<<1) /* print collectable objects */ @@ -795,8 +844,16 @@ collect(int generation) move_unreachable(young, &unreachable); /* Move reachable objects to next generation. */ - if (young != old) + if (young != old) { + if (generation == NUM_GENERATIONS - 2) { + long_lived_pending += gc_list_size(young); + } gc_list_merge(young, old); + } + else { + long_lived_pending = 0; + long_lived_total = gc_list_size(young); + } /* All objects in unreachable are trash, but objects reachable from * finalizers can't safely be deleted. Python programmers should take @@ -890,6 +947,13 @@ collect_generations(void) * generations younger than it will be collected. */ for (i = NUM_GENERATIONS-1; i >= 0; i--) { if (generations[i].count > generations[i].threshold) { + /* Avoid quadratic performance degradation in number + of tracked objects. See comments at the beginning + of this file, and issue #4074. + */ + if (i == NUM_GENERATIONS - 1 + && long_lived_pending < long_lived_total / 4) + continue; n = collect(i); break; }