From e05f65a0c6fb0b59fe72bcb8055eb74a3f63bff8 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sat, 10 Aug 2002 05:21:15 +0000 Subject: [PATCH] 1. Combined the base and length arrays into a single array of structs. This is friendlier for caches. 2. Cut MIN_GALLOP to 7, but added a per-sort min_gallop vrbl that adapts the "get into galloping mode" threshold higher when galloping isn't paying, and lower when it is. There's no known case where this hurts. It's (of course) neutral for /sort, \sort and =sort. It also happens to be neutral for !sort. It cuts a tiny # of compares in 3sort and +sort. For *sort, it reduces the # of compares to better than what this used to do when MIN_GALLOP was hardcoded to 10 (it did about 0.1% more *sort compares before, but given how close we are to the limit, this is "a lot"!). %sort used to do about 1.5% more compares, and ~sort about 3.6% more. Here are exact counts: i *sort 3sort +sort %sort ~sort !sort 15 449235 33019 33016 51328 188720 65534 before 448885 33016 33007 50426 182083 65534 after 0.08% 0.01% 0.03% 1.79% 3.65% 0.00% %ch from after 16 963714 65824 65809 103409 377634 131070 962991 65821 65808 101667 364341 131070 0.08% 0.00% 0.00% 1.71% 3.65% 0.00% 17 2059092 131413 131362 209130 755476 262142 2057533 131410 131361 206193 728871 262142 0.08% 0.00% 0.00% 1.42% 3.65% 0.00% 18 4380687 262440 262460 421998 1511174 524286 4377402 262437 262459 416347 1457945 524286 0.08% 0.00% 0.00% 1.36% 3.65% 0.00% 19 9285709 524581 524634 848590 3022584 1048574 9278734 524580 524633 837947 2916107 1048574 0.08% 0.00% 0.00% 1.27% 3.65% 0.00% 20 19621118 1048960 1048942 1715806 6045418 2097150 19606028 1048958 1048941 1694896 5832445 2097150 0.08% 0.00% 0.00% 1.23% 3.65% 0.00% 3. Added some key asserts I overlooked before. 4. Updated the doc file. --- Objects/listobject.c | 84 ++++++++++++++++++++++++++++---------------- Objects/listsort.txt | 61 ++++++++++++++++++++------------ 2 files changed, 92 insertions(+), 53 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index 7fad905060b..cea8597f231 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1122,11 +1122,10 @@ fail: */ #define MAX_MERGE_PENDING 85 -/* If a run wins MIN_GALLOP times in a row, we switch to galloping mode, - * and stay there until both runs win less often than MIN_GALLOP - * consecutive times. See listsort.txt for more info. +/* When we get into galloping mode, we stay there until both runs win less + * often than MIN_GALLOP consecutive times. See listsort.txt for more info. */ -#define MIN_GALLOP 8 +#define MIN_GALLOP 7 /* Avoid malloc for small temp arrays. */ #define MERGESTATE_TEMP_SIZE 256 @@ -1134,10 +1133,21 @@ fail: /* One MergeState exists on the stack per invocation of mergesort. It's just * a convenient way to pass state around among the helper functions. */ +struct s_slice { + PyObject **base; + int len; +}; + typedef struct s_MergeState { /* The user-supplied comparison function. or NULL if none given. */ PyObject *compare; + /* This controls when we get *into* galloping mode. It's initialized + * to MIN_GALLOP. merge_lo and merge_hi tend to nudge it higher for + * random data, and lower for highly structured data. + */ + int min_gallop; + /* 'a' is temp storage to help with merges. It contains room for * alloced entries. */ @@ -1148,14 +1158,13 @@ typedef struct s_MergeState { * address base[i] and extends for len[i] elements. It's always * true (so long as the indices are in bounds) that * - * base[i] + len[i] == base[i+1] + * pending[i].base + pending[i].len == pending[i+1].base * * so we could cut the storage for this, but it's a minor amount, * and keeping all the info explicit simplifies the code. */ int n; - PyObject **base[MAX_MERGE_PENDING]; - int len[MAX_MERGE_PENDING]; + struct s_slice pending[MAX_MERGE_PENDING]; /* 'a' points to this when possible, rather than muck with malloc. */ PyObject *temparray[MERGESTATE_TEMP_SIZE]; @@ -1170,6 +1179,7 @@ merge_init(MergeState *ms, PyObject *compare) ms->a = ms->temparray; ms->alloced = MERGESTATE_TEMP_SIZE; ms->n = 0; + ms->min_gallop = MIN_GALLOP; } /* Free all the temp memory owned by the MergeState. This must be called @@ -1224,6 +1234,7 @@ merge_lo(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) PyObject *compare; PyObject **dest; int result = -1; /* guilty until proved innocent */ + int min_gallop = ms->min_gallop; assert(ms && pa && pb && na > 0 && nb > 0 && pa + na == pb); if (MERGE_GETMEM(ms, na) < 0) @@ -1248,6 +1259,7 @@ merge_lo(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) * appears to win consistently. */ for (;;) { + assert(na > 1 && nb > 0); k = ISLT(*pb, *pa, compare); if (k) { if (k < 0) @@ -1258,7 +1270,7 @@ merge_lo(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) --nb; if (nb == 0) goto Succeed; - if (bcount >= MIN_GALLOP) + if (bcount >= min_gallop) break; } else { @@ -1268,7 +1280,7 @@ merge_lo(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) --na; if (na == 1) goto CopyB; - if (acount >= MIN_GALLOP) + if (acount >= min_gallop) break; } } @@ -1278,7 +1290,11 @@ merge_lo(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) * (if ever) neither run appears to be winning consistently * anymore. */ + ++min_gallop; do { + assert(na > 1 && nb > 0); + min_gallop -= min_gallop > 1; + ms->min_gallop = min_gallop; k = gallop_right(*pb, pa, na, 0, compare); acount = k; if (k) { @@ -1319,6 +1335,8 @@ merge_lo(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) if (na == 1) goto CopyB; } while (acount >= MIN_GALLOP || bcount >= MIN_GALLOP); + ++min_gallop; /* penalize it for leaving galloping mode */ + ms->min_gallop = min_gallop; } Succeed: result = 0; @@ -1349,6 +1367,7 @@ merge_hi(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) int result = -1; /* guilty until proved innocent */ PyObject **basea; PyObject **baseb; + int min_gallop = ms->min_gallop; assert(ms && pa && pb && na > 0 && nb > 0 && pa + na == pb); if (MERGE_GETMEM(ms, nb) < 0) @@ -1376,6 +1395,7 @@ merge_hi(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) * appears to win consistently. */ for (;;) { + assert(na > 0 && nb > 1); k = ISLT(*pb, *pa, compare); if (k) { if (k < 0) @@ -1386,7 +1406,7 @@ merge_hi(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) --na; if (na == 0) goto Succeed; - if (acount >= MIN_GALLOP) + if (acount >= min_gallop) break; } else { @@ -1396,7 +1416,7 @@ merge_hi(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) --nb; if (nb == 1) goto CopyA; - if (bcount >= MIN_GALLOP) + if (bcount >= min_gallop) break; } } @@ -1406,7 +1426,11 @@ merge_hi(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) * (if ever) neither run appears to be winning consistently * anymore. */ + ++min_gallop; do { + assert(na > 0 && nb > 1); + min_gallop -= min_gallop > 1; + ms->min_gallop = min_gallop; k = gallop_right(*pb, basea, na, na-1, compare); if (k < 0) goto Fail; @@ -1449,6 +1473,8 @@ merge_hi(MergeState *ms, PyObject **pa, int na, PyObject **pb, int nb) if (na == 0) goto Succeed; } while (acount >= MIN_GALLOP || bcount >= MIN_GALLOP); + ++min_gallop; /* penalize it for leaving galloping mode */ + ms->min_gallop = min_gallop; } Succeed: result = 0; @@ -1482,10 +1508,10 @@ merge_at(MergeState *ms, int i) assert(i >= 0); assert(i == ms->n - 2 || i == ms->n - 3); - pa = ms->base[i]; - pb = ms->base[i+1]; - na = ms->len[i]; - nb = ms->len[i+1]; + pa = ms->pending[i].base; + na = ms->pending[i].len; + pb = ms->pending[i+1].base; + nb = ms->pending[i+1].len; assert(na > 0 && nb > 0); assert(pa + na == pb); @@ -1493,11 +1519,9 @@ merge_at(MergeState *ms, int i) * run now, also slide over the last run (which isn't involved * in this merge). The current run i+1 goes away in any case. */ - if (i == ms->n - 3) { - ms->len[i+1] = ms->len[i+2]; - ms->base[i+1] = ms->base[i+2]; - } - ms->len[i] = na + nb; + ms->pending[i].len = na + nb; + if (i == ms->n - 3) + ms->pending[i+1] = ms->pending[i+2]; --ms->n; /* Where does b start in a? Elements in a before that can be @@ -1541,18 +1565,18 @@ merge_at(MergeState *ms, int i) static int merge_collapse(MergeState *ms) { - int *len = ms->len; + struct s_slice *p = ms->pending; assert(ms); while (ms->n > 1) { int n = ms->n - 2; - if (n > 0 && len[n-1] <= len[n] + len[n+1]) { - if (len[n-1] < len[n+1]) + if (n > 0 && p[n-1].len <= p[n].len + p[n+1].len) { + if (p[n-1].len < p[n+1].len) --n; if (merge_at(ms, n) < 0) return -1; } - else if (len[n] <= len[n+1]) { + else if (p[n].len <= p[n+1].len) { if (merge_at(ms, n) < 0) return -1; } @@ -1570,12 +1594,12 @@ merge_collapse(MergeState *ms) static int merge_force_collapse(MergeState *ms) { - int *len = ms->len; + struct s_slice *p = ms->pending; assert(ms); while (ms->n > 1) { int n = ms->n - 2; - if (n > 0 && len[n-1] < len[n+1]) + if (n > 0 && p[n-1].len < p[n+1].len) --n; if (merge_at(ms, n) < 0) return -1; @@ -1664,8 +1688,8 @@ listsort(PyListObject *self, PyObject *args) } /* Push run onto pending-runs stack, and maybe merge. */ assert(ms.n < MAX_MERGE_PENDING); - ms.base[ms.n] = lo; - ms.len[ms.n] = n; + ms.pending[ms.n].base = lo; + ms.pending[ms.n].len = n; ++ms.n; if (merge_collapse(&ms) < 0) goto fail; @@ -1678,8 +1702,8 @@ listsort(PyListObject *self, PyObject *args) if (merge_force_collapse(&ms) < 0) goto fail; assert(ms.n == 1); - assert(ms.base[0] == self->ob_item); - assert(ms.len[0] == self->ob_size); + assert(ms.pending[0].base == self->ob_item); + assert(ms.pending[0].len == self->ob_size); succeed: result = Py_None; diff --git a/Objects/listsort.txt b/Objects/listsort.txt index 545ce513d02..c5612888803 100644 --- a/Objects/listsort.txt +++ b/Objects/listsort.txt @@ -95,31 +95,31 @@ Comparison with Python's Samplesort Hybrid below that, it's either astronomically lucky, or is finding exploitable structure in the data. - n lg(n!) *sort 3sort +sort %sort ~sort !sort -------- ------- ------ -------- ------- ------ ------- -------- - 32768 444255 453096 453614 32908 452871 130491 469141 old - 449235 33019 33016 51328 188720 65534 new - 0.86% 1273.80% -0.33% 782.31% -30.85% 615.87% %ch from new + n lg(n!) *sort 3sort +sort %sort ~sort !sort +------- ------- ------ ------- ------- ------ ------- -------- + 32768 444255 453096 453614 32908 452871 130491 469141 old + 448885 33016 33007 50426 182083 65534 new + 0.94% 1273.92% -0.30% 798.09% -28.33% 615.87% %ch from new 65536 954037 972699 981940 65686 973104 260029 1004607 - 963714 65824 65809 103409 377634 131070 - 0.93% 1391.77% -0.19% 841.02% -31.14% 666.47% + 962991 65821 65808 101667 364341 131070 + 1.01% 1391.83% -0.19% 857.15% -28.63% 666.47% 131072 2039137 2101881 2091491 131232 2092894 554790 2161379 - 2059092 131413 131362 209130 755476 262142 - 2.08% 1491.54% -0.10% 900.76% -26.56% 724.51% + 2057533 131410 131361 206193 728871 262142 + 2.16% 1491.58% -0.10% 915.02% -23.88% 724.51% 262144 4340409 4464460 4403233 262314 4445884 1107842 4584560 - 4380687 262440 262460 421998 1511174 524286 - 1.91% 1577.81% -0.06% 953.53% -26.69% 774.44% + 4377402 262437 262459 416347 1457945 524286 + 1.99% 1577.82% -0.06% 967.83% -24.01% 774.44% 524288 9205096 9453356 9408463 524468 9441930 2218577 9692015 - 9285709 524581 524634 848590 3022584 1048574 - 1.81% 1693.52% -0.03% 1012.66% -26.60% 824.30% + 9278734 524580 524633 837947 2916107 1048574 + 1.88% 1693.52% -0.03% 1026.79% -23.92% 824.30% 1048576 19458756 19950272 19838588 1048766 19912134 4430649 20434212 - 19621118 1048960 1048942 1715806 6045418 2097150 - 1.68% 1791.26% -0.02% 1060.51% -26.71% 874.38% + 19606028 1048958 1048941 1694896 5832445 2097150 + 1.76% 1791.27% -0.02% 1074.83% -24.03% 874.38% Discussion of cases: @@ -171,7 +171,6 @@ Comparison with Python's Samplesort Hybrid bytes each on this box) needed by each test, again with arguments "15 20 1": - 2**i *sort \sort /sort 3sort +sort %sort ~sort =sort !sort 32768 16384 0 0 6256 0 10821 12288 0 16383 65536 32766 0 0 21652 0 31276 24576 0 32767 @@ -430,6 +429,11 @@ etc. We stay in galloping mode until both searches find slices to copy less than MIN_GALLOP elements long, at which point we go back to one-pair- at-a-time mode. +A refinement: The MergeState struct contains the value of min_gallop that +controls when we enter galloping mode, initialized to MIN_GALLOP. +merge_lo() and merge_hi() adjust this higher when gallooping isn't paying +off, and lower when it is. + Galloping --------- @@ -536,13 +540,21 @@ at the other values. At and after i=6, galloping always wins. We can't guess in advance when it's going to win, though, so we do one pair at a time until the evidence seems strong that galloping may pay. MIN_GALLOP -is 8 as I type this, and that's pretty strong evidence. However, if the data -is random, it simply will trigger galloping mode purely by luck every now -and again, and it's quite likely to hit one of the losing cases next. 8 -favors protecting against a slowdown on random data at the expense of giving -up small wins on lightly clustered data, and tiny marginal wins on highly -clustered data (they win huge anyway, and if you're getting a factor of -10 speedup, another percent just isn't worth fighting for). +is 7, and that's pretty strong evidence. However, if the data is random, it +simply will trigger galloping mode purely by luck every now and again, and +it's quite likely to hit one of the losing cases next. On the other hand, +in cases like ~sort, galloping always pays, and MIN_GALLOP is larger than it +"should be" then. So the MergeState struct keeps a min_gallop variable +that merge_lo and merge_hi adjust: the longer we stay in galloping mode, +the smaller min_gallop gets, making it easier to transition back to +galloping mode (if we ever leave it in the current merge, and at the +start of the next merge). But whenever the gallop loop doesn't pay, +min_gallop is increased by one, making it harder to transition to back +to galloping mode (and again both within a merge and across merges). For +random data, this all but eliminates the gallop penalty: min_gallop grows +large enough that we almost never get into galloping mode. And for cases +like ~sort, min_gallop can fall to as low as 1. This seems to work well, +but in all it's a minor improvement over using a fixed MIN_GALLOP value. Galloping Complication @@ -567,6 +579,9 @@ wildly unbalanced runs already enjoys excellent performance. Comparing Average # of Compares on Random Arrays ------------------------------------------------ +[NOTE: This was done when the new algorithm used about 0.1% more compares + on random data than does its current incarnation.] + Here list.sort() is samplesort, and list.msort() this sort: """