diff --git a/Lib/test/test_sort.py b/Lib/test/test_sort.py index 3b6ad4d17b0..2a7cfb7affa 100644 --- a/Lib/test/test_sort.py +++ b/Lib/test/test_sort.py @@ -128,6 +128,27 @@ class TestBase(unittest.TestCase): x = [e for e, i in augmented] # a stable sort of s check("stability", x, s) + def test_small_stability(self): + from itertools import product + from operator import itemgetter + + # Exhaustively test stability across all lists of small lengths + # and only a few distinct elements. + # This can provoke edge cases that randomization is unlikely to find. + # But it can grow very expensive quickly, so don't overdo it. + NELTS = 3 + MAXSIZE = 9 + + pick0 = itemgetter(0) + for length in range(MAXSIZE + 1): + # There are NELTS ** length distinct lists. + for t in product(range(NELTS), repeat=length): + xs = list(zip(t, range(length))) + # Stability forced by index in each element. + forced = sorted(xs) + # Use key= to hide the index from compares. + native = sorted(xs, key=pick0) + self.assertEqual(forced, native) #============================================================================== class TestBugs(unittest.TestCase): diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-03-11-00-45-39.gh-issue-116554.gYumG5.rst b/Misc/NEWS.d/next/Core and Builtins/2024-03-11-00-45-39.gh-issue-116554.gYumG5.rst new file mode 100644 index 00000000000..82f92789de0 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2024-03-11-00-45-39.gh-issue-116554.gYumG5.rst @@ -0,0 +1 @@ +``list.sort()`` now exploits more cases of partial ordering, particularly those with long descending runs with sub-runs of equal values. Those are recognized as single runs now (previously, each block of repeated values caused a new run to be created). diff --git a/Objects/listobject.c b/Objects/listobject.c index 759902c06b4..7179d5929e2 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1618,10 +1618,11 @@ struct s_MergeState { /* binarysort is the best method for sorting small arrays: it does few compares, but can do data movement quadratic in the number of elements. - [lo, hi) is a contiguous slice of a list, and is sorted via + [lo.keys, hi) is a contiguous slice of a list of keys, and is sorted via binary insertion. This sort is stable. - On entry, must have lo <= start <= hi, and that [lo, start) is already - sorted (pass start == lo if you don't know!). + On entry, must have lo.keys <= start <= hi, and that + [lo.keys, start) is already sorted (pass start == lo.keys if you don't + know!). If islt() complains return -1, else 0. Even in case of error, the output slice will be some permutation of the input (nothing is lost or duplicated). @@ -1634,7 +1635,7 @@ binarysort(MergeState *ms, sortslice lo, PyObject **hi, PyObject **start) PyObject *pivot; assert(lo.keys <= start && start <= hi); - /* assert [lo, start) is sorted */ + /* assert [lo.keys, start) is sorted */ if (lo.keys == start) ++start; for (; start < hi; ++start) { @@ -1643,9 +1644,9 @@ binarysort(MergeState *ms, sortslice lo, PyObject **hi, PyObject **start) r = start; pivot = *r; /* Invariants: - * pivot >= all in [lo, l). + * pivot >= all in [lo.keys, l). * pivot < all in [r, start). - * The second is vacuously true at the start. + * These are vacuously true at the start. */ assert(l < r); do { @@ -1656,7 +1657,7 @@ binarysort(MergeState *ms, sortslice lo, PyObject **hi, PyObject **start) l = p+1; } while (l < r); assert(l == r); - /* The invariants still hold, so pivot >= all in [lo, l) and + /* The invariants still hold, so pivot >= all in [lo.keys, l) and pivot < all in [l, start), so pivot belongs at l. Note that if there are elements equal to pivot, l points to the first slot after them -- that's why this sort is stable. @@ -1671,7 +1672,7 @@ binarysort(MergeState *ms, sortslice lo, PyObject **hi, PyObject **start) p = start + offset; pivot = *p; l += offset; - for (p = start + offset; p > l; --p) + for ( ; p > l; --p) *p = *(p-1); *l = pivot; } @@ -1682,56 +1683,115 @@ binarysort(MergeState *ms, sortslice lo, PyObject **hi, PyObject **start) return -1; } +static void +sortslice_reverse(sortslice *s, Py_ssize_t n) +{ + reverse_slice(s->keys, &s->keys[n]); + if (s->values != NULL) + reverse_slice(s->values, &s->values[n]); +} + /* -Return the length of the run beginning at lo, in the slice [lo, hi). lo < hi -is required on entry. "A run" is the longest ascending sequence, with - - lo[0] <= lo[1] <= lo[2] <= ... - -or the longest descending sequence, with - - lo[0] > lo[1] > lo[2] > ... - -Boolean *descending is set to 0 in the former case, or to 1 in the latter. -For its intended use in a stable mergesort, the strictness of the defn of -"descending" is needed so that the caller can safely reverse a descending -sequence without violating stability (strict > ensures there are no equal -elements to get out of order). +Return the length of the run beginning at slo->keys, spanning no more than +nremaining elements. The run beginning there may be ascending or descending, +but the function permutes it in place, if needed, so that it's always ascending +upon return. Returns -1 in case of error. */ static Py_ssize_t -count_run(MergeState *ms, PyObject **lo, PyObject **hi, int *descending) +count_run(MergeState *ms, sortslice *slo, Py_ssize_t nremaining) { - Py_ssize_t k; + Py_ssize_t k; /* used by IFLT macro expansion */ Py_ssize_t n; + PyObject ** const lo = slo->keys; - assert(lo < hi); - *descending = 0; - ++lo; - if (lo == hi) - return 1; + /* In general, as things go on we've established that the slice starts + with a monotone run of n elements, starting at lo. */ - n = 2; - IFLT(*lo, *(lo-1)) { - *descending = 1; - for (lo = lo+1; lo < hi; ++lo, ++n) { - IFLT(*lo, *(lo-1)) - ; - else + /* We're n elements into the slice, and the most recent neq+1 elments are + * all equal. This reverses them in-place, and resets neq for reuse. + */ +#define REVERSE_LAST_NEQ \ + if (neq) { \ + sortslice slice = *slo; \ + ++neq; \ + sortslice_advance(&slice, n - neq); \ + sortslice_reverse(&slice, neq); \ + neq = 0; \ + } + + /* Sticking to only __lt__ compares is confusing and error-prone. But in + * this routine, almost all uses of IFLT can be captured by tiny macros + * giving mnemonic names to the intent. Note that inline functions don't + * work for this (IFLT expands to code including `goto fail`). + */ +#define IF_NEXT_LARGER IFLT(lo[n-1], lo[n]) +#define IF_NEXT_SMALLER IFLT(lo[n], lo[n-1]) + + assert(nremaining); + /* try ascending run first */ + for (n = 1; n < nremaining; ++n) { + IF_NEXT_SMALLER + break; + } + if (n == nremaining) + return n; + /* lo[n] is strictly less */ + /* If n is 1 now, then the first compare established it's a descending + * run, so fall through to the descending case. But if n > 1, there are + * n elements in an ascending run terminated by the strictly less lo[n]. + * If the first key < lo[n-1], *somewhere* along the way the sequence + * increased, so we're done (there is no descending run). + * Else first key >= lo[n-1], which implies that the entire ascending run + * consists of equal elements. In that case, this is a descending run, + * and we reverse the all-equal prefix in-place. + */ + if (n > 1) { + IFLT(lo[0], lo[n-1]) + return n; + sortslice_reverse(slo, n); + } + ++n; /* in all cases it's been established that lo[n] has been resolved */ + + /* Finish descending run. All-squal subruns are reversed in-place on the + * fly. Their original order will be restored at the end by the whole-slice + * reversal. + */ + Py_ssize_t neq = 0; + for ( ; n < nremaining; ++n) { + IF_NEXT_SMALLER { + /* This ends the most recent run of equal elments, but still in + * the "descending" direction. + */ + REVERSE_LAST_NEQ + } + else { + IF_NEXT_LARGER /* descending run is over */ break; + else /* not x < y and not y < x implies x == y */ + ++neq; } } - else { - for (lo = lo+1; lo < hi; ++lo, ++n) { - IFLT(*lo, *(lo-1)) - break; - } + REVERSE_LAST_NEQ + sortslice_reverse(slo, n); /* transform to ascending run */ + + /* And after reversing, it's possible this can be extended by a + * naturally increasing suffix; e.g., [3, 2, 3, 4, 1] makes an + * ascending run from the first 4 elements. + */ + for ( ; n < nremaining; ++n) { + IF_NEXT_SMALLER + break; } return n; fail: return -1; + +#undef REVERSE_LAST_NEQ +#undef IF_NEXT_SMALLER +#undef IF_NEXT_LARGER } /* @@ -2449,14 +2509,6 @@ merge_compute_minrun(Py_ssize_t n) return n + r; } -static void -reverse_sortslice(sortslice *s, Py_ssize_t n) -{ - reverse_slice(s->keys, &s->keys[n]); - if (s->values != NULL) - reverse_slice(s->values, &s->values[n]); -} - /* Here we define custom comparison functions to optimize for the cases one commonly * encounters in practice: homogeneous lists, often of one of the basic types. */ @@ -2824,15 +2876,12 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) */ minrun = merge_compute_minrun(nremaining); do { - int descending; Py_ssize_t n; /* Identify next run. */ - n = count_run(&ms, lo.keys, lo.keys + nremaining, &descending); + n = count_run(&ms, &lo, nremaining); if (n < 0) goto fail; - if (descending) - reverse_sortslice(&lo, n); /* If short, extend to min(minrun, nremaining). */ if (n < minrun) { const Py_ssize_t force = nremaining <= minrun ? diff --git a/Objects/listsort.txt b/Objects/listsort.txt index 32a59e510f0..4f84e2c87da 100644 --- a/Objects/listsort.txt +++ b/Objects/listsort.txt @@ -212,24 +212,43 @@ A detailed description of timsort follows. Runs ---- -count_run() returns the # of elements in the next run. A run is either -"ascending", which means non-decreasing: +count_run() returns the # of elements in the next run, and, if it's a +descending run, reverses it in-place. A run is either "ascending", which +means non-decreasing: a0 <= a1 <= a2 <= ... -or "descending", which means strictly decreasing: +or "descending", which means non-increasing: - a0 > a1 > a2 > ... + a0 >= a1 >= a2 >= ... Note that a run is always at least 2 long, unless we start at the array's -last element. +last element. If all elements in the array are equal, it can be viewed as +both ascending and descending. Upon return, the run count_run() identifies +is always ascending. -The definition of descending is strict, because the main routine reverses -a descending run in-place, transforming a descending run into an ascending -run. Reversal is done via the obvious fast "swap elements starting at each -end, and converge at the middle" method, and that can violate stability if -the slice contains any equal elements. Using a strict definition of -descending ensures that a descending run contains distinct elements. +Reversal is done via the obvious fast "swap elements starting at each +end, and converge at the middle" method. That can violate stability if +the slice contains any equal elements. For that reason, for a long time +the code used strict inequality (">" rather than ">=") in its definition +of descending. + +Removing that restriction required some complication: when processing a +descending run, all-equal sub-runs of elements are reversed in-place, on the +fly. Their original relative order is restored "by magic" via the final +"reverse the entire run" step. + +This makes processing descending runs a little more costly. We only use +`__lt__` comparisons, so that `x == y` has to be deduced from +`not x < y and not y < x`. But so long as a run remains strictly decreasing, +only one of those compares needs to be done per loop iteration. So the primsry +extra cost is paid only when there are equal elements, and they get some +compensating benefit by not needing to end the descending run. + +There's one more trick added since the original: after reversing a descending +run, it's possible that it can be extended by an adjacent ascending run. For +example, given [3, 2, 1, 3, 4, 5, 0], the 3-element descending prefix is +reversed in-place, and then extended by [3, 4, 5]. If an array is random, it's very unlikely we'll see long runs. If a natural run contains less than minrun elements (see next section), the main loop