gh-119105: Differ.compare is too slow [for degenerate cases] (#119492)

``_fancy_replace()`` is no longer recursive, and a single call does a worst-case linear number of ratio() computations instead of quadratic. This renders toothless a universe of pathological cases. Some inputs may produce different output, but that's rare, and I didn't find a case where the final diff appeared to be of materially worse quality. To the contrary, by refusing to even consider synching on lines "far apart", there was more easy-to-digest locality in the output.
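Editor's note: as a rough illustration of the class of inputs this targets, here is a hypothetical stress case (not taken from the issue itself). No line of ``a`` equals any line of ``b``, yet every pair is "similar", so the old code had an enormous universe of candidate synch points to rank; with the bounded search, each line of ``b`` costs at most a small, fixed number of ratio() calls.

    import difflib

    # Hypothetical stress input (illustrative only): all lines are mutually
    # similar but never identical, so the whole block is handed to
    # _fancy_replace() as one big "replace" region.
    a = [f"common filler text, variant {i:04d}\n" for i in range(300)]
    b = [f"common filler text, variant {i:04d}!\n" for i in range(300, 600)]

    d = difflib.Differ()
    result = list(d.compare(a, b))
    print(len(result), "output lines")

How long the old code took on inputs like this depended on the exact similarity scores; the point of the patch is that the new code's cost per line of ``b`` no longer depends on them at all.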
Tim Peters 2024-05-24 22:08:21 -05:00 committed by GitHub
parent 84be5244de
commit de19694cfb
2 changed files with 46 additions and 71 deletions

Lib/difflib.py

@@ -908,79 +908,52 @@ class Differ:
         + abcdefGhijkl
         ?    ^  ^  ^
         """
-        from operator import ge, gt
-        # Don't synch up unless the lines have a similarity score of at
-        # least cutoff; best_ratio tracks the best score seen so far.
-        # Keep track of all index pairs achieving the best ratio and
-        # deal with them here. Previously only the smallest pair was
-        # handled here, and if there are many pairs with the best ratio,
-        # recursion could grow very deep, and runtime cubic. See:
+        # Don't synch up unless the lines have a similarity score above
+        # cutoff. Previously only the smallest pair was handled here,
+        # and if there are many pairs with the best ratio, recursion
+        # could grow very deep, and runtime cubic. See:
         # https://github.com/python/cpython/issues/119105
-        best_ratio, cutoff = 0.74, 0.75
+        #
+        # Later, more pathological cases prompted removing recursion
+        # entirely.
+        cutoff = 0.74999
         cruncher = SequenceMatcher(self.charjunk)
-        eqi, eqj = None, None   # 1st indices of equal lines (if any)
-        # List of index pairs achieving best_ratio. Strictly increasing
-        # in both index positions.
-        max_pairs = []
-        maxi = -1 # `i` index of last pair in max_pairs
-
-        # search for the pair that matches best without being identical
-        # (identical lines must be junk lines, & we don't want to synch
-        # up on junk -- unless we have to)
         crqr = cruncher.real_quick_ratio
         cqr = cruncher.quick_ratio
         cr = cruncher.ratio
-        for j in range(blo, bhi):
-            bj = b[j]
-            cruncher.set_seq2(bj)
-            # Find new best, if possible. Else search for the smallest i
-            # (if any) > maxi that equals the best ratio
-            search_equal = True
-            for i in range(alo, ahi):
-                ai = a[i]
-                if ai == bj:
-                    if eqi is None:
-                        eqi, eqj = i, j
-                    continue
-                cruncher.set_seq1(ai)
-                # computing similarity is expensive, so use the quick
-                # upper bounds first -- have seen this speed up messy
-                # compares by a factor of 3.
-                cmp = ge if search_equal and i > maxi else gt
-                if (cmp(crqr(), best_ratio)
-                      and cmp(cqr(), best_ratio)
-                      and cmp((ratio := cr()), best_ratio)):
-                    if ratio > best_ratio:
-                        best_ratio = ratio
-                        max_pairs.clear()
-                    else:
-                        assert best_ratio == ratio and search_equal
-                        assert i > maxi
-                    max_pairs.append((i, j))
-                    maxi = i
-                    search_equal = False
-        if best_ratio < cutoff:
-            assert not max_pairs
-            # no non-identical "pretty close" pair
-            if eqi is None:
-                # no identical pair either -- treat it as a straight replace
-                yield from self._plain_replace(a, alo, ahi, b, blo, bhi)
-                return
-            # no close pair, but an identical pair -- synch up on that
-            max_pairs = [(eqi, eqj)]
-        else:
-            # there's a close pair, so forget the identical pair (if any)
-            assert max_pairs
-            eqi = None
-
-        last_i, last_j = alo, blo
-        for this_i, this_j in max_pairs:
-            # pump out diffs from before the synch point
-            yield from self._fancy_helper(a, last_i, this_i,
-                                          b, last_j, this_j)
+
+        WINDOW = 10
+        best_i = best_j = None
+        dump_i, dump_j = alo, blo # smallest indices not yet resolved
+        for j in range(blo, bhi):
+            cruncher.set_seq2(b[j])
+            # Search the corresponding i's within WINDOW for the highest
+            # ratio greater than `cutoff`.
+            aequiv = alo + (j - blo)
+            arange = range(max(aequiv - WINDOW, dump_i),
+                           min(aequiv + WINDOW + 1, ahi))
+            if not arange: # likely exit if `a` is shorter than `b`
+                break
+            best_ratio = cutoff
+            for i in arange:
+                cruncher.set_seq1(a[i])
+                # Ordering by cheapest to most expensive ratio is very
+                # valuable, most often getting out early.
+                if (crqr() > best_ratio
+                      and cqr() > best_ratio
+                      and cr() > best_ratio):
+                    best_i, best_j, best_ratio = i, j, cr()
+
+            if best_i is None:
+                # found nothing to synch on yet - move to next j
+                continue
+
+            # pump out straight replace from before this synch pair
+            yield from self._fancy_helper(a, dump_i, best_i,
+                                          b, dump_j, best_j)
             # do intraline marking on the synch pair
-            aelt, belt = a[this_i], b[this_j]
-            if eqi is None:
+            aelt, belt = a[best_i], b[best_j]
+            if aelt != belt:
                 # pump out a '-', '?', '+', '?' quad for the synched lines
                 atags = btags = ""
                 cruncher.set_seqs(aelt, belt)
@@ -1002,17 +975,18 @@ class Differ:
             else:
                 # the synch pair is identical
                 yield '  ' + aelt
-            last_i, last_j = this_i + 1, this_j + 1
+            dump_i, dump_j = best_i + 1, best_j + 1
+            best_i = best_j = None

-        # pump out diffs from after the last synch point
-        yield from self._fancy_helper(a, last_i, ahi,
-                                      b, last_j, bhi)
+        # pump out straight replace from after the last synch pair
+        yield from self._fancy_helper(a, dump_i, ahi,
+                                      b, dump_j, bhi)

     def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
         g = []
         if alo < ahi:
             if blo < bhi:
-                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
+                g = self._plain_replace(a, alo, ahi, b, blo, bhi)
             else:
                 g = self._dump('-', a, alo, ahi)
         elif blo < bhi:
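Editor's note (not part of the patch): the heart of the new approach is the bounded index window. For each line b[j], only the indices of ``a`` within WINDOW of the "corresponding" position are ever scored, which is what caps the work per line. A minimal standalone restatement of that computation, with a hypothetical helper name:

    WINDOW = 10   # same constant as in the patched _fancy_replace()

    def synch_candidates(alo, ahi, blo, j, dump_i):
        # Mirrors `aequiv`/`arange` above: b[j]'s natural partner is
        # a[alo + (j - blo)]; only indices within WINDOW of it, and not
        # already resolved (>= dump_i), are considered.
        aequiv = alo + (j - blo)
        return range(max(aequiv - WINDOW, dump_i),
                     min(aequiv + WINDOW + 1, ahi))

    print(list(synch_candidates(alo=0, ahi=100, blo=0, j=30, dump_i=0)))
    # indices 20 through 40 inclusive: at most 2*WINDOW + 1 per line of b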

Misc/NEWS.d (news entry)

@@ -0,0 +1 @@
+``difflib``'s ``Differ.compare()`` (and so also ``ndiff``) can no longer be provoked into cubic-time behavior, or into unbounded recursion, and should generally be faster in ordinary cases too. Results may change in some cases, although that should be rare. Correctness of diffs is not affected. Some similar lines far apart may be reported as deleting one and adding the other, where before they were displayed on adjacent output lines with markup showing the intraline differences.
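Editor's note: a hedged sketch of the behavioral change described in the news entry, using a made-up input; the comment states the expected shape of the output rather than a guaranteed result, since the exact diff depends on the similarity scores.

    import difflib

    # Illustrative input (not from the issue): every line differs, so the whole
    # block goes through _fancy_replace(), and the two "fox" lines are about 30
    # lines apart -- well outside the 10-line search window.
    a = (["the quick brown fox jumps over the lazy dog\n"]
         + [f"unrelated content A {i}\n" for i in range(30)])
    b = ([f"unrelated content B {i}\n" for i in range(30)]
         + ["the quick brown fox jumped over the lazy dog\n"])

    print(''.join(difflib.ndiff(a, b)), end="")
    # With the windowed search the two "fox" lines are expected to appear as a
    # separate '-' and '+'; older versions could pair them on adjacent output
    # lines with '?' intraline markup.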