mirror of https://github.com/python/cpython
SequenceMatcher.get_matching_blocks(): This now guarantees that
adjacent triples in the result list describe non-adjacent matching blocks. That's _nice_ to have, and Guido said he wanted it. Not a bugfix candidate: Guido or not ;-), this changes visible endcase semantics (note that some tests had to change), and nothing about this was documented before. Since it was working as designed, and behavior was consistent with the docs, it wasn't "a bug".
This commit is contained in:
parent
5d7c290b7b
commit
43898b4f64
|
@ -419,6 +419,16 @@ of the other sequences.
|
|||
len(\var{b}), 0)}. It is the only triple with \code{\var{n} == 0}.
|
||||
% Explain why a dummy is used!
|
||||
|
||||
If
|
||||
\code{(\var{i}, \var{j}, \var{n})} and
|
||||
\code{(\var{i'}, \var{j'}, \var{n'})} are adjacent triples in the list,
|
||||
and the second is not the last triple in the list, then
|
||||
\code{\var{i}+\var{n} != \var{i'}} or
|
||||
\code{\var{j}+\var{n} != \var{j'}}; in other words, adjacent triples
|
||||
always describe non-adjacent equal blocks.
|
||||
\versionchanged[The guarantee that adjacent triples always describe
|
||||
non-adjacent blocks was implemented]{2.5}
|
||||
|
||||
\begin{verbatim}
|
||||
>>> s = SequenceMatcher(None, "abxcd", "abcd")
|
||||
>>> s.get_matching_blocks()
|
||||
|
|
|
@ -86,8 +86,7 @@ class SequenceMatcher:
|
|||
>>> for block in s.get_matching_blocks():
|
||||
... print "a[%d] and b[%d] match for %d elements" % block
|
||||
a[0] and b[0] match for 8 elements
|
||||
a[8] and b[17] match for 6 elements
|
||||
a[14] and b[23] match for 15 elements
|
||||
a[8] and b[17] match for 21 elements
|
||||
a[29] and b[38] match for 0 elements
|
||||
|
||||
Note that the last tuple returned by .get_matching_blocks() is always a
|
||||
|
@ -101,8 +100,7 @@ class SequenceMatcher:
|
|||
... print "%6s a[%d:%d] b[%d:%d]" % opcode
|
||||
equal a[0:8] b[0:8]
|
||||
insert a[8:8] b[8:17]
|
||||
equal a[8:14] b[17:23]
|
||||
equal a[14:29] b[23:38]
|
||||
equal a[8:29] b[17:38]
|
||||
|
||||
See the Differ class for a fancy human-friendly file differencer, which
|
||||
uses SequenceMatcher both to compare sequences of lines, and to compare
|
||||
|
@ -461,7 +459,11 @@ class SequenceMatcher:
|
|||
|
||||
Each triple is of the form (i, j, n), and means that
|
||||
a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in
|
||||
i and in j.
|
||||
i and in j. New in Python 2.5, it's also guaranteed that if
|
||||
(i, j, n) and (i', j', n') are adjacent triples in the list, and
|
||||
the second is not the last triple in the list, then i+n != i' or
|
||||
j+n != j'. IOW, adjacent triples never describe adjacent equal
|
||||
blocks.
|
||||
|
||||
The last triple is a dummy, (len(a), len(b), 0), and is the only
|
||||
triple with n==0.
|
||||
|
@ -474,7 +476,6 @@ class SequenceMatcher:
|
|||
if self.matching_blocks is not None:
|
||||
return self.matching_blocks
|
||||
la, lb = len(self.a), len(self.b)
|
||||
self.matching_blocks = matching_blocks = []
|
||||
|
||||
# This is most naturally expressed as a recursive algorithm, but
|
||||
# at least one user bumped into extreme use cases that exceeded
|
||||
|
@ -483,6 +484,7 @@ class SequenceMatcher:
|
|||
# results to `matching_blocks` in a loop; the matches are sorted
|
||||
# at the end.
|
||||
queue = [(0, la, 0, lb)]
|
||||
matching_blocks = []
|
||||
while queue:
|
||||
alo, ahi, blo, bhi = queue.pop()
|
||||
i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
|
||||
|
@ -496,8 +498,32 @@ class SequenceMatcher:
|
|||
if i+k < ahi and j+k < bhi:
|
||||
queue.append((i+k, ahi, j+k, bhi))
|
||||
matching_blocks.sort()
|
||||
matching_blocks.append( (la, lb, 0) )
|
||||
return matching_blocks
|
||||
|
||||
# It's possible that we have adjacent equal blocks in the
|
||||
# matching_blocks list now. Starting with 2.5, this code was added
|
||||
# to collapse them.
|
||||
i1 = j1 = k1 = 0
|
||||
non_adjacent = []
|
||||
for i2, j2, k2 in matching_blocks:
|
||||
# Is this block adjacent to i1, j1, k1?
|
||||
if i1 + k1 == i2 and j1 + k1 == j2:
|
||||
# Yes, so collapse them -- this just increases the length of
|
||||
# the first block by the length of the second, and the first
|
||||
# block so lengthebd remains the block to compare against.
|
||||
k1 += k2
|
||||
else:
|
||||
# Not adjacent. Remember the first block (k1==0 means it's
|
||||
# the dummy we started with), and make the second block the
|
||||
# new block to compare against.
|
||||
if k1:
|
||||
non_adjacent.append((i1, j1, k1))
|
||||
i1, j1, k1 = i2, j2, k2
|
||||
if k1:
|
||||
non_adjacent.append((i1, j1, k1))
|
||||
|
||||
non_adjacent.append( (la, lb, 0) )
|
||||
self.matching_blocks = non_adjacent
|
||||
return self.matching_blocks
|
||||
|
||||
def get_opcodes(self):
|
||||
"""Return list of 5-tuples describing how to turn a into b.
|
||||
|
|
|
@ -156,6 +156,12 @@ Extension Modules
|
|||
Library
|
||||
-------
|
||||
|
||||
- ``difflib``'s ``SequenceMatcher.get_matching_blocks()`` was changed to
|
||||
guarantee that adjacent triples in the return list always describe
|
||||
non-adjacent blocks. Previously, a pair of matching blocks could end
|
||||
up being described by multiple adjacent triples that formed a partition
|
||||
of the matching pair.
|
||||
|
||||
- Bug #1498146: fix optparse to handle Unicode strings in option help,
|
||||
description, and epilog.
|
||||
|
||||
|
|
Loading…
Reference in New Issue