From 6943a29cbf2cc5a743cb55857b623f7ab3168190 Mon Sep 17 00:00:00 2001 From: Fred Drake Date: Mon, 13 Aug 2001 19:31:59 +0000 Subject: [PATCH] David Goodger : Documentation for difflib/ndiff refactoring: more of the ndiff functionality has been moved to the underlying library (difflib). This closes SF patch #445413. --- Doc/lib/libdifflib.tex | 267 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 240 insertions(+), 27 deletions(-) diff --git a/Doc/lib/libdifflib.tex b/Doc/lib/libdifflib.tex index ce191c02317..cc9a776eb2c 100644 --- a/Doc/lib/libdifflib.tex +++ b/Doc/lib/libdifflib.tex @@ -10,6 +10,48 @@ \versionadded{2.1} +\begin{classdesc*}{SequenceMatcher} + This is a flexible class for comparing pairs of sequences of any + type, so long as the sequence elements are hashable. The basic + algorithm predates, and is a little fancier than, an algorithm + published in the late 1980's by Ratcliff and Obershelp under the + hyperbolic name ``gestalt pattern matching.'' The idea is to find + the longest contiguous matching subsequence that contains no + ``junk'' elements (the Ratcliff and Obershelp algorithm doesn't + address junk). The same idea is then applied recursively to the + pieces of the sequences to the left and to the right of the matching + subsequence. This does not yield minimal edit sequences, but does + tend to yield matches that ``look right'' to people. + + \strong{Timing:} The basic Ratcliff-Obershelp algorithm is cubic + time in the worst case and quadratic time in the expected case. + \class{SequenceMatcher} is quadratic time for the worst case and has + expected-case behavior dependent in a complicated way on how many + elements the sequences have in common; best case time is linear. +\end{classdesc*} + +\begin{classdesc*}{Differ} + This is a class for comparing sequences of lines of text, and + producing human-readable differences or deltas. Differ uses + \class{SequenceMatcher} both to compare sequences of lines, and to + compare sequences of characters within similar (near-matching) + lines. + + Each line of a \class{Differ} delta begins with a two-letter code: + +\begin{tableii}{l|l}{code}{Code}{Meaning} + \lineii{'- '}{line unique to sequence 1} + \lineii{'+ '}{line unique to sequence 2} + \lineii{' '}{line common to both sequences} + \lineii{'? '}{line not present in either input sequence} +\end{tableii} + + Lines beginning with `\code{?~}' attempt to guide the eye to + intraline differences, and were not present in either input + sequence. These lines can be confusing if the sequences contain tab + characters. +\end{classdesc*} + \begin{funcdesc}{get_close_matches}{word, possibilities\optional{, n\optional{, cutoff}}} Return a list of the best ``good enough'' matches. \var{word} is a @@ -40,25 +82,85 @@ \end{verbatim} \end{funcdesc} -\begin{classdesc*}{SequenceMatcher} - This is a flexible class for comparing pairs of sequences of any - type, so long as the sequence elements are hashable. The basic - algorithm predates, and is a little fancier than, an algorithm - published in the late 1980's by Ratcliff and Obershelp under the - hyperbolic name ``gestalt pattern matching.'' The idea is to find - the longest contiguous matching subsequence that contains no - ``junk'' elements (the Ratcliff and Obershelp algorithm doesn't - address junk). The same idea is then applied recursively to the - pieces of the sequences to the left and to the right of the matching - subsequence. This does not yield minimal edit sequences, but does - tend to yield matches that ``look right'' to people. +\begin{funcdesc}{ndiff}{a, b\optional{, linejunk\optional{, + charjunk}}} + Compare \var{a} and \var{b} (lists of strings); return a + \class{Differ}-style delta. - \strong{Timing:} The basic Ratcliff-Obershelp algorithm is cubic - time in the worst case and quadratic time in the expected case. - \class{SequenceMatcher} is quadratic time for the worst case and has - expected-case behavior dependent in a complicated way on how many - elements the sequences have in common; best case time is linear. -\end{classdesc*} + Optional keyword parameters \var{linejunk} and \var{charjunk} are + for filter functions (or \code{None}): + + \var{linejunk}: A function that should accept a single string + argument, and return true if the string is junk (or false if it is + not). The default is module-level function + \function{IS_LINE_JUNK()}, which filters out lines without visible + characters, except for at most one pound character (\character{\#}). + + \var{charjunk}: A function that should accept a string of length 1. + The default is module-level function \function{IS_CHARACTER_JUNK()}, + which filters out whitespace characters (a blank or tab; note: bad + idea to include newline in this!). + + \file{Tools/scripts/ndiff.py} is a command-line front-end to this + function. + +\begin{verbatim} +>>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1), +... 'ore\ntree\nemu\n'.splitlines(1))) +>>> print ''.join(diff), +- one +? ^ ++ ore +? ^ +- two +- three +? - ++ tree ++ emu +\end{verbatim} +\end{funcdesc} + +\begin{funcdesc}{restore}{sequence, which} + Return one of the two sequences that generated a delta. + + Given a \var{sequence} produced by \method{Differ.compare()} or + \function{ndiff()}, extract lines originating from file 1 or 2 + (parameter \var{which}), stripping off line prefixes. + + Example: + +\begin{verbatim} +>>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1), +... 'ore\ntree\nemu\n'.splitlines(1)) +>>> print ''.join(restore(diff, 1)), +one +two +three +>>> print ''.join(restore(diff, 2)), +ore +tree +emu +\end{verbatim} + +\end{funcdesc} + + +\begin{funcdesc}{IS_LINE_JUNK}{line}: + + Return 1 for ignorable line: iff \var{line} is blank or contains a + single \character{\#}. Used as a default for parameter + \var{linejunk} in \function{ndiff()}. + +\end{funcdesc} + + +\begin{funcdesc}{IS_CHARACTER_JUNK}{ch}: + + Return 1 for ignorable character: iff \var{ch} is a space or tab. + Used as a default for parameter \var{charjunk} in + \function{ndiff()}. + +\end{funcdesc} \begin{seealso} @@ -231,9 +333,9 @@ replace a[3:4] (x) b[2:3] (y) range [0, 1]. Where T is the total number of elements in both sequences, and M is - the number of matches, this is 2.0*M / T. Note that this is \code{1.} - if the sequences are identical, and \code{0.} if they have nothing in - common. + the number of matches, this is 2.0*M / T. Note that this is + \code{1.0} if the sequences are identical, and \code{0.0} if they + have nothing in common. This is expensive to compute if \method{get_matching_blocks()} or \method{get_opcodes()} hasn't already been called, in which case you @@ -272,7 +374,7 @@ at least as large as \method{ratio()}: \end{verbatim} -\subsection{Examples \label{difflib-examples}} +\subsection{SequenceMatcher Examples \label{sequencematcher-examples}} This example compares two strings, considering blanks to be ``junk:'' @@ -321,11 +423,122 @@ insert a[8:8] b[8:17] equal a[14:29] b[23:38] \end{verbatim} -See \file{Tools/scripts/ndiff.py} from the Python source distribution -for a fancy human-friendly file differencer, which uses -\class{SequenceMatcher} both to view files as sequences of lines, and -lines as sequences of characters. - See also the function \function{get_close_matches()} in this module, which shows how simple code building on \class{SequenceMatcher} can be used to do useful work. + + +\subsection{Differ Objects \label{differ-objects}} + +Note that \class{Differ}-generated deltas make no claim to be +\strong{minimal} diffs. To the contrary, minimal diffs are often +counter-intuitive, because they synch up anywhere possible, sometimes +accidental matches 100 pages apart. Restricting synch points to +contiguous matches preserves some notion of locality, at the +occasional cost of producing a longer diff. + +The \class{Differ} class has this constructor: + +\begin{classdesc}{Differ}{\optional{linejunk\optional{, charjunk}}} + Optional keyword parameters \var{linejunk} and \var{charjunk} are + for filter functions (or \code{None}): + + \var{linejunk}: A function that should accept a single string + argument, and return true iff the string is junk. The default is + module-level function \function{IS_LINE_JUNK()}, which filters out + lines without visible characters, except for at most one pound + character (\character{\#}). + + \var{charjunk}: A function that should accept a string of length 1. + The default is module-level function \function{IS_CHARACTER_JUNK()}, + which filters out whitespace characters (a blank or tab; note: bad + idea to include newline in this!). +\end{classdesc} + +\class{Differ} objects are used (deltas generated) via a single +method: + +\begin{methoddesc}{compare}{a, b} + Compare two sequences of lines; return the resulting delta (list). + + Each sequence must contain individual single-line strings ending + with newlines. Such sequences can be obtained from the + \method{readlines()} method of file-like objects. The list returned + is also made up of newline-terminated strings, and ready to be used + with the \method{writelines()} method of a file-like object. +\end{methoddesc} + + +\subsection{Differ Example \label{differ-examples}} + +This example compares two texts. First we set up the texts, sequences +of individual single-line strings ending with newlines (such sequences +can also be obtained from the \method{readlines()} method of file-like +objects): + +\begin{verbatim} +>>> text1 = ''' 1. Beautiful is better than ugly. +... 2. Explicit is better than implicit. +... 3. Simple is better than complex. +... 4. Complex is better than complicated. +... '''.splitlines(1) +>>> len(text1) +4 +>>> text1[0][-1] +'\n' +>>> text2 = ''' 1. Beautiful is better than ugly. +... 3. Simple is better than complex. +... 4. Complicated is better than complex. +... 5. Flat is better than nested. +... '''.splitlines(1) +\end{verbatim} + +Next we instantiate a Differ object: + +\begin{verbatim} +>>> d = Differ() +\end{verbatim} + +Note that when instantiating a \class{Differ} object we may pass +functions to filter out line and character ``junk.'' See the +\method{Differ()} constructor for details. + +Finally, we compare the two: + +\begin{verbatim} +>>> result = d.compare(text1, text2) +\end{verbatim} + +\code{result} is a list of strings, so let's pretty-print it: + +\begin{verbatim} +>>> from pprint import pprint +>>> pprint(result) +[' 1. Beautiful is better than ugly.\n', + '- 2. Explicit is better than implicit.\n', + '- 3. Simple is better than complex.\n', + '+ 3. Simple is better than complex.\n', + '? ++ \n', + '- 4. Complex is better than complicated.\n', + '? ^ ---- ^ \n', + '+ 4. Complicated is better than complex.\n', + '? ++++ ^ ^ \n', + '+ 5. Flat is better than nested.\n'] +\end{verbatim} + +As a single multi-line string it looks like this: + +\begin{verbatim} +>>> import sys +>>> sys.stdout.writelines(result) + 1. Beautiful is better than ugly. +- 2. Explicit is better than implicit. +- 3. Simple is better than complex. ++ 3. Simple is better than complex. +? ++ +- 4. Complex is better than complicated. +? ^ ---- ^ ++ 4. Complicated is better than complex. +? ++++ ^ ^ ++ 5. Flat is better than nested. +\end{verbatim}