python-2.5.2/win32/Lib/difflib.py
changeset 0 ae805ac0140d
equal deleted inserted replaced
-1:000000000000 0:ae805ac0140d
       
     1 #! /usr/bin/env python
       
     2 
       
     3 """
       
     4 Module difflib -- helpers for computing deltas between objects.
       
     5 
       
     6 Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
       
     7     Use SequenceMatcher to return list of the best "good enough" matches.
       
     8 
       
     9 Function context_diff(a, b):
       
    10     For two lists of strings, return a delta in context diff format.
       
    11 
       
    12 Function ndiff(a, b):
       
    13     Return a delta: the difference between `a` and `b` (lists of strings).
       
    14 
       
    15 Function restore(delta, which):
       
    16     Return one of the two sequences that generated an ndiff delta.
       
    17 
       
    18 Function unified_diff(a, b):
       
    19     For two lists of strings, return a delta in unified diff format.
       
    20 
       
    21 Class SequenceMatcher:
       
    22     A flexible class for comparing pairs of sequences of any type.
       
    23 
       
    24 Class Differ:
       
    25     For producing human-readable deltas from sequences of lines of text.
       
    26 
       
    27 Class HtmlDiff:
       
    28     For producing HTML side by side comparison with change highlights.
       
    29 """
       
    30 
       
# Names exported by "from difflib import *"; everything else is private.
__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
           'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
           'unified_diff', 'HtmlDiff']
       
    34 
       
    35 import heapq
       
    36 
       
    37 def _calculate_ratio(matches, length):
       
    38     if length:
       
    39         return 2.0 * matches / length
       
    40     return 1.0
       
    41 
       
    42 class SequenceMatcher:
       
    43 
       
    44     """
       
    45     SequenceMatcher is a flexible class for comparing pairs of sequences of
       
    46     any type, so long as the sequence elements are hashable.  The basic
       
    47     algorithm predates, and is a little fancier than, an algorithm
       
    48     published in the late 1980's by Ratcliff and Obershelp under the
       
    49     hyperbolic name "gestalt pattern matching".  The basic idea is to find
       
    50     the longest contiguous matching subsequence that contains no "junk"
       
    51     elements (R-O doesn't address junk).  The same idea is then applied
       
    52     recursively to the pieces of the sequences to the left and to the right
       
    53     of the matching subsequence.  This does not yield minimal edit
       
    54     sequences, but does tend to yield matches that "look right" to people.
       
    55 
       
    56     SequenceMatcher tries to compute a "human-friendly diff" between two
       
    57     sequences.  Unlike e.g. UNIX(tm) diff, the fundamental notion is the
       
    58     longest *contiguous* & junk-free matching subsequence.  That's what
       
    59     catches peoples' eyes.  The Windows(tm) windiff has another interesting
       
    60     notion, pairing up elements that appear uniquely in each sequence.
       
    61     That, and the method here, appear to yield more intuitive difference
       
    62     reports than does diff.  This method appears to be the least vulnerable
       
    63     to synching up on blocks of "junk lines", though (like blank lines in
       
    64     ordinary text files, or maybe "<P>" lines in HTML files).  That may be
       
    65     because this is the only method of the 3 that has a *concept* of
       
    66     "junk" <wink>.
       
    67 
       
    68     Example, comparing two strings, and considering blanks to be "junk":
       
    69 
       
    70     >>> s = SequenceMatcher(lambda x: x == " ",
       
    71     ...                     "private Thread currentThread;",
       
    72     ...                     "private volatile Thread currentThread;")
       
    73     >>>
       
    74 
       
    75     .ratio() returns a float in [0, 1], measuring the "similarity" of the
       
    76     sequences.  As a rule of thumb, a .ratio() value over 0.6 means the
       
    77     sequences are close matches:
       
    78 
       
    79     >>> print round(s.ratio(), 3)
       
    80     0.866
       
    81     >>>
       
    82 
       
    83     If you're only interested in where the sequences match,
       
    84     .get_matching_blocks() is handy:
       
    85 
       
    86     >>> for block in s.get_matching_blocks():
       
    87     ...     print "a[%d] and b[%d] match for %d elements" % block
       
    88     a[0] and b[0] match for 8 elements
       
    89     a[8] and b[17] match for 21 elements
       
    90     a[29] and b[38] match for 0 elements
       
    91 
       
    92     Note that the last tuple returned by .get_matching_blocks() is always a
       
    93     dummy, (len(a), len(b), 0), and this is the only case in which the last
       
    94     tuple element (number of elements matched) is 0.
       
    95 
       
    96     If you want to know how to change the first sequence into the second,
       
    97     use .get_opcodes():
       
    98 
       
    99     >>> for opcode in s.get_opcodes():
       
   100     ...     print "%6s a[%d:%d] b[%d:%d]" % opcode
       
   101      equal a[0:8] b[0:8]
       
   102     insert a[8:8] b[8:17]
       
   103      equal a[8:29] b[17:38]
       
   104 
       
   105     See the Differ class for a fancy human-friendly file differencer, which
       
   106     uses SequenceMatcher both to compare sequences of lines, and to compare
       
   107     sequences of characters within similar (near-matching) lines.
       
   108 
       
   109     See also function get_close_matches() in this module, which shows how
       
   110     simple code building on SequenceMatcher can be used to do useful work.
       
   111 
       
   112     Timing:  Basic R-O is cubic time worst case and quadratic time expected
       
   113     case.  SequenceMatcher is quadratic time for the worst case and has
       
   114     expected-case behavior dependent in a complicated way on how many
       
   115     elements the sequences have in common; best case time is linear.
       
   116 
       
   117     Methods:
       
   118 
       
   119     __init__(isjunk=None, a='', b='')
       
   120         Construct a SequenceMatcher.
       
   121 
       
   122     set_seqs(a, b)
       
   123         Set the two sequences to be compared.
       
   124 
       
   125     set_seq1(a)
       
   126         Set the first sequence to be compared.
       
   127 
       
   128     set_seq2(b)
       
   129         Set the second sequence to be compared.
       
   130 
       
   131     find_longest_match(alo, ahi, blo, bhi)
       
   132         Find longest matching block in a[alo:ahi] and b[blo:bhi].
       
   133 
       
   134     get_matching_blocks()
       
   135         Return list of triples describing matching subsequences.
       
   136 
       
   137     get_opcodes()
       
   138         Return list of 5-tuples describing how to turn a into b.
       
   139 
       
   140     ratio()
       
   141         Return a measure of the sequences' similarity (float in [0,1]).
       
   142 
       
   143     quick_ratio()
       
   144         Return an upper bound on .ratio() relatively quickly.
       
   145 
       
   146     real_quick_ratio()
       
   147         Return an upper bound on ratio() very quickly.
       
   148     """
       
   149 
       
   150     def __init__(self, isjunk=None, a='', b=''):
       
   151         """Construct a SequenceMatcher.
       
   152 
       
   153         Optional arg isjunk is None (the default), or a one-argument
       
   154         function that takes a sequence element and returns true iff the
       
   155         element is junk.  None is equivalent to passing "lambda x: 0", i.e.
       
   156         no elements are considered to be junk.  For example, pass
       
   157             lambda x: x in " \\t"
       
   158         if you're comparing lines as sequences of characters, and don't
       
   159         want to synch up on blanks or hard tabs.
       
   160 
       
   161         Optional arg a is the first of two sequences to be compared.  By
       
   162         default, an empty string.  The elements of a must be hashable.  See
       
   163         also .set_seqs() and .set_seq1().
       
   164 
       
   165         Optional arg b is the second of two sequences to be compared.  By
       
   166         default, an empty string.  The elements of b must be hashable. See
       
   167         also .set_seqs() and .set_seq2().
       
   168         """
       
   169 
       
   170         # Members:
       
   171         # a
       
   172         #      first sequence
       
   173         # b
       
   174         #      second sequence; differences are computed as "what do
       
   175         #      we need to do to 'a' to change it into 'b'?"
       
   176         # b2j
       
   177         #      for x in b, b2j[x] is a list of the indices (into b)
       
   178         #      at which x appears; junk elements do not appear
       
   179         # fullbcount
       
   180         #      for x in b, fullbcount[x] == the number of times x
       
   181         #      appears in b; only materialized if really needed (used
       
   182         #      only for computing quick_ratio())
       
   183         # matching_blocks
       
   184         #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
       
   185         #      ascending & non-overlapping in i and in j; terminated by
       
   186         #      a dummy (len(a), len(b), 0) sentinel
       
   187         # opcodes
       
   188         #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
       
   189         #      one of
       
   190         #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
       
   191         #          'delete'    a[i1:i2] should be deleted
       
   192         #          'insert'    b[j1:j2] should be inserted
       
   193         #          'equal'     a[i1:i2] == b[j1:j2]
       
   194         # isjunk
       
   195         #      a user-supplied function taking a sequence element and
       
   196         #      returning true iff the element is "junk" -- this has
       
   197         #      subtle but helpful effects on the algorithm, which I'll
       
   198         #      get around to writing up someday <0.9 wink>.
       
   199         #      DON'T USE!  Only __chain_b uses this.  Use isbjunk.
       
   200         # isbjunk
       
   201         #      for x in b, isbjunk(x) == isjunk(x) but much faster;
       
   202         #      it's really the has_key method of a hidden dict.
       
   203         #      DOES NOT WORK for x in a!
       
   204         # isbpopular
       
   205         #      for x in b, isbpopular(x) is true iff b is reasonably long
       
   206         #      (at least 200 elements) and x accounts for more than 1% of
       
   207         #      its elements.  DOES NOT WORK for x in a!
       
   208 
       
   209         self.isjunk = isjunk
       
   210         self.a = self.b = None
       
   211         self.set_seqs(a, b)
       
   212 
       
    def set_seqs(self, a, b):
        """Set the two sequences to be compared.

        >>> s = SequenceMatcher()
        >>> s.set_seqs("abcd", "bcde")
        >>> s.ratio()
        0.75
        """

        # set_seq2() does the expensive preprocessing (__chain_b), so when
        # comparing one sequence against many, prefer calling set_seq2()
        # once and set_seq1() repeatedly instead of this convenience.
        self.set_seq1(a)
        self.set_seq2(b)
       
   224 
       
   225     def set_seq1(self, a):
       
   226         """Set the first sequence to be compared.
       
   227 
       
   228         The second sequence to be compared is not changed.
       
   229 
       
   230         >>> s = SequenceMatcher(None, "abcd", "bcde")
       
   231         >>> s.ratio()
       
   232         0.75
       
   233         >>> s.set_seq1("bcde")
       
   234         >>> s.ratio()
       
   235         1.0
       
   236         >>>
       
   237 
       
   238         SequenceMatcher computes and caches detailed information about the
       
   239         second sequence, so if you want to compare one sequence S against
       
   240         many sequences, use .set_seq2(S) once and call .set_seq1(x)
       
   241         repeatedly for each of the other sequences.
       
   242 
       
   243         See also set_seqs() and set_seq2().
       
   244         """
       
   245 
       
   246         if a is self.a:
       
   247             return
       
   248         self.a = a
       
   249         self.matching_blocks = self.opcodes = None
       
   250 
       
   251     def set_seq2(self, b):
       
   252         """Set the second sequence to be compared.
       
   253 
       
   254         The first sequence to be compared is not changed.
       
   255 
       
   256         >>> s = SequenceMatcher(None, "abcd", "bcde")
       
   257         >>> s.ratio()
       
   258         0.75
       
   259         >>> s.set_seq2("abcd")
       
   260         >>> s.ratio()
       
   261         1.0
       
   262         >>>
       
   263 
       
   264         SequenceMatcher computes and caches detailed information about the
       
   265         second sequence, so if you want to compare one sequence S against
       
   266         many sequences, use .set_seq2(S) once and call .set_seq1(x)
       
   267         repeatedly for each of the other sequences.
       
   268 
       
   269         See also set_seqs() and set_seq1().
       
   270         """
       
   271 
       
   272         if b is self.b:
       
   273             return
       
   274         self.b = b
       
   275         self.matching_blocks = self.opcodes = None
       
   276         self.fullbcount = None
       
   277         self.__chain_b()
       
   278 
       
   279     # For each element x in b, set b2j[x] to a list of the indices in
       
   280     # b where x appears; the indices are in increasing order; note that
       
   281     # the number of times x appears in b is len(b2j[x]) ...
       
   282     # when self.isjunk is defined, junk elements don't show up in this
       
   283     # map at all, which stops the central find_longest_match method
       
   284     # from starting any matching block at a junk element ...
       
   285     # also creates the fast isbjunk function ...
       
   286     # b2j also does not contain entries for "popular" elements, meaning
       
   287     # elements that account for more than 1% of the total elements, and
       
   288     # when the sequence is reasonably large (>= 200 elements); this can
       
   289     # be viewed as an adaptive notion of semi-junk, and yields an enormous
       
   290     # speedup when, e.g., comparing program files with hundreds of
       
   291     # instances of "return NULL;" ...
       
   292     # note that this is only called when b changes; so for cross-product
       
   293     # kinds of matches, it's best to call set_seq2 once, then set_seq1
       
   294     # repeatedly
       
   295 
       
   296     def __chain_b(self):
       
   297         # Because isjunk is a user-defined (not C) function, and we test
       
   298         # for junk a LOT, it's important to minimize the number of calls.
       
   299         # Before the tricks described here, __chain_b was by far the most
       
   300         # time-consuming routine in the whole module!  If anyone sees
       
   301         # Jim Roskind, thank him again for profile.py -- I never would
       
   302         # have guessed that.
       
   303         # The first trick is to build b2j ignoring the possibility
       
   304         # of junk.  I.e., we don't call isjunk at all yet.  Throwing
       
   305         # out the junk later is much cheaper than building b2j "right"
       
   306         # from the start.
       
   307         b = self.b
       
   308         n = len(b)
       
   309         self.b2j = b2j = {}
       
   310         populardict = {}
       
   311         for i, elt in enumerate(b):
       
   312             if elt in b2j:
       
   313                 indices = b2j[elt]
       
   314                 if n >= 200 and len(indices) * 100 > n:
       
   315                     populardict[elt] = 1
       
   316                     del indices[:]
       
   317                 else:
       
   318                     indices.append(i)
       
   319             else:
       
   320                 b2j[elt] = [i]
       
   321 
       
   322         # Purge leftover indices for popular elements.
       
   323         for elt in populardict:
       
   324             del b2j[elt]
       
   325 
       
   326         # Now b2j.keys() contains elements uniquely, and especially when
       
   327         # the sequence is a string, that's usually a good deal smaller
       
   328         # than len(string).  The difference is the number of isjunk calls
       
   329         # saved.
       
   330         isjunk = self.isjunk
       
   331         junkdict = {}
       
   332         if isjunk:
       
   333             for d in populardict, b2j:
       
   334                 for elt in d.keys():
       
   335                     if isjunk(elt):
       
   336                         junkdict[elt] = 1
       
   337                         del d[elt]
       
   338 
       
   339         # Now for x in b, isjunk(x) == x in junkdict, but the
       
   340         # latter is much faster.  Note too that while there may be a
       
   341         # lot of junk in the sequence, the number of *unique* junk
       
   342         # elements is probably small.  So the memory burden of keeping
       
   343         # this dict alive is likely trivial compared to the size of b2j.
       
   344         self.isbjunk = junkdict.has_key
       
   345         self.isbpopular = populardict.has_key
       
   346 
       
    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        # Bind to locals once; these are read in the innermost loop.
        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        nothing = []   # shared empty default for the b2j.get miss case
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}
            for j in b2j.get(a[i], nothing):
                # a[i] matches b[j]
                if j < blo:
                    continue
                if j >= bhi:
                    break
                # Extend the diagonal run ending at (i-1, j-1) by one.
                k = newj2len[j] = j2lenget(j-1, 0) + 1
                if k > bestsize:
                    besti, bestj, bestsize = i-k+1, j-k+1, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize += 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return besti, bestj, bestsize
       
   456 
       
    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
        i and in j.  New in Python 2.5, it's also guaranteed that if
        (i, j, n) and (i', j', n') are adjacent triples in the list, and
        the second is not the last triple in the list, then i+n != i' or
        j+n != j'.  IOW, adjacent triples never describe adjacent equal
        blocks.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n==0.

        >>> s = SequenceMatcher(None, "abxcd", "abcd")
        >>> s.get_matching_blocks()
        [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
        """

        # Serve the memoized answer if neither sequence has changed since
        # the last computation (set_seq1/set_seq2 reset this to None).
        if self.matching_blocks is not None:
            return self.matching_blocks
        la, lb = len(self.a), len(self.b)

        # This is most naturally expressed as a recursive algorithm, but
        # at least one user bumped into extreme use cases that exceeded
        # the recursion limit on their box.  So, now we maintain a list
        # ('queue`) of blocks we still need to look at, and append partial
        # results to `matching_blocks` in a loop; the matches are sorted
        # at the end.
        queue = [(0, la, 0, lb)]
        matching_blocks = []
        while queue:
            alo, ahi, blo, bhi = queue.pop()
            i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
            # a[alo:i] vs b[blo:j] unknown
            # a[i:i+k] same as b[j:j+k]
            # a[i+k:ahi] vs b[j+k:bhi] unknown
            if k:   # if k is 0, there was no matching block
                matching_blocks.append(x)
                # Recurse (iteratively) into the unknown regions on each
                # side of the match.
                if alo < i and blo < j:
                    queue.append((alo, i, blo, j))
                if i+k < ahi and j+k < bhi:
                    queue.append((i+k, ahi, j+k, bhi))
        matching_blocks.sort()

        # It's possible that we have adjacent equal blocks in the
        # matching_blocks list now.  Starting with 2.5, this code was added
        # to collapse them.
        i1 = j1 = k1 = 0
        non_adjacent = []
        for i2, j2, k2 in matching_blocks:
            # Is this block adjacent to i1, j1, k1?
            if i1 + k1 == i2 and j1 + k1 == j2:
                # Yes, so collapse them -- this just increases the length of
                # the first block by the length of the second, and the first
                # block so lengthened remains the block to compare against.
                k1 += k2
            else:
                # Not adjacent.  Remember the first block (k1==0 means it's
                # the dummy we started with), and make the second block the
                # new block to compare against.
                if k1:
                    non_adjacent.append((i1, j1, k1))
                i1, j1, k1 = i2, j2, k2
        if k1:
            non_adjacent.append((i1, j1, k1))

        # Terminating sentinel; callers rely on this dummy being present.
        non_adjacent.append( (la, lb, 0) )
        self.matching_blocks = non_adjacent
        return self.matching_blocks
       
   527 
       
   528     def get_opcodes(self):
       
   529         """Return list of 5-tuples describing how to turn a into b.
       
   530 
       
   531         Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple
       
   532         has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
       
   533         tuple preceding it, and likewise for j1 == the previous j2.
       
   534 
       
   535         The tags are strings, with these meanings:
       
   536 
       
   537         'replace':  a[i1:i2] should be replaced by b[j1:j2]
       
   538         'delete':   a[i1:i2] should be deleted.
       
   539                     Note that j1==j2 in this case.
       
   540         'insert':   b[j1:j2] should be inserted at a[i1:i1].
       
   541                     Note that i1==i2 in this case.
       
   542         'equal':    a[i1:i2] == b[j1:j2]
       
   543 
       
   544         >>> a = "qabxcd"
       
   545         >>> b = "abycdf"
       
   546         >>> s = SequenceMatcher(None, a, b)
       
   547         >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
       
   548         ...    print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
       
   549         ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
       
   550          delete a[0:1] (q) b[0:0] ()
       
   551           equal a[1:3] (ab) b[0:2] (ab)
       
   552         replace a[3:4] (x) b[2:3] (y)
       
   553           equal a[4:6] (cd) b[3:5] (cd)
       
   554          insert a[6:6] () b[5:6] (f)
       
   555         """
       
   556 
       
   557         if self.opcodes is not None:
       
   558             return self.opcodes
       
   559         i = j = 0
       
   560         self.opcodes = answer = []
       
   561         for ai, bj, size in self.get_matching_blocks():
       
   562             # invariant:  we've pumped out correct diffs to change
       
   563             # a[:i] into b[:j], and the next matching block is
       
   564             # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
       
   565             # out a diff to change a[i:ai] into b[j:bj], pump out
       
   566             # the matching block, and move (i,j) beyond the match
       
   567             tag = ''
       
   568             if i < ai and j < bj:
       
   569                 tag = 'replace'
       
   570             elif i < ai:
       
   571                 tag = 'delete'
       
   572             elif j < bj:
       
   573                 tag = 'insert'
       
   574             if tag:
       
   575                 answer.append( (tag, i, ai, j, bj) )
       
   576             i, j = ai+size, bj+size
       
   577             # the list of matching blocks is terminated by a
       
   578             # sentinel with size 0
       
   579             if size:
       
   580                 answer.append( ('equal', ai, i, bj, j) )
       
   581         return answer
       
   582 
       
   583     def get_grouped_opcodes(self, n=3):
       
   584         """ Isolate change clusters by eliminating ranges with no changes.
       
   585 
       
   586         Return a generator of groups with upto n lines of context.
       
   587         Each group is in the same format as returned by get_opcodes().
       
   588 
       
   589         >>> from pprint import pprint
       
   590         >>> a = map(str, range(1,40))
       
   591         >>> b = a[:]
       
   592         >>> b[8:8] = ['i']     # Make an insertion
       
   593         >>> b[20] += 'x'       # Make a replacement
       
   594         >>> b[23:28] = []      # Make a deletion
       
   595         >>> b[30] += 'y'       # Make another replacement
       
   596         >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))
       
   597         [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],
       
   598          [('equal', 16, 19, 17, 20),
       
   599           ('replace', 19, 20, 20, 21),
       
   600           ('equal', 20, 22, 21, 23),
       
   601           ('delete', 22, 27, 23, 23),
       
   602           ('equal', 27, 30, 23, 26)],
       
   603          [('equal', 31, 34, 27, 30),
       
   604           ('replace', 34, 35, 30, 31),
       
   605           ('equal', 35, 38, 31, 34)]]
       
   606         """
       
   607 
       
   608         codes = self.get_opcodes()
       
   609         if not codes:
       
   610             codes = [("equal", 0, 1, 0, 1)]
       
   611         # Fixup leading and trailing groups if they show no changes.
       
   612         if codes[0][0] == 'equal':
       
   613             tag, i1, i2, j1, j2 = codes[0]
       
   614             codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
       
   615         if codes[-1][0] == 'equal':
       
   616             tag, i1, i2, j1, j2 = codes[-1]
       
   617             codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)
       
   618 
       
   619         nn = n + n
       
   620         group = []
       
   621         for tag, i1, i2, j1, j2 in codes:
       
   622             # End the current group and start a new one whenever
       
   623             # there is a large range with no changes.
       
   624             if tag == 'equal' and i2-i1 > nn:
       
   625                 group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
       
   626                 yield group
       
   627                 group = []
       
   628                 i1, j1 = max(i1, i2-n), max(j1, j2-n)
       
   629             group.append((tag, i1, i2, j1 ,j2))
       
   630         if group and not (len(group)==1 and group[0][0] == 'equal'):
       
   631             yield group
       
   632 
       
   633     def ratio(self):
       
   634         """Return a measure of the sequences' similarity (float in [0,1]).
       
   635 
       
   636         Where T is the total number of elements in both sequences, and
       
   637         M is the number of matches, this is 2.0*M / T.
       
   638         Note that this is 1 if the sequences are identical, and 0 if
       
   639         they have nothing in common.
       
   640 
       
   641         .ratio() is expensive to compute if you haven't already computed
       
   642         .get_matching_blocks() or .get_opcodes(), in which case you may
       
   643         want to try .quick_ratio() or .real_quick_ratio() first to get an
       
   644         upper bound.
       
   645 
       
   646         >>> s = SequenceMatcher(None, "abcd", "bcde")
       
   647         >>> s.ratio()
       
   648         0.75
       
   649         >>> s.quick_ratio()
       
   650         0.75
       
   651         >>> s.real_quick_ratio()
       
   652         1.0
       
   653         """
       
   654 
       
   655         matches = reduce(lambda sum, triple: sum + triple[-1],
       
   656                          self.get_matching_blocks(), 0)
       
   657         return _calculate_ratio(matches, len(self.a) + len(self.b))
       
   658 
       
   659     def quick_ratio(self):
       
   660         """Return an upper bound on ratio() relatively quickly.
       
   661 
       
   662         This isn't defined beyond that it is an upper bound on .ratio(), and
       
   663         is faster to compute.
       
   664         """
       
   665 
       
   666         # viewing a and b as multisets, set matches to the cardinality
       
   667         # of their intersection; this counts the number of matches
       
   668         # without regard to order, so is clearly an upper bound
       
   669         if self.fullbcount is None:
       
   670             self.fullbcount = fullbcount = {}
       
   671             for elt in self.b:
       
   672                 fullbcount[elt] = fullbcount.get(elt, 0) + 1
       
   673         fullbcount = self.fullbcount
       
   674         # avail[x] is the number of times x appears in 'b' less the
       
   675         # number of times we've seen it in 'a' so far ... kinda
       
   676         avail = {}
       
   677         availhas, matches = avail.has_key, 0
       
   678         for elt in self.a:
       
   679             if availhas(elt):
       
   680                 numb = avail[elt]
       
   681             else:
       
   682                 numb = fullbcount.get(elt, 0)
       
   683             avail[elt] = numb - 1
       
   684             if numb > 0:
       
   685                 matches = matches + 1
       
   686         return _calculate_ratio(matches, len(self.a) + len(self.b))
       
   687 
       
   688     def real_quick_ratio(self):
       
   689         """Return an upper bound on ratio() very quickly.
       
   690 
       
   691         This isn't defined beyond that it is an upper bound on .ratio(), and
       
   692         is faster to compute than either .ratio() or .quick_ratio().
       
   693         """
       
   694 
       
   695         la, lb = len(self.a), len(self.b)
       
   696         # can't have more matches than the number of elements in the
       
   697         # shorter sequence
       
   698         return _calculate_ratio(min(la, lb), la + lb)
       
   699 
       
   700 def get_close_matches(word, possibilities, n=3, cutoff=0.6):
       
   701     """Use SequenceMatcher to return list of the best "good enough" matches.
       
   702 
       
   703     word is a sequence for which close matches are desired (typically a
       
   704     string).
       
   705 
       
   706     possibilities is a list of sequences against which to match word
       
   707     (typically a list of strings).
       
   708 
       
   709     Optional arg n (default 3) is the maximum number of close matches to
       
   710     return.  n must be > 0.
       
   711 
       
   712     Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
       
   713     that don't score at least that similar to word are ignored.
       
   714 
       
   715     The best (no more than n) matches among the possibilities are returned
       
   716     in a list, sorted by similarity score, most similar first.
       
   717 
       
   718     >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
       
   719     ['apple', 'ape']
       
   720     >>> import keyword as _keyword
       
   721     >>> get_close_matches("wheel", _keyword.kwlist)
       
   722     ['while']
       
   723     >>> get_close_matches("apple", _keyword.kwlist)
       
   724     []
       
   725     >>> get_close_matches("accept", _keyword.kwlist)
       
   726     ['except']
       
   727     """
       
   728 
       
   729     if not n >  0:
       
   730         raise ValueError("n must be > 0: %r" % (n,))
       
   731     if not 0.0 <= cutoff <= 1.0:
       
   732         raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
       
   733     result = []
       
   734     s = SequenceMatcher()
       
   735     s.set_seq2(word)
       
   736     for x in possibilities:
       
   737         s.set_seq1(x)
       
   738         if s.real_quick_ratio() >= cutoff and \
       
   739            s.quick_ratio() >= cutoff and \
       
   740            s.ratio() >= cutoff:
       
   741             result.append((s.ratio(), x))
       
   742 
       
   743     # Move the best scorers to head of list
       
   744     result = heapq.nlargest(n, result)
       
   745     # Strip scores for the best n matches
       
   746     return [x for score, x in result]
       
   747 
       
   748 def _count_leading(line, ch):
       
   749     """
       
   750     Return number of `ch` characters at the start of `line`.
       
   751 
       
   752     Example:
       
   753 
       
   754     >>> _count_leading('   abc', ' ')
       
   755     3
       
   756     """
       
   757 
       
   758     i, n = 0, len(line)
       
   759     while i < n and line[i] == ch:
       
   760         i += 1
       
   761     return i
       
   762 
       
class Differ:
    r"""
    Differ is a class for comparing sequences of lines of text, and
    producing human-readable differences or deltas.  Differ uses
    SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    Each line of a Differ delta begins with a two-letter code:

        '- '    line unique to sequence 1
        '+ '    line unique to sequence 2
        '  '    line common to both sequences
        '? '    line not present in either input sequence

    Lines beginning with '? ' attempt to guide the eye to intraline
    differences, and were not present in either input sequence.  These lines
    can be confusing if the sequences contain tab characters.

    Note that Differ makes no claim to produce a *minimal* diff.  To the
    contrary, minimal diffs are often counter-intuitive, because they synch
    up anywhere possible, sometimes accidental matches 100 pages apart.
    Restricting synch points to contiguous matches preserves some notion of
    locality, at the occasional cost of producing a longer diff.

    Example: Comparing two texts.

    First we set up the texts, sequences of individual single-line strings
    ending with newlines (such sequences can also be obtained from the
    `readlines()` method of file-like objects):

    >>> text1 = '''  1. Beautiful is better than ugly.
    ...   2. Explicit is better than implicit.
    ...   3. Simple is better than complex.
    ...   4. Complex is better than complicated.
    ... '''.splitlines(1)
    >>> len(text1)
    4
    >>> text1[0][-1]
    '\n'
    >>> text2 = '''  1. Beautiful is better than ugly.
    ...   3.   Simple is better than complex.
    ...   4. Complicated is better than complex.
    ...   5. Flat is better than nested.
    ... '''.splitlines(1)

    Next we instantiate a Differ object:

    >>> d = Differ()

    Note that when instantiating a Differ object we may pass functions to
    filter out line and character 'junk'.  See Differ.__init__ for details.

    Finally, we compare the two:

    >>> result = list(d.compare(text1, text2))

    'result' is a list of strings, so let's pretty-print it:

    >>> from pprint import pprint as _pprint
    >>> _pprint(result)
    ['    1. Beautiful is better than ugly.\n',
     '-   2. Explicit is better than implicit.\n',
     '-   3. Simple is better than complex.\n',
     '+   3.   Simple is better than complex.\n',
     '?     ++\n',
     '-   4. Complex is better than complicated.\n',
     '?            ^                     ---- ^\n',
     '+   4. Complicated is better than complex.\n',
     '?           ++++ ^                      ^\n',
     '+   5. Flat is better than nested.\n']

    As a single multi-line string it looks like this:

    >>> print ''.join(result),
        1. Beautiful is better than ugly.
    -   2. Explicit is better than implicit.
    -   3. Simple is better than complex.
    +   3.   Simple is better than complex.
    ?     ++
    -   4. Complex is better than complicated.
    ?            ^                     ---- ^
    +   4. Complicated is better than complex.
    ?           ++++ ^                      ^
    +   5. Flat is better than nested.

    Methods:

    __init__(linejunk=None, charjunk=None)
        Construct a text differencer, with optional filters.

    compare(a, b)
        Compare two sequences of lines; generate the resulting delta.
    """

    def __init__(self, linejunk=None, charjunk=None):
        """
        Construct a text differencer, with optional filters.

        The two optional keyword parameters are for filter functions:

        - `linejunk`: A function that should accept a single string argument,
          and return true iff the string is junk. The module-level function
          `IS_LINE_JUNK` may be used to filter out lines without visible
          characters, except for at most one splat ('#').  It is recommended
          to leave linejunk None; as of Python 2.3, the underlying
          SequenceMatcher class has grown an adaptive notion of "noise" lines
          that's better than any static definition the author has ever been
          able to craft.

        - `charjunk`: A function that should accept a string of length 1. The
          module-level function `IS_CHARACTER_JUNK` may be used to filter out
          whitespace characters (a blank or tab; **note**: bad idea to include
          newline in this!).  Use of IS_CHARACTER_JUNK is recommended.
        """

        # Store the optional junk filters; either may be None, meaning
        # "no filtering" for that level of comparison.
        self.linejunk = linejunk
        self.charjunk = charjunk

    def compare(self, a, b):
        r"""
        Compare two sequences of lines; generate the resulting delta.

        Each sequence must contain individual single-line strings ending with
        newlines. Such sequences can be obtained from the `readlines()` method
        of file-like objects.  The delta generated also consists of newline-
        terminated strings, ready to be printed as-is via the writeline()
        method of a file-like object.

        Example:

        >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1),
        ...                                'ore\ntree\nemu\n'.splitlines(1))),
        - one
        ?  ^
        + ore
        ?  ^
        - two
        - three
        ?  -
        + tree
        + emu
        """

        # Line-level comparison first; each opcode is then expanded into
        # delta lines by the helper chosen for its tag.
        cruncher = SequenceMatcher(self.linejunk, a, b)
        for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
            if tag == 'replace':
                # Only 'replace' gets the expensive intraline treatment.
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            elif tag == 'delete':
                g = self._dump('-', a, alo, ahi)
            elif tag == 'insert':
                g = self._dump('+', b, blo, bhi)
            elif tag == 'equal':
                g = self._dump(' ', a, alo, ahi)
            else:
                raise ValueError, 'unknown tag %r' % (tag,)

            for line in g:
                yield line

    def _dump(self, tag, x, lo, hi):
        """Generate comparison results for a same-tagged range."""
        for i in xrange(lo, hi):
            yield '%s %s' % (tag, x[i])

    def _plain_replace(self, a, alo, ahi, b, blo, bhi):
        """Yield the replaced blocks wholesale -- '-' lines for a[alo:ahi]
        and '+' lines for b[blo:bhi] -- with no intraline markup; the
        shorter of the two blocks is emitted first."""
        assert alo < ahi and blo < bhi
        # dump the shorter block first -- reduces the burden on short-term
        # memory if the blocks are of very different sizes
        if bhi - blo < ahi - alo:
            first  = self._dump('+', b, blo, bhi)
            second = self._dump('-', a, alo, ahi)
        else:
            first  = self._dump('-', a, alo, ahi)
            second = self._dump('+', b, blo, bhi)

        for g in first, second:
            for line in g:
                yield line

    def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
        r"""
        When replacing one block of lines with another, search the blocks
        for *similar* lines; the best-matching pair (if any) is used as a
        synch point, and intraline difference marking is done on the
        similar pair. Lots of work, but often worth it.

        Example:

        >>> d = Differ()
        >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
        ...                            ['abcdefGhijkl\n'], 0, 1)
        >>> print ''.join(results),
        - abcDefghiJkl
        ?    ^  ^  ^
        + abcdefGhijkl
        ?    ^  ^  ^
        """

        # don't synch up unless the lines have a similarity score of at
        # least cutoff; best_ratio tracks the best score seen so far
        best_ratio, cutoff = 0.74, 0.75
        cruncher = SequenceMatcher(self.charjunk)
        eqi, eqj = None, None   # 1st indices of equal lines (if any)

        # search for the pair that matches best without being identical
        # (identical lines must be junk lines, & we don't want to synch up
        # on junk -- unless we have to)
        for j in xrange(blo, bhi):
            bj = b[j]
            cruncher.set_seq2(bj)
            for i in xrange(alo, ahi):
                ai = a[i]
                if ai == bj:
                    if eqi is None:
                        eqi, eqj = i, j
                    continue
                cruncher.set_seq1(ai)
                # computing similarity is expensive, so use the quick
                # upper bounds first -- have seen this speed up messy
                # compares by a factor of 3.
                # note that ratio() is only expensive to compute the first
                # time it's called on a sequence pair; the expensive part
                # of the computation is cached by cruncher
                if cruncher.real_quick_ratio() > best_ratio and \
                      cruncher.quick_ratio() > best_ratio and \
                      cruncher.ratio() > best_ratio:
                    best_ratio, best_i, best_j = cruncher.ratio(), i, j
        # NOTE: best_i/best_j are bound only when some pair beat the
        # initial best_ratio (0.74); because 0.74 < cutoff, the test below
        # either rebinds them from (eqi, eqj) or returns before they are
        # ever read, so they are always bound before use.
        if best_ratio < cutoff:
            # no non-identical "pretty close" pair
            if eqi is None:
                # no identical pair either -- treat it as a straight replace
                for line in self._plain_replace(a, alo, ahi, b, blo, bhi):
                    yield line
                return
            # no close pair, but an identical pair -- synch up on that
            best_i, best_j, best_ratio = eqi, eqj, 1.0
        else:
            # there's a close pair, so forget the identical pair (if any)
            eqi = None

        # a[best_i] very similar to b[best_j]; eqi is None iff they're not
        # identical

        # pump out diffs from before the synch point
        for line in self._fancy_helper(a, alo, best_i, b, blo, best_j):
            yield line

        # do intraline marking on the synch pair
        aelt, belt = a[best_i], b[best_j]
        if eqi is None:
            # pump out a '-', '?', '+', '?' quad for the synched lines
            atags = btags = ""
            cruncher.set_seqs(aelt, belt)
            for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
                la, lb = ai2 - ai1, bj2 - bj1
                if tag == 'replace':
                    atags += '^' * la
                    btags += '^' * lb
                elif tag == 'delete':
                    atags += '-' * la
                elif tag == 'insert':
                    btags += '+' * lb
                elif tag == 'equal':
                    atags += ' ' * la
                    btags += ' ' * lb
                else:
                    raise ValueError, 'unknown tag %r' % (tag,)
            for line in self._qformat(aelt, belt, atags, btags):
                yield line
        else:
            # the synch pair is identical
            yield '  ' + aelt

        # pump out diffs from after the synch point
        for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi):
            yield line

    def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
        """Handle the leftover ranges around a synch point: recurse via
        _fancy_replace when both sides are non-empty, otherwise dump the
        one-sided range with '-' or '+' markers."""
        g = []
        if alo < ahi:
            if blo < bhi:
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            else:
                g = self._dump('-', a, alo, ahi)
        elif blo < bhi:
            g = self._dump('+', b, blo, bhi)

        for line in g:
            yield line

    def _qformat(self, aline, bline, atags, btags):
        r"""
        Format "?" output and deal with leading tabs.

        Example:

        >>> d = Differ()
        >>> results = d._qformat('\tabcDefghiJkl\n', '\t\tabcdefGhijkl\n',
        ...                      '  ^ ^  ^      ', '+  ^ ^  ^      ')
        >>> for line in results: print repr(line)
        ...
        '- \tabcDefghiJkl\n'
        '? \t ^ ^  ^\n'
        '+ \t\tabcdefGhijkl\n'
        '? \t  ^ ^  ^\n'
        """

        # Emit the tabs shared by both lines at the start of each '?' guide
        # line too, so the markers stay aligned under tab-indented text.
        # Can hurt, but will probably help most of the time.
        common = min(_count_leading(aline, "\t"),
                     _count_leading(bline, "\t"))
        common = min(common, _count_leading(atags[:common], " "))
        atags = atags[common:].rstrip()
        btags = btags[common:].rstrip()

        yield "- " + aline
        if atags:
            yield "? %s%s\n" % ("\t" * common, atags)

        yield "+ " + bline
        if btags:
            yield "? %s%s\n" % ("\t" * common, btags)
       
  1084 
       
  1085 # With respect to junk, an earlier version of ndiff simply refused to
       
  1086 # *start* a match with a junk element.  The result was cases like this:
       
  1087 #     before: private Thread currentThread;
       
  1088 #     after:  private volatile Thread currentThread;
       
  1089 # If you consider whitespace to be junk, the longest contiguous match
       
  1090 # not starting with junk is "e Thread currentThread".  So ndiff reported
       
  1091 # that "e volatil" was inserted between the 't' and the 'e' in "private".
       
  1092 # While an accurate view, to people that's absurd.  The current version
       
  1093 # looks for matching blocks that are entirely junk-free, then extends the
       
  1094 # longest one of those as far as possible but only with matching junk.
       
  1095 # So now "currentThread" is matched, then extended to suck up the
       
  1096 # preceding blank; then "private" is matched, and extended to suck up the
       
  1097 # following blank; then "Thread" is matched; and finally ndiff reports
       
  1098 # that "volatile " was inserted before "Thread".  The only quibble
       
  1099 # remaining is that perhaps it was really the case that " volatile"
       
  1100 # was inserted after "private".  I can live with that <wink>.
       
  1101 
       
  1102 import re
       
  1103 
       
  1104 def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
       
  1105     r"""
       
  1106     Return 1 for ignorable line: iff `line` is blank or contains a single '#'.
       
  1107 
       
  1108     Examples:
       
  1109 
       
  1110     >>> IS_LINE_JUNK('\n')
       
  1111     True
       
  1112     >>> IS_LINE_JUNK('  #   \n')
       
  1113     True
       
  1114     >>> IS_LINE_JUNK('hello\n')
       
  1115     False
       
  1116     """
       
  1117 
       
  1118     return pat(line) is not None
       
  1119 
       
  1120 def IS_CHARACTER_JUNK(ch, ws=" \t"):
       
  1121     r"""
       
  1122     Return 1 for ignorable character: iff `ch` is a space or tab.
       
  1123 
       
  1124     Examples:
       
  1125 
       
  1126     >>> IS_CHARACTER_JUNK(' ')
       
  1127     True
       
  1128     >>> IS_CHARACTER_JUNK('\t')
       
  1129     True
       
  1130     >>> IS_CHARACTER_JUNK('\n')
       
  1131     False
       
  1132     >>> IS_CHARACTER_JUNK('x')
       
  1133     False
       
  1134     """
       
  1135 
       
  1136     return ch in ws
       
  1137 
       
  1138 
       
  1139 def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
       
  1140                  tofiledate='', n=3, lineterm='\n'):
       
  1141     r"""
       
  1142     Compare two sequences of lines; generate the delta as a unified diff.
       
  1143 
       
  1144     Unified diffs are a compact way of showing line changes and a few
       
  1145     lines of context.  The number of context lines is set by 'n' which
       
  1146     defaults to three.
       
  1147 
       
  1148     By default, the diff control lines (those with ---, +++, or @@) are
       
  1149     created with a trailing newline.  This is helpful so that inputs
       
  1150     created from file.readlines() result in diffs that are suitable for
       
  1151     file.writelines() since both the inputs and outputs have trailing
       
  1152     newlines.
       
  1153 
       
  1154     For inputs that do not have trailing newlines, set the lineterm
       
  1155     argument to "" so that the output will be uniformly newline free.
       
  1156 
       
  1157     The unidiff format normally has a header for filenames and modification
       
  1158     times.  Any or all of these may be specified using strings for
       
  1159     'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.  The modification
       
  1160     times are normally expressed in the format returned by time.ctime().
       
  1161 
       
  1162     Example:
       
  1163 
       
  1164     >>> for line in unified_diff('one two three four'.split(),
       
  1165     ...             'zero one tree four'.split(), 'Original', 'Current',
       
  1166     ...             'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:20:52 2003',
       
  1167     ...             lineterm=''):
       
  1168     ...     print line
       
  1169     --- Original Sat Jan 26 23:30:50 1991
       
  1170     +++ Current Fri Jun 06 10:20:52 2003
       
  1171     @@ -1,4 +1,4 @@
       
  1172     +zero
       
  1173      one
       
  1174     -two
       
  1175     -three
       
  1176     +tree
       
  1177      four
       
  1178     """
       
  1179 
       
  1180     started = False
       
  1181     for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
       
  1182         if not started:
       
  1183             yield '--- %s %s%s' % (fromfile, fromfiledate, lineterm)
       
  1184             yield '+++ %s %s%s' % (tofile, tofiledate, lineterm)
       
  1185             started = True
       
  1186         i1, i2, j1, j2 = group[0][1], group[-1][2], group[0][3], group[-1][4]
       
  1187         yield "@@ -%d,%d +%d,%d @@%s" % (i1+1, i2-i1, j1+1, j2-j1, lineterm)
       
  1188         for tag, i1, i2, j1, j2 in group:
       
  1189             if tag == 'equal':
       
  1190                 for line in a[i1:i2]:
       
  1191                     yield ' ' + line
       
  1192                 continue
       
  1193             if tag == 'replace' or tag == 'delete':
       
  1194                 for line in a[i1:i2]:
       
  1195                     yield '-' + line
       
  1196             if tag == 'replace' or tag == 'insert':
       
  1197                 for line in b[j1:j2]:
       
  1198                     yield '+' + line
       
  1199 
       
  1200 # See http://www.unix.org/single_unix_specification/
       
  1201 def context_diff(a, b, fromfile='', tofile='',
       
  1202                  fromfiledate='', tofiledate='', n=3, lineterm='\n'):
       
  1203     r"""
       
  1204     Compare two sequences of lines; generate the delta as a context diff.
       
  1205 
       
  1206     Context diffs are a compact way of showing line changes and a few
       
  1207     lines of context.  The number of context lines is set by 'n' which
       
  1208     defaults to three.
       
  1209 
       
  1210     By default, the diff control lines (those with *** or ---) are
       
  1211     created with a trailing newline.  This is helpful so that inputs
       
  1212     created from file.readlines() result in diffs that are suitable for
       
  1213     file.writelines() since both the inputs and outputs have trailing
       
  1214     newlines.
       
  1215 
       
  1216     For inputs that do not have trailing newlines, set the lineterm
       
  1217     argument to "" so that the output will be uniformly newline free.
       
  1218 
       
  1219     The context diff format normally has a header for filenames and
       
  1220     modification times.  Any or all of these may be specified using
       
  1221     strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
       
  1222     The modification times are normally expressed in the format returned
       
  1223     by time.ctime().  If not specified, the strings default to blanks.
       
  1224 
       
  1225     Example:
       
  1226 
       
  1227     >>> print ''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(1),
       
  1228     ...       'zero\none\ntree\nfour\n'.splitlines(1), 'Original', 'Current',
       
  1229     ...       'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:22:46 2003')),
       
  1230     *** Original Sat Jan 26 23:30:50 1991
       
  1231     --- Current Fri Jun 06 10:22:46 2003
       
  1232     ***************
       
  1233     *** 1,4 ****
       
  1234       one
       
  1235     ! two
       
  1236     ! three
       
  1237       four
       
  1238     --- 1,4 ----
       
  1239     + zero
       
  1240       one
       
  1241     ! tree
       
  1242       four
       
  1243     """
       
  1244 
       
  1245     started = False
       
  1246     prefixmap = {'insert':'+ ', 'delete':'- ', 'replace':'! ', 'equal':'  '}
       
  1247     for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
       
  1248         if not started:
       
  1249             yield '*** %s %s%s' % (fromfile, fromfiledate, lineterm)
       
  1250             yield '--- %s %s%s' % (tofile, tofiledate, lineterm)
       
  1251             started = True
       
  1252 
       
  1253         yield '***************%s' % (lineterm,)
       
  1254         if group[-1][2] - group[0][1] >= 2:
       
  1255             yield '*** %d,%d ****%s' % (group[0][1]+1, group[-1][2], lineterm)
       
  1256         else:
       
  1257             yield '*** %d ****%s' % (group[-1][2], lineterm)
       
  1258         visiblechanges = [e for e in group if e[0] in ('replace', 'delete')]
       
  1259         if visiblechanges:
       
  1260             for tag, i1, i2, _, _ in group:
       
  1261                 if tag != 'insert':
       
  1262                     for line in a[i1:i2]:
       
  1263                         yield prefixmap[tag] + line
       
  1264 
       
  1265         if group[-1][4] - group[0][3] >= 2:
       
  1266             yield '--- %d,%d ----%s' % (group[0][3]+1, group[-1][4], lineterm)
       
  1267         else:
       
  1268             yield '--- %d ----%s' % (group[-1][4], lineterm)
       
  1269         visiblechanges = [e for e in group if e[0] in ('replace', 'insert')]
       
  1270         if visiblechanges:
       
  1271             for tag, _, _, j1, j2 in group:
       
  1272                 if tag != 'delete':
       
  1273                     for line in b[j1:j2]:
       
  1274                         yield prefixmap[tag] + line
       
  1275 
       
  1276 def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
       
  1277     r"""
       
  1278     Compare `a` and `b` (lists of strings); return a `Differ`-style delta.
       
  1279 
       
  1280     Optional keyword parameters `linejunk` and `charjunk` are for filter
       
  1281     functions (or None):
       
  1282 
       
  1283     - linejunk: A function that should accept a single string argument, and
       
  1284       return true iff the string is junk.  The default is None, and is
       
  1285       recommended; as of Python 2.3, an adaptive notion of "noise" lines is
       
  1286       used that does a good job on its own.
       
  1287 
       
  1288     - charjunk: A function that should accept a string of length 1. The
       
  1289       default is module-level function IS_CHARACTER_JUNK, which filters out
       
  1290       whitespace characters (a blank or tab; note: bad idea to include newline
       
  1291       in this!).
       
  1292 
       
  1293     Tools/scripts/ndiff.py is a command-line front-end to this function.
       
  1294 
       
  1295     Example:
       
  1296 
       
  1297     >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
       
  1298     ...              'ore\ntree\nemu\n'.splitlines(1))
       
  1299     >>> print ''.join(diff),
       
  1300     - one
       
  1301     ?  ^
       
  1302     + ore
       
  1303     ?  ^
       
  1304     - two
       
  1305     - three
       
  1306     ?  -
       
  1307     + tree
       
  1308     + emu
       
  1309     """
       
  1310     return Differ(linejunk, charjunk).compare(a, b)
       
  1311 
       
def _mdiff(fromlines, tolines, context=None, linejunk=None,
           charjunk=IS_CHARACTER_JUNK):
    r"""Returns generator yielding marked up from/to side by side differences.

    Arguments:
    fromlines -- list of text lines to be compared to tolines
    tolines -- list of text lines to be compared to fromlines
    context -- number of context lines to display on each side of difference,
               if None, all from/to text lines will be generated.
    linejunk -- passed on to ndiff (see ndiff documentation)
    charjunk -- passed on to ndiff (see ndiff documentation)

    This function returns an iterator which returns a tuple:
    (from line tuple, to line tuple, boolean flag)

    from/to line tuple -- (line num, line text)
        line num -- integer or None (to indicate a context separation)
        line text -- original line text with following markers inserted:
            '\0+' -- marks start of added text
            '\0-' -- marks start of deleted text
            '\0^' -- marks start of changed text
            '\1' -- marks end of added/deleted/changed text

    boolean flag -- None indicates context separation, True indicates
        either "from" or "to" line contains a change, otherwise False.

    This function/iterator was originally developed to generate side by side
    file difference for making HTML pages (see HtmlDiff class for example
    usage).

    Note, this function utilizes the ndiff function to generate the side by
    side difference markup.  Optional ndiff arguments may be passed to this
    function and they in turn will be passed to ndiff.
    """
    import re

    # regular expression for finding intraline change indices
    # (runs of '+', '-' or '^' in an ndiff '? ' guide line)
    change_re = re.compile('(\++|\-+|\^+)')

    # create the difference iterator to generate the differences
    diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)

    def _make_line(lines, format_key, side, num_lines=[0,0]):
        """Returns line of text with user's change markup and line formatting.

        lines -- list of lines from the ndiff generator to produce a line of
                 text from.  When producing the line of text to return, the
                 lines used are removed from this list.
        format_key -- '+' return first line in list with "add" markup around
                          the entire line.
                      '-' return first line in list with "delete" markup around
                          the entire line.
                      '?' return first line in list with add/delete/change
                          intraline markup (indices obtained from second line)
                      None return first line in list with no markup
        side -- index into the num_lines list (0=from,1=to)
        num_lines -- from/to current line number.  This is NOT intended to be a
                     passed parameter.  It is present as a keyword argument to
                     maintain memory of the current line numbers between calls
                     of this function.  (The mutable default is deliberate.)

        Note, this function is purposefully not defined at the module scope so
        that data it needs from its parent function (within whose context it
        is defined) does not need to be of module scope.
        """
        num_lines[side] += 1
        # Handle case where no user markup is to be added, just return line of
        # text with user's line format to allow for usage of the line number.
        # (The [2:] slice strips the ndiff two-character prefix, e.g. '  '.)
        if format_key is None:
            return (num_lines[side],lines.pop(0)[2:])
        # Handle case of intraline changes
        if format_key == '?':
            text, markers = lines.pop(0), lines.pop(0)
            # find intraline changes (store change type and indices in tuples)
            sub_info = []
            def record_sub_info(match_object,sub_info=sub_info):
                sub_info.append([match_object.group(1)[0],match_object.span()])
                return match_object.group(1)
            change_re.sub(record_sub_info,markers)
            # process each tuple inserting our special marks that won't be
            # noticed by an xml/html escaper.  Iterate in reverse so earlier
            # indices stay valid as the string grows.
            for key,(begin,end) in sub_info[::-1]:
                text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]
            text = text[2:]
        # Handle case of add/delete entire line
        else:
            text = lines.pop(0)[2:]
            # if line of text is just a newline, insert a space so there is
            # something for the user to highlight and see.
            if not text:
                text = ' '
            # insert marks that won't be noticed by an xml/html escaper.
            text = '\0' + format_key + text + '\1'
        # Return line of text, first allow user's line formatter to do its
        # thing (such as adding the line number) then replace the special
        # marks with what the user's change markup.
        return (num_lines[side],text)

    def _line_iterator():
        """Yields from/to lines of text with a change indication.

        This function is an iterator.  It itself pulls lines from a
        differencing iterator, processes them and yields them.  When it can
        it yields both a "from" and a "to" line, otherwise it will yield one
        or the other.  In addition to yielding the lines of from/to text, a
        boolean flag is yielded to indicate if the text line(s) have
        differences in them.

        Note, this function is purposefully not defined at the module scope so
        that data it needs from its parent function (within whose context it
        is defined) does not need to be of module scope.
        """
        lines = []
        num_blanks_pending, num_blanks_to_yield = 0, 0
        while True:
            # Load up next 4 lines so we can look ahead, create strings which
            # are a concatenation of the first character of each of the 4 lines
            # so we can do some very readable comparisons.
            # ('X' is a sentinel marking exhaustion of the ndiff stream.)
            while len(lines) < 4:
                try:
                    lines.append(diff_lines_iterator.next())
                except StopIteration:
                    lines.append('X')
            s = ''.join([line[0] for line in lines])
            if s.startswith('X'):
                # When no more lines, pump out any remaining blank lines so the
                # corresponding add/delete lines get a matching blank line so
                # all line pairs get yielded at the next level.
                num_blanks_to_yield = num_blanks_pending
            elif s.startswith('-?+?'):
                # simple intraline change
                yield _make_line(lines,'?',0), _make_line(lines,'?',1), True
                continue
            elif s.startswith('--++'):
                # in delete block, add block coming: we do NOT want to get
                # caught up on blank lines yet, just process the delete line
                num_blanks_pending -= 1
                yield _make_line(lines,'-',0), None, True
                continue
            elif s.startswith(('--?+', '--+', '- ')):
                # in delete block and see a intraline change or unchanged line
                # coming: yield the delete line and then blanks
                from_line,to_line = _make_line(lines,'-',0), None
                num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0
            elif s.startswith('-+?'):
                # intraline change
                yield _make_line(lines,None,0), _make_line(lines,'?',1), True
                continue
            elif s.startswith('-?+'):
                # intraline change
                yield _make_line(lines,'?',0), _make_line(lines,None,1), True
                continue
            elif s.startswith('-'):
                # delete FROM line
                num_blanks_pending -= 1
                yield _make_line(lines,'-',0), None, True
                continue
            elif s.startswith('+--'):
                # in add block, delete block coming: we do NOT want to get
                # caught up on blank lines yet, just process the add line
                num_blanks_pending += 1
                yield None, _make_line(lines,'+',1), True
                continue
            elif s.startswith(('+ ', '+-')):
                # will be leaving an add block: yield blanks then add line
                from_line, to_line = None, _make_line(lines,'+',1)
                num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0
            elif s.startswith('+'):
                # inside an add block, yield the add line
                num_blanks_pending += 1
                yield None, _make_line(lines,'+',1), True
                continue
            elif s.startswith(' '):
                # unchanged text, yield it to both sides
                # (lines[:] copy lets both _make_line calls pop the same line)
                yield _make_line(lines[:],None,0),_make_line(lines,None,1),False
                continue
            # Catch up on the blank lines so when we yield the next from/to
            # pair, they are lined up.
            while(num_blanks_to_yield < 0):
                num_blanks_to_yield += 1
                yield None,('','\n'),True
            while(num_blanks_to_yield > 0):
                num_blanks_to_yield -= 1
                yield ('','\n'),None,True
            if s.startswith('X'):
                # Python 2 idiom: terminate the generator explicitly.
                raise StopIteration
            else:
                yield from_line,to_line,True

    def _line_pair_iterator():
        """Yields from/to lines of text with a change indication.

        This function is an iterator.  It itself pulls lines from the line
        iterator.  Its difference from that iterator is that this function
        always yields a pair of from/to text lines (with the change
        indication).  If necessary it will collect single from/to lines
        until it has a matching pair from/to pair to yield.

        Note, this function is purposefully not defined at the module scope so
        that data it needs from its parent function (within whose context it
        is defined) does not need to be of module scope.
        """
        line_iterator = _line_iterator()
        fromlines,tolines=[],[]
        while True:
            # Collecting lines of text until we have a from/to pair
            while (len(fromlines)==0 or len(tolines)==0):
                from_line, to_line, found_diff =line_iterator.next()
                if from_line is not None:
                    fromlines.append((from_line,found_diff))
                if to_line is not None:
                    tolines.append((to_line,found_diff))
            # Once we have a pair, remove them from the collection and yield it
            from_line, fromDiff = fromlines.pop(0)
            to_line, to_diff = tolines.pop(0)
            yield (from_line,to_line,fromDiff or to_diff)

    # Handle case where user does not want context differencing, just yield
    # them up without doing anything else with them.
    line_pair_iterator = _line_pair_iterator()
    if context is None:
        while True:
            yield line_pair_iterator.next()
    # Handle case where user wants context differencing.  We must do some
    # storage of lines until we know for sure that they are to be yielded.
    else:
        context += 1
        lines_to_write = 0
        while True:
            # Store lines up until we find a difference, note use of a
            # circular queue because we only need to keep around what
            # we need for context.
            index, contextLines = 0, [None]*(context)
            found_diff = False
            while(found_diff is False):
                from_line, to_line, found_diff = line_pair_iterator.next()
                i = index % context
                contextLines[i] = (from_line, to_line, found_diff)
                index += 1
            # Yield lines that we have collected so far, but first yield
            # the user's separator (None,None,None marks a context break).
            if index > context:
                yield None, None, None
                lines_to_write = context
            else:
                lines_to_write = index
                index = 0
            while(lines_to_write):
                i = index % context
                index += 1
                yield contextLines[i]
                lines_to_write -= 1
            # Now yield the context lines after the change
            lines_to_write = context-1
            while(lines_to_write):
                from_line, to_line, found_diff = line_pair_iterator.next()
                # If another change within the context, extend the context
                if found_diff:
                    lines_to_write = context-1
                else:
                    lines_to_write -= 1
                yield from_line, to_line, found_diff
       
  1574 
       
  1575 
       
# Skeleton of the complete XHTML 1.0 Transitional document produced by
# HtmlDiff.make_file().  Interpolated keys: 'styles', 'table', 'legend'.
_file_template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html>

<head>
    <meta http-equiv="Content-Type"
          content="text/html; charset=ISO-8859-1" />
    <title></title>
    <style type="text/css">%(styles)s
    </style>
</head>

<body>
    %(table)s%(legend)s
</body>

</html>"""

# Default CSS embedded via the 'styles' key above; the class names match
# the markup the diff table uses (header cells, next-change links, and
# the add/change/delete highlight colors).
_styles = """
        table.diff {font-family:Courier; border:medium;}
        .diff_header {background-color:#e0e0e0}
        td.diff_header {text-align:right}
        .diff_next {background-color:#c0c0c0}
        .diff_add {background-color:#aaffaa}
        .diff_chg {background-color:#ffff77}
        .diff_sub {background-color:#ffaaaa}"""

# Skeleton of the side-by-side diff table itself.  Interpolated keys:
# 'prefix' (per-table anchor-name prefix), 'header_row', 'data_rows'.
# Note 'data_rows' is substituted at column 0 so the rows supply their
# own indentation.
_table_template = """
    <table class="diff" id="difflib_chg_%(prefix)s_top"
           cellspacing="0" cellpadding="0" rules="groups" >
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        %(header_row)s
        <tbody>
%(data_rows)s        </tbody>
    </table>"""

# Static legend table appended to the generated file: explains the three
# highlight colors and the (f)/(n)/(t) navigation links.
_legend = """
    <table class="diff" summary="Legends">
        <tr> <th colspan="2"> Legends </th> </tr>
        <tr> <td> <table border="" summary="Colors">
                      <tr><th> Colors </th> </tr>
                      <tr><td class="diff_add">&nbsp;Added&nbsp;</td></tr>
                      <tr><td class="diff_chg">Changed</td> </tr>
                      <tr><td class="diff_sub">Deleted</td> </tr>
                  </table></td>
             <td> <table border="" summary="Links">
                      <tr><th colspan="2"> Links </th> </tr>
                      <tr><td>(f)irst change</td> </tr>
                      <tr><td>(n)ext change</td> </tr>
                      <tr><td>(t)op</td> </tr>
                  </table></td> </tr>
    </table>"""
       
  1631 
       
  1632 class HtmlDiff(object):
       
  1633     """For producing HTML side by side comparison with change highlights.
       
  1634 
       
  1635     This class can be used to create an HTML table (or a complete HTML file
       
  1636     containing the table) showing a side by side, line by line comparison
       
  1637     of text with inter-line and intra-line change highlights.  The table can
       
  1638     be generated in either full or contextual difference mode.
       
  1639 
       
  1640     The following methods are provided for HTML generation:
       
  1641 
       
  1642     make_table -- generates HTML for a single side by side table
       
  1643     make_file -- generates complete HTML file with a single side by side table
       
  1644 
       
  1645     See tools/scripts/diff.py for an example usage of this class.
       
  1646     """
       
  1647 
       
    # Per-class aliases of the module-level templates, so a subclass can
    # override any piece of the generated HTML independently.
    _file_template = _file_template
    _styles = _styles
    _table_template = _table_template
    _legend = _legend
    # Seed for the "%(prefix)s" anchor-name prefix in _table_template;
    # presumably bumped per table elsewhere in the class -- not visible here.
    _default_prefix = 0
       
  1653 
       
  1654     def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None,
       
  1655                  charjunk=IS_CHARACTER_JUNK):
       
  1656         """HtmlDiff instance initializer
       
  1657 
       
  1658         Arguments:
       
  1659         tabsize -- tab stop spacing, defaults to 8.
       
  1660         wrapcolumn -- column number where lines are broken and wrapped,
       
  1661             defaults to None where lines are not wrapped.
       
  1662         linejunk,charjunk -- keyword arguments passed into ndiff() (used to by
       
  1663             HtmlDiff() to generate the side by side HTML differences).  See
       
  1664             ndiff() documentation for argument default values and descriptions.
       
  1665         """
       
  1666         self._tabsize = tabsize
       
  1667         self._wrapcolumn = wrapcolumn
       
  1668         self._linejunk = linejunk
       
  1669         self._charjunk = charjunk
       
  1670 
       
  1671     def make_file(self,fromlines,tolines,fromdesc='',todesc='',context=False,
       
  1672                   numlines=5):
       
  1673         """Returns HTML file of side by side comparison with change highlights
       
  1674 
       
  1675         Arguments:
       
  1676         fromlines -- list of "from" lines
       
  1677         tolines -- list of "to" lines
       
  1678         fromdesc -- "from" file column header string
       
  1679         todesc -- "to" file column header string
       
  1680         context -- set to True for contextual differences (defaults to False
       
  1681             which shows full differences).
       
  1682         numlines -- number of context lines.  When context is set True,
       
  1683             controls number of lines displayed before and after the change.
       
  1684             When context is False, controls the number of lines to place
       
  1685             the "next" link anchors before the next change (so click of
       
  1686             "next" link jumps to just before the change).
       
  1687         """
       
  1688 
       
  1689         return self._file_template % dict(
       
  1690             styles = self._styles,
       
  1691             legend = self._legend,
       
  1692             table = self.make_table(fromlines,tolines,fromdesc,todesc,
       
  1693                                     context=context,numlines=numlines))
       
  1694 
       
  1695     def _tab_newline_replace(self,fromlines,tolines):
       
  1696         """Returns from/to line lists with tabs expanded and newlines removed.
       
  1697 
       
  1698         Instead of tab characters being replaced by the number of spaces
       
  1699         needed to fill in to the next tab stop, this function will fill
       
  1700         the space with tab characters.  This is done so that the difference
       
  1701         algorithms can identify changes in a file when tabs are replaced by
       
  1702         spaces and vice versa.  At the end of the HTML generation, the tab
       
  1703         characters will be replaced with a nonbreakable space.
       
  1704         """
       
  1705         def expand_tabs(line):
       
  1706             # hide real spaces
       
  1707             line = line.replace(' ','\0')
       
  1708             # expand tabs into spaces
       
  1709             line = line.expandtabs(self._tabsize)
       
  1710             # relace spaces from expanded tabs back into tab characters
       
  1711             # (we'll replace them with markup after we do differencing)
       
  1712             line = line.replace(' ','\t')
       
  1713             return line.replace('\0',' ').rstrip('\n')
       
  1714         fromlines = [expand_tabs(line) for line in fromlines]
       
  1715         tolines = [expand_tabs(line) for line in tolines]
       
  1716         return fromlines,tolines
       
  1717 
       
  1718     def _split_line(self,data_list,line_num,text):
       
  1719         """Builds list of text lines by splitting text lines at wrap point
       
  1720 
       
  1721         This function will determine if the input text line needs to be
       
  1722         wrapped (split) into separate lines.  If so, the first wrap point
       
  1723         will be determined and the first line appended to the output
       
  1724         text line list.  This function is used recursively to handle
       
  1725         the second part of the split line to further split it.
       
  1726         """
       
  1727         # if blank line or context separator, just add it to the output list
       
  1728         if not line_num:
       
  1729             data_list.append((line_num,text))
       
  1730             return
       
  1731 
       
  1732         # if line text doesn't need wrapping, just add it to the output list
       
  1733         size = len(text)
       
  1734         max = self._wrapcolumn
       
  1735         if (size <= max) or ((size -(text.count('\0')*3)) <= max):
       
  1736             data_list.append((line_num,text))
       
  1737             return
       
  1738 
       
  1739         # scan text looking for the wrap point, keeping track if the wrap
       
  1740         # point is inside markers
       
  1741         i = 0
       
  1742         n = 0
       
  1743         mark = ''
       
  1744         while n < max and i < size:
       
  1745             if text[i] == '\0':
       
  1746                 i += 1
       
  1747                 mark = text[i]
       
  1748                 i += 1
       
  1749             elif text[i] == '\1':
       
  1750                 i += 1
       
  1751                 mark = ''
       
  1752             else:
       
  1753                 i += 1
       
  1754                 n += 1
       
  1755 
       
  1756         # wrap point is inside text, break it up into separate lines
       
  1757         line1 = text[:i]
       
  1758         line2 = text[i:]
       
  1759 
       
  1760         # if wrap point is inside markers, place end marker at end of first
       
  1761         # line and start marker at beginning of second line because each
       
  1762         # line will have its own table tag markup around it.
       
  1763         if mark:
       
  1764             line1 = line1 + '\1'
       
  1765             line2 = '\0' + mark + line2
       
  1766 
       
  1767         # tack on first line onto the output list
       
  1768         data_list.append((line_num,line1))
       
  1769 
       
  1770         # use this routine again to wrap the remaining text
       
  1771         self._split_line(data_list,'>',line2)
       
  1772 
       
  1773     def _line_wrapper(self,diffs):
       
  1774         """Returns iterator that splits (wraps) mdiff text lines"""
       
  1775 
       
  1776         # pull from/to data and flags from mdiff iterator
       
  1777         for fromdata,todata,flag in diffs:
       
  1778             # check for context separators and pass them through
       
  1779             if flag is None:
       
  1780                 yield fromdata,todata,flag
       
  1781                 continue
       
  1782             (fromline,fromtext),(toline,totext) = fromdata,todata
       
  1783             # for each from/to line split it at the wrap column to form
       
  1784             # list of text lines.
       
  1785             fromlist,tolist = [],[]
       
  1786             self._split_line(fromlist,fromline,fromtext)
       
  1787             self._split_line(tolist,toline,totext)
       
  1788             # yield from/to line in pairs inserting blank lines as
       
  1789             # necessary when one side has more wrapped lines
       
  1790             while fromlist or tolist:
       
  1791                 if fromlist:
       
  1792                     fromdata = fromlist.pop(0)
       
  1793                 else:
       
  1794                     fromdata = ('',' ')
       
  1795                 if tolist:
       
  1796                     todata = tolist.pop(0)
       
  1797                 else:
       
  1798                     todata = ('',' ')
       
  1799                 yield fromdata,todata,flag
       
  1800 
       
  1801     def _collect_lines(self,diffs):
       
  1802         """Collects mdiff output into separate lists
       
  1803 
       
  1804         Before storing the mdiff from/to data into a list, it is converted
       
  1805         into a single line of text with HTML markup.
       
  1806         """
       
  1807 
       
  1808         fromlist,tolist,flaglist = [],[],[]
       
  1809         # pull from/to data and flags from mdiff style iterator
       
  1810         for fromdata,todata,flag in diffs:
       
  1811             try:
       
  1812                 # store HTML markup of the lines into the lists
       
  1813                 fromlist.append(self._format_line(0,flag,*fromdata))
       
  1814                 tolist.append(self._format_line(1,flag,*todata))
       
  1815             except TypeError:
       
  1816                 # exceptions occur for lines where context separators go
       
  1817                 fromlist.append(None)
       
  1818                 tolist.append(None)
       
  1819             flaglist.append(flag)
       
  1820         return fromlist,tolist,flaglist
       
  1821 
       
  1822     def _format_line(self,side,flag,linenum,text):
       
  1823         """Returns HTML markup of "from" / "to" text lines
       
  1824 
       
  1825         side -- 0 or 1 indicating "from" or "to" text
       
  1826         flag -- indicates if difference on line
       
  1827         linenum -- line number (used for line number column)
       
  1828         text -- line text to be marked up
       
  1829         """
       
  1830         try:
       
  1831             linenum = '%d' % linenum
       
  1832             id = ' id="%s%s"' % (self._prefix[side],linenum)
       
  1833         except TypeError:
       
  1834             # handle blank lines where linenum is '>' or ''
       
  1835             id = ''
       
  1836         # replace those things that would get confused with HTML symbols
       
  1837         text=text.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")
       
  1838 
       
  1839         # make space non-breakable so they don't get compressed or line wrapped
       
  1840         text = text.replace(' ','&nbsp;').rstrip()
       
  1841 
       
  1842         return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \
       
  1843                % (id,linenum,text)
       
  1844 
       
  1845     def _make_prefix(self):
       
  1846         """Create unique anchor prefixes"""
       
  1847 
       
  1848         # Generate a unique anchor prefix so multiple tables
       
  1849         # can exist on the same HTML page without conflicts.
       
  1850         fromprefix = "from%d_" % HtmlDiff._default_prefix
       
  1851         toprefix = "to%d_" % HtmlDiff._default_prefix
       
  1852         HtmlDiff._default_prefix += 1
       
  1853         # store prefixes so line format method has access
       
  1854         self._prefix = [fromprefix,toprefix]
       
  1855 
       
  1856     def _convert_flags(self,fromlist,tolist,flaglist,context,numlines):
       
  1857         """Makes list of "next" links"""
       
  1858 
       
  1859         # all anchor names will be generated using the unique "to" prefix
       
  1860         toprefix = self._prefix[1]
       
  1861 
       
  1862         # process change flags, generating middle column of next anchors/links
       
  1863         next_id = ['']*len(flaglist)
       
  1864         next_href = ['']*len(flaglist)
       
  1865         num_chg, in_change = 0, False
       
  1866         last = 0
       
  1867         for i,flag in enumerate(flaglist):
       
  1868             if flag:
       
  1869                 if not in_change:
       
  1870                     in_change = True
       
  1871                     last = i
       
  1872                     # at the beginning of a change, drop an anchor a few lines
       
  1873                     # (the context lines) before the change for the previous
       
  1874                     # link
       
  1875                     i = max([0,i-numlines])
       
  1876                     next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg)
       
  1877                     # at the beginning of a change, drop a link to the next
       
  1878                     # change
       
  1879                     num_chg += 1
       
  1880                     next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % (
       
  1881                          toprefix,num_chg)
       
  1882             else:
       
  1883                 in_change = False
       
  1884         # check for cases where there is no content to avoid exceptions
       
  1885         if not flaglist:
       
  1886             flaglist = [False]
       
  1887             next_id = ['']
       
  1888             next_href = ['']
       
  1889             last = 0
       
  1890             if context:
       
  1891                 fromlist = ['<td></td><td>&nbsp;No Differences Found&nbsp;</td>']
       
  1892                 tolist = fromlist
       
  1893             else:
       
  1894                 fromlist = tolist = ['<td></td><td>&nbsp;Empty File&nbsp;</td>']
       
  1895         # if not a change on first line, drop a link
       
  1896         if not flaglist[0]:
       
  1897             next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix
       
  1898         # redo the last link to link to the top
       
  1899         next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix)
       
  1900 
       
  1901         return fromlist,tolist,flaglist,next_href,next_id
       
  1902 
       
  1903     def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,
       
  1904                    numlines=5):
       
  1905         """Returns HTML table of side by side comparison with change highlights
       
  1906 
       
  1907         Arguments:
       
  1908         fromlines -- list of "from" lines
       
  1909         tolines -- list of "to" lines
       
  1910         fromdesc -- "from" file column header string
       
  1911         todesc -- "to" file column header string
       
  1912         context -- set to True for contextual differences (defaults to False
       
  1913             which shows full differences).
       
  1914         numlines -- number of context lines.  When context is set True,
       
  1915             controls number of lines displayed before and after the change.
       
  1916             When context is False, controls the number of lines to place
       
  1917             the "next" link anchors before the next change (so click of
       
  1918             "next" link jumps to just before the change).
       
  1919         """
       
  1920 
       
  1921         # make unique anchor prefixes so that multiple tables may exist
       
  1922         # on the same page without conflict.
       
  1923         self._make_prefix()
       
  1924 
       
  1925         # change tabs to spaces before it gets more difficult after we insert
       
  1926         # markkup
       
  1927         fromlines,tolines = self._tab_newline_replace(fromlines,tolines)
       
  1928 
       
  1929         # create diffs iterator which generates side by side from/to data
       
  1930         if context:
       
  1931             context_lines = numlines
       
  1932         else:
       
  1933             context_lines = None
       
  1934         diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk,
       
  1935                       charjunk=self._charjunk)
       
  1936 
       
  1937         # set up iterator to wrap lines that exceed desired width
       
  1938         if self._wrapcolumn:
       
  1939             diffs = self._line_wrapper(diffs)
       
  1940 
       
  1941         # collect up from/to lines and flags into lists (also format the lines)
       
  1942         fromlist,tolist,flaglist = self._collect_lines(diffs)
       
  1943 
       
  1944         # process change flags, generating middle column of next anchors/links
       
  1945         fromlist,tolist,flaglist,next_href,next_id = self._convert_flags(
       
  1946             fromlist,tolist,flaglist,context,numlines)
       
  1947 
       
  1948         s = []
       
  1949         fmt = '            <tr><td class="diff_next"%s>%s</td>%s' + \
       
  1950               '<td class="diff_next">%s</td>%s</tr>\n'
       
  1951         for i in range(len(flaglist)):
       
  1952             if flaglist[i] is None:
       
  1953                 # mdiff yields None on separator lines skip the bogus ones
       
  1954                 # generated for the first line
       
  1955                 if i > 0:
       
  1956                     s.append('        </tbody>        \n        <tbody>\n')
       
  1957             else:
       
  1958                 s.append( fmt % (next_id[i],next_href[i],fromlist[i],
       
  1959                                            next_href[i],tolist[i]))
       
  1960         if fromdesc or todesc:
       
  1961             header_row = '<thead><tr>%s%s%s%s</tr></thead>' % (
       
  1962                 '<th class="diff_next"><br /></th>',
       
  1963                 '<th colspan="2" class="diff_header">%s</th>' % fromdesc,
       
  1964                 '<th class="diff_next"><br /></th>',
       
  1965                 '<th colspan="2" class="diff_header">%s</th>' % todesc)
       
  1966         else:
       
  1967             header_row = ''
       
  1968 
       
  1969         table = self._table_template % dict(
       
  1970             data_rows=''.join(s),
       
  1971             header_row=header_row,
       
  1972             prefix=self._prefix[1])
       
  1973 
       
  1974         return table.replace('\0+','<span class="diff_add">'). \
       
  1975                      replace('\0-','<span class="diff_sub">'). \
       
  1976                      replace('\0^','<span class="diff_chg">'). \
       
  1977                      replace('\1','</span>'). \
       
  1978                      replace('\t','&nbsp;')
       
  1979 
       
# Drop `re` from the module namespace now that the module-level setup is
# done (presumably it was imported further up only to build helpers at
# import time — the import itself is outside this view).
del re
       
  1981 
       
  1982 def restore(delta, which):
       
  1983     r"""
       
  1984     Generate one of the two sequences that generated a delta.
       
  1985 
       
  1986     Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
       
  1987     lines originating from file 1 or 2 (parameter `which`), stripping off line
       
  1988     prefixes.
       
  1989 
       
  1990     Examples:
       
  1991 
       
  1992     >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
       
  1993     ...              'ore\ntree\nemu\n'.splitlines(1))
       
  1994     >>> diff = list(diff)
       
  1995     >>> print ''.join(restore(diff, 1)),
       
  1996     one
       
  1997     two
       
  1998     three
       
  1999     >>> print ''.join(restore(diff, 2)),
       
  2000     ore
       
  2001     tree
       
  2002     emu
       
  2003     """
       
  2004     try:
       
  2005         tag = {1: "- ", 2: "+ "}[int(which)]
       
  2006     except KeyError:
       
  2007         raise ValueError, ('unknown delta choice (must be 1 or 2): %r'
       
  2008                            % which)
       
  2009     prefixes = ("  ", tag)
       
  2010     for line in delta:
       
  2011         if line[:2] in prefixes:
       
  2012             yield line[2:]
       
  2013 
       
  2014 def _test():
       
  2015     import doctest, difflib
       
  2016     return doctest.testmod(difflib)
       
  2017 
       
  2018 if __name__ == "__main__":
       
  2019     _test()