diff --git a/Doc/library/difflib.rst b/Doc/library/difflib.rst --- a/Doc/library/difflib.rst +++ b/Doc/library/difflib.rst @@ -88,7 +88,7 @@ The constructor for this class is: - .. method:: __init__(tabsize=8, wrapcolumn=None, linejunk=None, charjunk=IS_CHARACTER_JUNK) + .. method:: __init__(tabsize=8, wrapcolumn=None, linejunk=None, charjunk=IS_CHARACTER_JUNK, make_line_matcher=SequenceMatcher, make_char_matcher=SequenceMatcher) Initializes instance of :class:`HtmlDiff`. @@ -98,9 +98,13 @@ *wrapcolumn* is an optional keyword to specify column number where lines are broken and wrapped, defaults to ``None`` where lines are not wrapped. - *linejunk* and *charjunk* are optional keyword arguments passed into ``ndiff()`` - (used by :class:`HtmlDiff` to generate the side by side HTML differences). See - ``ndiff()`` documentation for argument default values and descriptions. + *linejunk*, *charjunk*, *make_line_matcher* and *make_char_matcher* are optional + keyword arguments passed into ``ndiff()`` (used by :class:`HtmlDiff` to generate + the side by side HTML differences). See ``ndiff()`` documentation for argument + default values and descriptions. + + .. versionchanged:: 3.5 + Added the *make_line_matcher* and *make_char_matcher* parameters. The following methods are public: @@ -136,7 +140,7 @@ contains a good example of its use. -.. function:: context_diff(a, b, fromfile='', tofile='', fromfiledate='', tofiledate='', n=3, lineterm='\\n') +.. function:: context_diff(a, b, fromfile='', tofile='', fromfiledate='', tofiledate='', n=3, lineterm='\\n', line_matcher=None) Compare *a* and *b* (lists of strings); return a delta (a :term:`generator` generating the delta lines) in context diff format. @@ -180,6 +184,9 @@ See :ref:`difflib-interface` for a more detailed example. + .. versionchanged:: 3.5 + Added the *line_matcher* parameter. + .. function:: get_close_matches(word, possibilities, n=3, cutoff=0.6) @@ -207,7 +214,7 @@ ['except'] -.. 
function:: ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK) +.. function:: ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK, make_line_matcher=SequenceMatcher, make_char_matcher=SequenceMatcher) Compare *a* and *b* (lists of strings); return a :class:`Differ`\ -style delta (a :term:`generator` generating the delta lines). @@ -228,6 +235,14 @@ function :func:`IS_CHARACTER_JUNK`, which filters out whitespace characters (a blank or tab; it's a bad idea to include newline in this!). + *make_line_matcher*: A callable that returns an implementation of + :class:`SequenceMatcher`; it is called with the linejunk function and the + two compared line sequences *a* and *b*. + + *make_char_matcher*: A callable that returns an implementation of + :class:`SequenceMatcher`; it is called with the charjunk function only, + not with the character sequences being compared. + :file:`Tools/scripts/ndiff.py` is a command-line front-end to this function. >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1), @@ -243,6 +258,9 @@ + tree + emu + .. versionchanged:: 3.5 + Added the *make_line_matcher* and *make_char_matcher* parameters. + .. function:: restore(sequence, which) @@ -267,7 +285,7 @@ emu -.. function:: unified_diff(a, b, fromfile='', tofile='', fromfiledate='', tofiledate='', n=3, lineterm='\\n') +.. function:: unified_diff(a, b, fromfile='', tofile='', fromfiledate='', tofiledate='', n=3, lineterm='\\n', line_matcher=None) Compare *a* and *b* (lists of strings); return a delta (a :term:`generator` generating the delta lines) in unified diff format. 
diff --git a/Lib/difflib.py b/Lib/difflib.py --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -27,7 +27,7 @@ """ __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher', - 'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff', + 'Differ', 'IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff', 'unified_diff', 'HtmlDiff', 'Match'] import heapq @@ -35,11 +35,13 @@ Match = _namedtuple('Match', 'a b size') + def _calculate_ratio(matches, length): if length: return 2.0 * matches / length return 1.0 + class SequenceMatcher: """ @@ -319,7 +321,7 @@ for elt in b2j.keys(): if isjunk(elt): junk.add(elt) - for elt in junk: # separate loop avoids separate list of keys + for elt in junk: # separate loop avoids separate list of keys del b2j[elt] # Purge popular elements that are not junk @@ -330,7 +332,7 @@ for elt, idxs in b2j.items(): if len(idxs) > ntest: popular.add(elt) - for elt in popular: # ditto; as fast for 1% deletion + for elt in popular: # ditto; as fast for 1% deletion del b2j[elt] def find_longest_match(self, alo, ahi, blo, bhi): @@ -685,7 +687,7 @@ # shorter sequence return _calculate_ratio(min(la, lb), la + lb) -def get_close_matches(word, possibilities, n=3, cutoff=0.6): +def get_close_matches(word, possibilities, n=3, cutoff=0.6, sequence_matcher=None): """Use SequenceMatcher to return list of the best "good enough" matches. word is a sequence for which close matches are desired (typically a @@ -700,6 +702,10 @@ Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities that don't score at least that similar to word are ignored. + Optional arg sequence_matcher (default None) is a SequenceMatcher + instance. Its a and b sequences will be replaced with the arguments + to this function. + The best (no more than n) matches among the possibilities are returned in a list, sorted by similarity score, most similar first. 
@@ -712,6 +718,7 @@ [] >>> get_close_matches("accept", _keyword.kwlist) ['except'] + """ if not n > 0: @@ -719,7 +726,10 @@ if not 0.0 <= cutoff <= 1.0: raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,)) result = [] - s = SequenceMatcher() + + # use the provided sequence matcher if offered + s = sequence_matcher if sequence_matcher else SequenceMatcher() + s.set_seq2(word) for x in possibilities: s.set_seq1(x) @@ -835,16 +845,20 @@ Methods: - __init__(linejunk=None, charjunk=None) - Construct a text differencer, with optional filters. + __init__(linejunk=None, charjunk=None, + make_line_matcher=SequenceMatcher, + make_char_matcher=SequenceMatcher) + Construct a text differencer, with optional filters and matcher + implementations. compare(a, b) Compare two sequences of lines; generate the resulting delta. """ - def __init__(self, linejunk=None, charjunk=None): - """ - Construct a text differencer, with optional filters. + def __init__(self, linejunk=None, charjunk=None, + make_line_matcher=SequenceMatcher, + make_char_matcher=SequenceMatcher): + """Construct a text differencer, with optional filters. The two optional keyword parameters are for filter functions: @@ -860,10 +874,28 @@ module-level function `IS_CHARACTER_JUNK` may be used to filter out whitespace characters (a blank or tab; **note**: bad idea to include newline in this!). Use of IS_CHARACTER_JUNK is recommended. + + - `make_line_matcher`: A callable that returns a SequenceMatcher + instance for matching lines in the input. The callable will be + called with the linejunk function, and the a and b + sequences. The method must conform to the SequenceMatcher + interface: + + line_cruncher = make_line_matcher(self.linejunk, a, b) + + - `make_char_matcher`: A callable that returns a SequenceMatcher + instance for matching characters in the input. 
The callable + will be called with the charjunk function and must conform to + the SequenceMatcher interface: + + char_cruncher = make_char_matcher(self.charjunk) + """ self.linejunk = linejunk self.charjunk = charjunk + self.make_line_matcher = make_line_matcher + self.make_char_matcher = make_char_matcher def compare(self, a, b): r""" @@ -891,7 +923,7 @@ + emu """ - cruncher = SequenceMatcher(self.linejunk, a, b) + cruncher = self.make_line_matcher(self.linejunk, a, b) for tag, alo, ahi, blo, bhi in cruncher.get_opcodes(): if tag == 'replace': g = self._fancy_replace(a, alo, ahi, b, blo, bhi) @@ -947,7 +979,7 @@ # don't synch up unless the lines have a similarity score of at # least cutoff; best_ratio tracks the best score seen so far best_ratio, cutoff = 0.74, 0.75 - cruncher = SequenceMatcher(self.charjunk) + cruncher = self.make_char_matcher(self.charjunk) eqi, eqj = None, None # 1st indices of equal lines (if any) # search for the pair that matches best without being identical @@ -1134,7 +1166,8 @@ return '{},{}'.format(beginning, length) def unified_diff(a, b, fromfile='', tofile='', fromfiledate='', - tofiledate='', n=3, lineterm='\n'): + tofiledate='', n=3, lineterm='\n', + line_matcher=None): r""" Compare two sequences of lines; generate the delta as a unified diff. @@ -1175,7 +1208,13 @@ """ started = False - for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n): + + if line_matcher: + matcher = line_matcher + else: + matcher = SequenceMatcher(None) + matcher.set_seqs(a, b) + for group in matcher.get_grouped_opcodes(n): if not started: started = True fromdate = '\t{}'.format(fromfiledate) if fromfiledate else '' @@ -1218,7 +1257,8 @@ # See http://www.unix.org/single_unix_specification/ def context_diff(a, b, fromfile='', tofile='', - fromfiledate='', tofiledate='', n=3, lineterm='\n'): + fromfiledate='', tofiledate='', n=3, lineterm='\n', + line_matcher=None): r""" Compare two sequences of lines; generate the delta as a context diff. 
@@ -1263,7 +1303,13 @@ prefix = dict(insert='+ ', delete='- ', replace='! ', equal=' ') started = False - for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n): + + if line_matcher: + matcher = line_matcher + else: + matcher = SequenceMatcher(None) + matcher.set_seqs(a, b) + for group in matcher.get_grouped_opcodes(n): if not started: started = True fromdate = '\t{}'.format(fromfiledate) if fromfiledate else '' @@ -1292,7 +1338,9 @@ for line in b[j1:j2]: yield prefix[tag] + line -def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK): +def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK, + make_line_matcher=SequenceMatcher, + make_char_matcher=SequenceMatcher): r""" Compare `a` and `b` (lists of strings); return a `Differ`-style delta. @@ -1310,6 +1358,18 @@ whitespace characters (a blank or tab; note: it's a bad idea to include newline in this!). + - `make_line_matcher`: A callable that returns a SequenceMatcher + instance for matching lines in the input. The callable will be + called with the linejunk function, and the a and b sequences: + + line_cruncher = make_line_matcher(linejunk, a, b) + + - `make_char_matcher`: A callable that returns a SequenceMatcher + instance for matching characters in the input. The callable will be + called with the charjunk function only: + + char_cruncher = make_char_matcher(charjunk) + Tools/scripts/ndiff.py is a command-line front-end to this function. Example: @@ -1327,10 +1387,12 @@ + tree + emu """ - return Differ(linejunk, charjunk).compare(a, b) + return Differ(linejunk, charjunk, make_line_matcher, make_char_matcher).compare(a, b) def _mdiff(fromlines, tolines, context=None, linejunk=None, - charjunk=IS_CHARACTER_JUNK): + charjunk=IS_CHARACTER_JUNK, + make_line_matcher=SequenceMatcher, + make_char_matcher=SequenceMatcher): r"""Returns generator yielding marked up from/to side by side differences. 
Arguments: @@ -1340,6 +1402,8 @@ if None, all from/to text lines will be generated. linejunk -- passed on to ndiff (see ndiff documentation) charjunk -- passed on to ndiff (see ndiff documentation) + make_line_matcher -- passed on to ndiff (see ndiff documentation) + make_char_matcher -- passed on to ndiff (see ndiff documentation) This function returns an iterator which returns a tuple: (from line tuple, to line tuple, boolean flag) @@ -1369,7 +1433,10 @@ change_re = re.compile('(\++|\-+|\^+)') # create the difference iterator to generate the differences - diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk) + diff_lines_iterator = ndiff(fromlines,tolines, + linejunk,charjunk, + make_line_matcher, + make_char_matcher) def _make_line(lines, format_key, side, num_lines=[0,0]): """Returns line of text with user's change markup and line formatting. @@ -1672,21 +1739,26 @@ _default_prefix = 0 def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None, - charjunk=IS_CHARACTER_JUNK): + charjunk=IS_CHARACTER_JUNK, + make_line_matcher=SequenceMatcher, + make_char_matcher=SequenceMatcher): """HtmlDiff instance initializer Arguments: tabsize -- tab stop spacing, defaults to 8. wrapcolumn -- column number where lines are broken and wrapped, defaults to None where lines are not wrapped. - linejunk,charjunk -- keyword arguments passed into ndiff() (used by - HtmlDiff() to generate the side by side HTML differences). See - ndiff() documentation for argument default values and descriptions. + linejunk,charjunk,make_line_matcher,make_char_matcher -- keyword + arguments passed into ndiff() (used by HtmlDiff() to generate + the side by side HTML differences). See ndiff() documentation + for argument default values and descriptions. 
""" self._tabsize = tabsize self._wrapcolumn = wrapcolumn self._linejunk = linejunk self._charjunk = charjunk + self._make_line_matcher = make_line_matcher + self._make_char_matcher = make_char_matcher def make_file(self,fromlines,tolines,fromdesc='',todesc='',context=False, numlines=5): @@ -1951,8 +2023,11 @@ context_lines = numlines else: context_lines = None - diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk, - charjunk=self._charjunk) + diffs = _mdiff(fromlines, tolines, context_lines, + linejunk=self._linejunk, + charjunk=self._charjunk, + make_line_matcher=self._make_line_matcher, + make_char_matcher=self._make_char_matcher) # set up iterator to wrap lines that exceed desired width if self._wrapcolumn: diff --git a/Lib/test/test_difflib.py b/Lib/test/test_difflib.py --- a/Lib/test/test_difflib.py +++ b/Lib/test/test_difflib.py @@ -1,6 +1,7 @@ import difflib from test.support import run_unittest, findfile import unittest +from unittest import mock import doctest import sys @@ -278,12 +279,45 @@ self.assertEqual(fmt(0,0), '0') +class TestDifflibSequenceMatcherSubstitution(unittest.TestCase): + + def test_get_close_matches(self): + m = mock.Mock(spec=difflib.SequenceMatcher) + m.real_quick_ratio.return_value = 0.0 + difflib.get_close_matches('w', ('a'), sequence_matcher=m) + self.assertGreater(len(m.mock_calls), 0, + "At least one call must have been made to the matcher") + + def test_differ_default_matchers(self): + d = difflib.Differ() + self.assertIsInstance(d.make_line_matcher(None, 'a', 'b'), difflib.SequenceMatcher) + self.assertIsInstance(d.make_char_matcher(None, 'a', 'b'), difflib.SequenceMatcher) + + def test_differ_provided_matchers(self): + d = difflib.Differ(make_line_matcher=lambda *a, **kw: mock.sentinel.LineMatcher, + make_char_matcher=lambda *a, **kw: mock.sentinel.CharMatcher) + self.assertEqual(d.make_line_matcher(None, 'a', 'b'), mock.sentinel.LineMatcher) + self.assertEqual(d.make_char_matcher(None, 'a', 'b'), 
mock.sentinel.CharMatcher) + + def test_unified_diff_uses_provided_matcher(self): + lm = mock.Mock(spec=difflib.SequenceMatcher) + lm.get_grouped_opcodes.return_value = [] + list(difflib.unified_diff('a', 'b', line_matcher=lm)) + self.assertTrue(lm.get_grouped_opcodes.called) + + def test_context_diff_uses_provided_matcher(self): + lm = mock.Mock(spec=difflib.SequenceMatcher) + lm.get_grouped_opcodes.return_value = [] + list(difflib.context_diff('a', 'b', line_matcher=lm)) + self.assertTrue(lm.get_grouped_opcodes.called) + + def test_main(): difflib.HtmlDiff._default_prefix = 0 Doctests = doctest.DocTestSuite(difflib) run_unittest( TestWithAscii, TestAutojunk, TestSFpatches, TestSFbugs, - TestOutputFormat, Doctests) + TestOutputFormat, Doctests, TestDifflibSequenceMatcherSubstitution) if __name__ == '__main__': test_main() diff --git a/Misc/ACKS b/Misc/ACKS --- a/Misc/ACKS +++ b/Misc/ACKS @@ -1111,6 +1111,7 @@ Armin Ronacher Case Roole Timothy Roscoe +Chris Rose Erik Rose Josh Rosenberg Jim Roskind