Index: Misc/NEWS =================================================================== --- Misc/NEWS (revision 86403) +++ Misc/NEWS (working copy) @@ -213,6 +213,10 @@ - Issue #808164: Fixed socket.close to avoid references to globals, to avoid issues when socket.close is called from a __del__ method. +- Issue #2986: difflib.SequenceMatcher gets a new parameter, autojunk, which + can be set to False to turn off the previously undocumented 'popularity' + heuristic. Patch by Terry Reedy and Eli Bendersky + - Issue #8797: urllib2 does a retry for Basic Authentication failure instead of falling into recursion. Index: Doc/library/difflib.rst =================================================================== --- Doc/library/difflib.rst (revision 86403) +++ Doc/library/difflib.rst (working copy) @@ -37,7 +37,16 @@ complicated way on how many elements the sequences have in common; best case time is linear. + **Automatic junk heuristic:** :class:`SequenceMatcher` supports a heuristic that + automatically treats certain sequence items as junk. The heuristic counts how many + times each individual item appears in the sequence. If an item's duplicates (after + the first one) account for more than 1% of the sequence and the sequence is at least + 200 items long, this item is marked as "popular" and is treated as junk for + the purpose of sequence matching. This heuristic can be turned off by setting + the ``autojunk`` argument to ``False`` when creating the :class:`SequenceMatcher`. + + .. class:: Differ This is a class for comparing sequences of lines of text, and producing @@ -334,7 +343,7 @@ The :class:`SequenceMatcher` class has this constructor: -.. class:: SequenceMatcher([isjunk[, a[, b]]]) +.. class:: SequenceMatcher([isjunk[, a[, b[, autojunk=True]]]]) Optional argument *isjunk* must be ``None`` (the default) or a one-argument function that takes a sequence element and returns true if and only if the @@ -350,6 +359,9 @@ The optional arguments *a* and *b* are sequences to be compared; both default to empty strings. The elements of both sequences must be :term:`hashable`. + The optional argument *autojunk* can be used to disable the automatic junk + heuristic. + :class:`SequenceMatcher` objects have the following methods: Index: Lib/difflib.py =================================================================== --- Lib/difflib.py (revision 86403) +++ Lib/difflib.py (working copy) @@ -151,7 +151,7 @@ Return an upper bound on ratio() very quickly. """ - def __init__(self, isjunk=None, a='', b=''): + def __init__(self, isjunk=None, a='', b='', autojunk=True): """Construct a SequenceMatcher. Optional arg isjunk is None (the default), or a one-argument @@ -169,6 +169,10 @@ Optional arg b is the second of two sequences to be compared. By default, an empty string. The elements of b must be hashable. See also .set_seqs() and .set_seq2(). + + Optional arg autojunk should be set to False to disable the + "automatic junk heuristic" that treats popular elements as junk + (see module documentation for more information). """ # Members: @@ -207,11 +211,13 @@ # DOES NOT WORK for x in a! # isbpopular # for x in b, isbpopular(x) is true iff b is reasonably long - # (at least 200 elements) and x accounts for more than 1% of - # its elements. DOES NOT WORK for x in a! + # (at least 200 elements) and x accounts for more than 1 + 1% of + # its elements (when autojunk is enabled). + # DOES NOT WORK for x in a! self.isjunk = isjunk self.a = self.b = None + self.autojunk = autojunk self.set_seqs(a, b) def set_seqs(self, a, b): @@ -288,7 +294,7 @@ # from starting any matching block at a junk element ... # also creates the fast isbjunk function ... # b2j also does not contain entries for "popular" elements, meaning - # elements that account for more than 1% of the total elements, and + # elements that account for more than 1 + 1% of the total elements, and # when the sequence is reasonably large (>= 200 elements); this can # be viewed as an adaptive notion of semi-junk, and yields an enormous # speedup when, e.g., comparing program files with hundreds of @@ -309,45 +315,38 @@ # out the junk later is much cheaper than building b2j "right" # from the start. b = self.b - n = len(b) self.b2j = b2j = {} - populardict = {} + for i, elt in enumerate(b): - if elt in b2j: - indices = b2j[elt] - if n >= 200 and len(indices) * 100 > n: - populardict[elt] = 1 - del indices[:] - else: - indices.append(i) - else: - b2j[elt] = [i] + indices = b2j.setdefault(elt, []) + indices.append(i) - # Purge leftover indices for popular elements. - for elt in populardict: - del b2j[elt] - - # Now b2j.keys() contains elements uniquely, and especially when - # the sequence is a string, that's usually a good deal smaller - # than len(string). The difference is the number of isjunk calls - # saved. + # Purge junk elements + junk = set() isjunk = self.isjunk - junkdict = {} if isjunk: - for d in populardict, b2j: - for elt in d.keys(): - if isjunk(elt): - junkdict[elt] = 1 - del d[elt] + for elt in list(b2j.keys()): # using list() since b2j is modified + if isjunk(elt): + junk.add(elt) + del b2j[elt] - # Now for x in b, isjunk(x) == x in junkdict, but the - # latter is much faster. Note too that while there may be a - # lot of junk in the sequence, the number of *unique* junk - # elements is probably small. So the memory burden of keeping - # this dict alive is likely trivial compared to the size of b2j. - self.isbjunk = junkdict.__contains__ - self.isbpopular = populardict.__contains__ + # Purge popular elements that are not junk + popular = set() + n = len(b) + if self.autojunk and n >= 200: + ntest = n // 100 + 1 + for elt, idxs in list(b2j.items()): + if len(idxs) > ntest: + popular.add(elt) + del b2j[elt] + # Now for x in b, isjunk(x) == x in junk, but the latter is much faster. + # Sicne the number of *unique* junk elements is probably small, the + # memory burden of keeping this set alive is likely trivial compared to + # the size of b2j. + self.isbjunk = junk.__contains__ + self.isbpopular = popular.__contains__ + def find_longest_match(self, alo, ahi, blo, bhi): """Find longest matching block in a[alo:ahi] and b[blo:bhi]. Index: Lib/test/test_difflib.py =================================================================== --- Lib/test/test_difflib.py (revision 86403) +++ Lib/test/test_difflib.py (working copy) @@ -4,8 +4,47 @@ import doctest import sys + +class TestWithAscii(unittest.TestCase): + def test_one_insert(self): + sm = difflib.SequenceMatcher(None, 'b' * 100, 'a' + 'b' * 100) + self.assertAlmostEqual(sm.ratio(), 0.995, places=3) + self.assertEqual(list(sm.get_opcodes()), + [ ('insert', 0, 0, 0, 1), + ('equal', 0, 100, 1, 101)]) + sm = difflib.SequenceMatcher(None, 'b' * 100, 'b' * 50 + 'a' + 'b' * 50) + self.assertAlmostEqual(sm.ratio(), 0.995, places=3) + self.assertEqual(list(sm.get_opcodes()), + [ ('equal', 0, 50, 0, 50), + ('insert', 50, 50, 50, 51), + ('equal', 50, 100, 51, 101)]) + + def test_one_delete(self): + sm = difflib.SequenceMatcher(None, 'a' * 40 + 'c' + 'b' * 40, 'a' * 40 + 'b' * 40) + self.assertAlmostEqual(sm.ratio(), 0.994, places=3) + self.assertEqual(list(sm.get_opcodes()), + [ ('equal', 0, 40, 0, 40), + ('delete', 40, 41, 40, 40), + ('equal', 41, 81, 40, 80)]) + + +class TestAutojunk(unittest.TestCase): + """Tests for the autojunk parameter added in 2.7""" + def test_one_insert_homogenous_sequence(self): + # By default autojunk=True and the heuristic kicks in for a sequence + # of length 200+ + seq1 = 'b' * 200 + seq2 = 'a' + 'b' * 200 + + sm = difflib.SequenceMatcher(None, seq1, seq2) + self.assertAlmostEqual(sm.ratio(), 0, places=3) + + # Now turn the heuristic off + sm = difflib.SequenceMatcher(None, seq1, seq2, autojunk=False) + self.assertAlmostEqual(sm.ratio(), 0.9975, places=3) + + class TestSFbugs(unittest.TestCase): - def test_ratio_for_null_seqn(self): # Check clearing of SF bug 763023 s = difflib.SequenceMatcher(None, [], []) @@ -184,7 +223,9 @@ def test_main(): difflib.HtmlDiff._default_prefix = 0 Doctests = doctest.DocTestSuite(difflib) - run_unittest(TestSFpatches, TestSFbugs, TestOutputFormat, Doctests) + run_unittest( + TestWithAscii, TestAutojunk, TestSFpatches, TestSFbugs, + TestOutputFormat, Doctests) if __name__ == '__main__': test_main()