# HG changeset patch # User RussellBallestrini # Date 1398390177 14400 # Thu Apr 24 21:42:57 2014 -0400 # Node ID a138619f47292c80fca77fbdc9621afa05f473dc # Parent 78bbfcccff11eab2d1010f54a79b9eaf11f55983 Added documentation for get_scored_matches() changed Doc/library/difflib.rst diff -r 78bbfcccff11 -r a138619f4729 Doc/library/difflib.rst --- a/Doc/library/difflib.rst Thu Apr 24 20:58:05 2014 -0400 +++ b/Doc/library/difflib.rst Thu Apr 24 21:42:57 2014 -0400 @@ -207,6 +207,33 @@ ['except'] +.. function:: get_scored_matches(word, possibilities, n=3, cutoff=0.6) + + Return a list of the best "good enough" matches as ``(score, word)`` tuples. + This function operates the same as *get_close_matches* and accepts + the same arguments. The only difference is the return type:: + + [(score, word), ...] + + *word* is a sequence for which close matches are desired + (typically a string), and *possibilities* is a list of sequences + against which to match *word* (typically a list of strings). + + Optional argument *n* (default ``3``) is the maximum number of + close matches to return; *n* must be greater than ``0``. + + Optional argument *cutoff* (default ``0.6``) is a float in the + range [0, 1]. Possibilities that don't score at least that similar + to *word* are ignored. + + The best (no more than *n*) matches among the possibilities are + returned in a list, sorted by similarity score, most similar first. + + >>> import keyword as _keyword + >>> get_scored_matches("wheel", _keyword.kwlist) + [(0.6, 'while')] + + .. 
function:: ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK) Compare *a* and *b* (lists of strings); return a :class:`Differ`\ -style # HG changeset patch # User RussellBallestrini # Date 1398387485 14400 # Thu Apr 24 20:58:05 2014 -0400 # Node ID 78bbfcccff11eab2d1010f54a79b9eaf11f55983 # Parent 4ff37fbcd4e829ed3f57edd241f2d9f681707154 New function in difflib: get_scored_matches() This function acts just like the existing get_close_matches() function; however, instead of returning a list of words, it returns a list of (score, word) tuples. This gives the end-user the ability to access the computationally expensive scores/ratios produced as a by-product. This patch also contains complete test coverage for both get_close_matches() and get_scored_matches(). The new usage does _not_ impact backward compatibility:: >>> import difflib >>> import keyword as _keyword >>> difflib.get_scored_matches("wheel", _keyword.kwlist) [(0.6, 'while')] >>> difflib.get_close_matches("wheel", _keyword.kwlist) ['while'] hg: branch 'default' changed Lib/difflib.py changed Lib/test/test_difflib.py diff -r 4ff37fbcd4e8 -r 78bbfcccff11 Lib/difflib.py --- a/Lib/difflib.py Wed Apr 23 15:37:37 2014 -0500 +++ b/Lib/difflib.py Thu Apr 24 20:58:05 2014 -0400 @@ -4,6 +4,9 @@ Function get_close_matches(word, possibilities, n=3, cutoff=0.6): Use SequenceMatcher to return list of the best "good enough" matches. +Function get_scored_matches(word, possibilities, n=3, cutoff=0.6): + Use SequenceMatcher to return list of the best (score, word) tuples. + Function context_diff(a, b): For two lists of strings, return a delta in context diff format. @@ -713,6 +716,20 @@ >>> get_close_matches("accept", _keyword.kwlist) ['except'] """ + result = get_scored_matches(word, possibilities, n, cutoff) + # Transform the list of (score, word) tuples into a list of words. 
+ return [x for score, x in result] + + +def get_scored_matches(word, possibilities, n=3, cutoff=0.6): + """Use SequenceMatcher to return list of the best (score, word) tuples. + + All arguments are the same as get_close_matches(). + + >>> import keyword as _keyword + >>> get_scored_matches("wheel", _keyword.kwlist) + [(0.6, 'while')] + """ if not n > 0: raise ValueError("n must be > 0: %r" % (n,)) @@ -728,10 +745,8 @@ s.ratio() >= cutoff: result.append((s.ratio(), x)) - # Move the best scorers to head of list - result = heapq.nlargest(n, result) - # Strip scores for the best n matches - return [x for score, x in result] + # Move the best scorers to head of list and return result. + return heapq.nlargest(n, result) def _count_leading(line, ch): """ diff -r 4ff37fbcd4e8 -r 78bbfcccff11 Lib/test/test_difflib.py --- a/Lib/test/test_difflib.py Wed Apr 23 15:37:37 2014 -0500 +++ b/Lib/test/test_difflib.py Thu Apr 24 20:58:05 2014 -0400 @@ -278,12 +278,61 @@ self.assertEqual(fmt(0,0), '0') +class TestGetCloseScoredMatches(unittest.TestCase): + """ + This test suite covers the following difflib functions: + + * get_scored_matches + * get_close_matches + """ + def test_get_close_matches_with_defaults(self): + matches = difflib.get_close_matches("appel", + ["ape", "apple", "peach", "puppy"]) + self.assertEqual(matches, ["apple", "ape"]) + + def test_get_scored_matches_with_defaults(self): + matches = difflib.get_scored_matches("appel", + ["ape", "apple", "peach", "puppy"]) + self.assertEqual(matches, [(0.8, "apple"), (0.75, "ape")]) + + def test_scored_dog_is_dog(self): + matches = difflib.get_scored_matches("dog", ["dog"]) + self.assertEqual(matches, [(1.0, "dog")]) + + def test_scored_dog_is_not_cat(self): + matches = difflib.get_scored_matches("dog", ["cat"]) + self.assertEqual(matches, []) + + def test_scored_n_is_one(self): + matches = difflib.get_scored_matches("dog", ["dog", "dog"], n=1) + self.assertEqual(len(matches), 1) + self.assertEqual(matches, [(1.0, "dog")]) 
+ + def test_scored_cutoff_is_point_nine_five(self): + matches = difflib.get_scored_matches("dog", ["dog", "doge"], + cutoff=.95) + self.assertEqual(len(matches), 1) + self.assertEqual(matches, [(1.0, "dog")]) + + def test_negative_n_is_value_error(self): + self.assertRaises(ValueError, + difflib.get_close_matches, "a", ["a"], n=-1) + + def test_negative_cutoff_is_value_error(self): + self.assertRaises(ValueError, + difflib.get_close_matches, "a", ["a"], cutoff=-.5) + + def test_two_cutoff_is_value_error(self): + self.assertRaises(ValueError, + difflib.get_close_matches, "a", ["a"], cutoff=2.0) + + def test_main(): difflib.HtmlDiff._default_prefix = 0 Doctests = doctest.DocTestSuite(difflib) run_unittest( TestWithAscii, TestAutojunk, TestSFpatches, TestSFbugs, - TestOutputFormat, Doctests) + TestOutputFormat, TestGetCloseScoredMatches, Doctests) if __name__ == '__main__': test_main()