This issue tracker has been migrated to GitHub, and is currently read-only.
For more information, see the GitHub FAQs in the Python's Developer Guide.

classification
Title: possible wrong result for difflib.SequenceMatcher.ratio()
Type: behavior Stage: resolved
Components: Library (Lib) Versions: Python 3.11, Python 3.10, Python 3.9, Python 3.8, Python 3.7, Python 3.6
process
Status: closed Resolution: not a bug
Dependencies: Superseder:
Assigned To: Nosy List: nalza001, tim.peters
Priority: normal Keywords:

Created on 2021-09-13 04:02 by nalza001, last changed 2022-04-11 14:59 by admin. This issue is now closed.

Messages (6)
msg401683 - (view) Author: Nabeel Alzahrani (nalza001) Date: 2021-09-13 04:02
The difflib.SequenceMatcher.ratio() gives 0.3 instead of 1.0 or at least 0.9 for the following two strings a and b: 
a="""
#include <iostream>
#include <string>
using namespace std;
int main() {
   string userWord;
   unsigned int i;
      cin >> userWord;

      
      for(i = 0; i < userWord.size(); i++) {
         if(userWord.at(i) == 'i') {
            userWord.at(i) = '1';
         }
         if(userWord.at(i) == 'a') {
            userWord.at(i) = '@'; 
         }
         if(userWord.at(i) == 'm') {
            userWord.at(i) = 'M';
         }
         if(userWord.at(i) == 'B') {
            userWord.at(i) = '8';
         }
         if(userWord.at(i) == 's') {
            userWord.at(i) = '$';
         }
         userWord.push_back('!');
      }
      cout << userWord << endl;
   return 0;
}
"""

b="""
#include <iostream>
#include <string>
using namespace std;
int main() {
   string userWord;
   unsigned int i;
      cin >> userWord;
      userWord.push_back('!');
      
      for(i = 0; i < userWord.size(); i++) {
         if(userWord.at(i) == 'i') {
            userWord.at(i) = '1';
         }
         if(userWord.at(i) == 'a') {
            userWord.at(i) = '@'; 
         }
         if(userWord.at(i) == 'm') {
            userWord.at(i) = 'M';
         }
         if(userWord.at(i) == 'B') {
            userWord.at(i) = '8';
         }
         if(userWord.at(i) == 's') {
            userWord.at(i) = '$';
         }
   
      }
      cout << userWord << endl;
   return 0;
}
"""
msg401685 - (view) Author: Tim Peters (tim.peters) * (Python committer) Date: 2021-09-13 04:10
Unfortunately, you're getting hurt by the "autojunk" feature (see the docs). If you turn it off, you'll get a result much more to your liking:

>>> print(SequenceMatcher(None, a, b).ratio())
0.3431803896920176

>>> print(SequenceMatcher(None, a, b, autojunk=False).ratio())
0.9553739786297926
msg401912 - (view) Author: Nabeel Alzahrani (nalza001) Date: 2021-09-16 02:03
But when I turn off the "autojunk" feature for the following example, I get the wrong ratio of 0.5 instead of the correct ratio of 0.2 with autojunk enabled.

a="""
#include <iostream>
#include <string>
using namespace std;
int main() {
   
   string userPass;
   int sMaxIndex;
   char indivChar;
   int i;
   
   cin >> userPass;
   
   sMaxIndex = userPass.size() - 1;
   
   
   for (i = 0; i <= sMaxIndex; ++i) {
      
      indivChar = userPass.at(i);
      
      if (indivChar == 'i') {
         
         indivChar = '1';
         cout << indivChar;
         
      }
      else if (indivChar == 'a') {
         
         indivChar = '@';
         cout << indivChar;
         
      }
      else if (indivChar == 'm') {
         
         indivChar = 'M';
         cout << indivChar;
         
      }
      else if (indivChar == 'B') {
         
         indivChar = '8';
         cout << indivChar;
         
      }
      else if (indivChar == 's') {
         
         indivChar = '$';
         cout << indivChar;
         
      }
      else {
         
         cout << indivChar;
         
      }
      
   }
   
   cout << "!" << endl;
   
   return 0;
}
"""

b="""
#include <iostream>
#include <string>
using namespace std;
int main() {
   string ori;
   cin >> ori;
   for (int i = 0; i < ori.size(); i++){
      if (ori.at(i) == 'i')
         ori.at(i) = '1';
      if (ori.at(i) == 'a')
         ori.at(i) = '@';
      if (ori.at(i) == 'm')
         ori.at(i) = 'M';
      if (ori.at(i) == 'B')
         ori.at(i) = '8';
      if (ori.at(i) == 's')
         ori.at(i) = '$';
      }
   cout << ori << endl;

   return 0;
}
"""
msg401915 - (view) Author: Tim Peters (tim.peters) * (Python committer) Date: 2021-09-16 03:46
I have no idea why you think the result should be 0.2. 0.5630188679245283 looks correct to me with autojunk disabled:

sm = SequenceMatcher(None, a, b, autojunk=False)
total = 0
for m in sm.get_matching_blocks():
	print(m, repr(a[m.a : m.a + m.size]))
	total += m.size

Running that displays every matching block:

Match(a=0, b=0, size=73) '\n#include <iostream>\n#include <string>\nusing namespace std;\nint main() {\n'
Match(a=74, b=73, size=10) '   string '
Match(a=87, b=84, size=1) 'r'
Match(a=138, b=85, size=2) 'i;'
Match(a=141, b=87, size=11) '\n   cin >> '
Match(a=155, b=99, size=1) 'r'
Match(a=160, b=101, size=1) ';'
Match(a=200, b=102, size=10) '\n   for (i'
Match(a=210, b=116, size=9) ' = 0; i <'
Match(a=220, b=125, size=1) ' '
Match(a=221, b=130, size=1) 's'
Match(a=228, b=133, size=1) 'e'
Match(a=230, b=136, size=2) '; '
Match(a=232, b=139, size=2) '++'
Match(a=235, b=141, size=1) ')'
Match(a=237, b=142, size=1) '{'
Match(a=274, b=143, size=11) '\n      if ('
Match(a=285, b=156, size=1) 'i'
Match(a=288, b=161, size=1) 'i'
Match(a=294, b=163, size=8) " == 'i')"
Match(a=305, b=171, size=10) '\n         '
Match(a=315, b=183, size=1) 'i'
Match(a=318, b=188, size=1) 'i'
Match(a=324, b=190, size=14) " = '1';\n      "
Match(a=380, b=204, size=4) 'if ('
Match(a=384, b=210, size=1) 'i'
Match(a=387, b=215, size=1) 'i'
Match(a=393, b=217, size=8) " == 'a')"
Match(a=404, b=225, size=10) '\n         '
Match(a=414, b=237, size=1) 'i'
Match(a=417, b=242, size=1) 'i'
Match(a=423, b=244, size=14) " = '@';\n      "
Match(a=479, b=258, size=4) 'if ('
Match(a=483, b=264, size=1) 'i'
Match(a=486, b=269, size=1) 'i'
Match(a=492, b=271, size=8) " == 'm')"
Match(a=503, b=279, size=10) '\n         '
Match(a=513, b=291, size=1) 'i'
Match(a=516, b=296, size=1) 'i'
Match(a=522, b=298, size=14) " = 'M';\n      "
Match(a=578, b=312, size=4) 'if ('
Match(a=582, b=318, size=1) 'i'
Match(a=585, b=323, size=1) 'i'
Match(a=591, b=325, size=8) " == 'B')"
Match(a=602, b=333, size=10) '\n         '
Match(a=612, b=345, size=1) 'i'
Match(a=615, b=350, size=1) 'i'
Match(a=621, b=352, size=14) " = '8';\n      "
Match(a=677, b=366, size=4) 'if ('
Match(a=681, b=372, size=1) 'i'
Match(a=684, b=377, size=1) 'i'
Match(a=690, b=379, size=8) " == 's')"
Match(a=701, b=387, size=10) '\n         '
Match(a=711, b=399, size=1) 'i'
Match(a=714, b=404, size=1) 'i'
Match(a=720, b=406, size=14) " = '$';\n      "
Match(a=763, b=420, size=1) '}'
Match(a=822, b=421, size=12) '\n   cout << '
Match(a=837, b=436, size=26) ' << endl;\n\n   return 0;\n}\n'
Match(a=863, b=462, size=0) ''

and then

>>> total
373
>>> 2 * total / (len(a) + len(b))
0.5630188679245283
>>> sm.ratio()
0.5630188679245283

give identical results.
msg401916 - (view) Author: Nabeel Alzahrani (nalza001) Date: 2021-09-16 04:33
Here are the steps that I used to calculate 0.2 for the last example:

I used class difflib.HtmlDiff to find the number of changed chars (addedChars, deletedChars, and changedChars) which is 1172 (let us call it delta)

The size of both strings a and b in this example is 1470

I calculated the similality ratio using 1-(delta/totalSize) = 1-(1172/1470)=0.2

I am assuming both classes difflib.SequenceMatcher and difflib.HtmlDiff are both using the same algorithms and arguments and if so they should produce the same ratio. Is that right?
msg401920 - (view) Author: Tim Peters (tim.peters) * (Python committer) Date: 2021-09-16 05:02
Please stop re-opening this. The issue tracker is not a "help desk", and your confusions aren't necessarily Python bugs ;-) If you post something that looks like an actual bug, I'll re-open the report.

SequenceMatcher works on sequences.

HtmlFiff works on sequences OF sequences (typically lists of lines). Very different things. For example,

h = difflib.HtmlDiff()
h.make_file(['aaabbbbbbbbb'], ['aaacccccccc'])

finds nothing at all in common. It finds that the two lines don't match, and then finds that the lines aren't "similar enough" to bother looking any deeper. But, obviously, they do share 'aaa' as a common prefix, and calling SequenceMatcher directly on the two strings will find their common prefix.

There's no reason to imagine they'll produce the same results - they're not doing the same things. SequenceMatcher is used, in several places, as a building block to do the more complicated things HtmlDiff does. But HtmlDiff works on _lines_ first; SequenceMatcher has no concept of "line".

As to 1-(delta/totalSize), I have no idea where that came from. What SequenceMatcher.ratio() returns is documented:

"""
Where T is the total number of elements in both sequences, and M is the number of matches, this is 2.0*M / T. Note that this is 1.0 if the sequences are identical, and 0.0 if they have nothing in common.
"""
History
Date User Action Args
2022-04-11 14:59:49adminsetgithub: 89343
2021-09-16 05:02:23tim.peterssetstatus: open -> closed

messages: + msg401920
2021-09-16 04:33:04nalza001setstatus: closed -> open

messages: + msg401916
2021-09-16 03:46:15tim.peterssetstatus: open -> closed
resolution: not a bug
messages: + msg401915
2021-09-16 03:44:06nalza001setstatus: closed -> open
resolution: not a bug -> (no value)
2021-09-16 02:03:58nalza001setmessages: + msg401912
2021-09-13 04:10:47tim.peterssetstatus: open -> closed

nosy: + tim.peters
messages: + msg401685

resolution: not a bug
stage: resolved
2021-09-13 04:02:54nalza001create