import difflib UPPER_BOUNDS = [(1.0, 1.0), (1.1111111111111112, 0.94736842105263153), (1.125, 0.94117647058823528), (1.1428571428571428, 0.93333333333333335), (1.1666666666666667, 0.92307692307692313), (1.2, 0.90909090909090906), (1.2222222222222223, 0.90000000000000002), (1.25, 0.88888888888888884), (1.2857142857142858, 0.875), (1.3333333333333333, 0.8571428571428571), (1.375, 0.84210526315789469), (1.3999999999999999, 0.83333333333333337), (1.4285714285714286, 0.82352941176470584), (1.4444444444444444, 0.81818181818181823), (1.5, 0.80000000000000004), (1.5555555555555556, 0.78260869565217395), (1.5714285714285714, 0.77777777777777779), (1.6000000000000001, 0.76923076923076927), (1.625, 0.76190476190476186), (1.6666666666666667, 0.75), (1.7142857142857142, 0.73684210526315785), (1.75, 0.72727272727272729), (1.7777777777777777, 0.71999999999999997), (1.8, 0.7142857142857143), (1.8333333333333333, 0.70588235294117652), (1.8571428571428572, 0.69999999999999996), (1.875, 0.69565217391304346), (1.8888888888888888, 0.69230769230769229), (2.0, 0.66666666666666663), (2.1111111111111112, 0.6428571428571429), (2.125, 0.64000000000000001), (2.1428571428571428, 0.63636363636363635), (2.1666666666666665, 0.63157894736842102), (2.2000000000000002, 0.625), (2.2222222222222223, 0.62068965517241381), (2.25, 0.61538461538461542), (2.2857142857142856, 0.60869565217391308), (2.3333333333333335, 0.59999999999999998), (2.375, 0.59259259259259256), (2.3999999999999999, 0.58823529411764708), (2.4285714285714284, 0.58333333333333337), (2.4444444444444446, 0.58064516129032262), (2.5, 0.5714285714285714), (2.5555555555555554, 0.5625), (2.5714285714285716, 0.56000000000000005), (2.6000000000000001, 0.55555555555555558), (2.625, 0.55172413793103448), (2.6666666666666665, 0.54545454545454541), (2.7142857142857144, 0.53846153846153844), (2.75, 0.53333333333333333), (2.7777777777777777, 0.52941176470588236), (2.7999999999999998, 0.52631578947368418), (2.8333333333333335, 0.52173913043478259), (2.8571428571428572, 0.51851851851851849), (2.875, 0.5161290322580645), (2.8888888888888888, 0.51428571428571423), (3.0, 0.5), (3.125, 0.48484848484848486), (3.1428571428571428, 0.48275862068965519), (3.1666666666666665, 0.47999999999999998), (3.2000000000000002, 0.47619047619047616), (3.25, 0.47058823529411764), (3.2857142857142856, 0.46666666666666667), (3.3333333333333335, 0.46153846153846156), (3.3999999999999999, 0.45454545454545453), (3.4285714285714284, 0.45161290322580644), (3.5, 0.44444444444444442), (3.5714285714285716, 0.4375), (3.6000000000000001, 0.43478260869565216), (3.6666666666666665, 0.42857142857142855), (3.75, 0.42105263157894735), (3.7999999999999998, 0.41666666666666669), (3.8333333333333335, 0.41379310344827586), (4.0, 0.40000000000000002), (4.2000000000000002, 0.38461538461538464), (4.25, 0.38095238095238093), (4.333333333333333, 0.375), (4.4000000000000004, 0.37037037037037035), (4.5, 0.36363636363636365), (4.5999999999999996, 0.35714285714285715), (4.666666666666667, 0.35294117647058826), (4.75, 0.34782608695652173), (5.0, 0.33333333333333331), (5.25, 0.32000000000000001), (5.333333333333333, 0.31578947368421051), (5.5, 0.30769230769230771), (5.666666666666667, 0.29999999999999999), (6.0, 0.2857142857142857), (6.333333333333333, 0.27272727272727271), (6.5, 0.26666666666666666), (6.666666666666667, 0.2608695652173913), (7.0, 0.25), (7.5, 0.23529411764705882), (8.0, 0.22222222222222221), (8.5, 0.21052631578947367), (9.0, 0.20000000000000001), (9.5, 0.19047619047619047), (10.0, 0.18181818181818182), (11.0, 0.16666666666666666), (12.0, 0.15384615384615385), (13.0, 0.14285714285714285), (14.0, 0.13333333333333333), (15.0, 0.125), (16.0, 0.11764705882352941), (17.0, 0.1111111111111111), (18.0, 0.10526315789473684), (19.0, 0.10000000000000001)] def quick_ratio_ge(a_str, b_str, threshold): ''' Indicates if the strings to compare are enough "similar". This (optimized) function is equivalent to the expression: difflib.SequenceMatcher.quick_ratio(None, a_str, b_str) >= threshold @param a_str: A string object @param b_str: A string object @param threshold: Float value indicating the expected "quick_ratio". Must be 0 <= threshold <= 1.0 @return: A boolean value ''' #Do the job to raise TypeErrors iter(a_str) iter(b_str) threshold = float(threshold) if threshold < 0 or threshold > 1.0: raise TypeError('Parameter threshold is not in range 0 <= threshold <= 1.0') if threshold == 0: return True elif threshold == 1.0: return a_str == b_str if len(b_str) < len(a_str): a_str, b_str = b_str, a_str alen = len(a_str) blen = len(b_str) if blen == 0 or alen == 0: return alen == blen ratio = float(blen) / alen last_ratio, last_bound = UPPER_BOUNDS[-1] if threshold < last_bound: # Bad, we can't optimize anything here return difflib.SequenceMatcher(None, a_str, b_str).quick_ratio() >= threshold else: if last_ratio < ratio: # Good, we know the upper bound return False else: # We have to step through for size_ratio, bound in UPPER_BOUNDS: if size_ratio > ratio: # Bad: we have to do the quick_ratio return difflib.SequenceMatcher(None, a_str, b_str).quick_ratio() >= threshold elif bound < threshold: # Good: We found an upper bound return False def _generate_upper_bounds(left_max=40, right_max=30): ''' This function can be used to produce new upper bounds, but shouldn't be used in productive code. Simply run this command once and then hardcode the list. ''' import pprint UPPER_BOUNDS = set() UPPER_BOUNDS.add((1.0, 1.0)) def addToBounds(a, b): size = float(len(b)) / len(a) upper_bound = difflib.SequenceMatcher(None, a, b).quick_ratio() UPPER_BOUNDS.add((size, upper_bound)) for k in range(1, left_max): for i in range(1, right_max): if k == i == 1: continue atest = 'm' * k btest = 'm' * k + 'm' * (i - 1) addToBounds(atest, btest) UPPER_BOUNDS = list(UPPER_BOUNDS) UPPER_BOUNDS.sort(key=lambda x: x[0]) with open("upper_bounds.py", "w") as fp: fp.write("UPPER_BOUNDS = ") pprint.pprint(UPPER_BOUNDS, fp)