--- difflib.py Tue Jul 21 02:09:16 2015 +++ difflib_improved.py Thu Aug 20 17:07:54 2015 @@ -32,6 +32,7 @@ from heapq import nlargest as _nlargest from collections import namedtuple as _namedtuple +from timeit import default_timer as _default_timer Match = _namedtuple('Match', 'a b size') @@ -117,7 +118,7 @@ Methods: - __init__(isjunk=None, a='', b='') + __init__(isjunk=None, a='', b='', autojunk=True, timeout=0, fallthrough=0) Construct a SequenceMatcher. set_seqs(a, b) @@ -139,7 +140,7 @@ Return list of 5-tuples describing how to turn a into b. ratio() - Return a measure of the sequences' similarity (float in [0,1]). + Return a measure of the sequences' similarity (float in [0,1] or -1 on timeout). quick_ratio() Return an upper bound on .ratio() relatively quickly. @@ -148,7 +149,7 @@ Return an upper bound on ratio() very quickly. """ - def __init__(self, isjunk=None, a='', b='', autojunk=True): + def __init__(self, isjunk=None, a='', b='', autojunk=True, timeout=0, fallthrough=0): """Construct a SequenceMatcher. Optional arg isjunk is None (the default), or a one-argument @@ -170,6 +171,19 @@ Optional arg autojunk should be set to False to disable the "automatic junk heuristic" that treats popular elements as junk (see module documentation for more information). + + Optional arg timeout should be set to the number of seconds to wait + for ratio() or quick_ratio() to complete. If timeout seconds are + exceeded, -1 is returned for the ratio unless fallthrough is set. This value + can be a float. + + Optional arg fallthrough should be set to 1 or 2. If ratio() is + called and fallthrough is set to 1, quick_ratio() will be + subsequently called when a timeout occurs. real_quick_ratio() + will be called if fallthrough is set to 2 and another timeout + occurs. If quick_ratio() is initially called and fallthrough + is set to 1 or 2, real_quick_ratio() will be called when a + timeout occurs. """ # Members: @@ -206,11 +220,21 @@ # the items in b for which isjunk is True. 
# bpopular # nonjunk items in b treated as junk by the heuristic (if used). + # timeout + # number of seconds to wait for ratio() or quick_ratio() to complete + # fallthrough + # when ratio() or quick_ratio() times out, fall through to next + # 1 or 2 comparison methods: quick_ratio() and then real_quick_ratio() + # abort_ratio + # used to keep track of a timeout occurrence in ratio() self.isjunk = isjunk self.a = self.b = None self.autojunk = autojunk self.set_seqs(a, b) + self.timeout = timeout + self.fallthrough = fallthrough + self.abort_ratio = False def set_seqs(self, a, b): """Set the two sequences to be compared. @@ -396,7 +420,11 @@ # junk-free match ending with a[i-1] and b[j] j2len = {} nothing = [] + start_time = _default_timer() for i in range(alo, ahi): + if self.timeout and (_default_timer() - start_time) > self.timeout: + self.abort_ratio = True + return (alo, blo, 0) # look at all instances of a[i] in b; note that because # b2j has no junk keys, the loop is skipped if a[i] is junk j2lenget = j2len.get @@ -474,9 +502,13 @@ # at the end. queue = [(0, la, 0, lb)] matching_blocks = [] + start_time = _default_timer() while queue: alo, ahi, blo, bhi = queue.pop() i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi) + if self.timeout and (_default_timer() - start_time) > self.timeout: + self.abort_ratio = True + return [] # a[alo:i] vs b[blo:j] unknown # a[i:i+k] same as b[j:j+k] # a[i+k:ahi] vs b[j+k:bhi] unknown @@ -642,6 +674,12 @@ """ matches = sum(triple[-1] for triple in self.get_matching_blocks()) + if self.abort_ratio: + if self.fallthrough > 0: + self.fallthrough -= 1 + return self.quick_ratio() + else: + return -1 return _calculate_ratio(matches, len(self.a) + len(self.b)) def quick_ratio(self): @@ -663,7 +701,13 @@ # number of times we've seen it in 'a' so far ... 
kinda avail = {} availhas, matches = avail.__contains__, 0 + start_time = _default_timer() for elt in self.a: + if self.timeout and (_default_timer() - start_time) > self.timeout: + if self.fallthrough > 0: + return self.real_quick_ratio() + else: + return -1 if availhas(elt): numb = avail[elt] else: