#!/usr/bin/python # -*- coding: utf-8 -*- import os, random, collections, time, re, gc, sys # Here are a couple of lookup arrays that I've made (256 elements, based # on chars/word lengths occurrence frequencies), could be useful to # someone attempting to generate random 'words' really quickly: letters = [ 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 't', 't', 't', 't', 't', 't', 't', 't', 't', 't', 't', 't', 't', 't', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'l', 'l', 'l', 'l', 'l', 'l', 'l', ']', ']', ']', ']', ']', ']', '[', '[', '[', '[', '[', 'u', 'u', 'u', 'u', 'u', 'd', 'd', 'd', 'd', 'd', 'm', 'm', 'm', 'm', 'm', 'g', 'g', 'g', 'g', 'g', 'p', 'p', 'p', 'p', 'p', 'f', 'f', 'f', 'f', 'y', 'y', 'y', 'y', 'k', 'k', 'k', 'w', 'w', 'w', '.', '.', '.', 'V', 'V', 'V', 'b', 'b', 'b', "'", "'", "'", 'v', 'v', '1', '1', '|', '|', '=', '=', 'A', 'A', '/', '/', '5', '5', '!', '!', '0', '0', 'S', 'S', '4'] lengths = [ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1, 1, 1, 1, 1, 1, 1, 1, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 0, 0, 0, 0, 0, 14, 14, 14, 15, 15, 15, 16, 16, 21, 22, 20, 17] def benchmark_dict(d, N): start = time.time() for i in xrange(N): word = ''.join([ random.choice(letters) for i in xrange(random.choice(lengths)) ]) # word = unicode( word, "utf-8" ).encode('utf-8') d[word] = d.get(word, 0) + 1 dt = time.time() - start vm = re.findall("(VmPeak.*|VmSize.*)", open('/proc/%d/status' % os.getpid()).read()) print "%d keys (%d unique), %s, %f seconds, %f keys per second" % (N, len(d), vm, dt, N / dt) def test_trie(N): d = collections.defaultdict(int) t = trie.trie() for i in xrange(N): word = ''.join([ random.choice(letters) for i in xrange(random.choice(lengths)) ]) d[word] = d.get(word, 0) + 1 t[word] = t.get(word, 0) + 1 print "len(t) =", len(t), "len(d) =", len(d) if len(t) != len(d): print "Error: Lengths are different." print "5 items from t:", [(word, t[word]) for word in t.keys()[:5]] print "5 items from d:", [(word, d[word]) for word in d.keys()[:5]] # positive test for word in t.keys(): if d[word] != t[word]: print word, t[word], d[word] for word in d.keys(): if d[word] != t[word]: print word, t[word], d[word] print "Positive test done." # negative test for i in xrange(N/10): word = ''.join([ random.choice(letters) for i in xrange(random.choice(lengths)) ]) if d.has_key(word) and not t.has_key(word): print word, 'in d and not in t' if t.has_key(word) and not d.has_key(word): print word, 'in d and not in t' print "Negative test done." if __name__ == "__main__": #gc.disable() #sys.setcheckinterval(100000) d = collections.defaultdict(int) benchmark_dict(d, 1000000) from Bio import trie d = trie.trie() benchmark_dict(d, 1000000) test_trie(1000000)