#!/usr/bin/python # -*- coding: utf-8 -*- import os, random, collections, time, re, gc, sys, pickle, zlib # Here are a couple of lookup arrays that I've made (256 elements, based # on chars/word lengths occurrence frequencies), could be useful to # someone attempting to generate random 'words' really quickly: _letters = [ 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 't', 't', 't', 't', 't', 't', 't', 't', 't', 't', 't', 't', 't', 't', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 'i', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'l', 'l', 'l', 'l', 'l', 'l', 'l', ']', ']', ']', ']', ']', ']', '[', '[', '[', '[', '[', 'u', 'u', 'u', 'u', 'u', 'd', 'd', 'd', 'd', 'd', 'm', 'm', 'm', 'm', 'm', 'g', 'g', 'g', 'g', 'g', 'p', 'p', 'p', 'p', 'p', 'f', 'f', 'f', 'f', 'y', 'y', 'y', 'y', 'k', 'k', 'k', 'w', 'w', 'w', '.', '.', '.', 'V', 'V', 'V', 'b', 'b', 'b', "'", "'", "'", 'v', 'v', '1', '1', '|', '|', '=', '=', 'A', 'A', '/', '/', '5', '5', '!', '!', '0', '0', 'S', 'S', '4'] letters = [_c.encode('latin1') for _c in _letters] lengths = [ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1, 1, 1, 1, 1, 1, 1, 1, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 0, 0, 0, 0, 0, 14, 14, 14, 15, 15, 15, 16, 16, 21, 22, 20, 17] _WORDS_CACHE = "dcbench.bin" def _create_words(N): random.seed(1234) words = [ b''.join(random.choice(letters) for i in range(random.choice(lengths))) for j in range(N) ] return words def benchmark_dict(d, N, words): assert len(words) >= N words_subset = words[:N] start = time.time() for w in words_subset: d[w] = d.get(w, 0) + 1 dt_insert = time.time() - start start = time.time() for w in words_subset: d[w] dt_lookup = time.time() - start #vm = re.findall("(VmPeak.*|VmSize.*)", open('/proc/%d/status' % os.getpid()).read()) dict_size = sys.getsizeof(d) print("%8d words (%8d keys), %d inserts/s, %d lookups/s, %d bytes/key (%.1fMB)" % (N, len(d), N / dt_insert, N / dt_lookup, dict_size / len(d), dict_size / 2**20)) def test_trie(N): d = collections.defaultdict(int) t = trie.trie() for i in xrange(N): word = ''.join([ random.choice(letters) for i in xrange(random.choice(lengths)) ]) d[word] = d.get(word, 0) + 1 t[word] = t.get(word, 0) + 1 print("len(t) =", len(t), "len(d) =", len(d)) if len(t) != len(d): print("Error: Lengths are different.") print("5 items from t:", [(word, t[word]) for word in t.keys()[:5]]) print("5 items from d:", [(word, d[word]) for word in d.keys()[:5]]) # positive test for word in t.keys(): if d[word] != t[word]: print (word, t[word], d[word]) for word in d.keys(): if d[word] != t[word]: print (word, t[word], d[word]) print("Positive test done.") # negative test for i in xrange(N/10): word = ''.join([ random.choice(letters) for i in xrange(random.choice(lengths)) ]) if d.has_key(word) and not t.has_key(word): print (word, 'in d and not in t') if t.has_key(word) and not d.has_key(word): print (word, 'in d and not in t') print("Negative test done.") if __name__ == "__main__": gc.disable() #sys.setcheckinterval(100000) max_N = 10**7 words = None try: f = open(_WORDS_CACHE, "rb") except IOError: pass else: print(">> loading data from cache") with f: words = pickle.loads(zlib.decompress(f.read())) if len(words) < max_N: words = None if not words: print(">> creating data") words = _create_words(max_N) with open(_WORDS_CACHE, "wb") as f: f.write(zlib.compress(pickle.dumps(words, -1))) #random.shuffle(words) #for exp in range(3, 6): #d = {} #benchmark_dict(d, 10**exp, words) #for n in [5, 10, 20, 50, 100]: #d = {} #benchmark_dict(d, n * 10**5, words) for n in [1, 2, 4, 8, 16, 32, 64, 128]: d = {} benchmark_dict(d, n * 10000, words) #from Bio import trie #d = trie.trie() #benchmark_dict(d, 1000000) #test_trie(1000000)