#!/usr/bin/python
# -*- coding: utf-8  -*-
import os, random, collections, time, re, gc, sys, pickle, zlib

# Here are a couple of lookup arrays that I've made (256 elements, based
# on chars/word lengths occurrence frequencies), could be useful to
# someone attempting to generate random 'words' really quickly:

_letters = [
'e',  'e',  'e',  'e',  'e',  'e',  'e',  'e',  'e',  'e',  'e',
'e',  'e',  'e',  'e',  'e',  'e',  'e',  'e',  'e',  'e',
't',  't',  't',  't',  't',  't',  't',  't',  't',  't',  't',
't',  't',  't',
'o',  'o',  'o',  'o',  'o',  'o',  'o',  'o',  'o',  'o',  'o',
'o',  'o',
'a',  'a',  'a',  'a',  'a',  'a',  'a',  'a',  'a',  'a',  'a',
'a',  'a',
'r',  'r',  'r',  'r',  'r',  'r',  'r',  'r',  'r',  'r',  'r',  'r',
'i',  'i',  'i',  'i',  'i',  'i',  'i',  'i',  'i',  'i',  'i',  'i',
's',  's',  's',  's',  's',  's',  's',  's',  's',  's',  's',
'C',  'C',  'C',  'C',  'C',  'C',  'C',  'C',  'C',  'C',  'C',
'F',  'F',  'F',  'F',  'F',  'F',  'F',  'F',  'F',  'F',  'F',
'n',  'n',  'n',  'n',  'n',  'n',  'n',  'n',  'n',  'n',  'n',
'K',  'K',  'K',  'K',  'K',  'K',  'K',  'K',  'K',  'K',  'K',
'U',  'U',  'U',  'U',  'U',  'U',  'U',  'U',  'U',  'U',
'c',  'c',  'c',  'c',  'c',  'c',  'c',  'c',
'h',  'h',  'h',  'h',  'h',  'h',  'h',  'h',
'l',  'l',  'l',  'l',  'l',  'l',  'l',
']',  ']',  ']',  ']',  ']',  ']',
'[',  '[',  '[',  '[',  '[',
'u',  'u',  'u',  'u',  'u',
'd',  'd',  'd',  'd',  'd',
'm',  'm',  'm',  'm',  'm',
'g',  'g',  'g',  'g',  'g',
'p',  'p',  'p',  'p',  'p',
'f',  'f',  'f',  'f',
'y',  'y',  'y',  'y',
'k',  'k',  'k',
'w',  'w',  'w',
'.',  '.',  '.',
'V',  'V',  'V',
'b',  'b',  'b',
"'",  "'",  "'",
'v',  'v',
'1',  '1',
'|',  '|',
'=',  '=',
'A',  'A',
'/',  '/',
'5',  '5',
'!',  '!',
'0',  '0',
'S',  'S',
'4']

letters = [_c.encode('latin1') for _c in _letters]

lengths = [
4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
7,  7,  7,  7,  7,  7,  7,  7,  7,
5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
5,  5,  5,  5,  5,  5,  5,  5,
6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
6,  6,  6,  6,  6,  6,  6,
8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
8,  8,  8,  8,
9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,
1,  1,  1,  1,  1,  1,  1,  1,
11,  11,  11,  11,  11,  11,  11,  11,
12,  12,  12,  12,  12,  12,  12,
13,  13,  13,  13,  13,
0,  0,  0,  0,  0,
14,  14,  14,
15,  15,  15,
16,  16,
21,
22,
20,
17]


_WORDS_CACHE = "dcbench.bin"

def _create_words(N):
    random.seed(1234)
    words = [
        b''.join(random.choice(letters)
                 for i in range(random.choice(lengths)))
        for j in range(N)
        ]
    return words

def benchmark_dict(d, N, words):
    assert len(words) >= N
    words_subset = words[:N]

    start = time.time()
    for w in words_subset:
        d[w] = d.get(w, 0) + 1
    dt_insert = time.time() - start
    start = time.time()
    for w in words_subset:
        d[w]
    dt_lookup = time.time() - start

    #vm = re.findall("(VmPeak.*|VmSize.*)", open('/proc/%d/status' % os.getpid()).read())
    dict_size = sys.getsizeof(d)
    print("%8d words (%8d keys), %d inserts/s, %d lookups/s, %d bytes/key (%.1fMB)"
          % (N, len(d), N / dt_insert, N / dt_lookup, dict_size / len(d), dict_size / 2**20))

def test_trie(N):
    d = collections.defaultdict(int)
    t = trie.trie()
    for i in xrange(N):
        word = ''.join([ random.choice(letters) for i in xrange(random.choice(lengths)) ])
        d[word] = d.get(word, 0) + 1
        t[word] = t.get(word, 0) + 1

    print("len(t) =", len(t), "len(d) =", len(d))
    if len(t) != len(d): print("Error: Lengths are different.")

    print("5 items from t:", [(word, t[word]) for word in t.keys()[:5]])
    print("5 items from d:", [(word, d[word]) for word in d.keys()[:5]])

    # positive test
    for word in t.keys():
        if d[word] != t[word]: print (word, t[word], d[word])
    for word in d.keys():
        if d[word] != t[word]: print (word, t[word], d[word])
    print("Positive test done.")

    # negative test
    for i in xrange(N/10):
        word = ''.join([ random.choice(letters) for i in xrange(random.choice(lengths)) ])
        if d.has_key(word) and not t.has_key(word): print (word, 'in d and not in t')
        if t.has_key(word) and not d.has_key(word): print (word, 'in d and not in t')
    print("Negative test done.")


if __name__ == "__main__":
    gc.disable()
    #sys.setcheckinterval(100000)

    max_N = 10**7

    words = None
    try:
        f = open(_WORDS_CACHE, "rb")
    except IOError:
        pass
    else:
        print(">> loading data from cache")
        with f:
            words = pickle.loads(zlib.decompress(f.read()))
        if len(words) < max_N:
            words = None
    
    if not words:
        print(">> creating data")
        words = _create_words(max_N)
        with open(_WORDS_CACHE, "wb") as f:
            f.write(zlib.compress(pickle.dumps(words, -1)))

    #random.shuffle(words)
    #for exp in range(3, 6):
        #d = {}
        #benchmark_dict(d, 10**exp, words)
    #for n in [5, 10, 20, 50, 100]:
        #d = {}
        #benchmark_dict(d, n * 10**5, words)

    for n in [1, 2, 4, 8, 16, 32, 64, 128]:
        d = {}
        benchmark_dict(d, n * 10000, words)

    #from Bio import trie
    #d = trie.trie()
    #benchmark_dict(d, 1000000)
    #test_trie(1000000)