#!/usr/bin/python
# -*- coding: utf-8  -*-
import os, random, collections, time, re, gc, sys

# Here are a couple of lookup arrays that I've made (256 elements, based
# on chars/word lengths occurrence frequencies), could be useful to
# someone attempting to generate random 'words' really quickly:

letters = [
'e',  'e',  'e',  'e',  'e',  'e',  'e',  'e',  'e',  'e',  'e',
'e',  'e',  'e',  'e',  'e',  'e',  'e',  'e',  'e',  'e',
't',  't',  't',  't',  't',  't',  't',  't',  't',  't',  't',
't',  't',  't',
'o',  'o',  'o',  'o',  'o',  'o',  'o',  'o',  'o',  'o',  'o',
'o',  'o',
'a',  'a',  'a',  'a',  'a',  'a',  'a',  'a',  'a',  'a',  'a',
'a',  'a',
'r',  'r',  'r',  'r',  'r',  'r',  'r',  'r',  'r',  'r',  'r',  'r',
'i',  'i',  'i',  'i',  'i',  'i',  'i',  'i',  'i',  'i',  'i',  'i',
's',  's',  's',  's',  's',  's',  's',  's',  's',  's',  's',
'C',  'C',  'C',  'C',  'C',  'C',  'C',  'C',  'C',  'C',  'C',
'F',  'F',  'F',  'F',  'F',  'F',  'F',  'F',  'F',  'F',  'F',
'n',  'n',  'n',  'n',  'n',  'n',  'n',  'n',  'n',  'n',  'n',
'K',  'K',  'K',  'K',  'K',  'K',  'K',  'K',  'K',  'K',  'K',
'U',  'U',  'U',  'U',  'U',  'U',  'U',  'U',  'U',  'U',
'c',  'c',  'c',  'c',  'c',  'c',  'c',  'c',
'h',  'h',  'h',  'h',  'h',  'h',  'h',  'h',
'l',  'l',  'l',  'l',  'l',  'l',  'l',
']',  ']',  ']',  ']',  ']',  ']',
'[',  '[',  '[',  '[',  '[',
'u',  'u',  'u',  'u',  'u',
'd',  'd',  'd',  'd',  'd',
'm',  'm',  'm',  'm',  'm',
'g',  'g',  'g',  'g',  'g',
'p',  'p',  'p',  'p',  'p',
'f',  'f',  'f',  'f',
'y',  'y',  'y',  'y',
'k',  'k',  'k',
'w',  'w',  'w',
'.',  '.',  '.',
'V',  'V',  'V',
'b',  'b',  'b',
"'",  "'",  "'",
'v',  'v',
'1',  '1',
'|',  '|',
'=',  '=',
'A',  'A',
'/',  '/',
'5',  '5',
'!',  '!',
'0',  '0',
'S',  'S',
'4']

lengths = [
4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
7,  7,  7,  7,  7,  7,  7,  7,  7,
5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
5,  5,  5,  5,  5,  5,  5,  5,
6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
6,  6,  6,  6,  6,  6,  6,
8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
8,  8,  8,  8,
9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,
1,  1,  1,  1,  1,  1,  1,  1,
11,  11,  11,  11,  11,  11,  11,  11,
12,  12,  12,  12,  12,  12,  12,
13,  13,  13,  13,  13,
0,  0,  0,  0,  0,
14,  14,  14,
15,  15,  15,
16,  16,
21,
22,
20,
17]


def benchmark_dict(d, N):
    start = time.time()
    for i in xrange(N):
        word = ''.join([ random.choice(letters) for i in xrange(random.choice(lengths)) ])
        # word = unicode( word, "utf-8" ).encode('utf-8')
        d[word] = d.get(word, 0) + 1

    dt = time.time() - start
    vm = re.findall("(VmPeak.*|VmSize.*)", open('/proc/%d/status' % os.getpid()).read())
    print "%d keys (%d unique), %s, %f seconds, %f keys per second" % (N, len(d), vm, dt, N / dt)

def test_trie(N):
    d = collections.defaultdict(int)
    t = trie.trie()
    for i in xrange(N):
        word = ''.join([ random.choice(letters) for i in xrange(random.choice(lengths)) ])
        d[word] = d.get(word, 0) + 1
        t[word] = t.get(word, 0) + 1

    print "len(t) =", len(t), "len(d) =", len(d)
    if len(t) != len(d): print "Error: Lengths are different."

    print "5 items from t:", [(word, t[word]) for word in t.keys()[:5]]
    print "5 items from d:", [(word, d[word]) for word in d.keys()[:5]]

    # positive test
    for word in t.keys():
        if d[word] != t[word]: print word, t[word], d[word]
    for word in d.keys():
        if d[word] != t[word]: print word, t[word], d[word]
    print "Positive test done."

    # negative test
    for i in xrange(N/10):
        word = ''.join([ random.choice(letters) for i in xrange(random.choice(lengths)) ])
        if d.has_key(word) and not t.has_key(word): print word, 'in d and not in t'
        if t.has_key(word) and not d.has_key(word): print word, 'in d and not in t'
    print "Negative test done."


if __name__ == "__main__":
    #gc.disable()
    #sys.setcheckinterval(100000)

    d = collections.defaultdict(int)
    benchmark_dict(d, 1000000)

    from Bio import trie
    d = trie.trie()
    benchmark_dict(d, 1000000)
    test_trie(1000000)