from codecs      import open as copen
#from io          import open as copen
from collections import defaultdict
from re          import match, split, sub
from time        import time


# Input file (approx. 600 MB).  The second assignment overrides the first,
# so only the full path below is in effect.
filename = 'big-utf-8-file.txt'
filename = '/media/Seagate Expansion Drive/twt/crawled/tweets_2013-04-07.txt'

# Characters stripped from both ends of the raw name field.
trash = (
    '-[](){}\\'                # dash, brackets, backslash (indices 0-7)
    '/`\'"'                    # slash and quote characters
    '0123456789'
    '~!@#$%^&*_+=;:|,.<>?'
    '\r\n\t '
)

# Regex character class used to cut a name at its first separator.  The
# first eight characters of `trash` are written out escaped by hand, the
# remainder is taken verbatim, and the dash goes last so it stays literal.
firstname = ''.join([r'[\[\]\(\)\{\}\\', trash[8:], '-]'])

# Matches strings consisting *only* of consonants (the empty string
# included).  'y' is not in the class, so it is not treated as a consonant.
consonants = r'[bcdfghjklmnpqrstvwxz]*$'

# Tally of occurrences per extracted first name; missing keys start at 0.
namecounts = defaultdict(int)


# Time the whole pass: read the file, extract a first name from the fifth
# comma-separated field of every line, and tally it in `namecounts`.
t = time()

with copen(filename, encoding='utf-8') as f:
    # Lazy generator pipeline: each stage pulls one item at a time from the
    # stage before it.
    fields = (line.split(',') for line in f)
    # Lines with fewer than five fields have no name column; skip them
    # rather than letting an IndexError abort the entire run.
    names = (row[4].strip(trash) for row in fields if len(row) > 4)
    # Insert a space at lower->upper case boundaries
    # ("JohnSmith" -> "John Smith").
    camel = (sub(r'([a-z])([A-Z])', r'\1 \2', name) for name in names)
    # Keep only the text before the first separator character.
    first = (split(firstname, name, 1)[0] for name in camel)
    small = (name.lower() for name in first)
    # Drop anything `consonants` matches (empty or consonant-only strings).
    # This also rejects non-western names, unfortunately.
    vowel = (name for name in small if not match(consonants, name))

    for name in vowel:
        namecounts[name] += 1

# Elapsed seconds.  The parenthesised form prints the same value under
# Python 2 and is also valid syntax under Python 3.
print(time() - t)
# Ubuntu 10.04, 64-bit. Python 2.6.5. File is on a Seagate external HDD (USB).
# ------------------------
# using codecs: 14.1 sec.
# using io:     75.3 sec.
#