"""Count lower-cased first names taken from the fifth CSV field of each line.

Each input line is split on ',' and field index 4 is treated as a name.  It is
stripped of punctuation/digits/whitespace, camelCase is broken apart, the first
token is kept and lower-cased, and consonant-only (or empty) tokens are
dropped.  Run as a script, it tallies the names in ``filename`` into
``namecounts`` and prints the elapsed wall-clock time in seconds.
"""
from codecs import open as copen
#from io import open as copen  # alternative reader; timings for both are below
from collections import defaultdict
from re import match, split, sub
from time import time

# Input is a UTF-8 text file (placeholder name: 'big-utf-8-file.txt'),
# approx. 600 MB.
filename = '/media/Seagate Expansion Drive/twt/crawled/tweets_2013-04-07.txt'

# Characters stripped from both ends of the raw name field.
trash = '-[](){}\\/`\'"0123456789~!@#$%^&*_+=;:|,.<>?\r\n\t '
# Character class matching any single ``trash`` character.  The first eight
# characters need regex escaping, so they are spelled out by hand; the rest
# come from trash[8:], and '-' goes last so it is taken literally.
firstname = r'[\[\]\(\)\{\}\\' + trash[8:] + '-]'
consonants = r'[bcdfghjklmnpqrstvwxz]*$'  # *only* consonants

# Index of the comma-separated field that holds the name.
NAME_FIELD = 4

namecounts = defaultdict(int)


def first_name(line):
    """Return the normalised first name found in ``line``, or None.

    None is returned when the line has fewer than ``NAME_FIELD + 1``
    comma-separated fields, or when the resulting token is empty or made up
    solely of consonants.
    """
    fields = line.split(',')
    if len(fields) <= NAME_FIELD:
        # Short/blank line: skip it instead of raising IndexError mid-run.
        return None
    name = fields[NAME_FIELD].strip(trash)
    # Break camelCase apart ("JohnSmith" -> "John Smith") so that the split
    # below keeps only the leading word.
    name = sub(r'([a-z])([A-Z])', r'\1 \2', name)
    name = split(firstname, name, 1)[0].lower()
    # rejects non-western names I'm afraid.
    # The pattern also matches the empty string, so empty tokens are dropped.
    if match(consonants, name):
        return None
    return name


def count_names(lines, counts=None):
    """Tally the first names found in an iterable of text lines.

    ``counts`` is a mapping that supports ``counts[name] += 1`` for unseen
    names (e.g. ``defaultdict(int)``); a fresh one is created when it is
    omitted.  The mapping is updated in place and returned.
    """
    if counts is None:
        counts = defaultdict(int)
    for line in lines:
        name = first_name(line)
        if name is not None:
            counts[name] += 1
    return counts


def main():
    """Count the names in ``filename`` into ``namecounts`` and print the time."""
    t = time()
    with copen(filename, encoding='utf-8') as f:
        count_names(f, namecounts)
    # Parenthesised so the same line works on Python 2 and Python 3.
    print(time() - t)


if __name__ == '__main__':
    main()

# Ubuntu 10.04, 64-bit. Python 2.6.5. File is on a Seagate external HDD (USB).
# ------------------------
# using codecs: 14.1 sec.
# using io:     75.3 sec.
#