#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Benchmark several ways of computing the display width of a unicode string.

Every ``ucs2w_NN`` function takes a unicode string and returns an integer
width.  The variants fall into four families:

* ``ucs2w_00``      -- plain ``len()``, used as the baseline.
* ``ucs2w_11..19``  -- exact: characters whose ``east_asian_width`` is
  'W' or 'F' count as 2, every other character counts as 1.
* ``ucs2w_21..23``  -- approximation: ``2 * len`` as soon as any 'W'/'F'
  character is present, so the result is never too small.
* ``ucs2w_31..33``  -- approximation: only a few sample positions are
  inspected, so the result may be too small.

The one-line docstrings of the ``ucs2w_NN`` functions are printed by
``test_time`` as row labels; keep them short and on a single line.

All ``print(...)`` calls take exactly one argument so that they behave the
same whether ``print`` is a statement or a function.
"""

import time
from functools import reduce
from itertools import groupby, repeat
from unicodedata import east_asian_width

try:
    from ucs2w import unichr2w, ucs2w
except ImportError:
    # The C extension is optional: without it ucs2w_17..19 cannot be called
    # and main() leaves them out of the benchmark.
    unichr2w = None
    ucs2w = None

ucs_en = u'test string in english '
ucs_sp = u'cadena en español óóÄ€ '
ucs_cn = u'广告计划 广告计划 广告计划 广告计划 广告计划 '
ucs_jp = u'広告プログラム 広告プログラム 広告プログラム '

NUM = 10000

# east_asian_width() classes that are counted as two columns.  (An earlier
# revision of the functions below only tested for 'W'.)
WIDEMODES = ('W', 'F')
# Width assigned to each east_asian_width() class.
WIDETBL = {'W': 2, 'F': 2, 'N': 1, 'Na': 1, 'H': 1, 'A': 1}


# len

def ucs2w_00(ucs):
    """pure len()"""
    return len(ucs)


# exact functions

def ucs2w_11(ucs):
    """sum of every char, list comprehension"""
    return sum([2 if east_asian_width(c) in WIDEMODES else 1 for c in ucs])


def ucs2w_12(ucs):
    """sum of every char, for loop"""
    total = 0
    for c in ucs:
        total += 2 if east_asian_width(c) in WIDEMODES else 1
    return total


def ucs2w_13(ucs):
    """sum of every char, for loop & if-else"""
    total = 0
    for c in ucs:
        if east_asian_width(c) in WIDEMODES:
            total += 2
        else:
            total += 1
    return total


def ucs2w_14(ucs):
    """sum of every char, map & list comp..."""
    return sum([WIDETBL[e] for e in map(east_asian_width, ucs)])


def ucs2w_15(ucs):
    """sum of every char, map & for loop"""
    total = 0
    for e in map(east_asian_width, ucs):
        total += WIDETBL[e]
    return total


def ucs2w_16(ucs):
    """sum of every char, map & reduce"""
    # The initial value 0 makes the empty string yield 0 instead of raising
    # TypeError from reduce().
    return reduce(lambda x, y: x + y,
                  map(lambda c: WIDETBL[east_asian_width(c)], ucs),
                  0)


def ucs2w_17(ucs):
    """sum every char, c extension"""
    # Requires the optional ucs2w extension (unichr2w); list-comprehension
    # variant.
    return sum([unichr2w(ord(c)) for c in ucs])


def ucs2w_18(ucs):
    """sum every char, c extension"""
    # Requires the optional ucs2w extension (unichr2w); for-loop variant.
    total = 0
    for c in ucs:
        total += unichr2w(ord(c))
    return total


def ucs2w_19(ucs):
    """whole string, c extension"""
    # Requires the optional ucs2w extension (ucs2w).
    return ucs2w(ucs)


# approximations, no false negatives

def ucs2w_21(ucs):
    """check until first wide char is found"""
    for c in ucs:
        if east_asian_width(c) in WIDEMODES:
            return 2 * len(ucs)
    # No wide character anywhere (or empty string).
    return len(ucs)


def ucs2w_22(ucs):
    """check all chars, map"""
    # Materialise the result: a lazy map() would be exhausted by the first
    # membership test and the 'F' test would then always fail.
    widths = list(map(east_asian_width, ucs))
    if 'W' in widths or 'F' in widths:
        return 2 * len(ucs)
    return len(ucs)


def ucs2w_23(ucs):
    """check all chars, itertools.groupby"""
    # groupby() collapses runs of characters of the same width class, so
    # the list holds one entry per run rather than one per character.
    classes = [k for k, _ in groupby(ucs, east_asian_width)]
    if 'W' in classes or 'F' in classes:
        return 2 * len(ucs)
    return len(ucs)


# approximations, not perfect

def ucs2w_31(ucs):
    """check only first char"""
    if not ucs:
        return 0  # nothing to sample; avoids IndexError on ucs[0]
    if east_asian_width(ucs[0]) in WIDEMODES:
        return 2 * len(ucs)
    return len(ucs)


def ucs2w_32(ucs):
    """check first and last chars"""
    if not ucs:
        return 0  # nothing to sample; avoids IndexError on ucs[0]
    if (east_asian_width(ucs[0]) in WIDEMODES or
            east_asian_width(ucs[-1]) in WIDEMODES):
        return 2 * len(ucs)
    return len(ucs)


def ucs2w_33(ucs):
    """check first, middle, and last chars"""
    if not ucs:
        return 0  # nothing to sample; avoids IndexError on ucs[0]
    mid = len(ucs) // 2  # floor division: the result is used as an index
    if (east_asian_width(ucs[0]) in WIDEMODES or
            east_asian_width(ucs[mid]) in WIDEMODES or
            east_asian_width(ucs[-1]) in WIDEMODES):
        return 2 * len(ucs)
    return len(ucs)


# run tests

def test_time(fn, ucs, num=NUM):
    """Time ``num`` calls of ``fn(ucs)`` and print one result row.

    ``fn`` is called once outside the timed region to obtain the width that
    is reported, then ``num`` more times between the two clock readings.
    The row shows the function name, its docstring (padded to 38 columns),
    ``len(ucs)``, the width returned by ``fn`` and the elapsed seconds.
    Returns None.
    """
    width = fn(ucs)
    t0 = time.time()
    # repeat(None, num) iterates num times without building a list.
    for _ in repeat(None, num):
        fn(ucs)
    t1 = time.time()
    print(' %s: %s %d %d %.5f secs' %
          (fn.__name__, (fn.__doc__ or '').ljust(38), len(ucs), width,
           t1 - t0))


# Benchmark helper, not a unit test: keep pytest/nose from collecting it
# because of its ``test_`` prefix.
test_time.__test__ = False


def main():
    """Run every available variant against each sample string."""
    benchmarks = [ucs2w_00,
                  ucs2w_11, ucs2w_12, ucs2w_13, ucs2w_14, ucs2w_15, ucs2w_16]
    if unichr2w is not None and ucs2w is not None:
        benchmarks += [ucs2w_17, ucs2w_18, ucs2w_19]
    else:
        print('(ucs2w C extension not available: skipping ucs2w_17..19)')
    benchmarks += [ucs2w_21, ucs2w_22, ucs2w_23,
                   ucs2w_31, ucs2w_32, ucs2w_33]

    for ucs in (ucs_en, ucs_sp, ucs_cn, ucs_jp):
        print('-> "%s" - %d iterations' % (ucs, NUM))
        for fn in benchmarks:
            test_time(fn, ucs)
        print('')


if __name__ == '__main__':
    main()