# -*- coding: utf-8 -*-
"""Micro-benchmark for codec decoders.

For every encoding named on the command line (or every entry of
``DEFAULT_ENCODINGS`` when none is given) the script times
``codecs.getdecoder(encoding)`` on long single-character byte strings and
prints one line per sample: the encoding, the expression that produced the
sample, and the throughput in bytes per microsecond.
"""
from __future__ import division, print_function, unicode_literals
import sys
import timeit
import math  # NOTE(review): not referenced in the code shown here; kept as-is.


# Number of characters in every benchmarked sample.
n = 10000

# Encodings benchmarked when no encoding is given on the command line.
DEFAULT_ENCODINGS = (
    'ascii', 'latin1',
    'cp037', 'cp1006', 'cp1026', 'cp1140', 'cp1250', 'cp1251', 'cp1252',
    'cp1253', 'cp1254', 'cp1255', 'cp1256', 'cp1257', 'cp1258', 'cp424',
    'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855',
    'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863', 'cp864',
    'cp865', 'cp866', 'cp869', 'cp874', 'cp875', 'hp_roman8',
    'iso8859_10', 'iso8859_11', 'iso8859_13', 'iso8859_14', 'iso8859_15',
    'iso8859_16', 'iso8859_1', 'iso8859_2', 'iso8859_3', 'iso8859_4',
    'iso8859_5', 'iso8859_6', 'iso8859_7', 'iso8859_8', 'iso8859_9',
    'koi8_r', 'koi8_u', 'mac_arabic', 'mac_centeuro', 'mac_croatian',
    'mac_cyrillic', 'mac_farsi', 'mac_greek', 'mac_iceland', 'mac_latin2',
    'mac_romanian', 'mac_roman', 'mac_turkish', 'palmos', 'ptcp154',
    'tis_620',
)


def bench_decode(encoding, string, repeat=10, number=100):
    """Time decoding of one sample and print its throughput.

    Args:
        encoding: Codec name passed to ``str.encode`` and
            ``codecs.getdecoder``.
        string: Python *expression source* that evaluates to the text
            sample, e.g. ``"'A'*10000"``.  It is passed to ``eval``, so it
            must only ever come from trusted code.
        repeat: Number of timing rounds; the fastest round is reported.
        number: Decoder calls per timing round.

    Returns:
        None.  One line is printed to stdout (encoding, expression, bytes
        per microsecond).  Nothing is printed when the sample cannot be
        encoded with ``encoding``.

    Raises:
        AssertionError: if decoding the encoded sample does not give back
            the original text.
    """
    # SECURITY: eval() runs arbitrary code.  This script only feeds it
    # expressions it builds itself; never pass external input here.
    text = eval(string)
    try:
        x = text.encode(encoding)
    except UnicodeEncodeError:
        # The sample is not representable in this encoding: nothing to time.
        return
    # Sanity check that the codec round-trips before timing it.
    assert x.decode(encoding) == text

    # The timed statement runs in timeit's own namespace, so the decoder and
    # the payload are rebuilt there from their reprs.
    setup = '''
import codecs
d = codecs.getdecoder({0!r})
x = {1!r}
'''.format(encoding, x)
    r = timeit.repeat('d(x)', setup, repeat=repeat, number=number)
    best = min(r)
    usec = best * 1e6 / number
    # A timer too coarse to measure the run yields 0; report inf rather than
    # dying with ZeroDivisionError.
    rate = len(x) / usec if usec else float('inf')
    print("%-8s %-30s %.0f" % (encoding, string.replace("u'", "'"), rate))
    sys.stdout.flush()


def _sample_chars(encoding):
    """Return the characters to benchmark for *encoding*.

    The list always starts with ``'A'``.  The 256 single-byte values are
    decoded with ``errors='replace'`` and, in byte order, the first character
    in U+0080..U+00FF (if any) and then the first character in
    U+0100..U+FFFF other than U+FFFE (if any) are appended.
    """
    chars = ['A']
    s = bytes(range(256)).decode(encoding, 'replace')
    high = next((ch for ch in s if 0x80 <= ord(ch) < 0x100), None)
    if high is not None:
        chars.append(high)
    # NOTE(review): U+FFFE is skipped, presumably because it is a
    # noncharacter used as an "undefined" marker -- confirm.
    wide = next(
        (ch for ch in s if 0x100 <= ord(ch) < 0x10000 and ch != '\uFFFE'),
        None)
    if wide is not None:
        chars.append(wide)
    return chars


def main(argv=None):
    """Benchmark every requested encoding.

    Args:
        argv: Encoding names to benchmark.  ``None`` means ``sys.argv[1:]``;
            an empty sequence selects ``DEFAULT_ENCODINGS``.
    """
    encodings = sys.argv[1:] if argv is None else list(argv)
    if not encodings:
        encodings = DEFAULT_ENCODINGS

    for encoding in encodings:
        chars = _sample_chars(encoding)
        # ``i`` is only needed by the disabled mixed-character variants below.
        for i, ch1 in enumerate(chars):
            bench_decode(encoding, '%s*%d' % (ascii(ch1), n))
#            for ch2 in chars[:i]:
#                bench_decode(encoding, ' %s+%s*%d' % (ascii(ch1), ascii(ch2), n - 1))
#            for ch2 in chars[i + 1:]:
#                bench_decode(encoding, ' %s*%d+%s' % (ascii(ch1), n - 1, ascii(ch2)))
        # Blank line between the result groups of consecutive encodings.
        print()


if __name__ == "__main__":
    main()