"""Benchmark unicode.encode() over several codecs and error handlers.

For every combination of sample string, encoding and error handler name
listed at the bottom of this file, the sample is repeated ``count`` times,
encoded, and one result line with the elapsed wall-clock time is printed.
"""

import codecs
import cStringIO as StringIO
import time

# Number of times each sample string is repeated before it is encoded.
count = 1000000


def test(uni, enc, err):
    """Time the encoding of ``uni * count`` and print one result line.

    Args:
        uni: Unicode sample string; it is repeated ``count`` times.
        enc: Name of the encoding passed to ``encode``.
        err: Name of the error handling scheme passed to ``encode``.

    The printed line holds the repr of ``uni``, ``enc``, ``err``, the
    elapsed seconds and a note:

    * empty   -- ``encode`` returned normally,
    * "(exc)" -- ``encode`` raised ``UnicodeError``,
    * "(emu)" -- ``encode`` raised another ``ValueError`` and the handler
      was emulated character by character through a stream writer
      (only for "xmlcharrefreplace" and "backslashreplace").

    Raises:
        ValueError: re-raised when ``encode`` fails with a ``ValueError``
            that is not a ``UnicodeError`` and ``err`` is not one of the
            two handlers that can be emulated.
    """
    luni = uni * count
    try:
        t1 = time.time()
        # The result is intentionally bound to a (otherwise unused) name so
        # that it is still referenced when t2 is taken, as in the original
        # measurement.
        encoded = luni.encode(enc, err)
        t2 = time.time()
        note = ""
    except UnicodeError:
        # UnicodeError is a subclass of ValueError, so this clause has to
        # come first. t1 is always set here because it is assigned before
        # the encode call.
        t2 = time.time()
        note = "(exc)"
    except ValueError:
        # NOTE(review): presumably reached on interpreters that do not know
        # the requested error handler name -- confirm against the targeted
        # Python versions.
        if err == "xmlcharrefreplace":
            # Restart the clock: only the emulation itself is measured.
            t1 = time.time()
            buf = StringIO.StringIO()
            writer = codecs.getwriter(enc)(buf)
            for c in luni:
                try:
                    writer.write(c)
                except UnicodeError:
                    # Fall back to a decimal XML character reference.
                    writer.write(u"&#%d;" % ord(c))
            encoded = buf.getvalue()
            t2 = time.time()
        elif err == "backslashreplace":
            t1 = time.time()
            buf = StringIO.StringIO()
            writer = codecs.getwriter(enc)(buf)
            for c in luni:
                try:
                    writer.write(c)
                except UnicodeError:
                    # Pick the shortest backslash escape that can hold the
                    # code point: \xNN, \uNNNN or \UNNNNNNNN.
                    code = ord(c)
                    if code <= 0xff:
                        writer.write(u"\\x%02x" % code)
                    elif code <= 0xffff:
                        writer.write(u"\\u%04x" % code)
                    else:
                        writer.write(u"\\U%08x" % code)
            encoded = buf.getvalue()
            t2 = time.time()
        else:
            # No emulation available for this handler: propagate the error.
            raise
        note = "(emu)"
    # A single parenthesised expression prints identically under the
    # Python 2 print statement.
    print("%-15r %-15s %-18s %7.03f %s" % (uni, enc, err, t2 - t1, note))


# Run the benchmark for every sample / encoding / error handler combination.
for uni in (u"x", u"\xe4", u"\u3042", u"a\xe4", u"a\xe4\u3042"):
    for enc in ("ascii", "latin-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"):
        for err in ("strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"):
            test(uni, enc, err)