import test_support, unittest import sys, codecs, htmlentitydefs, unicodedata class CodecCallbackTest(unittest.TestCase): def test_xmlcharrefreplace(self): """ replace unencodable characters which numeric character entities. For ascii, latin-1 and charmaps this is completely implemented in C and should be reasonably fast. """ s = u"\u30b9\u30d1\u30e2 \xe4nd eggs" self.assertEqual( s.encode("ascii", "xmlcharrefreplace"), "スパモ änd eggs" ) self.assertEqual( s.encode("latin-1", "xmlcharrefreplace"), "スパモ \xe4nd eggs" ) def test_xmlcharnamereplace(self): """ This time use a named character entity for unencodable characters, if one is available. """ names = {} for (key, value) in htmlentitydefs.entitydefs.items(): if len(value)==1: names[unicode(value)] = unicode(key) else: names[unichr(int(value[2:-1]))] = unicode(key) def xmlcharnamereplace(encoding, s, start, end, reason, data): l = [] for c in s[start:end]: try: l.append(u"&%s;" % names[c]) except KeyError: l.append(u"&#%d;" % ord(c)) return (u"".join(l), end) codecs.register_unicodeencodeerrorhandler( "test.xmlcharnamereplace", xmlcharnamereplace) sin = u"\xab\u211c\xbb = \u2329\u1234\u20ac\u232a" sout = "«ℜ» = ⟨ሴ€⟩" self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout) sout = "\xabℜ\xbb = ⟨ሴ€⟩" self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout) sout = "\xabℜ\xbb = ⟨ሴ\xa4⟩" self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout) def test_uninamereplace(self): """ We're using the names from the unicode database this time, and we're doing "systax highlighting" here, i.e. we include the replaced text in ANSI escape sequences. For this it is useful that the error handler is not called for every single unencodable character, but for a complete sequence of unencodable characters, otherwiese we would output many unneccessary escape sequences. """ def uninamereplace(encoding, s, start, end, reason, data): l = [] for c in s[start:end]: l.append(unicodedata.name(c, u"0x%x" % ord(c))) return (u"\033[1m%s\033[0m" % u", ".join(l), end) codecs.register_unicodeencodeerrorhandler( "test.uninamereplace", uninamereplace) sin = u"\xac\u1234\u20ac\u8000" sout = "\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, 0x8000\033[0m" self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout) sout = "\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, 0x8000\033[0m" self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout) sout = "\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1m0x8000\033[0m" self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout) def test_backslashescape(self): """ Does the same as the "unicode-escape" encoding, but with different base encodings. """ sin = u"a\xac\u1234\u20ac\u8000" if sys.maxunicode > 0xffff: sin += unichr(sys.maxunicode) sout = "a\\xac\\u1234\\u20ac\\u8000" if sys.maxunicode > 0xffff: sout += u"\\U%08x" % sys.maxunicode self.assertEqual(sin.encode("ascii", "backslashreplace"), sout) sout = "a\xac\\u1234\\u20ac\\u8000" if sys.maxunicode > 0xffff: sout += u"\\U%08x" % sys.maxunicode self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout) sout = "a\xac\\u1234\xa4\\u8000" if sys.maxunicode > 0xffff: sout += u"\\U%08x" % sys.maxunicode self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout) def test_relaxedutf8(self): """ This is the test for a decoding callback handler, that relaxes the UTF-8 minimal encoding restriction. A null byte that is encoded as "\xc0\x80" will be decoded as a null byte. All other illegal sequences will be handled strictly. """ def relaxedutf8(encoding, s, start, end, reason, data): if s[start:end].startswith("\xc0\x80"): return (u"\x00", start+2) # retry after two bytes else: codecs.raise_unicodedecode_errors(encoding, s, start, end, reason, data) codecs.register_unicodedecodeerrorhandler( "test.relaxedutf8", relaxedutf8) sin = "a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80" sout = u"a\x00b\x00c\xfc\x00\x00" self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout) sin = "\xc0\x80\xc0\x81" self.assertRaises(UnicodeError, sin.decode, "utf-8", "test.relaxedutf8") def test_charmapencode(self): """ For charmap encodings the replacement string will be mapped through the encoding again. This means, that to be able to use e.g. the "replace" handler, the charmap has to have a mapping for "?". """ charmap = dict([ (ord(c), 2*c.upper()) for c in "abcdefgh"]) sin = u"abc" sout = "AABBCC" self.assertEquals(codecs.charmap_encode(sin, "strict", charmap)[0], sout) sin = u"abcA" self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap) charmap[ord("?")] = "XYZ" sin = u"abcDEF" sout = "AABBCCXYZXYZXYZ" self.assertEquals(codecs.charmap_encode(sin, "replace", charmap)[0], sout) charmap[ord("?")] = u"XYZ" self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap) charmap[ord("?")] = u"XYZ" self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap) def test_callbacks(self): def handler1(encoding, s, start, end, reason, data): l = [u"<%d>" % ord(s[pos]) for pos in xrange(start, end)] return (u"[%s]" % u"".join(l), end) codecs.register_unicodedecodeerrorhandler("test.handler1", handler1) codecs.register_unicodeencodeerrorhandler("test.handler1", handler1) def handler2(encoding, s, start, end, reason, data): l = [u"<%d>" % ord(s[pos]) for pos in xrange(start, end)] return (u"[%s]" % u"".join(l), end+1) # skip one character codecs.register_unicodedecodeerrorhandler("test.handler2", handler2) s = "\x00\x81\x7f\x80\xff" self.assertEqual( s.decode("ascii", "test.handler1"), u"\x00[<129>]\x7f[<128>][<255>]" ) self.assertEqual( s.decode("ascii", "test.handler2"), u"\x00[<129>][<128>]" ) self.assertEqual( "\\u3042\u3xxx".decode("unicode-escape", "test.handler1"), u"\u3042[<92><117><51><120>]xx" ) self.assertEqual( "\\u3042\u3xx".decode("unicode-escape", "test.handler1"), u"\u3042[<92><117><51><120><120>]" ) self.assertEqual( codecs.charmap_decode("abc", "test.handler1", {ord("a"): u"z"})[0], u"z[<98>][<99>]" ) self.assertEqual( u"g\xfc\xdfrk".encode("ascii", "test.handler1"), u"g[<252><223>]rk" ) self.assertEqual( u"g\xfc\xdf".encode("ascii", "test.handler1"), u"g[<252><223>]" ) def test_main(): test_support.run_unittest(CodecCallbackTest) if __name__ == "__main__": test_main()