Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(4)

Delta Between Two Patch Sets: Lib/test/test_codecs.py

Issue 25270: codecs.escape_encode systemerror on empty byte string
Left Patch Set: Created 4 years, 4 months ago
Right Patch Set: Created 3 years, 5 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « Doc/c-api/bytes.rst ('k') | Objects/bytesobject.c » ('j') | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFTRIGHT
1 import codecs 1 import codecs
2 import contextlib 2 import contextlib
3 import io 3 import io
4 import locale 4 import locale
5 import sys 5 import sys
6 import unittest 6 import unittest
7 import warnings
8 import encodings 7 import encodings
9 8
10 from test import support 9 from test import support
11
12 if sys.platform == 'win32':
13 VISTA_OR_LATER = (sys.getwindowsversion().major >= 6)
14 else:
15 VISTA_OR_LATER = False
16 10
17 try: 11 try:
18 import ctypes 12 import ctypes
19 except ImportError: 13 except ImportError:
20 ctypes = None 14 ctypes = None
21 SIZEOF_WCHAR_T = -1 15 SIZEOF_WCHAR_T = -1
22 else: 16 else:
23 SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar) 17 SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
24 18
25 def coding_checker(self, coder): 19 def coding_checker(self, coder):
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
93 r = codecs.getreader(self.encoding)(q) 87 r = codecs.getreader(self.encoding)(q)
94 result = "" 88 result = ""
95 for (c, partialresult) in zip(input.encode(self.encoding), partialresults): 89 for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
96 q.write(bytes([c])) 90 q.write(bytes([c]))
97 result += r.read() 91 result += r.read()
98 self.assertEqual(result, partialresult) 92 self.assertEqual(result, partialresult)
99 # check that there's nothing left in the buffers 93 # check that there's nothing left in the buffers
100 self.assertEqual(r.read(), "") 94 self.assertEqual(r.read(), "")
101 self.assertEqual(r.bytebuffer, b"") 95 self.assertEqual(r.bytebuffer, b"")
102 96
103 # do the check again, this time using a incremental decoder 97 # do the check again, this time using an incremental decoder
104 d = codecs.getincrementaldecoder(self.encoding)() 98 d = codecs.getincrementaldecoder(self.encoding)()
105 result = "" 99 result = ""
106 for (c, partialresult) in zip(input.encode(self.encoding), partialresults): 100 for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
107 result += d.decode(bytes([c])) 101 result += d.decode(bytes([c]))
108 self.assertEqual(result, partialresult) 102 self.assertEqual(result, partialresult)
109 # check that there's nothing left in the buffers 103 # check that there's nothing left in the buffers
110 self.assertEqual(d.decode(b"", True), "") 104 self.assertEqual(d.decode(b"", True), "")
111 self.assertEqual(d.buffer, b"") 105 self.assertEqual(d.buffer, b"")
112 106
113 # Check whether the reset method works properly 107 # Check whether the reset method works properly
(...skipping 240 matching lines...) Expand 10 before | Expand all | Expand 10 after
354 "[\\udc80]".encode(self.encoding)) 348 "[\\udc80]".encode(self.encoding))
355 self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"), 349 self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
356 "[\\udc80]".encode(self.encoding)) 350 "[\\udc80]".encode(self.encoding))
357 self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"), 351 self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
358 "[&#56448;]".encode(self.encoding)) 352 "[&#56448;]".encode(self.encoding))
359 self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"), 353 self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
360 "[]".encode(self.encoding)) 354 "[]".encode(self.encoding))
361 self.assertEqual("[\uDC80]".encode(self.encoding, "replace"), 355 self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
362 "[?]".encode(self.encoding)) 356 "[?]".encode(self.encoding))
363 357
358 # sequential surrogate characters
359 self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
360 "[]".encode(self.encoding))
361 self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
362 "[??]".encode(self.encoding))
363
364 bom = "".encode(self.encoding) 364 bom = "".encode(self.encoding)
365 for before, after in [("\U00010fff", "A"), ("[", "]"), 365 for before, after in [("\U00010fff", "A"), ("[", "]"),
366 ("A", "\U00010fff")]: 366 ("A", "\U00010fff")]:
367 before_sequence = before.encode(self.encoding)[len(bom):] 367 before_sequence = before.encode(self.encoding)[len(bom):]
368 after_sequence = after.encode(self.encoding)[len(bom):] 368 after_sequence = after.encode(self.encoding)[len(bom):]
369 test_string = before + "\uDC80" + after 369 test_string = before + "\uDC80" + after
370 test_sequence = (bom + before_sequence + 370 test_sequence = (bom + before_sequence +
371 self.ill_formed_sequence + after_sequence) 371 self.ill_formed_sequence + after_sequence)
372 self.assertRaises(UnicodeDecodeError, test_sequence.decode, 372 self.assertRaises(UnicodeDecodeError, test_sequence.decode,
373 self.encoding) 373 self.encoding)
(...skipping 372 matching lines...) Expand 10 before | Expand all | Expand 10 after
746 def test_nonbmp(self): 746 def test_nonbmp(self):
747 self.assertEqual("\U00010203".encode(self.encoding), 747 self.assertEqual("\U00010203".encode(self.encoding),
748 b'\xd8\x00\xde\x03') 748 b'\xd8\x00\xde\x03')
749 self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding), 749 self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
750 "\U00010203") 750 "\U00010203")
751 751
752 class UTF8Test(ReadTest, unittest.TestCase): 752 class UTF8Test(ReadTest, unittest.TestCase):
753 encoding = "utf-8" 753 encoding = "utf-8"
754 ill_formed_sequence = b"\xed\xb2\x80" 754 ill_formed_sequence = b"\xed\xb2\x80"
755 ill_formed_sequence_replace = "\ufffd" * 3 755 ill_formed_sequence_replace = "\ufffd" * 3
756 BOM = b''
756 757
757 def test_partial(self): 758 def test_partial(self):
758 self.check_partial( 759 self.check_partial(
759 "\x00\xff\u07ff\u0800\uffff\U00010000", 760 "\x00\xff\u07ff\u0800\uffff\U00010000",
760 [ 761 [
761 "\x00", 762 "\x00",
762 "\x00", 763 "\x00",
763 "\x00\xff", 764 "\x00\xff",
764 "\x00\xff", 765 "\x00\xff",
765 "\x00\xff\u07ff", 766 "\x00\xff\u07ff",
766 "\x00\xff\u07ff", 767 "\x00\xff\u07ff",
767 "\x00\xff\u07ff", 768 "\x00\xff\u07ff",
768 "\x00\xff\u07ff\u0800", 769 "\x00\xff\u07ff\u0800",
769 "\x00\xff\u07ff\u0800", 770 "\x00\xff\u07ff\u0800",
770 "\x00\xff\u07ff\u0800", 771 "\x00\xff\u07ff\u0800",
771 "\x00\xff\u07ff\u0800\uffff", 772 "\x00\xff\u07ff\u0800\uffff",
772 "\x00\xff\u07ff\u0800\uffff", 773 "\x00\xff\u07ff\u0800\uffff",
773 "\x00\xff\u07ff\u0800\uffff", 774 "\x00\xff\u07ff\u0800\uffff",
774 "\x00\xff\u07ff\u0800\uffff", 775 "\x00\xff\u07ff\u0800\uffff",
775 "\x00\xff\u07ff\u0800\uffff\U00010000", 776 "\x00\xff\u07ff\u0800\uffff\U00010000",
776 ] 777 ]
777 ) 778 )
778 779
779 def test_decoder_state(self): 780 def test_decoder_state(self):
780 u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff" 781 u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
781 self.check_state_handling_decode(self.encoding, 782 self.check_state_handling_decode(self.encoding,
782 u, u.encode(self.encoding)) 783 u, u.encode(self.encoding))
783 784
785 def test_decode_error(self):
786 for data, error_handler, expected in (
787 (b'[\x80\xff]', 'ignore', '[]'),
788 (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
789 (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
790 (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
791 ):
792 with self.subTest(data=data, error_handler=error_handler,
793 expected=expected):
794 self.assertEqual(data.decode(self.encoding, error_handler),
795 expected)
796
784 def test_lone_surrogates(self): 797 def test_lone_surrogates(self):
785 super().test_lone_surrogates() 798 super().test_lone_surrogates()
786 # not sure if this is making sense for 799 # not sure if this is making sense for
787 # UTF-16 and UTF-32 800 # UTF-16 and UTF-32
788 self.assertEqual("[\uDC80]".encode('utf-8', "surrogateescape"), 801 self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"),
789 b'[\x80]') 802 self.BOM + b'[\x80]')
803
804 with self.assertRaises(UnicodeEncodeError) as cm:
805 "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
806 exc = cm.exception
807 self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF')
790 808
791 def test_surrogatepass_handler(self): 809 def test_surrogatepass_handler(self):
792 self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"), 810 self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"),
793 b"abc\xed\xa0\x80def") 811 self.BOM + b"abc\xed\xa0\x80def")
794 self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"), 812 self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"),
813 self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80")
814 self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"),
815 self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]')
816
817 self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"),
795 "abc\ud800def") 818 "abc\ud800def")
796 self.assertEqual("\U00010fff\uD800".encode("utf-8", "surrogatepass"), 819 self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"),
797 b"\xf0\x90\xbf\xbf\xed\xa0\x80")
798 self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("utf-8", "surrogatepass"),
799 "\U00010fff\uD800") 820 "\U00010fff\uD800")
821
800 self.assertTrue(codecs.lookup_error("surrogatepass")) 822 self.assertTrue(codecs.lookup_error("surrogatepass"))
801 with self.assertRaises(UnicodeDecodeError): 823 with self.assertRaises(UnicodeDecodeError):
802 b"abc\xed\xa0".decode("utf-8", "surrogatepass") 824 b"abc\xed\xa0".decode(self.encoding, "surrogatepass")
803 with self.assertRaises(UnicodeDecodeError): 825 with self.assertRaises(UnicodeDecodeError):
804 b"abc\xed\xa0z".decode("utf-8", "surrogatepass") 826 b"abc\xed\xa0z".decode(self.encoding, "surrogatepass")
805 827
806 828
807 @unittest.skipUnless(sys.platform == 'win32', 829 @unittest.skipUnless(sys.platform == 'win32',
808 'cp65001 is a Windows-only codec') 830 'cp65001 is a Windows-only codec')
809 class CP65001Test(ReadTest, unittest.TestCase): 831 class CP65001Test(ReadTest, unittest.TestCase):
810 encoding = "cp65001" 832 encoding = "cp65001"
811 833
812 def test_encode(self): 834 def test_encode(self):
813 tests = [ 835 tests = [
814 ('abc', 'strict', b'abc'), 836 ('abc', 'strict', b'abc'),
815 ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'), 837 ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
816 ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'), 838 ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
839 ('\udc80', 'strict', None),
840 ('\udc80', 'ignore', b''),
841 ('\udc80', 'replace', b'?'),
842 ('\udc80', 'backslashreplace', b'\\udc80'),
843 ('\udc80', 'namereplace', b'\\udc80'),
844 ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
817 ] 845 ]
818 if VISTA_OR_LATER:
819 tests.extend((
820 ('\udc80', 'strict', None),
821 ('\udc80', 'ignore', b''),
822 ('\udc80', 'replace', b'?'),
823 ('\udc80', 'backslashreplace', b'\\udc80'),
824 ('\udc80', 'namereplace', b'\\udc80'),
825 ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
826 ))
827 else:
828 tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
829 for text, errors, expected in tests: 846 for text, errors, expected in tests:
830 if expected is not None: 847 if expected is not None:
831 try: 848 try:
832 encoded = text.encode('cp65001', errors) 849 encoded = text.encode('cp65001', errors)
833 except UnicodeEncodeError as err: 850 except UnicodeEncodeError as err:
834 self.fail('Unable to encode %a to cp65001 with ' 851 self.fail('Unable to encode %a to cp65001 with '
835 'errors=%r: %s' % (text, errors, err)) 852 'errors=%r: %s' % (text, errors, err))
836 self.assertEqual(encoded, expected, 853 self.assertEqual(encoded, expected,
837 '%a.encode("cp65001", %r)=%a != %a' 854 '%a.encode("cp65001", %r)=%a != %a'
838 % (text, errors, encoded, expected)) 855 % (text, errors, encoded, expected))
839 else: 856 else:
840 self.assertRaises(UnicodeEncodeError, 857 self.assertRaises(UnicodeEncodeError,
841 text.encode, "cp65001", errors) 858 text.encode, "cp65001", errors)
842 859
843 def test_decode(self): 860 def test_decode(self):
844 tests = [ 861 tests = [
845 (b'abc', 'strict', 'abc'), 862 (b'abc', 'strict', 'abc'),
846 (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'), 863 (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
847 (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'), 864 (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
848 (b'\xef\xbf\xbd', 'strict', '\ufffd'), 865 (b'\xef\xbf\xbd', 'strict', '\ufffd'),
849 (b'[\xc3\xa9]', 'strict', '[\xe9]'), 866 (b'[\xc3\xa9]', 'strict', '[\xe9]'),
850 # invalid bytes 867 # invalid bytes
851 (b'[\xff]', 'strict', None), 868 (b'[\xff]', 'strict', None),
852 (b'[\xff]', 'ignore', '[]'), 869 (b'[\xff]', 'ignore', '[]'),
853 (b'[\xff]', 'replace', '[\ufffd]'), 870 (b'[\xff]', 'replace', '[\ufffd]'),
854 (b'[\xff]', 'surrogateescape', '[\udcff]'), 871 (b'[\xff]', 'surrogateescape', '[\udcff]'),
872 (b'[\xed\xb2\x80]', 'strict', None),
873 (b'[\xed\xb2\x80]', 'ignore', '[]'),
874 (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
855 ] 875 ]
856 if VISTA_OR_LATER:
857 tests.extend((
858 (b'[\xed\xb2\x80]', 'strict', None),
859 (b'[\xed\xb2\x80]', 'ignore', '[]'),
860 (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
861 ))
862 else:
863 tests.extend((
864 (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
865 ))
866 for raw, errors, expected in tests: 876 for raw, errors, expected in tests:
867 if expected is not None: 877 if expected is not None:
868 try: 878 try:
869 decoded = raw.decode('cp65001', errors) 879 decoded = raw.decode('cp65001', errors)
870 except UnicodeDecodeError as err: 880 except UnicodeDecodeError as err:
871 self.fail('Unable to decode %a from cp65001 with ' 881 self.fail('Unable to decode %a from cp65001 with '
872 'errors=%r: %s' % (raw, errors, err)) 882 'errors=%r: %s' % (raw, errors, err))
873 self.assertEqual(decoded, expected, 883 self.assertEqual(decoded, expected,
874 '%a.decode("cp65001", %r)=%a != %a' 884 '%a.decode("cp65001", %r)=%a != %a'
875 % (raw, errors, decoded, expected)) 885 % (raw, errors, decoded, expected))
876 else: 886 else:
877 self.assertRaises(UnicodeDecodeError, 887 self.assertRaises(UnicodeDecodeError,
878 raw.decode, 'cp65001', errors) 888 raw.decode, 'cp65001', errors)
879 889
880 @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
881 def test_lone_surrogates(self): 890 def test_lone_surrogates(self):
882 self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001") 891 self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
883 self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001") 892 self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
884 self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"), 893 self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
885 b'[\\udc80]') 894 b'[\\udc80]')
886 self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"), 895 self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
887 b'[\\udc80]') 896 b'[\\udc80]')
888 self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"), 897 self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
889 b'[&#56448;]') 898 b'[&#56448;]')
890 self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"), 899 self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
891 b'[\x80]') 900 b'[\x80]')
892 self.assertEqual("[\uDC80]".encode("cp65001", "ignore"), 901 self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
893 b'[]') 902 b'[]')
894 self.assertEqual("[\uDC80]".encode("cp65001", "replace"), 903 self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
895 b'[?]') 904 b'[?]')
896 905
897 @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
898 def test_surrogatepass_handler(self): 906 def test_surrogatepass_handler(self):
899 self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"), 907 self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
900 b"abc\xed\xa0\x80def") 908 b"abc\xed\xa0\x80def")
901 self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"), 909 self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
902 "abc\ud800def") 910 "abc\ud800def")
903 self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"), 911 self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
904 b"\xf0\x90\xbf\xbf\xed\xa0\x80") 912 b"\xf0\x90\xbf\xbf\xed\xa0\x80")
905 self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"), 913 self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
906 "\U00010fff\uD800") 914 "\U00010fff\uD800")
907 self.assertTrue(codecs.lookup_error("surrogatepass")) 915 self.assertTrue(codecs.lookup_error("surrogatepass"))
908 916
909 917
910 class UTF7Test(ReadTest, unittest.TestCase): 918 class UTF7Test(ReadTest, unittest.TestCase):
911 encoding = "utf-7" 919 encoding = "utf-7"
920
921 def test_ascii(self):
922 # Set D (directly encoded characters)
923 set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
924 'abcdefghijklmnopqrstuvwxyz'
925 '0123456789'
926 '\'(),-./:?')
927 self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))
928 self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)
929 # Set O (optional direct characters)
930 set_o = ' !"#$%&*;<=>@[]^_`{|}'
931 self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))
932 self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)
933 # +
934 self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
935 self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
936 # White spaces
937 ws = ' \t\n\r'
938 self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))
939 self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)
940 # Other ASCII characters
941 other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -
942 set(set_d + set_o + '+' + ws)))
943 self.assertEqual(other_ascii.encode(self.encoding),
944 b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
945 b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')
912 946
913 def test_partial(self): 947 def test_partial(self):
914 self.check_partial( 948 self.check_partial(
915 'a+-b\x00c\x80d\u0100e\U00010000f', 949 'a+-b\x00c\x80d\u0100e\U00010000f',
916 [ 950 [
917 'a', 951 'a',
918 'a', 952 'a',
919 'a+', 953 'a+',
920 'a+-', 954 'a+-',
921 'a+-b', 955 'a+-b',
(...skipping 22 matching lines...) Expand all
944 'a+-b\x00c\x80d\u0100e', 978 'a+-b\x00c\x80d\u0100e',
945 'a+-b\x00c\x80d\u0100e', 979 'a+-b\x00c\x80d\u0100e',
946 'a+-b\x00c\x80d\u0100e', 980 'a+-b\x00c\x80d\u0100e',
947 'a+-b\x00c\x80d\u0100e\U00010000', 981 'a+-b\x00c\x80d\u0100e\U00010000',
948 'a+-b\x00c\x80d\u0100e\U00010000f', 982 'a+-b\x00c\x80d\u0100e\U00010000f',
949 ] 983 ]
950 ) 984 )
951 985
952 def test_errors(self): 986 def test_errors(self):
953 tests = [ 987 tests = [
988 (b'\xffb', '\ufffdb'),
954 (b'a\xffb', 'a\ufffdb'), 989 (b'a\xffb', 'a\ufffdb'),
990 (b'a\xff\xffb', 'a\ufffd\ufffdb'),
955 (b'a+IK', 'a\ufffd'), 991 (b'a+IK', 'a\ufffd'),
956 (b'a+IK-b', 'a\ufffdb'), 992 (b'a+IK-b', 'a\ufffdb'),
957 (b'a+IK,b', 'a\ufffdb'), 993 (b'a+IK,b', 'a\ufffdb'),
958 (b'a+IKx', 'a\u20ac\ufffd'), 994 (b'a+IKx', 'a\u20ac\ufffd'),
959 (b'a+IKx-b', 'a\u20ac\ufffdb'), 995 (b'a+IKx-b', 'a\u20ac\ufffdb'),
960 (b'a+IKwgr', 'a\u20ac\ufffd'), 996 (b'a+IKwgr', 'a\u20ac\ufffd'),
961 (b'a+IKwgr-b', 'a\u20ac\ufffdb'), 997 (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
962 (b'a+IKwgr,', 'a\u20ac\ufffd'), 998 (b'a+IKwgr,', 'a\u20ac\ufffd'),
963 (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'), 999 (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
964 (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'), 1000 (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
965 (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'), 1001 (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
966 (b'a+/,+IKw-b', 'a\ufffd\u20acb'), 1002 (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
967 (b'a+//,+IKw-b', 'a\ufffd\u20acb'), 1003 (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
968 (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'), 1004 (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
969 (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'), 1005 (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
1006 (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
1007 (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
970 ] 1008 ]
971 for raw, expected in tests: 1009 for raw, expected in tests:
972 with self.subTest(raw=raw): 1010 with self.subTest(raw=raw):
973 self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode, 1011 self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
974 raw, 'strict', True) 1012 raw, 'strict', True)
975 self.assertEqual(raw.decode('utf-7', 'replace'), expected) 1013 self.assertEqual(raw.decode('utf-7', 'replace'), expected)
976 1014
977 def test_nonbmp(self): 1015 def test_nonbmp(self):
978 self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-') 1016 self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
979 self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-') 1017 self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
980 self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0') 1018 self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
981 1019 self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
982 test_lone_surrogates = None 1020 self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')
1021 self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')
1022 self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')
1023 self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
1024 b'+IKwgrNgB3KA-')
1025 self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
1026 '\u20ac\u20ac\U000104A0')
1027 self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
1028 '\u20ac\u20ac\U000104A0')
1029
1030 def test_lone_surrogates(self):
1031 tests = [
1032 (b'a+2AE-b', 'a\ud801b'),
1033 (b'a+2AE\xffb', 'a\ufffdb'),
1034 (b'a+2AE', 'a\ufffd'),
1035 (b'a+2AEA-b', 'a\ufffdb'),
1036 (b'a+2AH-b', 'a\ufffdb'),
1037 (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
1038 (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
1039 (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
1040 (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
1041 (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
1042 (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
1043 (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
1044 (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
1045 ]
1046 for raw, expected in tests:
1047 with self.subTest(raw=raw):
1048 self.assertEqual(raw.decode('utf-7', 'replace'), expected)
983 1049
984 1050
985 class UTF16ExTest(unittest.TestCase): 1051 class UTF16ExTest(unittest.TestCase):
986 1052
987 def test_errors(self): 1053 def test_errors(self):
988 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, b"\xff", "strict", 0, True) 1054 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, b"\xff", "strict", 0, True)
989 1055
990 def test_bad_args(self): 1056 def test_bad_args(self):
991 self.assertRaises(TypeError, codecs.utf_16_ex_decode) 1057 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
992 1058
993 class ReadBufferTest(unittest.TestCase): 1059 class ReadBufferTest(unittest.TestCase):
994 1060
995 def test_array(self): 1061 def test_array(self):
996 import array 1062 import array
997 self.assertEqual( 1063 self.assertEqual(
998 codecs.readbuffer_encode(array.array("b", b"spam")), 1064 codecs.readbuffer_encode(array.array("b", b"spam")),
999 (b"spam", 4) 1065 (b"spam", 4)
1000 ) 1066 )
1001 1067
1002 def test_empty(self): 1068 def test_empty(self):
1003 self.assertEqual(codecs.readbuffer_encode(""), (b"", 0)) 1069 self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))
1004 1070
1005 def test_bad_args(self): 1071 def test_bad_args(self):
1006 self.assertRaises(TypeError, codecs.readbuffer_encode) 1072 self.assertRaises(TypeError, codecs.readbuffer_encode)
1007 self.assertRaises(TypeError, codecs.readbuffer_encode, 42) 1073 self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
1008 1074
1009 class UTF8SigTest(UTF8Test, unittest.TestCase): 1075 class UTF8SigTest(UTF8Test, unittest.TestCase):
1010 encoding = "utf-8-sig" 1076 encoding = "utf-8-sig"
1077 BOM = codecs.BOM_UTF8
1011 1078
1012 def test_partial(self): 1079 def test_partial(self):
1013 self.check_partial( 1080 self.check_partial(
1014 "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000", 1081 "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
1015 [ 1082 [
1016 "", 1083 "",
1017 "", 1084 "",
1018 "", # First BOM has been read and skipped 1085 "", # First BOM has been read and skipped
1019 "", 1086 "",
1020 "", 1087 "",
(...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after
1101 b = bytes([b]) 1168 b = bytes([b])
1102 if b != b'\\': 1169 if b != b'\\':
1103 self.assertEqual(decode(b + b'0'), (b + b'0', 2)) 1170 self.assertEqual(decode(b + b'0'), (b + b'0', 2))
1104 1171
1105 def test_escape(self): 1172 def test_escape(self):
1106 decode = codecs.escape_decode 1173 decode = codecs.escape_decode
1107 check = coding_checker(self, decode) 1174 check = coding_checker(self, decode)
1108 check(b"[\\\n]", b"[]") 1175 check(b"[\\\n]", b"[]")
1109 check(br'[\"]', b'["]') 1176 check(br'[\"]', b'["]')
1110 check(br"[\']", b"[']") 1177 check(br"[\']", b"[']")
1111 check(br"[\\]", br"[\]") 1178 check(br"[\\]", b"[\\]")
1112 check(br"[\a]", b"[\x07]") 1179 check(br"[\a]", b"[\x07]")
1113 check(br"[\b]", b"[\x08]") 1180 check(br"[\b]", b"[\x08]")
1114 check(br"[\t]", b"[\x09]") 1181 check(br"[\t]", b"[\x09]")
1115 check(br"[\n]", b"[\x0a]") 1182 check(br"[\n]", b"[\x0a]")
1116 check(br"[\v]", b"[\x0b]") 1183 check(br"[\v]", b"[\x0b]")
1117 check(br"[\f]", b"[\x0c]") 1184 check(br"[\f]", b"[\x0c]")
1118 check(br"[\r]", b"[\x0d]") 1185 check(br"[\r]", b"[\x0d]")
1119 check(br"[\7]", b"[\x07]") 1186 check(br"[\7]", b"[\x07]")
1120 check(br"[\8]", br"[\8]")
1121 check(br"[\78]", b"[\x078]") 1187 check(br"[\78]", b"[\x078]")
1122 check(br"[\41]", b"[!]") 1188 check(br"[\41]", b"[!]")
1123 check(br"[\418]", b"[!8]") 1189 check(br"[\418]", b"[!8]")
1124 check(br"[\101]", b"[A]") 1190 check(br"[\101]", b"[A]")
1125 check(br"[\1010]", b"[A0]") 1191 check(br"[\1010]", b"[A0]")
1126 check(br"[\501]", b"[A]") 1192 check(br"[\501]", b"[A]")
1127 check(br"[\x41]", b"[A]") 1193 check(br"[\x41]", b"[A]")
1128 check(br"[\X41]", br"[\X41]")
1129 check(br"[\x410]", b"[A0]") 1194 check(br"[\x410]", b"[A0]")
1130 for b in range(256): 1195 for i in range(97, 123):
1131 if b not in b'\n"\'\\abtnvfr01234567x': 1196 b = bytes([i])
1132 b = bytes([b]) 1197 if b not in b'abfnrtvx':
1133 check(b'\\' + b, b'\\' + b) 1198 with self.assertWarns(DeprecationWarning):
1199 check(b"\\" + b, b"\\" + b)
1200 with self.assertWarns(DeprecationWarning):
1201 check(b"\\" + b.upper(), b"\\" + b.upper())
1202 with self.assertWarns(DeprecationWarning):
1203 check(br"\8", b"\\8")
1204 with self.assertWarns(DeprecationWarning):
1205 check(br"\9", b"\\9")
1134 1206
1135 def test_errors(self): 1207 def test_errors(self):
1136 decode = codecs.escape_decode 1208 decode = codecs.escape_decode
1137 self.assertRaises(ValueError, decode, br"\x") 1209 self.assertRaises(ValueError, decode, br"\x")
1138 self.assertRaises(ValueError, decode, br"[\x]") 1210 self.assertRaises(ValueError, decode, br"[\x]")
1139 self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6)) 1211 self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
1140 self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6)) 1212 self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
1141 self.assertRaises(ValueError, decode, br"\x0") 1213 self.assertRaises(ValueError, decode, br"\x0")
1142 self.assertRaises(ValueError, decode, br"[\x0]") 1214 self.assertRaises(ValueError, decode, br"[\x0]")
1143 self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8)) 1215 self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
(...skipping 714 matching lines...) Expand 10 before | Expand all | Expand 10 after
1858 "unicode_internal", 1930 "unicode_internal",
1859 "utf_16", 1931 "utf_16",
1860 "utf_16_be", 1932 "utf_16_be",
1861 "utf_16_le", 1933 "utf_16_le",
1862 "utf_7", 1934 "utf_7",
1863 "utf_8", 1935 "utf_8",
1864 ] 1936 ]
1865 1937
1866 if hasattr(codecs, "mbcs_encode"): 1938 if hasattr(codecs, "mbcs_encode"):
1867 all_unicode_encodings.append("mbcs") 1939 all_unicode_encodings.append("mbcs")
1940 if hasattr(codecs, "oem_encode"):
1941 all_unicode_encodings.append("oem")
1868 1942
1869 # The following encoding is not tested, because it's not supposed 1943 # The following encoding is not tested, because it's not supposed
1870 # to work: 1944 # to work:
1871 # "undefined" 1945 # "undefined"
1872 1946
1873 # The following encodings don't work in stateful mode 1947 # The following encodings don't work in stateful mode
1874 broken_unicode_with_stateful = [ 1948 broken_unicode_with_stateful = [
1875 "punycode", 1949 "punycode",
1876 "unicode_internal" 1950 "unicode_internal"
1877 ] 1951 ]
(...skipping 429 matching lines...) Expand 10 before | Expand all | Expand 10 after
2307 codecs.latin_1_decode, 2381 codecs.latin_1_decode,
2308 codecs.ascii_decode, 2382 codecs.ascii_decode,
2309 codecs.charmap_decode, 2383 codecs.charmap_decode,
2310 ] 2384 ]
2311 if hasattr(codecs, "mbcs_decode"): 2385 if hasattr(codecs, "mbcs_decode"):
2312 decoders.append(codecs.mbcs_decode) 2386 decoders.append(codecs.mbcs_decode)
2313 for decoder in decoders: 2387 for decoder in decoders:
2314 self.assertRaises(TypeError, decoder, "xxx") 2388 self.assertRaises(TypeError, decoder, "xxx")
2315 2389
2316 def test_unicode_escape(self): 2390 def test_unicode_escape(self):
2317 # Escape-decoding an unicode string is supported ang gives the same 2391 # Escape-decoding a unicode string is supported and gives the same
2318 # result as decoding the equivalent ASCII bytes string. 2392 # result as decoding the equivalent ASCII bytes string.
2319 self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6)) 2393 self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
2320 self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6)) 2394 self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
2321 self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6)) 2395 self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
2322 self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6)) 2396 self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
2323 2397
2324 self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000") 2398 self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000")
2325 self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10)) 2399 self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
2326 self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "backslashreplace"), 2400 self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "backslashreplace"),
2327 (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10)) 2401 (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
2372 check(br"[\']", "[']") 2446 check(br"[\']", "[']")
2373 check(br"[\\]", r"[\]") 2447 check(br"[\\]", r"[\]")
2374 check(br"[\a]", "[\x07]") 2448 check(br"[\a]", "[\x07]")
2375 check(br"[\b]", "[\x08]") 2449 check(br"[\b]", "[\x08]")
2376 check(br"[\t]", "[\x09]") 2450 check(br"[\t]", "[\x09]")
2377 check(br"[\n]", "[\x0a]") 2451 check(br"[\n]", "[\x0a]")
2378 check(br"[\v]", "[\x0b]") 2452 check(br"[\v]", "[\x0b]")
2379 check(br"[\f]", "[\x0c]") 2453 check(br"[\f]", "[\x0c]")
2380 check(br"[\r]", "[\x0d]") 2454 check(br"[\r]", "[\x0d]")
2381 check(br"[\7]", "[\x07]") 2455 check(br"[\7]", "[\x07]")
2382 check(br"[\8]", r"[\8]")
2383 check(br"[\78]", "[\x078]") 2456 check(br"[\78]", "[\x078]")
2384 check(br"[\41]", "[!]") 2457 check(br"[\41]", "[!]")
2385 check(br"[\418]", "[!8]") 2458 check(br"[\418]", "[!8]")
2386 check(br"[\101]", "[A]") 2459 check(br"[\101]", "[A]")
2387 check(br"[\1010]", "[A0]") 2460 check(br"[\1010]", "[A0]")
2388 check(br"[\x41]", "[A]") 2461 check(br"[\x41]", "[A]")
2389 check(br"[\x410]", "[A0]") 2462 check(br"[\x410]", "[A0]")
2390 check(br"\u20ac", "\u20ac") 2463 check(br"\u20ac", "\u20ac")
2391 check(br"\U0001d120", "\U0001d120") 2464 check(br"\U0001d120", "\U0001d120")
2392 for b in range(256): 2465 for i in range(97, 123):
2393 if b not in b'\n"\'\\abtnvfr01234567xuUN': 2466 b = bytes([i])
2394 check(b'\\' + bytes([b]), '\\' + chr(b)) 2467 if b not in b'abfnrtuvx':
2468 with self.assertWarns(DeprecationWarning):
2469 check(b"\\" + b, "\\" + chr(i))
2470 if b.upper() not in b'UN':
2471 with self.assertWarns(DeprecationWarning):
2472 check(b"\\" + b.upper(), "\\" + chr(i-32))
2473 with self.assertWarns(DeprecationWarning):
2474 check(br"\8", "\\8")
2475 with self.assertWarns(DeprecationWarning):
2476 check(br"\9", "\\9")
2395 2477
2396 def test_decode_errors(self): 2478 def test_decode_errors(self):
2397 decode = codecs.unicode_escape_decode 2479 decode = codecs.unicode_escape_decode
2398 for c, d in (b'x', 2), (b'u', 4), (b'U', 4): 2480 for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
2399 for i in range(d): 2481 for i in range(d):
2400 self.assertRaises(UnicodeDecodeError, decode, 2482 self.assertRaises(UnicodeDecodeError, decode,
2401 b"\\" + c + b"0"*i) 2483 b"\\" + c + b"0"*i)
2402 self.assertRaises(UnicodeDecodeError, decode, 2484 self.assertRaises(UnicodeDecodeError, decode,
2403 b"[\\" + c + b"0"*i + b"]") 2485 b"[\\" + c + b"0"*i + b"]")
2404 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i 2486 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
(...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after
2456 self.assertEqual(decode(data, "replace"), 2538 self.assertEqual(decode(data, "replace"),
2457 ("[\ufffd]\ufffd", len(data))) 2539 ("[\ufffd]\ufffd", len(data)))
2458 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000") 2540 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
2459 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10)) 2541 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
2460 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10)) 2542 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2461 2543
2462 2544
2463 class EscapeEncodeTest(unittest.TestCase): 2545 class EscapeEncodeTest(unittest.TestCase):
2464 2546
2465 def test_escape_encode(self): 2547 def test_escape_encode(self):
2466 self.assertEqual(codecs.escape_encode(b''), (b'', 0)) 2548 tests = [
2549 (b'', (b'', 0)),
2550 (b'foobar', (b'foobar', 6)),
2551 (b'spam\0eggs', (b'spam\\x00eggs', 9)),
2552 (b'a\'b', (b"a\\'b", 3)),
2553 (b'b\\c', (b'b\\\\c', 3)),
2554 (b'c\nd', (b'c\\nd', 3)),
2555 (b'd\re', (b'd\\re', 3)),
2556 (b'f\x7fg', (b'f\\x7fg', 3)),
2557 (b'h\ri', (b'h\\ri', 3)),
storchaka 2016/09/16 13:52:41 Duplicates (b'd\re', (b'd\\re', 3))
2558 ]
2559 for data, output in tests:
2560 with self.subTest(data=data):
2561 self.assertEqual(codecs.escape_encode(data), output)
2562 self.assertRaises(TypeError, codecs.escape_encode, 'spam')
2563 self.assertRaises(TypeError, codecs.escape_encode, bytearray(b'spam'))
2467 2564
2468 2565
2469 class SurrogateEscapeTest(unittest.TestCase): 2566 class SurrogateEscapeTest(unittest.TestCase):
2470 2567
2471 def test_utf8(self): 2568 def test_utf8(self):
2472 # Bad byte 2569 # Bad byte
2473 self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"), 2570 self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
2474 "foo\udc80bar") 2571 "foo\udc80bar")
2475 self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"), 2572 self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"),
2476 b"foo\x80bar") 2573 b"foo\x80bar")
(...skipping 156 matching lines...) Expand 10 before | Expand all | Expand 10 after
2633 data = codecs.decode(data, encoding) 2730 data = codecs.decode(data, encoding)
2634 self.assertEqual(data, original) 2731 self.assertEqual(data, original)
2635 view_decoded = codecs.decode(view, encoding) 2732 view_decoded = codecs.decode(view, encoding)
2636 self.assertEqual(view_decoded, data) 2733 self.assertEqual(view_decoded, data)
2637 2734
2638 def test_text_to_binary_blacklists_binary_transforms(self): 2735 def test_text_to_binary_blacklists_binary_transforms(self):
2639 # Check binary -> binary codecs give a good error for str input 2736 # Check binary -> binary codecs give a good error for str input
2640 bad_input = "bad input type" 2737 bad_input = "bad input type"
2641 for encoding in bytes_transform_encodings: 2738 for encoding in bytes_transform_encodings:
2642 with self.subTest(encoding=encoding): 2739 with self.subTest(encoding=encoding):
2643 fmt = ( "{!r} is not a text encoding; " 2740 fmt = (r"{!r} is not a text encoding; "
2644 "use codecs.encode\(\) to handle arbitrary codecs") 2741 r"use codecs.encode\(\) to handle arbitrary codecs")
2645 msg = fmt.format(encoding) 2742 msg = fmt.format(encoding)
2646 with self.assertRaisesRegex(LookupError, msg) as failure: 2743 with self.assertRaisesRegex(LookupError, msg) as failure:
2647 bad_input.encode(encoding) 2744 bad_input.encode(encoding)
2648 self.assertIsNone(failure.exception.__cause__) 2745 self.assertIsNone(failure.exception.__cause__)
2649 2746
2650 def test_text_to_binary_blacklists_text_transforms(self): 2747 def test_text_to_binary_blacklists_text_transforms(self):
2651 # Check str.encode gives a good error message for str -> str codecs 2748 # Check str.encode gives a good error message for str -> str codecs
2652 msg = (r"^'rot_13' is not a text encoding; " 2749 msg = (r"^'rot_13' is not a text encoding; "
2653 "use codecs.encode\(\) to handle arbitrary codecs") 2750 r"use codecs.encode\(\) to handle arbitrary codecs")
2654 with self.assertRaisesRegex(LookupError, msg): 2751 with self.assertRaisesRegex(LookupError, msg):
2655 "just an example message".encode("rot_13") 2752 "just an example message".encode("rot_13")
2656 2753
2657 def test_binary_to_text_blacklists_binary_transforms(self): 2754 def test_binary_to_text_blacklists_binary_transforms(self):
2658 # Check bytes.decode and bytearray.decode give a good error 2755 # Check bytes.decode and bytearray.decode give a good error
2659 # message for binary -> binary codecs 2756 # message for binary -> binary codecs
2660 data = b"encode first to ensure we meet any format restrictions" 2757 data = b"encode first to ensure we meet any format restrictions"
2661 for encoding in bytes_transform_encodings: 2758 for encoding in bytes_transform_encodings:
2662 with self.subTest(encoding=encoding): 2759 with self.subTest(encoding=encoding):
2663 encoded_data = codecs.encode(data, encoding) 2760 encoded_data = codecs.encode(data, encoding)
2664 fmt = (r"{!r} is not a text encoding; " 2761 fmt = (r"{!r} is not a text encoding; "
2665 "use codecs.decode\(\) to handle arbitrary codecs") 2762 r"use codecs.decode\(\) to handle arbitrary codecs")
2666 msg = fmt.format(encoding) 2763 msg = fmt.format(encoding)
2667 with self.assertRaisesRegex(LookupError, msg): 2764 with self.assertRaisesRegex(LookupError, msg):
2668 encoded_data.decode(encoding) 2765 encoded_data.decode(encoding)
2669 with self.assertRaisesRegex(LookupError, msg): 2766 with self.assertRaisesRegex(LookupError, msg):
2670 bytearray(encoded_data).decode(encoding) 2767 bytearray(encoded_data).decode(encoding)
2671 2768
2672 def test_binary_to_text_blacklists_text_transforms(self): 2769 def test_binary_to_text_blacklists_text_transforms(self):
2673 # Check str -> str codec gives a good error for binary input 2770 # Check str -> str codec gives a good error for binary input
2674 for bad_input in (b"immutable", bytearray(b"mutable")): 2771 for bad_input in (b"immutable", bytearray(b"mutable")):
2675 with self.subTest(bad_input=bad_input): 2772 with self.subTest(bad_input=bad_input):
2676 msg = (r"^'rot_13' is not a text encoding; " 2773 msg = (r"^'rot_13' is not a text encoding; "
2677 "use codecs.decode\(\) to handle arbitrary codecs") 2774 r"use codecs.decode\(\) to handle arbitrary codecs")
2678 with self.assertRaisesRegex(LookupError, msg) as failure: 2775 with self.assertRaisesRegex(LookupError, msg) as failure:
2679 bad_input.decode("rot_13") 2776 bad_input.decode("rot_13")
2680 self.assertIsNone(failure.exception.__cause__) 2777 self.assertIsNone(failure.exception.__cause__)
2681 2778
2682 @unittest.skipUnless(zlib, "Requires zlib support") 2779 @unittest.skipUnless(zlib, "Requires zlib support")
2683 def test_custom_zlib_error_is_wrapped(self): 2780 def test_custom_zlib_error_is_wrapped(self):
2684 # Check zlib codec gives a good error for malformed input 2781 # Check zlib codec gives a good error for malformed input
2685 msg = "^decoding with 'zlib_codec' codec failed" 2782 msg = "^decoding with 'zlib_codec' codec failed"
2686 with self.assertRaisesRegex(Exception, msg) as failure: 2783 with self.assertRaisesRegex(Exception, msg) as failure:
2687 codecs.decode(b"hello", "zlib_codec") 2784 codecs.decode(b"hello", "zlib_codec")
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
2721 self.assertRaises(ValueError, codecs.decode, b"", "uu-codec") 2818 self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2722 2819
2723 2820
2724 # The codec system tries to wrap exceptions in order to ensure the error 2821 # The codec system tries to wrap exceptions in order to ensure the error
2725 # mentions the operation being performed and the codec involved. We 2822 # mentions the operation being performed and the codec involved. We
2726 # currently *only* want this to happen for relatively stateless 2823 # currently *only* want this to happen for relatively stateless
2727 # exceptions, where the only significant information they contain is their 2824 # exceptions, where the only significant information they contain is their
2728 # type and a single str argument. 2825 # type and a single str argument.
2729 2826
2730 # Use a local codec registry to avoid appearing to leak objects when 2827 # Use a local codec registry to avoid appearing to leak objects when
2731 # registering multiple seach functions 2828 # registering multiple search functions
2732 _TEST_CODECS = {} 2829 _TEST_CODECS = {}
2733 2830
2734 def _get_test_codec(codec_name): 2831 def _get_test_codec(codec_name):
2735 return _TEST_CODECS.get(codec_name) 2832 return _TEST_CODECS.get(codec_name)
2736 codecs.register(_get_test_codec) # Returns None, not usable as a decorator 2833 codecs.register(_get_test_codec) # Returns None, not usable as a decorator
2737 2834
2738 try: 2835 try:
2739 # Issue #22166: Also need to clear the internal cache in CPython 2836 # Issue #22166: Also need to clear the internal cache in CPython
2740 from _codecs import _forget_codec 2837 from _codecs import _forget_codec
2741 except ImportError: 2838 except ImportError:
(...skipping 144 matching lines...) Expand 10 before | Expand all | Expand 10 after
2886 def decode_to_bytes(*args, **kwds): 2983 def decode_to_bytes(*args, **kwds):
2887 return b"not str!", 0 2984 return b"not str!", 0
2888 self.set_codec(encode_to_str, decode_to_bytes) 2985 self.set_codec(encode_to_str, decode_to_bytes)
2889 # No input or output type checks on the codecs module functions 2986 # No input or output type checks on the codecs module functions
2890 encoded = codecs.encode(None, self.codec_name) 2987 encoded = codecs.encode(None, self.codec_name)
2891 self.assertEqual(encoded, "not bytes!") 2988 self.assertEqual(encoded, "not bytes!")
2892 decoded = codecs.decode(None, self.codec_name) 2989 decoded = codecs.decode(None, self.codec_name)
2893 self.assertEqual(decoded, b"not str!") 2990 self.assertEqual(decoded, b"not str!")
2894 # Text model methods should complain 2991 # Text model methods should complain
2895 fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; " 2992 fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
2896 "use codecs.encode\(\) to encode to arbitrary types$") 2993 r"use codecs.encode\(\) to encode to arbitrary types$")
2897 msg = fmt.format(self.codec_name) 2994 msg = fmt.format(self.codec_name)
2898 with self.assertRaisesRegex(TypeError, msg): 2995 with self.assertRaisesRegex(TypeError, msg):
2899 "str_input".encode(self.codec_name) 2996 "str_input".encode(self.codec_name)
2900 fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; " 2997 fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
2901 "use codecs.decode\(\) to decode to arbitrary types$") 2998 r"use codecs.decode\(\) to decode to arbitrary types$")
2902 msg = fmt.format(self.codec_name) 2999 msg = fmt.format(self.codec_name)
2903 with self.assertRaisesRegex(TypeError, msg): 3000 with self.assertRaisesRegex(TypeError, msg):
2904 b"bytes input".decode(self.codec_name) 3001 b"bytes input".decode(self.codec_name)
2905 3002
2906 3003
2907 3004
2908 @unittest.skipUnless(sys.platform == 'win32', 3005 @unittest.skipUnless(sys.platform == 'win32',
2909 'code pages are specific to Windows') 3006 'code pages are specific to Windows')
2910 class CodePageTest(unittest.TestCase): 3007 class CodePageTest(unittest.TestCase):
2911 # CP_UTF8 is already tested by CP65001Test 3008 # CP_UTF8 is already tested by CP65001Test
(...skipping 120 matching lines...) Expand 10 before | Expand all | Expand 10 after
3032 3129
3033 def test_multibyte_encoding(self): 3130 def test_multibyte_encoding(self):
3034 self.check_decode(932, ( 3131 self.check_decode(932, (
3035 (b'\x84\xe9\x80', 'ignore', '\u9a3e'), 3132 (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
3036 (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'), 3133 (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
3037 )) 3134 ))
3038 self.check_decode(self.CP_UTF8, ( 3135 self.check_decode(self.CP_UTF8, (
3039 (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'), 3136 (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
3040 (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'), 3137 (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
3041 )) 3138 ))
3042 if VISTA_OR_LATER: 3139 self.check_encode(self.CP_UTF8, (
3043 self.check_encode(self.CP_UTF8, ( 3140 ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
3044 ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'), 3141 ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
3045 ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'), 3142 ))
3046 ))
3047 3143
3048 def test_incremental(self): 3144 def test_incremental(self):
3049 decoded = codecs.code_page_decode(932, b'\x82', 'strict', False) 3145 decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
3050 self.assertEqual(decoded, ('', 0)) 3146 self.assertEqual(decoded, ('', 0))
3051 3147
3052 decoded = codecs.code_page_decode(932, 3148 decoded = codecs.code_page_decode(932,
3053 b'\xe9\x80\xe9', 'strict', 3149 b'\xe9\x80\xe9', 'strict',
3054 False) 3150 False)
3055 self.assertEqual(decoded, ('\u9a3e', 2)) 3151 self.assertEqual(decoded, ('\u9a3e', 2))
3056 3152
3057 decoded = codecs.code_page_decode(932, 3153 decoded = codecs.code_page_decode(932,
3058 b'\xe9\x80\xe9\x80', 'strict', 3154 b'\xe9\x80\xe9\x80', 'strict',
3059 False) 3155 False)
3060 self.assertEqual(decoded, ('\u9a3e\u9a3e', 4)) 3156 self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))
3061 3157
3062 decoded = codecs.code_page_decode(932, 3158 decoded = codecs.code_page_decode(932,
3063 b'abc', 'strict', 3159 b'abc', 'strict',
3064 False) 3160 False)
3065 self.assertEqual(decoded, ('abc', 3)) 3161 self.assertEqual(decoded, ('abc', 3))
3162
3163 def test_mbcs_alias(self):
3164 # Check that looking up our 'default' codepage will return
3165 # mbcs when we don't have a more specific one available
3166 import _bootlocale
3167 def _get_fake_codepage(*a):
3168 return 'cp123'
3169 old_getpreferredencoding = _bootlocale.getpreferredencoding
3170 _bootlocale.getpreferredencoding = _get_fake_codepage
3171 try:
3172 codec = codecs.lookup('cp123')
3173 self.assertEqual(codec.name, 'mbcs')
3174 finally:
3175 _bootlocale.getpreferredencoding = old_getpreferredencoding
3066 3176
3067 3177
3068 class ASCIITest(unittest.TestCase): 3178 class ASCIITest(unittest.TestCase):
3069 def test_encode(self): 3179 def test_encode(self):
3070 self.assertEqual('abc123'.encode('ascii'), b'abc123') 3180 self.assertEqual('abc123'.encode('ascii'), b'abc123')
3071 3181
3072 def test_encode_error(self): 3182 def test_encode_error(self):
3073 for data, error_handler, expected in ( 3183 for data, error_handler, expected in (
3074 ('[\x80\xff\u20ac]', 'ignore', b'[]'), 3184 ('[\x80\xff\u20ac]', 'ignore', b'[]'),
3075 ('[\x80\xff\u20ac]', 'replace', b'[???]'), 3185 ('[\x80\xff\u20ac]', 'replace', b'[???]'),
3076 ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'), 3186 ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
3077 ('[\x80\xff\u20ac]', 'backslashreplace', b'[\\x80\\xff\\u20ac]'), 3187 ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
3188 b'[\\x80\\xff\\u20ac\\U000abcde]'),
3078 ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'), 3189 ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
3079 ): 3190 ):
3080 with self.subTest(data=data, error_handler=error_handler, 3191 with self.subTest(data=data, error_handler=error_handler,
3081 expected=expected): 3192 expected=expected):
3082 self.assertEqual(data.encode('ascii', error_handler), 3193 self.assertEqual(data.encode('ascii', error_handler),
3083 expected) 3194 expected)
3084 3195
3085 def test_encode_surrogateescape_error(self): 3196 def test_encode_surrogateescape_error(self):
3086 with self.assertRaises(UnicodeEncodeError): 3197 with self.assertRaises(UnicodeEncodeError):
3087 # the first character can be decoded, but not the second 3198 # the first character can be decoded, but not the second
(...skipping 21 matching lines...) Expand all
3109 ('abc', b'abc'), 3220 ('abc', b'abc'),
3110 ('\x80\xe9\xff', b'\x80\xe9\xff'), 3221 ('\x80\xe9\xff', b'\x80\xe9\xff'),
3111 ): 3222 ):
3112 with self.subTest(data=data, expected=expected): 3223 with self.subTest(data=data, expected=expected):
3113 self.assertEqual(data.encode('latin1'), expected) 3224 self.assertEqual(data.encode('latin1'), expected)
3114 3225
3115 def test_encode_errors(self): 3226 def test_encode_errors(self):
3116 for data, error_handler, expected in ( 3227 for data, error_handler, expected in (
3117 ('[\u20ac\udc80]', 'ignore', b'[]'), 3228 ('[\u20ac\udc80]', 'ignore', b'[]'),
3118 ('[\u20ac\udc80]', 'replace', b'[??]'), 3229 ('[\u20ac\udc80]', 'replace', b'[??]'),
3119 ('[\u20ac\udc80]', 'backslashreplace', b'[\\u20ac\\udc80]'), 3230 ('[\u20ac\U000abcde]', 'backslashreplace',
3231 b'[\\u20ac\\U000abcde]'),
3120 ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'), 3232 ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
3121 ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'), 3233 ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
3122 ): 3234 ):
3123 with self.subTest(data=data, error_handler=error_handler, 3235 with self.subTest(data=data, error_handler=error_handler,
3124 expected=expected): 3236 expected=expected):
3125 self.assertEqual(data.encode('latin1', error_handler), 3237 self.assertEqual(data.encode('latin1', error_handler),
3126 expected) 3238 expected)
3127 3239
3128 def test_encode_surrogateescape_error(self): 3240 def test_encode_surrogateescape_error(self):
3129 with self.assertRaises(UnicodeEncodeError): 3241 with self.assertRaises(UnicodeEncodeError):
3130 # the first character can be decoded, but not the second 3242 # the first character can be decoded, but not the second
3131 '\udc80\u20ac'.encode('latin1', 'surrogateescape') 3243 '\udc80\u20ac'.encode('latin1', 'surrogateescape')
3132 3244
3133 def test_decode(self): 3245 def test_decode(self):
3134 for data, expected in ( 3246 for data, expected in (
3135 (b'abc', 'abc'), 3247 (b'abc', 'abc'),
3136 (b'[\x80\xff]', '[\x80\xff]'), 3248 (b'[\x80\xff]', '[\x80\xff]'),
3137 ): 3249 ):
3138 with self.subTest(data=data, expected=expected): 3250 with self.subTest(data=data, expected=expected):
3139 self.assertEqual(data.decode('latin1'), expected) 3251 self.assertEqual(data.decode('latin1'), expected)
3140 3252
3141 3253
3142 if __name__ == "__main__": 3254 if __name__ == "__main__":
3143 unittest.main() 3255 unittest.main()
LEFTRIGHT

RSS Feeds Recent Issues | This issue
This is Rietveld 894c83f36cb7+