Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(25246)

Delta Between Two Patch Sets: Lib/test/test_codecs.py

Issue 20538: Segfault in UTF-7 incremental decoder
Left Patch Set: Created 5 years, 7 months ago
Right Patch Set: Created 5 years, 7 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « no previous file | Objects/unicodeobject.c » ('j') | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFT | RIGHT
1 import codecs 1 import codecs
2 import contextlib
2 import io 3 import io
3 import locale 4 import locale
4 import sys 5 import sys
5 import unittest 6 import unittest
6 import warnings 7 import warnings
8 import encodings
7 9
8 from test import support 10 from test import support
9 11
10 if sys.platform == 'win32': 12 if sys.platform == 'win32':
11 VISTA_OR_LATER = (sys.getwindowsversion().major >= 6) 13 VISTA_OR_LATER = (sys.getwindowsversion().major >= 6)
12 else: 14 else:
13 VISTA_OR_LATER = False 15 VISTA_OR_LATER = False
14 16
15 try: 17 try:
16 import ctypes 18 import ctypes
(...skipping 317 matching lines...) Expand 10 before | Expand all | Expand 10 after
334 s = (s1+s2+s3+s4+s5).encode(self.encoding) 336 s = (s1+s2+s3+s4+s5).encode(self.encoding)
335 stream = io.BytesIO(s) 337 stream = io.BytesIO(s)
336 reader = codecs.getreader(self.encoding)(stream) 338 reader = codecs.getreader(self.encoding)(stream)
337 self.assertEqual(reader.readline(), s1) 339 self.assertEqual(reader.readline(), s1)
338 self.assertEqual(reader.readline(), s2) 340 self.assertEqual(reader.readline(), s2)
339 self.assertEqual(reader.readline(), s3) 341 self.assertEqual(reader.readline(), s3)
340 self.assertEqual(reader.readline(), s4) 342 self.assertEqual(reader.readline(), s4)
341 self.assertEqual(reader.readline(), s5) 343 self.assertEqual(reader.readline(), s5)
342 self.assertEqual(reader.readline(), "") 344 self.assertEqual(reader.readline(), "")
343 345
346 ill_formed_sequence_replace = "\ufffd"
347
348 def test_lone_surrogates(self):
349 self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
350 self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
351 "[\\udc80]".encode(self.encoding))
352 self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
353 "[&#56448;]".encode(self.encoding))
354 self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
355 "[]".encode(self.encoding))
356 self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
357 "[?]".encode(self.encoding))
358
359 bom = "".encode(self.encoding)
360 for before, after in [("\U00010fff", "A"), ("[", "]"),
361 ("A", "\U00010fff")]:
362 before_sequence = before.encode(self.encoding)[len(bom):]
363 after_sequence = after.encode(self.encoding)[len(bom):]
364 test_string = before + "\uDC80" + after
365 test_sequence = (bom + before_sequence +
366 self.ill_formed_sequence + after_sequence)
367 self.assertRaises(UnicodeDecodeError, test_sequence.decode,
368 self.encoding)
369 self.assertEqual(test_string.encode(self.encoding,
370 "surrogatepass"),
371 test_sequence)
372 self.assertEqual(test_sequence.decode(self.encoding,
373 "surrogatepass"),
374 test_string)
375 self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
376 before + after)
377 self.assertEqual(test_sequence.decode(self.encoding, "replace"),
378 before + self.ill_formed_sequence_replace + after)
379
344 class UTF32Test(ReadTest, unittest.TestCase): 380 class UTF32Test(ReadTest, unittest.TestCase):
345 encoding = "utf-32" 381 encoding = "utf-32"
382 if sys.byteorder == 'little':
383 ill_formed_sequence = b"\x80\xdc\x00\x00"
384 else:
385 ill_formed_sequence = b"\x00\x00\xdc\x80"
346 386
347 spamle = (b'\xff\xfe\x00\x00' 387 spamle = (b'\xff\xfe\x00\x00'
348 b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00' 388 b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
349 b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00') 389 b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
350 spambe = (b'\x00\x00\xfe\xff' 390 spambe = (b'\x00\x00\xfe\xff'
351 b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m' 391 b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
352 b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m') 392 b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')
353 393
354 def test_only_one_bom(self): 394 def test_only_one_bom(self):
355 _,_,reader,writer = codecs.lookup(self.encoding) 395 _,_,reader,writer = codecs.lookup(self.encoding)
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after
427 # surrogate pairs on UCS-2 builds. 467 # surrogate pairs on UCS-2 builds.
428 encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024 468 encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
429 self.assertEqual('\U00010000' * 1024, 469 self.assertEqual('\U00010000' * 1024,
430 codecs.utf_32_decode(encoded_le)[0]) 470 codecs.utf_32_decode(encoded_le)[0])
431 encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024 471 encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
432 self.assertEqual('\U00010000' * 1024, 472 self.assertEqual('\U00010000' * 1024,
433 codecs.utf_32_decode(encoded_be)[0]) 473 codecs.utf_32_decode(encoded_be)[0])
434 474
435 class UTF32LETest(ReadTest, unittest.TestCase): 475 class UTF32LETest(ReadTest, unittest.TestCase):
436 encoding = "utf-32-le" 476 encoding = "utf-32-le"
477 ill_formed_sequence = b"\x80\xdc\x00\x00"
437 478
438 def test_partial(self): 479 def test_partial(self):
439 self.check_partial( 480 self.check_partial(
440 "\x00\xff\u0100\uffff\U00010000", 481 "\x00\xff\u0100\uffff\U00010000",
441 [ 482 [
442 "", 483 "",
443 "", 484 "",
444 "", 485 "",
445 "\x00", 486 "\x00",
446 "\x00", 487 "\x00",
(...skipping 24 matching lines...) Expand all
471 512
472 def test_issue8941(self): 513 def test_issue8941(self):
473 # Issue #8941: insufficient result allocation when decoding into 514 # Issue #8941: insufficient result allocation when decoding into
474 # surrogate pairs on UCS-2 builds. 515 # surrogate pairs on UCS-2 builds.
475 encoded = b'\x00\x00\x01\x00' * 1024 516 encoded = b'\x00\x00\x01\x00' * 1024
476 self.assertEqual('\U00010000' * 1024, 517 self.assertEqual('\U00010000' * 1024,
477 codecs.utf_32_le_decode(encoded)[0]) 518 codecs.utf_32_le_decode(encoded)[0])
478 519
479 class UTF32BETest(ReadTest, unittest.TestCase): 520 class UTF32BETest(ReadTest, unittest.TestCase):
480 encoding = "utf-32-be" 521 encoding = "utf-32-be"
522 ill_formed_sequence = b"\x00\x00\xdc\x80"
481 523
482 def test_partial(self): 524 def test_partial(self):
483 self.check_partial( 525 self.check_partial(
484 "\x00\xff\u0100\uffff\U00010000", 526 "\x00\xff\u0100\uffff\U00010000",
485 [ 527 [
486 "", 528 "",
487 "", 529 "",
488 "", 530 "",
489 "\x00", 531 "\x00",
490 "\x00", 532 "\x00",
(...skipping 25 matching lines...) Expand all
516 def test_issue8941(self): 558 def test_issue8941(self):
517 # Issue #8941: insufficient result allocation when decoding into 559 # Issue #8941: insufficient result allocation when decoding into
518 # surrogate pairs on UCS-2 builds. 560 # surrogate pairs on UCS-2 builds.
519 encoded = b'\x00\x01\x00\x00' * 1024 561 encoded = b'\x00\x01\x00\x00' * 1024
520 self.assertEqual('\U00010000' * 1024, 562 self.assertEqual('\U00010000' * 1024,
521 codecs.utf_32_be_decode(encoded)[0]) 563 codecs.utf_32_be_decode(encoded)[0])
522 564
523 565
524 class UTF16Test(ReadTest, unittest.TestCase): 566 class UTF16Test(ReadTest, unittest.TestCase):
525 encoding = "utf-16" 567 encoding = "utf-16"
568 if sys.byteorder == 'little':
569 ill_formed_sequence = b"\x80\xdc"
570 else:
571 ill_formed_sequence = b"\xdc\x80"
526 572
527 spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00' 573 spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
528 spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m' 574 spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
529 575
530 def test_only_one_bom(self): 576 def test_only_one_bom(self):
531 _,_,reader,writer = codecs.lookup(self.encoding) 577 _,_,reader,writer = codecs.lookup(self.encoding)
532 # encode some stream 578 # encode some stream
533 s = io.BytesIO() 579 s = io.BytesIO()
534 f = writer(s) 580 f = writer(s)
535 f.write("spam") 581 f.write("spam")
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after
591 def test_bug691291(self): 637 def test_bug691291(self):
592 # Files are always opened in binary mode, even if no binary mode was 638 # Files are always opened in binary mode, even if no binary mode was
593 # specified. This means that no automatic conversion of '\n' is done 639 # specified. This means that no automatic conversion of '\n' is done
594 # on reading and writing. 640 # on reading and writing.
595 s1 = 'Hello\r\nworld\r\n' 641 s1 = 'Hello\r\nworld\r\n'
596 642
597 s = s1.encode(self.encoding) 643 s = s1.encode(self.encoding)
598 self.addCleanup(support.unlink, support.TESTFN) 644 self.addCleanup(support.unlink, support.TESTFN)
599 with open(support.TESTFN, 'wb') as fp: 645 with open(support.TESTFN, 'wb') as fp:
600 fp.write(s) 646 fp.write(s)
601 with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader: 647 with support.check_warnings(('', DeprecationWarning)):
648 reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
649 with reader:
602 self.assertEqual(reader.read(), s1) 650 self.assertEqual(reader.read(), s1)
603 651
604 class UTF16LETest(ReadTest, unittest.TestCase): 652 class UTF16LETest(ReadTest, unittest.TestCase):
605 encoding = "utf-16-le" 653 encoding = "utf-16-le"
654 ill_formed_sequence = b"\x80\xdc"
606 655
607 def test_partial(self): 656 def test_partial(self):
608 self.check_partial( 657 self.check_partial(
609 "\x00\xff\u0100\uffff\U00010000", 658 "\x00\xff\u0100\uffff\U00010000",
610 [ 659 [
611 "", 660 "",
612 "\x00", 661 "\x00",
613 "\x00", 662 "\x00",
614 "\x00\xff", 663 "\x00\xff",
615 "\x00\xff", 664 "\x00\xff",
(...skipping 23 matching lines...) Expand all
639 self.assertEqual(raw.decode('utf-16le', 'replace'), expected) 688 self.assertEqual(raw.decode('utf-16le', 'replace'), expected)
640 689
641 def test_nonbmp(self): 690 def test_nonbmp(self):
642 self.assertEqual("\U00010203".encode(self.encoding), 691 self.assertEqual("\U00010203".encode(self.encoding),
643 b'\x00\xd8\x03\xde') 692 b'\x00\xd8\x03\xde')
644 self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding), 693 self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
645 "\U00010203") 694 "\U00010203")
646 695
647 class UTF16BETest(ReadTest, unittest.TestCase): 696 class UTF16BETest(ReadTest, unittest.TestCase):
648 encoding = "utf-16-be" 697 encoding = "utf-16-be"
698 ill_formed_sequence = b"\xdc\x80"
649 699
650 def test_partial(self): 700 def test_partial(self):
651 self.check_partial( 701 self.check_partial(
652 "\x00\xff\u0100\uffff\U00010000", 702 "\x00\xff\u0100\uffff\U00010000",
653 [ 703 [
654 "", 704 "",
655 "\x00", 705 "\x00",
656 "\x00", 706 "\x00",
657 "\x00\xff", 707 "\x00\xff",
658 "\x00\xff", 708 "\x00\xff",
(...skipping 23 matching lines...) Expand all
682 self.assertEqual(raw.decode('utf-16be', 'replace'), expected) 732 self.assertEqual(raw.decode('utf-16be', 'replace'), expected)
683 733
684 def test_nonbmp(self): 734 def test_nonbmp(self):
685 self.assertEqual("\U00010203".encode(self.encoding), 735 self.assertEqual("\U00010203".encode(self.encoding),
686 b'\xd8\x00\xde\x03') 736 b'\xd8\x00\xde\x03')
687 self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding), 737 self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
688 "\U00010203") 738 "\U00010203")
689 739
690 class UTF8Test(ReadTest, unittest.TestCase): 740 class UTF8Test(ReadTest, unittest.TestCase):
691 encoding = "utf-8" 741 encoding = "utf-8"
742 ill_formed_sequence = b"\xed\xb2\x80"
743 ill_formed_sequence_replace = "\ufffd" * 3
692 744
693 def test_partial(self): 745 def test_partial(self):
694 self.check_partial( 746 self.check_partial(
695 "\x00\xff\u07ff\u0800\uffff\U00010000", 747 "\x00\xff\u07ff\u0800\uffff\U00010000",
696 [ 748 [
697 "\x00", 749 "\x00",
698 "\x00", 750 "\x00",
699 "\x00\xff", 751 "\x00\xff",
700 "\x00\xff", 752 "\x00\xff",
701 "\x00\xff\u07ff", 753 "\x00\xff\u07ff",
702 "\x00\xff\u07ff", 754 "\x00\xff\u07ff",
703 "\x00\xff\u07ff", 755 "\x00\xff\u07ff",
704 "\x00\xff\u07ff\u0800", 756 "\x00\xff\u07ff\u0800",
705 "\x00\xff\u07ff\u0800", 757 "\x00\xff\u07ff\u0800",
706 "\x00\xff\u07ff\u0800", 758 "\x00\xff\u07ff\u0800",
707 "\x00\xff\u07ff\u0800\uffff", 759 "\x00\xff\u07ff\u0800\uffff",
708 "\x00\xff\u07ff\u0800\uffff", 760 "\x00\xff\u07ff\u0800\uffff",
709 "\x00\xff\u07ff\u0800\uffff", 761 "\x00\xff\u07ff\u0800\uffff",
710 "\x00\xff\u07ff\u0800\uffff", 762 "\x00\xff\u07ff\u0800\uffff",
711 "\x00\xff\u07ff\u0800\uffff\U00010000", 763 "\x00\xff\u07ff\u0800\uffff\U00010000",
712 ] 764 ]
713 ) 765 )
714 766
715 def test_decoder_state(self): 767 def test_decoder_state(self):
716 u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff" 768 u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
717 self.check_state_handling_decode(self.encoding, 769 self.check_state_handling_decode(self.encoding,
718 u, u.encode(self.encoding)) 770 u, u.encode(self.encoding))
719 771
720 def test_lone_surrogates(self): 772 def test_lone_surrogates(self):
721 self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8") 773 super().test_lone_surrogates()
722 self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8") 774 # not sure if this is making sense for
723 self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"), 775 # UTF-16 and UTF-32
724 b'[\\udc80]') 776 self.assertEqual("[\uDC80]".encode('utf-8', "surrogateescape"),
725 self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
726 b'[&#56448;]')
727 self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
728 b'[\x80]') 777 b'[\x80]')
729 self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
730 b'[]')
731 self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
732 b'[?]')
733 778
734 def test_surrogatepass_handler(self): 779 def test_surrogatepass_handler(self):
735 self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"), 780 self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
736 b"abc\xed\xa0\x80def") 781 b"abc\xed\xa0\x80def")
737 self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"), 782 self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
738 "abc\ud800def") 783 "abc\ud800def")
739 self.assertEqual("\U00010fff\uD800".encode("utf-8", "surrogatepass"), 784 self.assertEqual("\U00010fff\uD800".encode("utf-8", "surrogatepass"),
740 b"\xf0\x90\xbf\xbf\xed\xa0\x80") 785 b"\xf0\x90\xbf\xbf\xed\xa0\x80")
741 self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("utf-8", "surrogatepass"), 786 self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("utf-8", "surrogatepass"),
742 "\U00010fff\uD800") 787 "\U00010fff\uD800")
(...skipping 159 matching lines...) Expand 10 before | Expand all | Expand 10 after
902 (b'a+IKwgr,', 'a\u20ac\ufffd'), 947 (b'a+IKwgr,', 'a\u20ac\ufffd'),
903 (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'), 948 (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
904 (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'), 949 (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
905 (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'), 950 (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
906 (b'a+/,+IKw-b', 'a\ufffd\u20acb'), 951 (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
907 (b'a+//,+IKw-b', 'a\ufffd\u20acb'), 952 (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
908 (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'), 953 (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
909 (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'), 954 (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
910 ] 955 ]
911 for raw, expected in tests: 956 for raw, expected in tests:
912 self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode, 957 with self.subTest(raw=raw):
913 raw, 'strict', True) 958 self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
914 self.assertEqual(raw.decode('utf-7', 'replace'), expected) 959 raw, 'strict', True)
960 self.assertEqual(raw.decode('utf-7', 'replace'), expected)
915 961
916 def test_nonbmp(self): 962 def test_nonbmp(self):
917 self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-') 963 self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
918 self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-') 964 self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
919 self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0') 965 self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
920 966
967 test_lone_surrogates = None
968
969
921 class UTF16ExTest(unittest.TestCase): 970 class UTF16ExTest(unittest.TestCase):
922 971
923 def test_errors(self): 972 def test_errors(self):
924 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, b"\xff", "strict", 0, True) 973 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, b"\xff", "strict", 0, True)
925 974
926 def test_bad_args(self): 975 def test_bad_args(self):
927 self.assertRaises(TypeError, codecs.utf_16_ex_decode) 976 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
928 977
929 class ReadBufferTest(unittest.TestCase): 978 class ReadBufferTest(unittest.TestCase):
930 979
931 def test_array(self): 980 def test_array(self):
932 import array 981 import array
933 self.assertEqual( 982 self.assertEqual(
934 codecs.readbuffer_encode(array.array("b", b"spam")), 983 codecs.readbuffer_encode(array.array("b", b"spam")),
935 (b"spam", 4) 984 (b"spam", 4)
936 ) 985 )
937 986
938 def test_empty(self): 987 def test_empty(self):
939 self.assertEqual(codecs.readbuffer_encode(""), (b"", 0)) 988 self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))
940 989
941 def test_bad_args(self): 990 def test_bad_args(self):
942 self.assertRaises(TypeError, codecs.readbuffer_encode) 991 self.assertRaises(TypeError, codecs.readbuffer_encode)
943 self.assertRaises(TypeError, codecs.readbuffer_encode, 42) 992 self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
944 993
945 class UTF8SigTest(ReadTest, unittest.TestCase): 994 class UTF8SigTest(UTF8Test, unittest.TestCase):
946 encoding = "utf-8-sig" 995 encoding = "utf-8-sig"
947 996
948 def test_partial(self): 997 def test_partial(self):
949 self.check_partial( 998 self.check_partial(
950 "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000", 999 "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
951 [ 1000 [
952 "", 1001 "",
953 "", 1002 "",
954 "", # First BOM has been read and skipped 1003 "", # First BOM has been read and skipped
955 "", 1004 "",
(...skipping 661 matching lines...) Expand 10 before | Expand all | Expand 10 after
1617 self.assertEqual(f.getvalue(), b'\xfc') 1666 self.assertEqual(f.getvalue(), b'\xfc')
1618 1667
1619 all_unicode_encodings = [ 1668 all_unicode_encodings = [
1620 "ascii", 1669 "ascii",
1621 "big5", 1670 "big5",
1622 "big5hkscs", 1671 "big5hkscs",
1623 "charmap", 1672 "charmap",
1624 "cp037", 1673 "cp037",
1625 "cp1006", 1674 "cp1006",
1626 "cp1026", 1675 "cp1026",
1676 "cp1125",
1627 "cp1140", 1677 "cp1140",
1628 "cp1250", 1678 "cp1250",
1629 "cp1251", 1679 "cp1251",
1630 "cp1252", 1680 "cp1252",
1631 "cp1253", 1681 "cp1253",
1632 "cp1254", 1682 "cp1254",
1633 "cp1255", 1683 "cp1255",
1634 "cp1256", 1684 "cp1256",
1635 "cp1257", 1685 "cp1257",
1636 "cp1258", 1686 "cp1258",
(...skipping 722 matching lines...) Expand 10 before | Expand all | Expand 10 after
2359 f.seek(0) 2409 f.seek(0)
2360 self.assertEqual(f.read(), data * 2) 2410 self.assertEqual(f.read(), data * 2)
2361 2411
2362 2412
2363 bytes_transform_encodings = [ 2413 bytes_transform_encodings = [
2364 "base64_codec", 2414 "base64_codec",
2365 "uu_codec", 2415 "uu_codec",
2366 "quopri_codec", 2416 "quopri_codec",
2367 "hex_codec", 2417 "hex_codec",
2368 ] 2418 ]
2419
2420 transform_aliases = {
2421 "base64_codec": ["base64", "base_64"],
2422 "uu_codec": ["uu"],
2423 "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
2424 "hex_codec": ["hex"],
2425 "rot_13": ["rot13"],
2426 }
2427
2369 try: 2428 try:
2370 import zlib 2429 import zlib
2371 except ImportError: 2430 except ImportError:
2372 pass 2431 zlib = None
2373 else: 2432 else:
2374 bytes_transform_encodings.append("zlib_codec") 2433 bytes_transform_encodings.append("zlib_codec")
2434 transform_aliases["zlib_codec"] = ["zip", "zlib"]
2375 try: 2435 try:
2376 import bz2 2436 import bz2
2377 except ImportError: 2437 except ImportError:
2378 pass 2438 pass
2379 else: 2439 else:
2380 bytes_transform_encodings.append("bz2_codec") 2440 bytes_transform_encodings.append("bz2_codec")
2441 transform_aliases["bz2_codec"] = ["bz2"]
2381 2442
2382 class TransformCodecTest(unittest.TestCase): 2443 class TransformCodecTest(unittest.TestCase):
2383 2444
2384 def test_basics(self): 2445 def test_basics(self):
2385 binput = bytes(range(256)) 2446 binput = bytes(range(256))
2386 for encoding in bytes_transform_encodings: 2447 for encoding in bytes_transform_encodings:
2387 # generic codecs interface 2448 with self.subTest(encoding=encoding):
2388 (o, size) = codecs.getencoder(encoding)(binput) 2449 # generic codecs interface
2389 self.assertEqual(size, len(binput)) 2450 (o, size) = codecs.getencoder(encoding)(binput)
2390 (i, size) = codecs.getdecoder(encoding)(o) 2451 self.assertEqual(size, len(binput))
2391 self.assertEqual(size, len(o)) 2452 (i, size) = codecs.getdecoder(encoding)(o)
2392 self.assertEqual(i, binput) 2453 self.assertEqual(size, len(o))
2454 self.assertEqual(i, binput)
2393 2455
2394 def test_read(self): 2456 def test_read(self):
2395 for encoding in bytes_transform_encodings: 2457 for encoding in bytes_transform_encodings:
2396 sin = codecs.encode(b"\x80", encoding) 2458 with self.subTest(encoding=encoding):
2397 reader = codecs.getreader(encoding)(io.BytesIO(sin)) 2459 sin = codecs.encode(b"\x80", encoding)
2398 sout = reader.read() 2460 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2399 self.assertEqual(sout, b"\x80") 2461 sout = reader.read()
2462 self.assertEqual(sout, b"\x80")
2400 2463
2401 def test_readline(self): 2464 def test_readline(self):
2402 for encoding in bytes_transform_encodings: 2465 for encoding in bytes_transform_encodings:
2403 sin = codecs.encode(b"\x80", encoding) 2466 with self.subTest(encoding=encoding):
2404 reader = codecs.getreader(encoding)(io.BytesIO(sin)) 2467 sin = codecs.encode(b"\x80", encoding)
2405 sout = reader.readline() 2468 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2406 self.assertEqual(sout, b"\x80") 2469 sout = reader.readline()
2470 self.assertEqual(sout, b"\x80")
2471
2472 def test_buffer_api_usage(self):
2473 # We check all the transform codecs accept memoryview input
2474 # for encoding and decoding
2475 # and also that they roundtrip correctly
2476 original = b"12345\x80"
2477 for encoding in bytes_transform_encodings:
2478 with self.subTest(encoding=encoding):
2479 data = original
2480 view = memoryview(data)
2481 data = codecs.encode(data, encoding)
2482 view_encoded = codecs.encode(view, encoding)
2483 self.assertEqual(view_encoded, data)
2484 view = memoryview(data)
2485 data = codecs.decode(data, encoding)
2486 self.assertEqual(data, original)
2487 view_decoded = codecs.decode(view, encoding)
2488 self.assertEqual(view_decoded, data)
2489
2490 def test_text_to_binary_blacklists_binary_transforms(self):
2491 # Check binary -> binary codecs give a good error for str input
2492 bad_input = "bad input type"
2493 for encoding in bytes_transform_encodings:
2494 with self.subTest(encoding=encoding):
2495 fmt = ( "{!r} is not a text encoding; "
2496 "use codecs.encode\(\) to handle arbitrary codecs")
2497 msg = fmt.format(encoding)
2498 with self.assertRaisesRegex(LookupError, msg) as failure:
2499 bad_input.encode(encoding)
2500 self.assertIsNone(failure.exception.__cause__)
2501
2502 def test_text_to_binary_blacklists_text_transforms(self):
2503 # Check str.encode gives a good error message for str -> str codecs
2504 msg = (r"^'rot_13' is not a text encoding; "
2505 "use codecs.encode\(\) to handle arbitrary codecs")
2506 with self.assertRaisesRegex(LookupError, msg):
2507 "just an example message".encode("rot_13")
2508
2509 def test_binary_to_text_blacklists_binary_transforms(self):
2510 # Check bytes.decode and bytearray.decode give a good error
2511 # message for binary -> binary codecs
2512 data = b"encode first to ensure we meet any format restrictions"
2513 for encoding in bytes_transform_encodings:
2514 with self.subTest(encoding=encoding):
2515 encoded_data = codecs.encode(data, encoding)
2516 fmt = (r"{!r} is not a text encoding; "
2517 "use codecs.decode\(\) to handle arbitrary codecs")
2518 msg = fmt.format(encoding)
2519 with self.assertRaisesRegex(LookupError, msg):
2520 encoded_data.decode(encoding)
2521 with self.assertRaisesRegex(LookupError, msg):
2522 bytearray(encoded_data).decode(encoding)
2523
2524 def test_binary_to_text_blacklists_text_transforms(self):
2525 # Check str -> str codec gives a good error for binary input
2526 for bad_input in (b"immutable", bytearray(b"mutable")):
2527 with self.subTest(bad_input=bad_input):
2528 msg = (r"^'rot_13' is not a text encoding; "
2529 "use codecs.decode\(\) to handle arbitrary codecs")
2530 with self.assertRaisesRegex(LookupError, msg) as failure:
2531 bad_input.decode("rot_13")
2532 self.assertIsNone(failure.exception.__cause__)
2533
2534 @unittest.skipUnless(zlib, "Requires zlib support")
2535 def test_custom_zlib_error_is_wrapped(self):
2536 # Check zlib codec gives a good error for malformed input
2537 msg = "^decoding with 'zlib_codec' codec failed"
2538 with self.assertRaisesRegex(Exception, msg) as failure:
2539 codecs.decode(b"hello", "zlib_codec")
2540 self.assertIsInstance(failure.exception.__cause__,
2541 type(failure.exception))
2542
2543 def test_custom_hex_error_is_wrapped(self):
2544 # Check hex codec gives a good error for malformed input
2545 msg = "^decoding with 'hex_codec' codec failed"
2546 with self.assertRaisesRegex(Exception, msg) as failure:
2547 codecs.decode(b"hello", "hex_codec")
2548 self.assertIsInstance(failure.exception.__cause__,
2549 type(failure.exception))
2550
2551 # Unfortunately, the bz2 module throws OSError, which the codec
2552 # machinery currently can't wrap :(
2553
2554 # Ensure codec aliases from http://bugs.python.org/issue7475 work
2555 def test_aliases(self):
2556 for codec_name, aliases in transform_aliases.items():
2557 expected_name = codecs.lookup(codec_name).name
2558 for alias in aliases:
2559 with self.subTest(alias=alias):
2560 info = codecs.lookup(alias)
2561 self.assertEqual(info.name, expected_name)
2562
2563
2564 # The codec system tries to wrap exceptions in order to ensure the error
2565 # mentions the operation being performed and the codec involved. We
2566 # currently *only* want this to happen for relatively stateless
2567 # exceptions, where the only significant information they contain is their
2568 # type and a single str argument.
2569
2570 # Use a local codec registry to avoid appearing to leak objects when
2571 # registering multiple search functions
2572 _TEST_CODECS = {}
2573
2574 def _get_test_codec(codec_name):
2575 return _TEST_CODECS.get(codec_name)
2576 codecs.register(_get_test_codec) # Returns None, not usable as a decorator
2577
2578 class ExceptionChainingTest(unittest.TestCase):
2579
2580 def setUp(self):
2581 # There's no way to unregister a codec search function, so we just
2582 # ensure we render this one fairly harmless after the test
2583 # case finishes by using the test case repr as the codec name
2584 # The codecs module normalizes codec names, although this doesn't
2585 # appear to be formally documented...
2586 # We also make sure we use a truly unique id for the custom codec
2587 # to avoid issues with the codec cache when running these tests
2588 # multiple times (e.g. when hunting for refleaks)
2589 unique_id = repr(self) + str(id(self))
2590 self.codec_name = encodings.normalize_encoding(unique_id).lower()
2591
2592 # We store the object to raise on the instance because of a bad
2593 # interaction between the codec caching (which means we can't
2594 # recreate the codec entry) and regrtest refleak hunting (which
2595 # runs the same test instance multiple times). This means we
2596 # need to ensure the codecs call back in to the instance to find
2597 # out which exception to raise rather than binding them in a
2598 # closure to an object that may change on the next run
2599 self.obj_to_raise = RuntimeError
2600
2601 def tearDown(self):
2602 _TEST_CODECS.pop(self.codec_name, None)
2603
2604 def set_codec(self, encode, decode):
2605 codec_info = codecs.CodecInfo(encode, decode,
2606 name=self.codec_name)
2607 _TEST_CODECS[self.codec_name] = codec_info
2608
2609 @contextlib.contextmanager
2610 def assertWrapped(self, operation, exc_type, msg):
2611 full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
2612 operation, self.codec_name, exc_type.__name__, msg)
2613 with self.assertRaisesRegex(exc_type, full_msg) as caught:
2614 yield caught
2615 self.assertIsInstance(caught.exception.__cause__, exc_type)
2616 self.assertIsNotNone(caught.exception.__cause__.__traceback__)
2617
2618 def raise_obj(self, *args, **kwds):
2619 # Helper to dynamically change the object raised by a test codec
2620 raise self.obj_to_raise
2621
2622 def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
2623 self.obj_to_raise = obj_to_raise
2624 self.set_codec(self.raise_obj, self.raise_obj)
2625 with self.assertWrapped("encoding", exc_type, msg):
2626 "str_input".encode(self.codec_name)
2627 with self.assertWrapped("encoding", exc_type, msg):
2628 codecs.encode("str_input", self.codec_name)
2629 with self.assertWrapped("decoding", exc_type, msg):
2630 b"bytes input".decode(self.codec_name)
2631 with self.assertWrapped("decoding", exc_type, msg):
2632 codecs.decode(b"bytes input", self.codec_name)
2633
2634 def test_raise_by_type(self):
2635 self.check_wrapped(RuntimeError, "")
2636
2637 def test_raise_by_value(self):
2638 msg = "This should be wrapped"
2639 self.check_wrapped(RuntimeError(msg), msg)
2640
2641 def test_raise_grandchild_subclass_exact_size(self):
2642 msg = "This should be wrapped"
2643 class MyRuntimeError(RuntimeError):
2644 __slots__ = ()
2645 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2646
2647 def test_raise_subclass_with_weakref_support(self):
2648 msg = "This should be wrapped"
2649 class MyRuntimeError(RuntimeError):
2650 pass
2651 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2652
2653 def check_not_wrapped(self, obj_to_raise, msg):
2654 def raise_obj(*args, **kwds):
2655 raise obj_to_raise
2656 self.set_codec(raise_obj, raise_obj)
2657 with self.assertRaisesRegex(RuntimeError, msg):
2658 "str input".encode(self.codec_name)
2659 with self.assertRaisesRegex(RuntimeError, msg):
2660 codecs.encode("str input", self.codec_name)
2661 with self.assertRaisesRegex(RuntimeError, msg):
2662 b"bytes input".decode(self.codec_name)
2663 with self.assertRaisesRegex(RuntimeError, msg):
2664 codecs.decode(b"bytes input", self.codec_name)
2665
2666 def test_init_override_is_not_wrapped(self):
2667 class CustomInit(RuntimeError):
2668 def __init__(self):
2669 pass
2670 self.check_not_wrapped(CustomInit, "")
2671
2672 def test_new_override_is_not_wrapped(self):
2673 class CustomNew(RuntimeError):
2674 def __new__(cls):
2675 return super().__new__(cls)
2676 self.check_not_wrapped(CustomNew, "")
2677
2678 def test_instance_attribute_is_not_wrapped(self):
2679 msg = "This should NOT be wrapped"
2680 exc = RuntimeError(msg)
2681 exc.attr = 1
2682 self.check_not_wrapped(exc, "^{}$".format(msg))
2683
2684 def test_non_str_arg_is_not_wrapped(self):
2685 self.check_not_wrapped(RuntimeError(1), "1")
2686
2687 def test_multiple_args_is_not_wrapped(self):
2688 msg_re = r"^\('a', 'b', 'c'\)$"
2689 self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)
2690
2691 # http://bugs.python.org/issue19609
2692 def test_codec_lookup_failure_not_wrapped(self):
2693 msg = "^unknown encoding: {}$".format(self.codec_name)
2694 # The initial codec lookup should not be wrapped
2695 with self.assertRaisesRegex(LookupError, msg):
2696 "str input".encode(self.codec_name)
2697 with self.assertRaisesRegex(LookupError, msg):
2698 codecs.encode("str input", self.codec_name)
2699 with self.assertRaisesRegex(LookupError, msg):
2700 b"bytes input".decode(self.codec_name)
2701 with self.assertRaisesRegex(LookupError, msg):
2702 codecs.decode(b"bytes input", self.codec_name)
2703
2704 def test_unflagged_non_text_codec_handling(self):
2705 # The stdlib non-text codecs are now marked so they're
2706 # pre-emptively skipped by the text model related methods.
2707 # However, third-party codecs won't be flagged, so we still make
2708 # sure the case where an inappropriate output type is produced is
2709 # handled appropriately.
2710 def encode_to_str(*args, **kwds):
2711 return "not bytes!", 0
2712 def decode_to_bytes(*args, **kwds):
2713 return b"not str!", 0
2714 self.set_codec(encode_to_str, decode_to_bytes)
2715 # No input or output type checks on the codecs module functions
2716 encoded = codecs.encode(None, self.codec_name)
2717 self.assertEqual(encoded, "not bytes!")
2718 decoded = codecs.decode(None, self.codec_name)
2719 self.assertEqual(decoded, b"not str!")
2720 # Text model methods should complain
2721 fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
2722 "use codecs.encode\(\) to encode to arbitrary types$")
2723 msg = fmt.format(self.codec_name)
2724 with self.assertRaisesRegex(TypeError, msg):
2725 "str_input".encode(self.codec_name)
2726 fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
2727 "use codecs.decode\(\) to decode to arbitrary types$")
2728 msg = fmt.format(self.codec_name)
2729 with self.assertRaisesRegex(TypeError, msg):
2730 b"bytes input".decode(self.codec_name)
2731
2407 2732
2408 2733
2409 @unittest.skipUnless(sys.platform == 'win32', 2734 @unittest.skipUnless(sys.platform == 'win32',
2410 'code pages are specific to Windows') 2735 'code pages are specific to Windows')
2411 class CodePageTest(unittest.TestCase): 2736 class CodePageTest(unittest.TestCase):
2412 # CP_UTF8 is already tested by CP65001Test 2737 # CP_UTF8 is already tested by CP65001Test
2413 CP_UTF8 = 65001 2738 CP_UTF8 = 65001
2414 2739
2415 def test_invalid_code_page(self): 2740 def test_invalid_code_page(self):
2416 self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a') 2741 self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
2417 self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a') 2742 self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
2418 self.assertRaises(WindowsError, codecs.code_page_encode, 123, 'a') 2743 self.assertRaises(OSError, codecs.code_page_encode, 123, 'a')
2419 self.assertRaises(WindowsError, codecs.code_page_decode, 123, b'a') 2744 self.assertRaises(OSError, codecs.code_page_decode, 123, b'a')
2420 2745
2421 def test_code_page_name(self): 2746 def test_code_page_name(self):
2422 self.assertRaisesRegex(UnicodeEncodeError, 'cp932', 2747 self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
2423 codecs.code_page_encode, 932, '\xff') 2748 codecs.code_page_encode, 932, '\xff')
2424 self.assertRaisesRegex(UnicodeDecodeError, 'cp932', 2749 self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
2425 codecs.code_page_decode, 932, b'\x81\x00') 2750 codecs.code_page_decode, 932, b'\x81\x00')
2426 self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8', 2751 self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
2427 codecs.code_page_decode, self.CP_UTF8, b'\xff') 2752 codecs.code_page_decode, self.CP_UTF8, b'\xff')
2428 2753
2429 def check_decode(self, cp, tests): 2754 def check_decode(self, cp, tests):
(...skipping 120 matching lines...) Expand 10 before | Expand all | Expand 10 after
2550 self.assertEqual(decoded, ('\u9a3e\u9a3e', 4)) 2875 self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))
2551 2876
2552 decoded = codecs.code_page_decode(932, 2877 decoded = codecs.code_page_decode(932,
2553 b'abc', 'strict', 2878 b'abc', 'strict',
2554 False) 2879 False)
2555 self.assertEqual(decoded, ('abc', 3)) 2880 self.assertEqual(decoded, ('abc', 3))
2556 2881
2557 2882
2558 if __name__ == "__main__": 2883 if __name__ == "__main__":
2559 unittest.main() 2884 unittest.main()
LEFTRIGHT

RSS Feeds Recent Issues | This issue
This is Rietveld 894c83f36cb7+