diff -r 48943533965e Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py Sun Sep 27 22:38:33 2015 +0300 +++ b/Lib/test/test_codecs.py Mon Sep 28 01:13:40 2015 +0300 @@ -910,6 +910,32 @@ class CP65001Test(ReadTest, unittest.Tes class UTF7Test(ReadTest, unittest.TestCase): encoding = "utf-7" + def test_ascii(self): + # Set D (directly encoded characters) + set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' + 'abcdefghijklmnopqrstuvwxyz' + '0123456789' + '\'(),-./:?') + self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii')) + self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d) + # Set O (optional direct characters) + set_o = ' !"#$%&*;<=>@[]^_`{|}' + self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii')) + self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o) + # + + self.assertEqual('a+b'.encode(self.encoding), b'a+-b') + self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b') + # White spaces + ws = ' \t\n\r' + self.assertEqual(ws.encode(self.encoding), ws.encode('ascii')) + self.assertEqual(ws.encode('ascii').decode(self.encoding), ws) + # Other ASCII characters + other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) - + set(set_d + set_o + '+' + ws))) + self.assertEqual(other_ascii.encode(self.encoding), + b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU' + b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-') + def test_partial(self): self.check_partial( 'a+-b\x00c\x80d\u0100e\U00010000f', @@ -951,7 +977,9 @@ class UTF7Test(ReadTest, unittest.TestCa def test_errors(self): tests = [ + (b'\xffb', '\ufffdb'), (b'a\xffb', 'a\ufffdb'), + (b'a\xff\xffb', 'a\ufffd\ufffdb'), (b'a+IK', 'a\ufffd'), (b'a+IK-b', 'a\ufffdb'), (b'a+IK,b', 'a\ufffdb'), @@ -967,6 +995,8 @@ class UTF7Test(ReadTest, unittest.TestCa (b'a+//,+IKw-b', 'a\ufffd\u20acb'), (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'), (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'), + (b'a+IKw-b\xff', 'a\u20acb\ufffd'), + (b'a+IKw\xffb', 'a\u20ac\ufffdb'), ] for raw, expected in tests: with self.subTest(raw=raw): @@ -978,8 +1008,36 @@ class UTF7Test(ReadTest, unittest.TestCa self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-') self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-') self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0') - - test_lone_surrogates = None + self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0') + self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-') + self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0') + self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0') + self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding), + b'+IKwgrNgB3KA-') + self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding), + '\u20ac\u20ac\U000104A0') + self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding), + '\u20ac\u20ac\U000104A0') + + def test_lone_surrogates(self): + tests = [ + (b'a+2AE-b', 'a\ud801b'), + (b'a+2AE\xffb', 'a\ufffdb'), + (b'a+2AE', 'a\ufffd'), + (b'a+2AEA-b', 'a\ufffdb'), + (b'a+2AH-b', 'a\ufffdb'), + (b'a+IKzYAQ-b', 'a\u20ac\ud801b'), + (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'), + (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'), + (b'a+IKzYAd-b', 'a\u20ac\ufffdb'), + (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'), + (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'), + (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'), + (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'), + ] + for raw, expected in tests: + with self.subTest(raw=raw): + self.assertEqual(raw.decode('utf-7', 'replace'), expected) class UTF16ExTest(unittest.TestCase): diff -r 48943533965e Lib/test/test_unicode.py --- a/Lib/test/test_unicode.py Sun Sep 27 22:38:33 2015 +0300 +++ b/Lib/test/test_unicode.py Mon Sep 28 01:13:40 2015 +0300 @@ -1553,7 +1553,7 @@ class UnicodeTest(string_tests.CommonTes self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde') # Issue #2242: crash on some Windows/MSVC versions - self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1') + self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '') # Direct encoded characters set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?" @@ -1995,6 +1995,7 @@ class UnicodeTest(string_tests.CommonTes self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict') self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x") self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x') + self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x') # Error handling (unknown character names) self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx") diff -r 48943533965e Objects/unicodeobject.c --- a/Objects/unicodeobject.c Sun Sep 27 22:38:33 2015 +0300 +++ b/Objects/unicodeobject.c Mon Sep 28 01:13:40 2015 +0300 @@ -4357,31 +4357,31 @@ PyUnicode_DecodeUTF7Stateful(const char } else { /* now leaving a base-64 section */ inShift = 0; - s++; - if (surrogate) { - if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0) - goto onError; - surrogate = 0; - } if (base64bits > 0) { /* left-over bits */ if (base64bits >= 6) { /* We've seen at least one base-64 character */ + s++; errmsg = "partial character in shift sequence"; goto utf7Error; } else { /* Some bits remain; they should be zero */ if (base64buffer != 0) { + s++; errmsg = "non-zero padding bits in shift sequence"; goto utf7Error; } } } - if (ch != '-') { + if (surrogate && DECODE_DIRECT(ch)) { + if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0) + goto onError; + } + surrogate = 0; + if (ch == '-') { /* '-' is absorbed; other terminating characters are preserved */ - if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) - goto onError; + s++; } } } @@ -4395,6 +4395,7 @@ PyUnicode_DecodeUTF7Stateful(const char } else { /* begin base64-encoded section */ inShift = 1; + surrogate = 0; shiftOutStart = writer.pos; base64bits = 0; base64buffer = 0; @@ -4426,6 +4427,7 @@ utf7Error: if (inShift && !consumed) { /* in shift sequence, no more to follow */ /* if we're in an inconsistent state, that's an error */ + inShift = 0; if (surrogate || (base64bits >= 6) || (base64bits > 0 && base64buffer != 0)) { @@ -13347,6 +13349,7 @@ int if (maxchar > writer->maxchar || writer->readonly) { /* resize + widen */ + maxchar = Py_MAX(maxchar, writer->maxchar); newbuffer = PyUnicode_New(newlen, maxchar); if (newbuffer == NULL) return -1;