diff -r 977e60f597de Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py Thu Aug 20 11:13:38 2015 +1200 +++ b/Lib/test/test_codecs.py Fri Aug 21 23:40:33 2015 +0300 @@ -903,6 +903,44 @@ class UTF7Test(ReadTest, unittest.TestCase): encoding = "utf-7" + def test_basic(self): + # Examples from RFC 2152 + tests = [ + ('A\u2262\u0391.', b'A+ImIDkQ.'), + ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'), + ('\u65e5\u672c\u8a9e', b'+ZeVnLIqe-'), + ] + for decoded, encoded in tests: + with self.subTest(str=decoded): + self.assertEqual(decoded.encode(self.encoding), encoded) + self.assertEqual(encoded.decode(self.encoding), decoded) + + def test_ascii(self): + # Set D (directly encoded characters) + set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' + 'abcdefghijklmnopqrstuvwxyz' + '0123456789' + '\'(),-./:?') + self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii')) + self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d) + # Set O (optional direct characters) + set_o = ' !"#$%&*;<=>@[]^_`{|}' + self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii')) + self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o) + # + + self.assertEqual('a+b'.encode(self.encoding), b'a+-b') + self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b') + # White spaces + ws = ' \t\n\r' + self.assertEqual(ws.encode(self.encoding), ws.encode('ascii')) + self.assertEqual(ws.encode('ascii').decode(self.encoding), ws) + # Other ASCII characters + other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) - + set(set_d + set_o + '+' + ws))) + self.assertEqual(other_ascii.encode(self.encoding), + b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU' + b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-') + def test_partial(self): self.check_partial( 'a+-b\x00c\x80d\u0100e\U00010000f', @@ -960,6 +998,8 @@ (b'a+//,+IKw-b', 'a\ufffd\u20acb'), (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'), (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'), + (b'a+IKw-b\xff', 'a\u20acb\ufffd'), + (b'a+IKw\xffb', 'a\u20ac\ufffdb'), ] for raw, expected in tests: with self.subTest(raw=raw): @@ -971,8 +1011,36 @@ self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-') self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-') self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0') - - test_lone_surrogates = None + self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0') + self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-') + self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0') + self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0') + self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding), + b'+IKwgrNgB3KA-') + self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding), + '\u20ac\u20ac\U000104A0') + self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding), + '\u20ac\u20ac\U000104A0') + + def test_lone_surrogates(self): + tests = [ + (b'a+2AE-b', 'a\ud801b'), + (b'a+2AE\xffb', 'a\ufffdb'), + (b'a+2AE', 'a\ufffd'), + (b'a+2AEA-b', 'a\ufffdb'), + (b'a+2AH-b', 'a\ufffdb'), + (b'a+IKzYAQ-b', 'a\u20ac\ud801b'), + (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'), + (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'), + (b'a+IKzYAd-b', 'a\u20ac\ufffdb'), + (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'), + (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'), + (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'), + (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'), + ] + for raw, expected in tests: + with self.subTest(raw=raw): + self.assertEqual(raw.decode('utf-7', 'replace'), expected) class UTF16ExTest(unittest.TestCase): diff -r 977e60f597de Objects/unicodeobject.c --- a/Objects/unicodeobject.c Thu Aug 20 11:13:38 2015 +1200 +++ b/Objects/unicodeobject.c Fri Aug 21 23:40:33 2015 +0300 @@ -4331,31 +4331,31 @@ } else { /* now leaving a base-64 section */ inShift = 0; - s++; - if (surrogate) { - if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0) - goto onError; - surrogate = 0; - } if (base64bits > 0) { /* left-over bits */ if (base64bits >= 6) { /* We've seen at least one base-64 character */ + s++; errmsg = "partial character in shift sequence"; goto utf7Error; } else { /* Some bits remain; they should be zero */ if (base64buffer != 0) { + s++; errmsg = "non-zero padding bits in shift sequence"; goto utf7Error; } } } - if (ch != '-') { + if (surrogate && DECODE_DIRECT(ch)) { + if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0) + goto onError; + } + surrogate = 0; + if (ch == '-') { /* '-' is absorbed; other terminating characters are preserved */ - if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) - goto onError; + s++; } } } @@ -4369,6 +4369,7 @@ } else { /* begin base64-encoded section */ inShift = 1; + surrogate = 0; shiftOutStart = writer.pos; base64bits = 0; base64buffer = 0; @@ -4400,6 +4401,7 @@ if (inShift && !consumed) { /* in shift sequence, no more to follow */ /* if we're in an inconsistent state, that's an error */ + inShift = 0; if (surrogate || (base64bits >= 6) || (base64bits > 0 && base64buffer != 0)) {