diff -r ff735436c247 Lib/test/test_unicode.py --- a/Lib/test/test_unicode.py Sat Apr 16 16:54:15 2011 +0200 +++ b/Lib/test/test_unicode.py Tue Apr 19 15:23:24 2011 +0300 @@ -1026,7 +1026,7 @@ # with start byte of a 2-byte sequence (b'\xc2', FFFD), # only the start byte (b'\xc2\xc2', FFFD*2), # 2 start bytes - (b'\xc2\xc2\xc2', FFFD*3), # 2 start bytes + (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes (b'\xc2\x41', FFFD+'A'), # invalid continuation byte # with start byte of a 3-byte sequence (b'\xe1', FFFD), # only the start byte @@ -1096,6 +1096,233 @@ self.assertEqual(seq.decode('utf-8', 'ignore'), res.replace('\uFFFD', '')) + def to_bytestring(self, seq): + return bytes(int(c, 16) for c in seq.split()) + + def assertCorrectUTF8Decoding(self, seq, res, err): + """ + Check that an invalid UTF-8 sequence raises a UnicodeDecodeError when + 'strict' is used, returns res when 'replace' is used, and returns res + with the U+FFFD characters removed when 'ignore' is used. + """ + #with self.assertRaises(UnicodeDecodeError) as cm: + #seq.decode('utf-8') + #exc = cm.exception + try: + seq.decode('utf-8') + except UnicodeDecodeError as e: + exc = str(e) + else: + self.fail( + "%r.decode('utf-8') didn't raise UnicodeDecodeError." % seq) + + self.assertIn(err, str(exc)) + self.assertEqual(seq.decode('utf-8', 'replace'), res) + self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace'), + 'aaaa' + res + 'bbbb') + res = res.replace('\ufffd', '') + self.assertEqual(seq.decode('utf-8', 'ignore'), res) + self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore'), + 'aaaa' + res + 'bbbb') + + def test_invalid_start_byte(self): + """ + Test that an 'invalid start byte' error is raised when the first byte + is not in the ASCII range or is not a valid start byte of a 2-, 3-, or + 4-bytes sequence. The invalid start byte is replaced with a single + U+FFFD when errors='replace'. + E.g. <80> is a continuation byte and can appear only after a start byte. 
+ """ + FFFD = '\ufffd' + for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF': + self.assertCorrectUTF8Decoding(bytes([byte]), '\ufffd', + 'invalid start byte') + + def test_unexpected_end_of_data(self): + """ + Test that an 'unexpected end of data' error is raised when the string + ends after a start byte of a 2-, 3-, or 4-bytes sequence without having + enough continuation bytes. The incomplete sequence is replaced with a + single U+FFFD when errors='replace'. + E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes + sequence, but it's followed by only 2 valid continuation bytes and the + last continuation byte is missing. + Note: the continuation bytes must all be valid; if one of them is + invalid, another error will be raised. + """ + sequences = [ + 'C2', 'DF', + 'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF', + 'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF', + 'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF', + 'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF', + 'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF', + 'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF' + ] + FFFD = '\ufffd' + for seq in sequences: + self.assertCorrectUTF8Decoding(self.to_bytestring(seq), '\ufffd', + 'unexpected end of data') + + def test_invalid_cb_for_2bytes_seq(self): + """ + Test that an 'invalid continuation byte' error is raised when the + continuation byte of a 2-bytes sequence is invalid. The start byte + is replaced by a single U+FFFD and the second byte is handled + separately when errors='replace'. + E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes + sequence, but 41 is not a valid continuation byte because it's the + ASCII letter 'A'. 
+ """ + FFFD = '\ufffd' + FFFDx2 = FFFD * 2 + sequences = [ + ('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'), + ('C2 C0', FFFDx2), ('C2 FF', FFFDx2), + ('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'), + ('DF C0', FFFDx2), ('DF FF', FFFDx2), + ] + for seq, res in sequences: + self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res, + 'invalid continuation byte') + + def test_invalid_cb_for_3bytes_seq(self): + """ + Test that an 'invalid continuation byte' error is raised when the + continuation byte(s) of a 3-bytes sequence are invalid. When + errors='replace', if the first continuation byte is valid, the first + two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the + third byte is handled separately, otherwise only the start byte is + replaced with a U+FFFD and the other continuation bytes are handled + separately. + E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes + sequence, 80 is a valid continuation byte, but 41 is not a valid cb + because it's the ASCII letter 'A'. + Note: when the start byte is E0 or ED, the valid ranges for the first + continuation byte are limited to A0..BF and 80..9F respectively. + However, when the start byte is ED, Python 2 considers all the bytes + in range 80..BF valid. This is fixed in Python 3. 
+ """ + FFFD = '\ufffd' + FFFDx2 = FFFD * 2 + sequences = [ + ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2), + ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2), + ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'), + ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2), + ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'), + ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'), + ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2), + ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'), + ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2), + ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'), + ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'), + ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2), + ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'), + ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2), + ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'), + ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'), + ('ED 7F', FFFD+'\x7f'), + # ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^ + ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'), + ('ED 80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2), + ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'), + ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2), + ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'), + ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2), + ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'), + ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2), + ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'), + ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'), + ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2), + ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'), + ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2), + ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'), + ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2), + ] + for seq, res in sequences: + self.assertCorrectUTF8Decoding(self.to_bytestring(seq), 
res, + 'invalid continuation byte') + + def test_invalid_cb_for_4bytes_seq(self): + """ + Test that an 'invalid continuation byte' error is raised when the + continuation byte(s) of a 4-bytes sequence are invalid. When + errors='replace', the start byte and all the following valid + continuation bytes are replaced with a single U+FFFD, and all the bytes + starting from the first invalid continuation byte (included) are + handled separately. + E.g. in the sequence <F1 80 41>, F1 is the start byte of a 4-bytes + sequence, 80 is a valid continuation byte, but 41 is not a valid cb + because it's the ASCII letter 'A'. + Note: when the start byte is F0 or F4, the valid ranges for the first + continuation byte are limited to 90..BF and 80..8F respectively; the + 'F0 80', 'F0 8F', 'F4 90' and 'F4 BF' entries below exercise the bytes + just outside those ranges. + """ + FFFD = '\ufffd' + FFFDx2 = FFFD * 2 + sequences = [ + ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2), + ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2), + ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'), + ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2), + ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'), + ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2), + ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'), + ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2), + ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'), + ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2), + ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'), + ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2), + ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'), + ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2), + ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2), + ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'), + ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2), + ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'), + ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2), + ('F1 
BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'), + ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2), + ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'), + ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2), + ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'), + ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2), + ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'), + ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2), + ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'), + ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2), + ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'), + ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2), + ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'), + ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2), + ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'), + ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2), + ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'), + ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2), + ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'), + ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2), + ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'), + ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2), + ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2), + ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2), + ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'), + ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2), + ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'), + ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2), + ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'), + ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2), + ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'), + ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2), + ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'), + ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2), + ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'), + ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', 
FFFDx2) + ] + for seq, res in sequences: + self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res, + 'invalid continuation byte') + def test_codecs_idna(self): # Test whether trailing dot is preserved self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.") diff -r ff735436c247 Objects/unicodeobject.c --- a/Objects/unicodeobject.c Sat Apr 16 16:54:15 2011 +0200 +++ b/Objects/unicodeobject.c Tue Apr 19 15:23:24 2011 +0300 @@ -1554,7 +1554,7 @@ arg = PyUnicode_FromObject(arg); if (!arg) return 0; - output = PyUnicode_AsEncodedObject(arg, + output = PyUnicode_AsEncodedObject(arg, Py_FileSystemDefaultEncoding, "surrogateescape"); Py_DECREF(arg); @@ -1569,7 +1569,7 @@ if (PyBytes_Check(output)) { size = PyBytes_GET_SIZE(output); data = PyBytes_AS_STRING(output); - } + } else { size = PyByteArray_GET_SIZE(output); data = PyByteArray_AS_STRING(output); @@ -2148,7 +2148,7 @@ illegal prefix. See RFC 3629 for details */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -2171,6 +2171,20 @@ return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); } + +/* Macros to check if a continuation byte (CB) is INvalid. 
+ See http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf (table 3-7) */ +#define _PyUnicode_IS_INVALID_UTF8_CB(cb) ((cb & 0xc0) != 0x80) +/* Depending on the first byte, the valid range for the second CB might vary */ +#define _PyUnicode_IS_INVALID_3SEQ_2ND_CB(fst, snd) \ + (_PyUnicode_IS_INVALID_UTF8_CB(snd) || \ + ((unsigned char)fst == 0xE0 && (unsigned char)snd < 0xA0) || \ + ((unsigned char)fst == 0xED && (unsigned char)snd > 0x9F)) +#define _PyUnicode_IS_INVALID_4SEQ_2ND_CB(fst, snd) \ + (_PyUnicode_IS_INVALID_UTF8_CB(snd) || \ + ((unsigned char)fst == 0xF0 && (unsigned char)snd < 0x90) || \ + ((unsigned char)fst == 0xF4 && (unsigned char)snd > 0x8F)) + /* Mask to check or force alignment of a pointer to C 'long' boundaries */ #define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) @@ -2191,7 +2205,7 @@ { const char *starts = s; int n; - int k; + int charsleft; Py_ssize_t startinpos; Py_ssize_t endinpos; Py_ssize_t outpos; @@ -2270,16 +2284,60 @@ n = utf8_code_length[ch]; if (s + n > e) { + /* there are not enough bytes to complete the sequence */ if (consumed) break; - else { + charsleft = e - s - 1; /* either 0, 1, 2 */ + /* note: when we get the 'unexpected end of data' we don't care + about the pos anymore and we just ignore the value */ + if (charsleft == 0) { + /* there's only the start byte and nothing else */ errmsg = "unexpected end of data"; startinpos = s-starts; endinpos = startinpos+1; - for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) - endinpos++; goto utf8Error; } + if (n == 3) { + /* 3-bytes seq with only a continuation byte */ + if (_PyUnicode_IS_INVALID_3SEQ_2ND_CB(s[0], s[1])) { + /* second byte invalid */ + errmsg = "invalid continuation byte"; + startinpos = s-starts; + endinpos = startinpos+1; + goto utf8Error; + } + else { + /* second byte valid, but third byte missing */ + errmsg = "unexpected end of data"; + startinpos = s-starts; + endinpos = startinpos+2; + goto utf8Error; + } + } + else if (n == 4) { + if 
(_PyUnicode_IS_INVALID_4SEQ_2ND_CB(s[0], s[1])) { + /* second byte invalid */ + errmsg = "invalid continuation byte"; + startinpos = s-starts; + endinpos = startinpos+1; + goto utf8Error; + } + else if ((charsleft == 2) && + _PyUnicode_IS_INVALID_UTF8_CB(s[2])) { + /* third byte invalid */ + errmsg = "invalid continuation byte"; + startinpos = s-starts; + endinpos = startinpos+2; + goto utf8Error; + } + else { + /* there's only 1 or 2 valid cbs, the others are missing */ + errmsg = "unexpected end of data"; + startinpos = s-starts; + endinpos = startinpos + charsleft + 1; + goto utf8Error; + } + } } switch (n) { @@ -2297,7 +2355,7 @@ goto utf8Error; case 2: - if ((s[1] & 0xc0) != 0x80) { + if (_PyUnicode_IS_INVALID_UTF8_CB(s[1])) { errmsg = "invalid continuation byte"; startinpos = s-starts; endinpos = startinpos + 1; @@ -2312,24 +2370,19 @@ /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf will result in surrogates in range d800-dfff. Surrogates are not valid UTF-8 so they are rejected. - See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf + See http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ - if ((s[1] & 0xc0) != 0x80 || - (s[2] & 0xc0) != 0x80 || - ((unsigned char)s[0] == 0xE0 && - (unsigned char)s[1] < 0xA0) || - ((unsigned char)s[0] == 0xED && - (unsigned char)s[1] > 0x9F)) { - errmsg = "invalid continuation byte"; + errmsg = "invalid continuation byte"; + /* check if the second byte is not a valid continuation byte */ + if (_PyUnicode_IS_INVALID_3SEQ_2ND_CB(s[0], s[1])) { startinpos = s-starts; endinpos = startinpos + 1; - - /* if s[1] first two bits are 1 and 0, then the invalid - continuation byte is s[2], so increment endinpos by 1, - if not, s[1] is invalid and endinpos doesn't need to - be incremented. 
*/ - if ((s[1] & 0xC0) == 0x80) - endinpos++; + goto utf8Error; + } + /* check if the third byte is not a valid continuation byte */ + else if (_PyUnicode_IS_INVALID_UTF8_CB(s[2])) { + startinpos = s-starts; + endinpos = startinpos + 2; goto utf8Error; } ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); @@ -2338,21 +2391,20 @@ break; case 4: - if ((s[1] & 0xc0) != 0x80 || - (s[2] & 0xc0) != 0x80 || - (s[3] & 0xc0) != 0x80 || - ((unsigned char)s[0] == 0xF0 && - (unsigned char)s[1] < 0x90) || - ((unsigned char)s[0] == 0xF4 && - (unsigned char)s[1] > 0x8F)) { - errmsg = "invalid continuation byte"; + errmsg = "invalid continuation byte"; + if (_PyUnicode_IS_INVALID_4SEQ_2ND_CB(s[0], s[1])) { startinpos = s-starts; endinpos = startinpos + 1; - if ((s[1] & 0xC0) == 0x80) { - endinpos++; - if ((s[2] & 0xC0) == 0x80) - endinpos++; - } + goto utf8Error; + } + else if (_PyUnicode_IS_INVALID_UTF8_CB(s[2])) { + startinpos = s-starts; + endinpos = startinpos + 2; + goto utf8Error; + } + else if (_PyUnicode_IS_INVALID_UTF8_CB(s[3])) { + startinpos = s-starts; + endinpos = startinpos + 3; goto utf8Error; } ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + @@ -2631,7 +2683,7 @@ #endif PyObject *errorHandler = NULL; PyObject *exc = NULL; - + q = (unsigned char *)s; e = q + size;