Fix the byte range reported for malformed UTF-8 sequences (see #8271).

When a multi-byte sequence is malformed, only the lead byte plus the
continuation bytes that actually follow it are reported as invalid, so
the bytes after them are decoded normally instead of being swallowed by
the replacement.  A second byte that is outside the range allowed for
its lead byte (E0: A0-BF, ED: 80-9F, F0: 90-BF, F4: 80-8F) ends the
invalid range right after the lead byte.  Those range checks compare the
bytes as unsigned char: `s` is a `const char *`, and where plain char is
signed a comparison against 0xE0/0xED/0xF0/0xF4 could never be true.

Index: Objects/unicodeobject.c
===================================================================
--- Objects/unicodeobject.c	(revision 79542)
+++ Objects/unicodeobject.c	(working copy)
@@ -1896,6 +1896,7 @@
 {
     const char *starts = s;
     int n;
+    int k;
     Py_ssize_t startinpos;
     Py_ssize_t endinpos;
     Py_ssize_t outpos;
@@ -1961,7 +1962,9 @@
             if ((s[1] & 0xc0) != 0x80) {
                 errmsg = "invalid data";
                 startinpos = s-starts;
-                endinpos = startinpos+2;
+                endinpos = startinpos + 1;
+                for (k=1; (k < 2) && ((s[k]&0xC0) == 0x80); k++)
+                    endinpos++;
                 goto utf8Error;
             }
             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
@@ -1980,7 +1983,16 @@
                 (s[2] & 0xc0) != 0x80) {
                 errmsg = "invalid data";
                 startinpos = s-starts;
-                endinpos = startinpos+3;
+                endinpos = startinpos + 1;
+                /* Compare as unsigned: plain char may be signed, which
+                   would make these lead-byte checks always false. */
+                if (((unsigned char)s[0] == 0xE0 &&
+                     (unsigned char)s[1] < 0xA0) ||
+                    ((unsigned char)s[0] == 0xED &&
+                     (unsigned char)s[1] > 0x9F))
+                    goto utf8Error;
+                for (k=1; (k < 3) && ((s[k]&0xC0) == 0x80); k++)
+                    endinpos++;
                 goto utf8Error;
             }
             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
@@ -2007,7 +2019,16 @@
                 (s[3] & 0xc0) != 0x80) {
                 errmsg = "invalid data";
                 startinpos = s-starts;
-                endinpos = startinpos+4;
+                endinpos = startinpos + 1;
+                /* Compare as unsigned: plain char may be signed, which
+                   would make these lead-byte checks always false. */
+                if (((unsigned char)s[0] == 0xF0 &&
+                     (unsigned char)s[1] < 0x90) ||
+                    ((unsigned char)s[0] == 0xF4 &&
+                     (unsigned char)s[1] > 0x8F))
+                    goto utf8Error;
+                for (k=1; (k < 4) && ((s[k]&0xC0) == 0x80); k++)
+                    endinpos++;
                 goto utf8Error;
             }
             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Index: Lib/test/test_bytes.py
===================================================================
--- Lib/test/test_bytes.py	(revision 79542)
+++ Lib/test/test_bytes.py	(working copy)
@@ -189,6 +189,54 @@
         self.assertEqual(b.decode(errors="ignore", encoding="utf8"),
                          "Hello world\n")
 
+    def test_utf8_decode_valid_sequences(self):
+        sequences = [
+            ('a', u'a'), # 1 byte seq
+            ('\xc2\xb0', u'\u00b0'), # 2 bytes seq
+            ('\xe9\x99\xaa', u'\u966a'), # 3 bytes seq
+            ('\xf1\x80\x90\xa0', u'\U00040420'), # 4 bytes seq
+        ]
+        for seq, res in sequences:
+            self.assertEqual(seq.decode('utf-8'), res)
+
+    def test_utf8_decode_invalid_sequences(self):
+        # see #8271: each case must raise in strict mode and decode to res with 'replace'
+        sequences = [
+            ('\x80', u'\ufffd'), # continuation byte
+            ('\xc0', u'\ufffd'), # overlong encoding
+            ('\xc2', u'\ufffd'), # 2 bytes seq with only 1 byte
+            ('\xc2\x41', u'\ufffdA'), # 2 bytes seq without continuation byte
+            ('\xe0', u'\ufffd'), # 3 bytes seq with only 1 byte
+            ('\xe0\x80', u'\ufffd'), # 3 bytes seq with only 2 bytes
+            ('\xe0\x81\x41', u'\ufffd\ufffdA'), # \x81 cannot follow \xe0 (needs \xa0-\xbf)
+            ('\xf0', u'\ufffd'), # 4 bytes seq with only 1 byte
+            ('\xf0\x82', u'\ufffd'), # 4 bytes seq with only 2 bytes
+            ('\xf0\x83\x84', u'\ufffd'), # 4 bytes seq with only 3 bytes
+            ('\xf0\x83\x84\x41', u'\ufffd\ufffd\ufffdA'), # \x83 cannot follow \xf0 (needs \x90-\xbf)
+            ('\xf5', u'\ufffd'), # invalid 4 bytes seq with only 1 byte
+            #('\xf5\x85', u'\ufffd'), # invalid 4 bytes seq with only 2 bytes
+            #('\xf5\x80\x41', u'\ufffdA'), # invalid 4 bytes seq with only 2 bytes
+            #('\xf5\x86\x87\x88', u'\ufffd'), # invalid 4 bytes seq with 4 bytes
+            ('\xf8', u'\ufffd'), # invalid 5 bytes seq with only 1 byte
+            #('\xf8\x89', u'\ufffd'), # invalid 5 bytes seq with only 2 bytes
+            #('\xf8\x80\x41', u'\ufffdA'), # invalid 4 bytes seq with only 2 bytes
+            #('\xf8\x90\x91\x92\x93', u'\ufffd'), # invalid 5 bytes seq with 5 bytes
+            ('\xfc', u'\ufffd'), # invalid 6 bytes seq with only 1 byte
+            #('\xfc\x94\x95', u'\ufffd'), # invalid 6 bytes seq with only 3 bytes
+            #('\xfc\x96\x97\x98\x99\xa0', u'\ufffd'), # invalid 6 bytes seq with 6 bytes
+            ('\xfe', u'\ufffd'), # invalid seq
+            #('\xfe\xa1\xa2', u'\ufffd\ufffd\ufffd'), # invalid seq with 3 bytes
+            # other sequences
+            ('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
+            ('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
+            ('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
+            ('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
+             u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
+        ]
+        for seq, res in sequences:
+            self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8')
+            self.assertEqual(seq.decode('utf-8', 'replace'), res)
+
     def test_from_int(self):
         b = self.type2test(0)
         self.assertEqual(b, self.type2test())