# Patch for issue #18958 (number taken from the test comment below).
#
# Makes the json scanners reject a document that begins with a byte order
# mark.  When scanning starts at index 0 and the input begins with the bytes
# EF BB BF (byte strings) or the code point U+FEFF (unicode strings), the
# pure-Python scanner raises
#     ValueError("Unexpected UTF-8 BOM (decode using utf-8-sig)")
# and the C scanners report the same message through raise_errmsg().
# The check only fires at index 0; the new test also asserts that a BOM
# inside a JSON string literal still decodes to u'\ufeff'.
#
# NOTE(review): the line layout below was reconstructed from the hunk
# headers; confirm the whitespace of the context lines against the target
# tree before applying (or apply with whitespace-insensitive matching).
diff --git a/Lib/json/scanner.py b/Lib/json/scanner.py
--- a/Lib/json/scanner.py
+++ b/Lib/json/scanner.py
@@ -26,6 +26,10 @@
     object_pairs_hook = context.object_pairs_hook
 
     def _scan_once(string, idx):
+        if (idx == 0 and
+            ((isinstance(string, str) and string.startswith('\xef\xbb\xbf')) or
+             (isinstance(string, unicode) and string.startswith(u'\ufeff')))):
+            raise ValueError("Unexpected UTF-8 BOM (decode using utf-8-sig)")
         try:
             nextchar = string[idx]
         except IndexError:
diff --git a/Lib/json/tests/test_decode.py b/Lib/json/tests/test_decode.py
--- a/Lib/json/tests/test_decode.py
+++ b/Lib/json/tests/test_decode.py
@@ -60,5 +60,21 @@
         msg = 'escape'
         self.assertRaisesRegexp(ValueError, msg, self.loads, s)
 
+    def test_string_with_utf8_bom(self):
+        # see #18958: a document starting with a BOM must raise ValueError mentioning 'BOM'
+        bom_json = u"[1,2,3]".encode('utf-8-sig')
+        for s in (bom_json, bom_json.decode('utf-8')):
+            with self.assertRaises(ValueError) as cm:
+                self.loads(s)
+            self.assertIn('BOM', str(cm.exception))
+            with self.assertRaises(ValueError) as cm:
+                self.json.load(StringIO(s))
+            self.assertIn('BOM', str(cm.exception))
+        # make sure that the BOM is not detected in the middle of a string
+        bom_in_str = '"{}"'.format(''.encode('utf-8-sig'))
+        for s in (bom_in_str, bom_in_str.decode('utf-8')):
+            self.assertEqual(self.loads(s), u'\ufeff')
+            self.assertEqual(self.json.load(StringIO(s)), u'\ufeff')
+
 class TestPyDecode(TestDecode, PyTest): pass
 class TestCDecode(TestDecode, CTest): pass
diff --git a/Modules/_json.c b/Modules/_json.c
--- a/Modules/_json.c
+++ b/Modules/_json.c
@@ -1495,6 +1495,16 @@
         PyErr_SetNone(PyExc_StopIteration);
         return NULL;
     }
+    if (idx == 0 && length >= 3 &&
+        str[0] == '\xef' &&
+        str[1] == '\xbb' &&
+        str[2] == '\xbf') {
+        /* UTF-8 BOM detected */
+        raise_errmsg("Unexpected UTF-8 BOM (decode using utf-8-sig)",
+                     pystr, idx);
+        return NULL;
+    }
+
     switch (str[idx]) {
         case '"':
             /* string */
@@ -1582,6 +1592,12 @@
         PyErr_SetNone(PyExc_StopIteration);
         return NULL;
     }
+    if (idx == 0 && length >= 1 && str[0] == 0xfeff) {
+        /* U+FEFF (the BOM code point) detected at the start of the input */
+        raise_errmsg("Unexpected UTF-8 BOM (decode using utf-8-sig)",
+                     pystr, idx);
+        return NULL;
+    }
     switch (str[idx]) {
         case '"':
             /* string */