Reject JSON text that begins with U+FEFF (a decoded byte order mark) with an
explicit "Unexpected UTF-8 BOM" error in both the pure-Python and the C
scanner, and add a regression test for it (see #18958).

diff --git a/Lib/json/scanner.py b/Lib/json/scanner.py
--- a/Lib/json/scanner.py
+++ b/Lib/json/scanner.py
@@ -26,6 +26,8 @@
     memo = context.memo
 
     def _scan_once(string, idx):
+        if idx == 0 and string.startswith('\ufeff'):
+            raise ValueError("Unexpected UTF-8 BOM (decode using utf-8-sig)")
         try:
             nextchar = string[idx]
         except IndexError:
diff --git a/Lib/test/test_json/test_decode.py b/Lib/test/test_json/test_decode.py
--- a/Lib/test/test_json/test_decode.py
+++ b/Lib/test/test_json/test_decode.py
@@ -70,5 +70,19 @@
         msg = 'escape'
         self.assertRaisesRegex(ValueError, msg, self.loads, s)
 
+    def test_string_with_utf8_bom(self):
+        # see #18958
+        bom_json = "[1,2,3]".encode('utf-8-sig').decode('utf-8')
+        with self.assertRaises(ValueError) as cm:
+            self.loads(bom_json)
+        self.assertIn('BOM', str(cm.exception))
+        with self.assertRaises(ValueError) as cm:
+            self.json.load(StringIO(bom_json))
+        self.assertIn('BOM', str(cm.exception))
+        # make sure that the BOM is not detected in the middle of a string
+        bom_in_str = '"{}"'.format(''.encode('utf-8-sig').decode('utf-8'))
+        self.assertEqual(self.loads(bom_in_str), '\ufeff')
+        self.assertEqual(self.json.load(StringIO(bom_in_str)), '\ufeff')
+
 class TestPyDecode(TestDecode, PyTest): pass
 class TestCDecode(TestDecode, CTest): pass
diff --git a/Modules/_json.c b/Modules/_json.c
--- a/Modules/_json.c
+++ b/Modules/_json.c
@@ -957,6 +957,12 @@
         raise_stop_iteration(idx);
         return NULL;
     }
+    if (idx == 0 && length >= 1 && PyUnicode_READ(kind, str, idx) == 0xfeff) {
+        /* UTF-8 BOM detected */
+        raise_errmsg("Unexpected UTF-8 BOM (decode using utf-8-sig)",
+                     pystr, idx);
+        return NULL;
+    }
 
     switch (PyUnicode_READ(kind, str, idx)) {
         case '"':