diff --git a/Lib/json/__init__.py b/Lib/json/__init__.py --- a/Lib/json/__init__.py +++ b/Lib/json/__init__.py @@ -310,6 +310,8 @@ The ``encoding`` argument is ignored and deprecated. """ + if s.startswith(u'\ufeff'): + raise ValueError("Unexpected UTF-8 BOM (decode using utf-8-sig)") if (cls is None and object_hook is None and parse_int is None and parse_float is None and parse_constant is None and object_pairs_hook is None and not kw): diff --git a/Lib/test/test_json/test_decode.py b/Lib/test/test_json/test_decode.py --- a/Lib/test/test_json/test_decode.py +++ b/Lib/test/test_json/test_decode.py @@ -70,5 +70,19 @@ msg = 'escape' self.assertRaisesRegex(ValueError, msg, self.loads, s) + def test_string_with_utf8_bom(self): + # see #18958 + bom_json = "[1,2,3]".encode('utf-8-sig').decode('utf-8') + with self.assertRaises(ValueError) as cm: + self.loads(bom_json) + self.assertIn('BOM', str(cm.exception)) + with self.assertRaises(ValueError) as cm: + self.json.load(StringIO(bom_json)) + self.assertIn('BOM', str(cm.exception)) + # make sure that the BOM is not detected in the middle of a string + bom_in_str = '"{}"'.format(''.encode('utf-8-sig').decode('utf-8')) + self.assertEqual(self.loads(bom_in_str), '\ufeff') + self.assertEqual(self.json.load(StringIO(bom_in_str)), '\ufeff') + class TestPyDecode(TestDecode, PyTest): pass class TestCDecode(TestDecode, CTest): pass