def detect_encoding(b):
    """Guess which Unicode codec a bytes-like JSON document is encoded with.

    A leading byte order mark decides the answer when one is present:
    ``'utf-32'``, ``'utf-16'`` or ``'utf-8-sig'`` is returned, so the codec
    itself consumes the mark.  Otherwise the positions of zero bytes among
    the first bytes select ``'utf-16be'``, ``'utf-16le'``, ``'utf-32be'`` or
    ``'utf-32le'``.  Anything else is reported as ``'utf-8'``.

    The zero-byte heuristic presumes the text starts with a non-NUL ASCII
    character (the detection scheme described in RFC 4627, section 3).

    ``b`` must provide ``startswith()``, ``len()`` and integer indexing,
    as ``bytes`` does.  The return value is a codec name usable with
    ``bytes.decode()``.
    """
    # Order matters: the UTF-32-LE mark (FF FE 00 00) begins with the
    # UTF-16-LE mark (FF FE), so the longer marks are tried first.
    bom_table = (
        ((codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE), 'utf-32'),
        ((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE), 'utf-16'),
        ((codecs.BOM_UTF8,), 'utf-8-sig'),
    )
    for marks, codec_name in bom_table:
        if b.startswith(marks):
            return codec_name

    size = len(b)
    if size >= 4:
        if not b[0]:
            # 00 00 00 xx -> UTF-32 big-endian; any other 00-led pattern
            # is treated as UTF-16 big-endian.
            if b[1] or b[2]:
                return 'utf-16be'
            return 'utf-32be'
        if not b[1]:
            # xx 00 00 00 -> UTF-32 little-endian; xx 00 with a non-zero
            # byte in either of the next two positions -> UTF-16
            # little-endian.
            if b[2] or b[3]:
                return 'utf-16le'
            return 'utf-32le'
    elif size == 2:
        # Exactly one UTF-16 code unit: the zero byte marks the high half.
        if not b[0]:
            return 'utf-16be'
        if not b[1]:
            return 'utf-16le'

    # No mark and no telltale zero bytes.
    return 'utf-8'
def test_bytes_decode(self):
    """Check that ``loads`` accepts bytes in each supported Unicode encoding.

    For every codec the serialized document is loaded twice, once prefixed
    with the matching byte order mark and once without it, and both results
    must equal the original data.  Bytes that are not valid in the detected
    encoding must raise ``UnicodeDecodeError``.
    """
    codec_names = ('utf-8', 'utf-16be', 'utf-16le', 'utf-32be', 'utf-32le')
    marks = (
        codecs.BOM_UTF8,
        codecs.BOM_UTF16_BE,
        codecs.BOM_UTF16_LE,
        codecs.BOM_UTF32_BE,
        codecs.BOM_UTF32_LE,
    )
    for codec_name, mark in zip(codec_names, marks):
        # Mixes ASCII, Latin-1, BMP and non-BMP characters.
        expected = ["a\xb5\u20ac\U0001d120"]
        payload = self.dumps(expected).encode(codec_name)
        decoded_with_mark = self.loads(mark + payload)
        self.assertEqual(decoded_with_mark, expected)
        decoded_plain = self.loads(payload)
        self.assertEqual(decoded_plain, expected)
    # A lone 0x80 byte cannot be decoded as UTF-8.
    self.assertRaises(UnicodeDecodeError, self.loads, b'["\x80"]')