diff -r 33769ce40b47 Doc/library/json.rst --- a/Doc/library/json.rst Wed Jun 22 05:49:15 2016 -0400 +++ b/Doc/library/json.rst Wed Jun 22 19:57:06 2016 +0300 @@ -265,8 +265,9 @@ Basic Usage .. function:: loads(s, *, encoding=None, cls=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, object_pairs_hook=None, **kw) - Deserialize *s* (a :class:`str` instance containing a JSON document) to a - Python object using this :ref:`conversion table `. + Deserialize *s* (a :class:`str`, :class:`bytes` or :class:`bytearray` + instance containing a JSON document) to a Python object using this + :ref:`conversion table `. The other arguments have the same meaning as in :func:`load`, except *encoding* which is ignored and deprecated. diff -r 33769ce40b47 Doc/whatsnew/3.6.rst --- a/Doc/whatsnew/3.6.rst Wed Jun 22 05:49:15 2016 -0400 +++ b/Doc/whatsnew/3.6.rst Wed Jun 22 19:57:06 2016 +0300 @@ -291,6 +291,13 @@ The idlelib package is being modernized In compensation, the eventual result with be that some idlelib classes will be easier to use, with better APIs and docstrings explaining them. Additional useful information will be added to idlelib when available. +json +---- + +:func:`json.load` and :func:`json.loads` now support binary input. Encoded +JSON should be represented using either UTF-8, UTF-16, or UTF-32. + + os -- diff -r 33769ce40b47 Lib/json/__init__.py --- a/Lib/json/__init__.py Wed Jun 22 05:49:15 2016 -0400 +++ b/Lib/json/__init__.py Wed Jun 22 19:57:06 2016 +0300 @@ -105,6 +105,7 @@ Using json.tool from the shell to valida from .decoder import JSONDecoder, JSONDecodeError from .encoder import JSONEncoder +import codecs _default_encoder = JSONEncoder( skipkeys=False, @@ -240,6 +241,35 @@ def dumps(obj, *, skipkeys=False, ensure _default_decoder = JSONDecoder(object_hook=None, object_pairs_hook=None) +def detect_encoding(b): + bstartswith = b.startswith + if bstartswith((codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE)): + return 'utf-32' + if bstartswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)): + return 'utf-16' + if bstartswith(codecs.BOM_UTF8): + return 'utf-8-sig' + + if len(b) >= 4: + if not b[0]: + # 00 00 -- -- - utf-32-be + # 00 XX -- -- - utf-16-be + return 'utf-16-be' if b[1] else 'utf-32-be' + if not b[1]: + # XX 00 00 00 - utf-32-le + # XX 00 XX XX - utf-16-le + return 'utf-16-le' if b[2] or b[3] else 'utf-32-le' + elif len(b) == 2: + if not b[0]: + # 00 XX - utf-16-be + return 'utf-16-be' + if not b[1]: + # XX 00 - utf-16-le + return 'utf-16-le' + # default + return 'utf-8' + + def load(fp, *, cls=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, object_pairs_hook=None, **kw): """Deserialize ``fp`` (a ``.read()``-supporting file-like object containing @@ -270,8 +300,8 @@ def load(fp, *, cls=None, object_hook=No def loads(s, *, encoding=None, cls=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, object_pairs_hook=None, **kw): - """Deserialize ``s`` (a ``str`` instance containing a JSON - document) to a Python object. + """Deserialize ``s`` (a ``str``, ``bytes`` or ``bytearray`` instance + containing a JSON document) to a Python object. ``object_hook`` is an optional function that will be called with the result of any object literal decode (a ``dict``). The return value of @@ -307,12 +337,16 @@ def loads(s, *, encoding=None, cls=None, The ``encoding`` argument is ignored and deprecated. """ - if not isinstance(s, str): - raise TypeError('the JSON object must be str, not {!r}'.format( - s.__class__.__name__)) - if s.startswith(u'\ufeff'): - raise JSONDecodeError("Unexpected UTF-8 BOM (decode using utf-8-sig)", - s, 0) + if isinstance(s, str): + if s.startswith('\ufeff'): + raise JSONDecodeError("Unexpected UTF-8 BOM (decode using utf-8-sig)", + s, 0) + else: + if not isinstance(s, (bytes, bytearray)): + raise TypeError('the JSON object must be str, bytes or bytearray, ' + 'not {!r}'.format(s.__class__.__name__)) + s = s.decode(detect_encoding(s), 'surrogatepass') + if (cls is None and object_hook is None and parse_int is None and parse_float is None and parse_constant is None and object_pairs_hook is None and not kw): diff -r 33769ce40b47 Lib/test/test_json/test_decode.py --- a/Lib/test/test_json/test_decode.py Wed Jun 22 05:49:15 2016 -0400 +++ b/Lib/test/test_json/test_decode.py Wed Jun 22 19:57:06 2016 +0300 @@ -72,10 +72,8 @@ class TestDecode: def test_invalid_input_type(self): msg = 'the JSON object must be str' - for value in [1, 3.14, b'bytes', b'\xff\x00', [], {}, None]: + for value in [1, 3.14, [], {}, None]: self.assertRaisesRegex(TypeError, msg, self.loads, value) - with self.assertRaisesRegex(TypeError, msg): - self.json.load(BytesIO(b'[1,2,3]')) def test_string_with_utf8_bom(self): # see #18958 diff -r 33769ce40b47 Lib/test/test_json/test_unicode.py --- a/Lib/test/test_json/test_unicode.py Wed Jun 22 05:49:15 2016 -0400 +++ b/Lib/test/test_json/test_unicode.py Wed Jun 22 19:57:06 2016 +0300 @@ -1,3 +1,4 @@ +import codecs from collections import OrderedDict from test.test_json import PyTest, CTest @@ -52,9 +53,18 @@ class TestUnicode: self.assertRaises(TypeError, self.dumps, [b"hi"]) def test_bytes_decode(self): - self.assertRaises(TypeError, self.loads, b'"hi"') - self.assertRaises(TypeError, self.loads, b'["hi"]') - + for encoding, bom in [ + ('utf-8', codecs.BOM_UTF8), + ('utf-16be', codecs.BOM_UTF16_BE), + ('utf-16le', codecs.BOM_UTF16_LE), + ('utf-32be', codecs.BOM_UTF32_BE), + ('utf-32le', codecs.BOM_UTF32_LE), + ]: + data = ["a\xb5\u20ac\U0001d120"] + encoded = self.dumps(data).encode(encoding) + self.assertEqual(self.loads(bom + encoded), data) + self.assertEqual(self.loads(encoded), data) + self.assertRaises(UnicodeDecodeError, self.loads, b'["\x80"]') def test_object_pairs_hook_with_unicode(self): s = '{"xkd":1, "kcw":2, "art":3, "hxm":4, "qrt":5, "pad":6, "hoy":7}'