From b161562f72a28e83e62ec0a0a5de601e7724629f Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sat, 10 Sep 2016 20:16:18 +1000 Subject: [PATCH] Issue #17909: Accept binary input in json.loads json.loads (and hence json.load) now support binary input encoded as UTF-8, UTF-16 or UTF-32. Patch by Serhiy Storchaka. --- Doc/library/json.rst | 5 +-- Doc/whatsnew/3.6.rst | 8 +++++ Lib/json/__init__.py | 50 +++++++++++++++++++++++++----- Lib/test/test_json/test_decode.py | 4 +-- Lib/test/test_json/test_unicode.py | 16 ++++++++-- Misc/NEWS | 3 ++ 6 files changed, 70 insertions(+), 16 deletions(-) diff --git a/Doc/library/json.rst b/Doc/library/json.rst index 73824f838c3..302f8396ff8 100644 --- a/Doc/library/json.rst +++ b/Doc/library/json.rst @@ -268,8 +268,9 @@ Basic Usage .. function:: loads(s, *, encoding=None, cls=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, object_pairs_hook=None, **kw) - Deserialize *s* (a :class:`str` instance containing a JSON document) to a - Python object using this :ref:`conversion table <json-to-py-table>`. + Deserialize *s* (a :class:`str`, :class:`bytes` or :class:`bytearray` + instance containing a JSON document) to a Python object using this + :ref:`conversion table <json-to-py-table>`. The other arguments have the same meaning as in :func:`load`, except *encoding* which is ignored and deprecated. diff --git a/Doc/whatsnew/3.6.rst b/Doc/whatsnew/3.6.rst index 4083f39e841..59ac332c94c 100644 --- a/Doc/whatsnew/3.6.rst +++ b/Doc/whatsnew/3.6.rst @@ -680,6 +680,14 @@ restriction that :class:`importlib.machinery.BuiltinImporter` and :term:`path-like object`. +json +---- + +:func:`json.load` and :func:`json.loads` now support binary input. Encoded +JSON should be represented using either UTF-8, UTF-16, or UTF-32. +(Contributed by Serhiy Storchaka in :issue:`17909`.) 
+ + os -- diff --git a/Lib/json/__init__.py b/Lib/json/__init__.py index f2c0d23a321..8dcc6786e27 100644 --- a/Lib/json/__init__.py +++ b/Lib/json/__init__.py @@ -105,6 +105,7 @@ __author__ = 'Bob Ippolito ' from .decoder import JSONDecoder, JSONDecodeError from .encoder import JSONEncoder +import codecs _default_encoder = JSONEncoder( skipkeys=False, @@ -240,6 +241,35 @@ def dumps(obj, *, skipkeys=False, ensure_ascii=True, check_circular=True, _default_decoder = JSONDecoder(object_hook=None, object_pairs_hook=None) +def detect_encoding(b): + bstartswith = b.startswith + if bstartswith((codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE)): + return 'utf-32' + if bstartswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)): + return 'utf-16' + if bstartswith(codecs.BOM_UTF8): + return 'utf-8-sig' + + if len(b) >= 4: + if not b[0]: + # 00 00 -- -- - utf-32-be + # 00 XX -- -- - utf-16-be + return 'utf-16-be' if b[1] else 'utf-32-be' + if not b[1]: + # XX 00 00 00 - utf-32-le + # XX 00 XX XX - utf-16-le + return 'utf-16-le' if b[2] or b[3] else 'utf-32-le' + elif len(b) == 2: + if not b[0]: + # 00 XX - utf-16-be + return 'utf-16-be' + if not b[1]: + # XX 00 - utf-16-le + return 'utf-16-le' + # default + return 'utf-8' + + def load(fp, *, cls=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, object_pairs_hook=None, **kw): """Deserialize ``fp`` (a ``.read()``-supporting file-like object containing @@ -270,8 +300,8 @@ def load(fp, *, cls=None, object_hook=None, parse_float=None, def loads(s, *, encoding=None, cls=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, object_pairs_hook=None, **kw): - """Deserialize ``s`` (a ``str`` instance containing a JSON - document) to a Python object. + """Deserialize ``s`` (a ``str``, ``bytes`` or ``bytearray`` instance + containing a JSON document) to a Python object. ``object_hook`` is an optional function that will be called with the result of any object literal decode (a ``dict``). 
The return value of @@ -307,12 +337,16 @@ def loads(s, *, encoding=None, cls=None, object_hook=None, parse_float=None, The ``encoding`` argument is ignored and deprecated. """ - if not isinstance(s, str): - raise TypeError('the JSON object must be str, not {!r}'.format( - s.__class__.__name__)) - if s.startswith(u'\ufeff'): - raise JSONDecodeError("Unexpected UTF-8 BOM (decode using utf-8-sig)", - s, 0) + if isinstance(s, str): + if s.startswith('\ufeff'): + raise JSONDecodeError("Unexpected UTF-8 BOM (decode using utf-8-sig)", + s, 0) + else: + if not isinstance(s, (bytes, bytearray)): + raise TypeError('the JSON object must be str, bytes or bytearray, ' + 'not {!r}'.format(s.__class__.__name__)) + s = s.decode(detect_encoding(s), 'surrogatepass') + if (cls is None and object_hook is None and parse_int is None and parse_float is None and parse_constant is None and object_pairs_hook is None and not kw): diff --git a/Lib/test/test_json/test_decode.py b/Lib/test/test_json/test_decode.py index fdafeb6d8fe..7e568be4097 100644 --- a/Lib/test/test_json/test_decode.py +++ b/Lib/test/test_json/test_decode.py @@ -72,10 +72,8 @@ class TestDecode: def test_invalid_input_type(self): msg = 'the JSON object must be str' - for value in [1, 3.14, b'bytes', b'\xff\x00', [], {}, None]: + for value in [1, 3.14, [], {}, None]: self.assertRaisesRegex(TypeError, msg, self.loads, value) - with self.assertRaisesRegex(TypeError, msg): - self.json.load(BytesIO(b'[1,2,3]')) def test_string_with_utf8_bom(self): # see #18958 diff --git a/Lib/test/test_json/test_unicode.py b/Lib/test/test_json/test_unicode.py index c7cc8a7e922..eda177aa68c 100644 --- a/Lib/test/test_json/test_unicode.py +++ b/Lib/test/test_json/test_unicode.py @@ -1,3 +1,4 @@ +import codecs from collections import OrderedDict from test.test_json import PyTest, CTest @@ -52,9 +53,18 @@ class TestUnicode: self.assertRaises(TypeError, self.dumps, [b"hi"]) def test_bytes_decode(self): - self.assertRaises(TypeError, self.loads, 
b'"hi"') - self.assertRaises(TypeError, self.loads, b'["hi"]') - + for encoding, bom in [ + ('utf-8', codecs.BOM_UTF8), + ('utf-16be', codecs.BOM_UTF16_BE), + ('utf-16le', codecs.BOM_UTF16_LE), + ('utf-32be', codecs.BOM_UTF32_BE), + ('utf-32le', codecs.BOM_UTF32_LE), + ]: + data = ["a\xb5\u20ac\U0001d120"] + encoded = self.dumps(data).encode(encoding) + self.assertEqual(self.loads(bom + encoded), data) + self.assertEqual(self.loads(encoded), data) + self.assertRaises(UnicodeDecodeError, self.loads, b'["\x80"]') def test_object_pairs_hook_with_unicode(self): s = '{"xkd":1, "kcw":2, "art":3, "hxm":4, "qrt":5, "pad":6, "hoy":7}' diff --git a/Misc/NEWS b/Misc/NEWS index a7a91046b14..42821a435d0 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -135,6 +135,9 @@ Core and Builtins Library ------- +- Issue #17909: ``json.load`` and ``json.loads`` now support binary input + encoded as UTF-8, UTF-16 or UTF-32. Patch by Serhiy Storchaka. + - Issue #27137: the pure Python fallback implementation of ``functools.partial`` now matches the behaviour of its accelerated C counterpart for subclassing, pickling and text representation purposes. Patch by Emanuel Barry and