Index: Lib/json/scanner.py =================================================================== --- Lib/json/scanner.py (revision 66961) +++ Lib/json/scanner.py (working copy) @@ -1,69 +1,66 @@ -"""Iterator based sre token scanner - +"""JSON token scanner """ import re -import sre_parse -import sre_compile -import sre_constants +try: + from _json import make_scanner as c_make_scanner +except ImportError: + c_make_scanner = None -from re import VERBOSE, MULTILINE, DOTALL -from sre_constants import BRANCH, SUBPATTERN +__all__ = ['make_scanner'] -__all__ = ['Scanner', 'pattern'] +NUMBER_RE = re.compile( + r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?', + (re.VERBOSE | re.MULTILINE | re.DOTALL)) -FLAGS = (VERBOSE | MULTILINE | DOTALL) +def py_make_scanner(context): + parse_object = context.parse_object + parse_array = context.parse_array + parse_string = context.parse_string + match_number = NUMBER_RE.match + encoding = context.encoding + strict = context.strict + parse_float = context.parse_float + parse_int = context.parse_int + parse_constant = context.parse_constant + object_hook = context.object_hook -class Scanner(object): - def __init__(self, lexicon, flags=FLAGS): - self.actions = [None] - # Combine phrases into a compound pattern - s = sre_parse.Pattern() - s.flags = flags - p = [] - for idx, token in enumerate(lexicon): - phrase = token.pattern - try: - subpattern = sre_parse.SubPattern(s, - [(SUBPATTERN, (idx + 1, sre_parse.parse(phrase, flags)))]) - except sre_constants.error: - raise - p.append(subpattern) - self.actions.append(token) + def _scan_once(string, idx): + try: + nextchar = string[idx] + except IndexError: + raise StopIteration - s.groups = len(p) + 1 # NOTE(guido): Added to make SRE validation work - p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) - self.scanner = sre_compile.compile(p) + if nextchar == '"': + return parse_string(string, idx + 1, encoding, strict) + elif nextchar == '{': + return parse_object((string, idx + 1), encoding, strict, _scan_once, object_hook) + elif nextchar == '[': + return parse_array((string, idx + 1), _scan_once) + elif nextchar == 'n' and string[idx:idx + 4] == 'null': + return None, idx + 4 + elif nextchar == 't' and string[idx:idx + 4] == 'true': + return True, idx + 4 + elif nextchar == 'f' and string[idx:idx + 5] == 'false': + return False, idx + 5 - def iterscan(self, string, idx=0, context=None): - """Yield match, end_idx for each match + m = match_number(string, idx) + if m is not None: + integer, frac, exp = m.groups() + if frac or exp: + res = parse_float(integer + (frac or '') + (exp or '')) + else: + res = parse_int(integer) + return res, m.end() + elif nextchar == 'N' and string[idx:idx + 3] == 'NaN': + return parse_constant('NaN'), idx + 3 + elif nextchar == 'I' and string[idx:idx + 8] == 'Infinity': + return parse_constant('Infinity'), idx + 8 + elif nextchar == '-' and string[idx:idx + 9] == '-Infinity': + return parse_constant('-Infinity'), idx + 9 + else: + raise StopIteration - """ - match = self.scanner.scanner(string, idx).match - actions = self.actions - lastend = idx - end = len(string) - while True: - m = match() - if m is None: - break - matchbegin, matchend = m.span() - if lastend == matchend: - break - action = actions[m.lastindex] - if action is not None: - rval, next_pos = action(m, context) - if next_pos is not None and next_pos != matchend: - # "fast forward" the scanner - matchend = next_pos - match = self.scanner.scanner(string, matchend).match - yield rval, matchend - lastend = matchend + return 
_scan_once - -def pattern(pattern, flags=FLAGS): - def decorator(fn): - fn.pattern = pattern - fn.regex = re.compile(pattern, flags) - return fn - return decorator +make_scanner = c_make_scanner or py_make_scanner Index: Lib/json/tests/test_float.py =================================================================== --- Lib/json/tests/test_float.py (revision 66961) +++ Lib/json/tests/test_float.py (working copy) @@ -7,3 +7,8 @@ def test_floats(self): for num in [1617161771.7650001, math.pi, math.pi**100, math.pi**-100]: self.assertEquals(float(json.dumps(num)), num) + + def test_ints(self): + for num in [1, 1L, 1<<32, 1<<64]: + self.assertEquals(json.dumps(num), str(num)) + self.assertEquals(int(json.dumps(num)), num) Index: Lib/json/tests/test_decode.py =================================================================== --- Lib/json/tests/test_decode.py (revision 66961) +++ Lib/json/tests/test_decode.py (working copy) @@ -13,3 +13,10 @@ rval = json.loads('1', parse_int=float) self.assert_(isinstance(rval, float)) self.assertEquals(rval, 1.0) + + def test_decoder_optimizations(self): + # Several optimizations were made that skip over calls to + # the whitespace regex, so this test is designed to try and + # exercise the uncommon cases. The array cases are already covered. + rval = json.loads('{ "key" : "value" , "k":"v" }') + self.assertEquals(rval, {"key":"value", "k":"v"}) Index: Lib/json/tool.py =================================================================== --- Lib/json/tool.py (revision 66961) +++ Lib/json/tool.py (working copy) @@ -8,7 +8,6 @@ } $ echo '{ 1.2:3.4}' | python -mjson.tool Expecting property name: line 1 column 2 (char 2) - """ import sys import json Index: Lib/json/__init__.py =================================================================== --- Lib/json/__init__.py (revision 66961) +++ Lib/json/__init__.py (working copy) @@ -29,8 +29,10 @@ Compact encoding:: >>> import json - >>> json.dumps([1,2,3,{'4': 5, '6': 7}], separators=(',',':')) - '[1,2,3,{"4":5,"6":7}]' + >>> compact = json.dumps([1,2,3,{'4': 5, '6': 7}], separators=(',',':')) + >>> # Can't assume dict ordering + >>> compact in ('[1,2,3,{"4":5,"6":7}]', '[1,2,3,{"6":7,"4":5}]') + True Pretty printing (using repr() because of extraneous whitespace in the output):: @@ -41,14 +43,14 @@ Decoding JSON:: >>> import json - >>> json.loads('["foo", {"bar":["baz", null, 1.0, 2]}]') - [u'foo', {u'bar': [u'baz', None, 1.0, 2]}] - >>> json.loads('"\\"foo\\bar"') - u'"foo\x08ar' + >>> json.loads('["foo", {"bar":["baz", null, 1.0, 2]}]') == ["foo", {"bar":["baz", None, 1.0, 2]}] + True + >>> json.loads('"\\"foo\\bar"') == '"foo\x08ar' + True >>> from StringIO import StringIO >>> io = StringIO('["streaming API"]') - >>> json.load(io) - [u'streaming API'] + >>> json.load(io) == ["streaming API"] + True Specializing JSON object decoding:: @@ -61,9 +63,9 @@ >>> json.loads('{"__complex__": true, "real": 1, "imag": 2}', ... 
object_hook=as_complex) (1+2j) - >>> import decimal - >>> json.loads('1.1', parse_float=decimal.Decimal) - Decimal('1.1') + >>> from decimal import Decimal + >>> json.loads('1.1', parse_float=Decimal) == Decimal("1.1") + True Extending JSONEncoder:: @@ -78,8 +80,8 @@ '[2.0, 1.0]' >>> ComplexEncoder().encode(2 + 1j) '[2.0, 1.0]' - >>> list(ComplexEncoder().iterencode(2 + 1j)) - ['[', '2.0', ', ', '1.0', ']'] + >>> ''.join(ComplexEncoder().iterencode(2 + 1j)) + '[2.0, 1.0]' Using json.tool from the shell to validate and @@ -96,8 +98,7 @@ is a subset of YAML, so it may be used as a serializer for that as well. """ - -__version__ = '1.9' +__version__ = '2.0.3' __all__ = [ 'dump', 'dumps', 'load', 'loads', 'JSONDecoder', 'JSONEncoder', Index: Lib/json/encoder.py =================================================================== --- Lib/json/encoder.py (revision 66961) +++ Lib/json/encoder.py (working copy) @@ -8,6 +8,10 @@ from _json import encode_basestring_ascii as c_encode_basestring_ascii except ImportError: c_encode_basestring_ascii = None +try: + from _json import make_encoder as c_make_encoder +except ImportError: + c_make_encoder = None __all__ = ['JSONEncoder'] @@ -28,27 +32,6 @@ FLOAT_REPR = repr -def floatstr(o, allow_nan=True): - # Check for specials. Note that this type of test is processor- and/or - # platform-specific, so do tests which don't depend on the internals. - - if math.isnan(o): - text = 'NaN' - elif math.isinf(o): - if math.copysign(1., o) == 1.: - text = 'Infinity' - else: - text = '-Infinity' - else: - return FLOAT_REPR(o) - - if not allow_nan: - msg = "Out of range float values are not JSON compliant: " + repr(o) - raise ValueError(msg) - - return text - - def encode_basestring(s): """Return a JSON representation of a Python string @@ -78,12 +61,8 @@ return '"' + str(ESCAPE_ASCII.sub(replace, s)) + '"' -if c_encode_basestring_ascii is not None: - encode_basestring_ascii = c_encode_basestring_ascii -else: - encode_basestring_ascii = py_encode_basestring_ascii +encode_basestring_ascii = c_encode_basestring_ascii or py_encode_basestring_ascii - class JSONEncoder(object): """Extensible JSON encoder for Python data structures. @@ -167,17 +146,133 @@ self.allow_nan = allow_nan self.sort_keys = sort_keys self.indent = indent - self.current_indent_level = 0 if separators is not None: self.item_separator, self.key_separator = separators if default is not None: self.default = default self.encoding = encoding - def _newline_indent(self): - return '\n' + (' ' * (self.indent * self.current_indent_level)) + def default(self, o): + """Implement this method in a subclass such that it returns + a serializable object for ``o``, or calls the base implementation + (to raise a ``TypeError``). - def _iterencode_list(self, lst, markers=None): + For example, to support arbitrary iterators, you could + implement default like this:: + + def default(self, o): + try: + iterable = iter(o) + except TypeError: + pass + else: + return list(iterable) + return JSONEncoder.default(self, o) + + """ + raise TypeError("{0!r} is not JSON serializable".format(o)) + + def encode(self, o): + """Return a JSON string representation of a Python data structure. + + >>> JSONEncoder().encode({"foo": ["bar", "baz"]}) + '{"foo": ["bar", "baz"]}' + + """ + # This is for extremely simple cases and benchmarks. 
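+        # Strings are handled right here instead of going through iterencode():
+        # a str in an encoding other than utf-8 is decoded to unicode first and
+        # then escaped directly by encode_basestring_ascii()/encode_basestring().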
+ if isinstance(o, basestring): + if isinstance(o, str): + _encoding = self.encoding + if (_encoding is not None + and not (_encoding == 'utf-8')): + o = o.decode(_encoding) + if self.ensure_ascii: + return encode_basestring_ascii(o) + else: + return encode_basestring(o) + # This doesn't pass the iterator directly to ''.join() because the + # exceptions aren't as detailed. The list call should be roughly + # equivalent to the PySequence_Fast that ''.join() would do. + chunks = self.iterencode(o, _one_shot=True) + if not isinstance(chunks, (list, tuple)): + chunks = list(chunks) + return ''.join(chunks) + + def iterencode(self, o, _one_shot=False): + """Encode the given object and yield each string + representation as available. + + For example:: + + for chunk in JSONEncoder().iterencode(bigobject): + mysocket.write(chunk) + + """ + if self.check_circular: + markers = {} + else: + markers = None + if self.ensure_ascii: + _encoder = encode_basestring_ascii + else: + _encoder = encode_basestring + if self.encoding != 'utf-8': + def _encoder(o, _orig_encoder=_encoder, _encoding=self.encoding): + if isinstance(o, str): + o = o.decode(_encoding) + return _orig_encoder(o) + + def floatstr(o, allow_nan=True): + # Check for specials. Note that this type of test is processor- + # and/or platform-specific, so do tests which don't depend on the + # internals. + + if math.isnan(o): + text = 'NaN' + elif math.isinf(o): + if math.copysign(1., o) == 1.: + text = 'Infinity' + else: + text = '-Infinity' + else: + return FLOAT_REPR(o) + + if not allow_nan: + msg = "Out of range float values are not JSON compliant: " + repr(o) + raise ValueError(msg) + + return text + + if _one_shot and c_make_encoder is not None and not self.indent and not self.sort_keys: + _iterencode = c_make_encoder( + markers, self.default, _encoder, self.indent, + self.key_separator, self.item_separator, self.sort_keys, + self.skipkeys, self.allow_nan) + else: + _iterencode = _make_iterencode( + markers, self.default, _encoder, self.indent, floatstr, + self.key_separator, self.item_separator, self.sort_keys, + self.skipkeys, _one_shot) + return _iterencode(o, 0) + +def _make_iterencode(markers, _default, _encoder, _indent, _floatstr, _key_separator, _item_separator, _sort_keys, _skipkeys, _one_shot, + ## HACK: hand-optimized bytecode; turn globals into locals + False=False, + True=True, + ValueError=ValueError, + basestring=basestring, + dict=dict, + float=float, + id=id, + int=int, + isinstance=isinstance, + list=list, + long=long, + str=str, + tuple=tuple, + ): + + def _iterencode_list(lst, _current_indent_level): if not lst: yield '[]' return @@ -186,31 +281,51 @@ if markerid in markers: raise ValueError("Circular reference detected") markers[markerid] = lst - yield '[' - if self.indent is not None: - self.current_indent_level += 1 - newline_indent = self._newline_indent() - separator = self.item_separator + newline_indent - yield newline_indent + buf = '[' + if _indent is not None: + _current_indent_level += 1 + newline_indent = '\n' + (' ' * (_indent * _current_indent_level)) + separator = _item_separator + newline_indent + buf += newline_indent else: newline_indent = None - separator = self.item_separator + separator = _item_separator first = True for value in lst: if first: first = False else: - yield separator - for chunk in self._iterencode(value, markers): - yield chunk + buf = separator + if isinstance(value, basestring): + yield buf + _encoder(value) + elif value is None: + yield buf + 'null' + elif value is True: + yield 
buf + 'true' + elif value is False: + yield buf + 'false' + elif isinstance(value, (int, long)): + yield buf + str(value) + elif isinstance(value, float): + yield buf + _floatstr(value) + else: + yield buf + if isinstance(value, (list, tuple)): + chunks = _iterencode_list(value, _current_indent_level) + elif isinstance(value, dict): + chunks = _iterencode_dict(value, _current_indent_level) + else: + chunks = _iterencode(value, _current_indent_level) + for chunk in chunks: + yield chunk if newline_indent is not None: - self.current_indent_level -= 1 - yield self._newline_indent() + _current_indent_level -= 1 + yield '\n' + (' ' * (_indent * _current_indent_level)) yield ']' if markers is not None: del markers[markerid] - def _iterencode_dict(self, dct, markers=None): + def _iterencode_dict(dct, _current_indent_level): if not dct: yield '{}' return @@ -220,40 +335,27 @@ raise ValueError("Circular reference detected") markers[markerid] = dct yield '{' - key_separator = self.key_separator - if self.indent is not None: - self.current_indent_level += 1 - newline_indent = self._newline_indent() - item_separator = self.item_separator + newline_indent + if _indent is not None: + _current_indent_level += 1 + newline_indent = '\n' + (' ' * (_indent * _current_indent_level)) + item_separator = _item_separator + newline_indent yield newline_indent else: newline_indent = None - item_separator = self.item_separator + item_separator = _item_separator first = True - if self.ensure_ascii: - encoder = encode_basestring_ascii + if _sort_keys: + items = dct.items() + items.sort(key=lambda kv: kv[0]) else: - encoder = encode_basestring - allow_nan = self.allow_nan - if self.sort_keys: - keys = dct.keys() - keys.sort() - items = [(k, dct[k]) for k in keys] - else: items = dct.iteritems() - _encoding = self.encoding - _do_decode = (_encoding is not None - and not (_encoding == 'utf-8')) for key, value in items: - if isinstance(key, str): - if _do_decode: - key = key.decode(_encoding) - elif isinstance(key, basestring): + if isinstance(key, basestring): pass # JavaScript is weakly typed for these, so it makes sense to # also allow them. Many encoders seem to do something like this. 
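+            # The remaining branches coerce non-string keys (floats, ints,
+            # longs, booleans, None) to a string form; anything else is either
+            # skipped when skipkeys was given or rejected with a TypeError.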
elif isinstance(key, float): - key = floatstr(key, allow_nan) + key = _floatstr(key) elif isinstance(key, (int, long)): key = str(key) elif key is True: @@ -262,7 +364,7 @@ key = 'false' elif key is None: key = 'null' - elif self.skipkeys: + elif _skipkeys: continue else: raise TypeError("key {0!r} is not a string".format(key)) @@ -270,28 +372,39 @@ first = False else: yield item_separator - yield encoder(key) - yield key_separator - for chunk in self._iterencode(value, markers): - yield chunk + yield _encoder(key) + yield _key_separator + if isinstance(value, basestring): + yield _encoder(value) + elif value is None: + yield 'null' + elif value is True: + yield 'true' + elif value is False: + yield 'false' + elif isinstance(value, (int, long)): + yield str(value) + elif isinstance(value, float): + yield _floatstr(value) + else: + if isinstance(value, (list, tuple)): + chunks = _iterencode_list(value, _current_indent_level) + elif isinstance(value, dict): + chunks = _iterencode_dict(value, _current_indent_level) + else: + chunks = _iterencode(value, _current_indent_level) + for chunk in chunks: + yield chunk if newline_indent is not None: - self.current_indent_level -= 1 - yield self._newline_indent() + _current_indent_level -= 1 + yield '\n' + (' ' * (_indent * _current_indent_level)) yield '}' if markers is not None: del markers[markerid] - def _iterencode(self, o, markers=None): + def _iterencode(o, _current_indent_level): if isinstance(o, basestring): - if self.ensure_ascii: - encoder = encode_basestring_ascii - else: - encoder = encode_basestring - _encoding = self.encoding - if (_encoding is not None and isinstance(o, str) - and not (_encoding == 'utf-8')): - o = o.decode(_encoding) - yield encoder(o) + yield _encoder(o) elif o is None: yield 'null' elif o is True: @@ -301,12 +414,12 @@ elif isinstance(o, (int, long)): yield str(o) elif isinstance(o, float): - yield floatstr(o, self.allow_nan) + yield _floatstr(o) elif isinstance(o, (list, tuple)): - for chunk in self._iterencode_list(o, markers): + for chunk in _iterencode_list(o, _current_indent_level): yield chunk elif isinstance(o, dict): - for chunk in self._iterencode_dict(o, markers): + for chunk in _iterencode_dict(o, _current_indent_level): yield chunk else: if markers is not None: @@ -314,71 +427,10 @@ if markerid in markers: raise ValueError("Circular reference detected") markers[markerid] = o - for chunk in self._iterencode_default(o, markers): + o = _default(o) + for chunk in _iterencode(o, _current_indent_level): yield chunk if markers is not None: del markers[markerid] - def _iterencode_default(self, o, markers=None): - newobj = self.default(o) - return self._iterencode(newobj, markers) - - def default(self, o): - """Implement this method in a subclass such that it returns a serializable - object for ``o``, or calls the base implementation (to raise a - ``TypeError``). - - For example, to support arbitrary iterators, you could implement - default like this:: - - def default(self, o): - try: - iterable = iter(o) - except TypeError: - pass - else: - return list(iterable) - return JSONEncoder.default(self, o) - - """ - raise TypeError(repr(o) + " is not JSON serializable") - - def encode(self, o): - """Return a JSON string representation of a Python data structure. - - >>> JSONEncoder().encode({"foo": ["bar", "baz"]}) - '{"foo": ["bar", "baz"]}' - - """ - # This is for extremely simple cases and benchmarks. 
- if isinstance(o, basestring): - if isinstance(o, str): - _encoding = self.encoding - if (_encoding is not None - and not (_encoding == 'utf-8')): - o = o.decode(_encoding) - if self.ensure_ascii: - return encode_basestring_ascii(o) - else: - return encode_basestring(o) - # This doesn't pass the iterator directly to ''.join() because the - # exceptions aren't as detailed. The list call should be roughly - # equivalent to the PySequence_Fast that ''.join() would do. - chunks = list(self.iterencode(o)) - return ''.join(chunks) - - def iterencode(self, o): - """Encode the given object and yield each string representation as - available. - - For example:: - - for chunk in JSONEncoder().iterencode(bigobject): - mysocket.write(chunk) - - """ - if self.check_circular: - markers = {} - else: - markers = None - return self._iterencode(o, markers) + return _iterencode Index: Lib/json/decoder.py =================================================================== --- Lib/json/decoder.py (revision 66961) +++ Lib/json/decoder.py (working copy) @@ -3,8 +3,9 @@ import re import sys +import struct -from json.scanner import Scanner, pattern +from json.scanner import make_scanner try: from _json import scanstring as c_scanstring except ImportError: @@ -40,36 +41,8 @@ '-Infinity': NegInf, 'Infinity': PosInf, 'NaN': NaN, - 'true': True, - 'false': False, - 'null': None, } - -def JSONConstant(match, context, c=_CONSTANTS): - s = match.group(0) - fn = getattr(context, 'parse_constant', None) - if fn is None: - rval = c[s] - else: - rval = fn(s) - return rval, None -pattern('(-?Infinity|NaN|true|false|null)')(JSONConstant) - - -def JSONNumber(match, context): - match = JSONNumber.regex.match(match.string, *match.span()) - integer, frac, exp = match.groups() - if frac or exp: - fn = getattr(context, 'parse_float', None) or float - res = fn(integer + (frac or '') + (exp or '')) - else: - fn = getattr(context, 'parse_int', None) or int - res = fn(integer) - return res, None -pattern(r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?')(JSONNumber) - - STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS) BACKSLASH = { '"': u'"', '\\': u'\\', '/': u'/', @@ -100,7 +73,7 @@ break elif terminator != '\\': if strict: - msg = "Invalid control character {0!r} at".format(terminator) + msg = "Invalid control character {0!r} at".format(esc) raise ValueError(errmsg(msg, s, end)) else: _append(terminator) @@ -114,8 +87,8 @@ try: m = _b[esc] except KeyError: - msg = "Invalid \\escape: {0!r}".format(esc) - raise ValueError(errmsg(msg, s, end)) + raise ValueError( + errmsg("Invalid \\escape: {0!r}".format(esc), s, end)) end += 1 else: esc = s[end + 1:end + 5] @@ -143,104 +116,121 @@ return u''.join(chunks), end -# Use speedup -if c_scanstring is not None: - scanstring = c_scanstring -else: - scanstring = py_scanstring +# Use speedup if available +scanstring = c_scanstring or py_scanstring -def JSONString(match, context): - encoding = getattr(context, 'encoding', None) - strict = getattr(context, 'strict', True) - return scanstring(match.string, match.end(), encoding, strict) -pattern(r'"')(JSONString) +WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS) +WHITESPACE_STR = ' \t\n\r' - -WHITESPACE = re.compile(r'\s*', FLAGS) - - -def JSONObject(match, context, _w=WHITESPACE.match): +def JSONObject((s, end), encoding, strict, scan_once, object_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR): pairs = {} - s = match.string - end = _w(s, match.end()).end() nextchar = s[end:end + 1] - # Trivial empty object - if nextchar == '}': - return pairs, end 
+ 1 + # Normally we expect nextchar == '"' if nextchar != '"': - raise ValueError(errmsg("Expecting property name", s, end)) + if nextchar in _ws: + end = _w(s, end).end() + nextchar = s[end:end + 1] + # Trivial empty object + if nextchar == '}': + return pairs, end + 1 + elif nextchar != '"': + raise ValueError(errmsg("Expecting property name", s, end)) end += 1 - encoding = getattr(context, 'encoding', None) - strict = getattr(context, 'strict', True) - iterscan = JSONScanner.iterscan while True: key, end = scanstring(s, end, encoding, strict) - end = _w(s, end).end() + + # To skip some function call overhead we optimize the fast paths where + # the JSON key separator is ": " or just ":". if s[end:end + 1] != ':': - raise ValueError(errmsg("Expecting : delimiter", s, end)) - end = _w(s, end + 1).end() + end = _w(s, end).end() + if s[end:end + 1] != ':': + raise ValueError(errmsg("Expecting : delimiter", s, end)) + + end += 1 + try: - value, end = iterscan(s, idx=end, context=context).next() + if s[end] in _ws: + end += 1 + if s[end] in _ws: + end = _w(s, end + 1).end() + except IndexError: + pass + + try: + value, end = scan_once(s, end) except StopIteration: raise ValueError(errmsg("Expecting object", s, end)) pairs[key] = value - end = _w(s, end).end() - nextchar = s[end:end + 1] + + try: + nextchar = s[end] + if nextchar in _ws: + end = _w(s, end + 1).end() + nextchar = s[end] + except IndexError: + nextchar = '' end += 1 + if nextchar == '}': break - if nextchar != ',': + elif nextchar != ',': raise ValueError(errmsg("Expecting , delimiter", s, end - 1)) - end = _w(s, end).end() - nextchar = s[end:end + 1] + + try: + nextchar = s[end] + if nextchar in _ws: + end += 1 + nextchar = s[end] + if nextchar in _ws: + end = _w(s, end + 1).end() + nextchar = s[end] + except IndexError: + nextchar = '' + end += 1 if nextchar != '"': raise ValueError(errmsg("Expecting property name", s, end - 1)) - object_hook = getattr(context, 'object_hook', None) + if object_hook is not None: pairs = object_hook(pairs) return pairs, end -pattern(r'{')(JSONObject) - -def JSONArray(match, context, _w=WHITESPACE.match): +def JSONArray((s, end), scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR): values = [] - s = match.string - end = _w(s, match.end()).end() - # Look-ahead for trivial empty array nextchar = s[end:end + 1] + if nextchar in _ws: + end = _w(s, end + 1).end() + nextchar = s[end:end + 1] + # Look-ahead for trivial empty array if nextchar == ']': return values, end + 1 - iterscan = JSONScanner.iterscan + _append = values.append while True: try: - value, end = iterscan(s, idx=end, context=context).next() + value, end = scan_once(s, end) except StopIteration: raise ValueError(errmsg("Expecting object", s, end)) - values.append(value) - end = _w(s, end).end() + _append(value) nextchar = s[end:end + 1] + if nextchar in _ws: + end = _w(s, end + 1).end() + nextchar = s[end:end + 1] end += 1 if nextchar == ']': break - if nextchar != ',': + elif nextchar != ',': raise ValueError(errmsg("Expecting , delimiter", s, end)) - end = _w(s, end).end() - return values, end -pattern(r'\[')(JSONArray) + try: + if s[end] in _ws: + end += 1 + if s[end] in _ws: + end = _w(s, end + 1).end() + except IndexError: + pass -ANYTHING = [ - JSONObject, - JSONArray, - JSONString, - JSONConstant, - JSONNumber, -] + return values, end -JSONScanner = Scanner(ANYTHING) - - class JSONDecoder(object): """Simple JSON decoder @@ -270,7 +260,6 @@ their corresponding ``float`` values, which is outside the JSON spec. 
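+
+    A rough, illustrative doctest for the narrowed ``parse_constant`` contract
+    (after this patch the hook only ever sees the three non-standard
+    constants)::
+
+        >>> JSONDecoder(parse_constant=repr).decode('[NaN, Infinity, -Infinity]')
+        ["'NaN'", "'Infinity'", "'-Infinity'"]
+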
""" - _scanner = Scanner(ANYTHING) __all__ = ['__init__', 'decode', 'raw_decode'] def __init__(self, encoding=None, object_hook=None, parse_float=None, @@ -282,8 +271,8 @@ Note that currently only encodings that are a superset of ASCII work, strings of other encodings should be passed in as ``unicode``. - ``object_hook``, if specified, will be called with the result of - every JSON object decoded and its return value will be used in + ``object_hook``, if specified, will be called with the result + of every JSON object decoded and its return value will be used in place of the given ``dict``. This can be used to provide custom deserializations (e.g. to support JSON-RPC class hinting). @@ -298,21 +287,24 @@ for JSON integers (e.g. float). ``parse_constant``, if specified, will be called with one of the - following strings: -Infinity, Infinity, NaN, null, true, false. + following strings: -Infinity, Infinity, NaN. This can be used to raise an exception if invalid JSON numbers are encountered. """ self.encoding = encoding self.object_hook = object_hook - self.parse_float = parse_float - self.parse_int = parse_int - self.parse_constant = parse_constant + self.parse_float = parse_float or float + self.parse_int = parse_int or int + self.parse_constant = parse_constant or _CONSTANTS.__getitem__ self.strict = strict + self.parse_object = JSONObject + self.parse_array = JSONArray + self.parse_string = scanstring + self.scan_once = make_scanner(self) def decode(self, s, _w=WHITESPACE.match): - """ - Return the Python representation of ``s`` (a ``str`` or ``unicode`` + """Return the Python representation of ``s`` (a ``str`` or ``unicode`` instance containing a JSON document) """ @@ -322,7 +314,7 @@ raise ValueError(errmsg("Extra data", s, end, len(s))) return obj - def raw_decode(self, s, **kw): + def raw_decode(self, s, idx=0): """Decode a JSON document from ``s`` (a ``str`` or ``unicode`` beginning with a JSON document) and return a 2-tuple of the Python representation and the index in ``s`` where the document ended. @@ -331,9 +323,8 @@ have extraneous data at the end. 
""" - kw.setdefault('context', self) try: - obj, end = self._scanner.iterscan(s, **kw).next() + obj, end = self.scan_once(s, idx) except StopIteration: raise ValueError("No JSON object could be decoded") return obj, end Index: Modules/_json.c =================================================================== --- Modules/_json.c (revision 66961) +++ Modules/_json.c (working copy) @@ -1,7 +1,106 @@ #include "Python.h" +#include "structmember.h" #define DEFAULT_ENCODING "utf-8" +#define PyScanner_Check(op) PyObject_TypeCheck(op, &PyScannerType) +#define PyScanner_CheckExact(op) (Py_TYPE(op) == &PyScannerType) +#define PyEncoder_Check(op) PyObject_TypeCheck(op, &PyEncoderType) +#define PyEncoder_CheckExact(op) (Py_TYPE(op) == &PyEncoderType) + +static PyTypeObject PyScannerType; +static PyTypeObject PyEncoderType; + +typedef struct _PyScannerObject { + PyObject_HEAD + PyObject *encoding; + PyObject *strict; + PyObject *object_hook; + PyObject *parse_float; + PyObject *parse_int; + PyObject *parse_constant; +} PyScannerObject; + +static PyMemberDef scanner_members[] = { + {"encoding", T_OBJECT, offsetof(PyScannerObject, encoding), READONLY, "encoding"}, + {"strict", T_OBJECT, offsetof(PyScannerObject, strict), READONLY, "strict"}, + {"object_hook", T_OBJECT, offsetof(PyScannerObject, object_hook), READONLY, "object_hook"}, + {"parse_float", T_OBJECT, offsetof(PyScannerObject, parse_float), READONLY, "parse_float"}, + {"parse_int", T_OBJECT, offsetof(PyScannerObject, parse_int), READONLY, "parse_int"}, + {"parse_constant", T_OBJECT, offsetof(PyScannerObject, parse_constant), READONLY, "parse_constant"}, + {NULL} +}; + +typedef struct _PyEncoderObject { + PyObject_HEAD + PyObject *markers; + PyObject *defaultfn; + PyObject *encoder; + PyObject *indent; + PyObject *key_separator; + PyObject *item_separator; + PyObject *sort_keys; + PyObject *skipkeys; + int fast_encode; + int allow_nan; +} PyEncoderObject; + +static PyMemberDef encoder_members[] = { + {"markers", T_OBJECT, offsetof(PyEncoderObject, markers), READONLY, "markers"}, + {"default", T_OBJECT, offsetof(PyEncoderObject, defaultfn), READONLY, "default"}, + {"encoder", T_OBJECT, offsetof(PyEncoderObject, encoder), READONLY, "encoder"}, + {"indent", T_OBJECT, offsetof(PyEncoderObject, indent), READONLY, "indent"}, + {"key_separator", T_OBJECT, offsetof(PyEncoderObject, key_separator), READONLY, "key_separator"}, + {"item_separator", T_OBJECT, offsetof(PyEncoderObject, item_separator), READONLY, "item_separator"}, + {"sort_keys", T_OBJECT, offsetof(PyEncoderObject, sort_keys), READONLY, "sort_keys"}, + {"skipkeys", T_OBJECT, offsetof(PyEncoderObject, skipkeys), READONLY, "skipkeys"}, + {NULL} +}; + +static Py_ssize_t +ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars); +static PyObject * +ascii_escape_unicode(PyObject *pystr); +static PyObject * +ascii_escape_str(PyObject *pystr); +static PyObject * +py_encode_basestring_ascii(PyObject* self, PyObject *pystr); +void init_json(void); +static PyObject * +scan_once_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr); +static PyObject * +scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr); +static PyObject * +_build_rval_index_tuple(PyObject *rval, Py_ssize_t idx); +static int +scanner_init(PyObject *self, PyObject *args, PyObject *kwds); +static void +scanner_dealloc(PyObject *self); +static int +encoder_init(PyObject *self, PyObject *args, PyObject *kwds); +static void +encoder_dealloc(PyObject *self); 
+static int +encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ssize_t indent_level); +static int +encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssize_t indent_level); +static int +encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ssize_t indent_level); +static PyObject * +_encoded_const(PyObject *const); +static void +raise_errmsg(char *msg, PyObject *s, Py_ssize_t end); +static PyObject * +encoder_encode_string(PyEncoderObject *s, PyObject *obj); +static int +_convertPyInt_AsSsize_t(PyObject *o, Py_ssize_t *size_ptr); +static PyObject * +_convertPyInt_FromSsize_t(Py_ssize_t *size_ptr); +static PyObject * +encoder_encode_float(PyEncoderObject *s, PyObject *obj); + #define S_CHAR(c) (c >= ' ' && c <= '~' && c != '\\' && c != '"') +#define IS_WHITESPACE(c) (((c) == ' ') || ((c) == '\t') || ((c) == '\n') || ((c) == '\r')) + #define MIN_EXPANSION 6 #ifdef Py_UNICODE_WIDE @@ -10,10 +109,24 @@ #define MAX_EXPANSION MIN_EXPANSION #endif +static int +_convertPyInt_AsSsize_t(PyObject *o, Py_ssize_t *size_ptr) +{ + *size_ptr = PyInt_AsSsize_t(o); + if (*size_ptr == -1 && PyErr_Occurred()) + return 0; + return 1; +} + +static PyObject * +_convertPyInt_FromSsize_t(Py_ssize_t *size_ptr) +{ + return PyInt_FromSsize_t(*size_ptr); +} + static Py_ssize_t ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars) { - Py_UNICODE x; output[chars++] = '\\'; switch (c) { case '\\': output[chars++] = (char)c; break; @@ -30,27 +143,19 @@ Py_UNICODE v = c - 0x10000; c = 0xd800 | ((v >> 10) & 0x3ff); output[chars++] = 'u'; - x = (c & 0xf000) >> 12; - output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); - x = (c & 0x0f00) >> 8; - output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); - x = (c & 0x00f0) >> 4; - output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); - x = (c & 0x000f); - output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); + output[chars++] = "0123456789abcdef"[(c >> 12) & 0xf]; + output[chars++] = "0123456789abcdef"[(c >> 8) & 0xf]; + output[chars++] = "0123456789abcdef"[(c >> 4) & 0xf]; + output[chars++] = "0123456789abcdef"[(c ) & 0xf]; c = 0xdc00 | (v & 0x3ff); output[chars++] = '\\'; } #endif output[chars++] = 'u'; - x = (c & 0xf000) >> 12; - output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); - x = (c & 0x0f00) >> 8; - output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); - x = (c & 0x00f0) >> 4; - output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); - x = (c & 0x000f); - output[chars++] = (x < 10) ?
'0' + x : 'a' + (x - 10); + output[chars++] = "0123456789abcdef"[(c >> 12) & 0xf]; + output[chars++] = "0123456789abcdef"[(c >> 8) & 0xf]; + output[chars++] = "0123456789abcdef"[(c >> 4) & 0xf]; + output[chars++] = "0123456789abcdef"[(c ) & 0xf]; } return chars; } @@ -68,6 +173,7 @@ input_chars = PyUnicode_GET_SIZE(pystr); input_unicode = PyUnicode_AS_UNICODE(pystr); + /* One char input can be up to 6 chars output, estimate 4 of these */ output_size = 2 + (MIN_EXPANSION * 4) + input_chars; rval = PyString_FromStringAndSize(NULL, output_size); @@ -82,7 +188,7 @@ if (S_CHAR(c)) { output[chars++] = (char)c; } - else { + else { chars = ascii_escape_char(c, output, chars); } if (output_size - chars < (1 + MAX_EXPANSION)) { @@ -118,33 +224,56 @@ input_chars = PyString_GET_SIZE(pystr); input_str = PyString_AS_STRING(pystr); - /* One char input can be up to 6 chars output, estimate 4 of these */ - output_size = 2 + (MIN_EXPANSION * 4) + input_chars; + + /* Fast path for a string that's already ASCII */ + for (i = 0; i < input_chars; i++) { + Py_UNICODE c = (Py_UNICODE)(unsigned char)input_str[i]; + if (!S_CHAR(c)) { + /* If we have to escape something, scan the string for unicode */ + Py_ssize_t j; + for (j = i; j < input_chars; j++) { + c = (Py_UNICODE)(unsigned char)input_str[j]; + if (c > 0x7f) { + /* We hit a non-ASCII character, bail to unicode mode */ + PyObject *uni; + uni = PyUnicode_DecodeUTF8(input_str, input_chars, "strict"); + if (uni == NULL) { + return NULL; + } + rval = ascii_escape_unicode(uni); + Py_DECREF(uni); + return rval; + } + } + break; + } + } + + if (i == input_chars) { + /* Input is already ASCII */ + output_size = 2 + input_chars; + } + else { + /* One char input can be up to 6 chars output, estimate 4 of these */ + output_size = 2 + (MIN_EXPANSION * 4) + input_chars; + } rval = PyString_FromStringAndSize(NULL, output_size); if (rval == NULL) { return NULL; } output = PyString_AS_STRING(rval); - chars = 0; - output[chars++] = '"'; - for (i = 0; i < input_chars; i++) { - Py_UNICODE c = (Py_UNICODE)input_str[i]; + output[0] = '"'; + + /* We know that everything up to i is ASCII already */ + chars = i + 1; + memcpy(&output[1], input_str, i); + + for (; i < input_chars; i++) { + Py_UNICODE c = (Py_UNICODE)(unsigned char)input_str[i]; if (S_CHAR(c)) { output[chars++] = (char)c; } - else if (c > 0x7F) { - /* We hit a non-ASCII character, bail to unicode mode */ - PyObject *uni; - Py_DECREF(rval); - uni = PyUnicode_DecodeUTF8(input_str, input_chars, "strict"); - if (uni == NULL) { - return NULL; - } - rval = ascii_escape_unicode(uni); - Py_DECREF(uni); - return rval; - } - else { + else { chars = ascii_escape_char(c, output, chars); } /* An ASCII char can't possibly expand to a surrogate! 
*/ @@ -167,7 +296,7 @@ return rval; } -void +static void raise_errmsg(char *msg, PyObject *s, Py_ssize_t end) { static PyObject *errmsg_fn = NULL; @@ -177,61 +306,68 @@ if (decoder == NULL) return; errmsg_fn = PyObject_GetAttrString(decoder, "errmsg"); + Py_DECREF(decoder); if (errmsg_fn == NULL) return; - Py_DECREF(decoder); } - pymsg = PyObject_CallFunction(errmsg_fn, "(zOn)", msg, s, end); + pymsg = PyObject_CallFunction(errmsg_fn, "(zOO&)", msg, s, _convertPyInt_FromSsize_t, &end); if (pymsg) { PyErr_SetObject(PyExc_ValueError, pymsg); Py_DECREF(pymsg); } -/* - -def linecol(doc, pos): - lineno = doc.count('\n', 0, pos) + 1 - if lineno == 1: - colno = pos - else: - colno = pos - doc.rindex('\n', 0, pos) - return lineno, colno - -def errmsg(msg, doc, pos, end=None): - lineno, colno = linecol(doc, pos) - if end is None: - return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos) - endlineno, endcolno = linecol(doc, end) - return '%s: line %d column %d - line %d column %d (char %d - %d)' % ( - msg, lineno, colno, endlineno, endcolno, pos, end) - -*/ } static PyObject * -join_list_unicode(PyObject *lst) +join_list_string(PyObject *lst) { - static PyObject *ustr = NULL; - static PyObject *joinstr = NULL; - if (ustr == NULL) { - Py_UNICODE c = 0; - ustr = PyUnicode_FromUnicode(&c, 0); + static PyObject *joinfn = NULL; + if (joinfn == NULL) { + PyObject *ustr = PyString_FromStringAndSize(NULL, 0); + if (ustr == NULL) + return NULL; + + joinfn = PyObject_GetAttrString(ustr, "join"); + Py_DECREF(ustr); + if (joinfn == NULL) + return NULL; } - if (joinstr == NULL) { - joinstr = PyString_InternFromString("join"); + return PyObject_CallFunctionObjArgs(joinfn, lst, NULL); +} + +static PyObject * +_build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) { + PyObject *tpl; + PyObject *pyidx; + /* + steal a reference to rval, returns (rval, idx) + */ + if (rval == NULL) { + return NULL; } - if (joinstr == NULL || ustr == NULL) { + pyidx = PyInt_FromSsize_t(idx); + if (pyidx == NULL) { + Py_DECREF(rval); return NULL; } - return PyObject_CallMethodObjArgs(ustr, joinstr, lst, NULL); + tpl = PyTuple_New(2); + if (tpl == NULL) { + Py_DECREF(pyidx); + Py_DECREF(rval); + return NULL; + } + PyTuple_SET_ITEM(tpl, 0, rval); + PyTuple_SET_ITEM(tpl, 1, pyidx); + return tpl; } static PyObject * -scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict) +scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_ssize_t *next_end_ptr) { PyObject *rval; Py_ssize_t len = PyString_GET_SIZE(pystr); Py_ssize_t begin = end - 1; Py_ssize_t next = begin; + int has_unicode = 0; char *buf = PyString_AS_STRING(pystr); PyObject *chunks = PyList_New(0); if (chunks == NULL) { @@ -246,7 +382,7 @@ Py_UNICODE c = 0; PyObject *chunk = NULL; for (next = end; next < len; next++) { - c = buf[next]; + c = (unsigned char)buf[next]; if (c == '"' || c == '\\') { break; } @@ -254,6 +390,9 @@ raise_errmsg("Invalid control character at", pystr, next); goto bail; } + else if (c > 0x7f) { + has_unicode = 1; + } } if (!(c == '"' || c == '\\')) { raise_errmsg("Unterminated string starting at", pystr, begin); @@ -261,15 +400,20 @@ } /* Pick up this chunk if it's not zero length */ if (next != end) { - PyObject *strchunk = PyBuffer_FromMemory(&buf[end], next - end); + PyObject *strchunk = PyString_FromStringAndSize(&buf[end], next - end); if (strchunk == NULL) { goto bail; } - chunk = PyUnicode_FromEncodedObject(strchunk, encoding, NULL); - Py_DECREF(strchunk); - if (chunk == NULL) { - goto bail; + if 
(has_unicode) { + chunk = PyUnicode_FromEncodedObject(strchunk, encoding, NULL); + Py_DECREF(strchunk); + if (chunk == NULL) { + goto bail; + } } + else { + chunk = strchunk; + } if (PyList_Append(chunks, chunk)) { Py_DECREF(chunk); goto bail; @@ -315,18 +459,18 @@ } /* Decode 4 hex digits */ for (; next < end; next++) { - Py_ssize_t shl = (end - next - 1) << 2; Py_UNICODE digit = buf[next]; + c <<= 4; switch (digit) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - c |= (digit - '0') << shl; break; + c |= (digit - '0'); break; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': - c |= (digit - 'a' + 10) << shl; break; + c |= (digit - 'a' + 10); break; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': - c |= (digit - 'A' + 10) << shl; break; + c |= (digit - 'A' + 10); break; default: raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); goto bail; @@ -334,44 +478,64 @@ } #ifdef Py_UNICODE_WIDE /* Surrogate pair */ - if (c >= 0xd800 && c <= 0xdbff) { + if ((c & 0xfc00) == 0xd800) { Py_UNICODE c2 = 0; if (end + 6 >= len) { - raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr, - end - 5); + raise_errmsg("Unpaired high surrogate", pystr, end - 5); + goto bail; } if (buf[next++] != '\\' || buf[next++] != 'u') { - raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr, - end - 5); + raise_errmsg("Unpaired high surrogate", pystr, end - 5); + goto bail; } end += 6; /* Decode 4 hex digits */ for (; next < end; next++) { - Py_ssize_t shl = (end - next - 1) << 2; + c2 <<= 4; Py_UNICODE digit = buf[next]; switch (digit) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - c2 |= (digit - '0') << shl; break; + c2 |= (digit - '0'); break; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': - c2 |= (digit - 'a' + 10) << shl; break; + c2 |= (digit - 'a' + 10); break; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': - c2 |= (digit - 'A' + 10) << shl; break; + c2 |= (digit - 'A' + 10); break; default: raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); goto bail; } } + if ((c2 & 0xfc00) != 0xdc00) { + raise_errmsg("Unpaired high surrogate", pystr, end - 5); + goto bail; + } c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); } + else if ((c & 0xfc00) == 0xdc00) { + raise_errmsg("Unpaired low surrogate", pystr, end - 5); + goto bail; + } #endif } - chunk = PyUnicode_FromUnicode(&c, 1); - if (chunk == NULL) { - goto bail; + if (c > 0x7f) { + has_unicode = 1; } + if (has_unicode) { + chunk = PyUnicode_FromUnicode(&c, 1); + if (chunk == NULL) { + goto bail; + } + } + else { + char c_char = Py_CHARMASK(c); + chunk = PyString_FromStringAndSize(&c_char, 1); + if (chunk == NULL) { + goto bail; + } + } if (PyList_Append(chunks, chunk)) { Py_DECREF(chunk); goto bail; @@ -379,20 +543,22 @@ Py_DECREF(chunk); } - rval = join_list_unicode(chunks); + rval = join_list_string(chunks); if (rval == NULL) { goto bail; } Py_CLEAR(chunks); - return Py_BuildValue("(Nn)", rval, end); + *next_end_ptr = end; + return rval; bail: + *next_end_ptr = -1; Py_XDECREF(chunks); return NULL; } static PyObject * -scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict) +scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next_end_ptr) { PyObject *rval; Py_ssize_t len = PyUnicode_GET_SIZE(pystr); @@ -476,18 +642,18 @@ } /* Decode 4 hex digits */ for (; next < end; next++) { - Py_ssize_t shl = (end - next - 1) << 2; Py_UNICODE digit = buf[next]; + c <<= 
4; switch (digit) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - c |= (digit - '0') << shl; break; + c |= (digit - '0'); break; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': - c |= (digit - 'a' + 10) << shl; break; + c |= (digit - 'a' + 10); break; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': - c |= (digit - 'A' + 10) << shl; break; + c |= (digit - 'A' + 10); break; default: raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); goto bail; @@ -495,38 +661,46 @@ } #ifdef Py_UNICODE_WIDE /* Surrogate pair */ - if (c >= 0xd800 && c <= 0xdbff) { + if ((c & 0xfc00) == 0xd800) { Py_UNICODE c2 = 0; if (end + 6 >= len) { - raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr, - end - 5); + raise_errmsg("Unpaired high surrogate", pystr, end - 5); + goto bail; } if (buf[next++] != '\\' || buf[next++] != 'u') { - raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr, - end - 5); + raise_errmsg("Unpaired high surrogate", pystr, end - 5); + goto bail; } end += 6; /* Decode 4 hex digits */ for (; next < end; next++) { - Py_ssize_t shl = (end - next - 1) << 2; + c2 <<= 4; Py_UNICODE digit = buf[next]; switch (digit) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - c2 |= (digit - '0') << shl; break; + c2 |= (digit - '0'); break; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': - c2 |= (digit - 'a' + 10) << shl; break; + c2 |= (digit - 'a' + 10); break; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': - c2 |= (digit - 'A' + 10) << shl; break; + c2 |= (digit - 'A' + 10); break; default: raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); goto bail; } } + if ((c2 & 0xfc00) != 0xdc00) { + raise_errmsg("Unpaired high surrogate", pystr, end - 5); + goto bail; + } c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); } + else if ((c & 0xfc00) == 0xdc00) { + raise_errmsg("Unpaired low surrogate", pystr, end - 5); + goto bail; + } #endif } chunk = PyUnicode_FromUnicode(&c, 1); @@ -540,49 +714,60 @@ Py_DECREF(chunk); } - rval = join_list_unicode(chunks); + rval = join_list_string(chunks); if (rval == NULL) { goto bail; } - Py_CLEAR(chunks); - return Py_BuildValue("(Nn)", rval, end); + Py_DECREF(chunks); + *next_end_ptr = end; + return rval; bail: + *next_end_ptr = -1; Py_XDECREF(chunks); return NULL; } PyDoc_STRVAR(pydoc_scanstring, -"scanstring(basestring, end, encoding) -> (str, end)\n"); + "scanstring(basestring, end, encoding) -> (str, end)\n" + "\n" + "..." 
+); static PyObject * py_scanstring(PyObject* self, PyObject *args) { PyObject *pystr; + PyObject *rval; Py_ssize_t end; + Py_ssize_t next_end = -1; char *encoding = NULL; int strict = 0; - if (!PyArg_ParseTuple(args, "On|zi:scanstring", &pystr, &end, &encoding, &strict)) { + if (!PyArg_ParseTuple(args, "OO&|zi:scanstring", &pystr, _convertPyInt_AsSsize_t, &end, &encoding, &strict)) { return NULL; } if (encoding == NULL) { encoding = DEFAULT_ENCODING; } if (PyString_Check(pystr)) { - return scanstring_str(pystr, end, encoding, strict); + rval = scanstring_str(pystr, end, encoding, strict, &next_end); } else if (PyUnicode_Check(pystr)) { - return scanstring_unicode(pystr, end, strict); + rval = scanstring_unicode(pystr, end, strict, &next_end); } else { - PyErr_Format(PyExc_TypeError, - "first argument must be a string or unicode, not %.80s", + PyErr_Format(PyExc_TypeError, + "first argument must be a string, not %.80s", Py_TYPE(pystr)->tp_name); return NULL; } + return _build_rval_index_tuple(rval, next_end); } PyDoc_STRVAR(pydoc_encode_basestring_ascii, -"encode_basestring_ascii(basestring) -> str\n"); + "encode_basestring_ascii(basestring) -> str\n" + "\n" + "..." +); static PyObject * py_encode_basestring_ascii(PyObject* self, PyObject *pystr) @@ -595,18 +780,1297 @@ return ascii_escape_unicode(pystr); } else { - PyErr_Format(PyExc_TypeError, - "first argument must be a string or unicode, not %.80s", + PyErr_Format(PyExc_TypeError, + "first argument must be a string, not %.80s", Py_TYPE(pystr)->tp_name); return NULL; } } +static void +scanner_dealloc(PyObject *self) +{ + PyScannerObject *s; + assert(PyScanner_Check(self)); + s = (PyScannerObject *)self; + Py_CLEAR(s->encoding); + Py_CLEAR(s->strict); + Py_CLEAR(s->object_hook); + Py_CLEAR(s->parse_float); + Py_CLEAR(s->parse_int); + Py_CLEAR(s->parse_constant); + self->ob_type->tp_free(self); +} + +static PyObject * +_parse_object_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) { + char *str = PyString_AS_STRING(pystr); + Py_ssize_t end_idx = PyString_GET_SIZE(pystr) - 1; + PyObject *rval = PyDict_New(); + PyObject *key = NULL; + PyObject *val = NULL; + char *encoding = PyString_AS_STRING(s->encoding); + int strict = PyObject_IsTrue(s->strict); + Py_ssize_t next_idx; + if (rval == NULL) + return NULL; + + /* skip whitespace after { */ + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + + /* only loop if the object is non-empty */ + if (idx <= end_idx && str[idx] != '}') { + while (idx <= end_idx) { + /* read key */ + if (str[idx] != '"') { + raise_errmsg("Expecting property name", pystr, idx); + goto bail; + } + key = scanstring_str(pystr, idx + 1, encoding, strict, &next_idx); + if (key == NULL) + goto bail; + idx = next_idx; + + /* skip whitespace between key and : delimiter, read :, skip whitespace */ + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + if (idx > end_idx || str[idx] != ':') { + raise_errmsg("Expecting : delimiter", pystr, idx); + goto bail; + } + idx++; + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + + /* read any JSON data type */ + val = scan_once_str(s, pystr, idx, &next_idx); + if (val == NULL) + goto bail; + + if (PyDict_SetItem(rval, key, val) == -1) + goto bail; + + Py_CLEAR(key); + Py_CLEAR(val); + idx = next_idx; + + /* skip whitespace before } or , */ + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + + /* bail if the object is closed or we didn't get the , delimiter */ + if (idx > end_idx) break; + if (str[idx] == '}') { + break; + 
} + else if (str[idx] != ',') { + raise_errmsg("Expecting , delimiter", pystr, idx); + goto bail; + } + idx++; + + /* skip whitespace after , delimiter */ + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + } + } + /* verify that idx < end_idx, str[idx] should be '}' */ + if (idx > end_idx || str[idx] != '}') { + raise_errmsg("Expecting object", pystr, end_idx); + goto bail; + } + /* if object_hook is not None: rval = object_hook(rval) */ + if (s->object_hook != Py_None) { + val = PyObject_CallFunctionObjArgs(s->object_hook, rval, NULL); + if (val == NULL) + goto bail; + Py_DECREF(rval); + rval = val; + val = NULL; + } + *next_idx_ptr = idx + 1; + return rval; +bail: + Py_XDECREF(key); + Py_XDECREF(val); + Py_DECREF(rval); + return NULL; +} + +static PyObject * +_parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) { + Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); + Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1; + PyObject *val = NULL; + PyObject *rval = PyDict_New(); + PyObject *key = NULL; + int strict = PyObject_IsTrue(s->strict); + Py_ssize_t next_idx; + if (rval == NULL) + return NULL; + + /* skip whitespace after { */ + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + + /* only loop if the object is non-empty */ + if (idx <= end_idx && str[idx] != '}') { + while (idx <= end_idx) { + /* read key */ + if (str[idx] != '"') { + raise_errmsg("Expecting property name", pystr, idx); + goto bail; + } + key = scanstring_unicode(pystr, idx + 1, strict, &next_idx); + if (key == NULL) + goto bail; + idx = next_idx; + + /* skip whitespace between key and : delimiter, read :, skip whitespace */ + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + if (idx > end_idx || str[idx] != ':') { + raise_errmsg("Expecting : delimiter", pystr, idx); + goto bail; + } + idx++; + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + + /* read any JSON term */ + val = scan_once_unicode(s, pystr, idx, &next_idx); + if (val == NULL) + goto bail; + + if (PyDict_SetItem(rval, key, val) == -1) + goto bail; + + Py_CLEAR(key); + Py_CLEAR(val); + idx = next_idx; + + /* skip whitespace before } or , */ + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + + /* bail if the object is closed or we didn't get the , delimiter */ + if (idx > end_idx) break; + if (str[idx] == '}') { + break; + } + else if (str[idx] != ',') { + raise_errmsg("Expecting , delimiter", pystr, idx); + goto bail; + } + idx++; + + /* skip whitespace after , delimiter */ + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + } + } + + /* verify that idx < end_idx, str[idx] should be '}' */ + if (idx > end_idx || str[idx] != '}') { + raise_errmsg("Expecting object", pystr, end_idx); + goto bail; + } + + /* if object_hook is not None: rval = object_hook(rval) */ + if (s->object_hook != Py_None) { + val = PyObject_CallFunctionObjArgs(s->object_hook, rval, NULL); + if (val == NULL) + goto bail; + Py_DECREF(rval); + rval = val; + val = NULL; + } + *next_idx_ptr = idx + 1; + return rval; +bail: + Py_XDECREF(key); + Py_XDECREF(val); + Py_DECREF(rval); + return NULL; +} + +static PyObject * +_parse_array_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) { + char *str = PyString_AS_STRING(pystr); + Py_ssize_t end_idx = PyString_GET_SIZE(pystr) - 1; + PyObject *val = NULL; + PyObject *rval = PyList_New(0); + Py_ssize_t next_idx; + if (rval == NULL) + return NULL; + + /* skip whitespace after [ */ + while (idx <= end_idx && 
IS_WHITESPACE(str[idx])) idx++; + + /* only loop if the array is non-empty */ + if (idx <= end_idx && str[idx] != ']') { + while (idx <= end_idx) { + + /* read any JSON term and de-tuplefy the (rval, idx) */ + val = scan_once_str(s, pystr, idx, &next_idx); + if (val == NULL) + goto bail; + + if (PyList_Append(rval, val) == -1) + goto bail; + + Py_CLEAR(val); + idx = next_idx; + + /* skip whitespace between term and , */ + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + + /* bail if the array is closed or we didn't get the , delimiter */ + if (idx > end_idx) break; + if (str[idx] == ']') { + break; + } + else if (str[idx] != ',') { + raise_errmsg("Expecting , delimiter", pystr, idx); + goto bail; + } + idx++; + + /* skip whitespace after , */ + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + } + } + + /* verify that idx < end_idx, str[idx] should be ']' */ + if (idx > end_idx || str[idx] != ']') { + raise_errmsg("Expecting object", pystr, end_idx); + goto bail; + } + *next_idx_ptr = idx + 1; + return rval; +bail: + Py_XDECREF(val); + Py_DECREF(rval); + return NULL; +} + +static PyObject * +_parse_array_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) { + Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); + Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1; + PyObject *val = NULL; + PyObject *rval = PyList_New(0); + Py_ssize_t next_idx; + if (rval == NULL) + return NULL; + + /* skip whitespace after [ */ + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + + /* only loop if the array is non-empty */ + if (idx <= end_idx && str[idx] != ']') { + while (idx <= end_idx) { + + /* read any JSON term */ + val = scan_once_unicode(s, pystr, idx, &next_idx); + if (val == NULL) + goto bail; + + if (PyList_Append(rval, val) == -1) + goto bail; + + Py_CLEAR(val); + idx = next_idx; + + /* skip whitespace between term and , */ + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + + /* bail if the array is closed or we didn't get the , delimiter */ + if (idx > end_idx) break; + if (str[idx] == ']') { + break; + } + else if (str[idx] != ',') { + raise_errmsg("Expecting , delimiter", pystr, idx); + goto bail; + } + idx++; + + /* skip whitespace after , */ + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + } + } + + /* verify that idx < end_idx, str[idx] should be ']' */ + if (idx > end_idx || str[idx] != ']') { + raise_errmsg("Expecting object", pystr, end_idx); + goto bail; + } + *next_idx_ptr = idx + 1; + return rval; +bail: + Py_XDECREF(val); + Py_DECREF(rval); + return NULL; +} + +static PyObject * +_parse_constant(PyScannerObject *s, char *constant, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) { + PyObject *cstr; + PyObject *rval; + /* constant is "NaN", "Infinity", or "-Infinity" */ + cstr = PyString_InternFromString(constant); + if (cstr == NULL) + return NULL; + + /* rval = parse_constant(constant) */ + rval = PyObject_CallFunctionObjArgs(s->parse_constant, cstr, NULL); + idx += PyString_GET_SIZE(cstr); + Py_DECREF(cstr); + *next_idx_ptr = idx; + return rval; +} + +static PyObject * +_match_number_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_ssize_t *next_idx_ptr) { + char *str = PyString_AS_STRING(pystr); + Py_ssize_t end_idx = PyString_GET_SIZE(pystr) - 1; + Py_ssize_t idx = start; + int is_float = 0; + PyObject *rval; + PyObject *numstr; + + /* read a sign if it's there, make sure it's not the end of the string */ + if (str[idx] == '-') { + idx++; + if (idx > end_idx) { + 
PyErr_SetNone(PyExc_StopIteration); + return NULL; + } + } + + /* read as many integer digits as we find as long as it doesn't start with 0 */ + if (str[idx] >= '1' && str[idx] <= '9') { + idx++; + while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++; + } + /* if it starts with 0 we only expect one integer digit */ + else if (str[idx] == '0') { + idx++; + } + /* no integer digits, error */ + else { + PyErr_SetNone(PyExc_StopIteration); + return NULL; + } + + /* if the next char is '.' followed by a digit then read all float digits */ + if (idx < end_idx && str[idx] == '.' && str[idx + 1] >= '0' && str[idx + 1] <= '9') { + is_float = 1; + idx += 2; + while (idx < end_idx && str[idx] >= '0' && str[idx] <= '9') idx++; + } + + /* if the next char is 'e' or 'E' then maybe read the exponent (or backtrack) */ + if (idx < end_idx && (str[idx] == 'e' || str[idx] == 'E')) { + + /* save the index of the 'e' or 'E' just in case we need to backtrack */ + Py_ssize_t e_start = idx; + idx++; + + /* read an exponent sign if present */ + if (idx < end_idx && (str[idx] == '-' || str[idx] == '+')) idx++; + + /* read all digits */ + while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++; + + /* if we got a digit, then parse as float. if not, backtrack */ + if (str[idx - 1] >= '0' && str[idx - 1] <= '9') { + is_float = 1; + } + else { + idx = e_start; + } + } + + /* copy the section we determined to be a number */ + numstr = PyString_FromStringAndSize(&str[start], idx - start); + if (numstr == NULL) + return NULL; + if (is_float) { + /* parse as a float using a fast path if available, otherwise call user defined method */ + if (s->parse_float != (PyObject *)&PyFloat_Type) { + rval = PyObject_CallFunctionObjArgs(s->parse_float, numstr, NULL); + } + else { + rval = PyFloat_FromDouble(PyOS_ascii_atof(PyString_AS_STRING(numstr))); + } + } + else { + /* parse as an int using a fast path if available, otherwise call user defined method */ + if (s->parse_int != (PyObject *)&PyInt_Type) { + rval = PyObject_CallFunctionObjArgs(s->parse_int, numstr, NULL); + } + else { + rval = PyInt_FromString(PyString_AS_STRING(numstr), NULL, 10); + } + } + Py_DECREF(numstr); + *next_idx_ptr = idx; + return rval; +} + +static PyObject * +_match_number_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_ssize_t *next_idx_ptr) { + Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); + Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1; + Py_ssize_t idx = start; + int is_float = 0; + PyObject *rval; + PyObject *numstr; + + /* read a sign if it's there, make sure it's not the end of the string */ + if (str[idx] == '-') { + idx++; + if (idx > end_idx) { + PyErr_SetNone(PyExc_StopIteration); + return NULL; + } + } + + /* read as many integer digits as we find as long as it doesn't start with 0 */ + if (str[idx] >= '1' && str[idx] <= '9') { + idx++; + while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++; + } + /* if it starts with 0 we only expect one integer digit */ + else if (str[idx] == '0') { + idx++; + } + /* no integer digits, error */ + else { + PyErr_SetNone(PyExc_StopIteration); + return NULL; + } + + /* if the next char is '.' followed by a digit then read all float digits */ + if (idx < end_idx && str[idx] == '.' 
&& str[idx + 1] >= '0' && str[idx + 1] <= '9') { + is_float = 1; + idx += 2; + while (idx < end_idx && str[idx] >= '0' && str[idx] <= '9') idx++; + } + + /* if the next char is 'e' or 'E' then maybe read the exponent (or backtrack) */ + if (idx < end_idx && (str[idx] == 'e' || str[idx] == 'E')) { + Py_ssize_t e_start = idx; + idx++; + + /* read an exponent sign if present */ + if (idx < end_idx && (str[idx] == '-' || str[idx] == '+')) idx++; + + /* read all digits */ + while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++; + + /* if we got a digit, then parse as float. if not, backtrack */ + if (str[idx - 1] >= '0' && str[idx - 1] <= '9') { + is_float = 1; + } + else { + idx = e_start; + } + } + + /* copy the section we determined to be a number */ + numstr = PyUnicode_FromUnicode(&str[start], idx - start); + if (numstr == NULL) + return NULL; + if (is_float) { + /* parse as a float using a fast path if available, otherwise call user defined method */ + if (s->parse_float != (PyObject *)&PyFloat_Type) { + rval = PyObject_CallFunctionObjArgs(s->parse_float, numstr, NULL); + } + else { + rval = PyFloat_FromString(numstr, NULL); + } + } + else { + /* no fast path for unicode -> int, just call */ + rval = PyObject_CallFunctionObjArgs(s->parse_int, numstr, NULL); + } + Py_DECREF(numstr); + *next_idx_ptr = idx; + return rval; +} + +static PyObject * +scan_once_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) +{ + char *str = PyString_AS_STRING(pystr); + Py_ssize_t length = PyString_GET_SIZE(pystr); + if (idx >= length) { + PyErr_SetNone(PyExc_StopIteration); + return NULL; + } + switch (str[idx]) { + case '"': + /* string */ + return scanstring_str(pystr, idx + 1, + PyString_AS_STRING(s->encoding), + PyObject_IsTrue(s->strict), + next_idx_ptr); + case '{': + /* object */ + return _parse_object_str(s, pystr, idx + 1, next_idx_ptr); + case '[': + /* array */ + return _parse_array_str(s, pystr, idx + 1, next_idx_ptr); + case 'n': + /* null */ + if ((idx + 3 < length) && str[idx + 1] == 'u' && str[idx + 2] == 'l' && str[idx + 3] == 'l') { + Py_INCREF(Py_None); + *next_idx_ptr = idx + 4; + return Py_None; + } + break; + case 't': + /* true */ + if ((idx + 3 < length) && str[idx + 1] == 'r' && str[idx + 2] == 'u' && str[idx + 3] == 'e') { + Py_INCREF(Py_True); + *next_idx_ptr = idx + 4; + return Py_True; + } + break; + case 'f': + /* false */ + if ((idx + 4 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'l' && str[idx + 3] == 's' && str[idx + 4] == 'e') { + Py_INCREF(Py_False); + *next_idx_ptr = idx + 5; + return Py_False; + } + break; + case 'N': + /* NaN */ + if ((idx + 2 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'N') { + return _parse_constant(s, "NaN", idx, next_idx_ptr); + } + break; + case 'I': + /* Infinity */ + if ((idx + 7 < length) && str[idx + 1] == 'n' && str[idx + 2] == 'f' && str[idx + 3] == 'i' && str[idx + 4] == 'n' && str[idx + 5] == 'i' && str[idx + 6] == 't' && str[idx + 7] == 'y') { + return _parse_constant(s, "Infinity", idx, next_idx_ptr); + } + break; + case '-': + /* -Infinity */ + if ((idx + 8 < length) && str[idx + 1] == 'I' && str[idx + 2] == 'n' && str[idx + 3] == 'f' && str[idx + 4] == 'i' && str[idx + 5] == 'n' && str[idx + 6] == 'i' && str[idx + 7] == 't' && str[idx + 8] == 'y') { + return _parse_constant(s, "-Infinity", idx, next_idx_ptr); + } + break; + } + /* Didn't find a string, object, array, or named constant. Look for a number. 
*/ + return _match_number_str(s, pystr, idx, next_idx_ptr); +} + +static PyObject * +scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) +{ + Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); + Py_ssize_t length = PyUnicode_GET_SIZE(pystr); + if (idx >= length) { + PyErr_SetNone(PyExc_StopIteration); + return NULL; + } + switch (str[idx]) { + case '"': + /* string */ + return scanstring_unicode(pystr, idx + 1, + PyObject_IsTrue(s->strict), + next_idx_ptr); + case '{': + /* object */ + return _parse_object_unicode(s, pystr, idx + 1, next_idx_ptr); + case '[': + /* array */ + return _parse_array_unicode(s, pystr, idx + 1, next_idx_ptr); + case 'n': + /* null */ + if ((idx + 3 < length) && str[idx + 1] == 'u' && str[idx + 2] == 'l' && str[idx + 3] == 'l') { + Py_INCREF(Py_None); + *next_idx_ptr = idx + 4; + return Py_None; + } + break; + case 't': + /* true */ + if ((idx + 3 < length) && str[idx + 1] == 'r' && str[idx + 2] == 'u' && str[idx + 3] == 'e') { + Py_INCREF(Py_True); + *next_idx_ptr = idx + 4; + return Py_True; + } + break; + case 'f': + /* false */ + if ((idx + 4 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'l' && str[idx + 3] == 's' && str[idx + 4] == 'e') { + Py_INCREF(Py_False); + *next_idx_ptr = idx + 5; + return Py_False; + } + break; + case 'N': + /* NaN */ + if ((idx + 2 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'N') { + return _parse_constant(s, "NaN", idx, next_idx_ptr); + } + break; + case 'I': + /* Infinity */ + if ((idx + 7 < length) && str[idx + 1] == 'n' && str[idx + 2] == 'f' && str[idx + 3] == 'i' && str[idx + 4] == 'n' && str[idx + 5] == 'i' && str[idx + 6] == 't' && str[idx + 7] == 'y') { + return _parse_constant(s, "Infinity", idx, next_idx_ptr); + } + break; + case '-': + /* -Infinity */ + if ((idx + 8 < length) && str[idx + 1] == 'I' && str[idx + 2] == 'n' && str[idx + 3] == 'f' && str[idx + 4] == 'i' && str[idx + 5] == 'n' && str[idx + 6] == 'i' && str[idx + 7] == 't' && str[idx + 8] == 'y') { + return _parse_constant(s, "-Infinity", idx, next_idx_ptr); + } + break; + } + /* Didn't find a string, object, array, or named constant. Look for a number. 
*/ + return _match_number_unicode(s, pystr, idx, next_idx_ptr); +} + +static PyObject * +scanner_call(PyObject *self, PyObject *args, PyObject *kwds) +{ + PyObject *pystr; + PyObject *rval; + Py_ssize_t idx; + Py_ssize_t next_idx = -1; + static char *kwlist[] = {"string", "idx", NULL}; + PyScannerObject *s; + assert(PyScanner_Check(self)); + s = (PyScannerObject *)self; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO&:scan_once", kwlist, &pystr, _convertPyInt_AsSsize_t, &idx)) + return NULL; + + if (PyString_Check(pystr)) { + rval = scan_once_str(s, pystr, idx, &next_idx); + } + else if (PyUnicode_Check(pystr)) { + rval = scan_once_unicode(s, pystr, idx, &next_idx); + } + else { + PyErr_Format(PyExc_TypeError, + "first argument must be a string, not %.80s", + Py_TYPE(pystr)->tp_name); + return NULL; + } + return _build_rval_index_tuple(rval, next_idx); +} + +static int +scanner_init(PyObject *self, PyObject *args, PyObject *kwds) +{ + PyObject *ctx; + static char *kwlist[] = {"context", NULL}; + PyScannerObject *s; + + assert(PyScanner_Check(self)); + s = (PyScannerObject *)self; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:make_scanner", kwlist, &ctx)) + return -1; + + s->encoding = NULL; + s->strict = NULL; + s->object_hook = NULL; + s->parse_float = NULL; + s->parse_int = NULL; + s->parse_constant = NULL; + + /* PyString_AS_STRING is used on encoding */ + s->encoding = PyObject_GetAttrString(ctx, "encoding"); + if (s->encoding == Py_None) { + Py_DECREF(Py_None); + s->encoding = PyString_InternFromString(DEFAULT_ENCODING); + } + else if (PyUnicode_Check(s->encoding)) { + PyObject *tmp = PyUnicode_AsEncodedString(s->encoding, NULL, NULL); + Py_DECREF(s->encoding); + s->encoding = tmp; + } + if (s->encoding == NULL || !PyString_Check(s->encoding)) + goto bail; + + /* All of these will fail "gracefully" so we don't need to verify them */ + s->strict = PyObject_GetAttrString(ctx, "strict"); + if (s->strict == NULL) + goto bail; + s->object_hook = PyObject_GetAttrString(ctx, "object_hook"); + if (s->object_hook == NULL) + goto bail; + s->parse_float = PyObject_GetAttrString(ctx, "parse_float"); + if (s->parse_float == NULL) + goto bail; + s->parse_int = PyObject_GetAttrString(ctx, "parse_int"); + if (s->parse_int == NULL) + goto bail; + s->parse_constant = PyObject_GetAttrString(ctx, "parse_constant"); + if (s->parse_constant == NULL) + goto bail; + + return 0; + +bail: + Py_CLEAR(s->encoding); + Py_CLEAR(s->strict); + Py_CLEAR(s->object_hook); + Py_CLEAR(s->parse_float); + Py_CLEAR(s->parse_int); + Py_CLEAR(s->parse_constant); + return -1; +} + +PyDoc_STRVAR(scanner_doc, "JSON scanner object"); + +static +PyTypeObject PyScannerType = { + PyObject_HEAD_INIT(0) + 0, /* tp_internal */ + "make_scanner", /* tp_name */ + sizeof(PyScannerObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + scanner_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + scanner_call, /* tp_call */ + 0, /* tp_str */ + 0,/* PyObject_GenericGetAttr, */ /* tp_getattro */ + 0,/* PyObject_GenericSetAttr, */ /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + scanner_doc, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + scanner_members, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base 
*/ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + scanner_init, /* tp_init */ + 0,/* PyType_GenericAlloc, */ /* tp_alloc */ + 0,/* PyType_GenericNew, */ /* tp_new */ + 0,/* _PyObject_Del, */ /* tp_free */ +}; + +static int +encoder_init(PyObject *self, PyObject *args, PyObject *kwds) +{ + static char *kwlist[] = {"markers", "default", "encoder", "indent", "key_separator", "item_separator", "sort_keys", "skipkeys", "allow_nan", NULL}; + + PyEncoderObject *s; + PyObject *allow_nan; + + assert(PyEncoder_Check(self)); + s = (PyEncoderObject *)self; + + s->markers = NULL; + s->defaultfn = NULL; + s->encoder = NULL; + s->indent = NULL; + s->key_separator = NULL; + s->item_separator = NULL; + s->sort_keys = NULL; + s->skipkeys = NULL; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "OOOOOOOOO:make_encoder", kwlist, + &s->markers, &s->defaultfn, &s->encoder, &s->indent, &s->key_separator, &s->item_separator, &s->sort_keys, &s->skipkeys, &allow_nan)) + return -1; + + Py_INCREF(s->markers); + Py_INCREF(s->defaultfn); + Py_INCREF(s->encoder); + Py_INCREF(s->indent); + Py_INCREF(s->key_separator); + Py_INCREF(s->item_separator); + Py_INCREF(s->sort_keys); + Py_INCREF(s->skipkeys); + s->fast_encode = (PyCFunction_Check(s->encoder) && PyCFunction_GetFunction(s->encoder) == (PyCFunction)py_encode_basestring_ascii); + s->allow_nan = PyObject_IsTrue(allow_nan); + return 0; +} + +static PyObject * +encoder_call(PyObject *self, PyObject *args, PyObject *kwds) +{ + static char *kwlist[] = {"obj", "_current_indent_level", NULL}; + PyObject *obj; + PyObject *rval; + Py_ssize_t indent_level; + PyEncoderObject *s; + assert(PyEncoder_Check(self)); + s = (PyEncoderObject *)self; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO&:_iterencode", kwlist, + &obj, _convertPyInt_AsSsize_t, &indent_level)) + return NULL; + rval = PyList_New(0); + if (rval == NULL) + return NULL; + if (encoder_listencode_obj(s, rval, obj, indent_level)) { + Py_DECREF(rval); + return NULL; + } + return rval; +} + +static PyObject * +_encoded_const(PyObject *obj) +{ + if (obj == Py_None) { + static PyObject *s_null = NULL; + if (s_null == NULL) { + s_null = PyString_InternFromString("null"); + } + Py_INCREF(s_null); + return s_null; + } + else if (obj == Py_True) { + static PyObject *s_true = NULL; + if (s_true == NULL) { + s_true = PyString_InternFromString("true"); + } + Py_INCREF(s_true); + return s_true; + } + else if (obj == Py_False) { + static PyObject *s_false = NULL; + if (s_false == NULL) { + s_false = PyString_InternFromString("false"); + } + Py_INCREF(s_false); + return s_false; + } + else { + PyErr_SetString(PyExc_ValueError, "not a const"); + return NULL; + } +} + +static PyObject * +encoder_encode_float(PyEncoderObject *s, PyObject *obj) +{ + double i = PyFloat_AS_DOUBLE(obj); + if (!Py_IS_FINITE(i)) { + if (!s->allow_nan) { + PyErr_SetString(PyExc_ValueError, "Out of range float values are not JSON compliant"); + return NULL; + } + if (i > 0) { + return PyString_FromString("Infinity"); + } + else if (i < 0) { + return PyString_FromString("-Infinity"); + } + else { + return PyString_FromString("NaN"); + } + } + /* Use a better float format here? 
*/ + return PyObject_Repr(obj); +} + +static PyObject * +encoder_encode_string(PyEncoderObject *s, PyObject *obj) +{ + if (s->fast_encode) + return py_encode_basestring_ascii(NULL, obj); + else + return PyObject_CallFunctionObjArgs(s->encoder, obj, NULL); +} + +static int +_steal_list_append(PyObject *lst, PyObject *stolen) +{ + int rval = PyList_Append(lst, stolen); + Py_DECREF(stolen); + return rval; +} + +static int +encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssize_t indent_level) +{ + PyObject *newobj; + int rv; + + if (obj == Py_None || obj == Py_True || obj == Py_False) { + PyObject *cstr = _encoded_const(obj); + if (cstr == NULL) + return -1; + return _steal_list_append(rval, cstr); + } + else if (PyString_Check(obj) || PyUnicode_Check(obj)) + { + PyObject *encoded = encoder_encode_string(s, obj); + if (encoded == NULL) + return -1; + return _steal_list_append(rval, encoded); + } + else if (PyInt_Check(obj) || PyLong_Check(obj)) { + PyObject *encoded = PyObject_Str(obj); + if (encoded == NULL) + return -1; + return _steal_list_append(rval, encoded); + } + else if (PyFloat_Check(obj)) { + PyObject *encoded = encoder_encode_float(s, obj); + if (encoded == NULL) + return -1; + return _steal_list_append(rval, encoded); + } + else if (PyList_Check(obj) || PyTuple_Check(obj)) { + return encoder_listencode_list(s, rval, obj, indent_level); + } + else if (PyDict_Check(obj)) { + return encoder_listencode_dict(s, rval, obj, indent_level); + } + else { + PyObject *ident = NULL; + if (s->markers != Py_None) { + int has_key; + ident = PyLong_FromVoidPtr(obj); + if (ident == NULL) + return -1; + has_key = PyDict_Contains(s->markers, ident); + if (has_key) { + if (has_key != -1) + PyErr_SetString(PyExc_ValueError, "Circular reference detected"); + Py_DECREF(ident); + return -1; + } + if (PyDict_SetItem(s->markers, ident, obj)) { + Py_DECREF(ident); + return -1; + } + } + newobj = PyObject_CallFunctionObjArgs(s->defaultfn, obj, NULL); + if (newobj == NULL) { + Py_DECREF(ident); + return -1; + } + rv = encoder_listencode_obj(s, rval, newobj, indent_level); + Py_DECREF(newobj); + if (rv) { + Py_DECREF(ident); + return -1; + } + if (ident != NULL) { + if (PyDict_DelItem(s->markers, ident)) { + Py_DECREF(ident); + return -1; + } + Py_DECREF(ident); + } + return rv; + } +} + +static int +encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ssize_t indent_level) +{ + static PyObject *open_dict = NULL; + static PyObject *close_dict = NULL; + static PyObject *empty_dict = NULL; + PyObject *kstr = NULL; + PyObject *ident = NULL; + PyObject *key, *value; + Py_ssize_t pos; + int skipkeys; + Py_ssize_t idx; + + if (open_dict == NULL || close_dict == NULL || empty_dict == NULL) { + open_dict = PyString_InternFromString("{"); + close_dict = PyString_InternFromString("}"); + empty_dict = PyString_InternFromString("{}"); + if (open_dict == NULL || close_dict == NULL || empty_dict == NULL) + return -1; + } + if (PyDict_Size(dct) == 0) + return PyList_Append(rval, empty_dict); + + if (s->markers != Py_None) { + int has_key; + ident = PyLong_FromVoidPtr(dct); + if (ident == NULL) + goto bail; + has_key = PyDict_Contains(s->markers, ident); + if (has_key) { + if (has_key != -1) + PyErr_SetString(PyExc_ValueError, "Circular reference detected"); + goto bail; + } + if (PyDict_SetItem(s->markers, ident, dct)) { + goto bail; + } + } + + if (PyList_Append(rval, open_dict)) + goto bail; + + if (s->indent != Py_None) { + /* TODO: DOES NOT RUN */ + indent_level += 1; 
+ /* + newline_indent = '\n' + (' ' * (_indent * _current_indent_level)) + separator = _item_separator + newline_indent + buf += newline_indent + */ + } + + /* TODO: C speedup not implemented for sort_keys */ + + pos = 0; + skipkeys = PyObject_IsTrue(s->skipkeys); + idx = 0; + while (PyDict_Next(dct, &pos, &key, &value)) { + PyObject *encoded; + + if (PyString_Check(key) || PyUnicode_Check(key)) { + Py_INCREF(key); + kstr = key; + } + else if (PyFloat_Check(key)) { + kstr = encoder_encode_float(s, key); + if (kstr == NULL) + goto bail; + } + else if (PyInt_Check(key) || PyLong_Check(key)) { + kstr = PyObject_Str(key); + if (kstr == NULL) + goto bail; + } + else if (key == Py_True || key == Py_False || key == Py_None) { + kstr = _encoded_const(key); + if (kstr == NULL) + goto bail; + } + else if (skipkeys) { + continue; + } + else { + /* TODO: include repr of key */ + PyErr_SetString(PyExc_ValueError, "keys must be a string"); + goto bail; + } + + if (idx) { + if (PyList_Append(rval, s->item_separator)) + goto bail; + } + + encoded = encoder_encode_string(s, kstr); + Py_CLEAR(kstr); + if (encoded == NULL) + goto bail; + if (PyList_Append(rval, encoded)) { + Py_DECREF(encoded); + goto bail; + } + Py_DECREF(encoded); + if (PyList_Append(rval, s->key_separator)) + goto bail; + if (encoder_listencode_obj(s, rval, value, indent_level)) + goto bail; + idx += 1; + } + if (ident != NULL) { + if (PyDict_DelItem(s->markers, ident)) + goto bail; + Py_CLEAR(ident); + } + if (s->indent != Py_None) { + /* TODO: DOES NOT RUN */ + indent_level -= 1; + /* + yield '\n' + (' ' * (_indent * _current_indent_level)) + */ + } + if (PyList_Append(rval, close_dict)) + goto bail; + return 0; + +bail: + Py_XDECREF(kstr); + Py_XDECREF(ident); + return -1; +} + + +static int +encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ssize_t indent_level) +{ + static PyObject *open_array = NULL; + static PyObject *close_array = NULL; + static PyObject *empty_array = NULL; + PyObject *ident = NULL; + PyObject *s_fast = NULL; + Py_ssize_t num_items; + PyObject **seq_items; + Py_ssize_t i; + + if (open_array == NULL || close_array == NULL || empty_array == NULL) { + open_array = PyString_InternFromString("["); + close_array = PyString_InternFromString("]"); + empty_array = PyString_InternFromString("[]"); + if (open_array == NULL || close_array == NULL || empty_array == NULL) + return -1; + } + ident = NULL; + s_fast = PySequence_Fast(seq, "_iterencode_list needs a sequence"); + if (s_fast == NULL) + return -1; + num_items = PySequence_Fast_GET_SIZE(s_fast); + if (num_items == 0) { + Py_DECREF(s_fast); + return PyList_Append(rval, empty_array); + } + + if (s->markers != Py_None) { + int has_key; + ident = PyLong_FromVoidPtr(seq); + if (ident == NULL) + goto bail; + has_key = PyDict_Contains(s->markers, ident); + if (has_key) { + if (has_key != -1) + PyErr_SetString(PyExc_ValueError, "Circular reference detected"); + goto bail; + } + if (PyDict_SetItem(s->markers, ident, seq)) { + goto bail; + } + } + + seq_items = PySequence_Fast_ITEMS(s_fast); + if (PyList_Append(rval, open_array)) + goto bail; + if (s->indent != Py_None) { + /* TODO: DOES NOT RUN */ + indent_level += 1; + /* + newline_indent = '\n' + (' ' * (_indent * _current_indent_level)) + separator = _item_separator + newline_indent + buf += newline_indent + */ + } + for (i = 0; i < num_items; i++) { + PyObject *obj = seq_items[i]; + if (i) { + if (PyList_Append(rval, s->item_separator)) + goto bail; + } + if (encoder_listencode_obj(s, rval, obj, 
indent_level)) + goto bail; + } + if (ident != NULL) { + if (PyDict_DelItem(s->markers, ident)) + goto bail; + Py_CLEAR(ident); + } + if (s->indent != Py_None) { + /* TODO: DOES NOT RUN */ + indent_level -= 1; + /* + yield '\n' + (' ' * (_indent * _current_indent_level)) + */ + } + if (PyList_Append(rval, close_array)) + goto bail; + Py_DECREF(s_fast); + return 0; + +bail: + Py_XDECREF(ident); + Py_DECREF(s_fast); + return -1; +} + +static void +encoder_dealloc(PyObject *self) +{ + PyEncoderObject *s; + assert(PyEncoder_Check(self)); + s = (PyEncoderObject *)self; + Py_CLEAR(s->markers); + Py_CLEAR(s->defaultfn); + Py_CLEAR(s->encoder); + Py_CLEAR(s->indent); + Py_CLEAR(s->key_separator); + Py_CLEAR(s->item_separator); + Py_CLEAR(s->sort_keys); + Py_CLEAR(s->skipkeys); + self->ob_type->tp_free(self); +} + +PyDoc_STRVAR(encoder_doc, "_iterencode(obj, _current_indent_level) -> iterable"); + +static +PyTypeObject PyEncoderType = { + PyObject_HEAD_INIT(0) + 0, /* tp_internal */ + "make_encoder", /* tp_name */ + sizeof(PyEncoderObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + encoder_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + encoder_call, /* tp_call */ + 0, /* tp_str */ + 0,/* PyObject_GenericGetAttr, */ /* tp_getattro */ + 0,/* PyObject_GenericSetAttr, */ /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + encoder_doc, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + encoder_members, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + encoder_init, /* tp_init */ + 0,/* PyType_GenericAlloc, */ /* tp_alloc */ + 0,/* PyType_GenericNew, */ /* tp_new */ + 0,/* _PyObject_Del, */ /* tp_free */ +}; + static PyMethodDef json_methods[] = { - {"encode_basestring_ascii", (PyCFunction)py_encode_basestring_ascii, - METH_O, pydoc_encode_basestring_ascii}, - {"scanstring", (PyCFunction)py_scanstring, METH_VARARGS, - pydoc_scanstring}, + {"encode_basestring_ascii", + (PyCFunction)py_encode_basestring_ascii, + METH_O, + pydoc_encode_basestring_ascii}, + {"scanstring", + (PyCFunction)py_scanstring, + METH_VARARGS, + pydoc_scanstring}, {NULL, NULL, 0, NULL} }; @@ -617,5 +2081,23 @@ init_json(void) { PyObject *m; + PyScannerType.tp_getattro = PyObject_GenericGetAttr; + PyScannerType.tp_setattro = PyObject_GenericSetAttr; + PyScannerType.tp_alloc = PyType_GenericAlloc; + PyScannerType.tp_new = PyType_GenericNew; + PyScannerType.tp_free = _PyObject_Del; + if (PyType_Ready(&PyScannerType) < 0) + return; + PyEncoderType.tp_getattro = PyObject_GenericGetAttr; + PyEncoderType.tp_setattro = PyObject_GenericSetAttr; + PyEncoderType.tp_alloc = PyType_GenericAlloc; + PyEncoderType.tp_new = PyType_GenericNew; + PyEncoderType.tp_free = _PyObject_Del; + if (PyType_Ready(&PyEncoderType) < 0) + return; m = Py_InitModule3("_json", json_methods, module_doc); + Py_INCREF((PyObject*)&PyScannerType); + PyModule_AddObject(m, "make_scanner", (PyObject*)&PyScannerType); + Py_INCREF((PyObject*)&PyEncoderType); + PyModule_AddObject(m, "make_encoder", (PyObject*)&PyEncoderType); }
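
Notes on the scanner and encoder above, with doctest-style sketches of the behaviour they implement. The snippets use only the public json API on Python 2 and show the output this code should produce; treat them as illustrations rather than verified test cases for this exact revision.

Both _parse_object_str and _parse_object_unicode build the dict first and, when object_hook is not None, replace it with whatever the hook returns, matching the contract of the Python scanner:

    >>> import json
    >>> json.loads('{"b": 2, "a": 1}', object_hook=lambda d: sorted(d.items()))
    [(u'a', 1), (u'b', 2)]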
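
Whitespace around every structural token ('{', ':', ',', '}', '[', ']') is consumed inline with the IS_WHITESPACE macro rather than through a regex call, so arbitrarily spaced input decodes the same way:

    >>> json.loads(' [ 1 , 2 , { "k" : "v" } ] ')
    [1, 2, {u'k': u'v'}]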
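
_parse_constant hands the literal spelling to the parse_constant callable, and scan_once_str/scan_once_unicode only recognise the exact tokens NaN, Infinity and -Infinity. With the default hook these come back as IEEE specials; a custom hook sees the spelling:

    >>> json.loads('Infinity')
    inf
    >>> json.loads('[NaN, Infinity, -Infinity]',
    ...     parse_constant={'NaN': 0, 'Infinity': 1, '-Infinity': -1}.get)
    [0, 1, -1]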
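
_match_number_str and _match_number_unicode only take the PyFloat/PyInt fast paths when parse_float and parse_int are still the builtin float and int types; any user-supplied callable is invoked with the matched substring instead. Using the documented loads keywords:

    >>> from decimal import Decimal
    >>> json.loads('[1.5, 10]', parse_float=Decimal, parse_int=long)
    [Decimal('1.5'), 10L]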
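
scanner_call returns the (value, end index) pair built by _build_rval_index_tuple; at the Python level the same pair is what JSONDecoder.raw_decode hands back, which is the easiest way to see where a scan stopped:

    >>> json.JSONDecoder().raw_decode('{"a": 1} tail')
    ({u'a': 1}, 8)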
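
encoder_encode_float spells non-finite values as Infinity, -Infinity and NaN when allow_nan is true and otherwise falls back to repr(); with allow_nan=False both this path and the pure-Python encoder raise ValueError instead:

    >>> json.dumps([float('inf'), float('nan')])
    '[Infinity, NaN]'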
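
encoder_listencode_obj handles constants, strings, ints, floats, lists/tuples and dicts directly and only then calls the supplied default function, re-encoding whatever that returns:

    >>> json.dumps(1 + 2j, default=lambda o: [o.real, o.imag])
    '[1.0, 2.0]'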
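
When markers is a dict (check_circular=True, the default), container identities are recorded before recursing and removed afterwards, so self-referencing structures are reported rather than recursed into forever:

    >>> lst = []
    >>> lst.append(lst)
    >>> json.dumps(lst)
    Traceback (most recent call last):
      ...
    ValueError: Circular reference detected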
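
encoder_listencode_dict coerces float, int/long and True/False/None keys to their JSON spellings, skips everything else when skipkeys is true, and, per the TODO comments, leaves sort_keys and indent handling to the pure-Python encoder:

    >>> json.dumps({3.5: 'x'})
    '{"3.5": "x"}'
    >>> json.dumps({'a': 1, (1, 2): 'ignored'}, skipkeys=True)
    '{"a": 1}'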
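
The json_methods table keeps exporting scanstring and encode_basestring_ascii, so the string scanner is also callable on its own as json.decoder.scanstring(string, end[, encoding[, strict]]), returning the decoded text plus the index one past the closing quote. The C and Python implementations can differ on str versus unicode for plain ASCII input, so this sketch uses an escape to force unicode:

    >>> import json.decoder
    >>> json.decoder.scanstring('"hi\\u00e9" tail', 1)
    (u'hi\xe9', 10)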
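
init_json fills in the generic tp_getattro/tp_setattro/tp_alloc/tp_new/tp_free slots at runtime and publishes the two callable types as _json.make_scanner and _json.make_encoder. Assuming the extension built and imports as _json, a scanner can be constructed directly from a decoder instance (anything carrying the encoding, strict, object_hook, parse_float, parse_int and parse_constant attributes) and returns the same (value, end) tuples as the Python fallback:

    >>> import _json
    >>> scan = _json.make_scanner(json.JSONDecoder())
    >>> scan('[1, {"k": 2}]', 0)
    ([1, {u'k': 2}], 13)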