"""JSON token scanner """ import re try: from _json import make_scanner as c_make_scanner except ImportError: c_make_scanner = None try: from _json import scanstring as c_scanstring except ImportError: c_scanstring = None __all__ = ['make_scanner', 'JSONDecodeError'] class JSONDecodeError(ValueError): """Subclass of ValueError with the following additional properties: msg: The unformatted error message doc: The JSON document being parsed pos: The start index of doc where parsing failed lineno: The line corresponding to pos colno: The column corresponding to pos """ # Note that this exception is used from _json def __init__(self, msg, doc, pos): lineno = doc.count('\n', 0, pos) + 1 colno = pos - doc.rfind('\n', 0, pos) errmsg = '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos) ValueError.__init__(self, errmsg) self.msg = msg self.doc = doc self.pos = pos self.lineno = lineno self.colno = colno def __reduce__(self): return self.__class__, (self.msg, self.doc, self.pos) FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS) BACKSLASH = { '"': '"', '\\': '\\', '/': '/', 'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t', } def _decode_uXXXX(s, pos): esc = s[pos + 1:pos + 5] if len(esc) == 4 and esc[1] not in 'xX': try: return int(esc, 16) except ValueError: pass msg = "Invalid \\uXXXX escape" raise JSONDecodeError(msg, s, pos) def py_scanstring(s, end, parse_string, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match): """Scan the string s for a JSON string. End is the index of the character in s after the quote that started the JSON string. Unescapes all valid JSON string escape sequences and raises ValueError on attempt to decode an invalid string. If strict is False then literal control characters are allowed in the string. Returns a tuple of the decoded string and the index of the character in s after the end quote.""" chunks = [] _append = chunks.append begin = end - 1 while 1: chunk = _m(s, end) if chunk is None: raise JSONDecodeError("Unterminated string starting at", s, begin) end = chunk.end() content, terminator = chunk.groups() # Content is contains zero or more unescaped string characters if content: _append(content) # Terminator is the end of string, a literal control character, # or a backslash denoting that an escape sequence follows if terminator == '"': break elif terminator != '\\': if strict: # msg = "Invalid control character %r at" % (terminator,) msg = "Invalid control character {0!r} at".format(terminator) raise JSONDecodeError(msg, s, end) else: _append(terminator) continue try: esc = s[end] except IndexError: raise JSONDecodeError("Unterminated string starting at", s, begin) # If not a unicode escape sequence, must be in the lookup table if esc != 'u': try: char = _b[esc] except KeyError: msg = "Invalid \\escape: {0!r}".format(esc) raise JSONDecodeError(msg, s, end) end += 1 else: uni = _decode_uXXXX(s, end) end += 5 if 0xd800 <= uni <= 0xdbff and s[end:end + 2] == '\\u': uni2 = _decode_uXXXX(s, end + 1) if 0xdc00 <= uni2 <= 0xdfff: uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00)) end += 6 char = chr(uni) _append(char) string = ''.join(chunks) if parse_string is not None: string = parse_string(string) return string, end # Use speedup if available scanstring = py_scanstring # SWAP: scanstring = c_scanstring or py_scanstring WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS) WHITESPACE_STR = ' \t\n\r' def scanobject(s, end, parse_key, parse_object, parse_ordered_object, strict, scan_once, memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR): pairs = [] pairs_append = pairs.append # Backwards compatibility if memo is None: memo = {} memo_get = memo.setdefault # Use a slice to prevent IndexError from being raised, the following # check will raise a more specific ValueError if the string is empty nextchar = s[end:end + 1] # Normally we expect nextchar == '"' if nextchar != '"': if nextchar in _ws: end = _w(s, end).end() nextchar = s[end:end + 1] # Trivial empty object if nextchar == '}': if parse_ordered_object is not None: result = parse_ordered_object(pairs) return result, end + 1 pairs = {} if parse_object is not None: pairs = parse_object(pairs) return pairs, end + 1 elif nextchar != '"': raise JSONDecodeError( "Expecting property name enclosed in double quotes", s, end) end += 1 while True: key, end = scanstring(s, end, parse_key, strict) key = memo_get(key, key) # To skip some function call overhead we optimize the fast paths where # the JSON key separator is ": " or just ":". if s[end:end + 1] != ':': end = _w(s, end).end() if s[end:end + 1] != ':': raise JSONDecodeError("Expecting ':' delimiter", s, end) end += 1 try: if s[end] in _ws: end += 1 if s[end] in _ws: end = _w(s, end + 1).end() except IndexError: pass try: value, end = scan_once(s, end) except StopIteration as err: raise JSONDecodeError("Expecting value", s, err.value) from None pairs_append((key, value)) try: nextchar = s[end] if nextchar in _ws: end = _w(s, end + 1).end() nextchar = s[end] except IndexError: nextchar = '' end += 1 if nextchar == '}': break elif nextchar != ',': raise JSONDecodeError("Expecting ',' delimiter", s, end - 1) end = _w(s, end).end() nextchar = s[end:end + 1] end += 1 if nextchar != '"': raise JSONDecodeError( "Expecting property name enclosed in double quotes", s, end - 1) if parse_ordered_object is not None: result = parse_ordered_object(pairs) return result, end pairs = dict(pairs) if parse_object is not None: pairs = parse_object(pairs) return pairs, end def scanarray(s, end, parse_array, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR): values = [] nextchar = s[end:end + 1] if nextchar in _ws: end = _w(s, end + 1).end() nextchar = s[end:end + 1] # Look-ahead for trivial empty array if nextchar == ']': if parse_array is not None: values = parse_array(values) return values, end + 1 _append = values.append while True: try: value, end = scan_once(s, end) except StopIteration as err: raise JSONDecodeError("Expecting value", s, err.value) from None _append(value) nextchar = s[end:end + 1] if nextchar in _ws: end = _w(s, end + 1).end() nextchar = s[end:end + 1] end += 1 if nextchar == ']': break elif nextchar != ',': raise JSONDecodeError("Expecting ',' delimiter", s, end - 1) try: if s[end] in _ws: end += 1 if s[end] in _ws: end = _w(s, end + 1).end() except IndexError: pass if parse_array is not None: values = parse_array(values) return values, end NUMBER_RE = re.compile(r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?', FLAGS) def py_make_scanner(context): strict = context.strict parse_ordered_object = context.parse_ordered_object parse_object = context.parse_object parse_key = context.parse_key parse_array = context.parse_array parse_string = context.parse_string parse_float = context.parse_float parse_int = context.parse_int parse_constant = context.parse_constant memo = context.memo def _scan_once(string, idx): try: nextchar = string[idx] except IndexError: raise StopIteration(idx) if nextchar == '"': return scanstring(string, idx + 1, parse_string, strict) elif nextchar == '{': return scanobject(string, idx + 1, parse_key, parse_object, parse_ordered_object, strict, _scan_once, memo) elif nextchar == '[': return scanarray(string, idx + 1, parse_array, _scan_once) elif nextchar == 'n' and string[idx:idx + 4] == 'null': return None, idx + 4 elif nextchar == 't' and string[idx:idx + 4] == 'true': return True, idx + 4 elif nextchar == 'f' and string[idx:idx + 5] == 'false': return False, idx + 5 m = NUMBER_RE.match(string, idx) if m is not None: integer, frac, exp = m.groups() if frac or exp: res = parse_float(integer + (frac or '') + (exp or '')) else: res = parse_int(integer) return res, m.end() elif nextchar == 'N' and string[idx:idx + 3] == 'NaN': return parse_constant('NaN'), idx + 3 elif nextchar == 'I' and string[idx:idx + 8] == 'Infinity': return parse_constant('Infinity'), idx + 8 elif nextchar == '-' and string[idx:idx + 9] == '-Infinity': return parse_constant('-Infinity'), idx + 9 else: raise StopIteration(idx) def scan_once(string, idx): try: return _scan_once(string, idx) finally: memo.clear() return _scan_once make_scanner = py_make_scanner # SWAP: make_scanner = c_make_scanner or py_make_scanner