diff -r f21f0de30544 Doc/library/re.rst --- a/Doc/library/re.rst Wed Oct 08 13:15:36 2014 +0300 +++ b/Doc/library/re.rst Wed Oct 08 17:19:56 2014 +0300 @@ -726,13 +726,36 @@ form. Clear the regular expression cache. -.. exception:: error +.. exception:: error(msg, pattern=None, pos=None) Exception raised when a string passed to one of the functions here is not a valid regular expression (for example, it might contain unmatched parentheses) or when some other error occurs during compilation or matching. It is never an - error if a string contains no match for a pattern. + error if a string contains no match for a pattern. The error instance has + the following additional attributes: + .. attribute:: msg + + The unformatted error message. + + .. attribute:: pattern + + The regular expression pattern. + + .. attribute:: pos + + The index in *pattern* where compilation failed (may be ``None``). + + .. attribute:: lineno + + The line corresponding to *pos* (may be ``None``). + + .. attribute:: colno + + The column corresponding to *pos* (may be ``None``). + + .. versionchanged:: 3.5 + Added additional attributes. .. _re-objects: diff -r f21f0de30544 Lib/sre_constants.py --- a/Lib/sre_constants.py Wed Oct 08 13:15:36 2014 +0300 +++ b/Lib/sre_constants.py Wed Oct 08 17:19:56 2014 +0300 @@ -21,8 +21,37 @@ from _sre import MAXREPEAT, MAXGROUPS # should this really be here? 
class error(Exception): - pass + def __init__(self, msg, pattern=None, pos=None): + self.msg = msg + self.pattern = pattern + self.pos = pos + if pattern is not None and pos is not None: + msg = '%s at position %d' % (msg, pos) + if isinstance(pattern, str): + newline = '\n' + else: + newline = b'\n' + self.lineno = pattern.count(newline, 0, pos) + 1 + if self.lineno == 1: + self.colno = pos + 1 + else: + self.colno = pos - pattern.rindex(newline, 0, pos) + msg = '%s (line %d, column %d)' % (msg, self.lineno, self.colno) + else: + self.lineno = self.colno = None + super().__init__(msg) +def linecol(doc, pos): + if isinstance(doc, str): + newline = '\n' + else: + newline = b'\n' + lineno = doc.count(newline, 0, pos) + 1 + if lineno == 1: + colno = pos + 1 + else: + colno = pos - doc.rindex(newline, 0, pos) + return lineno, colno # operators FAILURE = "failure" diff -r f21f0de30544 Lib/sre_parse.py --- a/Lib/sre_parse.py Wed Oct 08 13:15:36 2014 +0300 +++ b/Lib/sre_parse.py Wed Oct 08 17:19:56 2014 +0300 @@ -207,7 +207,8 @@ class Tokenizer: try: c = self.string[self.index + 1] except IndexError: - raise error("bogus escape (end of line)") + self.next = None + raise self.error("bogus escape (end of line)", 0) if not self.istext: c = chr(c) char = char + c @@ -233,9 +234,13 @@ class Tokenizer: self.__next() return result def tell(self): - return self.index, self.next + return self.index - len(self.next or '') def seek(self, index): - self.index, self.next = index + self.index = index + self.__next() + + def error(self, msg, offset): + return error(msg, self.string, self.tell() - offset) # The following three functions are not used in this module anymore, but we keep # them here (with DeprecationWarnings) for backwards compatibility. 
@@ -299,8 +304,8 @@ def _class_escape(source, escape): escape += source.getwhile(2, OCTDIGITS) c = int(escape[1:], 8) if c > 0o377: - raise error('octal escape value %r outside of ' - 'range 0-0o377' % escape) + raise source.error('octal escape value %r outside of ' + 'range 0-0o377' % escape, len(escape)) return LITERAL, c elif c in DIGITS: raise ValueError @@ -308,7 +313,7 @@ def _class_escape(source, escape): return LITERAL, ord(escape[1]) except ValueError: pass - raise error("bogus escape: %s" % repr(escape)) + raise source.error("bogus escape: %s" % repr(escape), len(escape)) def _escape(source, escape, state): # handle escape code in expression @@ -354,21 +359,23 @@ def _escape(source, escape, state): escape = escape + source.get() c = int(escape[1:], 8) if c > 0o377: - raise error('octal escape value %r outside of ' - 'range 0-0o377' % escape) + raise source.error('octal escape value %r outside of ' + 'range 0-0o377' % escape, + len(escape)) return LITERAL, c # not an octal escape, so this is a group reference group = int(escape[1:]) if group < state.groups: if not state.checkgroup(group): - raise error("cannot refer to open group") + raise source.error("cannot refer to open group", + len(escape)) return GROUPREF, group raise ValueError if len(escape) == 2: return LITERAL, ord(escape[1]) except ValueError: pass - raise error("bogus escape: %s" % repr(escape)) + raise source.error("bogus escape: %s" % repr(escape), len(escape)) def _parse_sub(source, state, nested=1): # parse an alternation: a|b|c @@ -385,7 +392,7 @@ def _parse_sub(source, state, nested=1): if not source.next or sourcematch(")", 0): break else: - raise error("pattern not properly closed") + raise source.error("pattern not properly closed", 0) if len(items) == 1: return items[0] @@ -434,11 +441,12 @@ def _parse_sub_cond(source, state, condg if source.match("|"): item_no = _parse(source, state) if source.match("|"): - raise error("conditional backref with more than two branches") + raise 
source.error("conditional backref with more than two branches", + 1) else: item_no = None if source.next and not source.match(")", 0): - raise error("pattern not properly closed") + raise source.error("pattern not properly closed", 0) subpattern = SubPattern(state) subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) return subpattern @@ -503,7 +511,7 @@ def _parse(source, state): elif this: code1 = LITERAL, ord(this) else: - raise error("unexpected end of regular expression") + raise source.error("unexpected end of regular expression", 0) if sourcematch("-"): # potential range this = sourceget() @@ -519,14 +527,14 @@ def _parse(source, state): else: code2 = LITERAL, ord(this) if code1[0] != LITERAL or code2[0] != LITERAL: - raise error("bad character range") + raise source.error("bad character range", len(this)) lo = code1[1] hi = code2[1] if hi < lo: - raise error("bad character range") + raise source.error("bad character range", len(this)) setappend((RANGE, (lo, hi))) else: - raise error("unexpected end of regular expression") + raise source.error("unexpected end of regular expression", 0) else: if code1[0] is IN: code1 = code1[1][0] @@ -543,6 +551,7 @@ def _parse(source, state): elif this and this[0] in REPEAT_CHARS: # repeat previous item + here = source.tell() if this == "?": min, max = 0, 1 elif this == "*": @@ -554,7 +563,6 @@ def _parse(source, state): if source.next == "}": subpatternappend((LITERAL, ord(this))) continue - here = source.tell() min, max = 0, MAXREPEAT lo = hi = "" while source.next in DIGITS: @@ -577,18 +585,21 @@ def _parse(source, state): if max >= MAXREPEAT: raise OverflowError("the repetition number is too large") if max < min: - raise error("bad repeat interval") + raise source.error("bad repeat interval", + source.tell() - here) else: - raise error("not supported") + raise source.error("not supported", len(this)) # figure out which item to repeat if subpattern: item = subpattern[-1:] else: item = None if not item or 
(_len(item) == 1 and item[0][0] == AT): - raise error("nothing to repeat") + raise source.error("nothing to repeat", + source.tell() - here + len(this)) if item[0][0] in REPEATCODES: - raise error("multiple repeat") + raise source.error("multiple repeat", + source.tell() - here + len(this)) if sourcematch("?"): subpattern[-1] = (MIN_REPEAT, (min, max, item)) else: @@ -612,41 +623,45 @@ def _parse(source, state): while 1: char = sourceget() if char is None: - raise error("unterminated name") + raise source.error("unterminated name", 0) if char == ">": break name = name + char group = 1 if not name: - raise error("missing group name") + raise source.error("missing group name", 1) if not name.isidentifier(): - raise error("bad character in group name %r" % name) + raise source.error("bad character in group name " + "%r" % name, + len(name) + 1) elif sourcematch("="): # named backreference name = "" while 1: char = sourceget() if char is None: - raise error("unterminated name") + raise source.error("unterminated name", 0) if char == ")": break name = name + char if not name: - raise error("missing group name") + raise source.error("missing group name", 1) if not name.isidentifier(): - raise error("bad character in backref group name " - "%r" % name) + raise source.error("bad character in backref " + "group name %r" % name, + len(name) + 1) gid = state.groupdict.get(name) if gid is None: msg = "unknown group name: {0!r}".format(name) - raise error(msg) + raise source.error(msg, len(name) + 1) subpatternappend((GROUPREF, gid)) continue else: char = sourceget() if char is None: - raise error("unexpected end of pattern") - raise error("unknown specifier: ?P%s" % char) + raise source.error("unexpected end of pattern", 0) + raise source.error("unknown specifier: ?P%s" % char, + len(char)) elif sourcematch(":"): # non-capturing group group = 2 @@ -657,7 +672,7 @@ def _parse(source, state): break sourceget() if not sourcematch(")"): - raise error("unbalanced parenthesis") + 
raise source.error("unbalanced parenthesis", 0) continue elif source.next in ASSERTCHARS: # lookahead assertions @@ -665,12 +680,12 @@ def _parse(source, state): dir = 1 if char == "<": if source.next not in LOOKBEHINDASSERTCHARS: - raise error("syntax error") + raise source.error("syntax error", 0) dir = -1 # lookbehind char = sourceget() p = _parse_sub(source, state) if not sourcematch(")"): - raise error("unbalanced parenthesis") + raise source.error("unbalanced parenthesis", 0) if char == "=": subpatternappend((ASSERT, (dir, p))) else: @@ -682,33 +697,36 @@ def _parse(source, state): while 1: char = sourceget() if char is None: - raise error("unterminated name") + raise source.error("unterminated name", 0) if char == ")": break condname = condname + char group = 2 if not condname: - raise error("missing group name") + raise source.error("missing group name", 1) if condname.isidentifier(): condgroup = state.groupdict.get(condname) if condgroup is None: msg = "unknown group name: {0!r}".format(condname) - raise error(msg) + raise source.error(msg, len(condname) + 1) else: try: condgroup = int(condname) if condgroup < 0: raise ValueError except ValueError: - raise error("bad character in group name") + raise source.error("bad character in group name", + len(condname) + 1) if not condgroup: - raise error("bad group number") + raise source.error("bad group number", + len(condname) + 1) if condgroup >= MAXGROUPS: - raise error("the group number is too large") + raise source.error("the group number is too large", + len(condname) + 1) else: # flags if not source.next in FLAGS: - raise error("unexpected end of pattern") + raise source.error("unexpected end of pattern", 0) while source.next in FLAGS: state.flags = state.flags | FLAGS[sourceget()] if group: @@ -717,13 +735,16 @@ def _parse(source, state): # anonymous group group = None else: - group = state.opengroup(name) + try: + group = state.opengroup(name) + except error as err: + raise source.error(err.msg, 
len(name) + 1) if condgroup: p = _parse_sub_cond(source, state, condgroup) else: p = _parse_sub(source, state) if not sourcematch(")"): - raise error("unbalanced parenthesis") + raise source.error("unbalanced parenthesis", 0) if group is not None: state.closegroup(group) subpatternappend((SUBPATTERN, (group, p))) @@ -731,10 +752,10 @@ def _parse(source, state): while 1: char = sourceget() if char is None: - raise error("unexpected end of pattern") + raise source.error("unexpected end of pattern", 0) if char == ")": break - raise error("unknown extension") + raise source.error("unknown extension", len(char)) elif this == "^": subpatternappend((AT, AT_BEGINNING)) @@ -747,7 +768,7 @@ def _parse(source, state): subpatternappend(code) else: - raise error("parser error") + raise source.error("parser error", len(this)) return subpattern @@ -778,9 +799,10 @@ def parse(str, flags=0, pattern=None): tail = source.get() if tail == ")": - raise error("unbalanced parenthesis") + raise source.error("unbalanced parenthesis", 1) elif tail: - raise error("bogus characters at end of regular expression") + raise source.error("bogus characters at end of regular expression", + len(tail)) if flags & SRE_FLAG_DEBUG: p.dump() @@ -820,21 +842,23 @@ def parse_template(source, pattern): while True: char = sget() if char is None: - raise error("unterminated group name") + raise s.error("unterminated group name", 0) if char == ">": break name += char if not name: - raise error("missing group name") + raise s.error("missing group name", 1) try: index = int(name) if index < 0: - raise error("negative group number") + raise s.error("negative group number", len(name) + 1) if index >= MAXGROUPS: - raise error("the group number is too large") + raise s.error("the group number is too large", + len(name) + 1) except ValueError: if not name.isidentifier(): - raise error("bad character in group name") + raise s.error("bad character in group name", + len(name) + 1) try: index = pattern.groupindex[name] 
except KeyError: @@ -857,8 +881,8 @@ def parse_template(source, pattern): isoctal = True c = int(this[1:], 8) if c > 0o377: - raise error('octal escape value %r outside of ' - 'range 0-0o377' % this) + raise s.error('octal escape value %r outside of ' + 'range 0-0o377' % this, len(this)) lappend(chr(c)) if not isoctal: addgroup(int(this[1:])) diff -r f21f0de30544 Lib/test/test_re.py --- a/Lib/test/test_re.py Wed Oct 08 13:15:36 2014 +0300 +++ b/Lib/test/test_re.py Wed Oct 08 17:19:56 2014 +0300 @@ -1270,6 +1270,40 @@ subpattern None # with ignore case. self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3)) + def test_error(self): + with self.assertRaises(re.error) as cm: + re.compile('(\u20ac))') + err = cm.exception + self.assertIsInstance(err.pattern, str) + self.assertEqual(err.pattern, '(\u20ac))') + self.assertEqual(err.pos, 3) + self.assertEqual(err.lineno, 1) + self.assertEqual(err.colno, 4) + self.assertIn(err.msg, str(err)) + self.assertIn(' at position 3', str(err)) + self.assertNotIn(' at position 3', err.msg) + with self.assertRaises(re.error) as cm: + re.compile(b'(\xa4))') + err = cm.exception + self.assertIsInstance(err.pattern, bytes) + self.assertEqual(err.pattern, b'(\xa4))') + self.assertEqual(err.pos, 3) + with self.assertRaises(re.error) as cm: + re.compile(""" + ( + abc + ) + ) + ( + """, re.VERBOSE) + err = cm.exception + self.assertEqual(err.pos, 77) + self.assertEqual(err.lineno, 5) + self.assertEqual(err.colno, 17) + self.assertIn(err.msg, str(err)) + self.assertIn(' at position 77', str(err)) + self.assertIn('(line 5, column 17)', str(err)) + class PatternReprTests(unittest.TestCase): def check(self, pattern, expected):