diff -r 7ed9c601accd Lib/re.py --- a/Lib/re.py Fri Feb 06 09:59:05 2015 -0800 +++ b/Lib/re.py Sun Feb 08 01:18:39 2015 +0200 @@ -286,7 +286,7 @@ def _compile(pattern, flags): if isinstance(pattern, _pattern_type): if flags: raise ValueError( - "Cannot process flags argument with a compiled pattern") + "cannot process flags argument with a compiled pattern") return pattern if not sre_compile.isstring(pattern): raise TypeError("first argument must be string or compiled pattern") diff -r 7ed9c601accd Lib/sre_compile.py --- a/Lib/sre_compile.py Fri Feb 06 09:59:05 2015 -0800 +++ b/Lib/sre_compile.py Sun Feb 08 01:18:39 2015 +0200 @@ -113,7 +113,7 @@ def _compile(code, pattern, flags): emit(ANY) elif op in REPEATING_CODES: if flags & SRE_FLAG_TEMPLATE: - raise error("internal: unsupported template operator") + raise error("internal: unsupported template operator %r" % (op,)) elif _simple(av) and op is not REPEAT: if op is MAX_REPEAT: emit(REPEAT_ONE) @@ -216,7 +216,7 @@ def _compile(code, pattern, flags): else: code[skipyes] = _len(code) - skipyes + 1 else: - raise ValueError("unsupported operand type", op) + raise error("internal: unsupported operand type %r" % (op,)) def _compile_charset(charset, flags, code, fixup=None, fixes=None): # compile charset subprogram @@ -242,7 +242,7 @@ def _compile_charset(charset, flags, cod else: emit(av) else: - raise error("internal: unsupported set operator") + raise error("internal: unsupported set operator %r" % (op,)) emit(FAILURE) def _optimize_charset(charset, fixup, fixes): diff -r 7ed9c601accd Lib/sre_parse.py --- a/Lib/sre_parse.py Fri Feb 06 09:59:05 2015 -0800 +++ b/Lib/sre_parse.py Sun Feb 08 01:18:39 2015 +0200 @@ -75,7 +75,7 @@ class Pattern: gid = self.groups self.groups = gid + 1 if self.groups > MAXGROUPS: - raise error("groups number is too large") + raise error("too many groups") if name is not None: ogid = self.groupdict.get(name, None) if ogid is not None: @@ -210,7 +210,7 @@ class Tokenizer: try: char += self.decoded_string[index] except IndexError: - raise error("bogus escape (end of line)", + raise error("bad escape (end of pattern)", self.string, len(self.string) - 1) from None self.index = index + 1 self.next = char @@ -238,8 +238,13 @@ class Tokenizer: c = self.next self.__next() if c is None: - raise self.error("unterminated name") + if not result: + raise self.error("missing group name") + raise self.error("missing %s, unterminated name" % terminator, + len(result)) if c == terminator: + if not result: + raise self.error("missing group name", 1) break result += c return result @@ -314,7 +319,7 @@ def _class_escape(source, escape): escape += source.getwhile(2, OCTDIGITS) c = int(escape[1:], 8) if c > 0o377: - raise source.error('octal escape value %r outside of ' + raise source.error('octal escape value %s outside of ' 'range 0-0o377' % escape, len(escape)) return LITERAL, c elif c in DIGITS: @@ -323,7 +328,7 @@ def _class_escape(source, escape): return LITERAL, ord(escape[1]) except ValueError: pass - raise source.error("bogus escape: %r" % escape, len(escape)) + raise source.error("bad escape %s" % escape, len(escape)) def _escape(source, escape, state): # handle escape code in expression @@ -369,7 +374,7 @@ def _escape(source, escape, state): escape += source.get() c = int(escape[1:], 8) if c > 0o377: - raise source.error('octal escape value %r outside of ' + raise source.error('octal escape value %s outside of ' 'range 0-0o377' % escape, len(escape)) return LITERAL, c @@ -377,15 +382,15 @@ def _escape(source, escape, state): group = int(escape[1:]) if group < state.groups: if not state.checkgroup(group): - raise source.error("cannot refer to open group", + raise source.error("cannot refer to an open group", len(escape)) return GROUPREF, group - raise ValueError + raise source.error("invalid group reference", len(escape)) if len(escape) == 2: return LITERAL, ord(escape[1]) except ValueError: pass - raise source.error("bogus escape: %r" % escape, len(escape)) + raise source.error("bad escape %s" % escape, len(escape)) def _parse_sub(source, state, nested=True): # parse an alternation: a|b|c @@ -393,12 +398,11 @@ def _parse_sub(source, state, nested=Tru items = [] itemsappend = items.append sourcematch = source.match + start = source.tell() while True: itemsappend(_parse(source, state)) if not sourcematch("|"): break - if nested and source.next is not None and source.next != ")": - raise source.error("pattern not properly closed") if len(items) == 1: return items[0] @@ -446,8 +450,6 @@ def _parse_sub_cond(source, state, condg raise source.error("conditional backref with more than two branches") else: item_no = None - if source.next is not None and source.next != ")": - raise source.error("pattern not properly closed") subpattern = SubPattern(state) subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) return subpattern @@ -492,6 +494,7 @@ def _parse(source, state): subpatternappend((LITERAL, _ord(this))) elif this == "[": + here = source.tell() - 1 # character set set = [] setappend = set.append @@ -504,7 +507,8 @@ def _parse(source, state): while True: this = sourceget() if this is None: - raise source.error("unexpected end of regular expression") + raise source.error("unterminated character set", + source.tell() - here) if this == "]" and set != start: break elif this[0] == "\\": @@ -513,25 +517,28 @@ def _parse(source, state): code1 = LITERAL, _ord(this) if sourcematch("-"): # potential range - this = sourceget() - if this is None: - raise source.error("unexpected end of regular expression") - if this == "]": + that = sourceget() + if that is None: + raise source.error("bad character range %s-" % this, + len(this) + 1) + if that == "]": if code1[0] is IN: code1 = code1[1][0] setappend(code1) setappend((LITERAL, _ord("-"))) break - if this[0] == "\\": - code2 = _class_escape(source, this) + if that[0] == "\\": + code2 = _class_escape(source, that) else: - code2 = LITERAL, _ord(this) + code2 = LITERAL, _ord(that) if code1[0] != LITERAL or code2[0] != LITERAL: - raise source.error("bad character range", len(this)) + raise source.error("bad character range %s-%s" % (this, that), + len(this) + 1 + len(that)) lo = code1[1] hi = code2[1] if hi < lo: - raise source.error("bad character range", len(this)) + raise source.error("bad character range %s-%s" % (this, that), + len(this) + 1 + len(that)) setappend((RANGE, (lo, hi))) else: if code1[0] is IN: @@ -586,7 +593,7 @@ def _parse(source, state): raise source.error("bad repeat interval", source.tell() - here) else: - raise source.error("not supported", len(this)) + raise AssertionError("unsupported quantifier %r" % (char,)) # figure out which item to repeat if subpattern: item = subpattern[-1:] @@ -607,23 +614,20 @@ def _parse(source, state): subpatternappend((ANY, None)) elif this == "(": - group = 1 + start = source.tell() - 1 + group = True name = None condgroup = None if sourcematch("?"): - group = 0 # options char = sourceget() if char is None: - raise self.error("unexpected end of pattern") + raise source.error("unexpected end of pattern") if char == "P": # python extensions if sourcematch("<"): # named group: skip forward to end of name name = source.getuntil(">") - group = 1 - if not name: - raise source.error("missing group name", 1) if not name.isidentifier(): raise source.error("bad character in group name " "%r" % name, @@ -631,15 +635,13 @@ def _parse(source, state): elif sourcematch("="): # named backreference name = source.getuntil(")") - if not name: - raise source.error("missing group name", 1) if not name.isidentifier(): - raise source.error("bad character in backref " - "group name %r" % name, + raise source.error("bad character in group name " + "%r" % name, len(name) + 1) gid = state.groupdict.get(name) if gid is None: - msg = "unknown group name: {0!r}".format(name) + msg = "unknown group name {0!r}".format(name) raise source.error(msg, len(name) + 1) subpatternappend((GROUPREF, gid)) continue @@ -647,16 +649,17 @@ def _parse(source, state): char = sourceget() if char is None: raise source.error("unexpected end of pattern") - raise source.error("unknown specifier: ?P%s" % char, - len(char)) + raise source.error("unknown extension ?P" + char, + len(char) + 2) elif char == ":": # non-capturing group - group = 2 + group = None elif char == "#": # comment while True: if source.next is None: - raise source.error("unbalanced parenthesis") + raise source.error("missing ), unterminated comment", + source.tell() - start) if sourceget() == ")": break continue @@ -665,12 +668,16 @@ def _parse(source, state): dir = 1 if char == "<": char = sourceget() - if char is None or char not in "=!": - raise source.error("syntax error") + if char is None: + raise source.error("unexpected end of pattern") + if char not in "=!": + raise source.error("unknown extension ?<" + char, + len(char) + 2) dir = -1 # lookbehind p = _parse_sub(source, state) - if not sourcematch(")"): - raise source.error("unbalanced parenthesis") + if not source.match(")"): + raise source.error("missing ), unterminated subpattern", + source.tell() - start) if char == "=": subpatternappend((ASSERT, (dir, p))) else: @@ -679,13 +686,11 @@ def _parse(source, state): elif char == "(": # conditional backreference group condname = source.getuntil(")") - group = 2 - if not condname: - raise source.error("missing group name", 1) + group = None if condname.isidentifier(): condgroup = state.groupdict.get(condname) if condgroup is None: - msg = "unknown group name: {0!r}".format(condname) + msg = "unknown group name {0!r}".format(condname) raise source.error(msg, len(condname) + 1) else: try: @@ -693,49 +698,48 @@ def _parse(source, state): if condgroup < 0: raise ValueError except ValueError: - raise source.error("bad character in group name", - len(condname) + 1) + raise source.error("bad character in group name " + "%r" % condname, + len(condname) + 1) from None if not condgroup: raise source.error("bad group number", len(condname) + 1) if condgroup >= MAXGROUPS: - raise source.error("the group number is too large", + raise source.error("invalid group reference", len(condname) + 1) elif char in FLAGS: # flags - state.flags |= FLAGS[char] - while source.next in FLAGS: - state.flags |= FLAGS[sourceget()] + while True: + state.flags |= FLAGS[char] + char = sourceget() + if char is None: + raise source.error("missing )") + if char == ")": + break + if char not in FLAGS: + raise source.error("unknown flag", len(char)) verbose = state.flags & SRE_FLAG_VERBOSE + continue else: - raise source.error("unexpected end of pattern") - if group: - # parse group contents - if group == 2: - # anonymous group - group = None - else: - try: - group = state.opengroup(name) - except error as err: - raise source.error(err.msg, len(name) + 1) - if condgroup: - p = _parse_sub_cond(source, state, condgroup) - else: - p = _parse_sub(source, state) - if not sourcematch(")"): - raise source.error("unbalanced parenthesis") - if group is not None: - state.closegroup(group) - subpatternappend((SUBPATTERN, (group, p))) + raise source.error("unknown extension ?" + char, + len(char) + 1) + + # parse group contents + if group is not None: + try: + group = state.opengroup(name) + except error as err: + raise source.error(err.msg, len(name) + 1) from None + if condgroup: + p = _parse_sub_cond(source, state, condgroup) else: - while True: - char = sourceget() - if char is None: - raise source.error("unexpected end of pattern") - if char == ")": - break - raise source.error("unknown extension", len(char)) + p = _parse_sub(source, state) + if not source.match(")"): + raise source.error("missing ), unterminated subpattern", + source.tell() - start) + if group is not None: + state.closegroup(group) + subpatternappend((SUBPATTERN, (group, p))) elif this == "^": subpatternappend((AT, AT_BEGINNING)) @@ -744,7 +748,7 @@ def _parse(source, state): subpattern.append((AT, AT_END)) else: - raise source.error("parser error", len(this)) + raise AssertionError("unsupported special character %r" % (char,)) return subpattern @@ -762,7 +766,7 @@ def fix_flags(src, flags): raise ValueError("ASCII and UNICODE flags are incompatible") else: if flags & SRE_FLAG_UNICODE: - raise ValueError("can't use UNICODE flag with a bytes pattern") + raise ValueError("cannot use UNICODE flag with a bytes pattern") if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII: import warnings warnings.warn("ASCII and LOCALE flags are incompatible. " @@ -784,11 +788,8 @@ def parse(str, flags=0, pattern=None): p.pattern.flags = fix_flags(str, p.pattern.flags) if source.next is not None: - if source.next == ")": - raise source.error("unbalanced parenthesis") - else: - raise source.error("bogus characters at end of regular expression", - len(tail)) + assert source.next == ")" + raise source.error("unbalanced parenthesis") if flags & SRE_FLAG_DEBUG: p.dump() @@ -824,26 +825,26 @@ def parse_template(source, pattern): c = this[1] if c == "g": name = "" - if s.match("<"): - name = s.getuntil(">") - if not name: - raise s.error("missing group name", 1) - try: - index = int(name) - if index < 0: - raise s.error("negative group number", len(name) + 1) - if index >= MAXGROUPS: - raise s.error("the group number is too large", - len(name) + 1) - except ValueError: - if not name.isidentifier(): - raise s.error("bad character in group name", - len(name) + 1) + if not s.match("<"): + raise s.error("missing <") + name = s.getuntil(">") + if name.isidentifier(): try: index = pattern.groupindex[name] except KeyError: - msg = "unknown group name: {0!r}".format(name) + msg = "unknown group name {0!r}".format(name) raise IndexError(msg) + else: + try: + index = int(name) + if index < 0: + raise ValueError + except ValueError: + raise s.error("bad character in group name %r" % name, + len(name) + 1) from None + if index >= MAXGROUPS: + raise s.error("invalid group reference", + len(name) + 1) addgroup(index) elif c == "0": if s.next in OCTDIGITS: @@ -861,7 +862,7 @@ def parse_template(source, pattern): isoctal = True c = int(this[1:], 8) if c > 0o377: - raise s.error('octal escape value %r outside of ' + raise s.error('octal escape value %s outside of ' 'range 0-0o377' % this, len(this)) lappend(chr(c)) if not isoctal: diff -r 7ed9c601accd Lib/test/test_re.py --- a/Lib/test/test_re.py Fri Feb 06 09:59:05 2015 -0800 +++ b/Lib/test/test_re.py Sun Feb 08 01:18:39 2015 +0200 @@ -38,6 +38,24 @@ class ReTests(unittest.TestCase): self.assertIs(type(actual), type(expect), msg) recurse(actual, expect) + def checkPatternError(self, pattern, errmsg, pos=None): + with self.assertRaises(re.error) as cm: + re.compile(pattern) + with self.subTest(pattern=pattern): + err = cm.exception + self.assertEqual(err.msg, errmsg) + if pos is not None: + self.assertEqual(err.pos, pos) + + def checkTemplateError(self, pattern, repl, string, errmsg, pos=None): + with self.assertRaises(re.error) as cm: + re.sub(pattern, repl, string) + with self.subTest(pattern=pattern, repl=repl): + err = cm.exception + self.assertEqual(err.msg, errmsg) + if pos is not None: + self.assertEqual(err.pos, pos) + def test_keep_buffer(self): # See bug 14212 b = bytearray(b'x') @@ -155,21 +173,25 @@ class ReTests(unittest.TestCase): self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9') self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a') - self.assertRaises(re.error, re.sub, 'x', r'\400', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\777', 'x') + self.checkTemplateError('x', r'\400', 'x', + r'octal escape value \400 outside of ' + r'range 0-0o377', 0) + self.checkTemplateError('x', r'\777', 'x', + r'octal escape value \777 outside of ' + r'range 0-0o377', 0) - self.assertRaises(re.error, re.sub, 'x', r'\1', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\8', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\9', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\11', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\18', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\90', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\99', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8' - self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1' - self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0' + self.checkTemplateError('x', r'\1', 'x', 'invalid group reference') + self.checkTemplateError('x', r'\8', 'x', 'invalid group reference') + self.checkTemplateError('x', r'\9', 'x', 'invalid group reference') + self.checkTemplateError('x', r'\11', 'x', 'invalid group reference') + self.checkTemplateError('x', r'\18', 'x', 'invalid group reference') + self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference') + self.checkTemplateError('x', r'\90', 'x', 'invalid group reference') + self.checkTemplateError('x', r'\99', 'x', 'invalid group reference') + self.checkTemplateError('x', r'\118', 'x', 'invalid group reference') # r'\11' + '8' + self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference') + self.checkTemplateError('x', r'\181', 'x', 'invalid group reference') # r'\18' + '1' + self.checkTemplateError('x', r'\800', 'x', 'invalid group reference') # r'\80' + '0' # in python2.3 (etc), these loop endlessly in sre_parser.py self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x') @@ -195,47 +217,65 @@ class ReTests(unittest.TestCase): re.compile('(?Px)(?P=a)(?(a)y)') re.compile('(?Px)(?P=a1)(?(a1)y)') re.compile('(?Px)\1(?(1)y)') - self.assertRaises(re.error, re.compile, '(?P)(?P)') - self.assertRaises(re.error, re.compile, '(?Px)') - self.assertRaises(re.error, re.compile, '(?P=)') - self.assertRaises(re.error, re.compile, '(?P=1)') - self.assertRaises(re.error, re.compile, '(?P=a)') - self.assertRaises(re.error, re.compile, '(?P=a1)') - self.assertRaises(re.error, re.compile, '(?P=a.)') - self.assertRaises(re.error, re.compile, '(?P<)') - self.assertRaises(re.error, re.compile, '(?P<>)') - self.assertRaises(re.error, re.compile, '(?P<1>)') - self.assertRaises(re.error, re.compile, '(?P)') - self.assertRaises(re.error, re.compile, '(?())') - self.assertRaises(re.error, re.compile, '(?(a))') - self.assertRaises(re.error, re.compile, '(?(1a))') - self.assertRaises(re.error, re.compile, '(?(a.))') + self.checkPatternError('(?P)(?P)', + "redefinition of group name 'a' as group 2; " + "was group 1") + self.checkPatternError('(?Pxy)', 'unknown extension ?Px') + self.checkPatternError('(?P)(?P=a', 'missing ), unterminated name', 11) + self.checkPatternError('(?P=', 'missing group name', 4) + self.checkPatternError('(?P=)', 'missing group name', 4) + self.checkPatternError('(?P=1)', "bad character in group name '1'", 4) + self.checkPatternError('(?P=a)', "unknown group name 'a'") + self.checkPatternError('(?P=a1)', "unknown group name 'a1'") + self.checkPatternError('(?P=a.)', "bad character in group name 'a.'", 4) + self.checkPatternError('(?P<)', 'missing >, unterminated name', 4) + self.checkPatternError('(?P, unterminated name', 4) + self.checkPatternError('(?P<', 'missing group name', 4) + self.checkPatternError('(?P<>)', 'missing group name', 4) + self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4) + self.checkPatternError(r'(?P)', "bad character in group name 'a.'", 4) + self.checkPatternError(r'(?(', 'missing group name', 3) + self.checkPatternError(r'(?())', 'missing group name', 3) + self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3) + self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3) + self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3) + self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3) # New valid/invalid identifiers in Python 3 re.compile('(?P<ยต>x)(?P=ยต)(?(ยต)y)') re.compile('(?P<๐”˜๐”ซ๐”ฆ๐” ๐”ฌ๐”ก๐”ข>x)(?P=๐”˜๐”ซ๐”ฆ๐” ๐”ฌ๐”ก๐”ข)(?(๐”˜๐”ซ๐”ฆ๐” ๐”ฌ๐”ก๐”ข)y)') - self.assertRaises(re.error, re.compile, '(?P<ยฉ>x)') + self.checkPatternError('(?P<ยฉ>x)', "bad character in group name 'ยฉ'", 4) # Support > 100 groups. pat = '|'.join('x(?P%x)y' % (i, i) for i in range(1, 200 + 1)) pat = '(?:%s)(?(200)z|t)' % pat self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) def test_symbolic_refs(self): - self.assertRaises(re.error, re.sub, '(?Px)', '\gx)', '\g<', 'xx') - self.assertRaises(re.error, re.sub, '(?Px)', '\g', 'xx') - self.assertRaises(re.error, re.sub, '(?Px)', '\g', 'xx') - self.assertRaises(re.error, re.sub, '(?Px)', '\g<>', 'xx') - self.assertRaises(re.error, re.sub, '(?Px)', '\g<1a1>', 'xx') - self.assertRaises(re.error, re.sub, '(?Px)', r'\g<2>', 'xx') - self.assertRaises(re.error, re.sub, '(?Px)', r'\2', 'xx') - self.assertRaises(IndexError, re.sub, '(?Px)', '\g', 'xx') + self.checkTemplateError('(?Px)', '\g, unterminated name', 3) + self.checkTemplateError('(?Px)', '\g<', 'xx', + 'missing group name', 3) + self.checkTemplateError('(?Px)', '\g', 'xx', 'missing <', 2) + self.checkTemplateError('(?Px)', '\g', 'xx', + "bad character in group name 'a a'", 3) + self.checkTemplateError('(?Px)', '\g<>', 'xx', + 'missing group name', 3) + self.checkTemplateError('(?Px)', '\g<1a1>', 'xx', + "bad character in group name '1a1'", 3) + self.checkTemplateError('(?Px)', r'\g<2>', 'xx', + 'invalid group reference') + self.checkTemplateError('(?Px)', r'\2', 'xx', + 'invalid group reference') + with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"): + re.sub('(?Px)', '\g', 'xx') self.assertEqual(re.sub('(?Px)|(?Py)', r'\g', 'xx'), '') self.assertEqual(re.sub('(?Px)|(?Py)', r'\2', 'xx'), '') - self.assertRaises(re.error, re.sub, '(?Px)', '\g<-1>', 'xx') + self.checkTemplateError('(?Px)', '\g<-1>', 'xx', + "bad character in group name '-1'", 3) # New valid/invalid identifiers in Python 3 self.assertEqual(re.sub('(?P<ยต>x)', r'\g<ยต>', 'xx'), 'xx') self.assertEqual(re.sub('(?P<๐”˜๐”ซ๐”ฆ๐” ๐”ฌ๐”ก๐”ข>x)', r'\g<๐”˜๐”ซ๐”ฆ๐” ๐”ฌ๐”ก๐”ข>', 'xx'), 'xx') - self.assertRaises(re.error, re.sub, '(?Px)', r'\g<ยฉ>', 'xx') + self.checkTemplateError('(?Px)', '\g<ยฉ>', 'xx', + "bad character in group name 'ยฉ'", 3) # Support > 100 groups. pat = '|'.join('x(?P%x)y' % (i, i) for i in range(1, 200 + 1)) self.assertEqual(re.sub(pat, '\g<200>', 'xc8yzxc8y'), 'c8zc8') @@ -441,6 +481,19 @@ class ReTests(unittest.TestCase): pat = '(?:%s)(?(200)z)' % pat self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) + self.checkPatternError(r'(?P)(?(0))', 'bad group number', 10) + self.checkPatternError(r'()(?(1)a|b', + 'missing ), unterminated subpattern', 2) + self.checkPatternError(r'()(?(1)a|b|c)', + 'conditional backref with more than ' + 'two branches', 10) + + def test_re_groupref_overflow(self): + self.checkTemplateError('()', '\g<%s>' % sre_constants.MAXGROUPS, 'xx', + 'invalid group reference', 3) + self.checkPatternError(r'(?P)(?(%d))' % sre_constants.MAXGROUPS, + 'invalid group reference', 10) + def test_re_groupref(self): self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(), ('|', 'a')) @@ -453,6 +506,8 @@ class ReTests(unittest.TestCase): self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(), (None, None)) + self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4) + def test_groupdict(self): self.assertEqual(re.match('(?Pfirst) (?Psecond)', 'first second').groupdict(), @@ -500,6 +555,8 @@ class ReTests(unittest.TestCase): self.assertIsNone(re.match("^x{}$", "xxx")) self.assertTrue(re.match("^x{}$", "x{}")) + self.checkPatternError(r'x{2,1}', 'bad repeat interval', 2) + def test_getattr(self): self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)") self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U) @@ -547,7 +604,7 @@ class ReTests(unittest.TestCase): b"1aa! a", re.LOCALE).group(0), b"1aa! a") def test_other_escapes(self): - self.assertRaises(re.error, re.compile, "\\") + self.checkPatternError("\\", 'bad escape (end of pattern)', 0) self.assertEqual(re.match(r"\(", '(').group(), '(') self.assertIsNone(re.match(r"\(", ')')) self.assertEqual(re.match(r"\\", '\\').group(), '\\') @@ -823,15 +880,17 @@ class ReTests(unittest.TestCase): self.assertTrue(re.match(r"\08", "\0008")) self.assertTrue(re.match(r"\01", "\001")) self.assertTrue(re.match(r"\018", "\0018")) - self.assertRaises(re.error, re.match, r"\567", "") - self.assertRaises(re.error, re.match, r"\911", "") - self.assertRaises(re.error, re.match, r"\x1", "") - self.assertRaises(re.error, re.match, r"\x1z", "") - self.assertRaises(re.error, re.match, r"\u123", "") - self.assertRaises(re.error, re.match, r"\u123z", "") - self.assertRaises(re.error, re.match, r"\U0001234", "") - self.assertRaises(re.error, re.match, r"\U0001234z", "") - self.assertRaises(re.error, re.match, r"\U00110000", "") + self.checkPatternError(r"\567", + r'octal escape value \567 outside of ' + r'range 0-0o377', 0) + self.checkPatternError(r"\911", 'invalid group reference', 0) + self.checkPatternError(r"\x1", r'bad escape \x1', 0) + self.checkPatternError(r"\x1z", r'bad escape \x1', 0) + self.checkPatternError(r"\u123", r'bad escape \u123', 0) + self.checkPatternError(r"\u123z", r'bad escape \u123', 0) + self.checkPatternError(r"\U0001234", r'bad escape \U0001234', 0) + self.checkPatternError(r"\U0001234z", r'bad escape \U0001234', 0) + self.checkPatternError(r"\U00110000", r'bad escape \U00110000', 0) def test_sre_character_class_literals(self): for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]: @@ -851,12 +910,14 @@ class ReTests(unittest.TestCase): self.assertTrue(re.match(r"[\U%08x]" % i, chr(i))) self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0")) self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z")) - self.assertRaises(re.error, re.match, r"[\567]", "") - self.assertRaises(re.error, re.match, r"[\911]", "") - self.assertRaises(re.error, re.match, r"[\x1z]", "") - self.assertRaises(re.error, re.match, r"[\u123z]", "") - self.assertRaises(re.error, re.match, r"[\U0001234z]", "") - self.assertRaises(re.error, re.match, r"[\U00110000]", "") + self.checkPatternError(r"[\567]", + r'octal escape value \567 outside of ' + r'range 0-0o377', 1) + self.checkPatternError(r"[\911]", r'bad escape \9', 1) + self.checkPatternError(r"[\x1z]", r'bad escape \x1', 1) + self.checkPatternError(r"[\u123z]", r'bad escape \u123', 1) + self.checkPatternError(r"[\U0001234z]", r'bad escape \U0001234', 1) + self.checkPatternError(r"[\U00110000]", r'bad escape \U00110000', 1) self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e")) def test_sre_byte_literals(self): @@ -873,10 +934,12 @@ class ReTests(unittest.TestCase): self.assertTrue(re.match(br"\08", b"\0008")) self.assertTrue(re.match(br"\01", b"\001")) self.assertTrue(re.match(br"\018", b"\0018")) - self.assertRaises(re.error, re.match, br"\567", b"") - self.assertRaises(re.error, re.match, br"\911", b"") - self.assertRaises(re.error, re.match, br"\x1", b"") - self.assertRaises(re.error, re.match, br"\x1z", b"") + self.checkPatternError(br"\567", + r'octal escape value \567 outside of ' + r'range 0-0o377', 0) + self.checkPatternError(br"\911", 'invalid group reference', 0) + self.checkPatternError(br"\x1", r'bad escape \x1', 0) + self.checkPatternError(br"\x1z", r'bad escape \x1', 0) def test_sre_byte_class_literals(self): for i in [0, 8, 16, 32, 64, 127, 128, 255]: @@ -890,9 +953,22 @@ class ReTests(unittest.TestCase): self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i]))) self.assertTrue(re.match(br"[\u]", b'u')) self.assertTrue(re.match(br"[\U]", b'U')) - self.assertRaises(re.error, re.match, br"[\567]", b"") - self.assertRaises(re.error, re.match, br"[\911]", b"") - self.assertRaises(re.error, re.match, br"[\x1z]", b"") + self.checkPatternError(br"[\567]", + r'octal escape value \567 outside of ' + r'range 0-0o377', 1) + self.checkPatternError(br"[\911]", r'bad escape \9', 1) + self.checkPatternError(br"[\x1z]", r'bad escape \x1', 1) + + def test_character_set_errors(self): + self.checkPatternError(r'[', 'unterminated character set', 0) + self.checkPatternError(r'[^', 'unterminated character set', 0) + self.checkPatternError(r'[a', 'unterminated character set', 0) + # bug 545855 -- This pattern failed to cause a compile error as it + # should, instead provoking a TypeError. + self.checkPatternError(r"[a-", 'bad character range a-', 1) + self.checkPatternError(r"[\w-b]", r'bad character range \w-b', 1) + self.checkPatternError(r"[a-\w]", r'bad character range a-\w', 1) + self.checkPatternError(r"[b-a]", 'bad character range b-a', 1) def test_bug_113254(self): self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1) @@ -910,7 +986,7 @@ class ReTests(unittest.TestCase): def test_bug_545855(self): # bug 545855 -- This pattern failed to cause a compile error as it # should, instead provoking a TypeError. - self.assertRaises(re.error, re.compile, 'foo[a-') + self.checkPatternError('foo[a-', 'bad character range a-', 4) def test_bug_418626(self): # bugs 418626 at al. -- Testing Greg Chapman's addition of op code @@ -935,6 +1011,17 @@ class ReTests(unittest.TestCase): self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x') self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x') + def test_nothing_to_repeat(self): + for op in ('*', '+', '?', '{1,2}'): + self.checkPatternError(op, 'nothing to repeat', 0) + self.checkPatternError('(?:%s)' % op, 'nothing to repeat', 3) + + def test_multiple_repeat(self): + for outer_op in ('*', '+', '{1,2}'): + for inner_op in ('*', '+', '?', '{1,2}'): + self.checkPatternError(r'x%s%s' % (inner_op, outer_op), + 'multiple repeat', 1 + len(inner_op)) + def test_unlimited_zero_width_repeat(self): # Issue #9669 self.assertIsNone(re.match(r'(?:a?)*y', 'z')) @@ -1324,13 +1411,13 @@ class ReTests(unittest.TestCase): def test_backref_group_name_in_exception(self): # Issue 17341: Poor error message when compiling invalid regex - with self.assertRaisesRegex(sre_constants.error, ''): - re.compile('(?P=)') + self.checkPatternError('(?P=)', + "bad character in group name ''", 4) def test_group_name_in_exception(self): # Issue 17341: Poor error message when compiling invalid regex - with self.assertRaisesRegex(sre_constants.error, '\?foo'): - re.compile('(?P)') + self.checkPatternError('(?P)', + "bad character in group name '?foo'", 4) def test_issue17998(self): for reps in '*', '+', '?', '{1}': @@ -1499,6 +1586,19 @@ SUBPATTERN None self.assertIn(' at position 77', str(err)) self.assertIn('(line 5, column 17)', str(err)) + def test_misc_errors(self): + self.checkPatternError(r'(', 'missing ), unterminated subpattern', 0) + self.checkPatternError(r'((a|b)', 'missing ), unterminated subpattern', 0) + self.checkPatternError(r'(a|b))', 'unbalanced parenthesis', 5) + self.checkPatternError(r'(?P', 'unexpected end of pattern', 3) + self.checkPatternError(r'(?z)', 'unknown extension ?z', 1) + self.checkPatternError(r'(?iz)', 'unknown flag', 3) + self.checkPatternError(r'(?i', 'missing )', 3) + self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0) + self.checkPatternError(r'(?<', 'unexpected end of pattern', 3) + self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1) + self.checkPatternError(r'(?', 'unexpected end of pattern', 2) + class PatternReprTests(unittest.TestCase): def check(self, pattern, expected): diff -r 7ed9c601accd Modules/_sre.c --- a/Modules/_sre.c Fri Feb 06 09:59:05 2015 -0800 +++ b/Modules/_sre.c Sun Feb 08 01:18:39 2015 +0200 @@ -315,7 +315,7 @@ getstring(PyObject* string, Py_ssize_t* /* get pointer to byte string buffer */ if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) { - PyErr_SetString(PyExc_TypeError, "expected string or buffer"); + PyErr_SetString(PyExc_TypeError, "expected string or bytes-like object"); return NULL; } @@ -359,12 +359,12 @@ state_init(SRE_STATE* state, PatternObje if (isbytes && pattern->isbytes == 0) { PyErr_SetString(PyExc_TypeError, - "can't use a string pattern on a bytes-like object"); + "cannot use a string pattern on a bytes-like object"); goto err; } if (!isbytes && pattern->isbytes > 0) { PyErr_SetString(PyExc_TypeError, - "can't use a bytes pattern on a string-like object"); + "cannot use a bytes pattern on a string-like object"); goto err; }