diff -r b84acd655cb2 regex_3/Python/_regex_core.py --- a/regex_3/Python/_regex_core.py Wed Dec 24 20:17:08 2014 +0000 +++ b/regex_3/Python/_regex_core.py Wed Feb 18 20:14:53 2015 +0200 @@ -402,7 +402,9 @@ def apply_quantifier(source, info, count element = Character(characters[-1], case_flags=case_flags) else: # The quantifier applies to the last item in the sequence. - if applied or not sequence: + if applied: + raise error("multiple repeat", source.string, saved_pos) + if not sequence: raise error("nothing to repeat", source.string, saved_pos) element = sequence.pop() @@ -492,7 +494,7 @@ def parse_limited_quantifier(source): max_count = int(max_count) if max_count else None if max_count is not None and min_count > max_count: - raise error("min repeat greater than max repeat", source.string, + raise error("bad repeat interval", source.string, saved_pos) else: if not min_count: @@ -1047,15 +1049,15 @@ def parse_name(source, allow_numeric=Fal name = source.get_while(set(")>"), include=False) if not name: - raise error("bad group name", source.string, source.pos) + raise error("missing group name", source.string, source.pos) if name.isdigit(): min_group = 0 if allow_group_0 else 1 if not allow_numeric or int(name) < min_group: - raise error("bad group name", source.string, source.pos) + raise error("bad character in group name", source.string, source.pos) else: if not name.isidentifier(): - raise error("bad group name", source.string, source.pos) + raise error("bad character in group name", source.string, source.pos) return name @@ -1079,10 +1081,10 @@ def parse_escape(source, info, in_set): source.ignore_space = saved_ignore if not ch: # A backslash at the end of the pattern. - raise error("bad escape", source.string, source.pos) + raise error("bad escape (end of pattern)", source.string, source.pos) if ch in HEX_ESCAPES: # A hexadecimal escape sequence. - return parse_hex_escape(source, info, HEX_ESCAPES[ch], in_set) + return parse_hex_escape(source, info, HEX_ESCAPES[ch], in_set, ch) elif ch == "g" and not in_set: # A group reference. saved_pos = source.pos @@ -1183,15 +1185,18 @@ def parse_octal_escape(source, info, dig value = int("".join(digits), 8) return make_character(info, value, in_set) except ValueError: - raise error("bad octal escape", source.string, source.pos) - -def parse_hex_escape(source, info, expected_len, in_set): + if digits[0] in OCT_DIGITS: + raise error("incomplete escape \\%s" % ''.join(digits), source.string, source.pos) + else: + raise error("bad escape \\%s" % digits[0], source.string, source.pos) + +def parse_hex_escape(source, info, expected_len, in_set, type): "Parses a hex escape sequence." digits = [] for i in range(expected_len): ch = source.get() if ch not in HEX_DIGITS: - raise error("bad hex escape", source.string, source.pos) + raise error("incomplete escape \\%s%s" % (type, ''.join(digits)), source.string, source.pos) digits.append(ch) value = int("".join(digits), 16) @@ -1441,7 +1446,7 @@ def parse_set_item(source, info): ch = source.get() if not ch: - raise error("bad set", source.string, source.pos) + raise error("unterminated character set", source.string, source.pos) return Character(ord(ch)) @@ -1573,7 +1578,7 @@ def _compile_replacement(source, pattern if ch in HEX_ESCAPES and (ch == "x" or is_unicode): # A hexadecimal escape sequence. - return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch])] + return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)] if ch == "g": # A group preference. @@ -1629,18 +1634,18 @@ def _compile_replacement(source, pattern if not ch: # A trailing backslash. - raise error("bad escape", source.string, source.pos) + raise error("bad escape (end of pattern)", source.string, source.pos) # An escaped non-backslash is a backslash followed by the literal. return False, [ord("\\"), ord(ch)] -def parse_repl_hex_escape(source, expected_len): +def parse_repl_hex_escape(source, expected_len, type): "Parses a hex escape sequence in a replacement string." digits = [] for i in range(expected_len): ch = source.get() if ch not in HEX_DIGITS: - raise error("bad hex escape", source.string, source.pos) + raise error("incomplete escape \\%s%s" % (type, ''.join(digits)), source.string, source.pos) digits.append(ch) return int("".join(digits), 16) @@ -1671,7 +1676,7 @@ def compile_repl_group(source, pattern): if name.isdigit(): index = int(name) if not 0 <= index <= pattern.groups: - raise error("invalid group", source.string, source.pos) + raise error("invalid group reference", source.string, source.pos) return index @@ -2280,7 +2285,7 @@ class CallGroup(RegexBase): raise error("unknown group", pattern, self.position) if not 0 <= self.group <= self.info.group_count: - raise error("unknown group", pattern, self.position) + raise error("invalid group reference", pattern, self.position) if self.group > 0 and self.info.open_group_count[self.group] > 1: raise error("ambiguous group reference", pattern, self.position) @@ -2397,7 +2402,7 @@ class Conditional(RegexBase): raise error("unknown group", pattern, self.position) if not 1 <= self.group <= self.info.group_count: - raise error("unknown group", pattern, self.position) + raise error("invalid group reference", pattern, self.position) self.yes_item.fix_groups(pattern, reverse, fuzzy) self.no_item.fix_groups(pattern, reverse, fuzzy) @@ -3028,7 +3033,7 @@ class RefGroup(RegexBase): raise error("unknown group", pattern, self.position) if not 1 <= self.group <= self.info.group_count: - raise error("unknown group", pattern, self.position) + raise error("invalid group reference", pattern, self.position) self._key = self.__class__, self.group, self.case_flags @@ -3989,7 +3994,7 @@ class Scanner: source.ignore_space = bool(info.flags & VERBOSE) parsed = _parse_pattern(source, info) if not source.at_end(): - raise error("trailing characters", source.string, source.pos) + raise error("unbalanced parenthesis", source.string, source.pos) # We want to forbid capture groups within each phrase. patterns.append(parsed.remove_captures()) diff -r b84acd655cb2 regex_3/Python/regex.py --- a/regex_3/Python/regex.py Wed Dec 24 20:17:08 2014 +0000 +++ b/regex_3/Python/regex.py Wed Feb 18 20:14:53 2015 +0200 @@ -499,7 +499,7 @@ def _compile(pattern, flags=0, kwargs={} caught_exception.pos) if not source.at_end(): - raise error("trailing characters in pattern", pattern, source.pos) + raise error("unbalanced parenthesis", pattern, source.pos) # Check the global flags for conflicts. version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION diff -r b84acd655cb2 regex_3/Python/test_regex.py --- a/regex_3/Python/test_regex.py Wed Dec 24 20:17:08 2014 +0000 +++ b/regex_3/Python/test_regex.py Wed Feb 18 20:14:53 2015 +0200 @@ -20,23 +20,25 @@ class RegexTests(unittest.TestCase): FLAGS_WITH_COMPILED_PAT = "cannot process flags argument with a compiled pattern" INVALID_GROUP_REF = "invalid group reference" MISSING_GT = "missing >" - BAD_GROUP_NAME = "bad group name" + BAD_GROUP_NAME = "bad character in group name" + MISSING_GROUP_NAME = "missing group name" MISSING_LT = "missing <" UNKNOWN_GROUP_I = "unknown group" UNKNOWN_GROUP = "unknown group" - BAD_ESCAPE = "bad escape" - BAD_OCTAL_ESCAPE = "bad octal escape" - BAD_SET = "bad set" - STR_PAT_ON_BYTES = "can't use a string pattern on a bytes-like object" - BYTES_PAT_ON_STR = "can't use a bytes pattern on a string-like object" - STR_PAT_BYTES_TEMPL = "expected str instance, not bytes" - BYTES_PAT_STR_TEMPL = "expected bytes-like object, not str" + BAD_ESCAPE = r"bad escape \(end of pattern\)" + BAD_OCTAL_ESCAPE = r"bad escape \\" + BAD_SET = "unterminated character set" + STR_PAT_ON_BYTES = "cannot use a string pattern on a bytes-like object" + BYTES_PAT_ON_STR = "cannot use a bytes pattern on a string-like object" + STR_PAT_BYTES_TEMPL = "expected str instance, bytes found" + BYTES_PAT_STR_TEMPL = "expected a bytes-like object, str found" BYTES_PAT_UNI_FLAG = "cannot use UNICODE flag with a bytes pattern" MIXED_FLAGS = "ASCII, LOCALE and UNICODE flags are mutually incompatible" - MISSING_RPAREN = "missing \\)" # Need to escape parenthesis for unittest. - TRAILING_CHARS = "trailing characters in pattern" + MISSING_RPAREN = "missing \\)" + TRAILING_CHARS = "unbalanced parenthesis" BAD_CHAR_RANGE = "bad character range" NOTHING_TO_REPEAT = "nothing to repeat" + MULTIPLE_REPEAT = "multiple repeat" OPEN_GROUP = "cannot refer to an open group" DUPLICATE_GROUP = "duplicate group" CANT_TURN_OFF = "bad inline flags: cannot turn flags off" @@ -227,7 +229,7 @@ class RegexTests(unittest.TestCase): def test_symbolic_refs(self): self.assertRaisesRegex(regex.error, self.MISSING_GT, lambda: regex.sub('(?Px)', r'\gx)', r'\g<', 'xx')) self.assertRaisesRegex(regex.error, self.MISSING_LT, lambda: regex.sub('(?Px)', r'\g', 'xx')) @@ -712,7 +714,7 @@ class RegexTests(unittest.TestCase): self.assertEqual(bool(regex.match(r"\x%02xz" % i, chr(i) + "z")), True) - self.assertRaisesRegex(regex.error, self.UNKNOWN_GROUP, lambda: + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: regex.match(r"\911", "")) def test_sre_character_class_literals(self): @@ -1709,7 +1711,7 @@ class RegexTests(unittest.TestCase): (r'(?a)\g', 'aa', '1', ascii('a')), # Test octal escapes. - ('\\1', 'a', '', regex.error, self.UNKNOWN_GROUP), # Backreference. + ('\\1', 'a', '', regex.error, self.INVALID_GROUP_REF), # Backreference. ('[\\1]', '\1', '0', "'\\x01'"), # Character. ('\\09', chr(0) + '9', '0', ascii(chr(0) + '9')), ('\\141', 'a', '0', ascii('a')), @@ -1943,7 +1945,7 @@ class RegexTests(unittest.TestCase): # Character properties. (r"\g", "g", '0', ascii('g')), - (r"\g<1>", "g", '', regex.error, self.UNKNOWN_GROUP), + (r"\g<1>", "g", '', regex.error, self.INVALID_GROUP_REF), (r"(.)\g<1>", "gg", '0', ascii('gg')), (r"(.)\g<1>", "gg", '', ascii(('gg', 'g'))), (r"\N", "N", '0', ascii('N')), @@ -2033,7 +2035,7 @@ class RegexTests(unittest.TestCase): ('(a)b(c)', 'abc', '0,1,2', ascii(('abc', 'a', 'c'))), ('a+b+c', 'aabbabc', '0', ascii('abc')), ('a{1,}b{1,}c', 'aabbabc', '0', ascii('abc')), - ('a**', '-', '', regex.error, self.NOTHING_TO_REPEAT), + ('a**', '-', '', regex.error, self.MULTIPLE_REPEAT), ('a.+?c', 'abcabc', '0', ascii('abc')), ('(a+|b)*', 'ab', '0,1', ascii(('ab', 'b'))), ('(a+|b){0,}', 'ab', '0,1', ascii(('ab', 'b'))), @@ -2087,9 +2089,9 @@ class RegexTests(unittest.TestCase): # ('((((((((((a))))))))))\\41', 'aa', '', ascii(None)), # ('((((((((((a))))))))))\\41', 'a!', '0', ascii('a!')), ('((((((((((a))))))))))\\41', '', '', regex.error, - self.UNKNOWN_GROUP), + self.INVALID_GROUP_REF), ('(?i)((((((((((a))))))))))\\41', '', '', regex.error, - self.UNKNOWN_GROUP), + self.INVALID_GROUP_REF), ('(((((((((a)))))))))', 'a', '0', ascii('a')), ('multiple words of text', 'uh-uh', '', ascii(None)), @@ -2181,7 +2183,7 @@ class RegexTests(unittest.TestCase): ('(?i)a+b+c', 'AABBABC', '0', ascii('ABC')), ('(?i)a{1,}b{1,}c', 'AABBABC', '0', ascii('ABC')), - ('(?i)a**', '-', '', regex.error, self.NOTHING_TO_REPEAT), + ('(?i)a**', '-', '', regex.error, self.MULTIPLE_REPEAT), ('(?i)a.+?c', 'ABCABC', '0', ascii('ABC')), ('(?i)a.*?c', 'ABCABC', '0', ascii('ABC')), ('(?i)a.{0,5}?c', 'ABCABC', '0', ascii('ABC')), @@ -2315,7 +2317,7 @@ 123""", '0', ascii('abc')), ('[\\D]+', '1234abc5678', '0', ascii('abc')), ('[\\da-fA-F]+', '123abc', '0', ascii('123abc')), # Not an error under PCRE/PRE: - # ('[\\d-x]', '-', '', regex.error, self.SYNTAX_ERROR), + # ('[\\d-x]', '-', '', regex.error, self.BAD_CHAR_RANGE), (r'([\s]*)([\S]*)([\s]*)', ' testing!1972', '3,2,1', ascii(('', 'testing!1972', ' '))), (r'(\s*)(\S*)(\s*)', ' testing!1972', '3,2,1', ascii(('', @@ -2405,6 +2407,7 @@ 123""", '0', ascii('abc')), group_list.append(group) if excval is not None: + with self.subTest(pattern=pattern, string=string): self.assertRaisesRegex(expected, excval, regex.search, pattern, string) else: @@ -3223,7 +3226,7 @@ 123""", '0', ascii('abc')), # Hg issue 95. self.assertRaisesRegex(regex.error, - '^nothing to repeat at position 3$', lambda: regex.compile(r'.???')) + self.MULTIPLE_REPEAT, lambda: regex.compile(r'.???')) # Hg issue 97. self.assertEquals(regex.escape('foo!?'), 'foo\\!\\?') diff -r b84acd655cb2 regex_3/regex/_regex.c --- a/regex_3/regex/_regex.c Wed Dec 24 20:17:08 2014 +0000 +++ b/regex_3/regex/_regex.c Wed Feb 18 20:14:53 2015 +0200 @@ -2020,15 +2020,15 @@ Py_LOCAL_INLINE(void) set_error(int stat PyErr_NoMemory(); break; case RE_ERROR_NOT_BYTES: - PyErr_Format(PyExc_TypeError, "expected bytes-like object, not %.200s", + PyErr_Format(PyExc_TypeError, "expected a bytes-like object, %.200s found", object->ob_type->tp_name); break; case RE_ERROR_NOT_STRING: - PyErr_Format(PyExc_TypeError, "expected string instance, not %.200s", + PyErr_Format(PyExc_TypeError, "expected string instance, %.200s found", object->ob_type->tp_name); break; case RE_ERROR_NOT_UNICODE: - PyErr_Format(PyExc_TypeError, "expected str instance, not %.200s", + PyErr_Format(PyExc_TypeError, "expected str instance, %.200s found", object->ob_type->tp_name); break; case RE_ERROR_NO_SUCH_GROUP: @@ -16106,13 +16106,13 @@ Py_LOCAL_INLINE(BOOL) check_compatible(P if (PyBytes_Check(pattern->pattern)) { if (unicode) { PyErr_SetString(PyExc_TypeError, - "can't use a bytes pattern on a string-like object"); + "cannot use a bytes pattern on a string-like object"); return FALSE; } } else { if (!unicode) { PyErr_SetString(PyExc_TypeError, - "can't use a string pattern on a bytes-like object"); + "cannot use a string pattern on a bytes-like object"); return FALSE; } }