=== modified file Lib/re.py --- Lib/re.py 2009-01-01 15:46:10 +0000 +++ Lib/re.py 2009-02-03 21:49:47 +0000 @@ -27,52 +27,81 @@ concatenate ordinary characters, so last matches the string 'last'. The special characters are: - "." Matches any character except a newline. - "^" Matches the start of the string. - "$" Matches the end of the string or just before the newline at - the end of the string. - "*" Matches 0 or more (greedy) repetitions of the preceding RE. - Greedy means that it will match as many repetitions as possible. - "+" Matches 1 or more (greedy) repetitions of the preceding RE. - "?" Matches 0 or 1 (greedy) of the preceding RE. - *?,+?,?? Non-greedy versions of the previous three special characters. - {m,n} Matches from m to n repetitions of the preceding RE. - {m,n}? Non-greedy version of the above. - "\\" Either escapes special characters or signals a special sequence. - [] Indicates a set of characters. - A "^" as the first character indicates a complementing set. - "|" A|B, creates an RE that will match either A or B. - (...) Matches the RE inside the parentheses. - The contents can be retrieved or matched later in the string. - (?iLmsux) Set the I, L, M, S, U, or X flag for the RE (see below). - (?:...) Non-grouping version of regular parentheses. - (?P...) The substring matched by the group is accessible by name. - (?P=name) Matches the text matched earlier by the group named name. - (?#...) A comment; ignored. - (?=...) Matches if ... matches next, but doesn't consume the string. - (?!...) Matches if ... doesn't match next. - (?<=...) Matches if preceded by ... (must be fixed length). - (?...) The substring matched by the group isaccessible by + name. + (?...) The substring matched by the group is accessible by + name. + (?#...) A comment; ignored. + (?>...) Atomic group. Like (?:...) but won't retry the RE + within the parentheses. + (?=...) Matches if ... matches next, but doesn't consume + the string. + (?!...) Matches if ... 
doesn't match next. + (?<=...) Matches if preceded by ... (must be fixed length). + (? Matches the text matched by the group named name. + \g Matches the contents of the group of the same number. + \g<+number> Matches the contents of the group of the relative number. + \g<-number> Matches the contents of the group of the relative number. + \k Matches the text matched earlier by the group named name. + \N{name} Matches named Unicode character. + \p{name} Matches any character having the named property. + \P{name} Matches any character not having the named property. + \s Matches any whitespace character; equivalent to + [ \t\n\r\f\v]. + \S Matches any non-whitespace character; equiv. to + [^ \t\n\r\f\v]. + \w Matches any alphanumeric character; equivalent to + [a-zA-Z0-9_]. With LOCALE, it will match the set + [0-9_] plus characters defined as letters for the current + locale. + \W Matches the complement of \w. + \Z Matches only at the end of the string. + \\ Matches a literal backslash. This module exports the following functions: match Match a regular expression pattern to the beginning of a string. @@ -87,15 +116,17 @@ escape Backslash all non-alphanumerics in a string. Some of the functions in this module takes flags as optional parameters: - I IGNORECASE Perform case-insensitive matching. - L LOCALE Make \w, \W, \b, \B, dependent on the current locale. - M MULTILINE "^" matches the beginning of lines (after a newline) - as well as the string. - "$" matches the end of lines (before a newline) as well - as the end of the string. - S DOTALL "." matches any character at all, including the newline. - X VERBOSE Ignore whitespace and comments for nicer looking RE's. - U UNICODE Make \w, \W, \b, \B, dependent on the Unicode locale. + I IGNORECASE Perform case-insensitive matching. + L LOCALE Make \w, \W, \b, \B, dependent on the current locale. + M MULTILINE "^" matches the beginning of lines (after a newline) as + well as the string. 
+ "$" matches the end of lines (before a newline) as well + as the end of the string. + R REVERSE Search backwards, from the end to the start. + S DOTALL "." matches any character at all, including the newline. + X VERBOSE Ignore whitespace and comments for nicer looking RE's. + U UNICODE Make \w, \W, \b, \B, dependent on the Unicode locale. + Z ZEROWIDTH Permit splitting on zero-width separators. This module also defines an exception 'error'. @@ -109,18 +140,19 @@ __all__ = [ "match", "search", "sub", "subn", "split", "findall", "compile", "purge", "template", "escape", "I", "L", "M", "S", "X", "U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE", - "UNICODE", "error" ] + "UNICODE", "REVERSE", "error" ] -__version__ = "2.2.1" +__version__ = "2.2.2" # flags I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale -U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline +R = REVERSE = sre_compile.SRE_FLAG_REVERSE # search backwards S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline +U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments - +Z = ZEROWIDTH = sre_compile.SRE_FLAG_ZEROWIDTH # permit splitting on zero-width separators. 
# sre extensions (experimental, don't rely on these) T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE # disable backtracking DEBUG = sre_compile.SRE_FLAG_DEBUG # dump pattern after compilation @@ -237,12 +269,12 @@ if flags: raise ValueError('Cannot process flags argument with a compiled pattern') return pattern - if not sre_compile.isstring(pattern): - raise TypeError, "first argument must be string or compiled pattern" + if not isinstance(pattern, (str, unicode)): + raise TypeError("First argument must be string or compiled pattern") try: p = sre_compile.compile(pattern, flags) except error, v: - raise error, v # invalid expression + raise error(v) # invalid expression if len(_cache) >= _MAXCACHE: _cache.clear() _cache[cachekey] = p @@ -257,7 +289,7 @@ try: p = sre_parse.parse_template(repl, pattern) except error, v: - raise error, v # invalid expression + raise error(v) # invalid expression if len(_cache_repl) >= _MAXCACHE: _cache_repl.clear() _cache_repl[key] = p @@ -266,7 +298,7 @@ def _expand(pattern, match, template): # internal: match.expand implementation hook template = sre_parse.parse_template(template, pattern) - return sre_parse.expand_template(template, match) + return sre_parse.expand_template(template, match, True) def _subx(pattern, template): # internal: pattern.sub/subn implementation helper @@ -275,7 +307,7 @@ # literal replacement return template[1][0] def filter(match, template=template): - return sre_parse.expand_template(template, match) + return sre_parse.expand_template(template, match, True) return filter # register myself for pickling @@ -292,36 +324,31 @@ class Scanner: def __init__(self, lexicon, flags=0): - from sre_constants import BRANCH, SUBPATTERN self.lexicon = lexicon # combine phrases into a compound pattern p = [] s = sre_parse.Pattern() s.flags = flags - for phrase, action in lexicon: - p.append(sre_parse.SubPattern(s, [ - (SUBPATTERN, (len(p)+1, sre_parse.parse(phrase, flags))), - ])) - s.groups = len(p)+1 - p = 
sre_parse.SubPattern(s, [(BRANCH, (None, p))]) - self.scanner = sre_compile.compile(p) + sep, template = map(type(lexicon[0][0]), ("|", "(%s)")) + regex = sep.join(template % phrase for phrase, action in lexicon) + self.scanner = sre_compile.compile(regex) def scan(self, string): result = [] append = result.append match = self.scanner.scanner(string).match i = 0 - while 1: + while True: m = match() if not m: break j = m.end() if i == j: break - action = self.lexicon[m.lastindex-1][1] + action = self.lexicon[m.lastindex - 1][1] if hasattr(action, '__call__'): self.match = m action = action(self, m.group()) if action is not None: append(action) i = j - return result, string[i:] + return result, string[i : ] === modified file Lib/test/re_tests.py --- Lib/test/re_tests.py 2003-04-20 07:35:44 +0000 +++ Lib/test/re_tests.py 2009-02-03 18:18:47 +0000 @@ -1,674 +1,674 @@ -#!/usr/bin/env python -# -*- mode: python -*- - -# Re test suite and benchmark suite v1.5 - -# The 3 possible outcomes for each pattern -[SUCCEED, FAIL, SYNTAX_ERROR] = range(3) - -# Benchmark suite (needs expansion) -# -# The benchmark suite does not test correctness, just speed. The -# first element of each tuple is the regex pattern; the second is a -# string to match it against. The benchmarking code will embed the -# second string inside several sizes of padding, to test how regex -# matching performs on large strings. 
- -benchmarks = [ - - # test common prefix - ('Python|Perl', 'Perl'), # Alternation - ('(Python|Perl)', 'Perl'), # Grouped alternation - - ('Python|Perl|Tcl', 'Perl'), # Alternation - ('(Python|Perl|Tcl)', 'Perl'), # Grouped alternation - - ('(Python)\\1', 'PythonPython'), # Backreference - ('([0a-z][a-z0-9]*,)+', 'a5,b7,c9,'), # Disable the fastmap optimization - ('([a-z][a-z0-9]*,)+', 'a5,b7,c9,'), # A few sets - - ('Python', 'Python'), # Simple text literal - ('.*Python', 'Python'), # Bad text literal - ('.*Python.*', 'Python'), # Worse text literal - ('.*(Python)', 'Python'), # Bad text literal with grouping - -] - -# Test suite (for verifying correctness) -# -# The test suite is a list of 5- or 3-tuples. The 5 parts of a -# complete tuple are: -# element 0: a string containing the pattern -# 1: the string to match against the pattern -# 2: the expected result (SUCCEED, FAIL, SYNTAX_ERROR) -# 3: a string that will be eval()'ed to produce a test string. -# This is an arbitrary Python expression; the available -# variables are "found" (the whole match), and "g1", "g2", ... -# up to "g99" contain the contents of each group, or the -# string 'None' if the group wasn't given a value, or the -# string 'Error' if the group index was out of range; -# also "groups", the return value of m.group() (a tuple). -# 4: The expected result of evaluating the expression. -# If the two don't match, an error is reported. -# -# If the regex isn't expected to work, the latter two elements can be omitted. 
- -tests = [ - # Test ?P< and ?P= extensions - ('(?Pa)', '', SYNTAX_ERROR), # Begins with a digit - ('(?Pa)', '', SYNTAX_ERROR), # Begins with an illegal char - ('(?Pa)', '', SYNTAX_ERROR), # Begins with an illegal char - - # Same tests, for the ?P= form - ('(?Pa)(?P=foo_123', 'aa', SYNTAX_ERROR), - ('(?Pa)(?P=1)', 'aa', SYNTAX_ERROR), - ('(?Pa)(?P=!)', 'aa', SYNTAX_ERROR), - ('(?Pa)(?P=foo_124', 'aa', SYNTAX_ERROR), # Backref to undefined group - - ('(?Pa)', 'a', SUCCEED, 'g1', 'a'), - ('(?Pa)(?P=foo_123)', 'aa', SUCCEED, 'g1', 'a'), - - # Test octal escapes - ('\\1', 'a', SYNTAX_ERROR), # Backreference - ('[\\1]', '\1', SUCCEED, 'found', '\1'), # Character - ('\\09', chr(0) + '9', SUCCEED, 'found', chr(0) + '9'), - ('\\141', 'a', SUCCEED, 'found', 'a'), - ('(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\\119', 'abcdefghijklk9', SUCCEED, 'found+"-"+g11', 'abcdefghijklk9-k'), - - # Test \0 is handled everywhere - (r'\0', '\0', SUCCEED, 'found', '\0'), - (r'[\0a]', '\0', SUCCEED, 'found', '\0'), - (r'[a\0]', '\0', SUCCEED, 'found', '\0'), - (r'[^a\0]', '\0', FAIL), - - # Test various letter escapes - (r'\a[\b]\f\n\r\t\v', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'), - (r'[\a][\b][\f][\n][\r][\t][\v]', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'), - # NOTE: not an error under PCRE/PRE: - # (r'\u', '', SYNTAX_ERROR), # A Perl escape - (r'\c\e\g\h\i\j\k\m\o\p\q\y\z', 'ceghijkmopqyz', SUCCEED, 'found', 'ceghijkmopqyz'), - (r'\xff', '\377', SUCCEED, 'found', chr(255)), - # new \x semantics - (r'\x00ffffffffffffff', '\377', FAIL, 'found', chr(255)), - (r'\x00f', '\017', FAIL, 'found', chr(15)), - (r'\x00fe', '\376', FAIL, 'found', chr(254)), - # (r'\x00ffffffffffffff', '\377', SUCCEED, 'found', chr(255)), - # (r'\x00f', '\017', SUCCEED, 'found', chr(15)), - # (r'\x00fe', '\376', SUCCEED, 'found', chr(254)), - - (r"^\w+=(\\[\000-\277]|[^\n\\])*", "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", - SUCCEED, 'found', "SRC=eval.c g.c blah blah blah \\\\"), - - # Test 
that . only matches \n in DOTALL mode - ('a.b', 'acb', SUCCEED, 'found', 'acb'), - ('a.b', 'a\nb', FAIL), - ('a.*b', 'acc\nccb', FAIL), - ('a.{4,5}b', 'acc\nccb', FAIL), - ('a.b', 'a\rb', SUCCEED, 'found', 'a\rb'), - ('a.b(?s)', 'a\nb', SUCCEED, 'found', 'a\nb'), - ('a.*(?s)b', 'acc\nccb', SUCCEED, 'found', 'acc\nccb'), - ('(?s)a.{4,5}b', 'acc\nccb', SUCCEED, 'found', 'acc\nccb'), - ('(?s)a.b', 'a\nb', SUCCEED, 'found', 'a\nb'), - - (')', '', SYNTAX_ERROR), # Unmatched right bracket - ('', '', SUCCEED, 'found', ''), # Empty pattern - ('abc', 'abc', SUCCEED, 'found', 'abc'), - ('abc', 'xbc', FAIL), - ('abc', 'axc', FAIL), - ('abc', 'abx', FAIL), - ('abc', 'xabcy', SUCCEED, 'found', 'abc'), - ('abc', 'ababc', SUCCEED, 'found', 'abc'), - ('ab*c', 'abc', SUCCEED, 'found', 'abc'), - ('ab*bc', 'abc', SUCCEED, 'found', 'abc'), - ('ab*bc', 'abbc', SUCCEED, 'found', 'abbc'), - ('ab*bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), - ('ab+bc', 'abbc', SUCCEED, 'found', 'abbc'), - ('ab+bc', 'abc', FAIL), - ('ab+bc', 'abq', FAIL), - ('ab+bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), - ('ab?bc', 'abbc', SUCCEED, 'found', 'abbc'), - ('ab?bc', 'abc', SUCCEED, 'found', 'abc'), - ('ab?bc', 'abbbbc', FAIL), - ('ab?c', 'abc', SUCCEED, 'found', 'abc'), - ('^abc$', 'abc', SUCCEED, 'found', 'abc'), - ('^abc$', 'abcc', FAIL), - ('^abc', 'abcc', SUCCEED, 'found', 'abc'), - ('^abc$', 'aabc', FAIL), - ('abc$', 'aabc', SUCCEED, 'found', 'abc'), - ('^', 'abc', SUCCEED, 'found+"-"', '-'), - ('$', 'abc', SUCCEED, 'found+"-"', '-'), - ('a.c', 'abc', SUCCEED, 'found', 'abc'), - ('a.c', 'axc', SUCCEED, 'found', 'axc'), - ('a.*c', 'axyzc', SUCCEED, 'found', 'axyzc'), - ('a.*c', 'axyzd', FAIL), - ('a[bc]d', 'abc', FAIL), - ('a[bc]d', 'abd', SUCCEED, 'found', 'abd'), - ('a[b-d]e', 'abd', FAIL), - ('a[b-d]e', 'ace', SUCCEED, 'found', 'ace'), - ('a[b-d]', 'aac', SUCCEED, 'found', 'ac'), - ('a[-b]', 'a-', SUCCEED, 'found', 'a-'), - ('a[\\-b]', 'a-', SUCCEED, 'found', 'a-'), - # NOTE: not an error under 
PCRE/PRE: - # ('a[b-]', 'a-', SYNTAX_ERROR), - ('a[]b', '-', SYNTAX_ERROR), - ('a[', '-', SYNTAX_ERROR), - ('a\\', '-', SYNTAX_ERROR), - ('abc)', '-', SYNTAX_ERROR), - ('(abc', '-', SYNTAX_ERROR), - ('a]', 'a]', SUCCEED, 'found', 'a]'), - ('a[]]b', 'a]b', SUCCEED, 'found', 'a]b'), - ('a[\]]b', 'a]b', SUCCEED, 'found', 'a]b'), - ('a[^bc]d', 'aed', SUCCEED, 'found', 'aed'), - ('a[^bc]d', 'abd', FAIL), - ('a[^-b]c', 'adc', SUCCEED, 'found', 'adc'), - ('a[^-b]c', 'a-c', FAIL), - ('a[^]b]c', 'a]c', FAIL), - ('a[^]b]c', 'adc', SUCCEED, 'found', 'adc'), - ('\\ba\\b', 'a-', SUCCEED, '"-"', '-'), - ('\\ba\\b', '-a', SUCCEED, '"-"', '-'), - ('\\ba\\b', '-a-', SUCCEED, '"-"', '-'), - ('\\by\\b', 'xy', FAIL), - ('\\by\\b', 'yz', FAIL), - ('\\by\\b', 'xyz', FAIL), - ('x\\b', 'xyz', FAIL), - ('x\\B', 'xyz', SUCCEED, '"-"', '-'), - ('\\Bz', 'xyz', SUCCEED, '"-"', '-'), - ('z\\B', 'xyz', FAIL), - ('\\Bx', 'xyz', FAIL), - ('\\Ba\\B', 'a-', FAIL, '"-"', '-'), - ('\\Ba\\B', '-a', FAIL, '"-"', '-'), - ('\\Ba\\B', '-a-', FAIL, '"-"', '-'), - ('\\By\\B', 'xy', FAIL), - ('\\By\\B', 'yz', FAIL), - ('\\By\\b', 'xy', SUCCEED, '"-"', '-'), - ('\\by\\B', 'yz', SUCCEED, '"-"', '-'), - ('\\By\\B', 'xyz', SUCCEED, '"-"', '-'), - ('ab|cd', 'abc', SUCCEED, 'found', 'ab'), - ('ab|cd', 'abcd', SUCCEED, 'found', 'ab'), - ('()ef', 'def', SUCCEED, 'found+"-"+g1', 'ef-'), - ('$b', 'b', FAIL), - ('a\\(b', 'a(b', SUCCEED, 'found+"-"+g1', 'a(b-Error'), - ('a\\(*b', 'ab', SUCCEED, 'found', 'ab'), - ('a\\(*b', 'a((b', SUCCEED, 'found', 'a((b'), - ('a\\\\b', 'a\\b', SUCCEED, 'found', 'a\\b'), - ('((a))', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'a-a-a'), - ('(a)b(c)', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'abc-a-c'), - ('a+b+c', 'aabbabc', SUCCEED, 'found', 'abc'), - ('(a+|b)*', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), - ('(a+|b)+', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), - ('(a+|b)?', 'ab', SUCCEED, 'found+"-"+g1', 'a-a'), - (')(', '-', SYNTAX_ERROR), - ('[^ab]*', 'cde', SUCCEED, 'found', 'cde'), - ('abc', '', 
FAIL), - ('a*', '', SUCCEED, 'found', ''), - ('a|b|c|d|e', 'e', SUCCEED, 'found', 'e'), - ('(a|b|c|d|e)f', 'ef', SUCCEED, 'found+"-"+g1', 'ef-e'), - ('abcd*efg', 'abcdefg', SUCCEED, 'found', 'abcdefg'), - ('ab*', 'xabyabbbz', SUCCEED, 'found', 'ab'), - ('ab*', 'xayabbbz', SUCCEED, 'found', 'a'), - ('(ab|cd)e', 'abcde', SUCCEED, 'found+"-"+g1', 'cde-cd'), - ('[abhgefdc]ij', 'hij', SUCCEED, 'found', 'hij'), - ('^(ab|cd)e', 'abcde', FAIL, 'xg1y', 'xy'), - ('(abc|)ef', 'abcdef', SUCCEED, 'found+"-"+g1', 'ef-'), - ('(a|b)c*d', 'abcd', SUCCEED, 'found+"-"+g1', 'bcd-b'), - ('(ab|ab*)bc', 'abc', SUCCEED, 'found+"-"+g1', 'abc-a'), - ('a([bc]*)c*', 'abc', SUCCEED, 'found+"-"+g1', 'abc-bc'), - ('a([bc]*)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'), - ('a([bc]+)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'), - ('a([bc]*)(c+d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-b-cd'), - ('a[bcd]*dcdcde', 'adcdcde', SUCCEED, 'found', 'adcdcde'), - ('a[bcd]+dcdcde', 'adcdcde', FAIL), - ('(ab|a)b*c', 'abc', SUCCEED, 'found+"-"+g1', 'abc-ab'), - ('((a)(b)c)(d)', 'abcd', SUCCEED, 'g1+"-"+g2+"-"+g3+"-"+g4', 'abc-a-b-d'), - ('[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', SUCCEED, 'found', 'alpha'), - ('^a(bc+|b[eh])g|.h$', 'abh', SUCCEED, 'found+"-"+g1', 'bh-None'), - ('(bc+d$|ef*g.|h?i(j|k))', 'effgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'), - ('(bc+d$|ef*g.|h?i(j|k))', 'ij', SUCCEED, 'found+"-"+g1+"-"+g2', 'ij-ij-j'), - ('(bc+d$|ef*g.|h?i(j|k))', 'effg', FAIL), - ('(bc+d$|ef*g.|h?i(j|k))', 'bcdd', FAIL), - ('(bc+d$|ef*g.|h?i(j|k))', 'reffgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'), - ('(((((((((a)))))))))', 'a', SUCCEED, 'found', 'a'), - ('multiple words of text', 'uh-uh', FAIL), - ('multiple words', 'multiple words, yeah', SUCCEED, 'found', 'multiple words'), - ('(.*)c(.*)', 'abcde', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcde-ab-de'), - ('\\((.*), (.*)\\)', '(a, b)', SUCCEED, 'g2+"-"+g1', 'b-a'), - ('[k]', 'ab', FAIL), - ('a[-]?c', 'ac', 
SUCCEED, 'found', 'ac'), - ('(abc)\\1', 'abcabc', SUCCEED, 'g1', 'abc'), - ('([a-c]*)\\1', 'abcabc', SUCCEED, 'g1', 'abc'), - ('^(.+)?B', 'AB', SUCCEED, 'g1', 'A'), - ('(a+).\\1$', 'aaaaa', SUCCEED, 'found+"-"+g1', 'aaaaa-aa'), - ('^(a+).\\1$', 'aaaa', FAIL), - ('(abc)\\1', 'abcabc', SUCCEED, 'found+"-"+g1', 'abcabc-abc'), - ('([a-c]+)\\1', 'abcabc', SUCCEED, 'found+"-"+g1', 'abcabc-abc'), - ('(a)\\1', 'aa', SUCCEED, 'found+"-"+g1', 'aa-a'), - ('(a+)\\1', 'aa', SUCCEED, 'found+"-"+g1', 'aa-a'), - ('(a+)+\\1', 'aa', SUCCEED, 'found+"-"+g1', 'aa-a'), - ('(a).+\\1', 'aba', SUCCEED, 'found+"-"+g1', 'aba-a'), - ('(a)ba*\\1', 'aba', SUCCEED, 'found+"-"+g1', 'aba-a'), - ('(aa|a)a\\1$', 'aaa', SUCCEED, 'found+"-"+g1', 'aaa-a'), - ('(a|aa)a\\1$', 'aaa', SUCCEED, 'found+"-"+g1', 'aaa-a'), - ('(a+)a\\1$', 'aaa', SUCCEED, 'found+"-"+g1', 'aaa-a'), - ('([abc]*)\\1', 'abcabc', SUCCEED, 'found+"-"+g1', 'abcabc-abc'), - ('(a)(b)c|ab', 'ab', SUCCEED, 'found+"-"+g1+"-"+g2', 'ab-None-None'), - ('(a)+x', 'aaax', SUCCEED, 'found+"-"+g1', 'aaax-a'), - ('([ac])+x', 'aacx', SUCCEED, 'found+"-"+g1', 'aacx-c'), - ('([^/]*/)*sub1/', 'd:msgs/tdir/sub1/trial/away.cpp', SUCCEED, 'found+"-"+g1', 'd:msgs/tdir/sub1/-tdir/'), - ('([^.]*)\\.([^:]*):[T ]+(.*)', 'track1.title:TBlah blah blah', SUCCEED, 'found+"-"+g1+"-"+g2+"-"+g3', 'track1.title:TBlah blah blah-track1-title-Blah blah blah'), - ('([^N]*N)+', 'abNNxyzN', SUCCEED, 'found+"-"+g1', 'abNNxyzN-xyzN'), - ('([^N]*N)+', 'abNNxyz', SUCCEED, 'found+"-"+g1', 'abNN-N'), - ('([abc]*)x', 'abcx', SUCCEED, 'found+"-"+g1', 'abcx-abc'), - ('([abc]*)x', 'abc', FAIL), - ('([xyz]*)x', 'abcx', SUCCEED, 'found+"-"+g1', 'x-'), - ('(a)+b|aac', 'aac', SUCCEED, 'found+"-"+g1', 'aac-None'), - - # Test symbolic groups - - ('(?Paaa)a', 'aaaa', SYNTAX_ERROR), - ('(?Paaa)a', 'aaaa', SUCCEED, 'found+"-"+id', 'aaaa-aaa'), - ('(?Paa)(?P=id)', 'aaaa', SUCCEED, 'found+"-"+id', 'aaaa-aa'), - ('(?Paa)(?P=xd)', 'aaaa', SYNTAX_ERROR), - - # Test octal escapes/memory references 
- - ('\\1', 'a', SYNTAX_ERROR), - ('\\09', chr(0) + '9', SUCCEED, 'found', chr(0) + '9'), - ('\\141', 'a', SUCCEED, 'found', 'a'), - ('(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\\119', 'abcdefghijklk9', SUCCEED, 'found+"-"+g11', 'abcdefghijklk9-k'), - - # All tests from Perl - - ('abc', 'abc', SUCCEED, 'found', 'abc'), - ('abc', 'xbc', FAIL), - ('abc', 'axc', FAIL), - ('abc', 'abx', FAIL), - ('abc', 'xabcy', SUCCEED, 'found', 'abc'), - ('abc', 'ababc', SUCCEED, 'found', 'abc'), - ('ab*c', 'abc', SUCCEED, 'found', 'abc'), - ('ab*bc', 'abc', SUCCEED, 'found', 'abc'), - ('ab*bc', 'abbc', SUCCEED, 'found', 'abbc'), - ('ab*bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), - ('ab{0,}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), - ('ab+bc', 'abbc', SUCCEED, 'found', 'abbc'), - ('ab+bc', 'abc', FAIL), - ('ab+bc', 'abq', FAIL), - ('ab{1,}bc', 'abq', FAIL), - ('ab+bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), - ('ab{1,}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), - ('ab{1,3}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), - ('ab{3,4}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), - ('ab{4,5}bc', 'abbbbc', FAIL), - ('ab?bc', 'abbc', SUCCEED, 'found', 'abbc'), - ('ab?bc', 'abc', SUCCEED, 'found', 'abc'), - ('ab{0,1}bc', 'abc', SUCCEED, 'found', 'abc'), - ('ab?bc', 'abbbbc', FAIL), - ('ab?c', 'abc', SUCCEED, 'found', 'abc'), - ('ab{0,1}c', 'abc', SUCCEED, 'found', 'abc'), - ('^abc$', 'abc', SUCCEED, 'found', 'abc'), - ('^abc$', 'abcc', FAIL), - ('^abc', 'abcc', SUCCEED, 'found', 'abc'), - ('^abc$', 'aabc', FAIL), - ('abc$', 'aabc', SUCCEED, 'found', 'abc'), - ('^', 'abc', SUCCEED, 'found', ''), - ('$', 'abc', SUCCEED, 'found', ''), - ('a.c', 'abc', SUCCEED, 'found', 'abc'), - ('a.c', 'axc', SUCCEED, 'found', 'axc'), - ('a.*c', 'axyzc', SUCCEED, 'found', 'axyzc'), - ('a.*c', 'axyzd', FAIL), - ('a[bc]d', 'abc', FAIL), - ('a[bc]d', 'abd', SUCCEED, 'found', 'abd'), - ('a[b-d]e', 'abd', FAIL), - ('a[b-d]e', 'ace', SUCCEED, 'found', 'ace'), - ('a[b-d]', 'aac', SUCCEED, 'found', 'ac'), - ('a[-b]', 'a-', 
SUCCEED, 'found', 'a-'), - ('a[b-]', 'a-', SUCCEED, 'found', 'a-'), - ('a[b-a]', '-', SYNTAX_ERROR), - ('a[]b', '-', SYNTAX_ERROR), - ('a[', '-', SYNTAX_ERROR), - ('a]', 'a]', SUCCEED, 'found', 'a]'), - ('a[]]b', 'a]b', SUCCEED, 'found', 'a]b'), - ('a[^bc]d', 'aed', SUCCEED, 'found', 'aed'), - ('a[^bc]d', 'abd', FAIL), - ('a[^-b]c', 'adc', SUCCEED, 'found', 'adc'), - ('a[^-b]c', 'a-c', FAIL), - ('a[^]b]c', 'a]c', FAIL), - ('a[^]b]c', 'adc', SUCCEED, 'found', 'adc'), - ('ab|cd', 'abc', SUCCEED, 'found', 'ab'), - ('ab|cd', 'abcd', SUCCEED, 'found', 'ab'), - ('()ef', 'def', SUCCEED, 'found+"-"+g1', 'ef-'), - ('*a', '-', SYNTAX_ERROR), - ('(*)b', '-', SYNTAX_ERROR), - ('$b', 'b', FAIL), - ('a\\', '-', SYNTAX_ERROR), - ('a\\(b', 'a(b', SUCCEED, 'found+"-"+g1', 'a(b-Error'), - ('a\\(*b', 'ab', SUCCEED, 'found', 'ab'), - ('a\\(*b', 'a((b', SUCCEED, 'found', 'a((b'), - ('a\\\\b', 'a\\b', SUCCEED, 'found', 'a\\b'), - ('abc)', '-', SYNTAX_ERROR), - ('(abc', '-', SYNTAX_ERROR), - ('((a))', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'a-a-a'), - ('(a)b(c)', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'abc-a-c'), - ('a+b+c', 'aabbabc', SUCCEED, 'found', 'abc'), - ('a{1,}b{1,}c', 'aabbabc', SUCCEED, 'found', 'abc'), - ('a**', '-', SYNTAX_ERROR), - ('a.+?c', 'abcabc', SUCCEED, 'found', 'abc'), - ('(a+|b)*', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), - ('(a+|b){0,}', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), - ('(a+|b)+', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), - ('(a+|b){1,}', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), - ('(a+|b)?', 'ab', SUCCEED, 'found+"-"+g1', 'a-a'), - ('(a+|b){0,1}', 'ab', SUCCEED, 'found+"-"+g1', 'a-a'), - (')(', '-', SYNTAX_ERROR), - ('[^ab]*', 'cde', SUCCEED, 'found', 'cde'), - ('abc', '', FAIL), - ('a*', '', SUCCEED, 'found', ''), - ('([abc])*d', 'abbbcd', SUCCEED, 'found+"-"+g1', 'abbbcd-c'), - ('([abc])*bcd', 'abcd', SUCCEED, 'found+"-"+g1', 'abcd-a'), - ('a|b|c|d|e', 'e', SUCCEED, 'found', 'e'), - ('(a|b|c|d|e)f', 'ef', SUCCEED, 'found+"-"+g1', 'ef-e'), - ('abcd*efg', 
'abcdefg', SUCCEED, 'found', 'abcdefg'), - ('ab*', 'xabyabbbz', SUCCEED, 'found', 'ab'), - ('ab*', 'xayabbbz', SUCCEED, 'found', 'a'), - ('(ab|cd)e', 'abcde', SUCCEED, 'found+"-"+g1', 'cde-cd'), - ('[abhgefdc]ij', 'hij', SUCCEED, 'found', 'hij'), - ('^(ab|cd)e', 'abcde', FAIL), - ('(abc|)ef', 'abcdef', SUCCEED, 'found+"-"+g1', 'ef-'), - ('(a|b)c*d', 'abcd', SUCCEED, 'found+"-"+g1', 'bcd-b'), - ('(ab|ab*)bc', 'abc', SUCCEED, 'found+"-"+g1', 'abc-a'), - ('a([bc]*)c*', 'abc', SUCCEED, 'found+"-"+g1', 'abc-bc'), - ('a([bc]*)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'), - ('a([bc]+)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'), - ('a([bc]*)(c+d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-b-cd'), - ('a[bcd]*dcdcde', 'adcdcde', SUCCEED, 'found', 'adcdcde'), - ('a[bcd]+dcdcde', 'adcdcde', FAIL), - ('(ab|a)b*c', 'abc', SUCCEED, 'found+"-"+g1', 'abc-ab'), - ('((a)(b)c)(d)', 'abcd', SUCCEED, 'g1+"-"+g2+"-"+g3+"-"+g4', 'abc-a-b-d'), - ('[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', SUCCEED, 'found', 'alpha'), - ('^a(bc+|b[eh])g|.h$', 'abh', SUCCEED, 'found+"-"+g1', 'bh-None'), - ('(bc+d$|ef*g.|h?i(j|k))', 'effgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'), - ('(bc+d$|ef*g.|h?i(j|k))', 'ij', SUCCEED, 'found+"-"+g1+"-"+g2', 'ij-ij-j'), - ('(bc+d$|ef*g.|h?i(j|k))', 'effg', FAIL), - ('(bc+d$|ef*g.|h?i(j|k))', 'bcdd', FAIL), - ('(bc+d$|ef*g.|h?i(j|k))', 'reffgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'), - ('((((((((((a))))))))))', 'a', SUCCEED, 'g10', 'a'), - ('((((((((((a))))))))))\\10', 'aa', SUCCEED, 'found', 'aa'), -# Python does not have the same rules for \\41 so this is a syntax error -# ('((((((((((a))))))))))\\41', 'aa', FAIL), -# ('((((((((((a))))))))))\\41', 'a!', SUCCEED, 'found', 'a!'), - ('((((((((((a))))))))))\\41', '', SYNTAX_ERROR), - ('(?i)((((((((((a))))))))))\\41', '', SYNTAX_ERROR), - ('(((((((((a)))))))))', 'a', SUCCEED, 'found', 'a'), - ('multiple words of text', 'uh-uh', FAIL), - ('multiple words', 'multiple 
words, yeah', SUCCEED, 'found', 'multiple words'), - ('(.*)c(.*)', 'abcde', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcde-ab-de'), - ('\\((.*), (.*)\\)', '(a, b)', SUCCEED, 'g2+"-"+g1', 'b-a'), - ('[k]', 'ab', FAIL), - ('a[-]?c', 'ac', SUCCEED, 'found', 'ac'), - ('(abc)\\1', 'abcabc', SUCCEED, 'g1', 'abc'), - ('([a-c]*)\\1', 'abcabc', SUCCEED, 'g1', 'abc'), - ('(?i)abc', 'ABC', SUCCEED, 'found', 'ABC'), - ('(?i)abc', 'XBC', FAIL), - ('(?i)abc', 'AXC', FAIL), - ('(?i)abc', 'ABX', FAIL), - ('(?i)abc', 'XABCY', SUCCEED, 'found', 'ABC'), - ('(?i)abc', 'ABABC', SUCCEED, 'found', 'ABC'), - ('(?i)ab*c', 'ABC', SUCCEED, 'found', 'ABC'), - ('(?i)ab*bc', 'ABC', SUCCEED, 'found', 'ABC'), - ('(?i)ab*bc', 'ABBC', SUCCEED, 'found', 'ABBC'), - ('(?i)ab*?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), - ('(?i)ab{0,}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), - ('(?i)ab+?bc', 'ABBC', SUCCEED, 'found', 'ABBC'), - ('(?i)ab+bc', 'ABC', FAIL), - ('(?i)ab+bc', 'ABQ', FAIL), - ('(?i)ab{1,}bc', 'ABQ', FAIL), - ('(?i)ab+bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), - ('(?i)ab{1,}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), - ('(?i)ab{1,3}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), - ('(?i)ab{3,4}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), - ('(?i)ab{4,5}?bc', 'ABBBBC', FAIL), - ('(?i)ab??bc', 'ABBC', SUCCEED, 'found', 'ABBC'), - ('(?i)ab??bc', 'ABC', SUCCEED, 'found', 'ABC'), - ('(?i)ab{0,1}?bc', 'ABC', SUCCEED, 'found', 'ABC'), - ('(?i)ab??bc', 'ABBBBC', FAIL), - ('(?i)ab??c', 'ABC', SUCCEED, 'found', 'ABC'), - ('(?i)ab{0,1}?c', 'ABC', SUCCEED, 'found', 'ABC'), - ('(?i)^abc$', 'ABC', SUCCEED, 'found', 'ABC'), - ('(?i)^abc$', 'ABCC', FAIL), - ('(?i)^abc', 'ABCC', SUCCEED, 'found', 'ABC'), - ('(?i)^abc$', 'AABC', FAIL), - ('(?i)abc$', 'AABC', SUCCEED, 'found', 'ABC'), - ('(?i)^', 'ABC', SUCCEED, 'found', ''), - ('(?i)$', 'ABC', SUCCEED, 'found', ''), - ('(?i)a.c', 'ABC', SUCCEED, 'found', 'ABC'), - ('(?i)a.c', 'AXC', SUCCEED, 'found', 'AXC'), - ('(?i)a.*?c', 'AXYZC', SUCCEED, 'found', 'AXYZC'), - 
('(?i)a.*c', 'AXYZD', FAIL), - ('(?i)a[bc]d', 'ABC', FAIL), - ('(?i)a[bc]d', 'ABD', SUCCEED, 'found', 'ABD'), - ('(?i)a[b-d]e', 'ABD', FAIL), - ('(?i)a[b-d]e', 'ACE', SUCCEED, 'found', 'ACE'), - ('(?i)a[b-d]', 'AAC', SUCCEED, 'found', 'AC'), - ('(?i)a[-b]', 'A-', SUCCEED, 'found', 'A-'), - ('(?i)a[b-]', 'A-', SUCCEED, 'found', 'A-'), - ('(?i)a[b-a]', '-', SYNTAX_ERROR), - ('(?i)a[]b', '-', SYNTAX_ERROR), - ('(?i)a[', '-', SYNTAX_ERROR), - ('(?i)a]', 'A]', SUCCEED, 'found', 'A]'), - ('(?i)a[]]b', 'A]B', SUCCEED, 'found', 'A]B'), - ('(?i)a[^bc]d', 'AED', SUCCEED, 'found', 'AED'), - ('(?i)a[^bc]d', 'ABD', FAIL), - ('(?i)a[^-b]c', 'ADC', SUCCEED, 'found', 'ADC'), - ('(?i)a[^-b]c', 'A-C', FAIL), - ('(?i)a[^]b]c', 'A]C', FAIL), - ('(?i)a[^]b]c', 'ADC', SUCCEED, 'found', 'ADC'), - ('(?i)ab|cd', 'ABC', SUCCEED, 'found', 'AB'), - ('(?i)ab|cd', 'ABCD', SUCCEED, 'found', 'AB'), - ('(?i)()ef', 'DEF', SUCCEED, 'found+"-"+g1', 'EF-'), - ('(?i)*a', '-', SYNTAX_ERROR), - ('(?i)(*)b', '-', SYNTAX_ERROR), - ('(?i)$b', 'B', FAIL), - ('(?i)a\\', '-', SYNTAX_ERROR), - ('(?i)a\\(b', 'A(B', SUCCEED, 'found+"-"+g1', 'A(B-Error'), - ('(?i)a\\(*b', 'AB', SUCCEED, 'found', 'AB'), - ('(?i)a\\(*b', 'A((B', SUCCEED, 'found', 'A((B'), - ('(?i)a\\\\b', 'A\\B', SUCCEED, 'found', 'A\\B'), - ('(?i)abc)', '-', SYNTAX_ERROR), - ('(?i)(abc', '-', SYNTAX_ERROR), - ('(?i)((a))', 'ABC', SUCCEED, 'found+"-"+g1+"-"+g2', 'A-A-A'), - ('(?i)(a)b(c)', 'ABC', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABC-A-C'), - ('(?i)a+b+c', 'AABBABC', SUCCEED, 'found', 'ABC'), - ('(?i)a{1,}b{1,}c', 'AABBABC', SUCCEED, 'found', 'ABC'), - ('(?i)a**', '-', SYNTAX_ERROR), - ('(?i)a.+?c', 'ABCABC', SUCCEED, 'found', 'ABC'), - ('(?i)a.*?c', 'ABCABC', SUCCEED, 'found', 'ABC'), - ('(?i)a.{0,5}?c', 'ABCABC', SUCCEED, 'found', 'ABC'), - ('(?i)(a+|b)*', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'), - ('(?i)(a+|b){0,}', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'), - ('(?i)(a+|b)+', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'), - ('(?i)(a+|b){1,}', 'AB', SUCCEED, 
'found+"-"+g1', 'AB-B'), - ('(?i)(a+|b)?', 'AB', SUCCEED, 'found+"-"+g1', 'A-A'), - ('(?i)(a+|b){0,1}', 'AB', SUCCEED, 'found+"-"+g1', 'A-A'), - ('(?i)(a+|b){0,1}?', 'AB', SUCCEED, 'found+"-"+g1', '-None'), - ('(?i))(', '-', SYNTAX_ERROR), - ('(?i)[^ab]*', 'CDE', SUCCEED, 'found', 'CDE'), - ('(?i)abc', '', FAIL), - ('(?i)a*', '', SUCCEED, 'found', ''), - ('(?i)([abc])*d', 'ABBBCD', SUCCEED, 'found+"-"+g1', 'ABBBCD-C'), - ('(?i)([abc])*bcd', 'ABCD', SUCCEED, 'found+"-"+g1', 'ABCD-A'), - ('(?i)a|b|c|d|e', 'E', SUCCEED, 'found', 'E'), - ('(?i)(a|b|c|d|e)f', 'EF', SUCCEED, 'found+"-"+g1', 'EF-E'), - ('(?i)abcd*efg', 'ABCDEFG', SUCCEED, 'found', 'ABCDEFG'), - ('(?i)ab*', 'XABYABBBZ', SUCCEED, 'found', 'AB'), - ('(?i)ab*', 'XAYABBBZ', SUCCEED, 'found', 'A'), - ('(?i)(ab|cd)e', 'ABCDE', SUCCEED, 'found+"-"+g1', 'CDE-CD'), - ('(?i)[abhgefdc]ij', 'HIJ', SUCCEED, 'found', 'HIJ'), - ('(?i)^(ab|cd)e', 'ABCDE', FAIL), - ('(?i)(abc|)ef', 'ABCDEF', SUCCEED, 'found+"-"+g1', 'EF-'), - ('(?i)(a|b)c*d', 'ABCD', SUCCEED, 'found+"-"+g1', 'BCD-B'), - ('(?i)(ab|ab*)bc', 'ABC', SUCCEED, 'found+"-"+g1', 'ABC-A'), - ('(?i)a([bc]*)c*', 'ABC', SUCCEED, 'found+"-"+g1', 'ABC-BC'), - ('(?i)a([bc]*)(c*d)', 'ABCD', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCD-BC-D'), - ('(?i)a([bc]+)(c*d)', 'ABCD', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCD-BC-D'), - ('(?i)a([bc]*)(c+d)', 'ABCD', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCD-B-CD'), - ('(?i)a[bcd]*dcdcde', 'ADCDCDE', SUCCEED, 'found', 'ADCDCDE'), - ('(?i)a[bcd]+dcdcde', 'ADCDCDE', FAIL), - ('(?i)(ab|a)b*c', 'ABC', SUCCEED, 'found+"-"+g1', 'ABC-AB'), - ('(?i)((a)(b)c)(d)', 'ABCD', SUCCEED, 'g1+"-"+g2+"-"+g3+"-"+g4', 'ABC-A-B-D'), - ('(?i)[a-zA-Z_][a-zA-Z0-9_]*', 'ALPHA', SUCCEED, 'found', 'ALPHA'), - ('(?i)^a(bc+|b[eh])g|.h$', 'ABH', SUCCEED, 'found+"-"+g1', 'BH-None'), - ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFGZ', SUCCEED, 'found+"-"+g1+"-"+g2', 'EFFGZ-EFFGZ-None'), - ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'IJ', SUCCEED, 'found+"-"+g1+"-"+g2', 'IJ-IJ-J'), - 
('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFG', FAIL), - ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'BCDD', FAIL), - ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'REFFGZ', SUCCEED, 'found+"-"+g1+"-"+g2', 'EFFGZ-EFFGZ-None'), - ('(?i)((((((((((a))))))))))', 'A', SUCCEED, 'g10', 'A'), - ('(?i)((((((((((a))))))))))\\10', 'AA', SUCCEED, 'found', 'AA'), - #('(?i)((((((((((a))))))))))\\41', 'AA', FAIL), - #('(?i)((((((((((a))))))))))\\41', 'A!', SUCCEED, 'found', 'A!'), - ('(?i)(((((((((a)))))))))', 'A', SUCCEED, 'found', 'A'), - ('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a))))))))))', 'A', SUCCEED, 'g1', 'A'), - ('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a|b|c))))))))))', 'C', SUCCEED, 'g1', 'C'), - ('(?i)multiple words of text', 'UH-UH', FAIL), - ('(?i)multiple words', 'MULTIPLE WORDS, YEAH', SUCCEED, 'found', 'MULTIPLE WORDS'), - ('(?i)(.*)c(.*)', 'ABCDE', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCDE-AB-DE'), - ('(?i)\\((.*), (.*)\\)', '(A, B)', SUCCEED, 'g2+"-"+g1', 'B-A'), - ('(?i)[k]', 'AB', FAIL), -# ('(?i)abcd', 'ABCD', SUCCEED, 'found+"-"+\\found+"-"+\\\\found', 'ABCD-$&-\\ABCD'), -# ('(?i)a(bc)d', 'ABCD', SUCCEED, 'g1+"-"+\\g1+"-"+\\\\g1', 'BC-$1-\\BC'), - ('(?i)a[-]?c', 'AC', SUCCEED, 'found', 'AC'), - ('(?i)(abc)\\1', 'ABCABC', SUCCEED, 'g1', 'ABC'), - ('(?i)([a-c]*)\\1', 'ABCABC', SUCCEED, 'g1', 'ABC'), - ('a(?!b).', 'abad', SUCCEED, 'found', 'ad'), - ('a(?=d).', 'abad', SUCCEED, 'found', 'ad'), - ('a(?=c|d).', 'abad', SUCCEED, 'found', 'ad'), - ('a(?:b|c|d)(.)', 'ace', SUCCEED, 'g1', 'e'), - ('a(?:b|c|d)*(.)', 'ace', SUCCEED, 'g1', 'e'), - ('a(?:b|c|d)+?(.)', 'ace', SUCCEED, 'g1', 'e'), - ('a(?:b|(c|e){1,2}?|d)+?(.)', 'ace', SUCCEED, 'g1 + g2', 'ce'), - ('^(.+)?B', 'AB', SUCCEED, 'g1', 'A'), - - # lookbehind: split by : but not if it is escaped by -. 
- ('(?]*?b', 'a>b', FAIL), - # bug 490573: minimizing repeat problem - (r'^a*?$', 'foo', FAIL), - # bug 470582: nested groups problem - (r'^((a)c)?(ab)$', 'ab', SUCCEED, 'g1+"-"+g2+"-"+g3', 'None-None-ab'), - # another minimizing repeat problem (capturing groups in assertions) - ('^([ab]*?)(?=(b)?)c', 'abc', SUCCEED, 'g1+"-"+g2', 'ab-None'), - ('^([ab]*?)(?!(b))c', 'abc', SUCCEED, 'g1+"-"+g2', 'ab-None'), - ('^([ab]*?)(?a)', '', SYNTAX_ERROR), # Begins with a digit + ('(?Pa)', '', SYNTAX_ERROR), # Begins with an illegal char + ('(?Pa)', '', SYNTAX_ERROR), # Begins with an illegal char + + # Same tests, for the ?P= form + ('(?Pa)(?P=foo_123', 'aa', SYNTAX_ERROR), + ('(?Pa)(?P=1)', 'aa', SYNTAX_ERROR), + ('(?Pa)(?P=!)', 'aa', SYNTAX_ERROR), + ('(?Pa)(?P=foo_124', 'aa', SYNTAX_ERROR), # Backref to undefined group + + ('(?Pa)', 'a', SUCCEED, 'g1', 'a'), + ('(?Pa)(?P=foo_123)', 'aa', SUCCEED, 'g1', 'a'), + + # Test octal escapes + ('\\1', 'a', SYNTAX_ERROR), # Backreference + ('[\\1]', '\1', SUCCEED, 'found', '\1'), # Character + ('\\09', chr(0) + '9', SUCCEED, 'found', chr(0) + '9'), + ('\\141', 'a', SUCCEED, 'found', 'a'), + ('(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\\119', 'abcdefghijklk9', SUCCEED, 'found+"-"+g11', 'abcdefghijklk9-k'), + + # Test \0 is handled everywhere + (r'\0', '\0', SUCCEED, 'found', '\0'), + (r'[\0a]', '\0', SUCCEED, 'found', '\0'), + (r'[a\0]', '\0', SUCCEED, 'found', '\0'), + (r'[^a\0]', '\0', FAIL), + + # Test various letter escapes + (r'\a[\b]\f\n\r\t\v', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'), + (r'[\a][\b][\f][\n][\r][\t][\v]', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'), + # NOTE: not an error under PCRE/PRE: + # (r'\u', '', SYNTAX_ERROR), # A Perl escape + (r'\c\e\h\i\j\m\q\y\z', 'cehijmqyz', SUCCEED, 'found', 'cehijmqyz'), + (r'\xff', '\377', SUCCEED, 'found', chr(255)), + # new \x semantics + (r'\x00ffffffffffffff', '\377', FAIL, 'found', chr(255)), + (r'\x00f', '\017', FAIL, 'found', chr(15)), + (r'\x00fe', 
'\376', FAIL, 'found', chr(254)), + # (r'\x00ffffffffffffff', '\377', SUCCEED, 'found', chr(255)), + # (r'\x00f', '\017', SUCCEED, 'found', chr(15)), + # (r'\x00fe', '\376', SUCCEED, 'found', chr(254)), + + (r"^\w+=(\\[\000-\277]|[^\n\\])*", "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", + SUCCEED, 'found', "SRC=eval.c g.c blah blah blah \\\\"), + + # Test that . only matches \n in DOTALL mode + ('a.b', 'acb', SUCCEED, 'found', 'acb'), + ('a.b', 'a\nb', FAIL), + ('a.*b', 'acc\nccb', FAIL), + ('a.{4,5}b', 'acc\nccb', FAIL), + ('a.b', 'a\rb', SUCCEED, 'found', 'a\rb'), + ('(?s)a.b', 'a\nb', SUCCEED, 'found', 'a\nb'), + ('(?s)a.*b', 'acc\nccb', SUCCEED, 'found', 'acc\nccb'), + ('(?s)a.{4,5}b', 'acc\nccb', SUCCEED, 'found', 'acc\nccb'), + ('(?s)a.b', 'a\nb', SUCCEED, 'found', 'a\nb'), + + (')', '', SYNTAX_ERROR), # Unmatched right bracket + ('', '', SUCCEED, 'found', ''), # Empty pattern + ('abc', 'abc', SUCCEED, 'found', 'abc'), + ('abc', 'xbc', FAIL), + ('abc', 'axc', FAIL), + ('abc', 'abx', FAIL), + ('abc', 'xabcy', SUCCEED, 'found', 'abc'), + ('abc', 'ababc', SUCCEED, 'found', 'abc'), + ('ab*c', 'abc', SUCCEED, 'found', 'abc'), + ('ab*bc', 'abc', SUCCEED, 'found', 'abc'), + ('ab*bc', 'abbc', SUCCEED, 'found', 'abbc'), + ('ab*bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), + ('ab+bc', 'abbc', SUCCEED, 'found', 'abbc'), + ('ab+bc', 'abc', FAIL), + ('ab+bc', 'abq', FAIL), + ('ab+bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), + ('ab?bc', 'abbc', SUCCEED, 'found', 'abbc'), + ('ab?bc', 'abc', SUCCEED, 'found', 'abc'), + ('ab?bc', 'abbbbc', FAIL), + ('ab?c', 'abc', SUCCEED, 'found', 'abc'), + ('^abc$', 'abc', SUCCEED, 'found', 'abc'), + ('^abc$', 'abcc', FAIL), + ('^abc', 'abcc', SUCCEED, 'found', 'abc'), + ('^abc$', 'aabc', FAIL), + ('abc$', 'aabc', SUCCEED, 'found', 'abc'), + ('^', 'abc', SUCCEED, 'found+"-"', '-'), + ('$', 'abc', SUCCEED, 'found+"-"', '-'), + ('a.c', 'abc', SUCCEED, 'found', 'abc'), + ('a.c', 'axc', SUCCEED, 'found', 'axc'), + ('a.*c', 'axyzc', SUCCEED, 
'found', 'axyzc'), + ('a.*c', 'axyzd', FAIL), + ('a[bc]d', 'abc', FAIL), + ('a[bc]d', 'abd', SUCCEED, 'found', 'abd'), + ('a[b-d]e', 'abd', FAIL), + ('a[b-d]e', 'ace', SUCCEED, 'found', 'ace'), + ('a[b-d]', 'aac', SUCCEED, 'found', 'ac'), + ('a[-b]', 'a-', SUCCEED, 'found', 'a-'), + ('a[\\-b]', 'a-', SUCCEED, 'found', 'a-'), + # NOTE: not an error under PCRE/PRE: + # ('a[b-]', 'a-', SYNTAX_ERROR), + ('a[]b', '-', SYNTAX_ERROR), + ('a[', '-', SYNTAX_ERROR), + ('a\\', '-', SYNTAX_ERROR), + ('abc)', '-', SYNTAX_ERROR), + ('(abc', '-', SYNTAX_ERROR), + ('a]', 'a]', SUCCEED, 'found', 'a]'), + ('a[]]b', 'a]b', SUCCEED, 'found', 'a]b'), + ('a[\]]b', 'a]b', SUCCEED, 'found', 'a]b'), + ('a[^bc]d', 'aed', SUCCEED, 'found', 'aed'), + ('a[^bc]d', 'abd', FAIL), + ('a[^-b]c', 'adc', SUCCEED, 'found', 'adc'), + ('a[^-b]c', 'a-c', FAIL), + ('a[^]b]c', 'a]c', FAIL), + ('a[^]b]c', 'adc', SUCCEED, 'found', 'adc'), + ('\\ba\\b', 'a-', SUCCEED, '"-"', '-'), + ('\\ba\\b', '-a', SUCCEED, '"-"', '-'), + ('\\ba\\b', '-a-', SUCCEED, '"-"', '-'), + ('\\by\\b', 'xy', FAIL), + ('\\by\\b', 'yz', FAIL), + ('\\by\\b', 'xyz', FAIL), + ('x\\b', 'xyz', FAIL), + ('x\\B', 'xyz', SUCCEED, '"-"', '-'), + ('\\Bz', 'xyz', SUCCEED, '"-"', '-'), + ('z\\B', 'xyz', FAIL), + ('\\Bx', 'xyz', FAIL), + ('\\Ba\\B', 'a-', FAIL, '"-"', '-'), + ('\\Ba\\B', '-a', FAIL, '"-"', '-'), + ('\\Ba\\B', '-a-', FAIL, '"-"', '-'), + ('\\By\\B', 'xy', FAIL), + ('\\By\\B', 'yz', FAIL), + ('\\By\\b', 'xy', SUCCEED, '"-"', '-'), + ('\\by\\B', 'yz', SUCCEED, '"-"', '-'), + ('\\By\\B', 'xyz', SUCCEED, '"-"', '-'), + ('ab|cd', 'abc', SUCCEED, 'found', 'ab'), + ('ab|cd', 'abcd', SUCCEED, 'found', 'ab'), + ('()ef', 'def', SUCCEED, 'found+"-"+g1', 'ef-'), + ('$b', 'b', FAIL), + ('a\\(b', 'a(b', SUCCEED, 'found+"-"+g1', 'a(b-Error'), + ('a\\(*b', 'ab', SUCCEED, 'found', 'ab'), + ('a\\(*b', 'a((b', SUCCEED, 'found', 'a((b'), + ('a\\\\b', 'a\\b', SUCCEED, 'found', 'a\\b'), + ('((a))', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'a-a-a'), + 
('(a)b(c)', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'abc-a-c'), + ('a+b+c', 'aabbabc', SUCCEED, 'found', 'abc'), + ('(a+|b)*', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), + ('(a+|b)+', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), + ('(a+|b)?', 'ab', SUCCEED, 'found+"-"+g1', 'a-a'), + (')(', '-', SYNTAX_ERROR), + ('[^ab]*', 'cde', SUCCEED, 'found', 'cde'), + ('abc', '', FAIL), + ('a*', '', SUCCEED, 'found', ''), + ('a|b|c|d|e', 'e', SUCCEED, 'found', 'e'), + ('(a|b|c|d|e)f', 'ef', SUCCEED, 'found+"-"+g1', 'ef-e'), + ('abcd*efg', 'abcdefg', SUCCEED, 'found', 'abcdefg'), + ('ab*', 'xabyabbbz', SUCCEED, 'found', 'ab'), + ('ab*', 'xayabbbz', SUCCEED, 'found', 'a'), + ('(ab|cd)e', 'abcde', SUCCEED, 'found+"-"+g1', 'cde-cd'), + ('[abhgefdc]ij', 'hij', SUCCEED, 'found', 'hij'), + ('^(ab|cd)e', 'abcde', FAIL, 'xg1y', 'xy'), + ('(abc|)ef', 'abcdef', SUCCEED, 'found+"-"+g1', 'ef-'), + ('(a|b)c*d', 'abcd', SUCCEED, 'found+"-"+g1', 'bcd-b'), + ('(ab|ab*)bc', 'abc', SUCCEED, 'found+"-"+g1', 'abc-a'), + ('a([bc]*)c*', 'abc', SUCCEED, 'found+"-"+g1', 'abc-bc'), + ('a([bc]*)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'), + ('a([bc]+)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'), + ('a([bc]*)(c+d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-b-cd'), + ('a[bcd]*dcdcde', 'adcdcde', SUCCEED, 'found', 'adcdcde'), + ('a[bcd]+dcdcde', 'adcdcde', FAIL), + ('(ab|a)b*c', 'abc', SUCCEED, 'found+"-"+g1', 'abc-ab'), + ('((a)(b)c)(d)', 'abcd', SUCCEED, 'g1+"-"+g2+"-"+g3+"-"+g4', 'abc-a-b-d'), + ('[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', SUCCEED, 'found', 'alpha'), + ('^a(bc+|b[eh])g|.h$', 'abh', SUCCEED, 'found+"-"+g1', 'bh-None'), + ('(bc+d$|ef*g.|h?i(j|k))', 'effgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'), + ('(bc+d$|ef*g.|h?i(j|k))', 'ij', SUCCEED, 'found+"-"+g1+"-"+g2', 'ij-ij-j'), + ('(bc+d$|ef*g.|h?i(j|k))', 'effg', FAIL), + ('(bc+d$|ef*g.|h?i(j|k))', 'bcdd', FAIL), + ('(bc+d$|ef*g.|h?i(j|k))', 'reffgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'), + 
('(((((((((a)))))))))', 'a', SUCCEED, 'found', 'a'), + ('multiple words of text', 'uh-uh', FAIL), + ('multiple words', 'multiple words, yeah', SUCCEED, 'found', 'multiple words'), + ('(.*)c(.*)', 'abcde', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcde-ab-de'), + ('\\((.*), (.*)\\)', '(a, b)', SUCCEED, 'g2+"-"+g1', 'b-a'), + ('[k]', 'ab', FAIL), + ('a[-]?c', 'ac', SUCCEED, 'found', 'ac'), + ('(abc)\\1', 'abcabc', SUCCEED, 'g1', 'abc'), + ('([a-c]*)\\1', 'abcabc', SUCCEED, 'g1', 'abc'), + ('^(.+)?B', 'AB', SUCCEED, 'g1', 'A'), + ('(a+).\\1$', 'aaaaa', SUCCEED, 'found+"-"+g1', 'aaaaa-aa'), + ('^(a+).\\1$', 'aaaa', FAIL), + ('(abc)\\1', 'abcabc', SUCCEED, 'found+"-"+g1', 'abcabc-abc'), + ('([a-c]+)\\1', 'abcabc', SUCCEED, 'found+"-"+g1', 'abcabc-abc'), + ('(a)\\1', 'aa', SUCCEED, 'found+"-"+g1', 'aa-a'), + ('(a+)\\1', 'aa', SUCCEED, 'found+"-"+g1', 'aa-a'), + ('(a+)+\\1', 'aa', SUCCEED, 'found+"-"+g1', 'aa-a'), + ('(a).+\\1', 'aba', SUCCEED, 'found+"-"+g1', 'aba-a'), + ('(a)ba*\\1', 'aba', SUCCEED, 'found+"-"+g1', 'aba-a'), + ('(aa|a)a\\1$', 'aaa', SUCCEED, 'found+"-"+g1', 'aaa-a'), + ('(a|aa)a\\1$', 'aaa', SUCCEED, 'found+"-"+g1', 'aaa-a'), + ('(a+)a\\1$', 'aaa', SUCCEED, 'found+"-"+g1', 'aaa-a'), + ('([abc]*)\\1', 'abcabc', SUCCEED, 'found+"-"+g1', 'abcabc-abc'), + ('(a)(b)c|ab', 'ab', SUCCEED, 'found+"-"+g1+"-"+g2', 'ab-None-None'), + ('(a)+x', 'aaax', SUCCEED, 'found+"-"+g1', 'aaax-a'), + ('([ac])+x', 'aacx', SUCCEED, 'found+"-"+g1', 'aacx-c'), + ('([^/]*/)*sub1/', 'd:msgs/tdir/sub1/trial/away.cpp', SUCCEED, 'found+"-"+g1', 'd:msgs/tdir/sub1/-tdir/'), + ('([^.]*)\\.([^:]*):[T ]+(.*)', 'track1.title:TBlah blah blah', SUCCEED, 'found+"-"+g1+"-"+g2+"-"+g3', 'track1.title:TBlah blah blah-track1-title-Blah blah blah'), + ('([^N]*N)+', 'abNNxyzN', SUCCEED, 'found+"-"+g1', 'abNNxyzN-xyzN'), + ('([^N]*N)+', 'abNNxyz', SUCCEED, 'found+"-"+g1', 'abNN-N'), + ('([abc]*)x', 'abcx', SUCCEED, 'found+"-"+g1', 'abcx-abc'), + ('([abc]*)x', 'abc', FAIL), + ('([xyz]*)x', 'abcx', SUCCEED, 
'found+"-"+g1', 'x-'), + ('(a)+b|aac', 'aac', SUCCEED, 'found+"-"+g1', 'aac-None'), + + # Test symbolic groups + + ('(?Paaa)a', 'aaaa', SYNTAX_ERROR), + ('(?Paaa)a', 'aaaa', SUCCEED, 'found+"-"+id', 'aaaa-aaa'), + ('(?Paa)(?P=id)', 'aaaa', SUCCEED, 'found+"-"+id', 'aaaa-aa'), + ('(?Paa)(?P=xd)', 'aaaa', SYNTAX_ERROR), + + # Test octal escapes/memory references + + ('\\1', 'a', SYNTAX_ERROR), + ('\\09', chr(0) + '9', SUCCEED, 'found', chr(0) + '9'), + ('\\141', 'a', SUCCEED, 'found', 'a'), + ('(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\\119', 'abcdefghijklk9', SUCCEED, 'found+"-"+g11', 'abcdefghijklk9-k'), + + # All tests from Perl + + ('abc', 'abc', SUCCEED, 'found', 'abc'), + ('abc', 'xbc', FAIL), + ('abc', 'axc', FAIL), + ('abc', 'abx', FAIL), + ('abc', 'xabcy', SUCCEED, 'found', 'abc'), + ('abc', 'ababc', SUCCEED, 'found', 'abc'), + ('ab*c', 'abc', SUCCEED, 'found', 'abc'), + ('ab*bc', 'abc', SUCCEED, 'found', 'abc'), + ('ab*bc', 'abbc', SUCCEED, 'found', 'abbc'), + ('ab*bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), + ('ab{0,}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), + ('ab+bc', 'abbc', SUCCEED, 'found', 'abbc'), + ('ab+bc', 'abc', FAIL), + ('ab+bc', 'abq', FAIL), + ('ab{1,}bc', 'abq', FAIL), + ('ab+bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), + ('ab{1,}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), + ('ab{1,3}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), + ('ab{3,4}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), + ('ab{4,5}bc', 'abbbbc', FAIL), + ('ab?bc', 'abbc', SUCCEED, 'found', 'abbc'), + ('ab?bc', 'abc', SUCCEED, 'found', 'abc'), + ('ab{0,1}bc', 'abc', SUCCEED, 'found', 'abc'), + ('ab?bc', 'abbbbc', FAIL), + ('ab?c', 'abc', SUCCEED, 'found', 'abc'), + ('ab{0,1}c', 'abc', SUCCEED, 'found', 'abc'), + ('^abc$', 'abc', SUCCEED, 'found', 'abc'), + ('^abc$', 'abcc', FAIL), + ('^abc', 'abcc', SUCCEED, 'found', 'abc'), + ('^abc$', 'aabc', FAIL), + ('abc$', 'aabc', SUCCEED, 'found', 'abc'), + ('^', 'abc', SUCCEED, 'found', ''), + ('$', 'abc', SUCCEED, 'found', ''), + ('a.c', 
'abc', SUCCEED, 'found', 'abc'), + ('a.c', 'axc', SUCCEED, 'found', 'axc'), + ('a.*c', 'axyzc', SUCCEED, 'found', 'axyzc'), + ('a.*c', 'axyzd', FAIL), + ('a[bc]d', 'abc', FAIL), + ('a[bc]d', 'abd', SUCCEED, 'found', 'abd'), + ('a[b-d]e', 'abd', FAIL), + ('a[b-d]e', 'ace', SUCCEED, 'found', 'ace'), + ('a[b-d]', 'aac', SUCCEED, 'found', 'ac'), + ('a[-b]', 'a-', SUCCEED, 'found', 'a-'), + ('a[b-]', 'a-', SUCCEED, 'found', 'a-'), + ('a[b-a]', '-', SYNTAX_ERROR), + ('a[]b', '-', SYNTAX_ERROR), + ('a[', '-', SYNTAX_ERROR), + ('a]', 'a]', SUCCEED, 'found', 'a]'), + ('a[]]b', 'a]b', SUCCEED, 'found', 'a]b'), + ('a[^bc]d', 'aed', SUCCEED, 'found', 'aed'), + ('a[^bc]d', 'abd', FAIL), + ('a[^-b]c', 'adc', SUCCEED, 'found', 'adc'), + ('a[^-b]c', 'a-c', FAIL), + ('a[^]b]c', 'a]c', FAIL), + ('a[^]b]c', 'adc', SUCCEED, 'found', 'adc'), + ('ab|cd', 'abc', SUCCEED, 'found', 'ab'), + ('ab|cd', 'abcd', SUCCEED, 'found', 'ab'), + ('()ef', 'def', SUCCEED, 'found+"-"+g1', 'ef-'), + ('*a', '-', SYNTAX_ERROR), + ('(*)b', '-', SYNTAX_ERROR), + ('$b', 'b', FAIL), + ('a\\', '-', SYNTAX_ERROR), + ('a\\(b', 'a(b', SUCCEED, 'found+"-"+g1', 'a(b-Error'), + ('a\\(*b', 'ab', SUCCEED, 'found', 'ab'), + ('a\\(*b', 'a((b', SUCCEED, 'found', 'a((b'), + ('a\\\\b', 'a\\b', SUCCEED, 'found', 'a\\b'), + ('abc)', '-', SYNTAX_ERROR), + ('(abc', '-', SYNTAX_ERROR), + ('((a))', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'a-a-a'), + ('(a)b(c)', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'abc-a-c'), + ('a+b+c', 'aabbabc', SUCCEED, 'found', 'abc'), + ('a{1,}b{1,}c', 'aabbabc', SUCCEED, 'found', 'abc'), + ('a**', '-', SYNTAX_ERROR), + ('a.+?c', 'abcabc', SUCCEED, 'found', 'abc'), + ('(a+|b)*', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), + ('(a+|b){0,}', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), + ('(a+|b)+', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), + ('(a+|b){1,}', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), + ('(a+|b)?', 'ab', SUCCEED, 'found+"-"+g1', 'a-a'), + ('(a+|b){0,1}', 'ab', SUCCEED, 'found+"-"+g1', 'a-a'), + (')(', '-', 
SYNTAX_ERROR), + ('[^ab]*', 'cde', SUCCEED, 'found', 'cde'), + ('abc', '', FAIL), + ('a*', '', SUCCEED, 'found', ''), + ('([abc])*d', 'abbbcd', SUCCEED, 'found+"-"+g1', 'abbbcd-c'), + ('([abc])*bcd', 'abcd', SUCCEED, 'found+"-"+g1', 'abcd-a'), + ('a|b|c|d|e', 'e', SUCCEED, 'found', 'e'), + ('(a|b|c|d|e)f', 'ef', SUCCEED, 'found+"-"+g1', 'ef-e'), + ('abcd*efg', 'abcdefg', SUCCEED, 'found', 'abcdefg'), + ('ab*', 'xabyabbbz', SUCCEED, 'found', 'ab'), + ('ab*', 'xayabbbz', SUCCEED, 'found', 'a'), + ('(ab|cd)e', 'abcde', SUCCEED, 'found+"-"+g1', 'cde-cd'), + ('[abhgefdc]ij', 'hij', SUCCEED, 'found', 'hij'), + ('^(ab|cd)e', 'abcde', FAIL), + ('(abc|)ef', 'abcdef', SUCCEED, 'found+"-"+g1', 'ef-'), + ('(a|b)c*d', 'abcd', SUCCEED, 'found+"-"+g1', 'bcd-b'), + ('(ab|ab*)bc', 'abc', SUCCEED, 'found+"-"+g1', 'abc-a'), + ('a([bc]*)c*', 'abc', SUCCEED, 'found+"-"+g1', 'abc-bc'), + ('a([bc]*)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'), + ('a([bc]+)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'), + ('a([bc]*)(c+d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-b-cd'), + ('a[bcd]*dcdcde', 'adcdcde', SUCCEED, 'found', 'adcdcde'), + ('a[bcd]+dcdcde', 'adcdcde', FAIL), + ('(ab|a)b*c', 'abc', SUCCEED, 'found+"-"+g1', 'abc-ab'), + ('((a)(b)c)(d)', 'abcd', SUCCEED, 'g1+"-"+g2+"-"+g3+"-"+g4', 'abc-a-b-d'), + ('[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', SUCCEED, 'found', 'alpha'), + ('^a(bc+|b[eh])g|.h$', 'abh', SUCCEED, 'found+"-"+g1', 'bh-None'), + ('(bc+d$|ef*g.|h?i(j|k))', 'effgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'), + ('(bc+d$|ef*g.|h?i(j|k))', 'ij', SUCCEED, 'found+"-"+g1+"-"+g2', 'ij-ij-j'), + ('(bc+d$|ef*g.|h?i(j|k))', 'effg', FAIL), + ('(bc+d$|ef*g.|h?i(j|k))', 'bcdd', FAIL), + ('(bc+d$|ef*g.|h?i(j|k))', 'reffgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'), + ('((((((((((a))))))))))', 'a', SUCCEED, 'g10', 'a'), + ('((((((((((a))))))))))\\10', 'aa', SUCCEED, 'found', 'aa'), +# Python does not have the same rules for \\41 so this is a 
syntax error +# ('((((((((((a))))))))))\\41', 'aa', FAIL), +# ('((((((((((a))))))))))\\41', 'a!', SUCCEED, 'found', 'a!'), + ('((((((((((a))))))))))\\41', '', SYNTAX_ERROR), + ('(?i)((((((((((a))))))))))\\41', '', SYNTAX_ERROR), + ('(((((((((a)))))))))', 'a', SUCCEED, 'found', 'a'), + ('multiple words of text', 'uh-uh', FAIL), + ('multiple words', 'multiple words, yeah', SUCCEED, 'found', 'multiple words'), + ('(.*)c(.*)', 'abcde', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcde-ab-de'), + ('\\((.*), (.*)\\)', '(a, b)', SUCCEED, 'g2+"-"+g1', 'b-a'), + ('[k]', 'ab', FAIL), + ('a[-]?c', 'ac', SUCCEED, 'found', 'ac'), + ('(abc)\\1', 'abcabc', SUCCEED, 'g1', 'abc'), + ('([a-c]*)\\1', 'abcabc', SUCCEED, 'g1', 'abc'), + ('(?i)abc', 'ABC', SUCCEED, 'found', 'ABC'), + ('(?i)abc', 'XBC', FAIL), + ('(?i)abc', 'AXC', FAIL), + ('(?i)abc', 'ABX', FAIL), + ('(?i)abc', 'XABCY', SUCCEED, 'found', 'ABC'), + ('(?i)abc', 'ABABC', SUCCEED, 'found', 'ABC'), + ('(?i)ab*c', 'ABC', SUCCEED, 'found', 'ABC'), + ('(?i)ab*bc', 'ABC', SUCCEED, 'found', 'ABC'), + ('(?i)ab*bc', 'ABBC', SUCCEED, 'found', 'ABBC'), + ('(?i)ab*?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), + ('(?i)ab{0,}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), + ('(?i)ab+?bc', 'ABBC', SUCCEED, 'found', 'ABBC'), + ('(?i)ab+bc', 'ABC', FAIL), + ('(?i)ab+bc', 'ABQ', FAIL), + ('(?i)ab{1,}bc', 'ABQ', FAIL), + ('(?i)ab+bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), + ('(?i)ab{1,}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), + ('(?i)ab{1,3}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), + ('(?i)ab{3,4}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), + ('(?i)ab{4,5}?bc', 'ABBBBC', FAIL), + ('(?i)ab??bc', 'ABBC', SUCCEED, 'found', 'ABBC'), + ('(?i)ab??bc', 'ABC', SUCCEED, 'found', 'ABC'), + ('(?i)ab{0,1}?bc', 'ABC', SUCCEED, 'found', 'ABC'), + ('(?i)ab??bc', 'ABBBBC', FAIL), + ('(?i)ab??c', 'ABC', SUCCEED, 'found', 'ABC'), + ('(?i)ab{0,1}?c', 'ABC', SUCCEED, 'found', 'ABC'), + ('(?i)^abc$', 'ABC', SUCCEED, 'found', 'ABC'), + ('(?i)^abc$', 'ABCC', FAIL), + 
('(?i)^abc', 'ABCC', SUCCEED, 'found', 'ABC'), + ('(?i)^abc$', 'AABC', FAIL), + ('(?i)abc$', 'AABC', SUCCEED, 'found', 'ABC'), + ('(?i)^', 'ABC', SUCCEED, 'found', ''), + ('(?i)$', 'ABC', SUCCEED, 'found', ''), + ('(?i)a.c', 'ABC', SUCCEED, 'found', 'ABC'), + ('(?i)a.c', 'AXC', SUCCEED, 'found', 'AXC'), + ('(?i)a.*?c', 'AXYZC', SUCCEED, 'found', 'AXYZC'), + ('(?i)a.*c', 'AXYZD', FAIL), + ('(?i)a[bc]d', 'ABC', FAIL), + ('(?i)a[bc]d', 'ABD', SUCCEED, 'found', 'ABD'), + ('(?i)a[b-d]e', 'ABD', FAIL), + ('(?i)a[b-d]e', 'ACE', SUCCEED, 'found', 'ACE'), + ('(?i)a[b-d]', 'AAC', SUCCEED, 'found', 'AC'), + ('(?i)a[-b]', 'A-', SUCCEED, 'found', 'A-'), + ('(?i)a[b-]', 'A-', SUCCEED, 'found', 'A-'), + ('(?i)a[b-a]', '-', SYNTAX_ERROR), + ('(?i)a[]b', '-', SYNTAX_ERROR), + ('(?i)a[', '-', SYNTAX_ERROR), + ('(?i)a]', 'A]', SUCCEED, 'found', 'A]'), + ('(?i)a[]]b', 'A]B', SUCCEED, 'found', 'A]B'), + ('(?i)a[^bc]d', 'AED', SUCCEED, 'found', 'AED'), + ('(?i)a[^bc]d', 'ABD', FAIL), + ('(?i)a[^-b]c', 'ADC', SUCCEED, 'found', 'ADC'), + ('(?i)a[^-b]c', 'A-C', FAIL), + ('(?i)a[^]b]c', 'A]C', FAIL), + ('(?i)a[^]b]c', 'ADC', SUCCEED, 'found', 'ADC'), + ('(?i)ab|cd', 'ABC', SUCCEED, 'found', 'AB'), + ('(?i)ab|cd', 'ABCD', SUCCEED, 'found', 'AB'), + ('(?i)()ef', 'DEF', SUCCEED, 'found+"-"+g1', 'EF-'), + ('(?i)*a', '-', SYNTAX_ERROR), + ('(?i)(*)b', '-', SYNTAX_ERROR), + ('(?i)$b', 'B', FAIL), + ('(?i)a\\', '-', SYNTAX_ERROR), + ('(?i)a\\(b', 'A(B', SUCCEED, 'found+"-"+g1', 'A(B-Error'), + ('(?i)a\\(*b', 'AB', SUCCEED, 'found', 'AB'), + ('(?i)a\\(*b', 'A((B', SUCCEED, 'found', 'A((B'), + ('(?i)a\\\\b', 'A\\B', SUCCEED, 'found', 'A\\B'), + ('(?i)abc)', '-', SYNTAX_ERROR), + ('(?i)(abc', '-', SYNTAX_ERROR), + ('(?i)((a))', 'ABC', SUCCEED, 'found+"-"+g1+"-"+g2', 'A-A-A'), + ('(?i)(a)b(c)', 'ABC', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABC-A-C'), + ('(?i)a+b+c', 'AABBABC', SUCCEED, 'found', 'ABC'), + ('(?i)a{1,}b{1,}c', 'AABBABC', SUCCEED, 'found', 'ABC'), + ('(?i)a**', '-', SYNTAX_ERROR), + 
('(?i)a.+?c', 'ABCABC', SUCCEED, 'found', 'ABC'), + ('(?i)a.*?c', 'ABCABC', SUCCEED, 'found', 'ABC'), + ('(?i)a.{0,5}?c', 'ABCABC', SUCCEED, 'found', 'ABC'), + ('(?i)(a+|b)*', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'), + ('(?i)(a+|b){0,}', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'), + ('(?i)(a+|b)+', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'), + ('(?i)(a+|b){1,}', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'), + ('(?i)(a+|b)?', 'AB', SUCCEED, 'found+"-"+g1', 'A-A'), + ('(?i)(a+|b){0,1}', 'AB', SUCCEED, 'found+"-"+g1', 'A-A'), + ('(?i)(a+|b){0,1}?', 'AB', SUCCEED, 'found+"-"+g1', '-None'), + ('(?i))(', '-', SYNTAX_ERROR), + ('(?i)[^ab]*', 'CDE', SUCCEED, 'found', 'CDE'), + ('(?i)abc', '', FAIL), + ('(?i)a*', '', SUCCEED, 'found', ''), + ('(?i)([abc])*d', 'ABBBCD', SUCCEED, 'found+"-"+g1', 'ABBBCD-C'), + ('(?i)([abc])*bcd', 'ABCD', SUCCEED, 'found+"-"+g1', 'ABCD-A'), + ('(?i)a|b|c|d|e', 'E', SUCCEED, 'found', 'E'), + ('(?i)(a|b|c|d|e)f', 'EF', SUCCEED, 'found+"-"+g1', 'EF-E'), + ('(?i)abcd*efg', 'ABCDEFG', SUCCEED, 'found', 'ABCDEFG'), + ('(?i)ab*', 'XABYABBBZ', SUCCEED, 'found', 'AB'), + ('(?i)ab*', 'XAYABBBZ', SUCCEED, 'found', 'A'), + ('(?i)(ab|cd)e', 'ABCDE', SUCCEED, 'found+"-"+g1', 'CDE-CD'), + ('(?i)[abhgefdc]ij', 'HIJ', SUCCEED, 'found', 'HIJ'), + ('(?i)^(ab|cd)e', 'ABCDE', FAIL), + ('(?i)(abc|)ef', 'ABCDEF', SUCCEED, 'found+"-"+g1', 'EF-'), + ('(?i)(a|b)c*d', 'ABCD', SUCCEED, 'found+"-"+g1', 'BCD-B'), + ('(?i)(ab|ab*)bc', 'ABC', SUCCEED, 'found+"-"+g1', 'ABC-A'), + ('(?i)a([bc]*)c*', 'ABC', SUCCEED, 'found+"-"+g1', 'ABC-BC'), + ('(?i)a([bc]*)(c*d)', 'ABCD', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCD-BC-D'), + ('(?i)a([bc]+)(c*d)', 'ABCD', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCD-BC-D'), + ('(?i)a([bc]*)(c+d)', 'ABCD', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCD-B-CD'), + ('(?i)a[bcd]*dcdcde', 'ADCDCDE', SUCCEED, 'found', 'ADCDCDE'), + ('(?i)a[bcd]+dcdcde', 'ADCDCDE', FAIL), + ('(?i)(ab|a)b*c', 'ABC', SUCCEED, 'found+"-"+g1', 'ABC-AB'), + ('(?i)((a)(b)c)(d)', 'ABCD', SUCCEED, 
'g1+"-"+g2+"-"+g3+"-"+g4', 'ABC-A-B-D'), + ('(?i)[a-zA-Z_][a-zA-Z0-9_]*', 'ALPHA', SUCCEED, 'found', 'ALPHA'), + ('(?i)^a(bc+|b[eh])g|.h$', 'ABH', SUCCEED, 'found+"-"+g1', 'BH-None'), + ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFGZ', SUCCEED, 'found+"-"+g1+"-"+g2', 'EFFGZ-EFFGZ-None'), + ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'IJ', SUCCEED, 'found+"-"+g1+"-"+g2', 'IJ-IJ-J'), + ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFG', FAIL), + ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'BCDD', FAIL), + ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'REFFGZ', SUCCEED, 'found+"-"+g1+"-"+g2', 'EFFGZ-EFFGZ-None'), + ('(?i)((((((((((a))))))))))', 'A', SUCCEED, 'g10', 'A'), + ('(?i)((((((((((a))))))))))\\10', 'AA', SUCCEED, 'found', 'AA'), + #('(?i)((((((((((a))))))))))\\41', 'AA', FAIL), + #('(?i)((((((((((a))))))))))\\41', 'A!', SUCCEED, 'found', 'A!'), + ('(?i)(((((((((a)))))))))', 'A', SUCCEED, 'found', 'A'), + ('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a))))))))))', 'A', SUCCEED, 'g1', 'A'), + ('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a|b|c))))))))))', 'C', SUCCEED, 'g1', 'C'), + ('(?i)multiple words of text', 'UH-UH', FAIL), + ('(?i)multiple words', 'MULTIPLE WORDS, YEAH', SUCCEED, 'found', 'MULTIPLE WORDS'), + ('(?i)(.*)c(.*)', 'ABCDE', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCDE-AB-DE'), + ('(?i)\\((.*), (.*)\\)', '(A, B)', SUCCEED, 'g2+"-"+g1', 'B-A'), + ('(?i)[k]', 'AB', FAIL), +# ('(?i)abcd', 'ABCD', SUCCEED, 'found+"-"+\\found+"-"+\\\\found', 'ABCD-$&-\\ABCD'), +# ('(?i)a(bc)d', 'ABCD', SUCCEED, 'g1+"-"+\\g1+"-"+\\\\g1', 'BC-$1-\\BC'), + ('(?i)a[-]?c', 'AC', SUCCEED, 'found', 'AC'), + ('(?i)(abc)\\1', 'ABCABC', SUCCEED, 'g1', 'ABC'), + ('(?i)([a-c]*)\\1', 'ABCABC', SUCCEED, 'g1', 'ABC'), + ('a(?!b).', 'abad', SUCCEED, 'found', 'ad'), + ('a(?=d).', 'abad', SUCCEED, 'found', 'ad'), + ('a(?=c|d).', 'abad', SUCCEED, 'found', 'ad'), + ('a(?:b|c|d)(.)', 'ace', SUCCEED, 'g1', 'e'), + ('a(?:b|c|d)*(.)', 'ace', SUCCEED, 'g1', 'e'), + ('a(?:b|c|d)+?(.)', 'ace', SUCCEED, 'g1', 'e'), + ('a(?:b|(c|e){1,2}?|d)+?(.)', 'ace', SUCCEED, 'g1 + g2', 'ce'), + 
('^(.+)?B', 'AB', SUCCEED, 'g1', 'A'), + + # lookbehind: split by : but not if it is escaped by -. + ('(?]*?b', 'a>b', FAIL), + # bug 490573: minimizing repeat problem + (r'^a*?$', 'foo', FAIL), + # bug 470582: nested groups problem + (r'^((a)c)?(ab)$', 'ab', SUCCEED, 'g1+"-"+g2+"-"+g3', 'None-None-ab'), + # another minimizing repeat problem (capturing groups in assertions) + ('^([ab]*?)(?=(b)?)c', 'abc', SUCCEED, 'g1+"-"+g2', 'ab-None'), + ('^([ab]*?)(?!(b))c', 'abc', SUCCEED, 'g1+"-"+g2', 'ab-None'), + ('^([ab]*?)(?x)', '\g\g', 'xx'), 'xxxx') - self.assertEqual(re.sub('(?Px)', '\g\g<1>', 'xx'), 'xxxx') - self.assertEqual(re.sub('(?Px)', '\g\g', 'xx'), 'xxxx') - self.assertEqual(re.sub('(?Px)', '\g<1>\g<1>', 'xx'), 'xxxx') - - self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'), - '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D') - self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a') - self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), - (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7))) - - self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest') - - def test_bug_449964(self): - # fails for group followed by other escape - self.assertEqual(re.sub(r'(?Px)', '\g<1>\g<1>\\b', 'xx'), - 'xx\bxx\b') - - def test_bug_449000(self): - # Test for sub() on escaped characters - self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'), - 'abc\ndef\n') - self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'), - 'abc\ndef\n') - self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'), - 'abc\ndef\n') - self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'), - 'abc\ndef\n') - - def test_bug_1140(self): - # re.sub(x, y, u'') should return u'', not '', and - # re.sub(x, y, '') should return '', not u''. - # Also: - # re.sub(x, y, unicode(x)) should return unicode(y), and - # re.sub(x, y, str(x)) should return - # str(y) if isinstance(y, str) else unicode(y). 
- for x in 'x', u'x': - for y in 'y', u'y': - z = re.sub(x, y, u'') - self.assertEqual(z, u'') - self.assertEqual(type(z), unicode) - # - z = re.sub(x, y, '') - self.assertEqual(z, '') - self.assertEqual(type(z), str) - # - z = re.sub(x, y, unicode(x)) - self.assertEqual(z, y) - self.assertEqual(type(z), unicode) - # - z = re.sub(x, y, str(x)) - self.assertEqual(z, y) - self.assertEqual(type(z), type(y)) - - def test_bug_1661(self): - # Verify that flags do not get silently ignored with compiled patterns - pattern = re.compile('.') - self.assertRaises(ValueError, re.match, pattern, 'A', re.I) - self.assertRaises(ValueError, re.search, pattern, 'A', re.I) - self.assertRaises(ValueError, re.findall, pattern, 'A', re.I) - self.assertRaises(ValueError, re.compile, pattern, re.I) - - def test_bug_3629(self): - # A regex that triggered a bug in the sre-code validator - re.compile("(?P)(?(quote))") - - def test_sub_template_numeric_escape(self): - # bug 776311 and friends - self.assertEqual(re.sub('x', r'\0', 'x'), '\0') - self.assertEqual(re.sub('x', r'\000', 'x'), '\000') - self.assertEqual(re.sub('x', r'\001', 'x'), '\001') - self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8') - self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9') - self.assertEqual(re.sub('x', r'\111', 'x'), '\111') - self.assertEqual(re.sub('x', r'\117', 'x'), '\117') - - self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111') - self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1') - - self.assertEqual(re.sub('x', r'\00', 'x'), '\x00') - self.assertEqual(re.sub('x', r'\07', 'x'), '\x07') - self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8') - self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9') - self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a') - - self.assertEqual(re.sub('x', r'\400', 'x'), '\0') - self.assertEqual(re.sub('x', r'\777', 'x'), '\377') - - self.assertRaises(re.error, re.sub, 'x', r'\1', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\8', 'x') - 
self.assertRaises(re.error, re.sub, 'x', r'\9', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\11', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\18', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\90', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\99', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8' - self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1' - self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0' - - # in python2.3 (etc), these loop endlessly in sre_parser.py - self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x') - self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'), - 'xz8') - self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'), - 'xza') - - def test_qualified_re_sub(self): - self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb') - self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa') - - def test_bug_114660(self): - self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'), - 'hello there') - - def test_bug_462270(self): - # Test for empty sub() behaviour, see SF bug #462270 - self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-') - self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d') - - def test_symbolic_refs(self): - self.assertRaises(re.error, re.sub, '(?Px)', '\gx)', '\g<', 'xx') - self.assertRaises(re.error, re.sub, '(?Px)', '\g', 'xx') - self.assertRaises(re.error, re.sub, '(?Px)', '\g', 'xx') - self.assertRaises(re.error, re.sub, '(?Px)', '\g<1a1>', 'xx') - self.assertRaises(IndexError, re.sub, '(?Px)', '\g', 'xx') - self.assertRaises(re.error, re.sub, '(?Px)|(?Py)', '\g', 'xx') - self.assertRaises(re.error, re.sub, '(?Px)|(?Py)', '\\2', 'xx') - self.assertRaises(re.error, re.sub, '(?Px)', '\g<-1>', 'xx') - - def test_re_subn(self): - self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) 
- self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1)) - self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0)) - self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4)) - self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2)) - - def test_re_split(self): - self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c']) - self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c']) - self.assertEqual(re.split("(:*)", ":a:b::c"), - ['', ':', 'a', ':', 'b', '::', 'c']) - self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c']) - self.assertEqual(re.split("(:)*", ":a:b::c"), - ['', ':', 'a', ':', 'b', ':', 'c']) - self.assertEqual(re.split("([b:]+)", ":a:b::c"), - ['', ':', 'a', ':b::', 'c']) - self.assertEqual(re.split("(b)|(:+)", ":a:b::c"), - ['', None, ':', 'a', None, ':', '', 'b', None, '', - None, '::', 'c']) - self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"), - ['', 'a', '', '', 'c']) - - def test_qualified_re_split(self): - self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c']) - self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d']) - self.assertEqual(re.split("(:)", ":a:b::c", 2), - ['', ':', 'a', ':', 'b::c']) - self.assertEqual(re.split("(:*)", ":a:b::c", 2), - ['', ':', 'a', ':', 'b::c']) - - def test_re_findall(self): - self.assertEqual(re.findall(":+", "abc"), []) - self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"]) - self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"]) - self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""), - (":", ":"), - (":", "::")]) - - def test_bug_117612(self): - self.assertEqual(re.findall(r"(a|(b))", "aba"), - [("a", ""),("b", "b"),("a", "")]) - - def test_re_match(self): - self.assertEqual(re.match('a', 'a').groups(), ()) - self.assertEqual(re.match('(a)', 'a').groups(), ('a',)) - self.assertEqual(re.match(r'(a)', 'a').group(0), 'a') - self.assertEqual(re.match(r'(a)', 'a').group(1), 'a') - 
self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a')) - - pat = re.compile('((a)|(b))(c)?') - self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None)) - self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None)) - self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c')) - self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c')) - self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c')) - - # A single group - m = re.match('(a)', 'a') - self.assertEqual(m.group(0), 'a') - self.assertEqual(m.group(0), 'a') - self.assertEqual(m.group(1), 'a') - self.assertEqual(m.group(1, 1), ('a', 'a')) - - pat = re.compile('(?:(?Pa)|(?Pb))(?Pc)?') - self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None)) - self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'), - (None, 'b', None)) - self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c')) - - def test_re_groupref_exists(self): - self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(), - ('(', 'a')) - self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(), - (None, 'a')) - self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'), None) - self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a'), None) - self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(), - ('a', 'b')) - self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(), - (None, 'd')) - self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(), - (None, 'd')) - self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(), - ('a', '')) - - # Tests for bug #1177831: exercise groups other than the first group - p = re.compile('(?Pa)(?Pb)?((?(g2)c|d))') - self.assertEqual(p.match('abc').groups(), - ('a', 'b', 'c')) - self.assertEqual(p.match('ad').groups(), - ('a', None, 'd')) - self.assertEqual(p.match('abd'), None) - self.assertEqual(p.match('ac'), None) - - - def test_re_groupref(self): - self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', 
'|a|').groups(), - ('|', 'a')) - self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(), - (None, 'a')) - self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', 'a|'), None) - self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a'), None) - self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(), - ('a', 'a')) - self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(), - (None, None)) - - def test_groupdict(self): - self.assertEqual(re.match('(?Pfirst) (?Psecond)', - 'first second').groupdict(), - {'first':'first', 'second':'second'}) - - def test_expand(self): - self.assertEqual(re.match("(?Pfirst) (?Psecond)", - "first second") - .expand(r"\2 \1 \g \g"), - "second first second first") - - def test_repeat_minmax(self): - self.assertEqual(re.match("^(\w){1}$", "abc"), None) - self.assertEqual(re.match("^(\w){1}?$", "abc"), None) - self.assertEqual(re.match("^(\w){1,2}$", "abc"), None) - self.assertEqual(re.match("^(\w){1,2}?$", "abc"), None) - - self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c") - self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c") - self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c") - self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c") - self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c") - self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c") - self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c") - self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c") - - self.assertEqual(re.match("^x{1}$", "xxx"), None) - self.assertEqual(re.match("^x{1}?$", "xxx"), None) - self.assertEqual(re.match("^x{1,2}$", "xxx"), None) - self.assertEqual(re.match("^x{1,2}?$", "xxx"), None) - - self.assertNotEqual(re.match("^x{3}$", "xxx"), None) - self.assertNotEqual(re.match("^x{1,3}$", "xxx"), None) - self.assertNotEqual(re.match("^x{1,4}$", "xxx"), None) - self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None) - self.assertNotEqual(re.match("^x{3}?$", "xxx"), None) - 
self.assertNotEqual(re.match("^x{1,3}?$", "xxx"), None) - self.assertNotEqual(re.match("^x{1,4}?$", "xxx"), None) - self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None) - - self.assertEqual(re.match("^x{}$", "xxx"), None) - self.assertNotEqual(re.match("^x{}$", "x{}"), None) - - def test_getattr(self): - self.assertEqual(re.match("(a)", "a").pos, 0) - self.assertEqual(re.match("(a)", "a").endpos, 1) - self.assertEqual(re.match("(a)", "a").string, "a") - self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1))) - self.assertNotEqual(re.match("(a)", "a").re, None) - - def test_special_escapes(self): - self.assertEqual(re.search(r"\b(b.)\b", - "abcd abc bcd bx").group(1), "bx") - self.assertEqual(re.search(r"\B(b.)\B", - "abc bcd bc abxd").group(1), "bx") - self.assertEqual(re.search(r"\b(b.)\b", - "abcd abc bcd bx", re.LOCALE).group(1), "bx") - self.assertEqual(re.search(r"\B(b.)\B", - "abc bcd bc abxd", re.LOCALE).group(1), "bx") - self.assertEqual(re.search(r"\b(b.)\b", - "abcd abc bcd bx", re.UNICODE).group(1), "bx") - self.assertEqual(re.search(r"\B(b.)\B", - "abc bcd bc abxd", re.UNICODE).group(1), "bx") - self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc") - self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc") - self.assertEqual(re.search(r"^\Aabc\Z$", "\nabc\n", re.M), None) - self.assertEqual(re.search(r"\b(b.)\b", - u"abcd abc bcd bx").group(1), "bx") - self.assertEqual(re.search(r"\B(b.)\B", - u"abc bcd bc abxd").group(1), "bx") - self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc") - self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc") - self.assertEqual(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M), None) - self.assertEqual(re.search(r"\d\D\w\W\s\S", - "1aa! a").group(0), "1aa! a") - self.assertEqual(re.search(r"\d\D\w\W\s\S", - "1aa! a", re.LOCALE).group(0), "1aa! a") - self.assertEqual(re.search(r"\d\D\w\W\s\S", - "1aa! a", re.UNICODE).group(0), "1aa! 
a") - - def test_bigcharset(self): - self.assertEqual(re.match(u"([\u2222\u2223])", - u"\u2222").group(1), u"\u2222") - self.assertEqual(re.match(u"([\u2222\u2223])", - u"\u2222", re.UNICODE).group(1), u"\u2222") - - def test_anyall(self): - self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0), - "a\nb") - self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0), - "a\n\nb") - - def test_non_consuming(self): - self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a") - self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a") - self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a") - self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a") - self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a") - self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a") - self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a") - - self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a") - self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a") - self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a") - self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a") - - def test_ignore_case(self): - self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") - self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC") - self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b") - self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb") - self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b") - self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb") - self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a") - self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa") - self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a") - self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa") - - def test_category(self): - 
self.assertEqual(re.match(r"(\s)", " ").group(1), " ") - - def test_getlower(self): - import _sre - self.assertEqual(_sre.getlower(ord('A'), 0), ord('a')) - self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a')) - self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a')) - - self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") - self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC") - - def test_not_literal(self): - self.assertEqual(re.search("\s([^a])", " b").group(1), "b") - self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb") - - def test_search_coverage(self): - self.assertEqual(re.search("\s(b)", " b").group(1), "b") - self.assertEqual(re.search("a\s", "a ").group(0), "a ") - - def test_re_escape(self): - p="" - for i in range(0, 256): - p = p + chr(i) - self.assertEqual(re.match(re.escape(chr(i)), chr(i)) is not None, - True) - self.assertEqual(re.match(re.escape(chr(i)), chr(i)).span(), (0,1)) - - pat=re.compile(re.escape(p)) - self.assertEqual(pat.match(p) is not None, True) - self.assertEqual(pat.match(p).span(), (0,256)) - - def test_pickling(self): - import pickle - self.pickle_test(pickle) - import cPickle - self.pickle_test(cPickle) - # old pickles expect the _compile() reconstructor in sre module - import warnings - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "The sre module is deprecated", - DeprecationWarning) - from sre import _compile - - def pickle_test(self, pickle): - oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)') - s = pickle.dumps(oldpat) - newpat = pickle.loads(s) - self.assertEqual(oldpat, newpat) - - def test_constants(self): - self.assertEqual(re.I, re.IGNORECASE) - self.assertEqual(re.L, re.LOCALE) - self.assertEqual(re.M, re.MULTILINE) - self.assertEqual(re.S, re.DOTALL) - self.assertEqual(re.X, re.VERBOSE) - - def test_flags(self): - for flag in [re.I, re.M, re.X, re.S, re.L]: - self.assertNotEqual(re.compile('^pattern$', flag), None) - - def 
test_sre_character_literals(self): - for i in [0, 8, 16, 32, 64, 127, 128, 255]: - self.assertNotEqual(re.match(r"\%03o" % i, chr(i)), None) - self.assertNotEqual(re.match(r"\%03o0" % i, chr(i)+"0"), None) - self.assertNotEqual(re.match(r"\%03o8" % i, chr(i)+"8"), None) - self.assertNotEqual(re.match(r"\x%02x" % i, chr(i)), None) - self.assertNotEqual(re.match(r"\x%02x0" % i, chr(i)+"0"), None) - self.assertNotEqual(re.match(r"\x%02xz" % i, chr(i)+"z"), None) - self.assertRaises(re.error, re.match, "\911", "") - - def test_sre_character_class_literals(self): - for i in [0, 8, 16, 32, 64, 127, 128, 255]: - self.assertNotEqual(re.match(r"[\%03o]" % i, chr(i)), None) - self.assertNotEqual(re.match(r"[\%03o0]" % i, chr(i)), None) - self.assertNotEqual(re.match(r"[\%03o8]" % i, chr(i)), None) - self.assertNotEqual(re.match(r"[\x%02x]" % i, chr(i)), None) - self.assertNotEqual(re.match(r"[\x%02x0]" % i, chr(i)), None) - self.assertNotEqual(re.match(r"[\x%02xz]" % i, chr(i)), None) - self.assertRaises(re.error, re.match, "[\911]", "") - - def test_bug_113254(self): - self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1) - self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1) - self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1)) - - def test_bug_527371(self): - # bug described in patches 527371/672491 - self.assertEqual(re.match(r'(a)?a','a').lastindex, None) - self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1) - self.assertEqual(re.match(r'(?Pa)(?Pb)?b','ab').lastgroup, 'a') - self.assertEqual(re.match("(?Pa(b))", "ab").lastgroup, 'a') - self.assertEqual(re.match("((a))", "a").lastindex, 1) - - def test_bug_545855(self): - # bug 545855 -- This pattern failed to cause a compile error as it - # should, instead provoking a TypeError. - self.assertRaises(re.error, re.compile, 'foo[a-') - - def test_bug_418626(self): - # bugs 418626 at al. 
-- Testing Greg Chapman's addition of op code - # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of - # pattern '*?' on a long string. - self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001) - self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0), - 20003) - self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001) - # non-simple '*?' still used to hit the recursion limit, before the - # non-recursive scheme was implemented. - self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001) - - def test_bug_612074(self): - pat=u"["+re.escape(u"\u2039")+u"]" - self.assertEqual(re.compile(pat) and 1, 1) - - def test_stack_overflow(self): - # nasty cases that used to overflow the straightforward recursive - # implementation of repeated groups. - self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x') - self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x') - self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x') - - def test_scanner(self): - def s_ident(scanner, token): return token - def s_operator(scanner, token): return "op%s" % token - def s_float(scanner, token): return float(token) - def s_int(scanner, token): return int(token) - - scanner = Scanner([ - (r"[a-zA-Z_]\w*", s_ident), - (r"\d+\.\d*", s_float), - (r"\d+", s_int), - (r"=|\+|-|\*|/", s_operator), - (r"\s+", None), - ]) - - self.assertNotEqual(scanner.scanner.scanner("").pattern, None) - - self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"), - (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5, - 'op+', 'bar'], '')) - - def test_bug_448951(self): - # bug 448951 (similar to 429357, but with single char match) - # (Also test greedy matches.) 
- for op in '','?','*': - self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(), - (None, None)) - self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(), - ('a:', 'a')) - - def test_bug_725106(self): - # capturing groups in alternatives in repeats - self.assertEqual(re.match('^((a)|b)*', 'abc').groups(), - ('b', 'a')) - self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(), - ('c', 'b')) - self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(), - ('b', None)) - self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(), - ('b', None)) - self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(), - ('b', 'a')) - self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(), - ('c', 'b')) - self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(), - ('b', None)) - self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(), - ('b', None)) - - def test_bug_725149(self): - # mark_stack_base restoring before restoring marks - self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(), - ('a', None)) - self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(), - ('a', None, None)) - - def test_bug_764548(self): - # bug 764548, re.compile() barfs on str/unicode subclasses - try: - unicode - except NameError: - return # no problem if we have no unicode - class my_unicode(unicode): pass - pat = re.compile(my_unicode("abc")) - self.assertEqual(pat.match("xyz"), None) - - def test_finditer(self): - iter = re.finditer(r":+", "a:b::c:::d") - self.assertEqual([item.group(0) for item in iter], - [":", "::", ":::"]) - - def test_bug_926075(self): - try: - unicode - except NameError: - return # no problem if we have no unicode - self.assert_(re.compile('bug_926075') is not - re.compile(eval("u'bug_926075'"))) - - def test_bug_931848(self): - try: - unicode - except NameError: - pass - pattern = eval('u"[\u002E\u3002\uFF0E\uFF61]"') - self.assertEqual(re.compile(pattern).split("a.b.c"), - ['a','b','c']) - - def test_bug_581080(self): - iter = 
re.finditer(r"\s", "a b") - self.assertEqual(iter.next().span(), (1,2)) - self.assertRaises(StopIteration, iter.next) - - scanner = re.compile(r"\s").scanner("a b") - self.assertEqual(scanner.search().span(), (1, 2)) - self.assertEqual(scanner.search(), None) - - def test_bug_817234(self): - iter = re.finditer(r".*", "asdf") - self.assertEqual(iter.next().span(), (0, 4)) - self.assertEqual(iter.next().span(), (4, 4)) - self.assertRaises(StopIteration, iter.next) - - def test_empty_array(self): - # SF buf 1647541 - import array - for typecode in 'cbBuhHiIlLfd': - a = array.array(typecode) - self.assertEqual(re.compile("bla").match(a), None) - self.assertEqual(re.compile("").match(a).groups(), ()) - - def test_inline_flags(self): - # Bug #1700 - upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Bellow - lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Bellow - - p = re.compile(upper_char, re.I | re.U) - q = p.match(lower_char) - self.assertNotEqual(q, None) - - p = re.compile(lower_char, re.I | re.U) - q = p.match(upper_char) - self.assertNotEqual(q, None) - - p = re.compile('(?i)' + upper_char, re.U) - q = p.match(lower_char) - self.assertNotEqual(q, None) - - p = re.compile('(?i)' + lower_char, re.U) - q = p.match(upper_char) - self.assertNotEqual(q, None) - - p = re.compile('(?iu)' + upper_char) - q = p.match(lower_char) - self.assertNotEqual(q, None) - - p = re.compile('(?iu)' + lower_char) - q = p.match(upper_char) - self.assertNotEqual(q, None) - - def test_dollar_matches_twice(self): - "$ matches the end of string, and just before the terminating \n" - pattern = re.compile('$') - self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') - self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#') - self.assertEqual(pattern.sub('#', '\n'), '#\n#') - - pattern = re.compile('$', re.MULTILINE) - self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' ) - self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') - 
self.assertEqual(pattern.sub('#', '\n'), '#\n#') - - -def run_re_tests(): - from test.re_tests import benchmarks, tests, SUCCEED, FAIL, SYNTAX_ERROR - if verbose: - print 'Running re_tests test suite' - else: - # To save time, only run the first and last 10 tests - #tests = tests[:10] + tests[-10:] - pass - - for t in tests: - sys.stdout.flush() - pattern = s = outcome = repl = expected = None - if len(t) == 5: - pattern, s, outcome, repl, expected = t - elif len(t) == 3: - pattern, s, outcome = t - else: - raise ValueError, ('Test tuples should have 3 or 5 fields', t) - - try: - obj = re.compile(pattern) - except re.error: - if outcome == SYNTAX_ERROR: pass # Expected a syntax error - else: - print '=== Syntax error:', t - except KeyboardInterrupt: raise KeyboardInterrupt - except: - print '*** Unexpected error ***', t - if verbose: - traceback.print_exc(file=sys.stdout) - else: - try: - result = obj.search(s) - except re.error, msg: - print '=== Unexpected exception', t, repr(msg) - if outcome == SYNTAX_ERROR: - # This should have been a syntax error; forget it. - pass - elif outcome == FAIL: - if result is None: pass # No match, as expected - else: print '=== Succeeded incorrectly', t - elif outcome == SUCCEED: - if result is not None: - # Matched, as expected, so now we compute the - # result string and compare it to our expected result. 
- start, end = result.span(0) - vardict={'found': result.group(0), - 'groups': result.group(), - 'flags': result.re.flags} - for i in range(1, 100): - try: - gi = result.group(i) - # Special hack because else the string concat fails: - if gi is None: - gi = "None" - except IndexError: - gi = "Error" - vardict['g%d' % i] = gi - for i in result.re.groupindex.keys(): - try: - gi = result.group(i) - if gi is None: - gi = "None" - except IndexError: - gi = "Error" - vardict[i] = gi - repl = eval(repl, vardict) - if repl != expected: - print '=== grouping error', t, - print repr(repl) + ' should be ' + repr(expected) - else: - print '=== Failed incorrectly', t - - # Try the match on a unicode string, and check that it - # still succeeds. - try: - result = obj.search(unicode(s, "latin-1")) - if result is None: - print '=== Fails on unicode match', t - except NameError: - continue # 1.5.2 - except TypeError: - continue # unicode test case - - # Try the match on a unicode pattern, and check that it - # still succeeds. - obj=re.compile(unicode(pattern, "latin-1")) - result = obj.search(s) - if result is None: - print '=== Fails on unicode pattern match', t - - # Try the match with the search area limited to the extent - # of the match and see if it still succeeds. \B will - # break (because it won't match at the end or start of a - # string), so we'll ignore patterns that feature it. - - if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \ - and result is not None: - obj = re.compile(pattern) - result = obj.search(s, result.start(0), result.end(0) + 1) - if result is None: - print '=== Failed on range-limited match', t - - # Try the match with IGNORECASE enabled, and check that it - # still succeeds. - obj = re.compile(pattern, re.IGNORECASE) - result = obj.search(s) - if result is None: - print '=== Fails on case-insensitive match', t - - # Try the match with LOCALE enabled, and check that it - # still succeeds. 
- obj = re.compile(pattern, re.LOCALE) - result = obj.search(s) - if result is None: - print '=== Fails on locale-sensitive match', t - - # Try the match with UNICODE locale enabled, and check - # that it still succeeds. - obj = re.compile(pattern, re.UNICODE) - result = obj.search(s) - if result is None: - print '=== Fails on unicode-sensitive match', t - -def test_main(): - run_unittest(ReTests) - run_re_tests() - -if __name__ == "__main__": - test_main() +import sys +sys.path = ['.'] + sys.path + +from test.test_support import verbose, run_unittest +import re +from re import Scanner +import sys, os, traceback +from weakref import proxy +import unicodedata + +# Misc tests from Tim Peters' re.doc + +# WARNING: Don't change details in these tests if you don't know +# what you're doing. Some of these tests were carefuly modeled to +# cover most of the code. + +import unittest + +class ReTests(unittest.TestCase): + + def test_weakref(self): + s = 'QabbbcR' + x = re.compile('ab+c') + y = proxy(x) + self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR')) + + def test_search_star_plus(self): + self.assertEqual(re.search('x*', 'axx').span(0), (0, 0)) + self.assertEqual(re.search('x*', 'axx').span(), (0, 0)) + self.assertEqual(re.search('x+', 'axx').span(0), (1, 3)) + self.assertEqual(re.search('x+', 'axx').span(), (1, 3)) + self.assertEqual(re.search('x', 'aaa'), None) + self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0)) + self.assertEqual(re.match('a*', 'xxx').span(), (0, 0)) + self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3)) + self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3)) + self.assertEqual(re.match('a+', 'xxx'), None) + + def bump_num(self, matchobj): + int_value = int(matchobj.group(0)) + return str(int_value + 1) + + def test_basic_re_sub(self): + self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x') + self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'), + '9.3 -3 24x100y') + self.assertEqual(re.sub(r'\d+', 
self.bump_num, '08.2 -2 23x99y', 3), + '9.3 -3 23x99y') + + self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n') + self.assertEqual(re.sub('.', r"\n", 'x'), '\n') + + s = r"\1\1" + self.assertEqual(re.sub('(.)', s, 'x'), 'xx') + self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s) + self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s) + + self.assertEqual(re.sub('(?Px)', '\g\g', 'xx'), 'xxxx') + self.assertEqual(re.sub('(?Px)', '\g\g<1>', 'xx'), 'xxxx') + self.assertEqual(re.sub('(?Px)', '\g\g', 'xx'), 'xxxx') + self.assertEqual(re.sub('(?Px)', '\g<1>\g<1>', 'xx'), 'xxxx') + + self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'), + '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D') + self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a') + self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), + (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7))) + + self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest') + + def test_bug_449964(self): + # fails for group followed by other escape + self.assertEqual(re.sub(r'(?Px)', '\g<1>\g<1>\\b', 'xx'), + 'xx\bxx\b') + + def test_bug_449000(self): + # Test for sub() on escaped characters + self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'), + 'abc\ndef\n') + self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'), + 'abc\ndef\n') + self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'), + 'abc\ndef\n') + self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'), + 'abc\ndef\n') + + def test_bug_1140(self): + # re.sub(x, y, u'') should return u'', not '', and + # re.sub(x, y, '') should return '', not u''. + # Also: + # re.sub(x, y, unicode(x)) should return unicode(y), and + # re.sub(x, y, str(x)) should return + # str(y) if isinstance(y, str) else unicode(y). 
+ for x in 'x', u'x': + for y in 'y', u'y': + z = re.sub(x, y, u'') + self.assertEqual(z, u'') + self.assertEqual(type(z), unicode) + # + z = re.sub(x, y, '') + self.assertEqual(z, '') + self.assertEqual(type(z), str) + # + z = re.sub(x, y, unicode(x)) + self.assertEqual(z, y) + self.assertEqual(type(z), unicode) + # + z = re.sub(x, y, str(x)) + self.assertEqual(z, y) + self.assertEqual(type(z), type(y)) + + def test_bug_1661(self): + # Verify that flags do not get silently ignored with compiled patterns + pattern = re.compile('.') + self.assertRaises(ValueError, re.match, pattern, 'A', re.I) + self.assertRaises(ValueError, re.search, pattern, 'A', re.I) + self.assertRaises(ValueError, re.findall, pattern, 'A', re.I) + self.assertRaises(ValueError, re.compile, pattern, re.I) + + def test_bug_3629(self): + # A regex that triggered a bug in the sre-code validator + re.compile("(?P)(?(quote))") + + def test_sub_template_numeric_escape(self): + # bug 776311 and friends + self.assertEqual(re.sub('x', r'\0', 'x'), '\0') + self.assertEqual(re.sub('x', r'\000', 'x'), '\000') + self.assertEqual(re.sub('x', r'\001', 'x'), '\001') + self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8') + self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9') + self.assertEqual(re.sub('x', r'\111', 'x'), '\111') + self.assertEqual(re.sub('x', r'\117', 'x'), '\117') + + self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111') + self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1') + + self.assertEqual(re.sub('x', r'\00', 'x'), '\x00') + self.assertEqual(re.sub('x', r'\07', 'x'), '\x07') + self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8') + self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9') + self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a') + + self.assertEqual(re.sub('x', r'\400', 'x'), '\0') + self.assertEqual(re.sub('x', r'\777', 'x'), '\377') + + self.assertRaises(re.error, re.sub, 'x', r'\1', 'x') + self.assertRaises(re.error, re.sub, 'x', r'\8', 'x') + 
self.assertRaises(re.error, re.sub, 'x', r'\9', 'x') + self.assertRaises(re.error, re.sub, 'x', r'\11', 'x') + self.assertRaises(re.error, re.sub, 'x', r'\18', 'x') + self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x') + self.assertRaises(re.error, re.sub, 'x', r'\90', 'x') + self.assertRaises(re.error, re.sub, 'x', r'\99', 'x') + self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8' + self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x') + self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1' + self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0' + + # in python2.3 (etc), these loop endlessly in sre_parser.py + self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x') + self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'), + 'xz8') + self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'), + 'xza') + + def test_qualified_re_sub(self): + self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb') + self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa') + + def test_bug_114660(self): + self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'), + 'hello there') + + def test_bug_462270(self): + # Test for empty sub() behaviour, see SF bug #462270 + self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-') + self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d') + + def test_symbolic_refs(self): + self.assertRaises(re.error, re.sub, '(?Px)', '\gx)', '\g<', 'xx') + self.assertRaises(re.error, re.sub, '(?Px)', '\g', 'xx') + self.assertRaises(re.error, re.sub, '(?Px)', '\g', 'xx') + self.assertRaises(re.error, re.sub, '(?Px)', '\g<1a1>', 'xx') + self.assertRaises(re.error, re.sub, '(?Px)', '\g', 'xx') + self.assertEqual(re.sub('(?Px)|(?Py)', '\g', 'xx'), '') + self.assertEqual(re.sub('(?Px)|(?Py)', '\\2', 'xx'), '') + self.assertRaises(re.error, re.sub, '(?Px)', '\g<-1>', 'xx') + + def test_re_subn(self): + self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) + 
self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1)) + self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0)) + self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4)) + self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2)) + + def test_re_split(self): + self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c']) + self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c']) + self.assertEqual(re.split("(:*)", ":a:b::c"), + ['', ':', 'a', ':', 'b', '::', 'c']) + self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c']) + self.assertEqual(re.split("(:)*", ":a:b::c"), + ['', ':', 'a', ':', 'b', ':', 'c']) + self.assertEqual(re.split("([b:]+)", ":a:b::c"), + ['', ':', 'a', ':b::', 'c']) + self.assertEqual(re.split("(b)|(:+)", ":a:b::c"), + ['', None, ':', 'a', None, ':', '', 'b', None, '', + None, '::', 'c']) + self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"), + ['', 'a', '', '', 'c']) + self.assertEqual(re.split("(?z):*", ":a:b::c"), ['', 'a', 'b', 'c', '']) + + def test_qualified_re_split(self): + self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c']) + self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d']) + self.assertEqual(re.split("(:)", ":a:b::c", 2), + ['', ':', 'a', ':', 'b::c']) + self.assertEqual(re.split("(:*)", ":a:b::c", 2), + ['', ':', 'a', ':', 'b::c']) + + def test_re_findall(self): + self.assertEqual(re.findall(":+", "abc"), []) + self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"]) + self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"]) + self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""), + (":", ":"), + (":", "::")]) + + def test_bug_117612(self): + self.assertEqual(re.findall(r"(a|(b))", "aba"), + [("a", ""),("b", "b"),("a", "")]) + + def test_re_match(self): + self.assertEqual(re.match('a', 'a').groups(), ()) + self.assertEqual(re.match('(a)', 'a').groups(), ('a',)) + self.assertEqual(re.match(r'(a)', 'a').group(0), 'a') + 
self.assertEqual(re.match(r'(a)', 'a').group(1), 'a') + self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a')) + + pat = re.compile('((a)|(b))(c)?') + self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None)) + self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None)) + self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c')) + self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c')) + self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c')) + + # A single group + m = re.match('(a)', 'a') + self.assertEqual(m.group(0), 'a') + self.assertEqual(m.group(0), 'a') + self.assertEqual(m.group(1), 'a') + self.assertEqual(m.group(1, 1), ('a', 'a')) + + pat = re.compile('(?:(?Pa)|(?Pb))(?Pc)?') + self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None)) + self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'), + (None, 'b', None)) + self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c')) + + def test_re_groupref_exists(self): + self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(), + ('(', 'a')) + self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(), + (None, 'a')) + self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'), None) + self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a'), None) + self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(), + ('a', 'b')) + self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(), + (None, 'd')) + self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(), + (None, 'd')) + self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(), + ('a', '')) + + # Tests for bug #1177831: exercise groups other than the first group + p = re.compile('(?Pa)(?Pb)?((?(g2)c|d))') + self.assertEqual(p.match('abc').groups(), + ('a', 'b', 'c')) + self.assertEqual(p.match('ad').groups(), + ('a', None, 'd')) + self.assertEqual(p.match('abd'), None) + self.assertEqual(p.match('ac'), None) + + + def 
test_re_groupref(self): + self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(), + ('|', 'a')) + self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(), + (None, 'a')) + self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', 'a|'), None) + self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a'), None) + self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(), + ('a', 'a')) + self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(), + (None, None)) + + def test_groupdict(self): + self.assertEqual(re.match('(?Pfirst) (?Psecond)', + 'first second').groupdict(), + {'first':'first', 'second':'second'}) + + def test_expand(self): + self.assertEqual(re.match("(?Pfirst) (?Psecond)", + "first second") + .expand(r"\2 \1 \g \g"), + "second first second first") + + def test_repeat_minmax(self): + self.assertEqual(re.match("^(\w){1}$", "abc"), None) + self.assertEqual(re.match("^(\w){1}?$", "abc"), None) + self.assertEqual(re.match("^(\w){1,2}$", "abc"), None) + self.assertEqual(re.match("^(\w){1,2}?$", "abc"), None) + + self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c") + self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c") + self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c") + self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c") + self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c") + self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c") + self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c") + self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c") + + self.assertEqual(re.match("^x{1}$", "xxx"), None) + self.assertEqual(re.match("^x{1}?$", "xxx"), None) + self.assertEqual(re.match("^x{1,2}$", "xxx"), None) + self.assertEqual(re.match("^x{1,2}?$", "xxx"), None) + + self.assertNotEqual(re.match("^x{3}$", "xxx"), None) + self.assertNotEqual(re.match("^x{1,3}$", "xxx"), None) + self.assertNotEqual(re.match("^x{1,4}$", "xxx"), None) + self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), 
None) + self.assertNotEqual(re.match("^x{3}?$", "xxx"), None) + self.assertNotEqual(re.match("^x{1,3}?$", "xxx"), None) + self.assertNotEqual(re.match("^x{1,4}?$", "xxx"), None) + self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None) + + self.assertEqual(re.match("^x{}$", "xxx"), None) + self.assertNotEqual(re.match("^x{}$", "x{}"), None) + + def test_getattr(self): + self.assertEqual(re.match("(a)", "a").pos, 0) + self.assertEqual(re.match("(a)", "a").endpos, 1) + self.assertEqual(re.match("(a)", "a").string, "a") + self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1))) + self.assertNotEqual(re.match("(a)", "a").re, None) + + def test_special_escapes(self): + self.assertEqual(re.search(r"\b(b.)\b", + "abcd abc bcd bx").group(1), "bx") + self.assertEqual(re.search(r"\B(b.)\B", + "abc bcd bc abxd").group(1), "bx") + self.assertEqual(re.search(r"\b(b.)\b", + "abcd abc bcd bx", re.LOCALE).group(1), "bx") + self.assertEqual(re.search(r"\B(b.)\B", + "abc bcd bc abxd", re.LOCALE).group(1), "bx") + self.assertEqual(re.search(r"\b(b.)\b", + "abcd abc bcd bx", re.UNICODE).group(1), "bx") + self.assertEqual(re.search(r"\B(b.)\B", + "abc bcd bc abxd", re.UNICODE).group(1), "bx") + self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc") + self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc") + self.assertEqual(re.search(r"^\Aabc\Z$", "\nabc\n", re.M), None) + self.assertEqual(re.search(r"\b(b.)\b", + u"abcd abc bcd bx").group(1), "bx") + self.assertEqual(re.search(r"\B(b.)\B", + u"abc bcd bc abxd").group(1), "bx") + self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc") + self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc") + self.assertEqual(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M), None) + self.assertEqual(re.search(r"\d\D\w\W\s\S", + "1aa! a").group(0), "1aa! a") + self.assertEqual(re.search(r"\d\D\w\W\s\S", + "1aa! a", re.LOCALE).group(0), "1aa! 
a") + self.assertEqual(re.search(r"\d\D\w\W\s\S", + "1aa! a", re.UNICODE).group(0), "1aa! a") + + def test_bigcharset(self): + self.assertEqual(re.match(u"([\u2222\u2223])", + u"\u2222").group(1), u"\u2222") + self.assertEqual(re.match(u"([\u2222\u2223])", + u"\u2222", re.UNICODE).group(1), u"\u2222") + + def test_anyall(self): + self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0), + "a\nb") + self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0), + "a\n\nb") + + def test_non_consuming(self): + self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a") + self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a") + self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a") + self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a") + self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a") + self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a") + self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a") + + self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a") + self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a") + self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a") + self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a") + + def test_ignore_case(self): + self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") + self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC") + self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b") + self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb") + self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b") + self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb") + self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a") + self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa") + self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a") + 
self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa") + + def test_category(self): + self.assertEqual(re.match(r"(\s)", " ").group(1), " ") + + def test_getlower(self): + import _sre + self.assertEqual(_sre.getlower(ord('A'), 0), ord('a')) + self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a')) + self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a')) + + self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") + self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC") + + def test_not_literal(self): + self.assertEqual(re.search("\s([^a])", " b").group(1), "b") + self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb") + + def test_search_coverage(self): + self.assertEqual(re.search("\s(b)", " b").group(1), "b") + self.assertEqual(re.search("a\s", "a ").group(0), "a ") + + def test_re_escape(self): + p="" + for i in range(0, 256): + p = p + chr(i) + self.assertEqual(re.match(re.escape(chr(i)), chr(i)) is not None, + True) + self.assertEqual(re.match(re.escape(chr(i)), chr(i)).span(), (0,1)) + + pat=re.compile(re.escape(p)) + self.assertEqual(pat.match(p) is not None, True) + self.assertEqual(pat.match(p).span(), (0,256)) + + def test_pickling(self): + import pickle + self.pickle_test(pickle) + import cPickle + self.pickle_test(cPickle) + # old pickles expect the _compile() reconstructor in sre module + import warnings + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "The sre module is deprecated", + DeprecationWarning) + from sre import _compile + + def pickle_test(self, pickle): + oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)') + s = pickle.dumps(oldpat) + newpat = pickle.loads(s) + self.assertEqual(oldpat, newpat) + + def test_constants(self): + self.assertEqual(re.I, re.IGNORECASE) + self.assertEqual(re.L, re.LOCALE) + self.assertEqual(re.M, re.MULTILINE) + self.assertEqual(re.S, re.DOTALL) + self.assertEqual(re.X, re.VERBOSE) + + def test_flags(self): + for flag in [re.I, re.M, 
re.X, re.S, re.L]: + self.assertNotEqual(re.compile('^pattern$', flag), None) + + def test_sre_character_literals(self): + for i in [0, 8, 16, 32, 64, 127, 128, 255]: + self.assertNotEqual(re.match(r"\%03o" % i, chr(i)), None) + self.assertNotEqual(re.match(r"\%03o0" % i, chr(i)+"0"), None) + self.assertNotEqual(re.match(r"\%03o8" % i, chr(i)+"8"), None) + self.assertNotEqual(re.match(r"\x%02x" % i, chr(i)), None) + self.assertNotEqual(re.match(r"\x%02x0" % i, chr(i)+"0"), None) + self.assertNotEqual(re.match(r"\x%02xz" % i, chr(i)+"z"), None) + self.assertRaises(re.error, re.match, "\911", "") + + def test_sre_character_class_literals(self): + for i in [0, 8, 16, 32, 64, 127, 128, 255]: + self.assertNotEqual(re.match(r"[\%03o]" % i, chr(i)), None) + self.assertNotEqual(re.match(r"[\%03o0]" % i, chr(i)), None) + self.assertNotEqual(re.match(r"[\%03o8]" % i, chr(i)), None) + self.assertNotEqual(re.match(r"[\x%02x]" % i, chr(i)), None) + self.assertNotEqual(re.match(r"[\x%02x0]" % i, chr(i)), None) + self.assertNotEqual(re.match(r"[\x%02xz]" % i, chr(i)), None) + self.assertRaises(re.error, re.match, "[\911]", "") + + def test_bug_113254(self): + self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1) + self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1) + self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1)) + + def test_bug_527371(self): + # bug described in patches 527371/672491 + self.assertEqual(re.match(r'(a)?a','a').lastindex, None) + self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1) + self.assertEqual(re.match(r'(?Pa)(?Pb)?b','ab').lastgroup, 'a') + self.assertEqual(re.match("(?Pa(b))", "ab").lastgroup, 'a') + self.assertEqual(re.match("((a))", "a").lastindex, 1) + + def test_bug_545855(self): + # bug 545855 -- This pattern failed to cause a compile error as it + # should, instead provoking a TypeError. + self.assertRaises(re.error, re.compile, 'foo[a-') + + def test_bug_418626(self): + # bugs 418626 et al.
-- Testing Greg Chapman's addition of op code + # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of + # pattern '*?' on a long string. + self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001) + self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0), + 20003) + self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001) + # non-simple '*?' still used to hit the recursion limit, before the + # non-recursive scheme was implemented. + self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001) + + def test_bug_612074(self): + pat=u"["+re.escape(u"\u2039")+u"]" + self.assertEqual(re.compile(pat) and 1, 1) + + def test_stack_overflow(self): + # nasty cases that used to overflow the straightforward recursive + # implementation of repeated groups. + self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x') + self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x') + self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x') + + def test_scanner(self): + def s_ident(scanner, token): return token + def s_operator(scanner, token): return "op%s" % token + def s_float(scanner, token): return float(token) + def s_int(scanner, token): return int(token) + + scanner = Scanner([ + (r"[a-zA-Z_]\w*", s_ident), + (r"\d+\.\d*", s_float), + (r"\d+", s_int), + (r"=|\+|-|\*|/", s_operator), + (r"\s+", None), + ]) + + self.assertNotEqual(scanner.scanner.scanner("").pattern, None) + + self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"), + (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5, + 'op+', 'bar'], '')) + + def test_bug_448951(self): + # bug 448951 (similar to 429357, but with single char match) + # (Also test greedy matches.) 
+ for op in '','?','*': + self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(), + (None, None)) + self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(), + ('a:', 'a')) + + def test_bug_725106(self): + # capturing groups in alternatives in repeats + self.assertEqual(re.match('^((a)|b)*', 'abc').groups(), + ('b', 'a')) + self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(), + ('c', 'b')) + self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(), + ('b', None)) + self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(), + ('b', None)) + self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(), + ('b', 'a')) + self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(), + ('c', 'b')) + self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(), + ('b', None)) + self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(), + ('b', None)) + + def test_bug_725149(self): + # mark_stack_base restoring before restoring marks + self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(), + ('a', None)) + self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(), + ('a', None, None)) + + def test_bug_764548(self): + # bug 764548, re.compile() barfs on str/unicode subclasses + try: + unicode + except NameError: + return # no problem if we have no unicode + class my_unicode(unicode): pass + pat = re.compile(my_unicode("abc")) + self.assertEqual(pat.match("xyz"), None) + + def test_finditer(self): + iter = re.finditer(r":+", "a:b::c:::d") + self.assertEqual([item.group(0) for item in iter], + [":", "::", ":::"]) + + def test_bug_926075(self): + try: + unicode + except NameError: + return # no problem if we have no unicode + self.assert_(re.compile('bug_926075') is not + re.compile(eval("u'bug_926075'"))) + + def test_bug_931848(self): + try: + unicode + except NameError: + pass + pattern = eval('u"[\u002E\u3002\uFF0E\uFF61]"') + self.assertEqual(re.compile(pattern).split("a.b.c"), + ['a','b','c']) + + def test_bug_581080(self): + iter = 
re.finditer(r"\s", "a b") + self.assertEqual(iter.next().span(), (1,2)) + self.assertRaises(StopIteration, iter.next) + + scanner = re.compile(r"\s").scanner("a b") + self.assertEqual(scanner.search().span(), (1, 2)) + self.assertEqual(scanner.search(), None) + + def test_bug_817234(self): + iter = re.finditer(r".*", "asdf") + self.assertEqual(iter.next().span(), (0, 4)) + self.assertEqual(iter.next().span(), (4, 4)) + self.assertRaises(StopIteration, iter.next) + + def test_empty_array(self): + # SF bug 1647541 + import array + for typecode in 'cbBuhHiIlLfd': + a = array.array(typecode) + self.assertEqual(re.compile("bla").match(a), None) + self.assertEqual(re.compile("").match(a).groups(), ()) + + def test_inline_flags(self): + # Bug #1700 + upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Below + lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Below + + p = re.compile(upper_char, re.I | re.U) + q = p.match(lower_char) + self.assertNotEqual(q, None) + + p = re.compile(lower_char, re.I | re.U) + q = p.match(upper_char) + self.assertNotEqual(q, None) + + p = re.compile('(?i)' + upper_char, re.U) + q = p.match(lower_char) + self.assertNotEqual(q, None) + + p = re.compile('(?i)' + lower_char, re.U) + q = p.match(upper_char) + self.assertNotEqual(q, None) + + p = re.compile('(?iu)' + upper_char) + q = p.match(lower_char) + self.assertNotEqual(q, None) + + p = re.compile('(?iu)' + lower_char) + q = p.match(upper_char) + self.assertNotEqual(q, None) + + def test_dollar_matches_twice(self): + "$ matches the end of string, and just before the terminating \n" + pattern = re.compile('$') + self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') + self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#') + self.assertEqual(pattern.sub('#', '\n'), '#\n#') + + pattern = re.compile('$', re.MULTILINE) + self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' ) + self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') +
self.assertEqual(pattern.sub('#', '\n'), '#\n#') + + def test_atomic(self): + pattern = re.compile(r'a(?>bc|b)c') + self.assertEqual(pattern.match('abc'), None) + self.assertNotEqual(pattern.match('abcc'), None) + self.assertEqual(re.match(r'(?>.*).', 'abc'), None) + self.assertNotEqual(re.match(r'(?>x)++', 'xxx'), None) + self.assertNotEqual(re.match(r'(?>x++)', 'xxx'), None) + self.assertEqual(re.match(r'(?>x)++x', 'xxx'), None) + self.assertEqual(re.match(r'(?>x++)x', 'xxx'), None) + + def test_bug_2537(self): + "nested repeat" + self.assertEqual(re.sub('((x|y)*)*', '(\\1, \\2)', 'xyyzy', 1), '(, y)zy') + self.assertEqual(re.sub('((x|y+)*)*', '(\\1, \\2)', 'xyyzy', 1), '(, yy)zy') + + def test_word_chars(self): + word_chars, all_chars = [], [] + accept_set = set(['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Nd', 'No', 'Mc', 'Me', 'Mn', 'Pc']) + for i in range(sys.maxunicode): + c = unichr(i) + if c == '_' or unicodedata.category(c) in accept_set: + word_chars.append(c) + all_chars.append(c) + word_chars = u''.join(word_chars) + found_chars = u''.join(re.findall(r'(?u)(\w)', u''.join(all_chars))) + self.assertEqual(found_chars, word_chars) + + def test_digit_chars(self): + digit_chars, all_chars = [], [] + accept_set = set(['Nd']) + for i in range(sys.maxunicode): + c = unichr(i) + if unicodedata.category(c) in accept_set: + digit_chars.append(c) + all_chars.append(c) + digit_chars = u''.join(digit_chars) + found_chars = u''.join(re.findall(r'(?u)(\d)', u''.join(all_chars))) + self.assertEqual(found_chars, digit_chars) + + def test_named_chars(self): + self.assertNotEqual(re.match(r"\N{LATIN CAPITAL LETTER A}", u"A"), None) + self.assertNotEqual(re.match(r"[\N{LATIN CAPITAL LETTER A}]", u"A"), None) + self.assertEqual(re.match(r"\N{LATIN CAPITAL LETTER A}", u"B"), None) + self.assertEqual(re.match(r"[\N{LATIN CAPITAL LETTER A}]", u"a"), None) + + def test_unicode_properties(self): + self.assertNotEqual(re.match(r"\p{Lu}", u"A"), None) + 
self.assertEqual(re.match(r"\p{Lu}", u"a"), None) + self.assertNotEqual(re.match(r"\p{L&}", u"A"), None) + + ascii_chars = "".join(chr(c) for c in range(0x0, 0x80)) + charsets = r""" +\p{Alnum} [\p{L&}\p{Nd}] [a-zA-Z0-9] +\p{Alpha} \p{L&} [a-zA-Z] +\p{ASCII} [\x00-\x7F] +\p{Blank} [\p{Zs}\t] [ \t] +\p{Cntrl} \p{Cc} [\x00-\x1F\x7F] +\p{Digit} \p{Nd} \d [0-9] +\p{Graph} [^\p{Z}\p{C}] [\x21-\x7E] +\p{Lower} \p{Ll} [a-z] +\p{Print} \P{C} [\x20-\x7E] +\p{Punct} [\p{P}\p{S}] [!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~] +\p{Space} [\p{Z}\t\r\n\v\f] \s [ \t\r\n\v\f] +\p{Upper} \p{Lu} [A-Z] + [\p{L}\p{N}\p{Pc}] \w [A-Za-z0-9_] +\p{XDigit} [A-Fa-f0-9] +""" + for line in charsets.splitlines(): + parts = [p.strip() for p in line.split(" ")] + parts = [p for p in parts if p] + if parts: + matched = [re.findall(p, ascii_chars, re.U) for p in parts] + self.assertEqual(self.all_same(matched), True) + + def all_same(self, items): + first = items[0] + return all(i == first for i in items[1 : ]) + +def run_re_tests(): + from test.re_tests import benchmarks, tests, SUCCEED, FAIL, SYNTAX_ERROR + if verbose: + print 'Running re_tests test suite' + else: + # To save time, only run the first and last 10 tests + #tests = tests[:10] + tests[-10:] + pass + + for t in tests: + sys.stdout.flush() + pattern = s = outcome = repl = expected = None + if len(t) == 5: + pattern, s, outcome, repl, expected = t + elif len(t) == 3: + pattern, s, outcome = t + else: + raise ValueError, ('Test tuples should have 3 or 5 fields', t) + + try: + obj = re.compile(pattern) + except re.error: + if outcome == SYNTAX_ERROR: pass # Expected a syntax error + else: + print '=== Syntax error:', t + except KeyboardInterrupt: raise KeyboardInterrupt + except: + print '*** Unexpected error ***', t + if verbose: + traceback.print_exc(file=sys.stdout) + else: + try: + result = obj.search(s) + except re.error, msg: + print '=== Unexpected exception', t, repr(msg) + if outcome == SYNTAX_ERROR: + # This should have been a syntax 
error; forget it. + pass + elif outcome == FAIL: + if result is None: pass # No match, as expected + else: print '=== Succeeded incorrectly', t + elif outcome == SUCCEED: + if result is not None: + # Matched, as expected, so now we compute the + # result string and compare it to our expected result. + start, end = result.span(0) + vardict={'found': result.group(0), + 'groups': result.group(), + 'flags': result.re.flags} + for i in range(1, 100): + try: + gi = result.group(i) + # Special hack because else the string concat fails: + if gi is None: + gi = "None" + except IndexError: + gi = "Error" + vardict['g%d' % i] = gi + for i in result.re.groupindex.keys(): + try: + gi = result.group(i) + if gi is None: + gi = "None" + except IndexError: + gi = "Error" + vardict[i] = gi + repl = eval(repl, vardict) + if repl != expected: + print '=== grouping error', t, + print repr(repl) + ' should be ' + repr(expected) + else: + print '=== Failed incorrectly', t + + # Try the match on a unicode string, and check that it + # still succeeds. + try: + result = obj.search(unicode(s, "latin-1")) + if result is None: + print '=== Fails on unicode match', t + except NameError: + continue # 1.5.2 + except TypeError: + continue # unicode test case + + # Try the match on a unicode pattern, and check that it + # still succeeds. + obj=re.compile(unicode(pattern, "latin-1")) + result = obj.search(s) + if result is None: + print '=== Fails on unicode pattern match', t + + # Try the match with the search area limited to the extent + # of the match and see if it still succeeds. \B will + # break (because it won't match at the end or start of a + # string), so we'll ignore patterns that feature it. 
+ + if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \ + and result is not None: + obj = re.compile(pattern) + result = obj.search(s, result.start(0), result.end(0) + 1) + if result is None: + print '=== Failed on range-limited match', t + + # Try the match with IGNORECASE enabled, and check that it + # still succeeds. + obj = re.compile(pattern, re.IGNORECASE) + result = obj.search(s) + if result is None: + print '=== Fails on case-insensitive match', t + + # Try the match with LOCALE enabled, and check that it + # still succeeds. + obj = re.compile(pattern, re.LOCALE) + result = obj.search(s) + if result is None: + print '=== Fails on locale-sensitive match', t + + # Try the match with UNICODE locale enabled, and check + # that it still succeeds. + obj = re.compile(pattern, re.UNICODE) + result = obj.search(s) + if result is None: + print '=== Fails on unicode-sensitive match', t + +def test_main(): + run_unittest(ReTests) + run_re_tests() + +if __name__ == "__main__": + test_main() === modified file Lib/sre_constants.py --- Lib/sre_constants.py 2004-08-25 02:22:30 +0000 +++ Lib/sre_constants.py 2009-02-03 19:09:29 +0000 @@ -13,11 +13,22 @@ # update when constants are added or removed -MAGIC = 20031017 +MAGIC = 20081218 -# max code word in this release - -MAXREPEAT = 65535 +import operator +import unicodedata +from collections import defaultdict + +# size of code word in this release +BYTES_PER_CODE = 4 +BITS_PER_CODE = 8 * BYTES_PER_CODE +MAXCODE = (1 << BITS_PER_CODE) - 1 + +MAXREPEAT = MAXCODE + +DIGITS = set("0123456789") +OCTDIGITS = set("01234567") +HEXDIGITS = set("0123456789abcdefABCDEF") # SRE standard exception (access as sre.error) # should this really be here? 
@@ -25,181 +36,126 @@ class error(Exception): pass -# operators +# list of all the operators +# the fields are: name, op_type, negative, directional, end_marker +# those with a negative form start with NOT_ +# those with a reverse directional form end with _REV +OPERATOR_LIST = """ +FAILURE INVALID N N - +SUCCESS INVALID N N - +ANY SIMPLE_CATEGORY N Y - +ANY_ALL SIMPLE_CATEGORY N Y - +ASSERT ASSERT N N END_ASSERT +ASSERT_NOT ASSERT N N END_ASSERT_NOT +ATOMIC ATOMIC N N END_ATOMIC +BOUNDARY POSITION Y N - +BRANCH BRANCH N N - +CATEGORY CATEGORY Y Y - +CHARSET CHARSET Y Y - +CHARSET_IGNORE CHARSET Y Y - +END_OF_LINE POSITION N N - +END_OF_STRING POSITION N N - +END_OF_STRING_LN POSITION N N - +GROUPREF GROUPREF N Y - +GROUPREF_EXISTS GROUPREF_EXISTS N N - +GROUPREF_IGNORE GROUPREF N Y - +JUMP INVALID N N - +LITERAL LITERAL Y Y - +LITERAL_IGNORE LITERAL Y Y - +LITERAL_STRING LITERAL_STRING N Y - +LITERAL_STRING_IGNORE LITERAL_STRING N Y - +MARK MARK N N - +RANGE RANGE Y Y - +RANGE_IGNORE RANGE Y Y - +REPEAT_MAX REPEAT N Y END_REPEAT_MAX +REPEAT_MIN REPEAT N Y END_REPEAT_MIN +REPEAT_ONE_MAX REPEAT_ONE N Y - +REPEAT_ONE_MIN REPEAT_ONE N Y - +REPEAT_ONE_POSS REPEAT_ONE N Y - +REPEAT_POSS REPEAT N Y END_REPEAT_POSS +SET SET Y Y - +SET_IGNORE SET Y Y - +START_OF_LINE POSITION N N - +START_OF_STRING POSITION N N - +SUBPATTERN INVALID N N - +""" + +# enumerate the operators +neg_prefix = {"N": [""], "Y": ["", "NOT_"]} +dir_suffix = {"N": [(0, "")], "Y": [(1, ""), (-1, "_REV")]} + +operator_list = [] +for line in OPERATOR_LIST.splitlines(): + fields = line.split() + if not fields: + continue + name, op_type, negative, directional, end_marker = fields + # some opcodes have a negative "NOT_x" form + for p in neg_prefix[negative]: + # some opcodes are directional; they have a reverse "x_REV" form + for d, s in dir_suffix[directional]: + operator_list.append((p + name + s, op_type, d, end_marker)) + if end_marker != "-": + operator_list.append((p + end_marker + s, "INVALID", d, 
"-")) + +# build a dict of positive<->negative opcodes +not_opcodes = [name for name, op_type, direction, end_marker in operator_list if name.startswith("NOT_")] +not_opcodes = dict([(name, name[4 : ]) for name in not_opcodes] + [(name[4 : ], name) for name in not_opcodes]) + +def not_op(op): + return not_opcodes[op[0]], op[1] + +# build a dict of normal<->ignore opcodes +ignore_opcodes = [name for name, op_type, direction, end_marker in operator_list if name.endswith("_IGNORE")] +ignore_opcodes = dict([(name, name[ : -7]) for name in ignore_opcodes] + [(name[ : -7], name) for name in ignore_opcodes]) +for op in ["CATEGORY", "NOT_CATEGORY"]: + ignore_opcodes[op] = op + +def ignore_op(op): + return ignore_opcodes[op[0]], op[1] + +# sort the operators (except FAILURE and SUCCESS) and assign opcode numbers +operator_list = operator_list[ : 2] + sorted(operator_list[2 : ]) +operator_list = [(name, number, op_type, direction, end_marker) for number, (name, op_type, direction, end_marker) in enumerate(operator_list)] + +# build the OPCODES dict +OPCODES = dict((name, number) for name, number, op_type, direction, end_marker in operator_list) -FAILURE = "failure" -SUCCESS = "success" +# collect the op_types +op_types = set(op_type for name, number, op_type, direction, end_marker in operator_list) + +# create an attribute in OP for each operator +class Record(object): + pass -ANY = "any" -ANY_ALL = "any_all" -ASSERT = "assert" -ASSERT_NOT = "assert_not" -AT = "at" -BIGCHARSET = "bigcharset" -BRANCH = "branch" -CALL = "call" -CATEGORY = "category" -CHARSET = "charset" -GROUPREF = "groupref" -GROUPREF_IGNORE = "groupref_ignore" -GROUPREF_EXISTS = "groupref_exists" -IN = "in" -IN_IGNORE = "in_ignore" -INFO = "info" -JUMP = "jump" -LITERAL = "literal" -LITERAL_IGNORE = "literal_ignore" -MARK = "mark" -MAX_REPEAT = "max_repeat" -MAX_UNTIL = "max_until" -MIN_REPEAT = "min_repeat" -MIN_UNTIL = "min_until" -NEGATE = "negate" -NOT_LITERAL = "not_literal" -NOT_LITERAL_IGNORE = 
"not_literal_ignore" -RANGE = "range" -REPEAT = "repeat" -REPEAT_ONE = "repeat_one" -SUBPATTERN = "subpattern" -MIN_REPEAT_ONE = "min_repeat_one" - -# positions -AT_BEGINNING = "at_beginning" -AT_BEGINNING_LINE = "at_beginning_line" -AT_BEGINNING_STRING = "at_beginning_string" -AT_BOUNDARY = "at_boundary" -AT_NON_BOUNDARY = "at_non_boundary" -AT_END = "at_end" -AT_END_LINE = "at_end_line" -AT_END_STRING = "at_end_string" -AT_LOC_BOUNDARY = "at_loc_boundary" -AT_LOC_NON_BOUNDARY = "at_loc_non_boundary" -AT_UNI_BOUNDARY = "at_uni_boundary" -AT_UNI_NON_BOUNDARY = "at_uni_non_boundary" - -# categories -CATEGORY_DIGIT = "category_digit" -CATEGORY_NOT_DIGIT = "category_not_digit" -CATEGORY_SPACE = "category_space" -CATEGORY_NOT_SPACE = "category_not_space" -CATEGORY_WORD = "category_word" -CATEGORY_NOT_WORD = "category_not_word" -CATEGORY_LINEBREAK = "category_linebreak" -CATEGORY_NOT_LINEBREAK = "category_not_linebreak" -CATEGORY_LOC_WORD = "category_loc_word" -CATEGORY_LOC_NOT_WORD = "category_loc_not_word" -CATEGORY_UNI_DIGIT = "category_uni_digit" -CATEGORY_UNI_NOT_DIGIT = "category_uni_not_digit" -CATEGORY_UNI_SPACE = "category_uni_space" -CATEGORY_UNI_NOT_SPACE = "category_uni_not_space" -CATEGORY_UNI_WORD = "category_uni_word" -CATEGORY_UNI_NOT_WORD = "category_uni_not_word" -CATEGORY_UNI_LINEBREAK = "category_uni_linebreak" -CATEGORY_UNI_NOT_LINEBREAK = "category_uni_not_linebreak" - -OPCODES = [ - - # failure=0 success=1 (just because it looks better that way :-) - FAILURE, SUCCESS, - - ANY, ANY_ALL, - ASSERT, ASSERT_NOT, - AT, - BRANCH, - CALL, - CATEGORY, - CHARSET, BIGCHARSET, - GROUPREF, GROUPREF_EXISTS, GROUPREF_IGNORE, - IN, IN_IGNORE, - INFO, - JUMP, - LITERAL, LITERAL_IGNORE, - MARK, - MAX_UNTIL, - MIN_UNTIL, - NOT_LITERAL, NOT_LITERAL_IGNORE, - NEGATE, - RANGE, - REPEAT, - REPEAT_ONE, - SUBPATTERN, - MIN_REPEAT_ONE - -] - -ATCODES = [ - AT_BEGINNING, AT_BEGINNING_LINE, AT_BEGINNING_STRING, AT_BOUNDARY, - AT_NON_BOUNDARY, AT_END, AT_END_LINE, 
AT_END_STRING, - AT_LOC_BOUNDARY, AT_LOC_NON_BOUNDARY, AT_UNI_BOUNDARY, - AT_UNI_NON_BOUNDARY -] - -CHCODES = [ - CATEGORY_DIGIT, CATEGORY_NOT_DIGIT, CATEGORY_SPACE, - CATEGORY_NOT_SPACE, CATEGORY_WORD, CATEGORY_NOT_WORD, - CATEGORY_LINEBREAK, CATEGORY_NOT_LINEBREAK, CATEGORY_LOC_WORD, - CATEGORY_LOC_NOT_WORD, CATEGORY_UNI_DIGIT, CATEGORY_UNI_NOT_DIGIT, - CATEGORY_UNI_SPACE, CATEGORY_UNI_NOT_SPACE, CATEGORY_UNI_WORD, - CATEGORY_UNI_NOT_WORD, CATEGORY_UNI_LINEBREAK, - CATEGORY_UNI_NOT_LINEBREAK -] - -def makedict(list): - d = {} - i = 0 - for item in list: - d[item] = i - i = i + 1 - return d - -OPCODES = makedict(OPCODES) -ATCODES = makedict(ATCODES) -CHCODES = makedict(CHCODES) - -# replacement operations for "ignore case" mode -OP_IGNORE = { - GROUPREF: GROUPREF_IGNORE, - IN: IN_IGNORE, - LITERAL: LITERAL_IGNORE, - NOT_LITERAL: NOT_LITERAL_IGNORE -} - -AT_MULTILINE = { - AT_BEGINNING: AT_BEGINNING_LINE, - AT_END: AT_END_LINE -} - -AT_LOCALE = { - AT_BOUNDARY: AT_LOC_BOUNDARY, - AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY -} - -AT_UNICODE = { - AT_BOUNDARY: AT_UNI_BOUNDARY, - AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY -} - -CH_LOCALE = { - CATEGORY_DIGIT: CATEGORY_DIGIT, - CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT, - CATEGORY_SPACE: CATEGORY_SPACE, - CATEGORY_NOT_SPACE: CATEGORY_NOT_SPACE, - CATEGORY_WORD: CATEGORY_LOC_WORD, - CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD, - CATEGORY_LINEBREAK: CATEGORY_LINEBREAK, - CATEGORY_NOT_LINEBREAK: CATEGORY_NOT_LINEBREAK -} - -CH_UNICODE = { - CATEGORY_DIGIT: CATEGORY_UNI_DIGIT, - CATEGORY_NOT_DIGIT: CATEGORY_UNI_NOT_DIGIT, - CATEGORY_SPACE: CATEGORY_UNI_SPACE, - CATEGORY_NOT_SPACE: CATEGORY_UNI_NOT_SPACE, - CATEGORY_WORD: CATEGORY_UNI_WORD, - CATEGORY_NOT_WORD: CATEGORY_UNI_NOT_WORD, - CATEGORY_LINEBREAK: CATEGORY_UNI_LINEBREAK, - CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK -} +OP = Record() +for name in OPCODES: + setattr(OP, name, name) + +# unicode codepoint categories (property "\p{Lu}", etc) +# (these entries must have certain 
fixed values) +UNI_CATEGORY_LIST = "- Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co - Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So -" + +# additional unicode categories (property "\p{Alpha}", etc) +COMMON_CATEGORY_LIST = "Alpha Alnum ASCII Blank Cntrl Digit Graph LineBreak Lower Print Punct Space Upper Word XDigit" + +# build the unicode categories dict +CATEGORIES = dict((name, value) for value, name in enumerate(UNI_CATEGORY_LIST.split()) if name != "-") +assert len(CATEGORIES) <= 0x20 + +# add the unicode supercategories (property "\p{L&}", etc) +category_number = 0x20 +for name in UNI_CATEGORY_LIST.split(): + if name == "-" or name[0] in CATEGORIES: + continue + CATEGORIES[name[0]] = category_number + CATEGORIES[name[0] + "&"] = category_number + category_number += 1 + +COMMON_CATEGORY_START = category_number +for name in COMMON_CATEGORY_LIST.split(): + CATEGORIES[name] = category_number + category_number += 1 # flags SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking) @@ -210,6 +166,8 @@ SRE_FLAG_UNICODE = 32 # use unicode locale SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments SRE_FLAG_DEBUG = 128 # debugging +SRE_FLAG_REVERSE = 256 # search backwards +SRE_FLAG_ZEROWIDTH = 512 # permit split on zero-width # flags for INFO primitive SRE_INFO_PREFIX = 1 # has prefix @@ -217,12 +175,8 @@ SRE_INFO_CHARSET = 4 # pattern starts with character from given set if __name__ == "__main__": - def dump(f, d, prefix): - items = d.items() - items.sort(key=lambda a: a[1]) - for k, v in items: - f.write("#define %s_%s %s\n" % (prefix, k.upper(), v)) - f = open("sre_constants.h", "w") + f = open("sre_constants.h", "wb") + f.write("""\ /* * Secret Labs' Regular Expression Engine @@ -240,22 +194,84 @@ """) f.write("#define SRE_MAGIC %d\n" % MAGIC) + f.write("\n") + f.write("/* size of a code word (must be unsigned short or larger, and\n") + f.write(" large enough to hold a Py_UNICODE character) */\n") + if BYTES_PER_CODE == 4: + f.write("typedef unsigned int 
SRE_CODE;\n") + else: + f.write("typedef unsigned short SRE_CODE;\n") + + f.write("\n") + f.write("#define SRE_BYTES_PER_CODE %d\n" % BYTES_PER_CODE) + f.write("#define SRE_BITS_PER_CODE %d\n" % BITS_PER_CODE) + f.write("#define SRE_UNLIMITED_REPEATS 0x%X\n" % MAXREPEAT) + + f.write("\n") + for name, number, op_type, direction, end_marker in operator_list: + f.write("#define SRE_OP_%s %d\n" % (name, number)) + f.write("#define SRE_MAX_OP %d\n" % (len(operator_list) - 1)) + + f.write("\n") + f.write("#define SRE_FLAG_TEMPLATE 0x%X\n" % SRE_FLAG_TEMPLATE) + f.write("#define SRE_FLAG_IGNORECASE 0x%X\n" % SRE_FLAG_IGNORECASE) + f.write("#define SRE_FLAG_LOCALE 0x%X\n" % SRE_FLAG_LOCALE) + f.write("#define SRE_FLAG_MULTILINE 0x%X\n" % SRE_FLAG_MULTILINE) + f.write("#define SRE_FLAG_DOTALL 0x%X\n" % SRE_FLAG_DOTALL) + f.write("#define SRE_FLAG_UNICODE 0x%X\n" % SRE_FLAG_UNICODE) + f.write("#define SRE_FLAG_VERBOSE 0x%X\n" % SRE_FLAG_VERBOSE) + f.write("#define SRE_FLAG_REVERSE 0x%X\n" % SRE_FLAG_REVERSE) + f.write("#define SRE_FLAG_ZEROWIDTH 0x%X\n" % SRE_FLAG_ZEROWIDTH) + + f.write("\n") + f.write("#define SRE_INFO_PREFIX 0x%X\n" % SRE_INFO_PREFIX) + f.write("#define SRE_INFO_LITERAL 0x%X\n" % SRE_INFO_LITERAL) + f.write("#define SRE_INFO_CHARSET 0x%X\n" % SRE_INFO_CHARSET) + + f.write("\n") + _categories = sorted(CATEGORIES.items(), key=operator.itemgetter(1)) + for name, value in ((name, value) for name, value in _categories if value < COMMON_CATEGORY_START): + if name.isalnum(): + f.write("#define SRE_UNI_CAT_%s 0x%X\n" % (name, value)) + + f.write("\n") + for name, value in ((name, value) for name, value in _categories if value >= COMMON_CATEGORY_START): + f.write("#define SRE_CAT_%s 0x%X\n" % (name, value)) + + f.write("\n") + groups = defaultdict(int) + for name, value in ((name, value) for name, value in _categories if value < COMMON_CATEGORY_START): + if len(name) == 2 and name.isalpha(): + groups[name[ : 1]] |= 1 << value + + for name, value in 
sorted(groups.items()): + f.write("#define SRE_CAT_GROUP_%s 0x%08X\n" % (name, value)) + + f.write(""" +// info for operator validation +typedef struct SRE_OpInfo { + int type; + int direction; + int end_marker; +} SRE_OpInfo; - dump(f, OPCODES, "SRE_OP") - dump(f, ATCODES, "SRE") - dump(f, CHCODES, "SRE") - - f.write("#define SRE_FLAG_TEMPLATE %d\n" % SRE_FLAG_TEMPLATE) - f.write("#define SRE_FLAG_IGNORECASE %d\n" % SRE_FLAG_IGNORECASE) - f.write("#define SRE_FLAG_LOCALE %d\n" % SRE_FLAG_LOCALE) - f.write("#define SRE_FLAG_MULTILINE %d\n" % SRE_FLAG_MULTILINE) - f.write("#define SRE_FLAG_DOTALL %d\n" % SRE_FLAG_DOTALL) - f.write("#define SRE_FLAG_UNICODE %d\n" % SRE_FLAG_UNICODE) - f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE) - - f.write("#define SRE_INFO_PREFIX %d\n" % SRE_INFO_PREFIX) - f.write("#define SRE_INFO_LITERAL %d\n" % SRE_INFO_LITERAL) - f.write("#define SRE_INFO_CHARSET %d\n" % SRE_INFO_CHARSET) +""") + # sort the op_types (putting "INVALID" first) and assign numbers + op_types = [(name, number) for number, name in enumerate(sorted(op_types, key=lambda name: ("" if name == "INVALID" else name)))] + for name, number in op_types: + f.write("#define SRE_TYPE_%s %d\n" % (name, number)) + + op_types = dict(op_types) + f.write(""" +static SRE_OpInfo op_info[] = { +""") + for name, number, op_type, direction, end_marker in operator_list: + if end_marker == "-": + end_marker = "0" + else: + end_marker = "SRE_OP_%s" % end_marker + f.write(" {%s, %s, %s}, // SRE_OP_%s\n" % (op_types[op_type], direction, end_marker, name)) + f.write("};\n") f.close() print "done" === modified file Lib/sre_compile.py --- Lib/sre_compile.py 2008-04-08 21:27:42 +0000 +++ Lib/sre_compile.py 2009-02-03 19:10:17 +0000 @@ -3,7 +3,7 @@ # # convert template to internal format # -# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. +# Copyright (c) 1997-2001 by Secret Labs MAGICAB. All rights reserved. 
# # See the sre.py file for information on usage and redistribution. # @@ -11,281 +11,263 @@ """Internal support module for sre""" import _sre, sys -import sre_parse + from sre_constants import * -assert _sre.MAGIC == MAGIC, "SRE module mismatch" +assert _sre.MAGIC == MAGIC, "MAGICSRE module mismatch" + +ASSERT_OP_CODES = { + OP.ASSERT: OP.END_ASSERT, + OP.ASSERT_NOT: OP.END_ASSERT_NOT, +} + +REPEAT_OP_CODES = { + OP.REPEAT_MAX: OP.END_REPEAT_MAX, + OP.REPEAT_MIN: OP.END_REPEAT_MIN, + OP.REPEAT_POSS: OP.END_REPEAT_POSS, +} + +SINGLE_CHAR_OP_CODES = set([ + OP.ANY, OP.ANY_ALL, + OP.CATEGORY, OP.NOT_CATEGORY, + OP.CHARSET, OP.CHARSET_IGNORE, OP.NOT_CHARSET, OP.NOT_CHARSET_IGNORE, + OP.LITERAL, OP.LITERAL_IGNORE, OP.NOT_LITERAL, OP.NOT_LITERAL_IGNORE, + OP.RANGE, OP.RANGE_IGNORE, + OP.SET, OP.NOT_SET, +]) + +NORMAL_OP_CODES, REVERSE_OP_CODES = {}, {} +for op in dir(OP): + if not op.startswith("_"): + NORMAL_OP_CODES[op] = op + if op.endswith("_REV"): + REVERSE_OP_CODES[op[ : -4]] = op + else: + REVERSE_OP_CODES.setdefault(op, op) -if _sre.CODESIZE == 2: - MAXCODE = 65535 -else: - MAXCODE = 0xFFFFFFFFL - -def _identityfunction(x): - return x - -def set(seq): - s = {} - for elem in seq: - s[elem] = 1 - return s - -_LITERAL_CODES = set([LITERAL, NOT_LITERAL]) -_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT]) -_SUCCESS_CODES = set([SUCCESS, FAILURE]) -_ASSERT_CODES = set([ASSERT, ASSERT_NOT]) +REPEAT_ONE_OP_CODES = { + OP.REPEAT_MAX: OP.REPEAT_ONE_MAX, + OP.REPEAT_MIN: OP.REPEAT_ONE_MIN, + OP.REPEAT_POSS: OP.REPEAT_ONE_POSS, +} + +CATEGORY_OP_SET = set([OP.CATEGORY, OP.NOT_CATEGORY]) +CHARSET_OP_SET = set([OP.CHARSET, OP.CHARSET_IGNORE, OP.NOT_CHARSET, OP.NOT_CHARSET_IGNORE]) +GROUPREF_OP_SET = set([OP.GROUPREF, OP.GROUPREF_IGNORE]) +LITERAL_OP_SET = set([OP.LITERAL, OP.LITERAL_IGNORE, OP.NOT_LITERAL, OP.NOT_LITERAL_IGNORE]) +POSITION_OP_SET = set([OP.BOUNDARY, OP.END_OF_LINE, OP.END_OF_STRING, OP.END_OF_STRING_LN, OP.NOT_BOUNDARY, OP.START_OF_LINE, 
OP.START_OF_STRING]) +RANGE_OP_SET = set([OP.NOT_RANGE, OP.NOT_RANGE_IGNORE, OP.RANGE, OP.RANGE_IGNORE]) +REPEAT_OP_SET = set([OP.REPEAT_MAX, OP.REPEAT_MIN, OP.REPEAT_POSS]) +SET_OP_SET = set([OP.SET, OP.SET_IGNORE, OP.NOT_SET, OP.NOT_SET_IGNORE]) +SIMPLE_CATEGORY_OP_SET = set([OP.ANY, OP.ANY_ALL]) -def _compile(code, pattern, flags): +def _compile(code, pattern, flags, info, dir=1): # internal: compile a (sub)pattern emit = code.append - _len = len - LITERAL_CODES = _LITERAL_CODES - REPEATING_CODES = _REPEATING_CODES - SUCCESS_CODES = _SUCCESS_CODES - ASSERT_CODES = _ASSERT_CODES + literal_op, literal_string = None, [] + if dir < 0: + fix_direction = REVERSE_OP_CODES + else: + fix_direction = NORMAL_OP_CODES + if dir < 0: + # Within lookbehind, so reverse the order of the matching + pattern = reversed(pattern) + def flush_literal(): + if literal_string: + emit_literal_string(code, literal_op, literal_string[ : : dir], fix_direction) for op, av in pattern: - if op in LITERAL_CODES: - if flags & SRE_FLAG_IGNORECASE: - emit(OPCODES[OP_IGNORE[op]]) - emit(_sre.getlower(av, flags)) - else: - emit(OPCODES[op]) - emit(av) - elif op is IN: - if flags & SRE_FLAG_IGNORECASE: - emit(OPCODES[OP_IGNORE[op]]) - def fixup(literal, flags=flags): - return _sre.getlower(literal, flags) - else: - emit(OPCODES[op]) - fixup = _identityfunction - skip = _len(code); emit(0) - _compile_charset(av, flags, code, fixup) - code[skip] = _len(code) - skip - elif op is ANY: - if flags & SRE_FLAG_DOTALL: - emit(OPCODES[ANY_ALL]) - else: - emit(OPCODES[ANY]) - elif op in REPEATING_CODES: - if flags & SRE_FLAG_TEMPLATE: - raise error, "internal: unsupported template operator" - emit(OPCODES[REPEAT]) - skip = _len(code); emit(0) - emit(av[0]) - emit(av[1]) - _compile(code, av[2], flags) - emit(OPCODES[SUCCESS]) - code[skip] = _len(code) - skip - elif _simple(av) and op is not REPEAT: - if op is MAX_REPEAT: - emit(OPCODES[REPEAT_ONE]) - else: - emit(OPCODES[MIN_REPEAT_ONE]) - skip = _len(code); 
emit(0) - emit(av[0]) - emit(av[1]) - _compile(code, av[2], flags) - emit(OPCODES[SUCCESS]) - code[skip] = _len(code) - skip + if op in SET_OP_SET: + op, av = _optimize_set(op, av, flags) + if op == literal_op: + literal_string.append(av) + else: + flush_literal() + if op in (OP.LITERAL, OP.LITERAL_IGNORE): + literal_op, literal_string = op, [av] else: - emit(OPCODES[REPEAT]) - skip = _len(code); emit(0) - emit(av[0]) - emit(av[1]) - _compile(code, av[2], flags) - code[skip] = _len(code) - skip - if op is MAX_REPEAT: - emit(OPCODES[MAX_UNTIL]) + literal_op, literal_string = None, [] + if op in ASSERT_OP_CODES: + # ... + emit(OPCODES[op]) + skip = len(code); emit(0) + _compile(code, av[1], flags, info, av[0]) + emit(OPCODES[ASSERT_OP_CODES[op]]) + code[skip] = len(code) - skip + elif op == OP.ATOMIC: + # ... + emit(OPCODES[OP.ATOMIC]) + _compile(code, av[1], flags, info, dir) + emit(OPCODES[OP.END_ATOMIC]) + elif op == OP.BRANCH: + # ... ... 0 + emit(OPCODES[op]) + tail = [] + tailappend = tail.append + for av in av[1]: + skip = len(code); emit(0) + _compile(code, av, flags, info, dir) + emit(OPCODES[OP.JUMP]) + tailappend(len(code)); emit(0) + code[skip] = len(code) - skip + emit(0) # end of branchs + for tail in tail: + code[tail] = len(code) - tail + elif op in CATEGORY_OP_SET: + # category + emit(OPCODES[fix_direction[op]]) + emit(av) + elif op in CHARSET_OP_SET: + # skip charset + emit(OPCODES[fix_direction[op]]) + skip = len(code); emit(0) + _compile_charset(code, av) + code[skip] = len(code) - skip + elif op in GROUPREF_OP_SET: + # group_id + emit(OPCODES[fix_direction[op]]) + emit(av - 1) + elif op == OP.GROUPREF_EXISTS: + # group_id code_yes code_no + emit(OPCODES[op]) + emit(av[0] - 1) + skipyes = len(code); emit(0) + _compile(code, av[1], flags, info, dir) + if av[2]: + emit(OPCODES[OP.JUMP]) + skipno = len(code); emit(0) + code[skipyes] = len(code) - skipyes + 1 + _compile(code, av[2], flags, info, dir) + code[skipno] = len(code) - skipno + else: + 
code[skipyes] = len(code) - skipyes + 1 + elif op in LITERAL_OP_SET: + # code + emit(OPCODES[fix_direction[op]]) + emit(av) + elif op in POSITION_OP_SET: + # + emit(OPCODES[fix_direction[op]]) + elif op in RANGE_OP_SET: + # min max + emit(OPCODES[fix_direction[op]]) + emit(av[0]) + emit(av[1]) + elif op in REPEAT_OP_SET: + if flags & SRE_FLAG_TEMPLATE: + raise error("internal: unsupported template operator") + else: + single = get_single_character(av[2]) + if single: + # ... + emit(OPCODES[fix_direction[REPEAT_ONE_OP_CODES[op]]]) + skip = len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, single, flags, info, dir) + code[skip] = len(code) - skip + else: + # ... + emit(OPCODES[fix_direction[op]]) + skip = len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags, info, dir) + emit(OPCODES[fix_direction[REPEAT_OP_CODES[op]]]) + offset = len(code) - skip + code[skip] = offset + emit(offset) + elif op in SET_OP_SET: + # set + emit(OPCODES[fix_direction[op]]) + _compile_set(code, av) + elif op in SIMPLE_CATEGORY_OP_SET: + # + emit(OPCODES[op]) + elif op == OP.SUBPATTERN: + if av[0]: + number_id, name_id = av[0] + info.group_count += 1 + number_start_mark, number_end_mark = number_id * 2 - 2, number_id * 2 - 1 + name_start_mark, name_end_mark = name_id * 2 - 2, name_id * 2 - 1 + if dir < 0: + number_start_mark, number_end_mark = number_end_mark, number_start_mark + name_start_mark, name_end_mark = name_end_mark, name_start_mark + # + emit(OPCODES[OP.MARK]) + emit(number_start_mark) + emit(name_start_mark) + _compile(code, av[1], flags, info, dir) + if av[0]: + # + emit(OPCODES[OP.MARK]) + emit(number_end_mark) + emit(name_end_mark) else: - emit(OPCODES[MIN_UNTIL]) - elif op is SUBPATTERN: - if av[0]: - emit(OPCODES[MARK]) - emit((av[0]-1)*2) - # _compile_info(code, av[1], flags) - _compile(code, av[1], flags) - if av[0]: - emit(OPCODES[MARK]) - emit((av[0]-1)*2+1) - elif op in SUCCESS_CODES: - emit(OPCODES[op]) - elif op in ASSERT_CODES: - 
emit(OPCODES[op]) - skip = _len(code); emit(0) - if av[0] >= 0: - emit(0) # look ahead - else: - lo, hi = av[1].getwidth() - if lo != hi: - raise error, "look-behind requires fixed-width pattern" - emit(lo) # look behind - _compile(code, av[1], flags) - emit(OPCODES[SUCCESS]) - code[skip] = _len(code) - skip - elif op is CALL: + raise ValueError("unsupported operand type: %s" % op) + flush_literal() + +def emit_literal_string(code, literal_op, literal_string, fix_direction): + emit = code.append + if len(literal_string) > 1: + # a string + if literal_op == OP.LITERAL_IGNORE: + # length ... + emit(OPCODES[fix_direction[OP.LITERAL_STRING_IGNORE]]) + else: + # length ... + emit(OPCODES[fix_direction[OP.LITERAL_STRING]]) + emit(len(literal_string)) + code.extend(literal_string) + else: + # code + # a single character + emit(OPCODES[fix_direction[literal_op]]) + emit(literal_string[0]) + +def get_single_character(pattern): + if len(pattern) == 1 and pattern[0][0] in SINGLE_CHAR_OP_CODES: + return pattern + return None + +def _compile_set(code, charset): + emit = code.append + skip_set = len(code); emit(0) + for op, av in charset: + if op in CHARSET_OP_SET: + # skip charset emit(OPCODES[op]) - skip = _len(code); emit(0) - _compile(code, av, flags) - emit(OPCODES[SUCCESS]) - code[skip] = _len(code) - skip - elif op is AT: + skip = len(code); emit(0) + _compile_charset(code, av) + code[skip] = len(code) - skip + elif op in CATEGORY_OP_SET: + # category emit(OPCODES[op]) - if flags & SRE_FLAG_MULTILINE: - av = AT_MULTILINE.get(av, av) - if flags & SRE_FLAG_LOCALE: - av = AT_LOCALE.get(av, av) - elif flags & SRE_FLAG_UNICODE: - av = AT_UNICODE.get(av, av) - emit(ATCODES[av]) - elif op is BRANCH: + emit(av) + elif op == OP.LITERAL: + # code emit(OPCODES[op]) - tail = [] - tailappend = tail.append - for av in av[1]: - skip = _len(code); emit(0) - # _compile_info(code, av, flags) - _compile(code, av, flags) - emit(OPCODES[JUMP]) - tailappend(_len(code)); emit(0) - code[skip] = 
_len(code) - skip - emit(0) # end of branch - for tail in tail: - code[tail] = _len(code) - tail - elif op is CATEGORY: + emit(av) + elif op == OP.RANGE: + # min max emit(OPCODES[op]) - if flags & SRE_FLAG_LOCALE: - av = CH_LOCALE[av] - elif flags & SRE_FLAG_UNICODE: - av = CH_UNICODE[av] - emit(CHCODES[av]) - elif op is GROUPREF: - if flags & SRE_FLAG_IGNORECASE: - emit(OPCODES[OP_IGNORE[op]]) - else: - emit(OPCODES[op]) - emit(av-1) - elif op is GROUPREF_EXISTS: + emit(av[0]) + emit(av[1]) + elif op in SIMPLE_CATEGORY_OP_SET: + # emit(OPCODES[op]) - emit(av[0]-1) - skipyes = _len(code); emit(0) - _compile(code, av[1], flags) - if av[2]: - emit(OPCODES[JUMP]) - skipno = _len(code); emit(0) - code[skipyes] = _len(code) - skipyes + 1 - _compile(code, av[2], flags) - code[skipno] = _len(code) - skipno - else: - code[skipyes] = _len(code) - skipyes + 1 else: - raise ValueError, ("unsupported operand type", op) + raise error("internal: unsupported set member: %s" % op) + code[skip_set] = len(code) - skip_set -def _compile_charset(charset, flags, code, fixup=None): - # compile charset subprogram - emit = code.append - if fixup is None: - fixup = _identityfunction - for op, av in _optimize_charset(charset, fixup): - emit(OPCODES[op]) - if op is NEGATE: - pass - elif op is LITERAL: - emit(fixup(av)) - elif op is RANGE: - emit(fixup(av[0])) - emit(fixup(av[1])) - elif op is CHARSET: - code.extend(av) - elif op is BIGCHARSET: - code.extend(av) - elif op is CATEGORY: - if flags & SRE_FLAG_LOCALE: - emit(CHCODES[CH_LOCALE[av]]) - elif flags & SRE_FLAG_UNICODE: - emit(CHCODES[CH_UNICODE[av]]) - else: - emit(CHCODES[av]) - else: - raise error, "internal: unsupported set operator" - emit(OPCODES[FAILURE]) - -def _optimize_charset(charset, fixup): - # internal: optimize character set - out = [] - outappend = out.append - charmap = [0]*256 - try: - for op, av in charset: - if op is NEGATE: - outappend((op, av)) - elif op is LITERAL: - charmap[fixup(av)] = 1 - elif op is RANGE: - 
for i in range(fixup(av[0]), fixup(av[1])+1): - charmap[i] = 1 - elif op is CATEGORY: - # XXX: could append to charmap tail - return charset # cannot compress - except IndexError: - # character set contains unicode characters - return _optimize_unicode(charset, fixup) - # compress character map - i = p = n = 0 - runs = [] - runsappend = runs.append - for c in charmap: - if c: - if n == 0: - p = i - n = n + 1 - elif n: - runsappend((p, n)) - n = 0 - i = i + 1 - if n: - runsappend((p, n)) - if len(runs) <= 2: - # use literal/range - for p, n in runs: - if n == 1: - outappend((LITERAL, p)) - else: - outappend((RANGE, (p, p+n-1))) - if len(out) < len(charset): - return out - else: - # use bitmap - data = _mk_bitmap(charmap) - outappend((CHARSET, data)) - return out - return charset - -def _mk_bitmap(bits): - data = [] - dataappend = data.append - if _sre.CODESIZE == 2: - start = (1, 0) - else: - start = (1L, 0L) - m, v = start - for c in bits: - if c: - v = v + m - m = m + m - if m > MAXCODE: - dataappend(v) - m, v = start - return data +# The characters may be mapped to a bitmap. -# To represent a big charset, first a bitmap of all characters in the +# To represent a charset, first a bitmap of all characters in the # set is constructed. Then, this bitmap is sliced into chunks of 256 # characters, duplicate chunks are eliminated, and each chunk is # given a number. In the compiled expression, the charset is -# represented by a 16-bit word sequence, consisting of one word for -# the number of different chunks, a sequence of 256 bytes (128 words) -# of chunk numbers indexed by their original chunk position, and a -# sequence of chunks (16 words each). +# represented by a codeword sequence, consisting of one codeword for +# the maximum character code, a sequence of chunk numbers +# (2 per codeword), and a sequence of chunks (8 codewords each). # Compression is normally good: in a typical charset, large ranges of # Unicode will be either completely excluded (e.g. 
if only cyrillic @@ -293,215 +275,145 @@ # subranges of Kanji match). These ranges will be represented by # chunks of all one-bits or all zero-bits. -# Matching can be also done efficiently: the more significant byte of +# Matching can be also done efficiently: the most significant bits of # the Unicode character is an index into the chunk number, and the -# less significant byte is a bit index in the chunk (just like the -# CHARSET matching). +# least significant byte is a bit index into the chunk. -# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets -# of the basic multilingual plane; an efficient representation -# for all of UTF-16 has not yet been developed. This means, -# in particular, that negated charsets cannot be represented as -# bigcharsets. - -def _optimize_unicode(charset, fixup): - try: - import array - except ImportError: - return charset - charmap = [0]*65536 - negate = 0 - try: - for op, av in charset: - if op is NEGATE: - negate = 1 - elif op is LITERAL: - charmap[fixup(av)] = 1 - elif op is RANGE: - for i in xrange(fixup(av[0]), fixup(av[1])+1): - charmap[i] = 1 - elif op is CATEGORY: - # XXX: could expand category - return charset # cannot compress - except IndexError: - # non-BMP characters - return charset - if negate: - if sys.maxunicode != 65535: - # XXX: negation does not work with big charsets - return charset - for i in xrange(65536): - charmap[i] = not charmap[i] - comps = {} - mapping = [0]*256 - block = 0 - data = [] - for i in xrange(256): - chunk = tuple(charmap[i*256:(i+1)*256]) - new = comps.setdefault(chunk, block) - mapping[i] = new - if new == block: - block = block + 1 - data = data + _mk_bitmap(chunk) - header = [block] - if _sre.CODESIZE == 2: - code = 'H' - else: - code = 'I' - # Convert block indices to byte array of 256 bytes - mapping = array.array('b', mapping).tostring() - # Convert byte array to word array - mapping = array.array(code, mapping) - assert mapping.itemsize == _sre.CODESIZE - header = 
header + mapping.tolist() - data[0:0] = header - return [(BIGCHARSET, data)] - -def _simple(av): - # check if av is a "simple" operator - lo, hi = av[2].getwidth() - if lo == 0 and hi == MAXREPEAT: - raise error, "nothing to repeat" - return lo == hi == 1 and av[2][0][0] != SUBPATTERN - -def _compile_info(code, pattern, flags): - # internal: compile an info block. in the current version, - # this contains min/max pattern width, and an optional literal - # prefix or a character map - lo, hi = pattern.getwidth() - if lo == 0: - return # not worth it - # look for a literal prefix - prefix = [] - prefixappend = prefix.append - prefix_skip = 0 - charset = [] # not used - charsetappend = charset.append - if not (flags & SRE_FLAG_IGNORECASE): - # look for literal prefix - for op, av in pattern.data: - if op is LITERAL: - if len(prefix) == prefix_skip: - prefix_skip = prefix_skip + 1 - prefixappend(av) - elif op is SUBPATTERN and len(av[1]) == 1: - op, av = av[1][0] - if op is LITERAL: - prefixappend(av) - else: - break +# a charset is a 3-tuple, consisting of the maximum character code, +# a list of indexes and a list of 256-bit bitsets +def _compile_charset(code, charset): + # the maximum character code + code.append(charset[0]) + # pack the 16-bit indexes into 32-bit codewords + # (adding an extra index ensures that zip() doesn't drop + # the last one if there are an odd number of them) + for lo, hi in zip(charset[1][0 : : 2], charset[1][1 : : 2] + [0]): + code.append(lo | (hi << 16)) + # pack the 256-bit bitsets to 32-bit codewords + for chunk in charset[2]: + for i in range(256 // BITS_PER_CODE): + code.append(chunk & MAXCODE) + chunk >>= BITS_PER_CODE + +def _ones(n): + return (1 << n) - 1 + +def _optimize_set(set_op, set_members, flags): + # consolidate the ranges (the bounds are inclusive) + charset = set() + categories = [] + for o, a in set_members: + if o == OP.LITERAL: + charset.add(a) + elif o == OP.RANGE: + for c in xrange(a[0], a[1] + 1): + charset.add(c) + 
else: + categories.append((o, a)) + categories = sorted(set(categories)) + # convert charset to list of ranges + ranges = [] + start, end = None, None + for c in sorted(charset): + try: + if c == end + 1: + end = c else: - break - # if no prefix, look for charset prefix - if not prefix and pattern.data: - op, av = pattern.data[0] - if op is SUBPATTERN and av[1]: - op, av = av[1][0] - if op is LITERAL: - charsetappend((op, av)) - elif op is BRANCH: - c = [] - cappend = c.append - for p in av[1]: - if not p: - break - op, av = p[0] - if op is LITERAL: - cappend((op, av)) - else: - break - else: - charset = c - elif op is BRANCH: - c = [] - cappend = c.append - for p in av[1]: - if not p: - break - op, av = p[0] - if op is LITERAL: - cappend((op, av)) - else: - break - else: - charset = c - elif op is IN: - charset = av -## if prefix: -## print "*** PREFIX", prefix, prefix_skip -## if charset: -## print "*** CHARSET", charset - # add an info block - emit = code.append - emit(OPCODES[INFO]) - skip = len(code); emit(0) - # literal flag - mask = 0 - if prefix: - mask = SRE_INFO_PREFIX - if len(prefix) == prefix_skip == len(pattern.data): - mask = mask + SRE_INFO_LITERAL - elif charset: - mask = mask + SRE_INFO_CHARSET - emit(mask) - # pattern length - if lo < MAXCODE: - emit(lo) - else: - emit(MAXCODE) - prefix = prefix[:MAXCODE] - if hi < MAXCODE: - emit(hi) + ranges.append((start, end)) + start, end = c, c + except TypeError: + start, end = c, c + if start is not None: + ranges.append((start, end)) + # try to optimise the set + if len(ranges) <= 1: + # only a few ranges + for r in ranges: + if r[0] == r[1]: + # a range of 1 character! 
+ categories.append((OP.LITERAL, r[0])) + else: + categories.append((OP.RANGE, r)) else: - emit(0) - # add literal prefix - if prefix: - emit(len(prefix)) # length - emit(prefix_skip) # skip - code.extend(prefix) - # generate overlap table - table = [-1] + ([0]*len(prefix)) - for i in xrange(len(prefix)): - table[i+1] = table[i]+1 - while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]: - table[i+1] = table[table[i+1]-1]+1 - code.extend(table[1:]) # don't store first entry - elif charset: - _compile_charset(charset, flags, code) - code[skip] = len(code) - skip - -try: - unicode -except NameError: - STRING_TYPES = (type(""),) -else: - STRING_TYPES = (type(""), type(unicode(""))) - -def isstring(obj): - for tp in STRING_TYPES: - if isinstance(obj, tp): - return 1 - return 0 + # many ranges, so use a charset instead + max_char = ranges[-1][1] + subset_list = [0] * (max_char // 256 + 1) + for lo, hi in ranges: + base = lo - lo % 256 + while lo <= hi: + subset_list[base // 256] |= _ones(min(hi - base + 1, 256)) ^ _ones(lo % 256) + base += 256 + lo = base + # build the index and chunks, consolidating duplicate subsets/chunks + index_list, chunk_list = [], [] + for subset in subset_list: + try: + index_list.append(chunk_list.index(subset)) + except ValueError: + index_list.append(len(chunk_list)) + chunk_list.append(subset) + categories.append((OP.CHARSET, (max_char, index_list, chunk_list))) + if len(categories) == 1: + # only 1 test in the set, so don't use a set + cat = categories[0] + if set_op.startswith("NOT_"): + cat = not_op(cat) + if set_op.endswith("_IGNORE"): + cat = ignore_op(cat) + return cat + return set_op, categories + +def create_charset(iterable): + # (UNUSED) + # enumerate the characters and create the subsets + subset_list = [] + max_code = 0 + for ch in iterable: + ch = ord(ch) + max_code = max(max_code, ch) + hi, lo = divmod(ch, 256) + mask = 1 << lo + try: + subset_list[hi] |= mask + except IndexError: + subset_list.extend([0] * (hi - 
len(subset_list))) + subset_list.append(mask) + # optimise the subsets + index_list, chunk_list = [], [] + for subset in subset_list: + try: + index_list.append(chunk_list.index(subset)) + except ValueError: + index_list.append(len(chunk_list)) + chunk_list.append(subset) + return max_code, index_list, chunk_list def _code(p, flags): - flags = p.pattern.flags | flags code = [] # compile info block - _compile_info(code, p, flags) + #_compile_info(code, p, flags) # compile the pattern - _compile(code, p.data, flags) - - code.append(OPCODES[SUCCESS]) + class Record(object): + pass + info = Record() + info.group_count = 0 + if flags & SRE_FLAG_REVERSE: + dir = -1 + else: + dir = 1 + _compile(code, p.data, flags, info, dir) + code.append(OPCODES[OP.SUCCESS]) return code def compile(p, flags=0): # internal: convert pattern list to internal format - if isstring(p): + if isinstance(p, basestring): + import sre_parse pattern = p p = sre_parse.parse(p, flags) else: @@ -511,20 +423,11 @@ # print code - # XXX: get rid of this limitation! 
- if p.pattern.groups > 100: - raise AssertionError( - "sorry, but this version only supports 100 named groups" - ) - # map in either direction - groupindex = p.pattern.groupdict - indexgroup = [None] * p.pattern.groups - for k, i in groupindex.items(): - indexgroup[i] = k - - return _sre.compile( - pattern, flags | p.pattern.flags, code, - p.pattern.groups-1, - groupindex, indexgroup - ) + groupindex = p.pattern.named_groups + indexgroup = [None] * (max(groupindex.values() + [-1]) + 1) + + for name, index in groupindex.items(): + indexgroup[index] = name + + return _sre.compile(pattern, flags | p.pattern.flags, code, p.pattern.groups, groupindex, indexgroup) === modified file Lib/sre_parse.py --- Lib/sre_parse.py 2008-05-27 01:18:39 +0000 +++ Lib/sre_parse.py 2009-02-03 21:38:45 +0000 @@ -15,81 +15,66 @@ import sys from sre_constants import * +import unicodedata -def set(seq): - s = {} - for elem in seq: - s[elem] = 1 - return s - -SPECIAL_CHARS = ".\\[{()*+?^$|" -REPEAT_CHARS = "*+?{" - -DIGITS = set("0123456789") - -OCTDIGITS = set("01234567") -HEXDIGITS = set("0123456789abcdefABCDEF") - -WHITESPACE = set(" \t\n\r\v\f") +SPECIAL_CHARS = set(".\\[{()*+?^$|") +REPEAT_CHARS = set("*+?{") +WHITESPACE_CHARS = set(" \t\n\r\v\f") ESCAPES = { - r"\a": (LITERAL, ord("\a")), - r"\b": (LITERAL, ord("\b")), - r"\f": (LITERAL, ord("\f")), - r"\n": (LITERAL, ord("\n")), - r"\r": (LITERAL, ord("\r")), - r"\t": (LITERAL, ord("\t")), - r"\v": (LITERAL, ord("\v")), - r"\\": (LITERAL, ord("\\")) + r"\a": (OP.LITERAL, ord("\a")), + r"\b": (OP.LITERAL, ord("\b")), + r"\f": (OP.LITERAL, ord("\f")), + r"\n": (OP.LITERAL, ord("\n")), + r"\r": (OP.LITERAL, ord("\r")), + r"\t": (OP.LITERAL, ord("\t")), + r"\v": (OP.LITERAL, ord("\v")), + r"\\": (OP.LITERAL, ord("\\")), } -CATEGORIES = { - r"\A": (AT, AT_BEGINNING_STRING), # start of string - r"\b": (AT, AT_BOUNDARY), - r"\B": (AT, AT_NON_BOUNDARY), - r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), - r"\D": (IN, [(CATEGORY, 
CATEGORY_NOT_DIGIT)]), - r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), - r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), - r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), - r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), - r"\Z": (AT, AT_END_STRING), # end of string +POSITIONS = { + r"\A": (OP.START_OF_STRING, None), + r"\b": (OP.BOUNDARY, None), + r"\B": (OP.NOT_BOUNDARY, None), + r"\Z": (OP.END_OF_STRING, None), +} + +STD_CATEGORIES = { + r"\d": (OP.CATEGORY, CATEGORIES["Digit"]), + r"\D": (OP.NOT_CATEGORY, CATEGORIES["Digit"]), + r"\s": (OP.CATEGORY, CATEGORIES["Space"]), + r"\S": (OP.NOT_CATEGORY, CATEGORIES["Space"]), + r"\w": (OP.CATEGORY, CATEGORIES["Word"]), + r"\W": (OP.NOT_CATEGORY, CATEGORIES["Word"]), } FLAGS = { - # standard flags "i": SRE_FLAG_IGNORECASE, "L": SRE_FLAG_LOCALE, "m": SRE_FLAG_MULTILINE, + "r": SRE_FLAG_REVERSE, "s": SRE_FLAG_DOTALL, "x": SRE_FLAG_VERBOSE, - # extensions "t": SRE_FLAG_TEMPLATE, "u": SRE_FLAG_UNICODE, + "z": SRE_FLAG_ZEROWIDTH, } +SCOPED_FLAGS_MASK = SRE_FLAG_IGNORECASE | SRE_FLAG_MULTILINE | SRE_FLAG_DOTALL | SRE_FLAG_VERBOSE + class Pattern: - # master pattern object. keeps track of global attributes + # master pattern object. 
keeps track of global attributes def __init__(self): self.flags = 0 - self.open = [] - self.groups = 1 - self.groupdict = {} - def opengroup(self, name=None): - gid = self.groups - self.groups = gid + 1 + self.groups = 0 + self.named_groups = {} + self.fix_list = [] + def new_group(self, name=None): + self.groups += 1 + group_number = self.groups if name is not None: - ogid = self.groupdict.get(name, None) - if ogid is not None: - raise error, ("redefinition of group name %s as group %d; " - "was group %d" % (repr(name), gid, ogid)) - self.groupdict[name] = gid - self.open.append(gid) - return gid - def closegroup(self, gid): - self.open.remove(gid) - def checkgroup(self, gid): - return gid < self.groups and gid not in self.open + self.named_groups.setdefault(name, len(self.named_groups)) + return group_number, name class SubPattern: # a subpattern, in intermediate form @@ -101,23 +86,28 @@ self.width = None def dump(self, level=0): nl = 1 - seqtypes = type(()), type([]) + seqtypes = tuple, list for op, av in self.data: print level*" " + op,; nl = 0 - if op == "in": + if op == OP.SET: # member sublanguage print; nl = 1 for op, a in av: print (level+1)*" " + op, a - elif op == "branch": + elif op == OP.NOT_SET: + # member sublanguage + print; nl = 1 + for op, a in av: + print (level+1)*" " + op, a + elif op == OP.BRANCH: print; nl = 1 i = 0 for a in av[1]: if i > 0: print level*" " + "or" a.dump(level+1); nl = 1 - i = i + 1 - elif type(av) in seqtypes: + i += 1 + elif isinstance(av, seqtypes): for a in av: if isinstance(a, SubPattern): if not nl: print @@ -143,363 +133,434 @@ self.data.insert(index, code) def append(self, code): self.data.append(code) - def getwidth(self): - # determine the width (min, max) for this subpattern - if self.width: - return self.width - lo = hi = 0L - UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY) - REPEATCODES = (MIN_REPEAT, MAX_REPEAT) - for op, av in self.data: - if op is BRANCH: - i = sys.maxint - j = 0 - for av in 
av[1]: - l, h = av.getwidth() - i = min(i, l) - j = max(j, h) - lo = lo + i - hi = hi + j - elif op is CALL: - i, j = av.getwidth() - lo = lo + i - hi = hi + j - elif op is SUBPATTERN: - i, j = av[1].getwidth() - lo = lo + i - hi = hi + j - elif op in REPEATCODES: - i, j = av[2].getwidth() - lo = lo + long(i) * av[0] - hi = hi + long(j) * av[1] - elif op in UNITCODES: - lo = lo + 1 - hi = hi + 1 - elif op == SUCCESS: - break - self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint)) - return self.width class Tokenizer: def __init__(self, string): self.string = string self.index = 0 - self.__next() - def __next(self): - if self.index >= len(self.string): + self._next() + def _next(self): + try: + char = self.string[self.index] + if char == "\\": + try: + char += self.string[self.index + 1] + except IndexError: + raise error("bad escape (end of line)") + self.index += len(char) + self.next = char + except IndexError: self.next = None - return - char = self.string[self.index] - if char[0] == "\\": - try: - c = self.string[self.index + 1] - except IndexError: - raise error, "bogus escape (end of line)" - char = char + c - self.index = self.index + len(char) - self.next = char - def match(self, char, skip=1): - if char == self.next: - if skip: - self.__next() - return 1 - return 0 + def match(self, char, skip=True): + if char != self.next: + return False + if skip: + self._next() + return True def get(self): this = self.next - self.__next() + self._next() return this def tell(self): return self.index, self.next def seek(self, index): self.index, self.next = index -def isident(char): - return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_" +def is_name(name): + # check that group name is a valid string + return (name[0] == "_" or name[0].isalpha()) and all(char == "_" or char.isalnum() for char in name[1 : ]) -def isdigit(char): - return "0" <= char <= "9" +# names can be delimited in a number of ways +NAME_DELIMITERS = {"<": ">", "{": "}"} -def 
isname(name): - # check that group name is a valid string - if not isident(name[0]): - return False - for char in name[1:]: - if not isident(char) and not isdigit(char): - return False - return True +def hex_escape(source, escape, max_digits): + # hexadecimal escape + if escape == r"\x" and source.next in NAME_DELIMITERS: + # hex escape \x{n} + start_delimiter = source.get() + end_delimiter = NAME_DELIMITERS[start_delimiter] + while source.next in HEXDIGITS: + digits += source.get() + if not 1 <= len(digits) <= 8 or not source.match(end_delimiter): + raise error("bad escape: %s" % (escape + start_delimiter + digits)) + else: + digits = "" + while source.next in HEXDIGITS and len(digits) < max_digits: + digits += source.get() + if len(digits) != max_digits: + raise error("bad escape: %s" % (escape + digits)) + return int(digits, 16) + +def oct_escape(source, escape, digits): + # octal escape + while source.next in OCTDIGITS and len(digits) < 3: + digits += source.get() + try: + return int(digits, 8) & 0xFF + except ValueError: + raise error("bad escape: %s" % (escape + digits)) -def _class_escape(source, escape): +def parse_name(source, terminator, name_type, prefix): + name = "" + while True: + char = source.get() + if char is None: + raise error("unterminated %s name: %s" % (name_type, prefix)) + if char == terminator: + break + name += char + return name + +HEX_ESCAPE_LENGTH = {"x": 2, "u": 4, "U": 8} + +def class_escape(source, escape): # handle escape code inside character class - code = ESCAPES.get(escape) - if code: - return code - code = CATEGORIES.get(escape) + code = STD_CATEGORIES.get(escape) or ESCAPES.get(escape) if code: return code try: - c = escape[1:2] - if c == "x": - # hexadecimal escape (exactly two digits) - while source.next in HEXDIGITS and len(escape) < 4: - escape = escape + source.get() - escape = escape[2:] - if len(escape) != 2: - raise error, "bogus escape: %s" % repr("\\" + escape) - return LITERAL, int(escape, 16) & 0xff + c = escape[1 
: 2] + if c in HEX_ESCAPE_LENGTH: + # hex escape + return OP.LITERAL, hex_escape(source, escape, HEX_ESCAPE_LENGTH[c]) + elif c == "o": + # octal escape + return OP.LITERAL, oct_escape(source, escape, "") elif c in OCTDIGITS: - # octal escape (up to three digits) - while source.next in OCTDIGITS and len(escape) < 4: - escape = escape + source.get() - escape = escape[1:] - return LITERAL, int(escape, 8) & 0xff + # octal escape + return OP.LITERAL, oct_escape(source, escape[ : 1], c) elif c in DIGITS: - raise error, "bogus escape: %s" % repr(escape) - if len(escape) == 2: - return LITERAL, ord(escape[1]) + raise error("bad escape: %s" % escape) + elif c == "N": + # named character + if source.next not in NAME_DELIMITERS: + raise error("missing character name: %s" % escape) + delimiter = source.get() + name = parse_name(source, NAME_DELIMITERS[delimiter], "character", escape + delimiter) + try: + return OP.LITERAL, ord(unicodedata.lookup(name)) + except KeyError: + raise error("bad character name: %s" % name) + elif c == "p": + # character property + if source.next not in NAME_DELIMITERS: + raise error("missing property name: %s" % escape) + delimiter = source.get() + name = parse_name(source, NAME_DELIMITERS[delimiter], "property", escape + delimiter) + try: + return OP.CATEGORY, CATEGORIES[name] + except KeyError: + raise error("bad property name: %s" % name) + else: + return OP.LITERAL, ord(c) except ValueError: pass - raise error, "bogus escape: %s" % repr(escape) + raise error("bad escape: %s" % escape) + +# group references can be delimited in a number of ways +GROUP_DELIMITERS = {"<": ">", "{": "}", "'": "'", '"': '"'} + +# group references can be relative +GROUP_DIRECTION = {"+": 1, "-": -1} -def _escape(source, escape, state): +def escape(source, escape, state): # handle escape code in expression - code = CATEGORIES.get(escape) - if code: - return code - code = ESCAPES.get(escape) + # group references returned as list instead of tuple so that they can be 
fixed later + code = POSITIONS.get(escape) or STD_CATEGORIES.get(escape) or ESCAPES.get(escape) if code: return code + if state.flags & SRE_FLAG_IGNORECASE: + literal_op, groupref_op = OP.LITERAL_IGNORE, OP.GROUPREF_IGNORE + else: + literal_op, groupref_op = OP.LITERAL, OP.GROUPREF try: - c = escape[1:2] - if c == "x": - # hexadecimal escape - while source.next in HEXDIGITS and len(escape) < 4: - escape = escape + source.get() - if len(escape) != 4: - raise ValueError - return LITERAL, int(escape[2:], 16) & 0xff + c = escape[1 : 2] + if c in HEX_ESCAPE_LENGTH: + # hex escape + return literal_op, hex_escape(source, escape, HEX_ESCAPE_LENGTH[c]) + elif c == "o": + # octal escape + return literal_op, oct_escape(source, escape, "") elif c == "0": # octal escape - while source.next in OCTDIGITS and len(escape) < 4: - escape = escape + source.get() - return LITERAL, int(escape[1:], 8) & 0xff + return literal_op, oct_escape(source, escape[ : 1], c) elif c in DIGITS: # octal escape *or* decimal group reference (sigh) if source.next in DIGITS: - escape = escape + source.get() - if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and - source.next in OCTDIGITS): + escape += source.get() + if set(escape[1 : ]) <= OCTDIGITS and source.next in OCTDIGITS: # got three octal digits; this is an octal escape - escape = escape + source.get() - return LITERAL, int(escape[1:], 8) & 0xff + escape += source.get() + return literal_op, int(escape[1 : ], 8) & 0xFF # not an octal escape, so this is a group reference - group = int(escape[1:]) - if group < state.groups: - if not state.checkgroup(group): - raise error, "cannot refer to open group" - return GROUPREF, group - raise ValueError - if len(escape) == 2: - return LITERAL, ord(escape[1]) + ref = [groupref_op, escape[1 : ]] + state.fix_list.append(ref) + return ref + elif c == "g": + # group reference + if source.next in GROUP_DELIMITERS: + # delimited group reference + delimiter = source.get() + name = parse_name(source, 
GROUP_DELIMITERS[delimiter], "group", escape + delimiter) + if name[0] in GROUP_DIRECTION and name[1 : ].isdigit(): + # relative group reference + name = str(state.groups + GROUP_DIRECTION[name[0]] * int(name[1 : ])) + if not name.isdigit() and not is_name(name): + raise error("bad group name: %s" % name) + # return the group reference + ref = [groupref_op, name] + state.fix_list.append(ref) + return ref + elif source.next in DIGITS: + # non-delimited group reference (single digit) + ref = [groupref_op, source.get()] + state.fix_list.append(ref) + return ref + else: + raise error("missing group name: %s" % escape) + elif c == "k": + # named group reference + if source.next in GROUP_DELIMITERS: + # delimited group reference + delimiter = source.get() + name = parse_name(source, GROUP_DELIMITERS[delimiter], "group", escape + delimiter) + if not is_name(name): + raise error("bad group name: %s" % name) + ref = [groupref_op, name] + state.fix_list.append(ref) + return ref + else: + # non-delimited group reference; invalid for \k + raise error("missing group name: %s" % escape) + elif c == "N": + # named character + if source.next not in NAME_DELIMITERS: + raise error("missing character name: %s" % escape) + delimiter = source.get() + name = parse_name(source, NAME_DELIMITERS[delimiter], "character", escape + delimiter) + try: + return literal_op, ord(unicodedata.lookup(name)) + except KeyError: + raise error("bad character name: %s" % name) + elif c in "pP": + # character property + if source.next not in NAME_DELIMITERS: + raise error("missing property name: %s" % escape) + delimiter = source.get() + name = parse_name(source, NAME_DELIMITERS[delimiter], "property", escape + delimiter) + try: + op = OP.CATEGORY, CATEGORIES[name] + if c == "P": + op = not_op(op) + return op + except KeyError: + raise error("bad property name: %s" % name) + else: + return literal_op, ord(c) except ValueError: pass - raise error, "bogus escape: %s" % repr(escape) + raise error("bad escape: 
%s" % escape) -def _parse_sub(source, state, nested=1): +def _parse_sub(source, state, nested=True): # parse an alternation: a|b|c - items = [] - itemsappend = items.append - sourcematch = source.match - while 1: - itemsappend(_parse(source, state)) - if sourcematch("|"): + while True: + items.append(_parse(source, state)) + if source.match("|"): continue if not nested: break - if not source.next or sourcematch(")", 0): + if not source.next or source.match(")", False): break else: - raise error, "pattern not properly closed" + raise error("pattern not properly closed") if len(items) == 1: return items[0] subpattern = SubPattern(state) - subpatternappend = subpattern.append - # check if all items share a common prefix - while 1: - prefix = None - for item in items: - if not item: - break - if prefix is None: - prefix = item[0] - elif item[0] != prefix: - break - else: - # all subitems start with a common "prefix". - # move it out of the branch - for item in items: - del item[0] - subpatternappend(prefix) - continue # check next one - break - - # check if the branch can be replaced by a character set - for item in items: - if len(item) != 1 or item[0][0] != LITERAL: + # check whether all branches share a common prefix + # (the prefix shouldn't contain a capture group) + prefix_len = 0 + while prefix_len < len(items[0]) and not is_capture(items[0][prefix_len]): + prefix_len += 1 + + prefix = items[0][ : prefix_len] + for item in items[1 : ]: + prefix = prefix[ : len(item)] + while prefix and item[ : len(prefix)] != prefix: + prefix = prefix[ : -1] + if not prefix: + # no common prefix, so skip any further branches break - else: - # we can store this as a character set instead of a + + if prefix: + subpattern.append(prefix) + items = [item[len(prefix) : ] for item in items] + + # check whether the alternation can be replaced by a character set + if all(len(item) == 1 and item[0][0] == OP.LITERAL for item in items): + # we can store this as a set instead of a # branch 
(the compiler may optimize this even more) - set = [] - setappend = set.append - for item in items: - setappend(item[0]) - subpatternappend((IN, set)) - return subpattern + subpattern.append((OP.SET, [item[0] for item in items])) + else: + subpattern.append((OP.BRANCH, (None, items))) - subpattern.append((BRANCH, (None, items))) return subpattern +def is_capture(pattern): + if not pattern: + return False + o, a = pattern + if o in [OP.ASSERT, OP.ASSERT_NOT, OP.ATOMIC]: + return has_capture(a[1]) + elif o == OP.BRANCH: + return any(has_capture(i) for i in a[1]) + elif o == OP.GROUPREF_EXISTS: + return any(has_capture(i) for i in a[1 : 3]) + elif o in [OP.REPEAT_MAX, OP.REPEAT_MIN, OP.REPEAT_POSS]: + return has_capture(a[2]) + elif o in [OP.REPEAT_ONE_MAX, OP.REPEAT_ONE_MIN, OP.REPEAT_ONE_POSS]: + return is_capture(a[2]) + elif o == OP.SUBPATTERN: + return a[0] is not None or has_capture(a[1]) + else: + return False + +def has_capture(pattern): + if not pattern: + return False + return any(i for i in pattern) + def _parse_sub_cond(source, state, condgroup): item_yes = _parse(source, state) if source.match("|"): item_no = _parse(source, state) if source.match("|"): - raise error, "conditional backref with more than two branches" + raise error("conditional reference with more than two branches") else: item_no = None - if source.next and not source.match(")", 0): - raise error, "pattern not properly closed" + if source.next and not source.match(")", False): + raise error("pattern not properly closed") subpattern = SubPattern(state) - subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) + ref = (OP.GROUPREF_EXISTS, [condgroup, item_yes, item_no]) + state.fix_list.append(ref) + subpattern.append(ref) return subpattern -_PATTERNENDERS = set("|)") -_ASSERTCHARS = set("=!<") -_LOOKBEHINDASSERTCHARS = set("=!") -_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT]) +PATTERN_ENDERS = set("|)") +ASSERT_CHARS = set("=!<") +LOOKBEHIND_ASSERT_CHARS = set("=!") 
+POSITION_CODES = set([OP.BOUNDARY, OP.END_OF_LINE, OP.END_OF_STRING, OP.END_OF_STRING_LN, OP.NOT_BOUNDARY, OP.START_OF_LINE, OP.START_OF_STRING]) +QUERY_GROUP = 0 +CAPTURE_GROUP = 1 +NONCAPTURE_GROUP = 2 +ATOMIC_GROUP = 3 def _parse(source, state): # parse a simple pattern subpattern = SubPattern(state) - # precompute constants into local variables - subpatternappend = subpattern.append - sourceget = source.get - sourcematch = source.match - _len = len - PATTERNENDERS = _PATTERNENDERS - ASSERTCHARS = _ASSERTCHARS - LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS - REPEATCODES = _REPEATCODES - - while 1: - - if source.next in PATTERNENDERS: + while True: + if source.next in PATTERN_ENDERS: break # end of subpattern - this = sourceget() - if this is None: - break # end of pattern + + this = source.get() if state.flags & SRE_FLAG_VERBOSE: # skip whitespace and comments - if this in WHITESPACE: - continue + while this in WHITESPACE_CHARS: + this = source.get() if this == "#": - while 1: - this = sourceget() + while True: + this = source.get() if this in (None, "\n"): break continue - if this and this[0] not in SPECIAL_CHARS: - subpatternappend((LITERAL, ord(this))) + if this is None: + break # end of pattern + if this[0] not in SPECIAL_CHARS: + if state.flags & SRE_FLAG_IGNORECASE: + subpattern.append((OP.LITERAL_IGNORE, ord(this))) + else: + subpattern.append((OP.LITERAL, ord(this))) elif this == "[": # character set - set = [] - setappend = set.append -## if sourcematch(":"): -## pass # handle character classes - if sourcematch("^"): - setappend((NEGATE, None)) + char_set = [] + negate = source.match("^") # check remaining characters - start = set[:] - while 1: - this = sourceget() - if this == "]" and set != start: + while True: + this = source.get() + if this == "]" and char_set: + # terminating ] break - elif this and this[0] == "\\": - code1 = _class_escape(source, this) + if this and this[0] == "\\": + code1 = class_escape(source, this) elif this: - code1 = 
LITERAL, ord(this) + code1 = OP.LITERAL, ord(this) else: - raise error, "unexpected end of regular expression" - if sourcematch("-"): + raise error("unexpected end of pattern") + if source.match("-"): # potential range - this = sourceget() + this = source.get() if this == "]": - if code1[0] is IN: - code1 = code1[1][0] - setappend(code1) - setappend((LITERAL, ord("-"))) + # at end of pattern, so literal char and "-" + char_set.append(code1) + char_set.append((OP.LITERAL, ord("-"))) break elif this: if this[0] == "\\": - code2 = _class_escape(source, this) + code2 = class_escape(source, this) else: - code2 = LITERAL, ord(this) - if code1[0] != LITERAL or code2[0] != LITERAL: - raise error, "bad character range" + code2 = OP.LITERAL, ord(this) + if code1[0] != OP.LITERAL or code2[0] != OP.LITERAL: + raise error("bad character range") lo = code1[1] hi = code2[1] if hi < lo: - raise error, "bad character range" - setappend((RANGE, (lo, hi))) + raise error("bad character range") + char_set.append((OP.RANGE, (lo, hi))) else: - raise error, "unexpected end of regular expression" + raise error("unexpected end of pattern") else: - if code1[0] is IN: - code1 = code1[1][0] - setappend(code1) - - # XXX: should move set optimization to compiler! 
- if _len(set)==1 and set[0][0] is LITERAL: - subpatternappend(set[0]) # optimization - elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: - subpatternappend((NOT_LITERAL, set[1][1])) # optimization + char_set.append(code1) + if negate: + if state.flags & SRE_FLAG_IGNORECASE: + subpattern.append((OP.NOT_SET_IGNORE, char_set)) + else: + subpattern.append((OP.NOT_SET, char_set)) else: - # XXX: should add charmap optimization here - subpatternappend((IN, set)) - - elif this and this[0] in REPEAT_CHARS: + if state.flags & SRE_FLAG_IGNORECASE: + subpattern.append((OP.SET_IGNORE, char_set)) + else: + subpattern.append((OP.SET, char_set)) + elif this[0] in REPEAT_CHARS: # repeat previous item if this == "?": min, max = 0, 1 elif this == "*": min, max = 0, MAXREPEAT - elif this == "+": min, max = 1, MAXREPEAT elif this == "{": if source.next == "}": - subpatternappend((LITERAL, ord(this))) + subpattern.append((OP.LITERAL, ord(this))) continue here = source.tell() min, max = 0, MAXREPEAT lo = hi = "" while source.next in DIGITS: - lo = lo + source.get() - if sourcematch(","): + lo += source.get() + if source.match(","): while source.next in DIGITS: - hi = hi + sourceget() + hi += source.get() else: hi = lo - if not sourcematch("}"): - subpatternappend((LITERAL, ord(this))) + if not source.match("}"): + subpattern.append((OP.LITERAL, ord(this))) source.seek(here) continue if lo: @@ -507,165 +568,227 @@ if hi: max = int(hi) if max < min: - raise error, "bad repeat interval" + raise error("bad repeat interval") else: - raise error, "not supported" + raise error("not supported") # figure out which item to repeat - if subpattern: - item = subpattern[-1:] + item = subpattern[-1 : ] + if not item or len(item) == 1 and item[0][0] in POSITION_CODES: + raise error("nothing to repeat") + if source.match("?"): + subpattern[-1] = (OP.REPEAT_MIN, (min, max, item)) + elif source.match("+"): + subpattern[-1] = (OP.REPEAT_POSS, (min, max, item)) else: - item = None - if not 
item or (_len(item) == 1 and item[0][0] == AT): - raise error, "nothing to repeat" - if item[0][0] in REPEATCODES: - raise error, "multiple repeat" - if sourcematch("?"): - subpattern[-1] = (MIN_REPEAT, (min, max, item)) - else: - subpattern[-1] = (MAX_REPEAT, (min, max, item)) - + subpattern[-1] = (OP.REPEAT_MAX, (min, max, item)) elif this == ".": - subpatternappend((ANY, None)) - + if state.flags & SRE_FLAG_DOTALL: + subpattern.append((OP.ANY_ALL, None)) + else: + subpattern.append((OP.ANY, None)) elif this == "(": - group = 1 + group = CAPTURE_GROUP name = None condgroup = None - if sourcematch("?"): - group = 0 + scoped_flags = None + if source.match("?"): + group = QUERY_GROUP # options - if sourcematch("P"): + if source.match("P"): # python extensions - if sourcematch("<"): - # named group: skip forward to end of name - name = "" - while 1: - char = sourceget() - if char is None: - raise error, "unterminated name" - if char == ">": - break - name = name + char - group = 1 - if not isname(name): - raise error, "bad character in group name" - elif sourcematch("="): - # named backreference - name = "" - while 1: - char = sourceget() - if char is None: - raise error, "unterminated name" - if char == ")": - break - name = name + char - if not isname(name): - raise error, "bad character in group name" - gid = state.groupdict.get(name) - if gid is None: - raise error, "unknown group name" - subpatternappend((GROUPREF, gid)) + if source.match("<"): + # named group + name = parse_name(source, ">", "group", "(?P<") + group = CAPTURE_GROUP + if not is_name(name): + raise error("bad group name: %s" % name) + elif source.match("="): + # named group reference + # group reference stored as list instead of tuple so that it can be fixed later + name = parse_name(source, ")", "group", "(?P=") + if not is_name(name): + raise error("bad group name: %s" % name) + if state.flags & SRE_FLAG_IGNORECASE: + ref = [OP.GROUPREF_IGNORE, name] + else: + ref = [OP.GROUPREF, name] + 
state.fix_list.append(ref) + subpattern.append(ref) continue else: - char = sourceget() + char = source.get() if char is None: - raise error, "unexpected end of pattern" - raise error, "unknown specifier: ?P%s" % char - elif sourcematch(":"): - # non-capturing group - group = 2 - elif sourcematch("#"): + raise error("unexpected end of pattern") + raise error("unknown specifier: (?P%s" % char) + elif source.match("<"): + # named group or look-behind + if source.next in LOOKBEHIND_ASSERT_CHARS: + # lookbehind assertion + dir = -1 # lookbehind + char = source.get() + saved_flags = state.flags + p = _parse_sub(source, state) + state.flags = (state.flags & ~SCOPED_FLAGS_MASK) | (saved_flags & SCOPED_FLAGS_MASK) + if not source.match(")"): + raise error("unbalanced parenthesis") + if char == "=": + subpattern.append((OP.ASSERT, (dir, p))) + else: + subpattern.append((OP.ASSERT_NOT, (dir, p))) + continue + # named group + name = parse_name(source, ">", "group", "(?<") + group = CAPTURE_GROUP + if not is_name(name): + raise error("bad group name: %s" % name) + elif source.match(">"): + # atomic group + group = ATOMIC_GROUP + elif source.match("#"): # comment - while 1: - if source.next is None or source.next == ")": + while True: + if source.next in (None, ")"): break - sourceget() - if not sourcematch(")"): - raise error, "unbalanced parenthesis" + source.get() + if not source.match(")"): + raise error("unbalanced parenthesis") continue - elif source.next in ASSERTCHARS: + elif source.next in ASSERT_CHARS: # lookahead assertions - char = sourceget() + char = source.get() dir = 1 if char == "<": - if source.next not in LOOKBEHINDASSERTCHARS: - raise error, "syntax error" + if source.next not in LOOKBEHIND_ASSERT_CHARS: + raise error("syntax error: (?%s" % char) dir = -1 # lookbehind - char = sourceget() + char = source.get() + saved_flags = state.flags p = _parse_sub(source, state) - if not sourcematch(")"): - raise error, "unbalanced parenthesis" + state.flags = 
(state.flags & ~SCOPED_FLAGS_MASK) | (saved_flags & SCOPED_FLAGS_MASK) + if not source.match(")"): + raise error("unbalanced parenthesis") if char == "=": - subpatternappend((ASSERT, (dir, p))) + subpattern.append((OP.ASSERT, (dir, p))) else: - subpatternappend((ASSERT_NOT, (dir, p))) + subpattern.append((OP.ASSERT_NOT, (dir, p))) continue - elif sourcematch("("): + elif source.match("("): # conditional backreference group - condname = "" - while 1: - char = sourceget() - if char is None: - raise error, "unterminated name" - if char == ")": - break - condname = condname + char - group = 2 - if isname(condname): - condgroup = state.groupdict.get(condname) - if condgroup is None: - raise error, "unknown group name" - else: - try: - condgroup = int(condname) - except ValueError: - raise error, "bad character in group name" - else: - # flags - if not source.next in FLAGS: - raise error, "unexpected end of pattern" + condgroup = parse_name(source, ")", "group", "(?(") + group = NONCAPTURE_GROUP + if not is_name(condgroup) and not condgroup.isdigit(): + raise error("bad group name: %s" % condgroup) + else: + # probably non-capturing group or flags + # might be scoped (set at start of group and local to group) + scoped_flags = state.flags + seen_on, seen_off = False, False while source.next in FLAGS: - state.flags = state.flags | FLAGS[sourceget()] + scoped_flags |= FLAGS[source.get()] + seen_on = True + if source.match("-"): + while source.next in FLAGS: + if (FLAGS[source.next] & SCOPED_FLAGS_MASK) == 0: + raise error("bad pattern flag: %s" % source.next) + scoped_flags &= ~FLAGS[source.get()] + seen_off = True + if not seen_off: + raise error("bad pattern flag") + # update just global flags + state.flags |= scoped_flags & ~SCOPED_FLAGS_MASK + if source.match(":"): + # non-capturing group with scoped flags + group = NONCAPTURE_GROUP + elif seen_on or seen_off: + # not start of group, just setting flags + state.flags = scoped_flags + scoped_flags = None + else: + raise 
error("unexpected end of pattern") if group: + atomic = group == ATOMIC_GROUP # parse group contents - if group == 2: + if group in [NONCAPTURE_GROUP, ATOMIC_GROUP]: # anonymous group group = None else: - group = state.opengroup(name) + group = state.new_group(name) + saved_flags = state.flags + if scoped_flags is not None: + state.flags = scoped_flags if condgroup: p = _parse_sub_cond(source, state, condgroup) else: p = _parse_sub(source, state) - if not sourcematch(")"): - raise error, "unbalanced parenthesis" - if group is not None: - state.closegroup(group) - subpatternappend((SUBPATTERN, (group, p))) + state.flags = (state.flags & ~SCOPED_FLAGS_MASK) | (saved_flags & SCOPED_FLAGS_MASK) + if not source.match(")"): + raise error("unbalanced parenthesis") + if atomic: + subpattern.append((OP.ATOMIC, (group, p))) + else: + if group is None: + subpattern.append((OP.SUBPATTERN, (None, p))) + else: + # group reference stored as list instead of tuple so that it can be fixed later + ref = OP.SUBPATTERN, (list(group), p) + state.fix_list.append(ref) + subpattern.append(ref) else: - while 1: - char = sourceget() + while True: + char = source.get() if char is None: - raise error, "unexpected end of pattern" + raise error("unexpected end of pattern") if char == ")": break - raise error, "unknown extension" - + raise error("unknown extension") elif this == "^": - subpatternappend((AT, AT_BEGINNING)) - + if state.flags & SRE_FLAG_MULTILINE: + subpattern.append((OP.START_OF_LINE, None)) + else: + subpattern.append((OP.START_OF_STRING, None)) elif this == "$": - subpattern.append((AT, AT_END)) - - elif this and this[0] == "\\": - code = _escape(source, this, state) - subpatternappend(code) - + if state.flags & SRE_FLAG_MULTILINE: + subpattern.append((OP.END_OF_LINE, None)) + else: + subpattern.append((OP.END_OF_STRING_LN, None)) + elif this[0] == "\\": + code = escape(source, this, state) + subpattern.append(code) else: - raise error, "parser error" + raise error("parser 
error") return subpattern +def fix_ref(ref, index, state): + if ref[index].isdigit(): + ref[index] = int(ref[index]) + if not (1 <= ref[index] <= state.groups): + raise error("invalid group reference: %s" % ref[index]) + else: + try: + ref[index] = state.named_groups[ref[index]] + except KeyError: + raise error("invalid group reference: %s" % ref[index]) + +def fix_grouprefs(p, state): + for name, value in state.named_groups.items(): + state.named_groups[name] = state.groups + 1 + value + GROUPREF_SET = set([OP.GROUPREF, OP.GROUPREF_IGNORE]) + for ref in state.fix_list: + if ref[0] in GROUPREF_SET: + fix_ref(ref, 1, state) + elif ref[0] == OP.GROUPREF_EXISTS: + fix_ref(ref[1], 0, state) + elif ref[0] == OP.SUBPATTERN: + ref = ref[1][0] + if ref[1] is None: + ref[1] = ref[0] + else: + try: + ref[1] = state.named_groups[ref[1]] + except KeyError: + raise error("invalid group reference: %s" % ref[1]) + def parse(str, flags=0, pattern=None): # parse 're' pattern into list of (opcode, argument) tuples @@ -675,122 +798,144 @@ pattern = Pattern() pattern.flags = flags pattern.str = str + pattern.group_count = 0 p = _parse_sub(source, pattern, 0) tail = source.get() if tail == ")": - raise error, "unbalanced parenthesis" + raise error("unbalanced parenthesis") elif tail: - raise error, "bogus characters at end of regular expression" + raise error("bad characters at end of pattern") + + fix_grouprefs(p, pattern) if flags & SRE_FLAG_DEBUG: p.dump() - if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: - # the VERBOSE flag was switched on inside the pattern. to be - # on the safe side, we'll parse the whole thing again... 
- return parse(str, p.pattern.flags) - return p def parse_template(source, pattern): - # parse 're' replacement string into list of literals and - # group references + # parse 're' replacement string into list of literals and group references + sep = source[ : 0] + char_type = unichr if isinstance(sep, unicode) else chr s = Tokenizer(source) - sget = s.get - p = [] - a = p.append - def literal(literal, p=p, pappend=a): - if p and p[-1][0] is LITERAL: - p[-1] = LITERAL, p[-1][1] + literal - else: - pappend((LITERAL, literal)) - sep = source[:0] - if type(sep) is type(""): - makechar = chr - else: - makechar = unichr - while 1: - this = sget() + literals, groups = [], [] + current_literal = [] + def add_literal(char_code): + current_literal.append(char_type(char_code)) + def flush_literal(): + if current_literal: + literals.append(sep.join(current_literal)) + current_literal[:] = [] + def add_group(index): + flush_literal() + groups.append((index, len(literals))) + literals.append(None) + while True: + this = s.get() if this is None: break # end of replacement string - if this and this[0] == "\\": - # group - c = this[1:2] - if c == "g": - name = "" - if s.match("<"): - while 1: - char = sget() - if char is None: - raise error, "unterminated group name" - if char == ">": - break - name = name + char - if not name: - raise error, "bad group name" - try: + if this[0] == "\\": + c = this[1 : 2] + if c in HEX_ESCAPE_LENGTH: + # hex escape + add_literal(hex_escape(s, escape, HEX_ESCAPE_LENGTH[c])) + elif c == "o": + # octal escape + add_literal(oct_escape(s, escape, "")) + elif c == "0": + add_literal(oct_escape(s, this[0], this[1 : ])) + elif c in DIGITS: + if s.next in DIGITS: + this += s.get() + if set(this[1 : ]) <= OCTDIGITS and s.next in OCTDIGITS: + this += s.get() + add_literal(int(this[1 : ], 8) & 0xFF) + else: + index = int(this[1 : ]) + if index > pattern.groups: + raise error("invalid group reference: %s" % index) + add_group(index) + else: + index = 
int(this[1 : ]) + if index > pattern.groups: + raise error("invalid group reference: %s" % index) + add_group(index) + elif c == "g": + # group reference + if s.next in GROUP_DELIMITERS: + # delimited group reference + delimiter = s.get() + name = parse_name(s, GROUP_DELIMITERS[delimiter], "group", this + delimiter) + elif s.next in DIGITS: + # non-delimited group reference (single digit) + name = s.get() + else: + raise error("missing group name: %s" + this) + if name.isdigit(): index = int(name) - if index < 0: - raise error, "negative group number" - except ValueError: - if not isname(name): - raise error, "bad character in group name" + if not (0 <= index <= pattern.groups): + raise error("invalid group reference: %s" % index) + elif is_name(name): try: index = pattern.groupindex[name] except KeyError: - raise IndexError, "unknown group name" - a((MARK, index)) - elif c == "0": - if s.next in OCTDIGITS: - this = this + sget() - if s.next in OCTDIGITS: - this = this + sget() - literal(makechar(int(this[1:], 8) & 0xff)) - elif c in DIGITS: - isoctal = False - if s.next in DIGITS: - this = this + sget() - if (c in OCTDIGITS and this[2] in OCTDIGITS and - s.next in OCTDIGITS): - this = this + sget() - isoctal = True - literal(makechar(int(this[1:], 8) & 0xff)) - if not isoctal: - a((MARK, int(this[1:]))) + raise error("invalid group reference: %s" % name) + else: + raise error("bad group name: %s" % name) + add_group(index) + elif c == "k": + # named group reference + if s.next in GROUP_DELIMITERS: + # delimited group reference + delimiter = s.get() + name = parse_name(s, GROUP_DELIMITERS[delimiter], "group", this + delimiter) + else: + # non-delimited group reference; invalid for \k + raise error("missing group name: %s" + this) + if is_name(name): + try: + index = pattern.groupindex[name] + except KeyError: + raise error("invalid group reference: %s" % name) + else: + raise error("bad group name: %s" % name) + add_group(index) + elif c == "N": + # named character 
+ if not s.match("{"): + raise error("missing character name: %s" + this) + name = parse_name(s, "}", "character", this + "{") + try: + add_literal(ord(unicodedata.lookup(name))) + except KeyError: + raise error("bad character name: %s" % name) else: try: - this = makechar(ESCAPES[this][1]) + add_literal(ESCAPES[this][1]) except KeyError: - pass - literal(this) - else: - literal(this) - # convert template to groups and literals lists - i = 0 - groups = [] - groupsappend = groups.append - literals = [None] * len(p) - for c, s in p: - if c is MARK: - groupsappend((i, s)) - # literal[i] is already None + add_literal(ord(this[0])) + add_literal(ord(this[1])) else: - literals[i] = s - i = i + 1 - return groups, literals - -def expand_template(template, match): - g = match.group - sep = match.string[:0] - groups, literals = template + add_literal(ord(this)) + flush_literal() + return literals, groups + +def expand_template(template, match, unmatched_as_empty=False): + g = match._internal_group + sep = match.string[ : 0] + literals, groups = template literals = literals[:] try: - for index, group in groups: - literals[index] = s = g(group) + for index, pos in groups: + s = g(index) if s is None: - raise error, "unmatched group" + if unmatched_as_empty: + s = sep + else: + raise error("unmatched group") + literals[pos] = s except IndexError: - raise error, "invalid group reference" + raise error("invalid group reference: %s" % a) return sep.join(literals) === modified file Lib/re.py --- Lib/re.py 2009-01-01 15:46:10 +0000 +++ Lib/re.py 2009-02-03 21:49:47 +0000 @@ -27,52 +27,81 @@ concatenate ordinary characters, so last matches the string 'last'. The special characters are: - "." Matches any character except a newline. - "^" Matches the start of the string. - "$" Matches the end of the string or just before the newline at - the end of the string. - "*" Matches 0 or more (greedy) repetitions of the preceding RE. 
- Greedy means that it will match as many repetitions as possible. - "+" Matches 1 or more (greedy) repetitions of the preceding RE. - "?" Matches 0 or 1 (greedy) of the preceding RE. - *?,+?,?? Non-greedy versions of the previous three special characters. - {m,n} Matches from m to n repetitions of the preceding RE. - {m,n}? Non-greedy version of the above. - "\\" Either escapes special characters or signals a special sequence. - [] Indicates a set of characters. - A "^" as the first character indicates a complementing set. - "|" A|B, creates an RE that will match either A or B. - (...) Matches the RE inside the parentheses. - The contents can be retrieved or matched later in the string. - (?iLmsux) Set the I, L, M, S, U, or X flag for the RE (see below). - (?:...) Non-grouping version of regular parentheses. - (?P...) The substring matched by the group is accessible by name. - (?P=name) Matches the text matched earlier by the group named name. - (?#...) A comment; ignored. - (?=...) Matches if ... matches next, but doesn't consume the string. - (?!...) Matches if ... doesn't match next. - (?<=...) Matches if preceded by ... (must be fixed length). - (?...) The substring matched by the group isaccessible by + name. + (?...) The substring matched by the group is accessible by + name. + (?#...) A comment; ignored. + (?>...) Atomic group. Like (?:...) but won't retry the RE + within the parentheses. + (?=...) Matches if ... matches next, but doesn't consume + the string. + (?!...) Matches if ... doesn't match next. + (?<=...) Matches if preceded by ... (must be fixed length). + (? Matches the text matched by the group named name. + \g Matches the contents of the group of the same number. + \g<+number> Matches the contents of the group of the relative number. + \g<-number> Matches the contents of the group of the relative number. + \k Matches the text matched earlier by the group named name. + \N{name} Matches named Unicode character. 
+ \p{name} Matches any character having the named property. + \P{name} Matches any character not having the named property. + \s Matches any whitespace character; equivalent to + [ \t\n\r\f\v]. + \S Matches any non-whitespace character; equiv. to + [^ \t\n\r\f\v]. + \w Matches any alphanumeric character; equivalent to + [a-zA-Z0-9_]. With LOCALE, it will match the set + [0-9_] plus characters defined as letters for the current + locale. + \W Matches the complement of \w. + \Z Matches only at the end of the string. + \\ Matches a literal backslash. This module exports the following functions: match Match a regular expression pattern to the beginning of a string. @@ -87,15 +116,17 @@ escape Backslash all non-alphanumerics in a string. Some of the functions in this module takes flags as optional parameters: - I IGNORECASE Perform case-insensitive matching. - L LOCALE Make \w, \W, \b, \B, dependent on the current locale. - M MULTILINE "^" matches the beginning of lines (after a newline) - as well as the string. - "$" matches the end of lines (before a newline) as well - as the end of the string. - S DOTALL "." matches any character at all, including the newline. - X VERBOSE Ignore whitespace and comments for nicer looking RE's. - U UNICODE Make \w, \W, \b, \B, dependent on the Unicode locale. + I IGNORECASE Perform case-insensitive matching. + L LOCALE Make \w, \W, \b, \B, dependent on the current locale. + M MULTILINE "^" matches the beginning of lines (after a newline) as + well as the string. + "$" matches the end of lines (before a newline) as well + as the end of the string. + R REVERSE Search backwards, from the end to the start. + S DOTALL "." matches any character at all, including the newline. + X VERBOSE Ignore whitespace and comments for nicer looking RE's. + U UNICODE Make \w, \W, \b, \B, dependent on the Unicode locale. + Z ZEROWIDTH Permit splitting on zero-width separators. This module also defines an exception 'error'. 
@@ -109,18 +140,19 @@ __all__ = [ "match", "search", "sub", "subn", "split", "findall", "compile", "purge", "template", "escape", "I", "L", "M", "S", "X", "U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE", - "UNICODE", "error" ] + "UNICODE", "REVERSE", "R", "ZEROWIDTH", "Z", "error" ] -__version__ = "2.2.1" +__version__ = "2.2.2" # flags I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale -U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline +R = REVERSE = sre_compile.SRE_FLAG_REVERSE # search backwards S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline +U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments - +Z = ZEROWIDTH = sre_compile.SRE_FLAG_ZEROWIDTH # permit splitting on zero-width separators. # sre extensions (experimental, don't rely on these) T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE # disable backtracking DEBUG = sre_compile.SRE_FLAG_DEBUG # dump pattern after compilation @@ -237,12 +269,12 @@ if flags: raise ValueError('Cannot process flags argument with a compiled pattern') return pattern - if not sre_compile.isstring(pattern): - raise TypeError, "first argument must be string or compiled pattern" + if not isinstance(pattern, (str, unicode)): + raise TypeError("First argument must be string or compiled pattern") try: p = sre_compile.compile(pattern, flags) except error, v: - raise error, v # invalid expression + raise error(v) # invalid expression if len(_cache) >= _MAXCACHE: _cache.clear() _cache[cachekey] = p @@ -257,7 +289,7 @@ try: p = sre_parse.parse_template(repl, pattern) except error, v: - raise error, v # invalid expression + raise error(v) # invalid expression if len(_cache_repl) >= _MAXCACHE: _cache_repl.clear() _cache_repl[key] = p @@ -266,7 +298,7 @@ def _expand(pattern,
match, template): # internal: match.expand implementation hook template = sre_parse.parse_template(template, pattern) - return sre_parse.expand_template(template, match) + return sre_parse.expand_template(template, match, True) def _subx(pattern, template): # internal: pattern.sub/subn implementation helper @@ -275,7 +307,7 @@ # literal replacement return template[1][0] def filter(match, template=template): - return sre_parse.expand_template(template, match) + return sre_parse.expand_template(template, match, True) return filter # register myself for pickling @@ -292,36 +324,31 @@ class Scanner: def __init__(self, lexicon, flags=0): - from sre_constants import BRANCH, SUBPATTERN self.lexicon = lexicon # combine phrases into a compound pattern p = [] s = sre_parse.Pattern() s.flags = flags - for phrase, action in lexicon: - p.append(sre_parse.SubPattern(s, [ - (SUBPATTERN, (len(p)+1, sre_parse.parse(phrase, flags))), - ])) - s.groups = len(p)+1 - p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) - self.scanner = sre_compile.compile(p) + sep, template = map(type(lexicon[0][0]), ("|", "(%s)")) + regex = sep.join(template % phrase for phrase, action in lexicon) + self.scanner = sre_compile.compile(regex, flags) def scan(self, string): result = [] append = result.append match = self.scanner.scanner(string).match i = 0 - while 1: + while True: m = match() if not m: break j = m.end() if i == j: break - action = self.lexicon[m.lastindex-1][1] + action = self.lexicon[m.lastindex - 1][1] if hasattr(action, '__call__'): self.match = m action = action(self, m.group()) if action is not None: append(action) i = j - return result, string[i:] + return result, string[i : ] === modified file Modules/sre.h --- Modules/sre.h 2006-06-12 03:05:40 +0000 +++ Modules/sre.h 2009-01-29 22:36:26 +0000 @@ -11,19 +11,15 @@ #ifndef SRE_INCLUDED #define SRE_INCLUDED -#include "sre_constants.h" +typedef int BOOL; +enum BOOL {FALSE, TRUE}; -/* size of a code word (must be unsigned short or larger, and -
large enough to hold a Py_UNICODE character) */ -#ifdef Py_UNICODE_WIDE -#define SRE_CODE Py_UCS4 -#else -#define SRE_CODE unsigned short -#endif +#include "sre_constants.h" typedef struct { PyObject_VAR_HEAD Py_ssize_t groups; /* must be first! */ + Py_ssize_t internal_groups; /* both numbered and named (all named are numbered) */ PyObject* groupindex; PyObject* indexgroup; /* compatibility */ @@ -36,6 +32,7 @@ } PatternObject; #define PatternObject_GetCode(o) (((PatternObject*)(o))->code) +#define PatternObject_GetCodeSize(o) (((PatternObject*)(o))->codesize) typedef struct { PyObject_VAR_HEAD @@ -44,7 +41,9 @@ PatternObject* pattern; /* link to the regex (pattern) object */ Py_ssize_t pos, endpos; /* current target slice */ Py_ssize_t lastindex; /* last index marker seen by the engine (-1 if none) */ + Py_ssize_t last_named_index; /* last named index marker seen by the engine (-1 if none) */ Py_ssize_t groups; /* number of groups (start/end marks) */ + Py_ssize_t internal_groups; /* number of groups, both numbered and named (all named are also numbered) */ Py_ssize_t mark[1]; } MatchObject; @@ -53,12 +52,52 @@ /* FIXME: shouldn't be a constant, really... 
*/ #define SRE_MARK_SIZE 200 -typedef struct SRE_REPEAT_T { - Py_ssize_t count; - SRE_CODE* pattern; /* points to REPEAT operator arguments */ - void* last_ptr; /* helper to check for infinite loops */ - struct SRE_REPEAT_T *prev; /* points to previous repeat context */ -} SRE_REPEAT; +#define SRE_BACKTRACK_CHUNK_SIZE 1024 + +typedef struct SRE_BACKTRACK_ITEM { + int op; + union + { + struct { + void* text_start; + void* text_ptr; + SRE_CODE* pattern_ptr; + } assert; + struct { + void* text_ptr; + SRE_CODE* pattern_ptr; + } branch; + struct { + int numbered_index; + void* numbered_mark_ptr; + int named_index; + void* named_mark_ptr; + } mark; + struct { + void* text_ptr; + int repeat_min; + int repeat_max; + int repeat_counter; + void* repeat_start; + struct SRE_BACKTRACK_ITEM* top_nested; + SRE_CODE* pattern_ptr; + } repeat; + }; + void* marks; // Numbered and named marks. +} SRE_BACKTRACK_ITEM; + +typedef struct SRE_BACKTRACK_CHUNK { + struct SRE_BACKTRACK_CHUNK* previous; + SRE_BACKTRACK_ITEM items[SRE_BACKTRACK_CHUNK_SIZE]; + int count; +} SRE_BACKTRACK_CHUNK; + +typedef struct SRE_ENCODING_TABLE { +BOOL (*in_category)(SRE_CODE category, SRE_CODE ch); +SRE_CODE (*lower)(SRE_CODE ch); +SRE_CODE (*upper)(SRE_CODE ch); +SRE_CODE (*title)(SRE_CODE ch); +} SRE_ENCODING_TABLE; typedef struct { /* string pointers */ @@ -71,18 +110,20 @@ Py_ssize_t pos, endpos; /* character size */ int charsize; + int reverse; + int reject_zero_width; /* registers */ Py_ssize_t lastindex; Py_ssize_t lastmark; + Py_ssize_t last_named_index; void* mark[SRE_MARK_SIZE]; /* dynamically allocated stuff */ - char* data_stack; - size_t data_stack_size; - size_t data_stack_base; - /* current repeat context */ - SRE_REPEAT *repeat; + SRE_BACKTRACK_CHUNK* backtrack_chunk; + int numbered_mark_count; + int named_mark_count; + SRE_CODE* pattern_code; /* hooks */ - SRE_TOLOWER_HOOK lower; + SRE_ENCODING_TABLE* encoding; } SRE_STATE; typedef struct { === modified file Modules/sre_constants.h --- 
Modules/sre_constants.h 2003-10-17 22:13:16 +0000 +++ Modules/sre_constants.h 2009-02-01 01:43:54 +0000 @@ -11,76 +11,297 @@ * See the _sre.c file for information on usage and redistribution. */ -#define SRE_MAGIC 20031017 +#define SRE_MAGIC 20081218 + +/* size of a code word (must be unsigned short or larger, and + large enough to hold a Py_UNICODE character) */ +typedef unsigned int SRE_CODE; + +#define SRE_BYTES_PER_CODE 4 +#define SRE_BITS_PER_CODE 32 +#define SRE_UNLIMITED_REPEATS 0xFFFFFFFF + #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 #define SRE_OP_ANY_ALL 3 -#define SRE_OP_ASSERT 4 -#define SRE_OP_ASSERT_NOT 5 -#define SRE_OP_AT 6 -#define SRE_OP_BRANCH 7 -#define SRE_OP_CALL 8 -#define SRE_OP_CATEGORY 9 -#define SRE_OP_CHARSET 10 -#define SRE_OP_BIGCHARSET 11 -#define SRE_OP_GROUPREF 12 -#define SRE_OP_GROUPREF_EXISTS 13 -#define SRE_OP_GROUPREF_IGNORE 14 -#define SRE_OP_IN 15 -#define SRE_OP_IN_IGNORE 16 -#define SRE_OP_INFO 17 -#define SRE_OP_JUMP 18 -#define SRE_OP_LITERAL 19 -#define SRE_OP_LITERAL_IGNORE 20 -#define SRE_OP_MARK 21 -#define SRE_OP_MAX_UNTIL 22 -#define SRE_OP_MIN_UNTIL 23 -#define SRE_OP_NOT_LITERAL 24 -#define SRE_OP_NOT_LITERAL_IGNORE 25 -#define SRE_OP_NEGATE 26 -#define SRE_OP_RANGE 27 -#define SRE_OP_REPEAT 28 -#define SRE_OP_REPEAT_ONE 29 -#define SRE_OP_SUBPATTERN 30 -#define SRE_OP_MIN_REPEAT_ONE 31 -#define SRE_AT_BEGINNING 0 -#define SRE_AT_BEGINNING_LINE 1 -#define SRE_AT_BEGINNING_STRING 2 -#define SRE_AT_BOUNDARY 3 -#define SRE_AT_NON_BOUNDARY 4 -#define SRE_AT_END 5 -#define SRE_AT_END_LINE 6 -#define SRE_AT_END_STRING 7 -#define SRE_AT_LOC_BOUNDARY 8 -#define SRE_AT_LOC_NON_BOUNDARY 9 -#define SRE_AT_UNI_BOUNDARY 10 -#define SRE_AT_UNI_NON_BOUNDARY 11 -#define SRE_CATEGORY_DIGIT 0 -#define SRE_CATEGORY_NOT_DIGIT 1 -#define SRE_CATEGORY_SPACE 2 -#define SRE_CATEGORY_NOT_SPACE 3 -#define SRE_CATEGORY_WORD 4 -#define SRE_CATEGORY_NOT_WORD 5 -#define SRE_CATEGORY_LINEBREAK 6 -#define 
SRE_CATEGORY_NOT_LINEBREAK 7 -#define SRE_CATEGORY_LOC_WORD 8 -#define SRE_CATEGORY_LOC_NOT_WORD 9 -#define SRE_CATEGORY_UNI_DIGIT 10 -#define SRE_CATEGORY_UNI_NOT_DIGIT 11 -#define SRE_CATEGORY_UNI_SPACE 12 -#define SRE_CATEGORY_UNI_NOT_SPACE 13 -#define SRE_CATEGORY_UNI_WORD 14 -#define SRE_CATEGORY_UNI_NOT_WORD 15 -#define SRE_CATEGORY_UNI_LINEBREAK 16 -#define SRE_CATEGORY_UNI_NOT_LINEBREAK 17 -#define SRE_FLAG_TEMPLATE 1 -#define SRE_FLAG_IGNORECASE 2 -#define SRE_FLAG_LOCALE 4 -#define SRE_FLAG_MULTILINE 8 -#define SRE_FLAG_DOTALL 16 -#define SRE_FLAG_UNICODE 32 -#define SRE_FLAG_VERBOSE 64 -#define SRE_INFO_PREFIX 1 -#define SRE_INFO_LITERAL 2 -#define SRE_INFO_CHARSET 4 +#define SRE_OP_ANY_ALL_REV 4 +#define SRE_OP_ANY_REV 5 +#define SRE_OP_ASSERT 6 +#define SRE_OP_ASSERT_NOT 7 +#define SRE_OP_ATOMIC 8 +#define SRE_OP_BOUNDARY 9 +#define SRE_OP_BRANCH 10 +#define SRE_OP_CATEGORY 11 +#define SRE_OP_CATEGORY_REV 12 +#define SRE_OP_CHARSET 13 +#define SRE_OP_CHARSET_IGNORE 14 +#define SRE_OP_CHARSET_IGNORE_REV 15 +#define SRE_OP_CHARSET_REV 16 +#define SRE_OP_END_ASSERT 17 +#define SRE_OP_END_ASSERT_NOT 18 +#define SRE_OP_END_ATOMIC 19 +#define SRE_OP_END_OF_LINE 20 +#define SRE_OP_END_OF_STRING 21 +#define SRE_OP_END_OF_STRING_LN 22 +#define SRE_OP_END_REPEAT_MAX 23 +#define SRE_OP_END_REPEAT_MAX_REV 24 +#define SRE_OP_END_REPEAT_MIN 25 +#define SRE_OP_END_REPEAT_MIN_REV 26 +#define SRE_OP_END_REPEAT_POSS 27 +#define SRE_OP_END_REPEAT_POSS_REV 28 +#define SRE_OP_GROUPREF 29 +#define SRE_OP_GROUPREF_EXISTS 30 +#define SRE_OP_GROUPREF_IGNORE 31 +#define SRE_OP_GROUPREF_IGNORE_REV 32 +#define SRE_OP_GROUPREF_REV 33 +#define SRE_OP_JUMP 34 +#define SRE_OP_LITERAL 35 +#define SRE_OP_LITERAL_IGNORE 36 +#define SRE_OP_LITERAL_IGNORE_REV 37 +#define SRE_OP_LITERAL_REV 38 +#define SRE_OP_LITERAL_STRING 39 +#define SRE_OP_LITERAL_STRING_IGNORE 40 +#define SRE_OP_LITERAL_STRING_IGNORE_REV 41 +#define SRE_OP_LITERAL_STRING_REV 42 +#define SRE_OP_MARK 43 +#define 
SRE_OP_NOT_BOUNDARY 44 +#define SRE_OP_NOT_CATEGORY 45 +#define SRE_OP_NOT_CATEGORY_REV 46 +#define SRE_OP_NOT_CHARSET 47 +#define SRE_OP_NOT_CHARSET_IGNORE 48 +#define SRE_OP_NOT_CHARSET_IGNORE_REV 49 +#define SRE_OP_NOT_CHARSET_REV 50 +#define SRE_OP_NOT_LITERAL 51 +#define SRE_OP_NOT_LITERAL_IGNORE 52 +#define SRE_OP_NOT_LITERAL_IGNORE_REV 53 +#define SRE_OP_NOT_LITERAL_REV 54 +#define SRE_OP_NOT_RANGE 55 +#define SRE_OP_NOT_RANGE_IGNORE 56 +#define SRE_OP_NOT_RANGE_IGNORE_REV 57 +#define SRE_OP_NOT_RANGE_REV 58 +#define SRE_OP_NOT_SET 59 +#define SRE_OP_NOT_SET_IGNORE 60 +#define SRE_OP_NOT_SET_IGNORE_REV 61 +#define SRE_OP_NOT_SET_REV 62 +#define SRE_OP_RANGE 63 +#define SRE_OP_RANGE_IGNORE 64 +#define SRE_OP_RANGE_IGNORE_REV 65 +#define SRE_OP_RANGE_REV 66 +#define SRE_OP_REPEAT_MAX 67 +#define SRE_OP_REPEAT_MAX_REV 68 +#define SRE_OP_REPEAT_MIN 69 +#define SRE_OP_REPEAT_MIN_REV 70 +#define SRE_OP_REPEAT_ONE_MAX 71 +#define SRE_OP_REPEAT_ONE_MAX_REV 72 +#define SRE_OP_REPEAT_ONE_MIN 73 +#define SRE_OP_REPEAT_ONE_MIN_REV 74 +#define SRE_OP_REPEAT_ONE_POSS 75 +#define SRE_OP_REPEAT_ONE_POSS_REV 76 +#define SRE_OP_REPEAT_POSS 77 +#define SRE_OP_REPEAT_POSS_REV 78 +#define SRE_OP_SET 79 +#define SRE_OP_SET_IGNORE 80 +#define SRE_OP_SET_IGNORE_REV 81 +#define SRE_OP_SET_REV 82 +#define SRE_OP_START_OF_LINE 83 +#define SRE_OP_START_OF_STRING 84 +#define SRE_OP_SUBPATTERN 85 +#define SRE_MAX_OP 85 + +#define SRE_FLAG_TEMPLATE 0x1 +#define SRE_FLAG_IGNORECASE 0x2 +#define SRE_FLAG_LOCALE 0x4 +#define SRE_FLAG_MULTILINE 0x8 +#define SRE_FLAG_DOTALL 0x10 +#define SRE_FLAG_UNICODE 0x20 +#define SRE_FLAG_VERBOSE 0x40 +#define SRE_FLAG_REVERSE 0x100 +#define SRE_FLAG_ZEROWIDTH 0x200 + +#define SRE_INFO_PREFIX 0x1 +#define SRE_INFO_LITERAL 0x2 +#define SRE_INFO_CHARSET 0x4 + +#define SRE_UNI_CAT_Lu 0x1 +#define SRE_UNI_CAT_Ll 0x2 +#define SRE_UNI_CAT_Lt 0x3 +#define SRE_UNI_CAT_Mn 0x4 +#define SRE_UNI_CAT_Mc 0x5 +#define SRE_UNI_CAT_Me 0x6 +#define SRE_UNI_CAT_Nd 0x7 
+#define SRE_UNI_CAT_Nl 0x8 +#define SRE_UNI_CAT_No 0x9 +#define SRE_UNI_CAT_Zs 0xA +#define SRE_UNI_CAT_Zl 0xB +#define SRE_UNI_CAT_Zp 0xC +#define SRE_UNI_CAT_Cc 0xD +#define SRE_UNI_CAT_Cf 0xE +#define SRE_UNI_CAT_Cs 0xF +#define SRE_UNI_CAT_Co 0x10 +#define SRE_UNI_CAT_Lm 0x12 +#define SRE_UNI_CAT_Lo 0x13 +#define SRE_UNI_CAT_Pc 0x14 +#define SRE_UNI_CAT_Pd 0x15 +#define SRE_UNI_CAT_Ps 0x16 +#define SRE_UNI_CAT_Pe 0x17 +#define SRE_UNI_CAT_Pi 0x18 +#define SRE_UNI_CAT_Pf 0x19 +#define SRE_UNI_CAT_Po 0x1A +#define SRE_UNI_CAT_Sm 0x1B +#define SRE_UNI_CAT_Sc 0x1C +#define SRE_UNI_CAT_Sk 0x1D +#define SRE_UNI_CAT_So 0x1E + +#define SRE_UNI_CAT_L 0x20 +#define SRE_UNI_CAT_M 0x21 +#define SRE_UNI_CAT_N 0x22 +#define SRE_UNI_CAT_Z 0x23 +#define SRE_UNI_CAT_C 0x24 +#define SRE_UNI_CAT_P 0x25 +#define SRE_UNI_CAT_S 0x26 + +#define SRE_CAT_Alpha 0x27 +#define SRE_CAT_Alnum 0x28 +#define SRE_CAT_ASCII 0x29 +#define SRE_CAT_Blank 0x2A +#define SRE_CAT_Cntrl 0x2B +#define SRE_CAT_Digit 0x2C +#define SRE_CAT_Graph 0x2D +#define SRE_CAT_LineBreak 0x2E +#define SRE_CAT_Lower 0x2F +#define SRE_CAT_Print 0x30 +#define SRE_CAT_Punct 0x31 +#define SRE_CAT_Space 0x32 +#define SRE_CAT_Upper 0x33 +#define SRE_CAT_Word 0x34 +#define SRE_CAT_XDigit 0x35 + +#define SRE_UNI_CAT_C_MASK 0x0001E000 +#define SRE_UNI_CAT_L_MASK 0x000C000E +#define SRE_UNI_CAT_M_MASK 0x00000070 +#define SRE_UNI_CAT_N_MASK 0x00000380 +#define SRE_UNI_CAT_P_MASK 0x07F00000 +#define SRE_UNI_CAT_S_MASK 0x78000000 +#define SRE_UNI_CAT_Z_MASK 0x00001C00 + +#define SRE_UNI_CAT_MASK_Alnum 0x000C008E +#define SRE_UNI_CAT_MASK_Alpha 0x000C000E +#define SRE_UNI_CAT_MASK_Graph 0x7FFC03FE +#define SRE_UNI_CAT_MASK_Print 0x7FFC1FFE +#define SRE_UNI_CAT_MASK_Punct 0x7FF00000 +#define SRE_UNI_CAT_MASK_Word 0x001C03FE + +// info for operator validation +typedef struct SRE_OpInfo { + int type; + int direction; + int end_marker; +} SRE_OpInfo; + +#define SRE_TYPE_INVALID 0 +#define SRE_TYPE_ASSERT 1 +#define SRE_TYPE_ATOMIC 2 
+#define SRE_TYPE_BRANCH 3 +#define SRE_TYPE_CATEGORY 4 +#define SRE_TYPE_CHARSET 5 +#define SRE_TYPE_GROUPREF 6 +#define SRE_TYPE_GROUPREF_EXISTS 7 +#define SRE_TYPE_LITERAL 8 +#define SRE_TYPE_LITERAL_STRING 9 +#define SRE_TYPE_MARK 10 +#define SRE_TYPE_POSITION 11 +#define SRE_TYPE_RANGE 12 +#define SRE_TYPE_REPEAT 13 +#define SRE_TYPE_REPEAT_ONE 14 +#define SRE_TYPE_SET 15 +#define SRE_TYPE_SIMPLE_CATEGORY 16 + +static SRE_OpInfo op_info[] = { + {0, 0, 0}, // SRE_OP_FAILURE + {0, 0, 0}, // SRE_OP_SUCCESS + {16, 1, 0}, // SRE_OP_ANY + {16, 1, 0}, // SRE_OP_ANY_ALL + {16, -1, 0}, // SRE_OP_ANY_ALL_REV + {16, -1, 0}, // SRE_OP_ANY_REV + {1, 0, SRE_OP_END_ASSERT}, // SRE_OP_ASSERT + {1, 0, SRE_OP_END_ASSERT_NOT}, // SRE_OP_ASSERT_NOT + {2, 0, SRE_OP_END_ATOMIC}, // SRE_OP_ATOMIC + {11, 0, 0}, // SRE_OP_BOUNDARY + {3, 0, 0}, // SRE_OP_BRANCH + {4, 1, 0}, // SRE_OP_CATEGORY + {4, -1, 0}, // SRE_OP_CATEGORY_REV + {5, 1, 0}, // SRE_OP_CHARSET + {5, 1, 0}, // SRE_OP_CHARSET_IGNORE + {5, -1, 0}, // SRE_OP_CHARSET_IGNORE_REV + {5, -1, 0}, // SRE_OP_CHARSET_REV + {0, 0, 0}, // SRE_OP_END_ASSERT + {0, 0, 0}, // SRE_OP_END_ASSERT_NOT + {0, 0, 0}, // SRE_OP_END_ATOMIC + {11, 0, 0}, // SRE_OP_END_OF_LINE + {11, 0, 0}, // SRE_OP_END_OF_STRING + {11, 0, 0}, // SRE_OP_END_OF_STRING_LN + {0, 1, 0}, // SRE_OP_END_REPEAT_MAX + {0, -1, 0}, // SRE_OP_END_REPEAT_MAX_REV + {0, 1, 0}, // SRE_OP_END_REPEAT_MIN + {0, -1, 0}, // SRE_OP_END_REPEAT_MIN_REV + {0, 1, 0}, // SRE_OP_END_REPEAT_POSS + {0, -1, 0}, // SRE_OP_END_REPEAT_POSS_REV + {6, 1, 0}, // SRE_OP_GROUPREF + {7, 0, 0}, // SRE_OP_GROUPREF_EXISTS + {6, 1, 0}, // SRE_OP_GROUPREF_IGNORE + {6, -1, 0}, // SRE_OP_GROUPREF_IGNORE_REV + {6, -1, 0}, // SRE_OP_GROUPREF_REV + {0, 0, 0}, // SRE_OP_JUMP + {8, 1, 0}, // SRE_OP_LITERAL + {8, 1, 0}, // SRE_OP_LITERAL_IGNORE + {8, -1, 0}, // SRE_OP_LITERAL_IGNORE_REV + {8, -1, 0}, // SRE_OP_LITERAL_REV + {9, 1, 0}, // SRE_OP_LITERAL_STRING + {9, 1, 0}, // SRE_OP_LITERAL_STRING_IGNORE + {9, -1, 0}, 
// SRE_OP_LITERAL_STRING_IGNORE_REV + {9, -1, 0}, // SRE_OP_LITERAL_STRING_REV + {10, 0, 0}, // SRE_OP_MARK + {11, 0, 0}, // SRE_OP_NOT_BOUNDARY + {4, 1, 0}, // SRE_OP_NOT_CATEGORY + {4, -1, 0}, // SRE_OP_NOT_CATEGORY_REV + {5, 1, 0}, // SRE_OP_NOT_CHARSET + {5, 1, 0}, // SRE_OP_NOT_CHARSET_IGNORE + {5, -1, 0}, // SRE_OP_NOT_CHARSET_IGNORE_REV + {5, -1, 0}, // SRE_OP_NOT_CHARSET_REV + {8, 1, 0}, // SRE_OP_NOT_LITERAL + {8, 1, 0}, // SRE_OP_NOT_LITERAL_IGNORE + {8, -1, 0}, // SRE_OP_NOT_LITERAL_IGNORE_REV + {8, -1, 0}, // SRE_OP_NOT_LITERAL_REV + {12, 1, 0}, // SRE_OP_NOT_RANGE + {12, 1, 0}, // SRE_OP_NOT_RANGE_IGNORE + {12, -1, 0}, // SRE_OP_NOT_RANGE_IGNORE_REV + {12, -1, 0}, // SRE_OP_NOT_RANGE_REV + {15, 1, 0}, // SRE_OP_NOT_SET + {15, 1, 0}, // SRE_OP_NOT_SET_IGNORE + {15, -1, 0}, // SRE_OP_NOT_SET_IGNORE_REV + {15, -1, 0}, // SRE_OP_NOT_SET_REV + {12, 1, 0}, // SRE_OP_RANGE + {12, 1, 0}, // SRE_OP_RANGE_IGNORE + {12, -1, 0}, // SRE_OP_RANGE_IGNORE_REV + {12, -1, 0}, // SRE_OP_RANGE_REV + {13, 1, SRE_OP_END_REPEAT_MAX}, // SRE_OP_REPEAT_MAX + {13, -1, SRE_OP_END_REPEAT_MAX}, // SRE_OP_REPEAT_MAX_REV + {13, 1, SRE_OP_END_REPEAT_MIN}, // SRE_OP_REPEAT_MIN + {13, -1, SRE_OP_END_REPEAT_MIN}, // SRE_OP_REPEAT_MIN_REV + {14, 1, 0}, // SRE_OP_REPEAT_ONE_MAX + {14, -1, 0}, // SRE_OP_REPEAT_ONE_MAX_REV + {14, 1, 0}, // SRE_OP_REPEAT_ONE_MIN + {14, -1, 0}, // SRE_OP_REPEAT_ONE_MIN_REV + {14, 1, 0}, // SRE_OP_REPEAT_ONE_POSS + {14, -1, 0}, // SRE_OP_REPEAT_ONE_POSS_REV + {13, 1, SRE_OP_END_REPEAT_POSS}, // SRE_OP_REPEAT_POSS + {13, -1, SRE_OP_END_REPEAT_POSS}, // SRE_OP_REPEAT_POSS_REV + {15, 1, 0}, // SRE_OP_SET + {15, 1, 0}, // SRE_OP_SET_IGNORE + {15, -1, 0}, // SRE_OP_SET_IGNORE_REV + {15, -1, 0}, // SRE_OP_SET_REV + {11, 0, 0}, // SRE_OP_START_OF_LINE + {11, 0, 0}, // SRE_OP_START_OF_STRING + {0, 0, 0}, // SRE_OP_SUBPATTERN +}; === modified file Modules/_sre.c --- Modules/_sre.c 2008-09-10 14:27:00 +0000 +++ Modules/_sre.c 2009-02-03 17:29:46 +0000 @@ -4,24 +4,25 @@ 
* regular expression matching engine * * partial history: - * 1999-10-24 fl created (based on existing template matcher code) - * 2000-03-06 fl first alpha, sort of - * 2000-08-01 fl fixes for 1.6b1 - * 2000-08-07 fl use PyOS_CheckStack() if available - * 2000-09-20 fl added expand method - * 2001-03-20 fl lots of fixes for 2.1b2 - * 2001-04-15 fl export copyright as Python attribute, not global - * 2001-04-28 fl added __copy__ methods (work in progress) - * 2001-05-14 fl fixes for 1.5.2 compatibility - * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis) - * 2001-10-18 fl fixed group reset issue (from Matthew Mueller) - * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1 - * 2001-10-21 fl added sub/subn primitive - * 2001-10-24 fl added finditer primitive (for 2.2 only) - * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum) - * 2002-11-09 fl fixed empty sub/subn return type - * 2003-04-18 mvl fully support 4-byte codes - * 2003-10-17 gn implemented non recursive scheme + * 1999-10-24 fl created (based on existing template matcher code) + * 2000-03-06 fl first alpha, sort of + * 2000-08-01 fl fixes for 1.6b1 + * 2000-08-07 fl use PyOS_CheckStack() if available + * 2000-09-20 fl added expand method + * 2001-03-20 fl lots of fixes for 2.1b2 + * 2001-04-15 fl export copyright as Python attribute, not global + * 2001-04-28 fl added __copy__ methods (work in progress) + * 2001-05-14 fl fixes for 1.5.2 compatibility + * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis) + * 2001-10-18 fl fixed group reset issue (from Matthew Mueller) + * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1 + * 2001-10-21 fl added sub/subn primitive + * 2001-10-24 fl added finditer primitive (for 2.2 only) + * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum) + * 2002-11-09 fl fixed empty sub/subn return type + * 2003-04-18 mvl fully support 4-byte codes + * 2003-10-17 gn implemented non recursive 
scheme + * 2008-09-21 mrab major reworking * * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. * @@ -55,11 +56,14 @@ #define SRE_PY_MODULE "re" -/* defining this one enables tracing */ -#undef VERBOSE +/* uncomment this define to enable tracing */ +//#define VERBOSE_SRE_ENGINE + +//#define DEBUG_TRACE(v) printf v +#define DEBUG_TRACE(v) #if PY_VERSION_HEX >= 0x01060000 -#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE) +#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE) /* defining this enables unicode support (default under 1.6a1 and later) */ #define HAVE_UNICODE #endif @@ -68,9 +72,6 @@ /* -------------------------------------------------------------------- */ /* optional features */ -/* enables fast searching */ -#define USE_FAST_SEARCH - /* enables aggressive inlining (always on for Visual C) */ #undef USE_INLINE @@ -95,13 +96,13 @@ #endif /* error codes */ -#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */ -#define SRE_ERROR_STATE -2 /* illegal state */ +#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */ +#define SRE_ERROR_STATE -2 /* illegal state */ #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */ -#define SRE_ERROR_MEMORY -9 /* out of memory */ -#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */ +#define SRE_ERROR_MEMORY -9 /* out of memory */ +#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */ -#if defined(VERBOSE) +#if defined(VERBOSE_SRE_ENGINE) #define TRACE(v) printf v #else #define TRACE(v) @@ -110,219 +111,408 @@ /* -------------------------------------------------------------------- */ /* search engine state */ -/* default character predicates (run sre_chars.py to regenerate tables) */ +typedef struct { + const unsigned char category; /* index into _PyUnicode_CategoryNames */ + const unsigned char combining; /* combining class value 0 - 255 */ + const unsigned char bidirectional; /* index into _PyUnicode_BidirectionalNames */ + const unsigned char mirrored; /* 
true if mirrored in bidir mode */ + const unsigned char east_asian_width; /* index into _PyUnicode_EastAsianWidth */ +} _PyUnicode_DatabaseRecord; + +typedef struct change_record { + const unsigned char bidir_changed; + const unsigned char category_changed; + const unsigned char decimal_changed; + const unsigned char mirrored_changed; + const int numeric_changed; +} change_record; + +#include "unicodedata_db.h" + +static const unsigned char get_unicode_category(Py_UCS4 code) { + int index; + if (code >= 0x110000) + index = 0; + else { + index = index1[(code >> SHIFT)]; + index = index2[(index << SHIFT) + (code & ((1 << SHIFT) - 1))]; + } + + return _PyUnicode_Database_Records[index].category; +} -#define SRE_DIGIT_MASK 1 -#define SRE_SPACE_MASK 2 -#define SRE_LINEBREAK_MASK 4 -#define SRE_ALNUM_MASK 8 -#define SRE_WORD_MASK 16 - -/* FIXME: this assumes ASCII. create tables in init_sre() instead */ - -static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, -2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25, -25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, -24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, -0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, -24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 }; - -static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, -27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, -44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, -61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, -108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, -122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, -106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, -120, 121, 122, 123, 124, 125, 126, 
127 }; - -#define SRE_IS_DIGIT(ch)\ - ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0) -#define SRE_IS_SPACE(ch)\ - ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0) -#define SRE_IS_LINEBREAK(ch)\ - ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0) -#define SRE_IS_ALNUM(ch)\ - ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0) -#define SRE_IS_WORD(ch)\ - ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0) +/* ASCII */ -static unsigned int sre_lower(unsigned int ch) -{ - return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch); +#define SRE_ASCII_MAX 0x7F + +#define SRE_BLANK_MASK 0x001 +#define SRE_DIGIT_MASK 0x002 +#define SRE_GRAPH_MASK 0x004 +#define SRE_LOWER_MASK 0x008 +#define SRE_PRINT_MASK 0x010 +#define SRE_PUNCT_MASK 0x020 +#define SRE_UNDERSCORE_MASK 0x040 +#define SRE_UPPER_MASK 0x080 +#define SRE_XDIGIT_MASK 0x100 +#define SRE_WHITESPACE_MASK 0x200 + +static short sre_ascii_info[SRE_ASCII_MAX + 1] = { +0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x201, 0x200, 0x200, 0x200, 0x200, 0x000, 0x000, +0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x200, 0x200, 0x200, 0x200, +0x211, 0x034, 0x034, 0x034, 0x034, 0x034, 0x034, 0x034, 0x034, 0x034, 0x034, 0x034, 0x034, 0x034, 0x034, 0x034, +0x116, 0x116, 0x116, 0x116, 0x116, 0x116, 0x116, 0x116, 0x116, 0x116, 0x034, 0x034, 0x034, 0x034, 0x034, 0x034, +0x034, 0x194, 0x194, 0x194, 0x194, 0x194, 0x194, 0x094, 0x094, 0x094, 0x094, 0x094, 0x094, 0x094, 0x094, 0x094, +0x094, 0x094, 0x094, 0x094, 0x094, 0x094, 0x094, 0x094, 0x094, 0x094, 0x094, 0x034, 0x034, 0x034, 0x034, 0x074, +0x034, 0x11C, 0x11C, 0x11C, 0x11C, 0x11C, 0x11C, 0x01C, 0x01C, 0x01C, 0x01C, 0x01C, 0x01C, 0x01C, 0x01C, 0x01C, +0x01C, 0x01C, 0x01C, 0x01C, 0x01C, 0x01C, 0x01C, 0x01C, 0x01C, 0x01C, 0x01C, 0x034, 0x034, 0x034, 0x034, 0x000, +}; + +static BOOL ascii_in_category(SRE_CODE category, SRE_CODE ch) { + if (ch > SRE_ASCII_MAX) + return FALSE; + + 
switch(category) { + case SRE_CAT_Alnum: + return (sre_ascii_info[ch] & (SRE_DIGIT_MASK | SRE_LOWER_MASK | SRE_UPPER_MASK)) != 0; + case SRE_CAT_Alpha: + return (sre_ascii_info[ch] & (SRE_LOWER_MASK | SRE_UPPER_MASK)) != 0; + case SRE_CAT_ASCII: + return TRUE; + case SRE_CAT_Blank: + return (sre_ascii_info[ch] & SRE_BLANK_MASK) != 0; + case SRE_CAT_Cntrl: + return (sre_ascii_info[ch] & SRE_PRINT_MASK) == 0; + case SRE_CAT_Digit: + return (sre_ascii_info[ch] & SRE_DIGIT_MASK) != 0; + case SRE_CAT_Graph: + return (sre_ascii_info[ch] & SRE_GRAPH_MASK) != 0; + case SRE_CAT_LineBreak: + return ch == '\n'; + case SRE_CAT_Lower: + return (sre_ascii_info[ch] & SRE_LOWER_MASK) != 0; + case SRE_CAT_Print: + return (sre_ascii_info[ch] & SRE_PRINT_MASK) != 0; + case SRE_CAT_Punct: + return (sre_ascii_info[ch] & SRE_PUNCT_MASK) != 0; + case SRE_CAT_Space: + return (sre_ascii_info[ch] & SRE_WHITESPACE_MASK) != 0; + case SRE_CAT_Upper: + return (sre_ascii_info[ch] & SRE_UPPER_MASK) != 0; + case SRE_CAT_Word: + return (sre_ascii_info[ch] & (SRE_DIGIT_MASK | SRE_LOWER_MASK | SRE_UPPER_MASK | SRE_UNDERSCORE_MASK)) != 0; + case SRE_CAT_XDigit: + return (sre_ascii_info[ch] & SRE_XDIGIT_MASK) != 0; + default: + return FALSE; + } } -/* locale-specific character predicates */ -/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids - * warnings when c's type supports only numbers < N+1 */ -#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0) -#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0) -#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n') -#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0) -#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_') +static SRE_CODE ascii_lower(SRE_CODE ch) { + return ch <= SRE_ASCII_MAX && (sre_ascii_info[ch] & SRE_UPPER_MASK) ? ch ^ 0x20 : ch; +} -static unsigned int sre_lower_locale(unsigned int ch) -{ - return ((ch) < 256 ? 
(unsigned int)tolower((ch)) : ch); +static SRE_CODE ascii_upper(SRE_CODE ch) { + return ch <= SRE_ASCII_MAX && (sre_ascii_info[ch] & SRE_LOWER_MASK) ? ch ^ 0x20 : ch; } -/* unicode-specific character predicates */ +static SRE_ENCODING_TABLE ascii_encoding = { +ascii_in_category, +ascii_lower, +ascii_upper, +ascii_upper, +}; -#if defined(HAVE_UNICODE) +/* locale-specific */ -#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch)) -#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch)) -#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch)) -#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch)) -#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_') +#define SRE_LOC_MAX 0xFF -static unsigned int sre_lower_unicode(unsigned int ch) -{ - return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch)); +static BOOL loc_in_category(SRE_CODE category, SRE_CODE ch) { + if (ch > SRE_LOC_MAX) + return FALSE; + + switch (category) { + case SRE_CAT_Alnum: + return isalnum(ch); + case SRE_CAT_Alpha: + return isalpha(ch); + case SRE_CAT_ASCII: + return ch <= SRE_ASCII_MAX; + case SRE_CAT_Blank: + return ch == '\t' || ch == ' '; + case SRE_CAT_Cntrl: + return !isprint(ch); + case SRE_CAT_Digit: + return isdigit(ch); + case SRE_CAT_Graph: + return isgraph(ch); + case SRE_CAT_LineBreak: + return ch == '\n'; + case SRE_CAT_Lower: + return islower(ch); + case SRE_CAT_Print: + return isprint(ch); + case SRE_CAT_Punct: + return ispunct(ch); + case SRE_CAT_Space: + return isspace(ch); + case SRE_CAT_Upper: + return isupper(ch); + case SRE_CAT_Word: + return ch == '_' || isalnum(ch); + case SRE_CAT_XDigit: + return isxdigit(ch); + default: + return FALSE; + } } -#endif +static SRE_CODE loc_lower(SRE_CODE ch) { + return ch <= SRE_LOC_MAX ? (SRE_CODE)tolower(ch) : ch; +} -LOCAL(int) -sre_category(SRE_CODE category, unsigned int ch) -{ - switch (category) { +static SRE_CODE loc_upper(SRE_CODE ch) { + return ch <= SRE_LOC_MAX ? 
(SRE_CODE)toupper(ch) : ch; +} - case SRE_CATEGORY_DIGIT: - return SRE_IS_DIGIT(ch); - case SRE_CATEGORY_NOT_DIGIT: - return !SRE_IS_DIGIT(ch); - case SRE_CATEGORY_SPACE: - return SRE_IS_SPACE(ch); - case SRE_CATEGORY_NOT_SPACE: - return !SRE_IS_SPACE(ch); - case SRE_CATEGORY_WORD: - return SRE_IS_WORD(ch); - case SRE_CATEGORY_NOT_WORD: - return !SRE_IS_WORD(ch); - case SRE_CATEGORY_LINEBREAK: - return SRE_IS_LINEBREAK(ch); - case SRE_CATEGORY_NOT_LINEBREAK: - return !SRE_IS_LINEBREAK(ch); - - case SRE_CATEGORY_LOC_WORD: - return SRE_LOC_IS_WORD(ch); - case SRE_CATEGORY_LOC_NOT_WORD: - return !SRE_LOC_IS_WORD(ch); +static SRE_ENCODING_TABLE locale_encoding = { +loc_in_category, +loc_lower, +loc_upper, +loc_upper, +}; -#if defined(HAVE_UNICODE) - case SRE_CATEGORY_UNI_DIGIT: - return SRE_UNI_IS_DIGIT(ch); - case SRE_CATEGORY_UNI_NOT_DIGIT: - return !SRE_UNI_IS_DIGIT(ch); - case SRE_CATEGORY_UNI_SPACE: - return SRE_UNI_IS_SPACE(ch); - case SRE_CATEGORY_UNI_NOT_SPACE: - return !SRE_UNI_IS_SPACE(ch); - case SRE_CATEGORY_UNI_WORD: - return SRE_UNI_IS_WORD(ch); - case SRE_CATEGORY_UNI_NOT_WORD: - return !SRE_UNI_IS_WORD(ch); - case SRE_CATEGORY_UNI_LINEBREAK: - return SRE_UNI_IS_LINEBREAK(ch); - case SRE_CATEGORY_UNI_NOT_LINEBREAK: - return !SRE_UNI_IS_LINEBREAK(ch); -#else - case SRE_CATEGORY_UNI_DIGIT: - return SRE_IS_DIGIT(ch); - case SRE_CATEGORY_UNI_NOT_DIGIT: - return !SRE_IS_DIGIT(ch); - case SRE_CATEGORY_UNI_SPACE: - return SRE_IS_SPACE(ch); - case SRE_CATEGORY_UNI_NOT_SPACE: - return !SRE_IS_SPACE(ch); - case SRE_CATEGORY_UNI_WORD: - return SRE_LOC_IS_WORD(ch); - case SRE_CATEGORY_UNI_NOT_WORD: - return !SRE_LOC_IS_WORD(ch); - case SRE_CATEGORY_UNI_LINEBREAK: - return SRE_IS_LINEBREAK(ch); - case SRE_CATEGORY_UNI_NOT_LINEBREAK: - return !SRE_IS_LINEBREAK(ch); -#endif +/* unicode */ + +static BOOL uni_in_category(SRE_CODE category, SRE_CODE ch) { + int cat = get_unicode_category(ch); + if (category < 0x20) + return cat == category; + + switch (category) { + case 
SRE_UNI_CAT_L: + return (SRE_UNI_CAT_L_MASK & (1 << cat)) != 0; + case SRE_UNI_CAT_M: + return (SRE_UNI_CAT_M_MASK & (1 << cat)) != 0; + case SRE_UNI_CAT_N: + return (SRE_UNI_CAT_N_MASK & (1 << cat)) != 0; + case SRE_UNI_CAT_Z: + return (SRE_UNI_CAT_Z_MASK & (1 << cat)) != 0; + case SRE_UNI_CAT_C: + return (SRE_UNI_CAT_C_MASK & (1 << cat)) != 0; + case SRE_UNI_CAT_P: + return (SRE_UNI_CAT_P_MASK & (1 << cat)) != 0; + case SRE_UNI_CAT_S: + return (SRE_UNI_CAT_S_MASK & (1 << cat)) != 0; + case SRE_CAT_Alnum: + return (SRE_UNI_CAT_MASK_Alnum & (1 << cat)) != 0; + case SRE_CAT_Alpha: + return (SRE_UNI_CAT_MASK_Alpha & (1 << cat)) != 0; + case SRE_CAT_ASCII: + return ch <= SRE_ASCII_MAX; + case SRE_CAT_Blank: + return ch == '\t' || cat == SRE_UNI_CAT_Zs; + case SRE_CAT_Cntrl: + return cat == SRE_UNI_CAT_Cc; + case SRE_CAT_Digit: + return cat == SRE_UNI_CAT_Nd; + case SRE_CAT_Graph: + return (SRE_UNI_CAT_MASK_Graph & (1 << cat)) != 0; + case SRE_CAT_LineBreak: + return ch == '\n'; + case SRE_CAT_Lower: + return cat == SRE_UNI_CAT_Ll; + case SRE_CAT_Print: + return (SRE_UNI_CAT_MASK_Print & (1 << cat)) != 0; + case SRE_CAT_Punct: + return (SRE_UNI_CAT_MASK_Punct & (1 << cat)) != 0; + case SRE_CAT_Space: + return ch == '\t' || ch == '\r' || ch == '\n' || ch == '\v' || ch == '\f' || (SRE_UNI_CAT_Z_MASK & (1 << cat)) != 0; + case SRE_CAT_Upper: + return cat == SRE_UNI_CAT_Lu; + case SRE_CAT_Word: + return (SRE_UNI_CAT_MASK_Word & (1 << cat)) != 0; + case SRE_CAT_XDigit: + return ch >= '0' && ch <= '9' || ch >= 'A' && ch <= 'F' || ch >= 'a' && ch <= 'f'; + default: + return FALSE; } - return 0; } -/* helpers */ +static SRE_CODE uni_lower(SRE_CODE ch) { + return (SRE_CODE)Py_UNICODE_TOLOWER((Py_UNICODE)ch); +} -static void -data_stack_dealloc(SRE_STATE* state) -{ - if (state->data_stack) { - PyMem_FREE(state->data_stack); - state->data_stack = NULL; - } - state->data_stack_size = state->data_stack_base = 0; +static SRE_CODE uni_upper(SRE_CODE ch) { + return 
(SRE_CODE)Py_UNICODE_TOUPPER((Py_UNICODE)ch); } -static int -data_stack_grow(SRE_STATE* state, Py_ssize_t size) -{ - Py_ssize_t minsize, cursize; - minsize = state->data_stack_base+size; - cursize = state->data_stack_size; - if (cursize < minsize) { - void* stack; - cursize = minsize+minsize/4+1024; - TRACE(("allocate/grow stack %d\n", cursize)); - stack = PyMem_REALLOC(state->data_stack, cursize); - if (!stack) { - data_stack_dealloc(state); - return SRE_ERROR_MEMORY; +static SRE_CODE uni_title(SRE_CODE ch) { + return (SRE_CODE)Py_UNICODE_TOTITLE((Py_UNICODE)ch); +} + +static SRE_ENCODING_TABLE sre_unicode_encoding = { +uni_in_category, +uni_lower, +uni_upper, +uni_title, +}; + +LOCAL(unsigned int) sre_min(unsigned int x, unsigned int y) { + return x <= y ? x : y; +} + +LOCAL(unsigned int) sre_max(unsigned int x, unsigned int y) { + return x >= y ? x : y; +} + +LOCAL(BOOL) in_charset(SRE_CODE* charset, SRE_CODE ch) { + // Charset format: max_char indexes... chunks... + int hi_byte = ch / 256; + int lo_byte = ch % 256; + int index; + SRE_CODE* chunk; + // Check against the maximum character code in the charset. + if (ch > charset[0]) + return FALSE; + // Get the chunk index (2 x 16-bit indexes in each codeword). + index = (charset[1 + hi_byte / 2] >> ((hi_byte % 2) * 16)) & 0xFFFF; + // Get the chunk. 
+ chunk = charset + 1 + charset[0] / 256 / 2 + 1 + index * (256 / SRE_BITS_PER_CODE); + return ((chunk[lo_byte / SRE_BITS_PER_CODE] >> (lo_byte % SRE_BITS_PER_CODE)) & 0x1) != 0; +} + +LOCAL(BOOL) in_charset_ignore(SRE_STATE* state, SRE_CODE* charset, SRE_CODE ch) { + return in_charset(charset, state->encoding->lower(ch)) || + in_charset(charset, state->encoding->upper(ch)) || + in_charset(charset, state->encoding->title(ch)); +} + +LOCAL(BOOL) in_range(SRE_CODE ch, SRE_CODE lower, SRE_CODE upper) { + return lower <= ch && ch <= upper; +} + +LOCAL(BOOL) in_range_ignore(SRE_STATE* state, SRE_CODE ch, SRE_CODE lower, SRE_CODE upper) { + return in_range(state->encoding->lower(ch), lower, upper) || + in_range(state->encoding->upper(ch), lower, upper) || + in_range(state->encoding->title(ch), lower, upper); +} + +LOCAL(BOOL) in_set(SRE_STATE* state, SRE_CODE* charset, SRE_CODE ch) { + // Check if character is a member of the given set. + SRE_CODE* charset_end = charset + charset[0]; + + charset++; + + do { + switch (charset[0]) { + case SRE_OP_CATEGORY: + // + if (state->encoding->in_category(charset[1], ch)) + return TRUE; + charset += 2; + break; + case SRE_OP_CHARSET: + // + if (in_charset(charset + 2, ch)) + return TRUE; + charset += 1 + charset[1]; + break; + case SRE_OP_LITERAL: + // + if (ch == charset[1]) + return TRUE; + charset += 2; + break; + case SRE_OP_RANGE: + // + if (in_range(ch, charset[1], charset[2])) + return TRUE; + charset += 3; + break; + default: + /* internal error -- there's not much we can do about it + here, so let's just pretend it didn't match... 
*/ + return FALSE; } - state->data_stack = (char *)stack; - state->data_stack_size = cursize; - } - return 0; + } while (charset < charset_end); + + return FALSE; } -/* generate 8-bit version */ +LOCAL(BOOL) in_set_ignore(SRE_STATE* state, SRE_CODE* charset, SRE_CODE ch) { + return in_set(state, charset, state->encoding->lower(ch)) || + in_set(state, charset, state->encoding->upper(ch)) || + in_set(state, charset, state->encoding->title(ch)); +} + +LOCAL(BOOL) same_char_ignore(SRE_STATE* state, SRE_CODE ch_1, SRE_CODE ch_2) { + return state->encoding->lower(ch_1) == state->encoding->lower(ch_2) || + state->encoding->upper(ch_1) == state->encoding->upper(ch_2) || + state->encoding->title(ch_1) == state->encoding->title(ch_2); +} + +/* generate bytestring version */ #define SRE_CHAR unsigned char -#define SRE_AT sre_at -#define SRE_COUNT sre_count -#define SRE_CHARSET sre_charset -#define SRE_INFO sre_info -#define SRE_MATCH sre_match -#define SRE_MATCH_CONTEXT sre_match_context -#define SRE_SEARCH sre_search -#define SRE_LITERAL_TEMPLATE sre_literal_template +#define SRE_MATCH sre_bmatch +#define SRE_SEARCH sre_bsearch +#define SRE_LITERAL_TEMPLATE sre_bliteral_template +#define SRE_AT_BOUNDARY sre_bat_boundary +#define SRE_CONTEXT sre_bcontext +#define SRE_SAVE_BACKTRACK sre_bsave_backtrack +#define SRE_DISCARD_BACKTRACK sre_bdiscard_backtrack +#define SRE_REFRESH_MARKS sre_brefresh_marks +#define SRE_DISCARD_UNTIL sre_bdiscard_until +#define SRE_CLEANUP sre_bcleanup +#define SRE_POSSIBLE_MATCH_AHEAD sre_bpossible_match_ahead +#define SRE_MATCH_MANY sre_bmatch_many +#define SRE_MATCH_UNTIL_TAIL sre_bmatch_until_tail +#define SRE_UNMATCH_UNTIL_TAIL sre_bunmatch_until_tail +#define SRE_UNMATCH_UNTIL_TAIL_REV sre_bunmatch_until_tail_rev +#define SRE_PRINT_TEXT sre_bprint_text #if defined(HAVE_UNICODE) #define SRE_RECURSIVE #include "_sre.c" +#undef SRE_PRINT_TEXT +#undef SRE_UNMATCH_UNTIL_TAIL_REV +#undef SRE_UNMATCH_UNTIL_TAIL +#undef SRE_MATCH_UNTIL_TAIL +#undef 
SRE_MATCH_MANY +#undef SRE_POSSIBLE_MATCH_AHEAD +#undef SRE_CLEANUP +#undef SRE_DISCARD_UNTIL +#undef SRE_REFRESH_MARKS +#undef SRE_DISCARD_BACKTRACK +#undef SRE_SAVE_BACKTRACK +#undef SRE_CONTEXT #undef SRE_RECURSIVE - +#undef SRE_AT_BOUNDARY #undef SRE_LITERAL_TEMPLATE #undef SRE_SEARCH #undef SRE_MATCH -#undef SRE_MATCH_CONTEXT -#undef SRE_INFO -#undef SRE_CHARSET -#undef SRE_COUNT -#undef SRE_AT #undef SRE_CHAR -/* generate 16-bit unicode version */ +/* generate unicode version */ #define SRE_CHAR Py_UNICODE -#define SRE_AT sre_uat -#define SRE_COUNT sre_ucount -#define SRE_CHARSET sre_ucharset -#define SRE_INFO sre_uinfo #define SRE_MATCH sre_umatch -#define SRE_MATCH_CONTEXT sre_umatch_context #define SRE_SEARCH sre_usearch #define SRE_LITERAL_TEMPLATE sre_uliteral_template +#define SRE_AT_BOUNDARY sre_uat_boundary +#define SRE_CONTEXT sre_ucontext +#define SRE_SAVE_BACKTRACK sre_usave_backtrack +#define SRE_DISCARD_BACKTRACK sre_udiscard_backtrack +#define SRE_REFRESH_MARKS sre_urefresh_marks +#define SRE_DISCARD_UNTIL sre_udiscard_until +#define SRE_CLEANUP sre_ucleanup +#define SRE_POSSIBLE_MATCH_AHEAD sre_upossible_match_ahead +#define SRE_MATCH_MANY sre_umatch_many +#define SRE_MATCH_UNTIL_TAIL sre_umatch_until_tail +#define SRE_UNMATCH_UNTIL_TAIL sre_uunmatch_until_tail +#define SRE_UNMATCH_UNTIL_TAIL_REV sre_uunmatch_until_tail_rev +#define SRE_PRINT_TEXT sre_uprint_text #endif #endif /* SRE_RECURSIVE */ @@ -333,1295 +523,3030 @@ /* the following section is compiled twice, with different character settings */ -LOCAL(int) -SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) -{ - /* check if pointer is at given position */ - - Py_ssize_t thisp, thatp; +typedef struct SRE_CONTEXT { + SRE_CHAR* text_beginning; // The true start of the text. + SRE_CHAR* text_ptr; + SRE_CHAR* text_start; // The start of the text to search/match. + SRE_CHAR* text_end; // The end of the text to search/match; also the true end, or treated as such. 
+ SRE_CHAR* final_linebreak; // The position of the final linebreak if it's final, otherwise NULL. + SRE_CODE* pattern_ptr; + SRE_CHAR** marks; // All the numbered and named marks (start and end of numbered and named groups). + int marks_size; // The total size of numbered and named text mark pointers. + SRE_BACKTRACK_CHUNK* backtrack_chunk; + SRE_BACKTRACK_ITEM* backtrack_item; +} SRE_CONTEXT; + +LOCAL(int) SRE_CLEANUP(SRE_CONTEXT* context, SRE_STATE* state, int result) { /* Frees every backtrack chunk that has a predecessor, plus the saved marks of all items; the chunk with no predecessor is emptied, stored back in context and state, and result is returned unchanged. */ + SRE_BACKTRACK_CHUNK* current; + SRE_BACKTRACK_ITEM* item; + + current = context->backtrack_chunk; + while (current->previous != NULL) { + SRE_BACKTRACK_CHUNK* previous = current->previous; + + for(item = current->items; item < &current->items[current->count]; item++) { + if (item->marks != NULL) + PyMem_FREE(item->marks); + } - switch (at) { + PyMem_FREE(current); + current = previous; + } - case SRE_AT_BEGINNING: - case SRE_AT_BEGINNING_STRING: - return ((void*) ptr == state->beginning); + for(item = current->items; item < &current->items[current->count]; item++) { + if (item->marks != NULL) + PyMem_FREE(item->marks); + } - case SRE_AT_BEGINNING_LINE: - return ((void*) ptr == state->beginning || - SRE_IS_LINEBREAK((int) ptr[-1])); + current->count = 0; - case SRE_AT_END: - return (((void*) (ptr+1) == state->end && - SRE_IS_LINEBREAK((int) ptr[0])) || - ((void*) ptr == state->end)); + context->backtrack_chunk = current; + state->backtrack_chunk = current; - case SRE_AT_END_LINE: - return ((void*) ptr == state->end || - SRE_IS_LINEBREAK((int) ptr[0])); + return result; +} - case SRE_AT_END_STRING: - return ((void*) ptr == state->end); +LOCAL(int) SRE_SAVE_BACKTRACK(SRE_CONTEXT* context, int op, BOOL save_marks) { + SRE_BACKTRACK_ITEM* backtrack_item; - case SRE_AT_BOUNDARY: - if (state->beginning == state->end) - return 0; - thatp = ((void*) ptr > state->beginning) ? - SRE_IS_WORD((int) ptr[-1]) : 0; - thisp = ((void*) ptr < state->end) ? 
- SRE_IS_WORD((int) ptr[0]) : 0; - return thisp != thatp; + if (context->backtrack_chunk->count >= SRE_BACKTRACK_CHUNK_SIZE) { + SRE_BACKTRACK_CHUNK* new_backtrack_chunk = (SRE_BACKTRACK_CHUNK*)PyMem_MALLOC(sizeof(SRE_BACKTRACK_CHUNK)); + if (new_backtrack_chunk == NULL) + return SRE_ERROR_MEMORY; - case SRE_AT_NON_BOUNDARY: - if (state->beginning == state->end) - return 0; - thatp = ((void*) ptr > state->beginning) ? - SRE_IS_WORD((int) ptr[-1]) : 0; - thisp = ((void*) ptr < state->end) ? - SRE_IS_WORD((int) ptr[0]) : 0; - return thisp == thatp; + new_backtrack_chunk->previous = context->backtrack_chunk; + new_backtrack_chunk->count = 0; + context->backtrack_chunk = new_backtrack_chunk; + } - case SRE_AT_LOC_BOUNDARY: - if (state->beginning == state->end) - return 0; - thatp = ((void*) ptr > state->beginning) ? - SRE_LOC_IS_WORD((int) ptr[-1]) : 0; - thisp = ((void*) ptr < state->end) ? - SRE_LOC_IS_WORD((int) ptr[0]) : 0; - return thisp != thatp; + backtrack_item = &context->backtrack_chunk->items[context->backtrack_chunk->count++]; + backtrack_item->op = op; + if (save_marks && context->marks_size > 0) { + backtrack_item->marks = PyMem_MALLOC(context->marks_size); + if (backtrack_item->marks == NULL) + return SRE_ERROR_MEMORY; + } else + backtrack_item->marks = NULL; + context->backtrack_item = backtrack_item; - case SRE_AT_LOC_NON_BOUNDARY: - if (state->beginning == state->end) - return 0; - thatp = ((void*) ptr > state->beginning) ? - SRE_LOC_IS_WORD((int) ptr[-1]) : 0; - thisp = ((void*) ptr < state->end) ? - SRE_LOC_IS_WORD((int) ptr[0]) : 0; - return thisp == thatp; + return 0; +} -#if defined(HAVE_UNICODE) - case SRE_AT_UNI_BOUNDARY: - if (state->beginning == state->end) - return 0; - thatp = ((void*) ptr > state->beginning) ? - SRE_UNI_IS_WORD((int) ptr[-1]) : 0; - thisp = ((void*) ptr < state->end) ? 
- SRE_UNI_IS_WORD((int) ptr[0]) : 0; - return thisp != thatp; +LOCAL(void) SRE_DISCARD_BACKTRACK(SRE_CONTEXT* context) { + SRE_BACKTRACK_CHUNK* chunk = context->backtrack_chunk; + SRE_BACKTRACK_ITEM* item = &chunk->items[--chunk->count]; - case SRE_AT_UNI_NON_BOUNDARY: - if (state->beginning == state->end) - return 0; - thatp = ((void*) ptr > state->beginning) ? - SRE_UNI_IS_WORD((int) ptr[-1]) : 0; - thisp = ((void*) ptr < state->end) ? - SRE_UNI_IS_WORD((int) ptr[0]) : 0; - return thisp == thatp; -#endif + if (item->marks != NULL) + PyMem_FREE(item->marks); + if (chunk->count == 0 && chunk->previous != NULL) { + SRE_BACKTRACK_CHUNK* previous = chunk->previous; + PyMem_FREE(chunk); + context->backtrack_chunk = previous; } - return 0; + context->backtrack_item = item; } -LOCAL(int) -SRE_CHARSET(SRE_CODE* set, SRE_CODE ch) -{ - /* check if character is a member of the given set */ - - int ok = 1; +LOCAL(void) SRE_DISCARD_UNTIL(SRE_CONTEXT* context, int op) { + SRE_BACKTRACK_ITEM* item; for (;;) { - switch (*set++) { - - case SRE_OP_FAILURE: - return !ok; - - case SRE_OP_LITERAL: - /* */ - if (ch == set[0]) - return ok; - set++; + SRE_BACKTRACK_CHUNK* chunk = context->backtrack_chunk; + item = &chunk->items[chunk->count - 1]; + if (item->op == op) break; + SRE_DISCARD_BACKTRACK(context); + } - case SRE_OP_CATEGORY: - /* */ - if (sre_category(set[0], (int) ch)) - return ok; - set += 1; - break; + context->backtrack_item = item; +} - case SRE_OP_CHARSET: - if (sizeof(SRE_CODE) == 2) { - /* (16 bits per code word) */ - if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15)))) - return ok; - set += 16; - } - else { - /* (32 bits per code word) */ - if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31)))) - return ok; - set += 8; - } - break; +LOCAL(BOOL) SRE_AT_BOUNDARY(SRE_CONTEXT* context, SRE_STATE* state) { + int before = context->text_ptr > context->text_beginning && state->encoding->in_category(SRE_CAT_Word, context->text_ptr[-1]); + int after = context->text_ptr < 
context->text_end && state->encoding->in_category(SRE_CAT_Word, context->text_ptr[0]); + return before != after; +} - case SRE_OP_RANGE: - /* */ - if (set[0] <= ch && ch <= set[1]) - return ok; - set += 2; - break; - - case SRE_OP_NEGATE: - ok = !ok; - break; - - case SRE_OP_BIGCHARSET: - /* <256 blockindices> */ - { - Py_ssize_t count, block; - count = *(set++); - - if (sizeof(SRE_CODE) == 2) { - block = ((unsigned char*)set)[ch >> 8]; - set += 128; - if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15))) - return ok; - set += count*16; - } - else { - /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids - * warnings when c's type supports only numbers < N+1 */ - if (!(ch & ~65535)) - block = ((unsigned char*)set)[ch >> 8]; - else - block = -1; - set += 64; - if (block >=0 && - (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31)))) - return ok; - set += count*8; - } - break; - } +#define SRE_MARK_OP_SIZE 3 - default: - /* internal error -- there's not much we can do about it - here, so let's just pretend it didn't match... */ - return 0; - } +// Look ahead to see whether it could match. Returns 0 if couldn't match. 
+LOCAL(BOOL) SRE_POSSIBLE_MATCH_AHEAD(SRE_CONTEXT* context, SRE_STATE* state, SRE_CODE* tail) { + while (tail[0] == SRE_OP_MARK) + tail += SRE_MARK_OP_SIZE; + + switch (tail[0]) { + case SRE_OP_ANY: + return !state->encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[0]); + case SRE_OP_ANY_ALL: + return TRUE; + case SRE_OP_ANY_ALL_REV: + return TRUE; + case SRE_OP_ANY_REV: + return !state->encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[-1]); + case SRE_OP_BOUNDARY: + return SRE_AT_BOUNDARY(context, state); + case SRE_OP_CATEGORY: + return state->encoding->in_category(tail[1], context->text_ptr[0]); + case SRE_OP_CATEGORY_REV: + return state->encoding->in_category(tail[1], context->text_ptr[-1]); + case SRE_OP_CHARSET: + return in_charset(tail + 2, context->text_ptr[0]); + case SRE_OP_CHARSET_IGNORE: + return in_charset_ignore(state, tail + 2, context->text_ptr[0]); + case SRE_OP_CHARSET_IGNORE_REV: + return in_charset_ignore(state, tail + 2, context->text_ptr[-1]); + case SRE_OP_CHARSET_REV: + return in_charset(tail + 2, context->text_ptr[-1]); + case SRE_OP_END_OF_LINE: + return context->text_ptr >= context->text_end || state->encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[0]); + case SRE_OP_END_OF_STRING: + return context->text_ptr >= context->text_end; + case SRE_OP_END_OF_STRING_LN: + return context->text_ptr >= context->text_end || context->text_ptr == context->final_linebreak; + case SRE_OP_LITERAL: + return context->text_ptr[0] == (SRE_CHAR)tail[1]; + case SRE_OP_LITERAL_IGNORE: + return same_char_ignore(state, context->text_ptr[0], tail[1]); + case SRE_OP_LITERAL_IGNORE_REV: + return same_char_ignore(state, context->text_ptr[-1], tail[1]); + case SRE_OP_LITERAL_REV: + return context->text_ptr[-1] == (SRE_CHAR)tail[1]; + case SRE_OP_LITERAL_STRING: + return context->text_ptr[0] == (SRE_CHAR)tail[2]; + case SRE_OP_LITERAL_STRING_IGNORE: + return same_char_ignore(state, context->text_ptr[0], tail[2]); + case 
SRE_OP_LITERAL_STRING_IGNORE_REV: + return same_char_ignore(state, context->text_ptr[-(int)tail[1]], tail[2]); + case SRE_OP_LITERAL_STRING_REV: + return context->text_ptr[-(int)tail[1]] == (SRE_CHAR)tail[2]; + case SRE_OP_NOT_BOUNDARY: + return !SRE_AT_BOUNDARY(context, state); + case SRE_OP_NOT_CATEGORY: + return !state->encoding->in_category(tail[1], context->text_ptr[0]); + case SRE_OP_NOT_CATEGORY_REV: + return !state->encoding->in_category(tail[1], context->text_ptr[-1]); + case SRE_OP_NOT_CHARSET: + return !in_charset(tail + 2, context->text_ptr[0]); + case SRE_OP_NOT_CHARSET_IGNORE: + return !in_charset_ignore(state, tail + 2, context->text_ptr[0]); + case SRE_OP_NOT_CHARSET_IGNORE_REV: + return !in_charset_ignore(state, tail + 2, context->text_ptr[-1]); + case SRE_OP_NOT_CHARSET_REV: + return !in_charset(tail + 2, context->text_ptr[-1]); + case SRE_OP_NOT_LITERAL: + return context->text_ptr[0] != (SRE_CHAR)tail[1]; + case SRE_OP_NOT_LITERAL_IGNORE: + return !same_char_ignore(state, context->text_ptr[0], tail[1]); + case SRE_OP_NOT_LITERAL_IGNORE_REV: + return !same_char_ignore(state, context->text_ptr[-1], tail[1]); + case SRE_OP_NOT_LITERAL_REV: + return context->text_ptr[-1] != (SRE_CHAR)tail[1]; + case SRE_OP_NOT_SET: + return !in_set(state, tail + 1, context->text_ptr[0]); + case SRE_OP_NOT_SET_IGNORE: + return !in_set_ignore(state, tail + 1, context->text_ptr[0]); + case SRE_OP_NOT_SET_IGNORE_REV: + return !in_set_ignore(state, tail + 1, context->text_ptr[-1]); + case SRE_OP_NOT_SET_REV: + return !in_set(state, tail + 1, context->text_ptr[-1]); + case SRE_OP_SET: + return in_set(state, tail + 1, context->text_ptr[0]); + case SRE_OP_SET_IGNORE: + return in_set_ignore(state, tail + 1, context->text_ptr[0]); + case SRE_OP_SET_IGNORE_REV: + return in_set_ignore(state, tail + 1, context->text_ptr[-1]); + case SRE_OP_SET_REV: + return in_set(state, tail + 1, context->text_ptr[-1]); + case SRE_OP_START_OF_LINE: + return context->text_ptr == 
context->text_beginning || state->encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[-1]); + case SRE_OP_START_OF_STRING: + return context->text_ptr == context->text_beginning; + default: + return TRUE; } } -LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern); - -LOCAL(Py_ssize_t) -SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) -{ - SRE_CODE chr; - SRE_CHAR* ptr = (SRE_CHAR *)state->ptr; - SRE_CHAR* end = (SRE_CHAR *)state->end; - Py_ssize_t i; - - /* adjust end */ - if (maxcount < end - ptr && maxcount != 65535) - end = ptr + maxcount; - - switch (pattern[0]) { - - case SRE_OP_IN: - /* repeated set */ - TRACE(("|%p|%p|COUNT IN\n", pattern, ptr)); - while (ptr < end && SRE_CHARSET(pattern + 2, *ptr)) - ptr++; +// Match up to the maximum. +LOCAL(void) SRE_MATCH_MANY(SRE_CONTEXT* context, SRE_STATE* state, SRE_CHAR* max_ptr, SRE_CODE* body) { + switch (body[0]) { + case SRE_OP_ANY: + while (context->text_ptr < max_ptr && !state->encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_ANY_ALL: + while (context->text_ptr < max_ptr) + context->text_ptr++; + break; + case SRE_OP_ANY_ALL_REV: + while (context->text_ptr > max_ptr) + context->text_ptr--; + break; + case SRE_OP_ANY_REV: + while (context->text_ptr > max_ptr && !state->encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_CATEGORY: + while (context->text_ptr < max_ptr && state->encoding->in_category(body[1], context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_CATEGORY_REV: + while (context->text_ptr > max_ptr && state->encoding->in_category(body[1], context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_CHARSET: + while (context->text_ptr < max_ptr && in_charset(body + 2, context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_CHARSET_IGNORE: + while (context->text_ptr < max_ptr && in_charset_ignore(state, body + 2, 
context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_CHARSET_IGNORE_REV: + while (context->text_ptr > max_ptr && in_charset_ignore(state, body + 2, context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_CHARSET_REV: + while (context->text_ptr > max_ptr && in_charset(body + 2, context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_LITERAL: + while (context->text_ptr < max_ptr && context->text_ptr[0] == (SRE_CHAR)body[1]) + context->text_ptr++; + break; + case SRE_OP_LITERAL_IGNORE: + while (context->text_ptr < max_ptr && same_char_ignore(state, context->text_ptr[0], body[1])) + context->text_ptr++; + break; + case SRE_OP_LITERAL_IGNORE_REV: /* Reverse scan: keep consuming while the preceding character equals the literal under lower/upper/title folding (positive test, like LITERAL_IGNORE and LITERAL_REV). */ + while (context->text_ptr > max_ptr && same_char_ignore(state, context->text_ptr[-1], body[1])) + context->text_ptr--; + break; + case SRE_OP_LITERAL_REV: + while (context->text_ptr > max_ptr && context->text_ptr[-1] == (SRE_CHAR)body[1]) + context->text_ptr--; + break; + case SRE_OP_NOT_CATEGORY: + while (context->text_ptr < max_ptr && !state->encoding->in_category(body[1], context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_NOT_CATEGORY_REV: + while (context->text_ptr > max_ptr && !state->encoding->in_category(body[1], context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_NOT_CHARSET: + while (context->text_ptr < max_ptr && !in_charset(body + 2, context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_NOT_CHARSET_IGNORE: + while (context->text_ptr < max_ptr && !in_charset_ignore(state, body + 2, context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_NOT_CHARSET_IGNORE_REV: + while (context->text_ptr > max_ptr && !in_charset_ignore(state, body + 2, context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_NOT_CHARSET_REV: + while (context->text_ptr > max_ptr && !in_charset(body + 2, context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_NOT_LITERAL: + while (context->text_ptr < max_ptr && 
context->text_ptr[0] != (SRE_CHAR)body[1]) + context->text_ptr++; + break; + case SRE_OP_NOT_LITERAL_IGNORE: + while (context->text_ptr < max_ptr && !same_char_ignore(state, context->text_ptr[0], body[1])) + context->text_ptr++; + break; + case SRE_OP_NOT_LITERAL_IGNORE_REV: + while (context->text_ptr > max_ptr && !same_char_ignore(state, context->text_ptr[-1], body[1])) + context->text_ptr--; + break; + case SRE_OP_NOT_LITERAL_REV: + while (context->text_ptr > max_ptr && context->text_ptr[-1] != (SRE_CHAR)body[1]) + context->text_ptr--; + break; + case SRE_OP_NOT_RANGE: + while (context->text_ptr < max_ptr && !in_range(context->text_ptr[0], body[1], body[2])) + context->text_ptr++; + break; + case SRE_OP_NOT_RANGE_IGNORE: + while (context->text_ptr < max_ptr && !in_range_ignore(state, context->text_ptr[0], body[1], body[2])) + context->text_ptr++; + break; + case SRE_OP_NOT_RANGE_IGNORE_REV: + while (context->text_ptr > max_ptr && !in_range_ignore(state, context->text_ptr[-1], body[1], body[2])) + context->text_ptr--; + break; + case SRE_OP_NOT_RANGE_REV: + while (context->text_ptr > max_ptr && !in_range(context->text_ptr[-1], body[1], body[2])) + context->text_ptr--; + break; + case SRE_OP_NOT_SET: + while (context->text_ptr < max_ptr && !in_set(state, body + 1, context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_NOT_SET_IGNORE: + while (context->text_ptr < max_ptr && !in_set_ignore(state, body + 1, context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_NOT_SET_IGNORE_REV: + while (context->text_ptr > max_ptr && !in_set_ignore(state, body + 1, context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_NOT_SET_REV: + while (context->text_ptr > max_ptr && !in_set(state, body + 1, context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_RANGE: + while (context->text_ptr < max_ptr && in_range(context->text_ptr[0], body[1], body[2])) + context->text_ptr++; + break; + case SRE_OP_RANGE_IGNORE: + while 
(context->text_ptr < max_ptr && in_range_ignore(state, context->text_ptr[0], body[1], body[2])) + context->text_ptr++; + break; + case SRE_OP_RANGE_IGNORE_REV: + while (context->text_ptr > max_ptr && in_range_ignore(state, context->text_ptr[-1], body[1], body[2])) + context->text_ptr--; + break; + case SRE_OP_RANGE_REV: + while (context->text_ptr > max_ptr && in_range(context->text_ptr[-1], body[1], body[2])) + context->text_ptr--; + break; + case SRE_OP_SET: + while (context->text_ptr < max_ptr && in_set(state, body + 1, context->text_ptr[0])) + context->text_ptr++; break; + case SRE_OP_SET_IGNORE: + while (context->text_ptr < max_ptr && in_set_ignore(state, body + 1, context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_SET_IGNORE_REV: + while (context->text_ptr > max_ptr && in_set_ignore(state, body + 1, context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_SET_REV: + while (context->text_ptr > max_ptr && in_set(state, body + 1, context->text_ptr[-1])) + context->text_ptr--; + break; + } +} + +// Unmatch down to the minimum until the tail could match. Returns 0 if min_ptr is reached but still no match. +LOCAL(int) SRE_UNMATCH_UNTIL_TAIL(SRE_CONTEXT* context, SRE_STATE* state, SRE_CHAR* min_ptr, SRE_CODE* tail) { + while (tail[0] == SRE_OP_MARK) + tail += SRE_MARK_OP_SIZE; + switch (tail[0]) { case SRE_OP_ANY: - /* repeated dot wildcard. */ - TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr)); - while (ptr < end && !SRE_IS_LINEBREAK(*ptr)) - ptr++; + while (context->text_ptr >= min_ptr && state->encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[0])) + context->text_ptr--; break; - case SRE_OP_ANY_ALL: - /* repeated dot wildcard. 
skip to the end of the target - string, and backtrack from there */ - TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr)); - ptr = end; break; - + case SRE_OP_BOUNDARY: + while (context->text_ptr >= min_ptr && !SRE_AT_BOUNDARY(context, state)) + context->text_ptr--; + break; + case SRE_OP_CATEGORY: + while (context->text_ptr >= min_ptr && !state->encoding->in_category(tail[1], context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_CHARSET: + while (context->text_ptr >= min_ptr && !in_charset(tail + 2, context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_CHARSET_IGNORE: + while (context->text_ptr >= min_ptr && !in_charset_ignore(state, tail + 2, context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_END_OF_LINE: + while (context->text_ptr >= min_ptr && context->text_ptr < context->text_end && + !state->encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_END_OF_STRING: + while (context->text_ptr >= min_ptr && context->text_ptr < context->text_end) + context->text_ptr--; + break; + case SRE_OP_END_OF_STRING_LN: + while (context->text_ptr >= min_ptr && context->text_ptr < context->text_end && context->text_ptr != context->final_linebreak) + context->text_ptr--; + break; case SRE_OP_LITERAL: - /* repeated literal */ - chr = pattern[1]; - TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr)); - while (ptr < end && (SRE_CODE) *ptr == chr) - ptr++; + while (context->text_ptr >= min_ptr && context->text_ptr[0] != (SRE_CHAR)tail[1]) + context->text_ptr--; break; - case SRE_OP_LITERAL_IGNORE: - /* repeated literal */ - chr = pattern[1]; - TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr)); - while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr) - ptr++; + while (context->text_ptr >= min_ptr && !same_char_ignore(state, context->text_ptr[0], tail[1])) + context->text_ptr--; + break; + case SRE_OP_LITERAL_STRING: + while (context->text_ptr >= min_ptr && 
context->text_ptr[0] != (SRE_CHAR)tail[2]) + context->text_ptr--; + break; + case SRE_OP_LITERAL_STRING_IGNORE: + while (context->text_ptr >= min_ptr && !same_char_ignore(state, context->text_ptr[0], tail[2])) + context->text_ptr--; + break; + case SRE_OP_NOT_BOUNDARY: + while (context->text_ptr >= min_ptr && SRE_AT_BOUNDARY(context, state)) + context->text_ptr--; + break; + case SRE_OP_NOT_CATEGORY: + while (context->text_ptr >= min_ptr && state->encoding->in_category(tail[1], context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_NOT_CHARSET: + while (context->text_ptr >= min_ptr && in_charset(tail + 2, context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_NOT_CHARSET_IGNORE: + while (context->text_ptr >= min_ptr && in_charset_ignore(state, tail + 2, context->text_ptr[0])) + context->text_ptr--; break; - case SRE_OP_NOT_LITERAL: - /* repeated non-literal */ - chr = pattern[1]; - TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr)); - while (ptr < end && (SRE_CODE) *ptr != chr) - ptr++; + while (context->text_ptr >= min_ptr && context->text_ptr[0] == (SRE_CHAR)tail[1]) + context->text_ptr--; break; - case SRE_OP_NOT_LITERAL_IGNORE: - /* repeated non-literal */ - chr = pattern[1]; - TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr)); - while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr) - ptr++; + while (context->text_ptr >= min_ptr && same_char_ignore(state, context->text_ptr[0], tail[1])) + context->text_ptr--; + break; + case SRE_OP_NOT_RANGE: + while (context->text_ptr >= min_ptr && in_range(context->text_ptr[0], tail[1], tail[2])) + context->text_ptr--; + break; + case SRE_OP_NOT_RANGE_IGNORE: + while (context->text_ptr >= min_ptr && in_range_ignore(state, context->text_ptr[0], tail[1], tail[2])) + context->text_ptr--; + break; + case SRE_OP_NOT_SET: + while (context->text_ptr >= min_ptr && in_set(state, tail + 1, context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_NOT_SET_IGNORE: + 
while (context->text_ptr >= min_ptr && in_set_ignore(state, tail + 1, context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_RANGE: + while (context->text_ptr >= min_ptr && !in_range(context->text_ptr[0], tail[1], tail[2])) + context->text_ptr--; + break; + case SRE_OP_RANGE_IGNORE: + while (context->text_ptr >= min_ptr && !in_range_ignore(state, context->text_ptr[0], tail[1], tail[2])) + context->text_ptr--; + break; + case SRE_OP_SET: + while (context->text_ptr >= min_ptr && !in_set(state, tail + 1, context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_SET_IGNORE: + while (context->text_ptr >= min_ptr && !in_set_ignore(state, tail + 1, context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_START_OF_LINE: + while (context->text_ptr >= min_ptr && context->text_ptr != context->text_beginning && + state->encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_START_OF_STRING: + while (context->text_ptr >= min_ptr && context->text_ptr != context->text_beginning) + context->text_ptr--; break; - - default: - /* repeated single character pattern */ - TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr)); - while ((SRE_CHAR*) state->ptr < end) { - i = SRE_MATCH(state, pattern); - if (i < 0) - return i; - if (!i) - break; - } - TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, - (SRE_CHAR*) state->ptr - ptr)); - return (SRE_CHAR*) state->ptr - ptr; } - TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr)); - return ptr - (SRE_CHAR*) state->ptr; + return context->text_ptr >= min_ptr; } -#if 0 /* not used in this release */ -LOCAL(int) -SRE_INFO(SRE_STATE* state, SRE_CODE* pattern) -{ - /* check if an SRE_OP_INFO block matches at the current position. 
- returns the number of SRE_CODE objects to skip if successful, 0 - if no match */ - - SRE_CHAR* end = state->end; - SRE_CHAR* ptr = state->ptr; - Py_ssize_t i; - - /* check minimal length */ - if (pattern[3] && (end - ptr) < pattern[3]) - return 0; +// Unmatch down to the minimum until the tail could match. Returns 0 if min_ptr is reached but still no match. +LOCAL(int) SRE_UNMATCH_UNTIL_TAIL_REV(SRE_CONTEXT* context, SRE_STATE* state, SRE_CHAR* min_ptr, SRE_CODE* tail) { + while (tail[0] == SRE_OP_MARK) + tail += SRE_MARK_OP_SIZE; - /* check known prefix */ - if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) { - /* */ - for (i = 0; i < pattern[5]; i++) - if ((SRE_CODE) ptr[i] != pattern[7 + i]) - return 0; - return pattern[0] + 2 * pattern[6]; + switch (tail[0]) { + case SRE_OP_ANY_ALL_REV: + break; + case SRE_OP_ANY_REV: + while (context->text_ptr <= min_ptr && state->encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_BOUNDARY: + while (context->text_ptr <= min_ptr && !SRE_AT_BOUNDARY(context, state)) + context->text_ptr++; + break; + case SRE_OP_CATEGORY_REV: + while (context->text_ptr <= min_ptr && !state->encoding->in_category(tail[1], context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_CHARSET_IGNORE_REV: + while (context->text_ptr <= min_ptr && !in_charset_ignore(state, tail + 2, context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_CHARSET_REV: + while (context->text_ptr <= min_ptr && !in_charset(tail + 2, context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_END_OF_LINE: + while (context->text_ptr <= min_ptr && context->text_ptr < context->text_end && + !state->encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_END_OF_STRING: + while (context->text_ptr <= min_ptr && context->text_ptr < context->text_end) + context->text_ptr++; + break; + case SRE_OP_END_OF_STRING_LN: + while 
(context->text_ptr <= min_ptr && context->text_ptr < context->text_end && context->text_ptr != context->final_linebreak) + context->text_ptr++; + break; + case SRE_OP_LITERAL_IGNORE_REV: + while (context->text_ptr <= min_ptr && !same_char_ignore(state, context->text_ptr[-1], tail[1])) + context->text_ptr++; + break; + case SRE_OP_LITERAL_REV: + while (context->text_ptr <= min_ptr && context->text_ptr[-1] != (SRE_CHAR)tail[1]) + context->text_ptr++; + break; + case SRE_OP_LITERAL_STRING_IGNORE_REV: + while (context->text_ptr <= min_ptr && state->encoding->lower(context->text_ptr[-(int)tail[1]]) != (SRE_CHAR)tail[2]) + context->text_ptr++; + break; + case SRE_OP_LITERAL_STRING_REV: + while (context->text_ptr <= min_ptr && context->text_ptr[-(int)tail[1]] != (SRE_CHAR)tail[2]) + context->text_ptr++; + break; + case SRE_OP_NOT_BOUNDARY: + while (context->text_ptr <= min_ptr && SRE_AT_BOUNDARY(context, state)) + context->text_ptr++; + break; + case SRE_OP_NOT_CATEGORY_REV: + while (context->text_ptr <= min_ptr && state->encoding->in_category(tail[1], context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_NOT_CHARSET_IGNORE_REV: + while (context->text_ptr <= min_ptr && in_charset_ignore(state, tail + 2, context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_NOT_CHARSET_REV: + while (context->text_ptr <= min_ptr && in_charset(tail + 2, context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_NOT_LITERAL_IGNORE_REV: + while (context->text_ptr <= min_ptr && same_char_ignore(state, context->text_ptr[-1], tail[1])) + context->text_ptr++; + break; + case SRE_OP_NOT_LITERAL_REV: + while (context->text_ptr <= min_ptr && context->text_ptr[-1] == (SRE_CHAR)tail[1]) + context->text_ptr++; + break; + case SRE_OP_NOT_RANGE_IGNORE_REV: + while (context->text_ptr <= min_ptr && in_range_ignore(state, context->text_ptr[-1], tail[1], tail[2])) + context->text_ptr++; + break; + case SRE_OP_NOT_RANGE_REV: + while (context->text_ptr <= min_ptr && 
in_range(context->text_ptr[-1], tail[1], tail[2])) + context->text_ptr++; + break; + case SRE_OP_NOT_SET_IGNORE_REV: + while (context->text_ptr <= min_ptr && in_set_ignore(state, tail + 1, context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_NOT_SET_REV: + while (context->text_ptr <= min_ptr && in_set(state, tail + 1, context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_RANGE_IGNORE_REV: + while (context->text_ptr <= min_ptr && !in_range_ignore(state, context->text_ptr[-1], tail[1], tail[2])) + context->text_ptr++; + break; + case SRE_OP_RANGE_REV: + while (context->text_ptr <= min_ptr && !in_range(context->text_ptr[-1], tail[1], tail[2])) + context->text_ptr++; + break; + case SRE_OP_SET_IGNORE_REV: + while (context->text_ptr <= min_ptr && !in_set_ignore(state, tail + 1, context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_SET_REV: + while (context->text_ptr <= min_ptr && !in_set(state, tail + 1, context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_START_OF_LINE: + while (context->text_ptr <= min_ptr && context->text_ptr != context->text_beginning && + state->encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_START_OF_STRING: + while (context->text_ptr <= min_ptr && context->text_ptr != context->text_beginning) + context->text_ptr++; + break; } - return pattern[0]; + + return context->text_ptr <= min_ptr; } -#endif -/* The macros below should be used to protect recursive SRE_MATCH() - * calls that *failed* and do *not* return immediately (IOW, those - * that will backtrack). 
Explaining: - * - * - Recursive SRE_MATCH() returned true: that's usually a success - * (besides atypical cases like ASSERT_NOT), therefore there's no - * reason to restore lastmark; - * - * - Recursive SRE_MATCH() returned false but the current SRE_MATCH() - * is returning to the caller: If the current SRE_MATCH() is the - * top function of the recursion, returning false will be a matching - * failure, and it doesn't matter where lastmark is pointing to. - * If it's *not* the top function, it will be a recursive SRE_MATCH() - * failure by itself, and the calling SRE_MATCH() will have to deal - * with the failure by the same rules explained here (it will restore - * lastmark by itself if necessary); - * - * - Recursive SRE_MATCH() returned false, and will continue the - * outside 'for' loop: must be protected when breaking, since the next - * OP could potentially depend on lastmark; - * - * - Recursive SRE_MATCH() returned false, and will be called again - * inside a local for/while loop: must be protected between each - * loop iteration, since the recursive SRE_MATCH() could do anything, - * and could potentially depend on lastmark. - * - * For more information, check the discussion at SF patch #712900. 
- */ -#define LASTMARK_SAVE() \ - do { \ - ctx->lastmark = state->lastmark; \ - ctx->lastindex = state->lastindex; \ - } while (0) -#define LASTMARK_RESTORE() \ - do { \ - state->lastmark = ctx->lastmark; \ - state->lastindex = ctx->lastindex; \ - } while (0) - -#define RETURN_ERROR(i) do { return i; } while(0) -#define RETURN_FAILURE do { ret = 0; goto exit; } while(0) -#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0) - -#define RETURN_ON_ERROR(i) \ - do { if (i < 0) RETURN_ERROR(i); } while (0) -#define RETURN_ON_SUCCESS(i) \ - do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0) -#define RETURN_ON_FAILURE(i) \ - do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0) - -#define SFY(x) #x - -#define DATA_STACK_ALLOC(state, type, ptr) \ -do { \ - alloc_pos = state->data_stack_base; \ - TRACE(("allocating %s in %d (%d)\n", \ - SFY(type), alloc_pos, sizeof(type))); \ - if (state->data_stack_size < alloc_pos+sizeof(type)) { \ - int j = data_stack_grow(state, sizeof(type)); \ - if (j < 0) return j; \ - if (ctx_pos != -1) \ - DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \ - } \ - ptr = (type*)(state->data_stack+alloc_pos); \ - state->data_stack_base += sizeof(type); \ -} while (0) - -#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \ -do { \ - TRACE(("looking up %s at %d\n", SFY(type), pos)); \ - ptr = (type*)(state->data_stack+pos); \ -} while (0) - -#define DATA_STACK_PUSH(state, data, size) \ -do { \ - TRACE(("copy data in %p to %d (%d)\n", \ - data, state->data_stack_base, size)); \ - if (state->data_stack_size < state->data_stack_base+size) { \ - int j = data_stack_grow(state, size); \ - if (j < 0) return j; \ - if (ctx_pos != -1) \ - DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \ - } \ - memcpy(state->data_stack+state->data_stack_base, data, size); \ - state->data_stack_base += size; \ -} while (0) - -#define DATA_STACK_POP(state, data, size, discard) \ -do { \ - TRACE(("copy data to %p from %d 
(%d)\n", \ - data, state->data_stack_base-size, size)); \ - memcpy(data, state->data_stack+state->data_stack_base-size, size); \ - if (discard) \ - state->data_stack_base -= size; \ -} while (0) - -#define DATA_STACK_POP_DISCARD(state, size) \ -do { \ - TRACE(("discard data from %d (%d)\n", \ - state->data_stack_base-size, size)); \ - state->data_stack_base -= size; \ -} while(0) - -#define DATA_PUSH(x) \ - DATA_STACK_PUSH(state, (x), sizeof(*(x))) -#define DATA_POP(x) \ - DATA_STACK_POP(state, (x), sizeof(*(x)), 1) -#define DATA_POP_DISCARD(x) \ - DATA_STACK_POP_DISCARD(state, sizeof(*(x))) -#define DATA_ALLOC(t,p) \ - DATA_STACK_ALLOC(state, t, p) -#define DATA_LOOKUP_AT(t,p,pos) \ - DATA_STACK_LOOKUP_AT(state,t,p,pos) - -#define MARK_PUSH(lastmark) \ - do if (lastmark > 0) { \ - i = lastmark; /* ctx->lastmark may change if reallocated */ \ - DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \ - } while (0) -#define MARK_POP(lastmark) \ - do if (lastmark > 0) { \ - DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \ - } while (0) -#define MARK_POP_KEEP(lastmark) \ - do if (lastmark > 0) { \ - DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \ - } while (0) -#define MARK_POP_DISCARD(lastmark) \ - do if (lastmark > 0) { \ - DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \ - } while (0) - -#define JUMP_NONE 0 -#define JUMP_MAX_UNTIL_1 1 -#define JUMP_MAX_UNTIL_2 2 -#define JUMP_MAX_UNTIL_3 3 -#define JUMP_MIN_UNTIL_1 4 -#define JUMP_MIN_UNTIL_2 5 -#define JUMP_MIN_UNTIL_3 6 -#define JUMP_REPEAT 7 -#define JUMP_REPEAT_ONE_1 8 -#define JUMP_REPEAT_ONE_2 9 -#define JUMP_MIN_REPEAT_ONE 10 -#define JUMP_BRANCH 11 -#define JUMP_ASSERT 12 -#define JUMP_ASSERT_NOT 13 - -#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \ - DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \ - nextctx->last_ctx_pos = ctx_pos; \ - nextctx->jump = jumpvalue; \ - nextctx->pattern = nextpattern; \ - ctx_pos = alloc_pos; \ - ctx = nextctx; \ - goto 
entrance; \ - jumplabel: \ - while (0) /* gcc doesn't like labels at end of scopes */ \ - -typedef struct { - Py_ssize_t last_ctx_pos; - Py_ssize_t jump; - SRE_CHAR* ptr; - SRE_CODE* pattern; - Py_ssize_t count; - Py_ssize_t lastmark; - Py_ssize_t lastindex; - union { - SRE_CODE chr; - SRE_REPEAT* rep; - } u; -} SRE_MATCH_CONTEXT; +// Match up to the maximum until the tail could match. Returns 0 if max_ptr is reached but still no match. +LOCAL(int) SRE_MATCH_UNTIL_TAIL(SRE_CONTEXT* context, SRE_STATE* state, SRE_CHAR* max_ptr, SRE_CODE* body, SRE_CODE* tail) { + while (tail[0] == SRE_OP_MARK) + tail += SRE_MARK_OP_SIZE; -/* check if string matches the given pattern. returns <0 for - error, 0 for failure, and 1 for success */ -LOCAL(Py_ssize_t) -SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) -{ - SRE_CHAR* end = (SRE_CHAR *)state->end; - Py_ssize_t alloc_pos, ctx_pos = -1; - Py_ssize_t i, ret = 0; - Py_ssize_t jump; - unsigned int sigcount=0; - - SRE_MATCH_CONTEXT* ctx; - SRE_MATCH_CONTEXT* nextctx; - - TRACE(("|%p|%p|ENTER\n", pattern, state->ptr)); - - DATA_ALLOC(SRE_MATCH_CONTEXT, ctx); - ctx->last_ctx_pos = -1; - ctx->jump = JUMP_NONE; - ctx->pattern = pattern; - ctx_pos = alloc_pos; - -entrance: - - ctx->ptr = (SRE_CHAR *)state->ptr; - - if (ctx->pattern[0] == SRE_OP_INFO) { - /* optimization info block */ - /* <1=skip> <2=flags> <3=min> ... 
*/ - if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) { - TRACE(("reject (got %d chars, need %d)\n", - (end - ctx->ptr), ctx->pattern[3])); - RETURN_FAILURE; + switch (body[0]) { + case SRE_OP_ANY: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr >= max_ptr || state->encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[0])) + return 0; + context->text_ptr++; + } + break; + case SRE_OP_ANY_ALL: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr >= max_ptr) + return 0; + context->text_ptr++; + } + break; + case SRE_OP_ANY_ALL_REV: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr <= max_ptr) + return 0; + context->text_ptr--; + } + break; + case SRE_OP_ANY_REV: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr <= max_ptr || state->encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[-1])) + return 0; + context->text_ptr--; } - ctx->pattern += ctx->pattern[1] + 1; + break; + case SRE_OP_CATEGORY: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr >= max_ptr || !state->encoding->in_category(body[1], context->text_ptr[0])) + return 0; + context->text_ptr++; + } + break; + case SRE_OP_CHARSET: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr >= max_ptr || !in_charset(body + 2, context->text_ptr[0])) + return 0; + context->text_ptr++; + } + break; + case SRE_OP_CHARSET_IGNORE: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr >= max_ptr || !in_charset_ignore(state, body + 2, context->text_ptr[0])) + return 0; + context->text_ptr++; + } + break; + case SRE_OP_CHARSET_IGNORE_REV: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr <= max_ptr || !in_charset_ignore(state, body + 2, context->text_ptr[-1])) + return 0; + context->text_ptr--; + } + break; + case SRE_OP_CHARSET_REV: + while 
(!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr <= max_ptr || !in_charset(body + 2, context->text_ptr[-1])) + return 0; + context->text_ptr--; + } + break; + case SRE_OP_LITERAL: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr >= max_ptr || context->text_ptr[0] != (SRE_CHAR)body[1]) + return 0; + context->text_ptr++; + } + break; + case SRE_OP_LITERAL_IGNORE: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr >= max_ptr || !same_char_ignore(state, context->text_ptr[0], body[1])) + return 0; + context->text_ptr++; + } + break; + case SRE_OP_LITERAL_IGNORE_REV: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr <= max_ptr || !same_char_ignore(state, context->text_ptr[-1], body[1])) + return 0; + context->text_ptr--; + } + break; + case SRE_OP_LITERAL_REV: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr <= max_ptr || context->text_ptr[-1] != (SRE_CHAR)body[1]) + return 0; + context->text_ptr--; + } + break; + case SRE_OP_NOT_CATEGORY: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr >= max_ptr || state->encoding->in_category(body[1], context->text_ptr[0])) + return 0; + context->text_ptr++; + } + break; + case SRE_OP_NOT_CHARSET: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr >= max_ptr || in_charset(body + 2, context->text_ptr[0])) + return 0; + context->text_ptr++; + } + break; + case SRE_OP_NOT_CHARSET_IGNORE: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr >= max_ptr || in_charset_ignore(state, body + 2, context->text_ptr[0])) + return 0; + context->text_ptr++; + } + break; + case SRE_OP_NOT_CHARSET_IGNORE_REV: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr <= max_ptr || in_charset_ignore(state, body + 2, context->text_ptr[-1])) + return 0; + 
context->text_ptr--; + } + break; + case SRE_OP_NOT_CHARSET_REV: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr <= max_ptr || in_charset(body + 2, context->text_ptr[-1])) + return 0; + context->text_ptr--; + } + break; + case SRE_OP_NOT_LITERAL: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr >= max_ptr || context->text_ptr[0] == (SRE_CHAR)body[1]) + return 0; + context->text_ptr++; + } + break; + case SRE_OP_NOT_LITERAL_IGNORE: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr >= max_ptr || same_char_ignore(state, context->text_ptr[0], body[1])) + return 0; + context->text_ptr++; + } + break; + case SRE_OP_NOT_LITERAL_IGNORE_REV: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr <= max_ptr || same_char_ignore(state, context->text_ptr[-1], body[1])) + return 0; + context->text_ptr--; + } + break; + case SRE_OP_NOT_LITERAL_REV: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr <= max_ptr || context->text_ptr[-1] == (SRE_CHAR)body[1]) + return 0; + context->text_ptr--; + } + break; + case SRE_OP_NOT_RANGE: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr >= max_ptr || in_range(context->text_ptr[0], body[1], body[2])) + return 0; + context->text_ptr++; + } + break; + case SRE_OP_NOT_RANGE_IGNORE: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr >= max_ptr || in_range_ignore(state, context->text_ptr[0], body[1], body[2])) + return 0; + context->text_ptr++; + } + break; + case SRE_OP_NOT_RANGE_IGNORE_REV: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr <= max_ptr || in_range_ignore(state, context->text_ptr[-1], body[1], body[2])) + return 0; + context->text_ptr--; + } + break; + case SRE_OP_NOT_RANGE_REV: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr <= 
max_ptr || in_range(context->text_ptr[-1], body[1], body[2])) + return 0; + context->text_ptr--; + } + break; + case SRE_OP_NOT_SET: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr >= max_ptr || in_set(state, body + 1, context->text_ptr[0])) + return 0; + context->text_ptr++; + } + break; + case SRE_OP_NOT_SET_IGNORE: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr >= max_ptr || in_set_ignore(state, body + 1, context->text_ptr[0])) + return 0; + context->text_ptr++; + } + break; + case SRE_OP_NOT_SET_IGNORE_REV: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr <= max_ptr || in_set_ignore(state, body + 1, context->text_ptr[-1])) + return 0; + context->text_ptr--; + } + break; + case SRE_OP_NOT_SET_REV: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr <= max_ptr || in_set(state, body + 1, context->text_ptr[-1])) + return 0; + context->text_ptr--; + } + break; + case SRE_OP_RANGE: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr >= max_ptr || !in_range(context->text_ptr[0], body[1], body[2])) + return 0; + context->text_ptr++; + } + break; + case SRE_OP_RANGE_IGNORE: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr >= max_ptr || !in_range_ignore(state, context->text_ptr[0], body[1], body[2])) + return 0; + context->text_ptr++; + } + break; + case SRE_OP_RANGE_IGNORE_REV: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr <= max_ptr || !in_range_ignore(state, context->text_ptr[-1], body[1], body[2])) + return 0; + context->text_ptr--; + } + break; + case SRE_OP_RANGE_REV: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr <= max_ptr || !in_range(context->text_ptr[-1], body[1], body[2])) + return 0; + context->text_ptr--; + } + break; + case SRE_OP_SET: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, 
state, tail)) { + if (context->text_ptr >= max_ptr || !in_set(state, body + 1, context->text_ptr[0])) + return 0; + context->text_ptr++; + } + break; + case SRE_OP_SET_IGNORE: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr >= max_ptr || !in_set_ignore(state, body + 1, context->text_ptr[0])) + return 0; + context->text_ptr++; + } + break; + case SRE_OP_SET_IGNORE_REV: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr <= max_ptr || !in_set_ignore(state, body + 1, context->text_ptr[-1])) + return 0; + context->text_ptr--; + } + break; + case SRE_OP_SET_REV: + while (!SRE_POSSIBLE_MATCH_AHEAD(context, state, tail)) { + if (context->text_ptr <= max_ptr || !in_set(state, body + 1, context->text_ptr[-1])) + return 0; + context->text_ptr--; + } + break; } - for (;;) { - ++sigcount; - if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals()) - RETURN_ERROR(SRE_ERROR_INTERRUPTED); - - switch (*ctx->pattern++) { - - case SRE_OP_MARK: - /* set mark */ - /* */ - TRACE(("|%p|%p|MARK %d\n", ctx->pattern, - ctx->ptr, ctx->pattern[0])); - i = ctx->pattern[0]; - if (i & 1) - state->lastindex = i/2 + 1; - if (i > state->lastmark) { - /* state->lastmark is the highest valid index in the - state->mark array. If it is increased by more than 1, - the intervening marks must be set to NULL to signal - that these marks have not been encountered. 
*/ - Py_ssize_t j = state->lastmark + 1; - while (j < i) - state->mark[j++] = NULL; - state->lastmark = i; - } - state->mark[i] = ctx->ptr; - ctx->pattern++; - break; - - case SRE_OP_LITERAL: - /* match literal string */ - /* */ - TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern, - ctx->ptr, *ctx->pattern)); - if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0]) - RETURN_FAILURE; - ctx->pattern++; - ctx->ptr++; - break; + return TRUE; +} - case SRE_OP_NOT_LITERAL: - /* match anything that is not literal character */ - /* */ - TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern, - ctx->ptr, *ctx->pattern)); - if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0]) - RETURN_FAILURE; - ctx->pattern++; - ctx->ptr++; - break; +/* check if string matches the given pattern. returns <0 for + error, 0 for failure, and 1 for success */ +LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern_ptr) { + SRE_CONTEXT context; + unsigned int repeat_min; + unsigned int repeat_max; + unsigned int repeat_counter; + SRE_CHAR* repeat_start; + SRE_BACKTRACK_ITEM* top_nested; + unsigned int sigcount = 0; + int result; + + context.text_beginning = (SRE_CHAR *)state->beginning; + context.text_ptr = state->ptr; + context.text_start = (SRE_CHAR *)state->start; + context.text_end = (SRE_CHAR *)state->end; + context.pattern_ptr = pattern_ptr; + context.marks = (SRE_CHAR**)state->mark; + context.marks_size = (state->numbered_mark_count + state->named_mark_count) * sizeof(context.marks[0]); + context.backtrack_chunk = state->backtrack_chunk; + + // Point to the final newline if it's the final character. + context.final_linebreak = context.text_beginning < context.text_end && + state->encoding->in_category(SRE_CAT_LineBreak, context.text_end[-1]) ? 
context.text_end - 1 : NULL; + + TRACE(("|%p|%p|ENTER\n", context.pattern_ptr, context.text_ptr)); + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_FAILURE, 0); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + top_nested = context.backtrack_item; - case SRE_OP_SUCCESS: - /* end of pattern */ - TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr)); - state->ptr = ctx->ptr; - RETURN_SUCCESS; - - case SRE_OP_AT: - /* match at given position */ - /* */ - TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern)); - if (!SRE_AT(state, ctx->ptr, *ctx->pattern)) - RETURN_FAILURE; - ctx->pattern++; - break; + memset(context.marks, 0, context.marks_size); - case SRE_OP_CATEGORY: - /* match at given category */ - /* */ - TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern, - ctx->ptr, *ctx->pattern)); - if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0])) - RETURN_FAILURE; - ctx->pattern++; - ctx->ptr++; - break; +advance: + for (;;) { + ++sigcount; + if ((0 == (sigcount & 0xFFF)) && PyErr_CheckSignals()) + return SRE_CLEANUP(&context, state, SRE_ERROR_INTERRUPTED); + switch (context.pattern_ptr[0]) { case SRE_OP_ANY: - /* match anything (except a newline) */ - /* */ - TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr)); - if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0])) - RETURN_FAILURE; - ctx->ptr++; + // Any character except a newline. 
+ // + TRACE(("|%p|%p|ANY\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr >= context.text_end || state->encoding->in_category(SRE_CAT_LineBreak, context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr++; break; - case SRE_OP_ANY_ALL: - /* match anything */ - /* */ - TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr)); - if (ctx->ptr >= end) - RETURN_FAILURE; - ctx->ptr++; - break; - - case SRE_OP_IN: - /* match set member (or non_member) */ - /* */ - TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr)); - if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr)) - RETURN_FAILURE; - ctx->pattern += ctx->pattern[0]; - ctx->ptr++; - break; - - case SRE_OP_LITERAL_IGNORE: - TRACE(("|%p|%p|LITERAL_IGNORE %d\n", - ctx->pattern, ctx->ptr, ctx->pattern[0])); - if (ctx->ptr >= end || - state->lower(*ctx->ptr) != state->lower(*ctx->pattern)) - RETURN_FAILURE; - ctx->pattern++; - ctx->ptr++; + // Any character. + // + TRACE(("|%p|%p|ANY_ALL\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr >= context.text_end) + goto backtrack; + context.text_ptr++; + context.pattern_ptr++; + break; + case SRE_OP_ANY_ALL_REV: + // Any character. + // + TRACE(("|%p|%p|ANY_ALL_REV\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr <= context.text_start) + goto backtrack; + context.text_ptr--; + context.pattern_ptr++; + break; + case SRE_OP_ANY_REV: + // Any character except a newline. 
+ // + TRACE(("|%p|%p|ANY_REV\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr <= context.text_start || state->encoding->in_category(SRE_CAT_LineBreak, context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr++; break; - - case SRE_OP_NOT_LITERAL_IGNORE: - TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", - ctx->pattern, ctx->ptr, *ctx->pattern)); - if (ctx->ptr >= end || - state->lower(*ctx->ptr) == state->lower(*ctx->pattern)) - RETURN_FAILURE; - ctx->pattern++; - ctx->ptr++; - break; - - case SRE_OP_IN_IGNORE: - TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr)); - if (ctx->ptr >= end - || !SRE_CHARSET(ctx->pattern+1, - (SRE_CODE)state->lower(*ctx->ptr))) - RETURN_FAILURE; - ctx->pattern += ctx->pattern[0]; - ctx->ptr++; + case SRE_OP_ASSERT: + // Assert subpattern. + // ... + TRACE(("|%p|%p|ASSERT\n", context.pattern_ptr, context.text_ptr)); + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_ASSERT, TRUE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + // If the subpattern succeeds then we'll discard the enclosed backtrack info, + // including any marks, so we need to save the marks here. + memmove(context.backtrack_item->marks, context.marks, context.marks_size); + context.backtrack_item->assert.text_start = context.text_start; + context.backtrack_item->assert.text_ptr = context.text_ptr; + // The assert can look at the text before the start position, if any. + context.text_start = state->beginning; + context.pattern_ptr += 2; break; - - case SRE_OP_JUMP: - case SRE_OP_INFO: - /* jump forward */ - /* */ - TRACE(("|%p|%p|JUMP %d\n", ctx->pattern, - ctx->ptr, ctx->pattern[0])); - ctx->pattern += ctx->pattern[0]; + case SRE_OP_ASSERT_NOT: + // Assert not subpattern. + // ... 
+ TRACE(("|%p|%p|ASSERT_NOT\n", context.pattern_ptr, context.text_ptr)); + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_ASSERT_NOT, TRUE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + // If the subpattern succeeds then we'll discard the enclosed backtrack info, + // including any marks, so we need to save the marks here. + memmove(context.backtrack_item->marks, context.marks, context.marks_size); + context.backtrack_item->assert.text_start = context.text_start; + context.backtrack_item->assert.text_ptr = context.text_ptr; + context.backtrack_item->assert.pattern_ptr = context.pattern_ptr; + // The assert can look at the text before the start position, if any. + context.text_start = state->beginning; + context.pattern_ptr += 2; + break; + case SRE_OP_ATOMIC: + // Atomic subpattern. + // ... + TRACE(("|%p|%p|ATOMIC\n", context.pattern_ptr, context.text_ptr)); + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_ATOMIC, TRUE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + // If the subpattern succeeds then we'll discard the enclosed backtrack info, + // including any marks, so we need to save the marks here. + memmove(context.backtrack_item->marks, context.marks, context.marks_size); + context.pattern_ptr++; + break; + case SRE_OP_BOUNDARY: + // Boundary between word and non-word. + // + TRACE(("|%p|%p|BOUNDARY\n", context.pattern_ptr, context.text_ptr)); + if (!SRE_AT_BOUNDARY(&context, state)) + goto backtrack; + context.pattern_ptr++; break; - case SRE_OP_BRANCH: - /* alternation */ - /* <0=skip> code ... 
*/ - TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr)); - LASTMARK_SAVE(); - ctx->u.rep = state->repeat; - if (ctx->u.rep) - MARK_PUSH(ctx->lastmark); - for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) { - if (ctx->pattern[1] == SRE_OP_LITERAL && - (ctx->ptr >= end || - (SRE_CODE) *ctx->ptr != ctx->pattern[2])) - continue; - if (ctx->pattern[1] == SRE_OP_IN && - (ctx->ptr >= end || - !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr))) - continue; - state->ptr = ctx->ptr; - DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1); - if (ret) { - if (ctx->u.rep) - MARK_POP_DISCARD(ctx->lastmark); - RETURN_ON_ERROR(ret); - RETURN_SUCCESS; - } - if (ctx->u.rep) - MARK_POP_KEEP(ctx->lastmark); - LASTMARK_RESTORE(); - } - if (ctx->u.rep) - MARK_POP_DISCARD(ctx->lastmark); - RETURN_FAILURE; - - case SRE_OP_REPEAT_ONE: - /* match repeated sequence (maximizing regexp) */ - - /* this operator only works if the repeated item is - exactly one character wide, and we're not already - collecting backtracking points. for other cases, - use the MAX_REPEAT operator */ - - /* <1=min> <2=max> item tail */ - - TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, - ctx->pattern[1], ctx->pattern[2])); - - if (ctx->ptr + ctx->pattern[1] > end) - RETURN_FAILURE; /* cannot match */ - - state->ptr = ctx->ptr; - - ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]); - RETURN_ON_ERROR(ret); - DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos); - ctx->count = ret; - ctx->ptr += ctx->count; - - /* when we arrive here, count contains the number of - matches, and ctx->ptr points to the tail of the target - string. check if the rest of the pattern matches, - and backtrack if not. */ - - if (ctx->count < (Py_ssize_t) ctx->pattern[1]) - RETURN_FAILURE; - - if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) { - /* tail is empty. 
we're finished */ - state->ptr = ctx->ptr; - RETURN_SUCCESS; - } - - LASTMARK_SAVE(); - - if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) { - /* tail starts with a literal. skip positions where - the rest of the pattern cannot possibly match */ - ctx->u.chr = ctx->pattern[ctx->pattern[0]+1]; - for (;;) { - while (ctx->count >= (Py_ssize_t) ctx->pattern[1] && - (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) { - ctx->ptr--; - ctx->count--; - } - if (ctx->count < (Py_ssize_t) ctx->pattern[1]) - break; - state->ptr = ctx->ptr; - DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1, - ctx->pattern+ctx->pattern[0]); - if (ret) { - RETURN_ON_ERROR(ret); - RETURN_SUCCESS; - } - - LASTMARK_RESTORE(); - - ctx->ptr--; - ctx->count--; + { + // Alternation. + // ... ... 0 + SRE_CODE* skip_ptr = context.pattern_ptr + 1; + TRACE(("|%p|%p|BRANCH\n", context.pattern_ptr, context.text_ptr)); + // Look ahead in the branch to avoid unnecessary backtracking. + while (! SRE_POSSIBLE_MATCH_AHEAD(&context, state, skip_ptr + 1)) { + skip_ptr += skip_ptr[0]; + // Is there another branch? + if (skip_ptr[0] == 0) + goto backtrack; + } + // Try this branch. + context.pattern_ptr = skip_ptr + 1; + // Save the next branch, if present. + skip_ptr += skip_ptr[0]; + if (skip_ptr[0] != 0) { + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_BRANCH, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + context.backtrack_item->branch.text_ptr = context.text_ptr; + context.backtrack_item->branch.pattern_ptr = skip_ptr; + } + break; + } + case SRE_OP_CATEGORY: + // Character in category. + // + TRACE(("|%p|%p|CATEGORY 0x%X\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + if (context.text_ptr >= context.text_end || !state->encoding->in_category(context.pattern_ptr[1], context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 2; + break; + case SRE_OP_CATEGORY_REV: + // Character in category. 
+ // + TRACE(("|%p|%p|CATEGORY_REV 0x%X\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + if (context.text_ptr <= context.text_start || !state->encoding->in_category(context.pattern_ptr[1], context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 2; + break; + case SRE_OP_CHARSET: + // Character in set. + // + TRACE(("|%p|%p|CHARSET\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr >= context.text_end || !in_charset(context.pattern_ptr + 2, context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 1 + context.pattern_ptr[1]; + break; + case SRE_OP_CHARSET_IGNORE: + // Character in set, ignoring case. + // + TRACE(("|%p|%p|CHARSET_IGNORE\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr >= context.text_end || !in_charset_ignore(state, context.pattern_ptr + 2, context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 1 + context.pattern_ptr[1]; + break; + case SRE_OP_CHARSET_IGNORE_REV: + // Character in set, ignoring case. + // + TRACE(("|%p|%p|CHARSET_IGNORE_REV\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr <= context.text_start || !in_charset_ignore(state, context.pattern_ptr + 2, context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 1 + context.pattern_ptr[1]; + break; + case SRE_OP_CHARSET_REV: + // Character in set. + // + TRACE(("|%p|%p|CHARSET_REV\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr <= context.text_start || !in_charset(context.pattern_ptr + 2, context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 1 + context.pattern_ptr[1]; + break; + case SRE_OP_END_ASSERT: + // Assert subpattern. + // ... + TRACE(("|%p|%p|END_ASSERT\n", context.pattern_ptr, context.text_ptr)); + // Discard all backtrack info in the assertion. + SRE_DISCARD_UNTIL(&context, SRE_OP_ASSERT); + // Restore the marks. 
+ memmove(context.marks, context.backtrack_item->marks, context.marks_size); + context.text_start = context.backtrack_item->assert.text_start; + context.text_ptr = context.backtrack_item->assert.text_ptr; + SRE_DISCARD_BACKTRACK(&context); + context.pattern_ptr++; + break; + case SRE_OP_END_ASSERT_NOT: + // Assert not subpattern. + // ... + TRACE(("|%p|%p|END_ASSERT_NOT\n", context.pattern_ptr, context.text_ptr)); + // Discard all backtrack info in the assertion. + SRE_DISCARD_UNTIL(&context, SRE_OP_ASSERT_NOT); + // Restore the marks. + memmove(context.marks, context.backtrack_item->marks, context.marks_size); + context.text_start = context.backtrack_item->assert.text_start; + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + case SRE_OP_END_ATOMIC: + // Atomic subpattern. + // ... + TRACE(("|%p|%p|END_ATOMIC\n", context.pattern_ptr, context.text_ptr)); + // Discard all backtrack info in the atomic group. + SRE_DISCARD_UNTIL(&context, SRE_OP_ATOMIC); + // Modify the backtrack info so that the marks will be restored if the tail + // of the pattern fails. + context.backtrack_item->op = SRE_OP_END_ATOMIC; + context.pattern_ptr++; + break; + case SRE_OP_END_OF_LINE: + // End of line. + // + TRACE(("|%p|%p|END_OF_LINE\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr < context.text_end && !state->encoding->in_category(SRE_CAT_LineBreak, context.text_ptr[0])) + goto backtrack; + context.pattern_ptr++; + break; + case SRE_OP_END_OF_STRING: + // End of string. + // + TRACE(("|%p|%p|END_OF_STRING\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr < context.text_end) + goto backtrack; + context.pattern_ptr++; + break; + case SRE_OP_END_OF_STRING_LN: + // End of string or final line. 
+ // + TRACE(("|%p|%p|END_OF_STRING_LN\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr < context.text_end && context.text_ptr != context.final_linebreak) + goto backtrack; + context.pattern_ptr++; + break; + case SRE_OP_END_REPEAT_MAX: + { + // End of greedy repeat. + // ... + SRE_CODE* end_repeat_ptr = context.pattern_ptr; + SRE_CODE* repeat_ptr = end_repeat_ptr - end_repeat_ptr[1]; + SRE_CODE* body = repeat_ptr + 4; + SRE_CODE* tail = end_repeat_ptr + 2; + unsigned int available = context.text_end - context.text_ptr; + BOOL try_again; + BOOL try_tail; + TRACE(("|%p|%p|END_REPEAT_MAX\n", context.pattern_ptr, context.text_ptr)); + // At this point the repeat info refers to the inner repeat. + ++repeat_counter; + repeat_max = sre_min(repeat_max, repeat_counter + available); + try_again = repeat_counter < repeat_max && context.text_ptr != repeat_start; + try_tail = repeat_counter >= repeat_min && SRE_POSSIBLE_MATCH_AHEAD(&context, state, tail); + if (try_again) { + if (try_tail) { + // Save this position for possible match of the tail. + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_MAX, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + // The backtrack info must refer to the outer repeat. + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.top_nested = top_nested; // top_nested currently refers to the outer repeat. + context.backtrack_item->repeat.pattern_ptr = tail; } - + repeat_start = context.text_ptr; + context.pattern_ptr = body; } else { - /* general case */ - while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) { - state->ptr = ctx->ptr; - DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2, - ctx->pattern+ctx->pattern[0]); - if (ret) { - RETURN_ON_ERROR(ret); - RETURN_SUCCESS; - } - ctx->ptr--; - ctx->count--; - LASTMARK_RESTORE(); + if (try_tail) { + // Restore the repeat info for the outer repeat. 
+ repeat_min = top_nested->repeat.repeat_min; + repeat_max = top_nested->repeat.repeat_max; + repeat_counter = top_nested->repeat.repeat_counter; + repeat_start = top_nested->repeat.repeat_start; + top_nested = top_nested->repeat.top_nested; + context.pattern_ptr = tail; + } else + goto backtrack; + } + break; + } + case SRE_OP_END_REPEAT_MAX_REV: + { + // End of greedy repeat. + // ... + SRE_CODE* end_repeat_ptr = context.pattern_ptr; + SRE_CODE* repeat_ptr = end_repeat_ptr - end_repeat_ptr[1]; + SRE_CODE* body = repeat_ptr + 4; + SRE_CODE* tail = end_repeat_ptr + 2; + unsigned int available = context.text_ptr - context.text_start; + BOOL try_again; + BOOL try_tail; + TRACE(("|%p|%p|END_REPEAT_MAX_REV\n", context.pattern_ptr, context.text_ptr)); + // At this point the repeat info refers to the inner repeat. + ++repeat_counter; + repeat_max = sre_min(repeat_max, repeat_counter + available); + try_again = repeat_counter < repeat_max && context.text_ptr != repeat_start; + try_tail = repeat_counter >= repeat_min && SRE_POSSIBLE_MATCH_AHEAD(&context, state, tail); + if (try_again) { + if (try_tail) { + // Save this position for possible match of the tail. + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_MAX_REV, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + // The backtrack info must refer to the outer repeat. + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.top_nested = top_nested; // top_nested currently refers to the outer repeat. + context.backtrack_item->repeat.pattern_ptr = tail; } + repeat_start = context.text_ptr; + context.pattern_ptr = body; + } else { + if (try_tail) { + // Restore the repeat info for the outer repeat. 
+ repeat_min = top_nested->repeat.repeat_min; + repeat_max = top_nested->repeat.repeat_max; + repeat_counter = top_nested->repeat.repeat_counter; + repeat_start = top_nested->repeat.repeat_start; + top_nested = top_nested->repeat.top_nested; + context.pattern_ptr = tail; + } else + goto backtrack; } - RETURN_FAILURE; - - case SRE_OP_MIN_REPEAT_ONE: - /* match repeated sequence (minimizing regexp) */ - - /* this operator only works if the repeated item is - exactly one character wide, and we're not already - collecting backtracking points. for other cases, - use the MIN_REPEAT operator */ - - /* <1=min> <2=max> item tail */ - - TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, - ctx->pattern[1], ctx->pattern[2])); - - if (ctx->ptr + ctx->pattern[1] > end) - RETURN_FAILURE; /* cannot match */ - - state->ptr = ctx->ptr; - - if (ctx->pattern[1] == 0) - ctx->count = 0; - else { - /* count using pattern min as the maximum */ - ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]); - RETURN_ON_ERROR(ret); - DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos); - if (ret < (Py_ssize_t) ctx->pattern[1]) - /* didn't match minimum number of times */ - RETURN_FAILURE; - /* advance past minimum matches of repeat */ - ctx->count = ret; - ctx->ptr += ctx->count; - } - - if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) { - /* tail is empty. we're finished */ - state->ptr = ctx->ptr; - RETURN_SUCCESS; - + break; + } + case SRE_OP_END_REPEAT_MIN: + { + // Lazy repeat. + // ... + SRE_CODE* end_repeat_ptr = context.pattern_ptr; + SRE_CODE* repeat_ptr = end_repeat_ptr - end_repeat_ptr[1]; + SRE_CODE* body = repeat_ptr + 4; + SRE_CODE* tail = end_repeat_ptr + 2; + unsigned int available = context.text_end - context.text_ptr; + BOOL try_again; + BOOL try_tail; + TRACE(("|%p|%p|END_REPEAT_MIN\n", context.pattern_ptr, context.text_ptr)); + // At this point the repeat info refers to the inner repeat. 
+ ++repeat_counter; + repeat_max = sre_min(repeat_max, repeat_counter + available); + try_again = repeat_counter < repeat_max && context.text_ptr != repeat_start; + try_tail = repeat_counter >= repeat_min && SRE_POSSIBLE_MATCH_AHEAD(&context, state, tail); + if (try_tail) { + if (try_again) { + // Need to save the repeat info for the inner repeat in case the tail fails. + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_MIN, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + // The backtrack info must refer to the outer repeat. + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.top_nested = top_nested; // top_nested currently refers to the outer repeat. + context.backtrack_item->repeat.repeat_min = repeat_min; + context.backtrack_item->repeat.repeat_max = repeat_max; + context.backtrack_item->repeat.repeat_counter = repeat_counter; + context.backtrack_item->repeat.repeat_start = repeat_start; + context.backtrack_item->repeat.pattern_ptr = body; + } + // Restore the repeat info for the outer repeat. 
+ repeat_min = top_nested->repeat.repeat_min; + repeat_max = top_nested->repeat.repeat_max; + repeat_counter = top_nested->repeat.repeat_counter; + repeat_start = top_nested->repeat.repeat_start; + top_nested = top_nested->repeat.top_nested; + context.pattern_ptr = tail; } else { - /* general case */ - LASTMARK_SAVE(); - while ((Py_ssize_t)ctx->pattern[2] == 65535 - || ctx->count <= (Py_ssize_t)ctx->pattern[2]) { - state->ptr = ctx->ptr; - DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one, - ctx->pattern+ctx->pattern[0]); - if (ret) { - RETURN_ON_ERROR(ret); - RETURN_SUCCESS; - } - state->ptr = ctx->ptr; - ret = SRE_COUNT(state, ctx->pattern+3, 1); - RETURN_ON_ERROR(ret); - DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos); - if (ret == 0) - break; - assert(ret == 1); - ctx->ptr++; - ctx->count++; - LASTMARK_RESTORE(); + if (try_again) + context.pattern_ptr = body; + else + goto backtrack; + } + break; + } + case SRE_OP_END_REPEAT_MIN_REV: + { + // Lazy repeat. + // ... + SRE_CODE* end_repeat_ptr = context.pattern_ptr; + SRE_CODE* repeat_ptr = end_repeat_ptr - end_repeat_ptr[1]; + SRE_CODE* body = repeat_ptr + 4; + SRE_CODE* tail = end_repeat_ptr + 2; + unsigned int available = context.text_ptr - context.text_start; + BOOL try_again; + BOOL try_tail; + TRACE(("|%p|%p|END_REPEAT_MIN_REV\n", context.pattern_ptr, context.text_ptr)); + // At this point the repeat info refers to the inner repeat. + ++repeat_counter; + repeat_max = sre_min(repeat_max, repeat_counter + available); + try_again = repeat_counter < repeat_max && context.text_ptr != repeat_start; + try_tail = repeat_counter >= repeat_min && SRE_POSSIBLE_MATCH_AHEAD(&context, state, tail); + if (try_tail) { + if (try_again) { + // Need to save the repeat info for the inner repeat in case the tail fails. + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_MIN_REV, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + // The backtrack info must refer to the outer repeat. 
+ context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.top_nested = top_nested; // top_nested currently refers to the outer repeat. + context.backtrack_item->repeat.repeat_min = repeat_min; + context.backtrack_item->repeat.repeat_max = repeat_max; + context.backtrack_item->repeat.repeat_counter = repeat_counter; + context.backtrack_item->repeat.repeat_start = repeat_start; + context.backtrack_item->repeat.pattern_ptr = body; } + // Restore the repeat info for the outer repeat. + repeat_min = top_nested->repeat.repeat_min; + repeat_max = top_nested->repeat.repeat_max; + repeat_counter = top_nested->repeat.repeat_counter; + repeat_start = top_nested->repeat.repeat_start; + top_nested = top_nested->repeat.top_nested; + context.pattern_ptr = tail; + } else { + if (try_again) + context.pattern_ptr = body; + else + goto backtrack; } - RETURN_FAILURE; - - case SRE_OP_REPEAT: - /* create repeat context. all the hard work is done - by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ - /* <1=min> <2=max> item tail */ - TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr, - ctx->pattern[1], ctx->pattern[2])); - - /* install new repeat context */ - ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep)); - if (!ctx->u.rep) { - PyErr_NoMemory(); - RETURN_FAILURE; - } - ctx->u.rep->count = -1; - ctx->u.rep->pattern = ctx->pattern; - ctx->u.rep->prev = state->repeat; - ctx->u.rep->last_ptr = NULL; - state->repeat = ctx->u.rep; - - state->ptr = ctx->ptr; - DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]); - state->repeat = ctx->u.rep->prev; - PyObject_FREE(ctx->u.rep); - - if (ret) { - RETURN_ON_ERROR(ret); - RETURN_SUCCESS; - } - RETURN_FAILURE; - - case SRE_OP_MAX_UNTIL: - /* maximizing repeat */ - /* <1=min> <2=max> item tail */ - - /* FIXME: we probably need to deal with zero-width - matches in here... 
*/ - - ctx->u.rep = state->repeat; - if (!ctx->u.rep) - RETURN_ERROR(SRE_ERROR_STATE); - - state->ptr = ctx->ptr; - - ctx->count = ctx->u.rep->count+1; - - TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern, - ctx->ptr, ctx->count)); - - if (ctx->count < ctx->u.rep->pattern[1]) { - /* not enough matches */ - ctx->u.rep->count = ctx->count; - DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1, - ctx->u.rep->pattern+3); - if (ret) { - RETURN_ON_ERROR(ret); - RETURN_SUCCESS; + break; + } + case SRE_OP_END_REPEAT_POSS: + { + // End of greedy repeat. + // ... + SRE_CODE* end_repeat_ptr = context.pattern_ptr; + SRE_CODE* repeat_ptr = end_repeat_ptr - end_repeat_ptr[1]; + SRE_CODE* body = repeat_ptr + 4; + SRE_CODE* tail = end_repeat_ptr + 2; + unsigned int available = context.text_end - context.text_ptr; + BOOL try_again; + BOOL try_tail; + TRACE(("|%p|%p|END_REPEAT_POSS\n", context.pattern_ptr, context.text_ptr)); + // Discard all backtrack info in the body of the possessive repeat. + SRE_DISCARD_UNTIL(&context, SRE_OP_REPEAT_POSS); + // At this point the repeat info refers to the inner repeat. + ++repeat_counter; + repeat_max = sre_min(repeat_max, repeat_counter + available); + try_again = repeat_counter < repeat_max && context.text_ptr != repeat_start; + try_tail = repeat_counter >= repeat_min && SRE_POSSIBLE_MATCH_AHEAD(&context, state, tail); + if (try_again) { + if (try_tail) { + // Save this position for possible match of the tail. + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_POSS, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + // The backtrack info must refer to the outer repeat. + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.top_nested = top_nested; // top_nested currently refers to the outer repeat. 
+ context.backtrack_item->repeat.pattern_ptr = tail; } - ctx->u.rep->count = ctx->count-1; - state->ptr = ctx->ptr; - RETURN_FAILURE; - } - - if ((ctx->count < ctx->u.rep->pattern[2] || - ctx->u.rep->pattern[2] == 65535) && - state->ptr != ctx->u.rep->last_ptr) { - /* we may have enough matches, but if we can - match another item, do so */ - ctx->u.rep->count = ctx->count; - LASTMARK_SAVE(); - MARK_PUSH(ctx->lastmark); - /* zero-width match protection */ - DATA_PUSH(&ctx->u.rep->last_ptr); - ctx->u.rep->last_ptr = state->ptr; - DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2, - ctx->u.rep->pattern+3); - DATA_POP(&ctx->u.rep->last_ptr); - if (ret) { - MARK_POP_DISCARD(ctx->lastmark); - RETURN_ON_ERROR(ret); - RETURN_SUCCESS; + repeat_start = context.text_ptr; + context.pattern_ptr = body; + } else { + if (try_tail) { + // Restore the repeat info for the outer repeat. + repeat_min = top_nested->repeat.repeat_min; + repeat_max = top_nested->repeat.repeat_max; + repeat_counter = top_nested->repeat.repeat_counter; + repeat_start = top_nested->repeat.repeat_start; + top_nested = top_nested->repeat.top_nested; + context.pattern_ptr = tail; + } else + goto backtrack; + } + break; + } + case SRE_OP_END_REPEAT_POSS_REV: + { + // End of greedy repeat. + // ... + SRE_CODE* end_repeat_ptr = context.pattern_ptr; + SRE_CODE* repeat_ptr = end_repeat_ptr - end_repeat_ptr[1]; + SRE_CODE* body = repeat_ptr + 4; + SRE_CODE* tail = end_repeat_ptr + 2; + unsigned int available = context.text_ptr - context.text_start; + BOOL try_again; + BOOL try_tail; + TRACE(("|%p|%p|END_REPEAT_POSS_REV\n", context.pattern_ptr, context.text_ptr)); + // Discard all backtrack info in the body of the possessive repeat. + SRE_DISCARD_UNTIL(&context, SRE_OP_REPEAT_POSS_REV); + // At this point the repeat info refers to the inner repeat. 
+ ++repeat_counter; + repeat_max = sre_min(repeat_max, repeat_counter + available); + try_again = repeat_counter < repeat_max && context.text_ptr != repeat_start; + try_tail = repeat_counter >= repeat_min && SRE_POSSIBLE_MATCH_AHEAD(&context, state, tail); + if (try_again) { + if (try_tail) { + // Save this position for possible match of the tail. + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_POSS_REV, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + // The backtrack info must refer to the outer repeat. + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.top_nested = top_nested; // top_nested currently refers to the outer repeat. + context.backtrack_item->repeat.pattern_ptr = tail; } - MARK_POP(ctx->lastmark); - LASTMARK_RESTORE(); - ctx->u.rep->count = ctx->count-1; - state->ptr = ctx->ptr; - } - - /* cannot match more repeated items here. make sure the - tail matches */ - state->repeat = ctx->u.rep->prev; - DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern); - RETURN_ON_SUCCESS(ret); - state->repeat = ctx->u.rep; - state->ptr = ctx->ptr; - RETURN_FAILURE; - - case SRE_OP_MIN_UNTIL: - /* minimizing repeat */ - /* <1=min> <2=max> item tail */ - - ctx->u.rep = state->repeat; - if (!ctx->u.rep) - RETURN_ERROR(SRE_ERROR_STATE); - - state->ptr = ctx->ptr; - - ctx->count = ctx->u.rep->count+1; - - TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern, - ctx->ptr, ctx->count, ctx->u.rep->pattern)); - - if (ctx->count < ctx->u.rep->pattern[1]) { - /* not enough matches */ - ctx->u.rep->count = ctx->count; - DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1, - ctx->u.rep->pattern+3); - if (ret) { - RETURN_ON_ERROR(ret); - RETURN_SUCCESS; + repeat_start = context.text_ptr; + context.pattern_ptr = body; + } else { + if (try_tail) { + // Restore the repeat info for the outer repeat. 
+ repeat_min = top_nested->repeat.repeat_min; + repeat_max = top_nested->repeat.repeat_max; + repeat_counter = top_nested->repeat.repeat_counter; + repeat_start = top_nested->repeat.repeat_start; + top_nested = top_nested->repeat.top_nested; + context.pattern_ptr = tail; + } else + goto backtrack; + } + break; + } + case SRE_OP_GROUPREF: + { + // Match capture group. + // + Py_ssize_t group; + SRE_CHAR* group_start; + SRE_CHAR* group_end; + Py_ssize_t length; + Py_ssize_t i; + TRACE(("|%p|%p|GROUPREF %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + group = context.pattern_ptr[1]; // Zero-based index. Note that externally group 0 is the entire matched string. + group_start = context.marks[group * 2]; + group_end = context.marks[group * 2 + 1]; + if (group_start == NULL || group_start > group_end) + goto backtrack; + length = group_end - group_start; + if (length > context.text_end - context.text_ptr) + goto backtrack; + i = 0; + while (i < length) { + if (context.text_ptr[i] != group_start[i]) + goto backtrack; + i++; + } + context.text_ptr += length; + context.pattern_ptr += 2; + break; + } + case SRE_OP_GROUPREF_EXISTS: + { + // Whether capture group exists. + // code_yes code_no + Py_ssize_t group; + SRE_CHAR* group_start; + SRE_CHAR* group_end; + TRACE(("|%p|%p|GROUPREF_EXISTS %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + group = context.pattern_ptr[1]; // Zero-based index. Note that externally group 0 is the entire matched string. + group_start = context.marks[group * 2]; + group_end = context.marks[group * 2 + 1]; + if (group_start == NULL || group_start > group_end) + context.pattern_ptr += 1 + context.pattern_ptr[2]; + else + context.pattern_ptr += 3; + break; + } + case SRE_OP_GROUPREF_IGNORE: + { + // Match capture group, ignoring case. 
+ // + Py_ssize_t group; + SRE_CHAR* group_start; + SRE_CHAR* group_end; + Py_ssize_t length; + Py_ssize_t i; + TRACE(("|%p|%p|GROUPREF_IGNORE %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + group = context.pattern_ptr[1]; // Zero-based index. Note that externally group 0 is the entire matched string. + group_start = context.marks[group * 2]; + group_end = context.marks[group * 2 + 1]; + if (group_start == NULL || group_start > group_end) + goto backtrack; + length = group_end - group_start; + if (length > context.text_end - context.text_ptr) + goto backtrack; + i = 0; + while (i < length) { + if (!same_char_ignore(state, context.text_ptr[i], group_start[i])) + goto backtrack; + i++; + } + context.text_ptr += length; + context.pattern_ptr += 2; + break; + } + case SRE_OP_GROUPREF_IGNORE_REV: + { + // Match capture group, ignoring case. + // + Py_ssize_t group; + SRE_CHAR* group_start; + SRE_CHAR* group_end; + Py_ssize_t length; + Py_ssize_t i; + TRACE(("|%p|%p|GROUPREF_IGNORE_REV %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + group = context.pattern_ptr[1]; // Zero-based index. Note that externally group 0 is the entire matched string. + group_start = context.marks[group * 2]; + group_end = context.marks[group * 2 + 1]; + if (group_start == NULL || group_start > group_end) + goto backtrack; + length = group_end - group_start; + if (length > context.text_ptr - context.text_start) + goto backtrack; + context.text_ptr -= length; + i = 0; + while (i < length) { + if (!same_char_ignore(state, context.text_ptr[i], group_start[i])) + goto backtrack; + i++; + } + context.pattern_ptr += 2; + break; + } + case SRE_OP_GROUPREF_REV: + { + // Match capture group. + // + Py_ssize_t group; + SRE_CHAR* group_start; + SRE_CHAR* group_end; + Py_ssize_t length; + Py_ssize_t i; + TRACE(("|%p|%p|GROUPREF_REV %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + group = context.pattern_ptr[1]; // Zero-based index. 
Note that externally group 0 is the entire matched string. + group_start = context.marks[group * 2]; + group_end = context.marks[group * 2 + 1]; + if (group_start == NULL || group_start > group_end) + goto backtrack; + length = group_end - group_start; + if (length > context.text_ptr - context.text_start) + goto backtrack; + context.text_ptr -= length; + i = 0; + while (i < length) { + if (context.text_ptr[i] != group_start[i]) + goto backtrack; + i++; + } + context.pattern_ptr += 2; + break; + } + case SRE_OP_JUMP: + // Jump forward. + // + TRACE(("|%p|%p|JUMP %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + context.pattern_ptr += 1 + context.pattern_ptr[1]; + break; + case SRE_OP_LITERAL: + // Character is a literal. + // + TRACE(("|%p|%p|LITERAL %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + if (context.text_ptr >= context.text_end || context.text_ptr[0] != (SRE_CHAR)context.pattern_ptr[1]) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 2; + break; + case SRE_OP_LITERAL_IGNORE: + // Character is a literal, ignoring case. + // + TRACE(("|%p|%p|LITERAL_IGNORE %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + if (context.text_ptr >= context.text_end || !same_char_ignore(state, context.text_ptr[0], context.pattern_ptr[1])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 2; + break; + case SRE_OP_LITERAL_IGNORE_REV: + // Character is a literal, ignoring case. + // + TRACE(("|%p|%p|LITERAL_IGNORE_REV %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + if (context.text_ptr <= context.text_start || !same_char_ignore(state, context.text_ptr[-1], context.pattern_ptr[1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 2; + break; + case SRE_OP_LITERAL_REV: + // Character is a literal. 
+ // + TRACE(("|%p|%p|LITERAL_REV %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + if (context.text_ptr <= context.text_start || context.text_ptr[-1] != (SRE_CHAR)context.pattern_ptr[1]) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 2; + break; + case SRE_OP_LITERAL_STRING: + { + // Literal string. + // ... + Py_ssize_t length = context.pattern_ptr[1]; + SRE_CODE* literal = context.pattern_ptr + 2; + Py_ssize_t i; + TRACE(("|%p|%p|LITERAL_STRING %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + if (length > context.text_end - context.text_ptr) + goto backtrack; + i = 0; + do { + if (context.text_ptr[i] != (SRE_CHAR)literal[i]) + goto backtrack; + i++; + } + while (i < length); + context.text_ptr += length; + context.pattern_ptr = literal + length; + break; + } + case SRE_OP_LITERAL_STRING_IGNORE: + { + // Literal string, ignoring case. + // ... + Py_ssize_t length = context.pattern_ptr[1]; + SRE_CODE* literal = context.pattern_ptr + 2; + Py_ssize_t i; + TRACE(("|%p|%p|LITERAL_STRING_IGNORE %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + if (length > context.text_end - context.text_ptr) + goto backtrack; + i = 0; + do { + if (!same_char_ignore(state, context.text_ptr[i], literal[i])) + goto backtrack; + i++; + } + while (i < length); + context.text_ptr += length; + context.pattern_ptr = literal + length; + break; + } + case SRE_OP_LITERAL_STRING_IGNORE_REV: + { + // Literal string, ignoring case. + // ... 
+ Py_ssize_t length = context.pattern_ptr[1]; + SRE_CODE* literal = context.pattern_ptr + 2; + Py_ssize_t i; + TRACE(("|%p|%p|LITERAL_STRING_IGNORE_REV %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + if (length > context.text_ptr - context.text_start) + goto backtrack; + context.text_ptr -= length; + i = 0; + do { + if (!same_char_ignore(state, context.text_ptr[i], literal[i])) + goto backtrack; + i++; + } + while (i < length); + context.pattern_ptr = literal + length; + break; + } + case SRE_OP_LITERAL_STRING_REV: + { + // Literal string. + // ... + Py_ssize_t length = context.pattern_ptr[1]; + SRE_CODE* literal = context.pattern_ptr + 2; + Py_ssize_t i; + TRACE(("|%p|%p|LITERAL_STRING_REV %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + if (length > context.text_ptr - context.text_start) + goto backtrack; + context.text_ptr -= length; + i = 0; + do { + if (context.text_ptr[i] != (SRE_CHAR)literal[i]) + goto backtrack; + i++; + } + while (i < length); + context.pattern_ptr = literal + length; + break; + } + case SRE_OP_MARK: + { + // Set mark. + // + int numbered_index = context.pattern_ptr[1]; + int named_index = context.pattern_ptr[2]; + TRACE(("|%p|%p|MARK %u %u\n", context.pattern_ptr, context.text_ptr, numbered_index, named_index)); + // Save the current marks. 
+ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_MARK, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + context.backtrack_item->mark.numbered_index = numbered_index; + context.backtrack_item->mark.numbered_mark_ptr = context.marks[numbered_index]; + context.marks[numbered_index] = context.text_ptr; + DEBUG_TRACE(("saving mark %u as 0x%p\n", context.backtrack_item->mark.numbered_index, context.backtrack_item->mark.numbered_mark_ptr)); + context.backtrack_item->mark.named_index = named_index; + context.backtrack_item->mark.named_mark_ptr = context.marks[named_index]; + context.marks[named_index] = context.text_ptr; + DEBUG_TRACE(("saving mark %u as 0x%p\n", context.backtrack_item->mark.named_index, context.backtrack_item->mark.named_mark_ptr)); + context.pattern_ptr += 3; + break; + } + case SRE_OP_NOT_BOUNDARY: + // Not boundary between word and non-word. + // + TRACE(("|%p|%p|NOT_BOUNDARY\n", context.pattern_ptr, context.text_ptr)); + if (SRE_AT_BOUNDARY(&context, state)) + goto backtrack; + context.pattern_ptr++; + break; + case SRE_OP_NOT_CATEGORY: + // Character not in category. + // + TRACE(("|%p|%p|NOT_CATEGORY 0x%X\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + if (context.text_ptr >= context.text_end || state->encoding->in_category(context.pattern_ptr[1], context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 2; + break; + case SRE_OP_NOT_CATEGORY_REV: + // Character not in category, matching backwards: tests the character before text_ptr and fails at text_start, where no preceding character exists. + // + TRACE(("|%p|%p|NOT_CATEGORY_REV 0x%X\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + if (context.text_ptr <= context.text_start || state->encoding->in_category(context.pattern_ptr[1], context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 2; + break; + case SRE_OP_NOT_CHARSET: + // Character not in set.
+ // + TRACE(("|%p|%p|NOT_CHARSET\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr >= context.text_end || in_charset(context.pattern_ptr + 2, context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 1 + context.pattern_ptr[1]; + break; + case SRE_OP_NOT_CHARSET_IGNORE: + // Character not in set, ignoring case. + // + TRACE(("|%p|%p|NOT_CHARSET_IGNORE\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr >= context.text_end || in_charset_ignore(state, context.pattern_ptr + 2, context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 1 + context.pattern_ptr[1]; + break; + case SRE_OP_NOT_CHARSET_IGNORE_REV: + // Character not in set, ignoring case. + // + TRACE(("|%p|%p|NOT_CHARSET_IGNORE_REV\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr <= context.text_start || in_charset_ignore(state, context.pattern_ptr + 2, context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 1 + context.pattern_ptr[1]; + break; + case SRE_OP_NOT_CHARSET_REV: + // Character not in set. + // + TRACE(("|%p|%p|NOT_CHARSET_REV\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr <= context.text_start || in_charset(context.pattern_ptr + 2, context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 1 + context.pattern_ptr[1]; + break; + case SRE_OP_NOT_LITERAL: + // Character is not a literal. + // + TRACE(("|%p|%p|NOT_LITERAL %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + if (context.text_ptr >= context.text_end || context.text_ptr[0] == (SRE_CHAR)context.pattern_ptr[1]) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 2; + break; + case SRE_OP_NOT_LITERAL_IGNORE: + // Character is not a literal, ignoring case. 
+ // + TRACE(("|%p|%p|NOT_LITERAL_IGNORE %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + if (context.text_ptr >= context.text_end || same_char_ignore(state, context.text_ptr[0], context.pattern_ptr[1])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 2; + break; + case SRE_OP_NOT_LITERAL_IGNORE_REV: + // Character is not a literal, ignoring case. + // + TRACE(("|%p|%p|NOT_LITERAL_IGNORE_REV %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + if (context.text_ptr <= context.text_start || same_char_ignore(state, context.text_ptr[-1], context.pattern_ptr[1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 2; + break; + case SRE_OP_NOT_LITERAL_REV: + // Character is not a literal. + // + TRACE(("|%p|%p|NOT_LITERAL_REV %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1])); + if (context.text_ptr <= context.text_start || context.text_ptr[-1] == (SRE_CHAR)context.pattern_ptr[1]) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 2; + break; + case SRE_OP_NOT_RANGE: + // Character not in range. + // + TRACE(("|%p|%p|NOT_RANGE %u %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1], context.pattern_ptr[2])); + if (context.text_ptr >= context.text_end || in_range(context.text_ptr[0], context.pattern_ptr[1], context.pattern_ptr[2])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 3; + break; + case SRE_OP_NOT_RANGE_IGNORE: + // Character not in range, ignoring case. + // + TRACE(("|%p|%p|NOT_RANGE_IGNORE %u %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1], context.pattern_ptr[2])); + if (context.text_ptr >= context.text_end || in_range_ignore(state, context.text_ptr[0], context.pattern_ptr[1], context.pattern_ptr[2])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 3; + break; + case SRE_OP_NOT_RANGE_IGNORE_REV: + // Character not in range, ignoring case. 
+ // + TRACE(("|%p|%p|NOT_RANGE_IGNORE_REV %u %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1], context.pattern_ptr[2])); + if (context.text_ptr <= context.text_start || in_range_ignore(state, context.text_ptr[-1], context.pattern_ptr[1], context.pattern_ptr[2])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 3; + break; + case SRE_OP_NOT_RANGE_REV: + // Character not in range. + // + TRACE(("|%p|%p|NOT_RANGE_REV %u %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1], context.pattern_ptr[2])); + if (context.text_ptr <= context.text_start || in_range(context.text_ptr[-1], context.pattern_ptr[1], context.pattern_ptr[2])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 3; + break; + case SRE_OP_NOT_SET: + // Character not in set. + // + TRACE(("|%p|%p|NOT_SET\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr >= context.text_end || in_set(state, context.pattern_ptr + 1, context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr = context.pattern_ptr + 1 + context.pattern_ptr[1]; + break; + case SRE_OP_NOT_SET_IGNORE: + // Character not in set, ignoring case. + // + TRACE(("|%p|%p|NOT_SET_IGNORE\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr >= context.text_end || in_set_ignore(state, context.pattern_ptr + 1, context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr = context.pattern_ptr + 1 + context.pattern_ptr[1]; + break; + case SRE_OP_NOT_SET_IGNORE_REV: + // Character not in set, ignoring case. + // + TRACE(("|%p|%p|NOT_SET_IGNORE_REV\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr <= context.text_start || in_set_ignore(state, context.pattern_ptr + 1, context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr = context.pattern_ptr + 1 + context.pattern_ptr[1]; + break; + case SRE_OP_NOT_SET_REV: + // Character not in set. 
+ // + TRACE(("|%p|%p|NOT_SET_REV\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr <= context.text_start || in_set(state, context.pattern_ptr + 1, context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr = context.pattern_ptr + 1 + context.pattern_ptr[1]; + break; + case SRE_OP_RANGE: + // Character in range. + // + TRACE(("|%p|%p|RANGE %u %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1], context.pattern_ptr[2])); + if (context.text_ptr >= context.text_end || !in_range(context.text_ptr[0], context.pattern_ptr[1], context.pattern_ptr[2])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 3; + break; + case SRE_OP_RANGE_IGNORE: + // Character in range, ignoring case. + // + TRACE(("|%p|%p|RANGE_IGNORE %u %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1], context.pattern_ptr[2])); + if (context.text_ptr >= context.text_end || !in_range_ignore(state, context.text_ptr[0], context.pattern_ptr[1], context.pattern_ptr[2])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 3; + break; + case SRE_OP_RANGE_IGNORE_REV: + // Character in range, ignoring case. + // + TRACE(("|%p|%p|RANGE_IGNORE_REV %u %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1], context.pattern_ptr[2])); + if (context.text_ptr <= context.text_start || !in_range_ignore(state, context.text_ptr[-1], context.pattern_ptr[1], context.pattern_ptr[2])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 3; + break; + case SRE_OP_RANGE_REV: + // Character in range. + // + TRACE(("|%p|%p|RANGE_REV %u %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[1], context.pattern_ptr[2])); + if (context.text_ptr <= context.text_start || !in_range(context.text_ptr[-1], context.pattern_ptr[1], context.pattern_ptr[2])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 3; + break; + case SRE_OP_SET: + // Character in set. 
+ // + TRACE(("|%p|%p|SET\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr >= context.text_end || !in_set(state, context.pattern_ptr + 1, context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr = context.pattern_ptr + 1 + context.pattern_ptr[1]; + break; + case SRE_OP_SET_IGNORE: + // Character in set, ignoring case. + // + TRACE(("|%p|%p|SET_IGNORE\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr >= context.text_end || !in_set_ignore(state, context.pattern_ptr + 1, context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr = context.pattern_ptr + 1 + context.pattern_ptr[1]; + break; + case SRE_OP_SET_IGNORE_REV: + // Character in set, ignoring case. + // + TRACE(("|%p|%p|SET_IGNORE_REV\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr <= context.text_start || !in_set_ignore(state, context.pattern_ptr + 1, context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr = context.pattern_ptr + 1 + context.pattern_ptr[1]; + break; + case SRE_OP_SET_REV: + // Character in set. + // + TRACE(("|%p|%p|SET_REV\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr <= context.text_start || !in_set(state, context.pattern_ptr + 1, context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr = context.pattern_ptr + 1 + context.pattern_ptr[1]; + break; + case SRE_OP_REPEAT_MAX: + { + // Greedy repeat. + // ... + SRE_CODE* repeat_ptr = context.pattern_ptr; + SRE_CODE* end_repeat_ptr = repeat_ptr + repeat_ptr[1]; + SRE_CODE* body = repeat_ptr + 4; + SRE_CODE* tail = end_repeat_ptr + 2; + unsigned int available = context.text_end - context.text_ptr; + BOOL try_again; + BOOL try_tail; + TRACE(("|%p|%p|REPEAT_MAX %u %u\n", context.pattern_ptr, context.text_ptr, repeat_ptr[2], repeat_ptr[3])); + // Are there enough characters available for the repeat? (We're assuming at least one per iteration, up to the minimum.) 
+ if (repeat_ptr[2] > available) + goto backtrack; + // At this point the repeat info refers to the outer repeat, so save it. + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_REPEAT_MAX, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + context.backtrack_item->repeat.top_nested = top_nested; // top_nested currently refers to the outer repeat. + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.repeat_min = repeat_min; + context.backtrack_item->repeat.repeat_max = repeat_max; + context.backtrack_item->repeat.repeat_counter = repeat_counter; + context.backtrack_item->repeat.repeat_start = repeat_start; + // Initialise the repeat info for the inner repeat. + top_nested = context.backtrack_item; + repeat_min = repeat_ptr[2]; + repeat_max = repeat_ptr[3] == SRE_UNLIMITED_REPEATS ? available : repeat_ptr[3]; + repeat_counter = 0; + repeat_start = context.text_ptr; + try_again = available > 0; + try_tail = repeat_min == 0 && SRE_POSSIBLE_MATCH_AHEAD(&context, state, tail); + if (try_again) { + if (try_tail) { + // Save this position for possible match of the tail. + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_MAX, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.top_nested = top_nested; // top_nested currently refers to the outer repeat. + context.backtrack_item->repeat.pattern_ptr = tail; } - ctx->u.rep->count = ctx->count-1; - state->ptr = ctx->ptr; - RETURN_FAILURE; + context.pattern_ptr = body; + } else { + if (try_tail) { + // Restore the repeat info for the outer repeat. 
+ repeat_min = top_nested->repeat.repeat_min; + repeat_max = top_nested->repeat.repeat_max; + repeat_counter = top_nested->repeat.repeat_counter; + repeat_start = top_nested->repeat.repeat_start; + top_nested = top_nested->repeat.top_nested; + context.pattern_ptr = tail; + } else + goto backtrack; } - - LASTMARK_SAVE(); - - /* see if the tail matches */ - state->repeat = ctx->u.rep->prev; - DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern); - if (ret) { - RETURN_ON_ERROR(ret); - RETURN_SUCCESS; + break; + } + case SRE_OP_REPEAT_MAX_REV: + { + // Greedy repeat. + // ... + SRE_CODE* repeat_ptr = context.pattern_ptr; + SRE_CODE* end_repeat_ptr = repeat_ptr + repeat_ptr[1]; + SRE_CODE* body = repeat_ptr + 4; + SRE_CODE* tail = end_repeat_ptr + 2; + unsigned int available = context.text_ptr - context.text_start; + BOOL try_again; + BOOL try_tail; + TRACE(("|%p|%p|REPEAT_MAX_REV %u %u\n", context.pattern_ptr, context.text_ptr, repeat_ptr[2], repeat_ptr[3])); + // Are there enough characters available for the repeat? (We're assuming at least one per iteration, up to the minimum.) + if (repeat_ptr[2] > available) + goto backtrack; + // At this point the repeat info refers to the outer repeat, so save it. + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_REPEAT_MAX_REV, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + context.backtrack_item->repeat.top_nested = top_nested; // top_nested currently refers to the outer repeat. + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.repeat_min = repeat_min; + context.backtrack_item->repeat.repeat_max = repeat_max; + context.backtrack_item->repeat.repeat_counter = repeat_counter; + context.backtrack_item->repeat.repeat_start = repeat_start; + // Initialise the repeat info for the inner repeat. + top_nested = context.backtrack_item; + repeat_min = repeat_ptr[2]; + repeat_max = repeat_ptr[3] == SRE_UNLIMITED_REPEATS ? 
available : repeat_ptr[3]; + repeat_counter = 0; + repeat_start = context.text_ptr; + try_again = available > 0; + try_tail = repeat_min == 0 && SRE_POSSIBLE_MATCH_AHEAD(&context, state, tail); + if (try_again) { + if (try_tail) { + // Save this position for possible match of the tail. + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_MAX_REV, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.top_nested = top_nested; // top_nested currently refers to the outer repeat. + context.backtrack_item->repeat.pattern_ptr = tail; + } + context.pattern_ptr = body; + } else { + if (try_tail) { + // Restore the repeat info for the outer repeat. + repeat_min = top_nested->repeat.repeat_min; + repeat_max = top_nested->repeat.repeat_max; + repeat_counter = top_nested->repeat.repeat_counter; + repeat_start = top_nested->repeat.repeat_start; + top_nested = top_nested->repeat.top_nested; + context.pattern_ptr = tail; + } else + goto backtrack; } - - state->repeat = ctx->u.rep; - state->ptr = ctx->ptr; - - LASTMARK_RESTORE(); - - if (ctx->count >= ctx->u.rep->pattern[2] - && ctx->u.rep->pattern[2] != 65535) - RETURN_FAILURE; - - ctx->u.rep->count = ctx->count; - DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3, - ctx->u.rep->pattern+3); - if (ret) { - RETURN_ON_ERROR(ret); - RETURN_SUCCESS; + break; + } + case SRE_OP_REPEAT_MIN: + { + // Lazy repeat. + // ... + SRE_CODE* repeat_ptr = context.pattern_ptr; + SRE_CODE* end_repeat_ptr = repeat_ptr + repeat_ptr[1]; + SRE_CODE* body = repeat_ptr + 4; + SRE_CODE* tail = end_repeat_ptr + 2; + unsigned int available = context.text_end - context.text_ptr; + BOOL try_again; + BOOL try_tail; + TRACE(("|%p|%p|REPEAT_MIN %u %u\n", context.pattern_ptr, context.text_ptr, repeat_ptr[2], repeat_ptr[3])); + // Are there enough characters available for the repeat? (We're assuming at least one per iteration, up to the minimum.) 
+ if (repeat_ptr[2] > available) + goto backtrack; + // At this point the repeat info refers to the outer repeat, so save it. + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_REPEAT_MIN, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + context.backtrack_item->repeat.top_nested = top_nested; // top_nested currently refers to the outer repeat. + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.repeat_min = repeat_min; + context.backtrack_item->repeat.repeat_max = repeat_max; + context.backtrack_item->repeat.repeat_counter = repeat_counter; + context.backtrack_item->repeat.repeat_start = repeat_start; + // Initialise the repeat info for the inner repeat. + top_nested = context.backtrack_item; + repeat_min = repeat_ptr[2]; + repeat_max = repeat_ptr[3] == SRE_UNLIMITED_REPEATS ? available : repeat_ptr[3]; + repeat_counter = 0; + repeat_start = context.text_ptr; + try_again = available > 0; + try_tail = repeat_min == 0 && SRE_POSSIBLE_MATCH_AHEAD(&context, state, tail); + if (try_tail) { + if (try_again) { + // Need to save the repeat info for the inner repeat in case the tail fails. + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_MIN, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.top_nested = top_nested; // top_nested currently refers to the outer repeat. + context.backtrack_item->repeat.repeat_min = repeat_min; + context.backtrack_item->repeat.repeat_max = repeat_max; + context.backtrack_item->repeat.repeat_counter = repeat_counter; + context.backtrack_item->repeat.repeat_start = repeat_start; + context.backtrack_item->repeat.pattern_ptr = body; + } + // Restore the repeat info for the outer repeat. 
+ repeat_min = top_nested->repeat.repeat_min; + repeat_max = top_nested->repeat.repeat_max; + repeat_counter = top_nested->repeat.repeat_counter; + repeat_start = top_nested->repeat.repeat_start; + top_nested = top_nested->repeat.top_nested; + context.pattern_ptr = tail; + } else { + if (try_again) + context.pattern_ptr = body; + else + goto backtrack; } - ctx->u.rep->count = ctx->count-1; - state->ptr = ctx->ptr; - RETURN_FAILURE; - - case SRE_OP_GROUPREF: - /* match backreference */ - TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern, - ctx->ptr, ctx->pattern[0])); - i = ctx->pattern[0]; - { - Py_ssize_t groupref = i+i; - if (groupref >= state->lastmark) { - RETURN_FAILURE; - } else { - SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; - SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; - if (!p || !e || e < p) - RETURN_FAILURE; - while (p < e) { - if (ctx->ptr >= end || *ctx->ptr != *p) - RETURN_FAILURE; - p++; ctx->ptr++; - } + break; + } + case SRE_OP_REPEAT_MIN_REV: + { + // Lazy repeat. + // ... + SRE_CODE* repeat_ptr = context.pattern_ptr; + SRE_CODE* end_repeat_ptr = repeat_ptr + repeat_ptr[1]; + SRE_CODE* body = repeat_ptr + 4; + SRE_CODE* tail = end_repeat_ptr + 2; + unsigned int available = context.text_ptr - context.text_start; + BOOL try_again; + BOOL try_tail; + TRACE(("|%p|%p|REPEAT_MIN_REV %u %u\n", context.pattern_ptr, context.text_ptr, repeat_ptr[2], repeat_ptr[3])); + // Are there enough characters available for the repeat? (We're assuming at least one per iteration, up to the minimum.) + if (repeat_ptr[2] > available) + goto backtrack; + // At this point the repeat info refers to the outer repeat, so save it. + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_REPEAT_MIN_REV, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + context.backtrack_item->repeat.top_nested = top_nested; // top_nested currently refers to the outer repeat. 
+ context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.repeat_min = repeat_min; + context.backtrack_item->repeat.repeat_max = repeat_max; + context.backtrack_item->repeat.repeat_counter = repeat_counter; + context.backtrack_item->repeat.repeat_start = repeat_start; + // Initialise the repeat info for the inner repeat. + top_nested = context.backtrack_item; + repeat_min = repeat_ptr[2]; + repeat_max = repeat_ptr[3] == SRE_UNLIMITED_REPEATS ? available : repeat_ptr[3]; + repeat_counter = 0; + repeat_start = context.text_ptr; + try_again = available > 0; + try_tail = repeat_min == 0 && SRE_POSSIBLE_MATCH_AHEAD(&context, state, tail); + if (try_tail) { + if (try_again) { + // Need to save the repeat info for the inner repeat in case the tail fails. + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_MIN_REV, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.top_nested = top_nested; // top_nested currently refers to the outer repeat. + context.backtrack_item->repeat.repeat_min = repeat_min; + context.backtrack_item->repeat.repeat_max = repeat_max; + context.backtrack_item->repeat.repeat_counter = repeat_counter; + context.backtrack_item->repeat.repeat_start = repeat_start; + context.backtrack_item->repeat.pattern_ptr = body; } + // Restore the repeat info for the outer repeat. 
+ repeat_min = top_nested->repeat.repeat_min; + repeat_max = top_nested->repeat.repeat_max; + repeat_counter = top_nested->repeat.repeat_counter; + repeat_start = top_nested->repeat.repeat_start; + top_nested = top_nested->repeat.top_nested; + context.pattern_ptr = tail; + } else { + if (try_again) + context.pattern_ptr = body; + else + goto backtrack; } - ctx->pattern++; break; - - case SRE_OP_GROUPREF_IGNORE: - /* match backreference */ - TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern, - ctx->ptr, ctx->pattern[0])); - i = ctx->pattern[0]; - { - Py_ssize_t groupref = i+i; - if (groupref >= state->lastmark) { - RETURN_FAILURE; - } else { - SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; - SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; - if (!p || !e || e < p) - RETURN_FAILURE; - while (p < e) { - if (ctx->ptr >= end || - state->lower(*ctx->ptr) != state->lower(*p)) - RETURN_FAILURE; - p++; ctx->ptr++; - } + } + case SRE_OP_REPEAT_ONE_MAX: + { + // Greedy repeat. + // ... + SRE_CODE* repeat_ptr = context.pattern_ptr; + SRE_CODE* body = repeat_ptr + 4; + SRE_CODE* tail = repeat_ptr + 1 + repeat_ptr[1]; + unsigned int available = context.text_end - context.text_ptr; + SRE_CHAR* start_ptr; + unsigned int rep_max; + SRE_CHAR* max_ptr; + SRE_CHAR* min_ptr; + TRACE(("|%p|%p|REPEAT_ONE_MAX %u %u\n", context.pattern_ptr, context.text_ptr, repeat_ptr[2], repeat_ptr[3])); + // Are there enough characters available for the repeat? (We're assuming at least one per iteration, up to the minimum.) + if (repeat_ptr[2] > available) + goto backtrack; + start_ptr = context.text_ptr; + // Match up to the maximum. + rep_max = repeat_ptr[3] == SRE_UNLIMITED_REPEATS ? available : repeat_ptr[3]; + max_ptr = start_ptr + rep_max; + SRE_MATCH_MANY(&context, state, max_ptr, body); + // Unmatch down to the minimum until the tail could match. 
+ min_ptr = start_ptr + repeat_ptr[2]; + if (!SRE_UNMATCH_UNTIL_TAIL(&context, state, min_ptr, tail)) + // Reached the minimum and the tail still couldn't match. + goto backtrack; + // Save the repeat info for the inner repeat. + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_REPEAT_ONE_MAX, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.repeat_min = repeat_ptr[2]; + context.backtrack_item->repeat.repeat_max = rep_max; + context.backtrack_item->repeat.repeat_counter = context.text_ptr - start_ptr; + context.backtrack_item->repeat.pattern_ptr = context.pattern_ptr; + // Now match the tail. + context.pattern_ptr = tail; + break; + } + case SRE_OP_REPEAT_ONE_MAX_REV: + { + // Greedy repeat. + // ... + SRE_CODE* repeat_ptr = context.pattern_ptr; + SRE_CODE* body = repeat_ptr + 4; + SRE_CODE* tail = repeat_ptr + 1 + repeat_ptr[1]; + unsigned int available = context.text_ptr - context.text_start; + SRE_CHAR* start_ptr; + unsigned int rep_max; + SRE_CHAR* max_ptr; + SRE_CHAR* min_ptr; + TRACE(("|%p|%p|REPEAT_ONE_MAX_REV %u %u\n", context.pattern_ptr, context.text_ptr, repeat_ptr[2], repeat_ptr[3])); + // Are there enough characters available for the repeat? (We're assuming at least one per iteration, up to the minimum.) + if (repeat_ptr[2] > available) + goto backtrack; + start_ptr = context.text_ptr; + // Match up to the maximum. + rep_max = repeat_ptr[3] == SRE_UNLIMITED_REPEATS ? available : repeat_ptr[3]; + max_ptr = start_ptr - rep_max; + SRE_MATCH_MANY(&context, state, max_ptr, body); + // Unmatch down to the minimum until the tail could match. + min_ptr = start_ptr - repeat_ptr[2]; + if (!SRE_UNMATCH_UNTIL_TAIL_REV(&context, state, min_ptr, tail)) + // Reached the minimum and the tail still couldn't match. + goto backtrack; + // Save the repeat info for the inner repeat. 
+ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_REPEAT_ONE_MAX_REV, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.repeat_min = repeat_ptr[2]; + context.backtrack_item->repeat.repeat_max = rep_max; + context.backtrack_item->repeat.repeat_counter = start_ptr - context.text_ptr; + context.backtrack_item->repeat.pattern_ptr = context.pattern_ptr; + // Now match the tail. + context.pattern_ptr = tail; + break; + } + case SRE_OP_REPEAT_ONE_MIN: + { + // Lazy repeat. + // ... + SRE_CODE* repeat_ptr = context.pattern_ptr; + SRE_CODE* body = repeat_ptr + 4; + SRE_CODE* tail = repeat_ptr + 1 + repeat_ptr[1]; + unsigned int available = context.text_end - context.text_ptr; + SRE_CHAR* start_ptr; + SRE_CHAR* min_ptr; + unsigned int rep_max; + SRE_CHAR* max_ptr; + TRACE(("|%p|%p|REPEAT_ONE_MIN %u %u\n", context.pattern_ptr, context.text_ptr, repeat_ptr[2], repeat_ptr[3])); + // Are there enough characters available for the repeat? (We're assuming at least one per iteration, up to the minimum.) + if (repeat_ptr[2] > available) + goto backtrack; + start_ptr = context.text_ptr; + // Match up to the minimum. + min_ptr = start_ptr + repeat_ptr[2]; + SRE_MATCH_MANY(&context, state, min_ptr, body); + // Matched at least the minimum? + if (context.text_ptr < min_ptr) + goto backtrack; + // Match up to the maximum until the tail could match. + rep_max = repeat_ptr[3] == SRE_UNLIMITED_REPEATS ? available : repeat_ptr[3]; + max_ptr = start_ptr + rep_max; + if(!SRE_MATCH_UNTIL_TAIL(&context, state, max_ptr, body, tail)) + // Reached the maximum and the tail still couldn't match. + goto backtrack; + // Save the repeat info for the inner repeat. 
+ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_REPEAT_ONE_MIN, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.repeat_min = repeat_ptr[2]; + context.backtrack_item->repeat.repeat_max = rep_max; + context.backtrack_item->repeat.repeat_counter = context.text_ptr - start_ptr; + context.backtrack_item->repeat.pattern_ptr = context.pattern_ptr; + // Now match the tail. + context.pattern_ptr = tail; + break; + } + case SRE_OP_REPEAT_ONE_MIN_REV: + { + // Lazy repeat. + // ... + SRE_CODE* repeat_ptr = context.pattern_ptr; + SRE_CODE* body = repeat_ptr + 4; + SRE_CODE* tail = repeat_ptr + 1 + repeat_ptr[1]; + unsigned int available = context.text_ptr - context.text_start; + SRE_CHAR* start_ptr; + SRE_CHAR* min_ptr; + unsigned int rep_max; + SRE_CHAR* max_ptr; + TRACE(("|%p|%p|REPEAT_ONE_MIN_REV %u %u\n", context.pattern_ptr, context.text_ptr, repeat_ptr[2], repeat_ptr[3])); + // Are there enough characters available for the repeat? (We're assuming at least one per iteration, up to the minimum.) + if (repeat_ptr[2] > available) + goto backtrack; + start_ptr = context.text_ptr; + // Match up to the minimum. + min_ptr = start_ptr - repeat_ptr[2]; + SRE_MATCH_MANY(&context, state, min_ptr, body); + // Matched at least the minimum? + if (context.text_ptr > min_ptr) + goto backtrack; + // Match up to the maximum until the tail could match. + rep_max = repeat_ptr[3] == SRE_UNLIMITED_REPEATS ? available : repeat_ptr[3]; + max_ptr = start_ptr - rep_max; + if(!SRE_MATCH_UNTIL_TAIL(&context, state, max_ptr, body, tail)) + // Reached the maximum and the tail still couldn't match. + goto backtrack; + // Save the repeat info for the inner repeat. 
+ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_REPEAT_ONE_MIN_REV, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.repeat_min = repeat_ptr[2]; + context.backtrack_item->repeat.repeat_max = rep_max; + context.backtrack_item->repeat.repeat_counter = start_ptr - context.text_ptr; + context.backtrack_item->repeat.pattern_ptr = context.pattern_ptr; + // Now match the tail. + context.pattern_ptr = tail; + break; + } + case SRE_OP_REPEAT_ONE_POSS: + { + // Possessive repeat. + // ... + SRE_CODE* repeat_ptr = context.pattern_ptr; + SRE_CODE* body = repeat_ptr + 4; + SRE_CODE* tail = repeat_ptr + 1 + repeat_ptr[1]; + unsigned int available = context.text_end - context.text_ptr; + SRE_CHAR* start_ptr; + unsigned int rep_max; + SRE_CHAR* max_ptr; + SRE_CHAR* min_ptr; + TRACE(("|%p|%p|REPEAT_ONE_POSS %u %u\n", context.pattern_ptr, context.text_ptr, repeat_ptr[2], repeat_ptr[3])); + // Are there enough characters available for the repeat? (We're assuming at least one per iteration, up to the minimum.) + if (repeat_ptr[2] > available) + goto backtrack; + start_ptr = context.text_ptr; + // Match up to the maximum. + rep_max = repeat_ptr[3] == SRE_UNLIMITED_REPEATS ? available : repeat_ptr[3]; + max_ptr = start_ptr + rep_max; + SRE_MATCH_MANY(&context, state, max_ptr, body); + // Matched at least the minimum? + min_ptr = start_ptr + repeat_ptr[2]; + if (context.text_ptr < min_ptr) + goto backtrack; + // Now match the tail. + context.pattern_ptr = tail; + break; + } + case SRE_OP_REPEAT_ONE_POSS_REV: + { + // Possessive repeat. + // ... 
+ SRE_CODE* repeat_ptr = context.pattern_ptr; + SRE_CODE* body = repeat_ptr + 4; + SRE_CODE* tail = repeat_ptr + 1 + repeat_ptr[1]; + unsigned int available = context.text_ptr - context.text_start; + SRE_CHAR* start_ptr; + unsigned int rep_max; + SRE_CHAR* max_ptr; + SRE_CHAR* min_ptr; + TRACE(("|%p|%p|REPEAT_ONE_POSS_REV %u %u\n", context.pattern_ptr, context.text_ptr, repeat_ptr[2], repeat_ptr[3])); + // Are there enough characters available for the repeat? (We're assuming at least one per iteration, up to the minimum.) + if (repeat_ptr[2] > available) + goto backtrack; + start_ptr = context.text_ptr; + // Match up to the maximum. + rep_max = repeat_ptr[3] == SRE_UNLIMITED_REPEATS ? available : repeat_ptr[3]; + max_ptr = start_ptr - rep_max; + SRE_MATCH_MANY(&context, state, max_ptr, body); + // Matched at least the minimum? + min_ptr = start_ptr - repeat_ptr[2]; + if (context.text_ptr > min_ptr) + goto backtrack; + // Now match the tail. + context.pattern_ptr = tail; + break; + } + case SRE_OP_REPEAT_POSS: + { + // Possessive repeat. + // ... + SRE_CODE* repeat_ptr = context.pattern_ptr; + SRE_CODE* end_repeat_ptr = repeat_ptr + repeat_ptr[1]; + SRE_CODE* body = repeat_ptr + 4; + SRE_CODE* tail = end_repeat_ptr + 2; + unsigned int available = context.text_end - context.text_ptr; + BOOL try_again; + BOOL try_tail; + TRACE(("|%p|%p|REPEAT_POSS %u %u\n", context.pattern_ptr, context.text_ptr, repeat_ptr[2], repeat_ptr[3])); + // Are there enough characters available for the repeat? (We're assuming at least one per iteration, up to the minimum.) + if (repeat_ptr[2] > available) + goto backtrack; + // At this point the repeat info refers to the outer repeat, so save it. + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_REPEAT_POSS, TRUE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + // If the subpattern succeeds then we'll discard the enclosed backtrack info, + // including any marks, so we need to save the marks here. 
+ memmove(context.backtrack_item->marks, context.marks, context.marks_size); + context.backtrack_item->repeat.top_nested = top_nested; // top_nested currently refers to the outer repeat. + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.repeat_min = repeat_min; + context.backtrack_item->repeat.repeat_max = repeat_max; + context.backtrack_item->repeat.repeat_counter = repeat_counter; + context.backtrack_item->repeat.repeat_start = repeat_start; + // Initialise the repeat info for the inner repeat. + top_nested = context.backtrack_item; + repeat_min = repeat_ptr[2]; + repeat_max = repeat_ptr[3] == SRE_UNLIMITED_REPEATS ? available : repeat_ptr[3]; + repeat_counter = 0; + repeat_start = context.text_ptr; + try_again = available > 0; + try_tail = repeat_min == 0 && SRE_POSSIBLE_MATCH_AHEAD(&context, state, tail); + if (try_again) { + if (try_tail) { + // Save this position for possible match of the tail. + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_POSS, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.top_nested = top_nested; // top_nested currently refers to the outer repeat. + context.backtrack_item->repeat.pattern_ptr = tail; } + context.pattern_ptr = body; + } else { + if (try_tail) { + // Restore the repeat info for the outer repeat. + repeat_min = top_nested->repeat.repeat_min; + repeat_max = top_nested->repeat.repeat_max; + repeat_counter = top_nested->repeat.repeat_counter; + repeat_start = top_nested->repeat.repeat_start; + top_nested = top_nested->repeat.top_nested; + context.pattern_ptr = tail; + } else + goto backtrack; } - ctx->pattern++; break; - - case SRE_OP_GROUPREF_EXISTS: - TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern, - ctx->ptr, ctx->pattern[0])); - /* codeyes codeno ... 
*/ - i = ctx->pattern[0]; - { - Py_ssize_t groupref = i+i; - if (groupref >= state->lastmark) { - ctx->pattern += ctx->pattern[1]; - break; - } else { - SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; - SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; - if (!p || !e || e < p) { - ctx->pattern += ctx->pattern[1]; - break; - } + } + case SRE_OP_REPEAT_POSS_REV: + { + // Possessive repeat. + // ... + SRE_CODE* repeat_ptr = context.pattern_ptr; + SRE_CODE* end_repeat_ptr = repeat_ptr + repeat_ptr[1]; + SRE_CODE* body = repeat_ptr + 4; + SRE_CODE* tail = end_repeat_ptr + 2; + unsigned int available = context.text_ptr - context.text_start; + BOOL try_again; + BOOL try_tail; + TRACE(("|%p|%p|REPEAT_POSS_REV %u %u\n", context.pattern_ptr, context.text_ptr, repeat_ptr[2], repeat_ptr[3])); + // Are there enough characters available for the repeat? (We're assuming at least one per iteration, up to the minimum.) + if (repeat_ptr[2] > available) + goto backtrack; + // At this point the repeat info refers to the outer repeat, so save it. + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_REPEAT_POSS_REV, TRUE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + // If the subpattern succeeds then we'll discard the enclosed backtrack info, + // including any marks, so we need to save the marks here. + memmove(context.backtrack_item->marks, context.marks, context.marks_size); + context.backtrack_item->repeat.top_nested = top_nested; // top_nested currently refers to the outer repeat. + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.repeat_min = repeat_min; + context.backtrack_item->repeat.repeat_max = repeat_max; + context.backtrack_item->repeat.repeat_counter = repeat_counter; + context.backtrack_item->repeat.repeat_start = repeat_start; + // Initialise the repeat info for the inner repeat. + top_nested = context.backtrack_item; + repeat_min = repeat_ptr[2]; + repeat_max = repeat_ptr[3] == SRE_UNLIMITED_REPEATS ? 
available : repeat_ptr[3]; + repeat_counter = 0; + repeat_start = context.text_ptr; + try_again = available > 0; + try_tail = repeat_min == 0 && SRE_POSSIBLE_MATCH_AHEAD(&context, state, tail); + if (try_again) { + if (try_tail) { + // Save this position for possible match of the tail. + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_POSS_REV, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.top_nested = top_nested; // top_nested currently refers to the outer repeat. + context.backtrack_item->repeat.pattern_ptr = tail; } + context.pattern_ptr = body; + } else { + if (try_tail) { + // Restore the repeat info for the outer repeat. + repeat_min = top_nested->repeat.repeat_min; + repeat_max = top_nested->repeat.repeat_max; + repeat_counter = top_nested->repeat.repeat_counter; + repeat_start = top_nested->repeat.repeat_start; + top_nested = top_nested->repeat.top_nested; + context.pattern_ptr = tail; + } else + goto backtrack; } - ctx->pattern += 2; break; - - case SRE_OP_ASSERT: - /* assert subpattern */ - /* */ - TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern, - ctx->ptr, ctx->pattern[1])); - state->ptr = ctx->ptr - ctx->pattern[1]; - if (state->ptr < state->beginning) - RETURN_FAILURE; - DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2); - RETURN_ON_FAILURE(ret); - ctx->pattern += ctx->pattern[0]; + } + case SRE_OP_START_OF_LINE: + // Start of line. + // + TRACE(("|%p|%p|START_OF_LINE\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr > context.text_beginning && !state->encoding->in_category(SRE_CAT_LineBreak, context.text_ptr[-1])) + goto backtrack; + context.pattern_ptr++; + break; + case SRE_OP_START_OF_STRING: + // Start of string. 
+ // + TRACE(("|%p|%p|START_OF_STRING\n", context.pattern_ptr, context.text_ptr)); + if (context.text_ptr > context.text_beginning) + goto backtrack; + context.pattern_ptr++; break; - - case SRE_OP_ASSERT_NOT: - /* assert not subpattern */ - /* */ - TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern, - ctx->ptr, ctx->pattern[1])); - state->ptr = ctx->ptr - ctx->pattern[1]; - if (state->ptr >= state->beginning) { - DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2); - if (ret) { - RETURN_ON_ERROR(ret); - RETURN_FAILURE; + case SRE_OP_SUCCESS: + { + // End of pattern. + // + int zero_width; + int m; + SRE_CHAR* end_ptr; + TRACE(("|%p|%p|SUCCESS\n", context.pattern_ptr, context.text_ptr)); + // Is the entire matched portion zero-width? + zero_width = context.text_ptr == context.text_start; + // Reject the match if it's zero-width and we aren't allowed to return it. + if (zero_width && state->reject_zero_width) + goto backtrack; + + // Find the numbered mark which matched the furthest to the right. + end_ptr = NULL; + for (m = 1; m < state->numbered_mark_count; m += 2) { + TRACE(("context.marks[%u] = 0x%p, context.marks[%u] = 0x%p\n", m - 1, context.marks[m - 1], m, context.marks[m])); + if (context.marks[m - 1] != NULL && context.marks[m] >= context.marks[m - 1]) { + state->lastmark = m; + if (end_ptr < context.marks[m]) { + state->lastindex = 1 + m / 2; + end_ptr = context.marks[m]; + } } } - ctx->pattern += ctx->pattern[0]; - break; - - case SRE_OP_FAILURE: - /* immediate failure */ - TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr)); - RETURN_FAILURE; + // Find the named mark which matched the furthest to the right. 
+ end_ptr = NULL; + for (m = state->numbered_mark_count + 1; m < state->numbered_mark_count + state->named_mark_count; m += 2) { + TRACE(("context.marks[%u] = 0x%p, context.marks[%u] = 0x%p\n", m - 1, context.marks[m - 1], m, context.marks[m])); + if (context.marks[m - 1] != NULL && context.marks[m] >= context.marks[m - 1]) { + if (end_ptr < context.marks[m]) { + state->last_named_index = 1 + m / 2; + end_ptr = context.marks[m]; + } + } + } + state->ptr = context.text_ptr; + return SRE_CLEANUP(&context, state, 1); + } default: - TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr, - ctx->pattern[-1])); - RETURN_ERROR(SRE_ERROR_ILLEGAL); - } - } - -exit: - ctx_pos = ctx->last_ctx_pos; - jump = ctx->jump; - DATA_POP_DISCARD(ctx); - if (ctx_pos == -1) - return ret; - DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos); - - switch (jump) { - case JUMP_MAX_UNTIL_2: - TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr)); - goto jump_max_until_2; - case JUMP_MAX_UNTIL_3: - TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr)); - goto jump_max_until_3; - case JUMP_MIN_UNTIL_2: - TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr)); - goto jump_min_until_2; - case JUMP_MIN_UNTIL_3: - TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr)); - goto jump_min_until_3; - case JUMP_BRANCH: - TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr)); - goto jump_branch; - case JUMP_MAX_UNTIL_1: - TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr)); - goto jump_max_until_1; - case JUMP_MIN_UNTIL_1: - TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr)); - goto jump_min_until_1; - case JUMP_REPEAT: - TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr)); - goto jump_repeat; - case JUMP_REPEAT_ONE_1: - TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr)); - goto jump_repeat_one_1; - case JUMP_REPEAT_ONE_2: - TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr)); - goto jump_repeat_one_2; - case JUMP_MIN_REPEAT_ONE: - 
TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr)); - goto jump_min_repeat_one; - case JUMP_ASSERT: - TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr)); - goto jump_assert; - case JUMP_ASSERT_NOT: - TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr)); - goto jump_assert_not; - case JUMP_NONE: - TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret)); - break; + TRACE(("|%p|%p|UNKNOWN %u\n", context.pattern_ptr, context.text_ptr, context.pattern_ptr[0])); + return SRE_CLEANUP(&context, state, SRE_ERROR_ILLEGAL); + } + } + +backtrack: + TRACE(("|%p|%p|BACKTRACK ", context.pattern_ptr, context.text_ptr)); + context.backtrack_item = &context.backtrack_chunk->items[context.backtrack_chunk->count - 1]; + switch (context.backtrack_item->op) { + case SRE_OP_ASSERT: + // Assert subpattern. + // ... + TRACE(("ASSERT\n")); + // The subpattern has failed, so the marks have already backtracked and been restored. + context.text_start = context.backtrack_item->assert.text_start; + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + case SRE_OP_ASSERT_NOT: + // Assert not subpattern. + // ... + TRACE(("ASSERT_NOT\n")); + // The subpattern has failed, so the marks have already backtracked and been restored. + context.text_start = context.backtrack_item->assert.text_start; + context.text_ptr = context.backtrack_item->assert.text_ptr; + context.pattern_ptr = context.backtrack_item->assert.pattern_ptr; + SRE_DISCARD_BACKTRACK(&context); + context.pattern_ptr += 1 + context.pattern_ptr[1]; + goto advance; + case SRE_OP_ATOMIC: + // Atomic subpattern. + // ... + TRACE(("ATOMIC\n")); + // The subpattern has failed, so the marks have already backtracked and been restored. + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + case SRE_OP_BRANCH: + { + // Alternation. + // ... ... 
0 + SRE_CODE* skip_ptr = context.backtrack_item->branch.pattern_ptr; + TRACE(("BRANCH\n")); + context.text_ptr = context.backtrack_item->branch.text_ptr; + // Look ahead in the branch to avoid unnecessary backtracking. + while (! SRE_POSSIBLE_MATCH_AHEAD(&context, state, skip_ptr + 1)) { + skip_ptr += skip_ptr[0]; + // Is there another branch? + if (skip_ptr[0] == 0) { + // No more branches, so backtrack. + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + } + } + // Try this branch. + context.pattern_ptr = skip_ptr + 1; + // Is there another branch? + skip_ptr += skip_ptr[0]; + if (skip_ptr[0] == 0) + // No more branches after this one. + SRE_DISCARD_BACKTRACK(&context); + else + // Save the next branch for backtracking. + context.backtrack_item->branch.pattern_ptr = skip_ptr; + goto advance; + } + case SRE_OP_END_ATOMIC: + // Atomic subpattern. + // ... + TRACE(("END_ATOMIC\n")); + // Restore the marks. + memmove(context.marks, context.backtrack_item->marks, context.marks_size); + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + case SRE_OP_END_REPEAT_MAX: + // End of greedy repeat. + // ... + TRACE(("END_REPEAT_MAX\n")); + // Restore the repeat info for the outer repeat. + top_nested = context.backtrack_item->repeat.top_nested; + context.text_ptr = context.backtrack_item->repeat.text_ptr; + context.pattern_ptr = context.backtrack_item->repeat.pattern_ptr; + SRE_DISCARD_BACKTRACK(&context); + repeat_min = top_nested->repeat.repeat_min; + repeat_max = top_nested->repeat.repeat_max; + repeat_counter = top_nested->repeat.repeat_counter; + repeat_start = top_nested->repeat.repeat_start; + goto advance; + case SRE_OP_END_REPEAT_MAX_REV: + // End of greedy repeat. + // ... + TRACE(("END_REPEAT_MAX_REV\n")); + // Restore the repeat info for the outer repeat. 
+ top_nested = context.backtrack_item->repeat.top_nested; + context.text_ptr = context.backtrack_item->repeat.text_ptr; + context.pattern_ptr = context.backtrack_item->repeat.pattern_ptr; + SRE_DISCARD_BACKTRACK(&context); + repeat_min = top_nested->repeat.repeat_min; + repeat_max = top_nested->repeat.repeat_max; + repeat_counter = top_nested->repeat.repeat_counter; + repeat_start = top_nested->repeat.repeat_start; + goto advance; + case SRE_OP_END_REPEAT_MIN: + // Lazy repeat. + // ... + TRACE(("END_REPEAT_MIN\n")); + // Restore the repeat info for the inner repeat. + context.text_ptr = context.backtrack_item->repeat.text_ptr; + top_nested = context.backtrack_item->repeat.top_nested; + repeat_min = context.backtrack_item->repeat.repeat_min; + repeat_max = context.backtrack_item->repeat.repeat_max; + repeat_counter = context.backtrack_item->repeat.repeat_counter; + repeat_start = context.backtrack_item->repeat.repeat_start; + context.pattern_ptr = context.backtrack_item->repeat.pattern_ptr; + SRE_DISCARD_BACKTRACK(&context); + goto advance; + case SRE_OP_END_REPEAT_MIN_REV: + // Lazy repeat. + // ... + TRACE(("END_REPEAT_MIN_REV\n")); + // Restore the repeat info for the inner repeat. + context.text_ptr = context.backtrack_item->repeat.text_ptr; + top_nested = context.backtrack_item->repeat.top_nested; + repeat_min = context.backtrack_item->repeat.repeat_min; + repeat_max = context.backtrack_item->repeat.repeat_max; + repeat_counter = context.backtrack_item->repeat.repeat_counter; + repeat_start = context.backtrack_item->repeat.repeat_start; + context.pattern_ptr = context.backtrack_item->repeat.pattern_ptr; + SRE_DISCARD_BACKTRACK(&context); + goto advance; + case SRE_OP_END_REPEAT_POSS: + // End of greedy repeat. + // ... + TRACE(("END_REPEAT_POSS\n")); + // Restore the repeat info for the outer repeat. 
+ top_nested = context.backtrack_item->repeat.top_nested; + context.text_ptr = context.backtrack_item->repeat.text_ptr; + context.pattern_ptr = context.backtrack_item->repeat.pattern_ptr; + SRE_DISCARD_BACKTRACK(&context); + repeat_min = top_nested->repeat.repeat_min; + repeat_max = top_nested->repeat.repeat_max; + repeat_counter = top_nested->repeat.repeat_counter; + repeat_start = top_nested->repeat.repeat_start; + goto advance; + case SRE_OP_END_REPEAT_POSS_REV: + // End of greedy repeat. + // ... + TRACE(("END_REPEAT_POSS_REV\n")); + // Restore the repeat info for the outer repeat. + top_nested = context.backtrack_item->repeat.top_nested; + context.text_ptr = context.backtrack_item->repeat.text_ptr; + context.pattern_ptr = context.backtrack_item->repeat.pattern_ptr; + SRE_DISCARD_BACKTRACK(&context); + repeat_min = top_nested->repeat.repeat_min; + repeat_max = top_nested->repeat.repeat_max; + repeat_counter = top_nested->repeat.repeat_counter; + repeat_start = top_nested->repeat.repeat_start; + goto advance; + case SRE_OP_FAILURE: + // Failed to match. + TRACE(("FAILURE\n")); + state->reject_zero_width = 0; + return SRE_CLEANUP(&context, state, 0); + case SRE_OP_MARK: + // Set mark. + // + TRACE(("MARK\n")); + // The numbered and named marks need to be restored in the opposite order to which they were saved. + context.marks[context.backtrack_item->mark.named_index] = context.backtrack_item->mark.named_mark_ptr; + DEBUG_TRACE(("restoring mark %u to 0x%p\n", context.backtrack_item->mark.named_index, context.backtrack_item->mark.named_mark_ptr)); + context.marks[context.backtrack_item->mark.numbered_index] = context.backtrack_item->mark.numbered_mark_ptr; + DEBUG_TRACE(("restoring mark %u to 0x%p\n", context.backtrack_item->mark.numbered_index, context.backtrack_item->mark.numbered_mark_ptr)); + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + case SRE_OP_REPEAT_MAX: + // Greedy repeat. + // ... 
+ TRACE(("REPEAT_MAX\n")); + // Restore the repeat info for the outer repeat. + top_nested = context.backtrack_item->repeat.top_nested; + repeat_min = context.backtrack_item->repeat.repeat_min; + repeat_max = context.backtrack_item->repeat.repeat_max; + repeat_counter = context.backtrack_item->repeat.repeat_counter; + repeat_start = context.backtrack_item->repeat.repeat_start; + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + case SRE_OP_REPEAT_MAX_REV: + // Greedy repeat. + // ... + TRACE(("REPEAT_MAX_REV\n")); + // Restore the repeat info for the outer repeat. + top_nested = context.backtrack_item->repeat.top_nested; + repeat_min = context.backtrack_item->repeat.repeat_min; + repeat_max = context.backtrack_item->repeat.repeat_max; + repeat_counter = context.backtrack_item->repeat.repeat_counter; + repeat_start = context.backtrack_item->repeat.repeat_start; + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + case SRE_OP_REPEAT_MIN: + // Lazy repeat. + // ... + TRACE(("REPEAT_MIN\n")); + // Restore the repeat info for the outer repeat. + top_nested = context.backtrack_item->repeat.top_nested; + repeat_min = context.backtrack_item->repeat.repeat_min; + repeat_max = context.backtrack_item->repeat.repeat_max; + repeat_counter = context.backtrack_item->repeat.repeat_counter; + repeat_start = context.backtrack_item->repeat.repeat_start; + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + case SRE_OP_REPEAT_MIN_REV: + // Lazy repeat. + // ... + TRACE(("REPEAT_MIN_REV\n")); + // Restore the repeat info for the outer repeat. + top_nested = context.backtrack_item->repeat.top_nested; + repeat_min = context.backtrack_item->repeat.repeat_min; + repeat_max = context.backtrack_item->repeat.repeat_max; + repeat_counter = context.backtrack_item->repeat.repeat_counter; + repeat_start = context.backtrack_item->repeat.repeat_start; + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + case SRE_OP_REPEAT_ONE_MAX: + { + // Greedy repeat. + // ... 
+ SRE_CODE* repeat_ptr = context.backtrack_item->repeat.pattern_ptr; + SRE_CODE* tail = repeat_ptr + 1 + repeat_ptr[1]; + SRE_CHAR* start_ptr; + SRE_CHAR* min_ptr; + TRACE(("REPEAT_ONE_MAX\n")); + context.text_ptr = context.backtrack_item->repeat.text_ptr; + start_ptr = context.text_ptr - context.backtrack_item->repeat.repeat_counter; + // Match down to the minimum until the tail could match. + min_ptr = start_ptr + context.backtrack_item->repeat.repeat_min; + // Release a character. + context.text_ptr--; + if(!SRE_UNMATCH_UNTIL_TAIL(&context, state, min_ptr, tail)) { + // Reached the minimum and the tail still couldn't match. + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + } + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.repeat_counter = context.text_ptr - start_ptr; + // Now match the tail. + context.pattern_ptr = tail; + goto advance; + } + case SRE_OP_REPEAT_ONE_MAX_REV: + { + // Greedy repeat. + // ... + SRE_CODE* repeat_ptr = context.backtrack_item->repeat.pattern_ptr; + SRE_CODE* tail = repeat_ptr + 1 + repeat_ptr[1]; + SRE_CHAR* start_ptr; + SRE_CHAR* min_ptr; + TRACE(("REPEAT_ONE_MAX_REV\n")); + context.text_ptr = context.backtrack_item->repeat.text_ptr; + start_ptr = context.text_ptr + context.backtrack_item->repeat.repeat_counter; + // Match down to the minimum until the tail could match. + min_ptr = start_ptr - context.backtrack_item->repeat.repeat_min; + // Release a character. + context.text_ptr++; + if(!SRE_UNMATCH_UNTIL_TAIL_REV(&context, state, min_ptr, tail)) { + // Reached the minimum and the tail still couldn't match. + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + } + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.repeat_counter = start_ptr - context.text_ptr; + // Now match the tail. + context.pattern_ptr = tail; + goto advance; + } + case SRE_OP_REPEAT_ONE_MIN: + { + // Lazy repeat. + // ... 
+ SRE_CODE* repeat_ptr = context.backtrack_item->repeat.pattern_ptr; + SRE_CODE* body = repeat_ptr + 4; + SRE_CODE* tail = repeat_ptr + 1 + repeat_ptr[1]; + unsigned int available; + SRE_CHAR* start_ptr; + SRE_CHAR* max_ptr; + TRACE(("REPEAT_ONE_MIN\n")); + context.text_ptr = context.backtrack_item->repeat.text_ptr; + available = context.text_end - context.text_ptr; + start_ptr = context.text_ptr - context.backtrack_item->repeat.repeat_counter; + // Consume a character. + context.text_ptr++; + // Match up to the maximum until the tail could match. + max_ptr = start_ptr + context.backtrack_item->repeat.repeat_max; + if(context.text_ptr > max_ptr || !SRE_MATCH_UNTIL_TAIL(&context, state, max_ptr, body, tail)) { + // Reached the maximum and the tail still couldn't match. + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + } + // Now match the tail. + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.repeat_counter = context.text_ptr - start_ptr; + context.pattern_ptr = tail; + goto advance; + } + case SRE_OP_REPEAT_ONE_MIN_REV: + { + // Lazy repeat. + // ... + SRE_CODE* repeat_ptr = context.backtrack_item->repeat.pattern_ptr; + SRE_CODE* body = repeat_ptr + 4; + SRE_CODE* tail = repeat_ptr + 1 + repeat_ptr[1]; + unsigned int available; + SRE_CHAR* start_ptr; + SRE_CHAR* max_ptr; + TRACE(("REPEAT_ONE_MIN_REV\n")); + context.text_ptr = context.backtrack_item->repeat.text_ptr; + available = context.text_ptr - context.text_start; + start_ptr = context.text_ptr + context.backtrack_item->repeat.repeat_counter; + // Consume a character. + context.text_ptr--; + // Match up to the maximum until the tail could match. + max_ptr = start_ptr - context.backtrack_item->repeat.repeat_max; + if(context.text_ptr < max_ptr || !SRE_MATCH_UNTIL_TAIL(&context, state, max_ptr, body, tail)) { + // Reached the maximum and the tail still couldn't match. + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + } + // Now match the tail. 
+ context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.repeat_counter = start_ptr - context.text_ptr; + context.pattern_ptr = tail; + goto advance; + } + case SRE_OP_REPEAT_POSS: + // Possessive repeat. + // ... + TRACE(("REPEAT_POSS\n")); + // Restore the repeat info for the outer repeat. + memmove(context.marks, context.backtrack_item->marks, context.marks_size); + top_nested = context.backtrack_item->repeat.top_nested; + repeat_min = context.backtrack_item->repeat.repeat_min; + repeat_max = context.backtrack_item->repeat.repeat_max; + repeat_counter = context.backtrack_item->repeat.repeat_counter; + repeat_start = context.backtrack_item->repeat.repeat_start; + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + case SRE_OP_REPEAT_POSS_REV: + // Possessive repeat. + // ... + TRACE(("REPEAT_POSS_REV\n")); + // Restore the repeat info for the outer repeat. + memmove(context.marks, context.backtrack_item->marks, context.marks_size); + top_nested = context.backtrack_item->repeat.top_nested; + repeat_min = context.backtrack_item->repeat.repeat_min; + repeat_max = context.backtrack_item->repeat.repeat_max; + repeat_counter = context.backtrack_item->repeat.repeat_counter; + repeat_start = context.backtrack_item->repeat.repeat_start; + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + default: + TRACE(("UNKNOWN %u\n", context.backtrack_item->op)); + return SRE_CLEANUP(&context, state, SRE_ERROR_ILLEGAL); } - return ret; /* should never get here */ + return 0; } -LOCAL(Py_ssize_t) -SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) -{ - SRE_CHAR* ptr = (SRE_CHAR *)state->start; - SRE_CHAR* end = (SRE_CHAR *)state->end; +LOCAL(Py_ssize_t) SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) { + SRE_CODE* tail; + SRE_CONTEXT context; Py_ssize_t status = 0; - Py_ssize_t prefix_len = 0; - Py_ssize_t prefix_skip = 0; - SRE_CODE* prefix = NULL; - SRE_CODE* charset = NULL; - SRE_CODE* overlap = NULL; - int flags = 0; - if (pattern[0] == 
SRE_OP_INFO) { - /* optimization info block */ - /* <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ - - flags = pattern[2]; - - if (pattern[3] > 1) { - /* adjust end point (but make sure we leave at least one - character in there, so literal search will work) */ - end -= pattern[3]-1; - if (end <= ptr) - end = ptr+1; - } - - if (flags & SRE_INFO_PREFIX) { - /* pattern starts with a known prefix */ - /* */ - prefix_len = pattern[5]; - prefix_skip = pattern[6]; - prefix = pattern + 7; - overlap = prefix + prefix_len - 1; - } else if (flags & SRE_INFO_CHARSET) - /* pattern starts with a character from a known set */ - /* */ - charset = pattern + 5; - - pattern += 1 + pattern[1]; - } - - TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip)); - TRACE(("charset = %p\n", charset)); - -#if defined(USE_FAST_SEARCH) - if (prefix_len > 1) { - /* pattern starts with a known prefix. use the overlap - table to skip forward as fast as we possibly can */ - Py_ssize_t i = 0; - end = (SRE_CHAR *)state->end; - while (ptr < end) { - for (;;) { - if ((SRE_CODE) ptr[0] != prefix[i]) { - if (!i) - break; - else - i = overlap[i]; - } else { - if (++i == prefix_len) { - /* found a potential match */ - TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr)); - state->start = ptr + 1 - prefix_len; - state->ptr = ptr + 1 - prefix_len + prefix_skip; - if (flags & SRE_INFO_LITERAL) - return 1; /* we got all of it */ - status = SRE_MATCH(state, pattern + 2*prefix_skip); - if (status != 0) - return status; - /* close but no cigar -- try again */ - i = overlap[i]; - } + tail = pattern; + while (tail[0] == SRE_OP_MARK) + tail += SRE_MARK_OP_SIZE; + + context.text_beginning = (SRE_CHAR *)state->beginning; + context.text_start = (SRE_CHAR *)state->start; + context.text_end = (SRE_CHAR *)state->end; + + // Point to the final newline if it's the final character. 
+ context.final_linebreak = context.text_beginning < context.text_end && + state->encoding->in_category(SRE_CAT_LineBreak, context.text_end[-1]) ? context.text_end - 1 : NULL; + + // state->reject_zero_width might initially be set to reject an initial zero-width match. + // If there's no match initially then state->reject_zero_width will be cleared to allow any kind of match subsequently. + if (state->reverse) { + context.text_ptr = (SRE_CHAR *)state->end; + + while (context.text_ptr >= context.text_start) { + TRACE(("|%p|%p|SEARCH\n", pattern, context.text_ptr)); + if (SRE_POSSIBLE_MATCH_AHEAD(&context, state, tail)) { + state->end = state->ptr = context.text_ptr; + status = SRE_MATCH(state, state->pattern_code); + if (status != 0) break; - } } - ptr++; + context.text_ptr--; + state->reject_zero_width = 0; } - return 0; - } -#endif + } else { + context.text_ptr = (SRE_CHAR *)state->start; - if (pattern[0] == SRE_OP_LITERAL) { - /* pattern starts with a literal character. this is used - for short prefixes, and if fast search is disabled */ - SRE_CODE chr = pattern[1]; - end = (SRE_CHAR *)state->end; - for (;;) { - while (ptr < end && (SRE_CODE) ptr[0] != chr) - ptr++; - if (ptr >= end) - return 0; - TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr)); - state->start = ptr; - state->ptr = ++ptr; - if (flags & SRE_INFO_LITERAL) - return 1; /* we got all of it */ - status = SRE_MATCH(state, pattern + 2); - if (status != 0) - break; - } - } else if (charset) { - /* pattern starts with a character from a known set */ - end = (SRE_CHAR *)state->end; - for (;;) { - while (ptr < end && !SRE_CHARSET(charset, ptr[0])) - ptr++; - if (ptr >= end) - return 0; - TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr)); - state->start = ptr; - state->ptr = ptr; - status = SRE_MATCH(state, pattern); - if (status != 0) - break; - ptr++; - } - } else - /* general case */ - while (ptr <= end) { - TRACE(("|%p|%p|SEARCH\n", pattern, ptr)); - state->start = state->ptr = ptr++; - status = 
SRE_MATCH(state, pattern); - if (status != 0) - break; + while (context.text_ptr <= context.text_end) { + TRACE(("|%p|%p|SEARCH\n", pattern, context.text_ptr)); + if (SRE_POSSIBLE_MATCH_AHEAD(&context, state, tail)) { + state->start = state->ptr = context.text_ptr; + status = SRE_MATCH(state, state->pattern_code); + if (status != 0) + break; + } + context.text_ptr++; + state->reject_zero_width = 0; } + } return status; } -LOCAL(int) -SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len) -{ +LOCAL(int) SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len) { /* check if given string is a literal template (i.e. no escapes) */ while (len-- > 0) if (*ptr++ == '\\') return 0; - return 1; + return TRUE; } #if !defined(SRE_RECURSIVE) @@ -1630,49 +3555,65 @@ /* factories and destructors */ /* see sre.h for object declarations */ -static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int); -static PyObject*pattern_scanner(PatternObject*, PyObject*); +static PyObject* pattern_new_match(PatternObject*, SRE_STATE*, int); +static PyObject* pattern_scanner(PatternObject*, PyObject*); -static PyObject * -sre_codesize(PyObject* self, PyObject *unused) -{ +static PyObject* sre_codesize(PyObject* self, PyObject *unused) { return Py_BuildValue("l", sizeof(SRE_CODE)); } -static PyObject * -sre_getlower(PyObject* self, PyObject* args) -{ +static PyObject* sre_getlower(PyObject* self, PyObject* args) { int character, flags; if (!PyArg_ParseTuple(args, "ii", &character, &flags)) return NULL; if (flags & SRE_FLAG_LOCALE) - return Py_BuildValue("i", sre_lower_locale(character)); + return Py_BuildValue("i", loc_lower(character)); if (flags & SRE_FLAG_UNICODE) #if defined(HAVE_UNICODE) - return Py_BuildValue("i", sre_lower_unicode(character)); + return Py_BuildValue("i", uni_lower(character)); #else - return Py_BuildValue("i", sre_lower_locale(character)); + return Py_BuildValue("i", loc_lower(character)); #endif - return Py_BuildValue("i", sre_lower(character)); + return 
Py_BuildValue("i", ascii_lower(character)); } -LOCAL(void) -state_reset(SRE_STATE* state) -{ - /* FIXME: dynamic! */ - /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/ +static PyObject* sre_getupper(PyObject* self, PyObject* args) { + int character, flags; + if (!PyArg_ParseTuple(args, "ii", &character, &flags)) + return NULL; + if (flags & SRE_FLAG_LOCALE) + return Py_BuildValue("i", loc_upper(character)); + if (flags & SRE_FLAG_UNICODE) +#if defined(HAVE_UNICODE) + return Py_BuildValue("i", uni_upper(character)); +#else + return Py_BuildValue("i", loc_upper(character)); +#endif + return Py_BuildValue("i", ascii_upper(character)); +} + +static PyObject* sre_gettitle(PyObject* self, PyObject* args) { + int character, flags; + if (!PyArg_ParseTuple(args, "ii", &character, &flags)) + return NULL; + if (flags & SRE_FLAG_LOCALE) + return Py_BuildValue("i", loc_upper(character)); + if (flags & SRE_FLAG_UNICODE) +#if defined(HAVE_UNICODE) + return Py_BuildValue("i", uni_title(character)); +#else + return Py_BuildValue("i", loc_upper(character)); +#endif + return Py_BuildValue("i", ascii_upper(character)); +} +LOCAL(void) state_reset(SRE_STATE* state) { state->lastmark = -1; state->lastindex = -1; - - state->repeat = NULL; - - data_stack_dealloc(state); + state->last_named_index = -1; } -static void* -getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize) -{ +static void* getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize) { /* given a python object, return a data pointer, a length (in characters), and a character size. 
return NULL if the object is not a string (or not compatible) */ @@ -1694,7 +3635,7 @@ #endif /* get pointer to string buffer */ - buffer = Py_TYPE(string)->tp_as_buffer; + buffer = string->ob_type->tp_as_buffer; if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount || buffer->bf_getsegcount(string, NULL) != 1) { PyErr_SetString(PyExc_TypeError, "expected string or buffer"); @@ -1736,10 +3677,8 @@ return ptr; } -LOCAL(PyObject*) -state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, - Py_ssize_t start, Py_ssize_t end) -{ +LOCAL(PyObject*) state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, + Py_ssize_t start, Py_ssize_t end, SRE_CODE* pattern_code) { /* prepare state object */ Py_ssize_t length; @@ -1748,12 +3687,25 @@ memset(state, 0, sizeof(SRE_STATE)); + state->pattern_code = pattern_code; + + state->backtrack_chunk = (SRE_BACKTRACK_CHUNK*)PyMem_MALLOC(sizeof(SRE_BACKTRACK_CHUNK)); + if (state->backtrack_chunk == NULL) + goto error; + + state->backtrack_chunk->previous = NULL; + state->backtrack_chunk->count = 0; + + state->numbered_mark_count = 2 * pattern->groups; + state->named_mark_count = 2 * (pattern->internal_groups - pattern->groups); + state->lastmark = -1; state->lastindex = -1; + state->last_named_index = -1; ptr = getstring(string, &length, &charsize); if (!ptr) - return NULL; + goto error; /* adjust boundaries */ if (start < 0) @@ -1773,44 +3725,49 @@ state->start = (void*) ((char*) ptr + start * state->charsize); state->end = (void*) ((char*) ptr + end * state->charsize); + state->reject_zero_width = 0; + Py_INCREF(string); state->string = string; state->pos = start; state->endpos = end; - if (pattern->flags & SRE_FLAG_LOCALE) - state->lower = sre_lower_locale; - else if (pattern->flags & SRE_FLAG_UNICODE) -#if defined(HAVE_UNICODE) - state->lower = sre_lower_unicode; -#else - state->lower = sre_lower_locale; -#endif + if ((pattern->flags & SRE_FLAG_UNICODE) || state->charsize == 2) + 
state->encoding = &sre_unicode_encoding; + else if (pattern->flags & SRE_FLAG_LOCALE) + state->encoding = &locale_encoding; else - state->lower = sre_lower; + state->encoding = &ascii_encoding; + + state->reverse = pattern->flags & SRE_FLAG_REVERSE; return string; + +error: + PyMem_FREE(state->backtrack_chunk); + return NULL; } -LOCAL(void) -state_fini(SRE_STATE* state) -{ +LOCAL(void) state_fini(SRE_STATE* state) { + /* There are actually 2 versions of backtrack_chunk, 8-bit and Unicode. + This shouldn't be a problem because they have the same format + and contain pointers and an int, which are always the same size. */ + PyMem_FREE(state->backtrack_chunk); + state->backtrack_chunk = NULL; + Py_XDECREF(state->string); - data_stack_dealloc(state); } /* calculate offset from start of string */ -#define STATE_OFFSET(state, member)\ +#define STATE_OFFSET(state, member) \ (((char*)(member) - (char*)(state)->beginning) / (state)->charsize) -LOCAL(PyObject*) -state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty) -{ +LOCAL(PyObject*) state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty) { Py_ssize_t i, j; index = (index - 1) * 2; - if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) { + if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index + 1]) { if (empty) /* want empty string */ i = j = 0; @@ -1820,15 +3777,13 @@ } } else { i = STATE_OFFSET(state, state->mark[index]); - j = STATE_OFFSET(state, state->mark[index+1]); + j = STATE_OFFSET(state, state->mark[index + 1]); } return PySequence_GetSlice(string, i, j); } -static void -pattern_error(int status) -{ +static void pattern_error(int status) { switch (status) { case SRE_ERROR_RECURSION_LIMIT: PyErr_SetString( @@ -1851,23 +3806,19 @@ } } -static void -pattern_dealloc(PatternObject* self) -{ +static void pattern_dealloc(PatternObject* self) { if (self->weakreflist != NULL) - 
PyObject_ClearWeakRefs((PyObject *) self); + PyObject_ClearWeakRefs((PyObject*)self); Py_XDECREF(self->pattern); Py_XDECREF(self->groupindex); Py_XDECREF(self->indexgroup); PyObject_DEL(self); } -static PyObject* -pattern_match(PatternObject* self, PyObject* args, PyObject* kw) -{ +static PyObject* pattern_match(PatternObject* self, PyObject* args, PyObject* kw) { SRE_STATE state; int status; - + SRE_CODE* pattern_code; PyObject* string; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; @@ -1876,23 +3827,25 @@ &string, &start, &end)) return NULL; - string = state_init(&state, self, string, start, end); + pattern_code = PatternObject_GetCode(self); + + string = state_init(&state, self, string, start, end, pattern_code); if (!string) return NULL; - state.ptr = state.start; + state.ptr = state.reverse ? state.end : state.start; - TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr)); + TRACE(("|%p|%p|MATCH\n", pattern_code, state.ptr)); if (state.charsize == 1) { - status = sre_match(&state, PatternObject_GetCode(self)); + status = sre_bmatch(&state, state.pattern_code); } else { #if defined(HAVE_UNICODE) - status = sre_umatch(&state, PatternObject_GetCode(self)); + status = sre_umatch(&state, state.pattern_code); #endif } - TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); + TRACE(("|%p|%p|END\n", pattern_code, state.ptr)); if (PyErr_Occurred()) return NULL; @@ -1901,12 +3854,10 @@ return pattern_new_match(self, &state, status); } -static PyObject* -pattern_search(PatternObject* self, PyObject* args, PyObject* kw) -{ +static PyObject* pattern_search(PatternObject* self, PyObject* args, PyObject* kw) { SRE_STATE state; int status; - + SRE_CODE* pattern_code; PyObject* string; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; @@ -1915,17 +3866,19 @@ &string, &start, &end)) return NULL; - string = state_init(&state, self, string, start, end); + pattern_code = PatternObject_GetCode(self); + + string = state_init(&state, self, string, 
start, end, pattern_code); if (!string) return NULL; - TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr)); + TRACE(("|%p|%p|SEARCH\n", pattern_code, state.ptr)); if (state.charsize == 1) { - status = sre_search(&state, PatternObject_GetCode(self)); + status = sre_bsearch(&state, state.pattern_code); } else { #if defined(HAVE_UNICODE) - status = sre_usearch(&state, PatternObject_GetCode(self)); + status = sre_usearch(&state, state.pattern_code); #endif } @@ -1939,9 +3892,7 @@ return pattern_new_match(self, &state, status); } -static PyObject* -call(char* module, char* function, PyObject* args) -{ +static PyObject* call(char* module, char* function, PyObject* args) { PyObject* name; PyObject* mod; PyObject* func; @@ -1967,9 +3918,7 @@ } #ifdef USE_BUILTIN_COPY -static int -deepcopy(PyObject** object, PyObject* memo) -{ +static int deepcopy(PyObject** object, PyObject* memo) { PyObject* copy; copy = call( @@ -1986,9 +3935,7 @@ } #endif -static PyObject* -join_list(PyObject* list, PyObject* string) -{ +static PyObject* join_list(PyObject* list, PyObject* string) { /* join list elements */ PyObject* joiner; @@ -2034,14 +3981,12 @@ return result; } -static PyObject* -pattern_findall(PatternObject* self, PyObject* args, PyObject* kw) -{ +static PyObject* pattern_findall(PatternObject* self, PyObject* args, PyObject* kw) { SRE_STATE state; PyObject* list; int status; Py_ssize_t i, b, e; - + SRE_CODE* pattern_code; PyObject* string; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; @@ -2050,7 +3995,9 @@ &string, &start, &end)) return NULL; - string = state_init(&state, self, string, start, end); + pattern_code = PatternObject_GetCode(self); + + string = state_init(&state, self, string, start, end, pattern_code); if (!string) return NULL; @@ -2066,18 +4013,18 @@ state_reset(&state); - state.ptr = state.start; + state.ptr = state.reverse ? 
state.end : state.start; if (state.charsize == 1) { - status = sre_search(&state, PatternObject_GetCode(self)); + status = sre_bsearch(&state, state.pattern_code); } else { #if defined(HAVE_UNICODE) - status = sre_usearch(&state, PatternObject_GetCode(self)); + status = sre_usearch(&state, state.pattern_code); #endif } - if (PyErr_Occurred()) - goto error; + if (PyErr_Occurred()) + goto error; if (status <= 0) { if (status == 0) @@ -2089,8 +4036,13 @@ /* don't bother to build a match object */ switch (self->groups) { case 0: - b = STATE_OFFSET(&state, state.start); - e = STATE_OFFSET(&state, state.ptr); + if (state.reverse) { + b = STATE_OFFSET(&state, state.ptr); + e = STATE_OFFSET(&state, state.end); + } else { + b = STATE_OFFSET(&state, state.start); + e = STATE_OFFSET(&state, state.ptr); + } item = PySequence_GetSlice(string, b, e); if (!item) goto error; @@ -2105,7 +4057,7 @@ if (!item) goto error; for (i = 0; i < self->groups; i++) { - PyObject* o = state_getslice(&state, i+1, string, 1); + PyObject* o = state_getslice(&state, i + 1, string, 1); if (!o) { Py_DECREF(item); goto error; @@ -2120,11 +4072,12 @@ if (status < 0) goto error; - if (state.ptr == state.start) - state.start = (void*) ((char*) state.ptr + state.charsize); + // Continue search from where we left off, but reject an initial zero-width match. 
+ if (state.reverse) + state.end = state.ptr; else state.start = state.ptr; - + state.reject_zero_width = 1; } state_fini(&state); @@ -2138,9 +4091,7 @@ } #if PY_VERSION_HEX >= 0x02020000 -static PyObject* -pattern_finditer(PatternObject* pattern, PyObject* args) -{ +static PyObject* pattern_finditer(PatternObject* pattern, PyObject* args) { PyObject* scanner; PyObject* search; PyObject* iterator; @@ -2161,16 +4112,16 @@ } #endif -static PyObject* -pattern_split(PatternObject* self, PyObject* args, PyObject* kw) -{ +static PyObject* pattern_split(PatternObject* self, PyObject* args, PyObject* kw) { SRE_STATE state; PyObject* list; PyObject* item; int status; + SRE_CODE* pattern_code; Py_ssize_t n; Py_ssize_t i; void* last; + int zero_width; PyObject* string; Py_ssize_t maxsplit = 0; @@ -2179,7 +4130,9 @@ &string, &maxsplit)) return NULL; - string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX); + pattern_code = PatternObject_GetCode(self); + + string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX, pattern_code); if (!string) return NULL; @@ -2189,25 +4142,27 @@ return NULL; } + zero_width = self->flags & SRE_FLAG_ZEROWIDTH ? 1 : 0; + n = 0; - last = state.start; + last = state.reverse ? state.end : state.start; while (!maxsplit || n < maxsplit) { state_reset(&state); - state.ptr = state.start; + state.ptr = state.reverse ? 
state.end : state.start; if (state.charsize == 1) { - status = sre_search(&state, PatternObject_GetCode(self)); + status = sre_bsearch(&state, state.pattern_code); } else { #if defined(HAVE_UNICODE) - status = sre_usearch(&state, PatternObject_GetCode(self)); + status = sre_usearch(&state, state.pattern_code); #endif } - if (PyErr_Occurred()) - goto error; + if (PyErr_Occurred()) + goto error; if (status <= 0) { if (status == 0) @@ -2216,19 +4171,37 @@ goto error; } - if (state.start == state.ptr) { - if (last == state.end) - break; - /* skip one character */ - state.start = (void*) ((char*) state.ptr + state.charsize); - continue; - } + if (state.reverse) { + // Zero-width match? + if (state.ptr == state.end) { + // Are we permitted to split on zero-width? + if (!zero_width) { + state.end = (void*) ((char*) state.ptr - state.charsize); + continue; + } + } - /* get segment before this match */ - item = PySequence_GetSlice( - string, STATE_OFFSET(&state, last), - STATE_OFFSET(&state, state.start) - ); + /* get segment before this match */ + item = PySequence_GetSlice( + string, STATE_OFFSET(&state, state.end), + STATE_OFFSET(&state, last) + ); + } else { + // Zero-width match? + if (state.ptr == state.start) { + // Are we permitted to split on zero-width? 
+ if (!zero_width) { + state.start = (void*) ((char*) state.ptr + state.charsize); + continue; + } + } + + /* get segment before this match */ + item = PySequence_GetSlice( + string, STATE_OFFSET(&state, last), + STATE_OFFSET(&state, state.start) + ); + } if (!item) goto error; status = PyList_Append(list, item); @@ -2238,7 +4211,7 @@ /* add groups (if any) */ for (i = 0; i < self->groups; i++) { - item = state_getslice(&state, i+1, string, 0); + item = state_getslice(&state, i + 1, string, 0); if (!item) goto error; status = PyList_Append(list, item); @@ -2249,14 +4222,42 @@ n = n + 1; - last = state.start = state.ptr; + last = state.ptr; + // Continue search from where we left off, but reject an initial zero-width match. + if (state.reverse) { + if (zero_width) { + state.end = state.ptr; + state.reject_zero_width = 1; + } else { + if (state.ptr == state.end) + state.end = (void*) ((char*) state.ptr - state.charsize); + else + state.end = state.ptr; + } + } else { + if (zero_width) { + state.start = state.ptr; + state.reject_zero_width = 1; + } else { + if(state.ptr == state.start) + state.start = (void*) ((char*) state.ptr + state.charsize); + else + state.start = state.ptr; + } + } } /* get segment following last match (even if empty) */ - item = PySequence_GetSlice( - string, STATE_OFFSET(&state, last), state.endpos - ); + if (state.reverse) + item = PySequence_GetSlice( + string, state.pos, STATE_OFFSET(&state, last) + ); + else + item = PySequence_GetSlice( + string, STATE_OFFSET(&state, last), state.endpos + ); + if (!item) goto error; status = PyList_Append(list, item); @@ -2275,9 +4276,7 @@ } static PyObject* -pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string, - Py_ssize_t count, Py_ssize_t subn) -{ +pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string, Py_ssize_t count, Py_ssize_t subn) { SRE_STATE state; PyObject* list; PyObject* item; @@ -2287,9 +4286,11 @@ void* ptr; int status; Py_ssize_t n; - Py_ssize_t i, b, 
e; + Py_ssize_t b; int bint; int filter_is_callable; + SRE_CODE* pattern_code; + void* last; if (PyCallable_Check(ptemplate)) { /* sub/subn takes either a function or a template */ @@ -2303,10 +4304,10 @@ b = bint; if (ptr) { if (b == 1) { - literal = sre_literal_template((unsigned char *)ptr, n); + literal = sre_bliteral_template((unsigned char*)ptr, n); } else { #if defined(HAVE_UNICODE) - literal = sre_uliteral_template((Py_UNICODE *)ptr, n); + literal = sre_uliteral_template((Py_UNICODE*)ptr, n); #endif } } else { @@ -2329,7 +4330,9 @@ } } - string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX); + pattern_code = PatternObject_GetCode(self); + + string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX, pattern_code); if (!string) { Py_DECREF(filter); return NULL; @@ -2342,24 +4345,25 @@ return NULL; } - n = i = 0; + n = 0; + last = state.reverse ? state.end : state.start; while (!count || n < count) { state_reset(&state); - state.ptr = state.start; + state.ptr = state.reverse ? 
state.end : state.start; if (state.charsize == 1) { - status = sre_search(&state, PatternObject_GetCode(self)); + status = sre_bsearch(&state, state.pattern_code); } else { #if defined(HAVE_UNICODE) - status = sre_usearch(&state, PatternObject_GetCode(self)); + status = sre_usearch(&state, state.pattern_code); #endif } - if (PyErr_Occurred()) - goto error; + if (PyErr_Occurred()) + goto error; if (status <= 0) { if (status == 0) @@ -2368,22 +4372,24 @@ goto error; } - b = STATE_OFFSET(&state, state.start); - e = STATE_OFFSET(&state, state.ptr); - - if (i < b) { - /* get segment before this match */ - item = PySequence_GetSlice(string, i, b); - if (!item) - goto error; - status = PyList_Append(list, item); - Py_DECREF(item); - if (status < 0) - goto error; - - } else if (i == b && i == e && n > 0) - /* ignore empty match on latest position */ - goto next; + /* get segment before this match */ + if (state.reverse) { + item = PySequence_GetSlice( + string, STATE_OFFSET(&state, state.end), + STATE_OFFSET(&state, last) + ); + } else { + item = PySequence_GetSlice( + string, STATE_OFFSET(&state, last), + STATE_OFFSET(&state, state.start) + ); + } + if (!item) + goto error; + status = PyList_Append(list, item); + Py_DECREF(item); + if (status < 0) + goto error; if (filter_is_callable) { /* pass match object through filter */ @@ -2414,29 +4420,31 @@ goto error; } - i = e; n = n + 1; -next: /* move on */ - if (state.ptr == state.start) - state.start = (void*) ((char*) state.ptr + state.charsize); - else - state.start = state.ptr; - - } + last = state.ptr; - /* get segment following last match */ - if (i < state.endpos) { - item = PySequence_GetSlice(string, i, state.endpos); - if (!item) - goto error; - status = PyList_Append(list, item); - Py_DECREF(item); - if (status < 0) - goto error; + // Continue search from where we left off, but reject an initial zero-width match. 
+ if (state.reverse) + state.end = state.ptr; + else + state.start = state.ptr; + state.reject_zero_width = 1; } + /* get segment following last match */ + if (state.reverse) + item = PySequence_GetSlice(string, state.pos, STATE_OFFSET(&state, last)); + else + item = PySequence_GetSlice(string, STATE_OFFSET(&state, last), state.endpos); + if (!item) + goto error; + status = PyList_Append(list, item); + Py_DECREF(item); + if (status < 0) + goto error; + state_fini(&state); Py_DECREF(filter); @@ -2460,9 +4468,7 @@ } -static PyObject* -pattern_sub(PatternObject* self, PyObject* args, PyObject* kw) -{ +static PyObject* pattern_sub(PatternObject* self, PyObject* args, PyObject* kw) { PyObject* ptemplate; PyObject* string; Py_ssize_t count = 0; @@ -2474,9 +4480,7 @@ return pattern_subx(self, ptemplate, string, count, 0); } -static PyObject* -pattern_subn(PatternObject* self, PyObject* args, PyObject* kw) -{ +static PyObject* pattern_subn(PatternObject* self, PyObject* args, PyObject* kw) { PyObject* ptemplate; PyObject* string; Py_ssize_t count = 0; @@ -2488,9 +4492,7 @@ return pattern_subx(self, ptemplate, string, count, 1); } -static PyObject* -pattern_copy(PatternObject* self, PyObject *unused) -{ +static PyObject* pattern_copy(PatternObject* self, PyObject *unused) { #ifdef USE_BUILTIN_COPY PatternObject* copy; int offset; @@ -2516,9 +4518,7 @@ #endif } -static PyObject* -pattern_deepcopy(PatternObject* self, PyObject* memo) -{ +static PyObject* pattern_deepcopy(PatternObject* self, PyObject* memo) { #ifdef USE_BUILTIN_COPY PatternObject* copy; @@ -2577,20 +4577,20 @@ static PyMethodDef pattern_methods[] = { {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS, - pattern_match_doc}, + pattern_match_doc}, {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS, - pattern_search_doc}, + pattern_search_doc}, {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS, - pattern_sub_doc}, + pattern_sub_doc}, {"subn", (PyCFunction) pattern_subn, 
METH_VARARGS|METH_KEYWORDS, - pattern_subn_doc}, + pattern_subn_doc}, {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS, - pattern_split_doc}, + pattern_split_doc}, {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS, - pattern_findall_doc}, + pattern_findall_doc}, #if PY_VERSION_HEX >= 0x02020000 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS, - pattern_finditer_doc}, + pattern_finditer_doc}, #endif {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS}, {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS}, @@ -2598,9 +4598,7 @@ {NULL, NULL} }; -static PyObject* -pattern_getattr(PatternObject* self, char* name) -{ +static PyObject* pattern_getattr(PatternObject* self, char* name) { PyObject* res; res = Py_FindMethod(pattern_methods, (PyObject*) self, name); @@ -2636,33 +4634,31 @@ 0, "_" SRE_MODULE ".SRE_Pattern", sizeof(PatternObject), sizeof(SRE_CODE), (destructor)pattern_dealloc, /*tp_dealloc*/ - 0, /*tp_print*/ + 0, /*tp_print*/ (getattrfunc)pattern_getattr, /*tp_getattr*/ - 0, /* tp_setattr */ - 0, /* tp_compare */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - 0, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ - Py_TPFLAGS_HAVE_WEAKREFS, /* tp_flags */ - pattern_doc, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_HAVE_WEAKREFS, /* tp_flags */ + pattern_doc, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */ }; static int 
_validate(PatternObject *self); /* Forward */ -static PyObject * -_compile(PyObject* self_, PyObject* args) -{ +static PyObject* _compile(PyObject* self_, PyObject* args) { /* "compile" pattern descriptor to pattern object */ PatternObject* self; @@ -2763,454 +4759,404 @@ #define VTRACE(v) #endif -/* Report failure */ -#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0) +typedef struct SRE_Validation { + unsigned int numbered_mark_count; + unsigned int named_mark_count; + unsigned int min_numbered_mark; + unsigned int max_numbered_mark; + unsigned int min_named_mark; + unsigned int max_named_mark; + unsigned int group_ref_count; + unsigned int max_group_ref; +} SRE_Validation; + +/* + Validates a charset. Returns a pointer to the following op if valid + or NULL if invalid. +*/ +static SRE_CODE* validate_charset(SRE_CODE* charset) { + int hi_byte = charset[0] / 256; + int max_index = 0; + int index; + SRE_CODE* end_charset; + for (index = 0; index <= hi_byte; index ++) { + int i = (charset[1 + index / 2] >> ((index % 2) * 16)) & 0xFFFF; + if (i > max_index + 1) + return NULL; + if (i > max_index) + max_index = i; + } + end_charset = charset + 1 + hi_byte / 2 + 1 + (max_index + 1) * (256 / SRE_BITS_PER_CODE); + return end_charset; +} -/* Extract opcode, argument, or skip count from code array */ -#define GET_OP \ - do { \ - VTRACE(("%p: ", code)); \ - if (code >= end) FAIL; \ - op = *code++; \ - VTRACE(("%lu (op)\n", (unsigned long)op)); \ - } while (0) -#define GET_ARG \ - do { \ - VTRACE(("%p= ", code)); \ - if (code >= end) FAIL; \ - arg = *code++; \ - VTRACE(("%lu (arg)\n", (unsigned long)arg)); \ - } while (0) -#define GET_SKIP_ADJ(adj) \ - do { \ - VTRACE(("%p= ", code)); \ - if (code >= end) FAIL; \ - skip = *code; \ - VTRACE(("%lu (skip to %p)\n", \ - (unsigned long)skip, code+skip)); \ - if (code+skip-adj < code || code+skip-adj > end)\ - FAIL; \ - code++; \ - } while (0) -#define GET_SKIP GET_SKIP_ADJ(0) +/* + Validates a set. 
Returns a pointer to the following op if valid + or NULL if invalid. +*/ +static SRE_CODE* validate_set(SRE_CODE* pattern, SRE_CODE* end_ptr) { + SRE_OpInfo* info_ptr; + SRE_CODE* charset_end = pattern + pattern[0]; + if (pattern[0] < 1 || charset_end > end_ptr) + return NULL; -static int -_validate_charset(SRE_CODE *code, SRE_CODE *end) -{ - /* Some variables are manipulated by the macros above */ - SRE_CODE op; - SRE_CODE arg; - SRE_CODE offset; - int i; - - while (code < end) { - GET_OP; - switch (op) { + pattern++; - case SRE_OP_NEGATE: + do { + if (pattern[0] > SRE_MAX_OP) + return NULL; + DEBUG_TRACE(("op is %u\n", pattern[0])); + info_ptr = &op_info[pattern[0]]; + switch (info_ptr->type) { + case SRE_TYPE_CATEGORY: // category + pattern += 2; break; - - case SRE_OP_LITERAL: - GET_ARG; + case SRE_TYPE_CHARSET: // skip charset + { + SRE_CODE* end_charset = pattern + 1 + pattern[1]; + if (end_charset > end_ptr) + return NULL; + pattern = validate_charset(pattern + 2); + if (pattern != end_charset) + return NULL; break; - - case SRE_OP_RANGE: - GET_ARG; - GET_ARG; + } + case SRE_TYPE_LITERAL: // code + pattern += 2; break; - - case SRE_OP_CHARSET: - offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */ - if (code+offset < code || code+offset > end) - FAIL; - code += offset; - break; - - case SRE_OP_BIGCHARSET: - GET_ARG; /* Number of blocks */ - offset = 256/sizeof(SRE_CODE); /* 256-byte table */ - if (code+offset < code || code+offset > end) - FAIL; - /* Make sure that each byte points to a valid block */ - for (i = 0; i < 256; i++) { - if (((unsigned char *)code)[i] >= arg) - FAIL; - } - code += offset; - offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */ - if (code+offset < code || code+offset > end) - FAIL; - code += offset; + case SRE_TYPE_RANGE: // min max + if (pattern[1] > pattern[2]) + return NULL; + pattern += 3; break; + default: + return NULL; + } + } while (pattern < charset_end); - case SRE_OP_CATEGORY: - GET_ARG; - switch (arg) { - 
case SRE_CATEGORY_DIGIT: - case SRE_CATEGORY_NOT_DIGIT: - case SRE_CATEGORY_SPACE: - case SRE_CATEGORY_NOT_SPACE: - case SRE_CATEGORY_WORD: - case SRE_CATEGORY_NOT_WORD: - case SRE_CATEGORY_LINEBREAK: - case SRE_CATEGORY_NOT_LINEBREAK: - case SRE_CATEGORY_LOC_WORD: - case SRE_CATEGORY_LOC_NOT_WORD: - case SRE_CATEGORY_UNI_DIGIT: - case SRE_CATEGORY_UNI_NOT_DIGIT: - case SRE_CATEGORY_UNI_SPACE: - case SRE_CATEGORY_UNI_NOT_SPACE: - case SRE_CATEGORY_UNI_WORD: - case SRE_CATEGORY_UNI_NOT_WORD: - case SRE_CATEGORY_UNI_LINEBREAK: - case SRE_CATEGORY_UNI_NOT_LINEBREAK: - break; - default: - FAIL; - } - break; + return pattern > charset_end ? NULL : pattern; +} - default: - FAIL; +/* + Validates a single-character op. Returns a pointer to the following op if valid + or NULL if invalid. +*/ +static SRE_CODE* validate_one_pattern(SRE_CODE* pattern, SRE_CODE* end_ptr, int* direction) { + SRE_OpInfo* info_ptr; - } + if (pattern[0] > SRE_MAX_OP) + return NULL; + + info_ptr = &op_info[pattern[0]]; + DEBUG_TRACE(("op is %u\n", pattern[0])); + if (*direction != 0 && *direction != info_ptr->direction) + return NULL; + + switch (info_ptr->type) { + case SRE_TYPE_CATEGORY: // category + pattern += 2; + break; + case SRE_TYPE_CHARSET: // skip charset + { + SRE_CODE* end_charset = pattern + 1 + pattern[1]; + if (end_charset > end_ptr) + return NULL; + pattern = validate_charset(pattern + 2); + if (pattern != end_charset) + return NULL; + break; + } + case SRE_TYPE_LITERAL: // code + pattern += 2; + break; + case SRE_TYPE_RANGE: // min max + if (pattern[1] > pattern[2]) + return NULL; + pattern += 3; + break; + case SRE_TYPE_SET: // set + pattern = validate_set(pattern + 1, end_ptr); + if (pattern == NULL) + return NULL; + break; + case SRE_TYPE_SIMPLE_CATEGORY: // + pattern++; + break; + default: + return NULL; } - return 1; + if (pattern > end_ptr) + return NULL; + + *direction = info_ptr->direction; + + return pattern; } -static int -_validate_inner(SRE_CODE *code, SRE_CODE *end, 
Py_ssize_t groups) -{ - /* Some variables are manipulated by the macros above */ - SRE_CODE op; - SRE_CODE arg; - SRE_CODE skip; - - VTRACE(("code=%p, end=%p\n", code, end)); - - if (code > end) - FAIL; - - while (code < end) { - GET_OP; - switch (op) { +static SRE_CODE* validate_subpattern(SRE_CODE* pattern, SRE_CODE* end_ptr, int* direction, SRE_Validation* validation) { + int dir = *direction; - case SRE_OP_MARK: - /* We don't check whether marks are properly nested; the - sre_match() code is robust even if they don't, and the worst - you can get is nonsensical match results. */ - GET_ARG; - if (arg > 2*groups+1) { - VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups)); - FAIL; - } - break; + while (pattern < end_ptr) { + SRE_OpInfo* info_ptr; - case SRE_OP_LITERAL: - case SRE_OP_NOT_LITERAL: - case SRE_OP_LITERAL_IGNORE: - case SRE_OP_NOT_LITERAL_IGNORE: - GET_ARG; - /* The arg is just a character, nothing to check */ - break; + DEBUG_TRACE(("op is %u\n", pattern[0])); + if (pattern[0] > SRE_MAX_OP) + return NULL; - case SRE_OP_SUCCESS: - case SRE_OP_FAILURE: - /* Nothing to check; these normally end the matching process */ - break; + info_ptr = &op_info[pattern[0]]; + if (dir != 0 && info_ptr->direction != 0 && dir != info_ptr->direction) + return NULL; - case SRE_OP_AT: - GET_ARG; - switch (arg) { - case SRE_AT_BEGINNING: - case SRE_AT_BEGINNING_STRING: - case SRE_AT_BEGINNING_LINE: - case SRE_AT_END: - case SRE_AT_END_LINE: - case SRE_AT_END_STRING: - case SRE_AT_BOUNDARY: - case SRE_AT_NON_BOUNDARY: - case SRE_AT_LOC_BOUNDARY: - case SRE_AT_LOC_NON_BOUNDARY: - case SRE_AT_UNI_BOUNDARY: - case SRE_AT_UNI_NON_BOUNDARY: - break; - default: - FAIL; - } + switch (info_ptr->type) { + case SRE_TYPE_ASSERT: // ... 
+ { + SRE_CODE* tail_ptr = pattern + 1 + pattern[1]; + int subdir = 0; + if (pattern[1] < 2 || tail_ptr > end_ptr || tail_ptr[-1] != info_ptr->end_marker) + return NULL; + if (validate_subpattern(pattern + 2, tail_ptr - 1, &subdir, validation) != tail_ptr - 1) + return NULL; + pattern = tail_ptr; break; - - case SRE_OP_ANY: - case SRE_OP_ANY_ALL: - /* These have no operands */ + } + case SRE_TYPE_ATOMIC: // ... + { + // The call should return a pointer to the END_ATOMIC, which it doesn't understand. + SRE_CODE* ptr = validate_subpattern(pattern + 1, end_ptr, &dir, validation); + if (ptr == NULL || ptr >= end_ptr || ptr[0] != info_ptr->end_marker) + return NULL; + pattern = ptr + 1; break; - - case SRE_OP_IN: - case SRE_OP_IN_IGNORE: - GET_SKIP; - /* Stop 1 before the end; we check the FAILURE below */ - if (!_validate_charset(code, code+skip-2)) - FAIL; - if (code[skip-2] != SRE_OP_FAILURE) - FAIL; - code += skip-1; - break; - - case SRE_OP_INFO: - { - /* A minimal info field is - <1=skip> <2=flags> <3=min> <4=max>; - If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags, - more follows. 
*/ - SRE_CODE flags, min, max, i; - SRE_CODE *newcode; - GET_SKIP; - newcode = code+skip-1; - GET_ARG; flags = arg; - GET_ARG; min = arg; - GET_ARG; max = arg; - /* Check that only valid flags are present */ - if ((flags & ~(SRE_INFO_PREFIX | - SRE_INFO_LITERAL | - SRE_INFO_CHARSET)) != 0) - FAIL; - /* PREFIX and CHARSET are mutually exclusive */ - if ((flags & SRE_INFO_PREFIX) && - (flags & SRE_INFO_CHARSET)) - FAIL; - /* LITERAL implies PREFIX */ - if ((flags & SRE_INFO_LITERAL) && - !(flags & SRE_INFO_PREFIX)) - FAIL; - /* Validate the prefix */ - if (flags & SRE_INFO_PREFIX) { - SRE_CODE prefix_len, prefix_skip; - GET_ARG; prefix_len = arg; - GET_ARG; prefix_skip = arg; - /* Here comes the prefix string */ - if (code+prefix_len < code || code+prefix_len > newcode) - FAIL; - code += prefix_len; - /* And here comes the overlap table */ - if (code+prefix_len < code || code+prefix_len > newcode) - FAIL; - /* Each overlap value should be < prefix_len */ - for (i = 0; i < prefix_len; i++) { - if (code[i] >= prefix_len) - FAIL; - } - code += prefix_len; - } - /* Validate the charset */ - if (flags & SRE_INFO_CHARSET) { - if (!_validate_charset(code, newcode-1)) - FAIL; - if (newcode[-1] != SRE_OP_FAILURE) - FAIL; - code = newcode; - } - else if (code != newcode) { - VTRACE(("code=%p, newcode=%p\n", code, newcode)); - FAIL; - } - } + } + case SRE_TYPE_BRANCH: // ... ... 0 + { + // All the jumps should end in the same place. + SRE_CODE* skip_end_ptr = NULL; + pattern++; + do { + SRE_CODE* next_ptr = pattern + pattern[0]; + SRE_CODE* ptr; + // The offset to the next alternative's offset. + if (pattern[0] < 3 || next_ptr >= end_ptr) + return NULL; + // Validate this alternative, which stops at the jump. + ptr = validate_subpattern(pattern + 1, next_ptr - 2, &dir, validation); + if (ptr != next_ptr - 2 || ptr[0] != SRE_OP_JUMP || ptr[1] < 1) + return NULL; + // The jump to the end. 
+ ptr += 1 + ptr[1]; + if (skip_end_ptr == NULL) + skip_end_ptr = ptr; + else if (ptr != skip_end_ptr) + return NULL; + pattern = next_ptr; + } while (pattern[0] != 0); + pattern++; break; - - case SRE_OP_BRANCH: - { - SRE_CODE *target = NULL; - for (;;) { - GET_SKIP; - if (skip == 0) - break; - /* Stop 2 before the end; we check the JUMP below */ - if (!_validate_inner(code, code+skip-3, groups)) - FAIL; - code += skip-3; - /* Check that it ends with a JUMP, and that each JUMP - has the same target */ - GET_OP; - if (op != SRE_OP_JUMP) - FAIL; - GET_SKIP; - if (target == NULL) - target = code+skip-1; - else if (code+skip-1 != target) - FAIL; - } - } + } + case SRE_TYPE_CATEGORY: // category + pattern += 2; break; - - case SRE_OP_REPEAT_ONE: - case SRE_OP_MIN_REPEAT_ONE: - { - SRE_CODE min, max; - GET_SKIP; - GET_ARG; min = arg; - GET_ARG; max = arg; - if (min > max) - FAIL; -#ifdef Py_UNICODE_WIDE - if (max > 65535) - FAIL; -#endif - if (!_validate_inner(code, code+skip-4, groups)) - FAIL; - code += skip-4; - GET_OP; - if (op != SRE_OP_SUCCESS) - FAIL; - } + case SRE_TYPE_CHARSET: // skip charset + { + SRE_CODE* end_charset = pattern + 1 + pattern[1]; + if (end_charset > end_ptr) + return NULL; + pattern = validate_charset(pattern + 2); + if (pattern != end_charset) + return NULL; break; - - case SRE_OP_REPEAT: - { - SRE_CODE min, max; - GET_SKIP; - GET_ARG; min = arg; - GET_ARG; max = arg; - if (min > max) - FAIL; -#ifdef Py_UNICODE_WIDE - if (max > 65535) - FAIL; -#endif - if (!_validate_inner(code, code+skip-3, groups)) - FAIL; - code += skip-3; - GET_OP; - if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL) - FAIL; - } + } + case SRE_TYPE_GROUPREF: // group_id + DEBUG_TRACE(("GROUPREF %u\n", pattern[1])); + validation->group_ref_count++; + validation->max_group_ref = sre_max(validation->max_group_ref, pattern[1]); + pattern += 2; + DEBUG_TRACE(("group_ref_count is %u\n", validation->group_ref_count)); + DEBUG_TRACE(("max_group_ref is %u\n", 
validation->max_group_ref)); break; - - case SRE_OP_GROUPREF: - case SRE_OP_GROUPREF_IGNORE: - GET_ARG; - if (arg >= groups) - FAIL; + case SRE_TYPE_GROUPREF_EXISTS: // group_id code_yes code_no + { + SRE_CODE* skip_ptr = pattern + 1 + pattern[2]; + SRE_CODE* ptr; + // Locate code_no. + if (pattern[2] < 2 || skip_ptr > end_ptr) + return NULL; + // code_yes lies between the skip and code_no. + ptr = validate_subpattern(pattern + 3, skip_ptr, &dir, validation); + // 'ptr' will point after code_yes and at the jump, if present. + // (The jump will have been rejected by the call.) + // Validate code_no, if present. + if (ptr == skip_ptr - 2) { + if (ptr[0] != SRE_OP_JUMP || ptr[1] < 1) + return NULL; + skip_ptr = ptr + 1 + ptr[1]; + if (skip_ptr > end_ptr) + return NULL; + ptr = validate_subpattern(ptr + 2, skip_ptr, &dir, validation); + if (ptr < skip_ptr) + return NULL; + } else if (ptr != skip_ptr) + return NULL; + validation->group_ref_count++; + validation->max_group_ref = sre_max(validation->max_group_ref, pattern[1]); + pattern = skip_ptr; break; - - case SRE_OP_GROUPREF_EXISTS: - /* The regex syntax for this is: '(?(group)then|else)', where - 'group' is either an integer group number or a group name, - 'then' and 'else' are sub-regexes, and 'else' is optional. */ - GET_ARG; - if (arg >= groups) - FAIL; - GET_SKIP_ADJ(1); - code--; /* The skip is relative to the first arg! */ - /* There are two possibilities here: if there is both a 'then' - part and an 'else' part, the generated code looks like: - - GROUPREF_EXISTS - - - ...then part... - JUMP - - ( jumps here) - ...else part... - ( jumps here) - - If there is only a 'then' part, it looks like: - - GROUPREF_EXISTS - - - ...then part... - ( jumps here) - - There is no direct way to decide which it is, and we don't want - to allow arbitrary jumps anywhere in the code; so we just look - for a JUMP opcode preceding our skip target. 
- */ - if (skip >= 3 && code+skip-3 >= code && - code[skip-3] == SRE_OP_JUMP) - { - VTRACE(("both then and else parts present\n")); - if (!_validate_inner(code+1, code+skip-3, groups)) - FAIL; - code += skip-2; /* Position after JUMP, at */ - GET_SKIP; - if (!_validate_inner(code, code+skip-1, groups)) - FAIL; - code += skip-1; - } - else { - VTRACE(("only a then part present\n")); - if (!_validate_inner(code+1, code+skip-1, groups)) - FAIL; - code += skip-1; + } + case SRE_TYPE_LITERAL: // code + pattern += 2; + break; + case SRE_TYPE_LITERAL_STRING: // length ... + if (pattern[1] == 0) + return NULL; + pattern += 2 + pattern[1]; + break; + case SRE_TYPE_MARK: // + // All named marks are also numbered. + // The named marks all have higher ids than the numbered ones. + DEBUG_TRACE(("mark %u %u at 0x%p\n", pattern[1], pattern[2], pattern)); + if (pattern[1] > pattern[2]) + return NULL; + validation->numbered_mark_count++; + validation->min_numbered_mark = sre_min(validation->min_numbered_mark, pattern[1]); + validation->max_numbered_mark = sre_max(validation->max_numbered_mark, pattern[1]); + if (pattern[2] > pattern[1]) { + validation->named_mark_count++; + validation->min_named_mark = sre_min(validation->min_named_mark, pattern[2]); + validation->max_named_mark = sre_max(validation->max_named_mark, pattern[2]); } + pattern += 3; + DEBUG_TRACE(("numbered_mark_count is %u\n", validation->numbered_mark_count)); + DEBUG_TRACE(("min_numbered_mark is %u\n", validation->min_numbered_mark)); + DEBUG_TRACE(("max_numbered_mark is %u\n", validation->max_numbered_mark)); + DEBUG_TRACE(("named_mark_count is %u\n", validation->named_mark_count)); + DEBUG_TRACE(("min_named_mark is %u\n", validation->min_named_mark)); + DEBUG_TRACE(("max_named_mark is %u\n", validation->max_named_mark)); break; - - case SRE_OP_ASSERT: - case SRE_OP_ASSERT_NOT: - GET_SKIP; - GET_ARG; /* 0 for lookahead, width for lookbehind */ - code--; /* Back up over arg to simplify math below */ - if (arg & 
0x80000000) - FAIL; /* Width too large */ - /* Stop 1 before the end; we check the SUCCESS below */ - if (!_validate_inner(code+1, code+skip-2, groups)) - FAIL; - code += skip-2; - GET_OP; - if (op != SRE_OP_SUCCESS) - FAIL; + case SRE_TYPE_POSITION: // + pattern++; + break; + case SRE_TYPE_RANGE: // min max + if (pattern[1] > pattern[2]) + return NULL; + pattern += 3; + break; + case SRE_TYPE_REPEAT: // ... + { + SRE_CODE* skip_end_ptr; + if (pattern[1] < 4 || pattern[2] > pattern[3]) + return NULL; + skip_end_ptr = pattern + pattern[1]; + if (skip_end_ptr + 2 > end_ptr || skip_end_ptr[0] != info_ptr->end_marker || skip_end_ptr[1] != pattern[1]) + return NULL; + if (validate_subpattern(pattern + 4, skip_end_ptr, &dir, validation) != skip_end_ptr) + return NULL; + pattern = skip_end_ptr + 2; + break; + } + case SRE_TYPE_REPEAT_ONE: // ... + { + SRE_CODE* tail_ptr; + DEBUG_TRACE(("skip is %u, min is %u, max is %u\n", pattern[1], pattern[2], pattern[3])); + if (pattern[1] < 4 || pattern[2] > pattern[3]) + return NULL; + tail_ptr = pattern + 1 + pattern[1]; + if (tail_ptr > end_ptr) + return NULL; + if (validate_one_pattern(pattern + 4, tail_ptr, &dir) != tail_ptr) + return NULL; + pattern = tail_ptr; + break; + } + case SRE_TYPE_SET: // set + pattern = validate_set(pattern + 1, end_ptr); + if (pattern == NULL) + return NULL; + break; + case SRE_TYPE_SIMPLE_CATEGORY: // + pattern++; break; - default: - FAIL; - + // Anything else might be meaningful to the caller. + *direction = dir; + return pattern; } + + if (info_ptr->direction != 0) + dir = info_ptr->direction; } - VTRACE(("okay\n")); - return 1; -} + *direction = dir; -static int -_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) -{ - if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS) - FAIL; - if (groups == 0) /* fix for simplejson */ - groups = 100; /* 100 groups should always be safe */ - return _validate_inner(code, end-1, groups); + return pattern > end_ptr ? 
NULL : pattern; } -static int -_validate(PatternObject *self) -{ - if (!_validate_outer(self->code, self->code+self->codesize, self->groups)) - { - PyErr_SetString(PyExc_RuntimeError, "invalid SRE code"); - return 0; - } - else - VTRACE(("Success!\n")); +static int _validate(PatternObject* self) { + SRE_Validation validation; + int direction = 0; + SRE_CODE* end_ptr = self->code + self->codesize; + + validation.numbered_mark_count = 0; + validation.min_numbered_mark = ~(unsigned int)0; + validation.max_numbered_mark = 0; + validation.named_mark_count = 0; + validation.min_named_mark = ~(unsigned int)0; + validation.max_named_mark = 0; + validation.group_ref_count = 0; + validation.max_group_ref = 0; + + /* _validate_subpattern will return a pointer to the first op it doesn't understand + or NULL if the pattern is invalid. It doesn't understand SRE_OP_SUCCESS (which + occurs only at the end of the pattern), so the result should be a pointer to that. */ + if (self->codesize < 1 || end_ptr[-1] != SRE_OP_SUCCESS) + goto error; + if (validate_subpattern(self->code, end_ptr, &direction, &validation) != end_ptr - 1) + goto error; + + // There should be an even number of marks (start and end of a group). + if (validation.numbered_mark_count % 2 != 0 || validation.named_mark_count % 2 != 0) + goto error; + // The numbered marks should be in the range 0 .. numbered_mark_count - 1. + // (We're not checking for duplicates.) + if (validation.numbered_mark_count > 0 && (validation.min_numbered_mark != 0 || + validation.min_numbered_mark + validation.numbered_mark_count - 1 != validation.max_numbered_mark)) + goto error; + // All the named marks should be in the range numbered_mark_count .. numbered_mark_count + named_mark_count - 1. + // (We're not checking for duplicates.) + // We can guarantee that named_mark_count <= numbered_mark_count. 
+ if (validation.named_mark_count > 0 && (validation.min_named_mark != validation.numbered_mark_count || + validation.min_named_mark + validation.named_mark_count - 1 != validation.max_named_mark)) + goto error; + // All the group refs should be in the range 0 .. numbered_mark_count + named_mark_count - 1. + if (validation.group_ref_count > 0 && validation.max_group_ref * 2 >= validation.numbered_mark_count + validation.named_mark_count) + goto error; + + self->groups = validation.numbered_mark_count / 2; + self->internal_groups = (validation.numbered_mark_count + validation.named_mark_count) / 2; + + VTRACE(("Success!\n")); return 1; + +error: + PyErr_SetString(PyExc_RuntimeError, "invalid SRE code"); + return 0; } /* -------------------------------------------------------------------- */ /* match methods */ -static void -match_dealloc(MatchObject* self) -{ +static void match_dealloc(MatchObject* self) { Py_XDECREF(self->regs); Py_XDECREF(self->string); Py_DECREF(self->pattern); PyObject_DEL(self); } -static PyObject* -match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def) -{ - if (index < 0 || index >= self->groups) { +static PyObject* match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def, BOOL include_internal) { + int groups = include_internal ? 
self->internal_groups : self->groups; + DEBUG_TRACE(("match_getslice_by_index: include_internal is %d, internal_groups is %d, groups is %d, index is %d\n", include_internal, self->internal_groups, self->groups, index)); + if (index < 0 || index >= groups) { /* raise IndexError if we were given a bad group number */ PyErr_SetString( PyExc_IndexError, @@ -3228,17 +5174,19 @@ } return PySequence_GetSlice( - self->string, self->mark[index], self->mark[index+1] + self->string, self->mark[index], self->mark[index + 1] ); } -static Py_ssize_t -match_getindex(MatchObject* self, PyObject* index) -{ +static Py_ssize_t match_getindex(MatchObject* self, PyObject* index, BOOL include_internal) { Py_ssize_t i; if (PyInt_Check(index)) - return PyInt_AsSsize_t(index); + { + Py_ssize_t groups = include_internal ? self->internal_groups : self->groups; + i = PyInt_AsSsize_t(index); + return i > groups ? -1 : i; + } i = -1; @@ -3255,15 +5203,11 @@ return i; } -static PyObject* -match_getslice(MatchObject* self, PyObject* index, PyObject* def) -{ - return match_getslice_by_index(self, match_getindex(self, index), def); +static PyObject* match_getslice(MatchObject* self, PyObject* index, PyObject* def, BOOL include_internal) { + return match_getslice_by_index(self, match_getindex(self, index, include_internal), def, TRUE); } -static PyObject* -match_expand(MatchObject* self, PyObject* ptemplate) -{ +static PyObject* match_expand(MatchObject* self, PyObject* ptemplate) { /* delegate to Python code */ return call( SRE_PY_MODULE, "_expand", @@ -3271,9 +5215,7 @@ ); } -static PyObject* -match_group(MatchObject* self, PyObject* args) -{ +static PyObject* sre_get_match_group(MatchObject* self, PyObject* args, BOOL include_internal) { PyObject* result; Py_ssize_t i, size; @@ -3281,10 +5223,10 @@ switch (size) { case 0: - result = match_getslice(self, Py_False, Py_None); + result = match_getslice(self, Py_False, Py_None, include_internal); break; case 1: - result = match_getslice(self, 
PyTuple_GET_ITEM(args, 0), Py_None); + result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None, include_internal); break; default: /* fetch multiple items */ @@ -3293,7 +5235,7 @@ return NULL; for (i = 0; i < size; i++) { PyObject* item = match_getslice( - self, PyTuple_GET_ITEM(args, i), Py_None + self, PyTuple_GET_ITEM(args, i), Py_None, include_internal ); if (!item) { Py_DECREF(result); @@ -3306,9 +5248,22 @@ return result; } -static PyObject* -match_groups(MatchObject* self, PyObject* args, PyObject* kw) -{ +static PyObject* match_group(MatchObject* self, PyObject* args) { + return sre_get_match_group(self, args, FALSE); +} + +static PyObject* match_internal_group(MatchObject* self, PyObject* args) { + return sre_get_match_group(self, args, TRUE); +} + +static PyObject* match_subscript(MatchObject* self, register PyObject* group) { + if (PyTuple_GET_SIZE(group) != 1) + return NULL; + + return match_getslice(self, PyTuple_GET_ITEM(group, 0), Py_None, FALSE); +} + +static PyObject* match_groups(MatchObject* self, PyObject* args, PyObject* kw) { PyObject* result; Py_ssize_t index; @@ -3317,26 +5272,24 @@ if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def)) return NULL; - result = PyTuple_New(self->groups-1); + result = PyTuple_New(self->groups - 1); if (!result) return NULL; for (index = 1; index < self->groups; index++) { PyObject* item; - item = match_getslice_by_index(self, index, def); + item = match_getslice_by_index(self, index, def, FALSE); if (!item) { Py_DECREF(result); return NULL; } - PyTuple_SET_ITEM(result, index-1, item); + PyTuple_SET_ITEM(result, index - 1, item); } return result; } -static PyObject* -match_groupdict(MatchObject* self, PyObject* args, PyObject* kw) -{ +static PyObject* match_groupdict(MatchObject* self, PyObject* args, PyObject* kw) { PyObject* result; PyObject* keys; Py_ssize_t index; @@ -3361,7 +5314,7 @@ key = PyList_GET_ITEM(keys, index); if (!key) goto failed; - value = match_getslice(self, key, 
def); + value = match_getslice(self, key, def, FALSE); if (!value) { Py_DECREF(key); goto failed; @@ -3382,18 +5335,16 @@ return NULL; } -static PyObject* -match_start(MatchObject* self, PyObject* args) -{ +static PyObject* match_start(MatchObject* self, PyObject* args) { Py_ssize_t index; PyObject* index_ = Py_False; /* zero */ if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_)) return NULL; - index = match_getindex(self, index_); + index = match_getindex(self, index_, FALSE); - if (index < 0 || index >= self->groups) { + if (index < 0 || index >= self->internal_groups) { PyErr_SetString( PyExc_IndexError, "no such group" @@ -3402,21 +5353,19 @@ } /* mark is -1 if group is undefined */ - return Py_BuildValue("i", self->mark[index*2]); + return Py_BuildValue("i", self->mark[index * 2]); } -static PyObject* -match_end(MatchObject* self, PyObject* args) -{ +static PyObject* match_end(MatchObject* self, PyObject* args) { Py_ssize_t index; PyObject* index_ = Py_False; /* zero */ if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_)) return NULL; - index = match_getindex(self, index_); + index = match_getindex(self, index_, FALSE); - if (index < 0 || index >= self->groups) { + if (index < 0 || index >= self->internal_groups) { PyErr_SetString( PyExc_IndexError, "no such group" @@ -3425,12 +5374,10 @@ } /* mark is -1 if group is undefined */ - return Py_BuildValue("i", self->mark[index*2+1]); + return Py_BuildValue("i", self->mark[index * 2 + 1]); } -LOCAL(PyObject*) -_pair(Py_ssize_t i1, Py_ssize_t i2) -{ +LOCAL(PyObject*) _pair(Py_ssize_t i1, Py_ssize_t i2) { PyObject* pair; PyObject* item; @@ -3450,23 +5397,21 @@ return pair; - error: +error: Py_DECREF(pair); return NULL; } -static PyObject* -match_span(MatchObject* self, PyObject* args) -{ +static PyObject* match_span(MatchObject* self, PyObject* args) { Py_ssize_t index; PyObject* index_ = Py_False; /* zero */ if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_)) return NULL; - index = match_getindex(self, index_); + 
index = match_getindex(self, index_, FALSE); - if (index < 0 || index >= self->groups) { + if (index < 0 || index >= self->internal_groups) { PyErr_SetString( PyExc_IndexError, "no such group" @@ -3475,12 +5420,10 @@ } /* marks are -1 if group is undefined */ - return _pair(self->mark[index*2], self->mark[index*2+1]); + return _pair(self->mark[index * 2], self->mark[index * 2 + 1]); } -static PyObject* -match_regs(MatchObject* self) -{ +static PyObject* match_regs(MatchObject* self) { PyObject* regs; PyObject* item; Py_ssize_t index; @@ -3490,7 +5433,7 @@ return NULL; for (index = 0; index < self->groups; index++) { - item = _pair(self->mark[index*2], self->mark[index*2+1]); + item = _pair(self->mark[index * 2], self->mark[index * 2 + 1]); if (!item) { Py_DECREF(regs); return NULL; @@ -3504,14 +5447,12 @@ return regs; } -static PyObject* -match_copy(MatchObject* self, PyObject *unused) -{ +static PyObject* match_copy(MatchObject* self, PyObject* unused) { #ifdef USE_BUILTIN_COPY MatchObject* copy; Py_ssize_t slots, offset; - slots = 2 * (self->pattern->groups+1); + slots = 2 * (self->pattern->groups + 1); copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots); if (!copy) @@ -3525,23 +5466,21 @@ Py_XINCREF(self->string); Py_XINCREF(self->regs); - memcpy((char*) copy + offset, (char*) self + offset, + memcpy((char*)copy + offset, (char*)self + offset, sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset); - return (PyObject*) copy; + return (PyObject*)copy; #else PyErr_SetString(PyExc_TypeError, "cannot copy this match object"); return NULL; #endif } -static PyObject* -match_deepcopy(MatchObject* self, PyObject* memo) -{ +static PyObject* match_deepcopy(MatchObject* self, PyObject* memo) { #ifdef USE_BUILTIN_COPY MatchObject* copy; - copy = (MatchObject*) match_copy(self); + copy = (MatchObject*)match_copy(self); if (!copy) return NULL; @@ -3568,15 +5507,15 @@ {"expand", (PyCFunction) match_expand, METH_O}, {"__copy__", (PyCFunction) match_copy, METH_NOARGS}, 
{"__deepcopy__", (PyCFunction) match_deepcopy, METH_O}, + {"__getitem__", (PyCFunction) match_subscript, METH_O|METH_COEXIST}, + {"_internal_group", (PyCFunction) match_internal_group, METH_VARARGS}, {NULL, NULL} }; -static PyObject* -match_getattr(MatchObject* self, char* name) -{ +static PyObject* match_getattr(MatchObject* self, char* name) { PyObject* res; - res = Py_FindMethod(match_methods, (PyObject*) self, name); + res = Py_FindMethod(match_methods, (PyObject*)self, name); if (res) return res; @@ -3590,9 +5529,9 @@ } if (!strcmp(name, "lastgroup")) { - if (self->pattern->indexgroup && self->lastindex >= 0) { + if (self->pattern->indexgroup && self->last_named_index >= 0) { PyObject* result = PySequence_GetItem( - self->pattern->indexgroup, self->lastindex + self->pattern->indexgroup, self->last_named_index ); if (result) return result; @@ -3642,27 +5581,43 @@ PyObject_HEAD_INIT(NULL) 0, "_" SRE_MODULE ".SRE_Match", sizeof(MatchObject), sizeof(Py_ssize_t), - (destructor)match_dealloc, /*tp_dealloc*/ - 0, /*tp_print*/ - (getattrfunc)match_getattr /*tp_getattr*/ + (destructor)match_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + (getattrfunc)match_getattr, /*tp_getattr*/ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_HAVE_INDEX, /* tp_flags */ + 0, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + match_methods, /* tp_methods */ }; -static PyObject* -pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status) -{ +static PyObject* pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status) { /* create match object (from state object) */ - - MatchObject* match; - Py_ssize_t i, j; - char* base; - int n; - if 
(status > 0) { + MatchObject* match; + char* base = (char*) state->beginning; + Py_ssize_t mark_index; + int charsize = state->charsize; /* create match object (with room for extra group marks) */ /* coverity[ampersand_in_size] */ - match = PyObject_NEW_VAR(MatchObject, &Match_Type, - 2*(pattern->groups+1)); + match = PyObject_NEW_VAR(MatchObject, &Match_Type, 2 * (pattern->internal_groups + 1)); if (!match) return NULL; @@ -3673,36 +5628,37 @@ match->string = state->string; match->regs = NULL; - match->groups = pattern->groups+1; + match->groups = pattern->groups + 1; + match->internal_groups = pattern->internal_groups + 1; /* fill in group slices */ + if (state->reverse) { + match->mark[0] = ((char*) state->ptr - base) / charsize; + match->mark[1] = ((char*) state->end - base) / charsize; + } else { + match->mark[0] = ((char*) state->start - base) / charsize; + match->mark[1] = ((char*) state->ptr - base) / charsize; + } - base = (char*) state->beginning; - n = state->charsize; - - match->mark[0] = ((char*) state->start - base) / n; - match->mark[1] = ((char*) state->ptr - base) / n; - - for (i = j = 0; i < pattern->groups; i++, j+=2) - if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) { - match->mark[j+2] = ((char*) state->mark[j] - base) / n; - match->mark[j+3] = ((char*) state->mark[j+1] - base) / n; + for (mark_index = 0; mark_index < pattern->internal_groups * 2; mark_index += 2) { + if (state->mark[mark_index] != NULL && state->mark[mark_index] <= state->mark[mark_index + 1]) { + match->mark[mark_index + 2] = ((char*) state->mark[mark_index] - base) / charsize; + match->mark[mark_index + 3] = ((char*) state->mark[mark_index + 1] - base) / charsize; } else - match->mark[j+2] = match->mark[j+3] = -1; /* undefined */ + match->mark[mark_index + 2] = match->mark[mark_index + 3] = -1; /* unmatched */ + } match->pos = state->pos; match->endpos = state->endpos; match->lastindex = state->lastindex; + match->last_named_index = state->last_named_index; 
- return (PyObject*) match; - + return (PyObject*)match; } else if (status == 0) { - /* no match */ Py_INCREF(Py_None); return Py_None; - } /* internal error */ @@ -3714,30 +5670,27 @@ /* -------------------------------------------------------------------- */ /* scanner methods (experimental) */ -static void -scanner_dealloc(ScannerObject* self) -{ +static void scanner_dealloc(ScannerObject* self) { state_fini(&self->state); Py_DECREF(self->pattern); PyObject_DEL(self); } -static PyObject* -scanner_match(ScannerObject* self, PyObject *unused) -{ +static PyObject* scanner_match(ScannerObject* self, PyObject* unused) { SRE_STATE* state = &self->state; PyObject* match; int status; state_reset(state); - state->ptr = state->start; + state->ptr = state->reverse ? state->end : state->start; + memset(state->mark, 0, state->pattern_code[0] * sizeof(SRE_CHAR*)); if (state->charsize == 1) { - status = sre_match(state, PatternObject_GetCode(self->pattern)); + status = sre_bmatch(state, state->pattern_code); } else { #if defined(HAVE_UNICODE) - status = sre_umatch(state, PatternObject_GetCode(self->pattern)); + status = sre_umatch(state, state->pattern_code); #endif } if (PyErr_Occurred()) @@ -3746,31 +5699,36 @@ match = pattern_new_match((PatternObject*) self->pattern, state, status); - if (status == 0 || state->ptr == state->start) - state->start = (void*) ((char*) state->ptr + state->charsize); - else - state->start = state->ptr; + if (state->reverse) { + if (status == 0 || state->ptr == state->end) + state->end = (void*) ((char*) state->ptr - state->charsize); + else + state->end = state->ptr; + } else { + if (status == 0 || state->ptr == state->start) + state->start = (void*) ((char*) state->ptr + state->charsize); + else + state->start = state->ptr; + } return match; } -static PyObject* -scanner_search(ScannerObject* self, PyObject *unused) -{ +static PyObject* scanner_search(ScannerObject* self, PyObject* unused) { SRE_STATE* state = &self->state; PyObject* match; int 
status; state_reset(state); - state->ptr = state->start; + state->ptr = state->reverse ? state->end : state->start; if (state->charsize == 1) { - status = sre_search(state, PatternObject_GetCode(self->pattern)); + status = sre_bsearch(state, state->pattern_code); } else { #if defined(HAVE_UNICODE) - status = sre_usearch(state, PatternObject_GetCode(self->pattern)); + status = sre_usearch(state, state->pattern_code); #endif } if (PyErr_Occurred()) @@ -3779,10 +5737,17 @@ match = pattern_new_match((PatternObject*) self->pattern, state, status); - if (status == 0 || state->ptr == state->start) - state->start = (void*) ((char*) state->ptr + state->charsize); - else - state->start = state->ptr; + if (state->reverse) { + if (status == 0 || state->ptr == state->end) + state->end = (void*) ((char*) state->ptr - state->charsize); + else + state->end = state->ptr; + } else { + if (status == 0 || state->ptr == state->start) + state->start = (void*) ((char*) state->ptr + state->charsize); + else + state->start = state->ptr; + } return match; } @@ -3793,12 +5758,10 @@ {NULL, NULL} }; -static PyObject* -scanner_getattr(ScannerObject* self, char* name) -{ +static PyObject* scanner_getattr(ScannerObject* self, char* name) { PyObject* res; - res = Py_FindMethod(scanner_methods, (PyObject*) self, name); + res = Py_FindMethod(scanner_methods, (PyObject*)self, name); if (res) return res; @@ -3823,9 +5786,7 @@ (getattrfunc)scanner_getattr, /*tp_getattr*/ }; -static PyObject* -pattern_scanner(PatternObject* pattern, PyObject* args) -{ +static PyObject* pattern_scanner(PatternObject* pattern, PyObject* args) { /* create search state object */ ScannerObject* self; @@ -3833,6 +5794,8 @@ PyObject* string; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; + SRE_CODE* pattern_code; + if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end)) return NULL; @@ -3841,22 +5804,26 @@ if (!self) return NULL; - string = state_init(&self->state, pattern, string, start, end); + 
pattern_code = PatternObject_GetCode(pattern); + + string = state_init(&self->state, pattern, string, start, end, pattern_code); if (!string) { PyObject_DEL(self); return NULL; } Py_INCREF(pattern); - self->pattern = (PyObject*) pattern; + self->pattern = (PyObject*)pattern; - return (PyObject*) self; + return (PyObject*)self; } static PyMethodDef _functions[] = { {"compile", _compile, METH_VARARGS}, {"getcodesize", sre_codesize, METH_NOARGS}, {"getlower", sre_getlower, METH_VARARGS}, + {"getupper", sre_getupper, METH_VARARGS}, + {"gettitle", sre_gettitle, METH_VARARGS}, {NULL, NULL} }; @@ -3876,7 +5843,7 @@ m = Py_InitModule("_" SRE_MODULE, _functions); if (m == NULL) - return; + return; d = PyModule_GetDict(m); x = PyInt_FromLong(SRE_MAGIC);