=== modified file 'Doc/library/re.rst' --- old/Doc/library/re.rst 2008-05-31 13:05:34 +0000 +++ new/Doc/library/re.rst 2008-06-09 14:37:21 +0000 @@ -127,6 +127,21 @@ characters as possible will be matched. Using ``.*?`` in the previous expression will match only ``'

'``. +``*+``, ``++``, ``?+`` + Like the ``'*'``, ``'+'``, and ``'?'`` qualifiers, those where ``'+'`` is + appended also match as many times as possible. However, unlike the true greedy + qualifiers, these do not allow back-tracking when the expression following them + fails to match. These are known as :dfn:`Possessive` qualifiers. For example, + ``a*a`` will match ``'aaaa'`` because the ``a*`` will match all 4 ``'a'``\ s, but, + when the final ``'a'`` is encountered, the expression is backtracked so that in the + end the ``a*`` ends up matching 3 ``'a'``\ s total, and the fourth ``'a'`` is matched + by the final ``'a'``. However, when ``a*+a`` is used to match ``'aaaa'``, the + ``a*+`` will match all 4 ``'a'``\ s, but when the final ``'a'`` fails to find any more + characters to match, the expression cannot be backtracked and will thus fail to + match. + + .. versionadded:: 2.6 + ``{m}`` Specifies that exactly *m* copies of the previous RE should be matched; fewer matches cause the entire RE not to match. For example, ``a{6}`` will match @@ -148,6 +163,18 @@ 6-character string ``'aaaaaa'``, ``a{3,5}`` will match 5 ``'a'`` characters, while ``a{3,5}?`` will only match 3 characters. +``{m,n}+`` + Causes the resulting RE to match from *m* to *n* repetitions of the preceding + RE, attempting to match as many repetitions as possible *without* establishing any + backtracking points. This is the possessive version of the qualifier above. For + example, on the 6-character string ``'aaaaaa'``, ``a{3,5}+aa`` will attempt to match 5 + ``'a'`` characters, then, requiring 2 more ``'a'``\ s, will need more characters than + available and thus fail, while ``a{3,5}aa`` will match with ``a{3,5}`` capturing + 5, then 4 ``'a'``\ s by backtracking and then the final 2 ``'a'``\ s are matched by the + final ``aa`` in the pattern. + + .. 
versionadded:: 2.6 + ``'\'`` Either escapes special characters (permitting you to match characters like ``'*'``, ``'?'``, and so forth), or signals a special sequence; special @@ -293,6 +320,20 @@ some fixed length. Patterns which start with negative lookbehind assertions may match at the beginning of the string being searched. +``(?>...)`` + Attempts to match ``...`` as if it was a separate Regular Expression, and if + successful, continues to match the rest of the pattern following it. If the + subsequent pattern fails to match, the stack can only be unwound to a point + *before* the ``(?>...)`` because once exited, the expression, known as an + :dfn:`Atomic Group`, has thrown away all stack points within itself. Thus, + ``(?>.*).`` would never match anything because first the ``.*`` would match all + characters possible, then, having nothing left to match, the final ``.`` would + fail to match. Since there are no stack points saved in the Atomic Group, and + there is no stack point before it, the entire expression would thus fail to + match. + + .. versionadded:: 2.6 + ``(?(id/name)yes-pattern|no-pattern)`` Will try to match with ``yes-pattern`` if the group with given *id* or *name* exists, and with ``no-pattern`` if it doesn't. 
``no-pattern`` is optional and === modified file 'Lib/re.py' --- old/Lib/re.py 2008-05-20 07:49:57 +0000 +++ new/Lib/re.py 2008-05-24 16:05:21 +0000 @@ -235,7 +235,7 @@ if flags: raise ValueError('Cannot process flags argument with a compiled pattern') return pattern - if not sre_compile.isstring(pattern): + if not isinstance(pattern, basestring): raise TypeError, "first argument must be string or compiled pattern" try: p = sre_compile.compile(pattern, flags) === modified file 'Lib/sre_compile.py' --- old/Lib/sre_compile.py 2008-04-08 21:27:42 +0000 +++ new/Lib/sre_compile.py 2008-06-04 19:22:10 +0000 @@ -11,7 +11,7 @@ """Internal support module for sre""" import _sre, sys -import sre_parse + from sre_constants import * assert _sre.MAGIC == MAGIC, "SRE module mismatch" @@ -31,7 +31,8 @@ return s _LITERAL_CODES = set([LITERAL, NOT_LITERAL]) -_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT]) +_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT, + POSSESSIVE_REPEAT]) _SUCCESS_CODES = set([SUCCESS, FAILURE]) _ASSERT_CODES = set([ASSERT, ASSERT_NOT]) @@ -80,6 +81,8 @@ elif _simple(av) and op is not REPEAT: if op is MAX_REPEAT: emit(OPCODES[REPEAT_ONE]) + elif op is POSSESSIVE_REPEAT: + emit(OPCODES[POSSESSIVE_ONE]) else: emit(OPCODES[MIN_REPEAT_ONE]) skip = _len(code); emit(0) @@ -88,6 +91,14 @@ _compile(code, av[2], flags) emit(OPCODES[SUCCESS]) code[skip] = _len(code) - skip + elif op is POSSESSIVE_REPEAT: + emit(OPCODES[POSSESSIVE_REPEAT]) + skip = _len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags) + code[skip] = _len(code) - skip + emit(OPCODES[SUCCESS]) else: emit(OPCODES[REPEAT]) skip = _len(code); emit(0) @@ -95,6 +106,8 @@ emit(av[1]) _compile(code, av[2], flags) code[skip] = _len(code) - skip + # TODO: What if op is REPEAT, not MIN_REPEAT; + # Default of MIN_UNTIL may be wrong if op is MAX_REPEAT: emit(OPCODES[MAX_UNTIL]) else: @@ -108,6 +121,17 @@ if av[0]: emit(OPCODES[MARK]) emit((av[0]-1)*2+1) + elif op is ATOMIC_GROUP: 
+ # Atomic Groups are handled by starting with an Atomic + # Group op code, then putting in the atomic group pattern + # and finally a success op code to tell any repeat + # operations within the Atomic Group to stop eating and + # pop their stack if they reach it + emit(OPCODES[ATOMIC_GROUP]) + skip = _len(code); emit(0) + _compile(code, av, flags) + emit(OPCODES[SUCCESS]) + code[skip] = _len(code) - skip elif op in SUCCESS_CODES: emit(OPCODES[op]) elif op in ASSERT_CODES: @@ -149,7 +173,7 @@ emit(OPCODES[JUMP]) tailappend(_len(code)); emit(0) code[skip] = _len(code) - skip - emit(0) # end of branch + emit(OPCODES[FAILURE]) # end of branch for tail in tail: code[tail] = _len(code) - tail elif op is CATEGORY: @@ -470,19 +494,6 @@ _compile_charset(charset, flags, code) code[skip] = len(code) - skip -try: - unicode -except NameError: - STRING_TYPES = (type(""),) -else: - STRING_TYPES = (type(""), type(unicode(""))) - -def isstring(obj): - for tp in STRING_TYPES: - if isinstance(obj, tp): - return 1 - return 0 - def _code(p, flags): flags = p.pattern.flags | flags @@ -501,7 +512,8 @@ def compile(p, flags=0): # internal: convert pattern list to internal format - if isstring(p): + if isinstance(p, basestring): + import sre_parse pattern = p p = sre_parse.parse(p, flags) else: === modified file 'Lib/sre_constants.py' --- old/Lib/sre_constants.py 2004-08-25 02:22:30 +0000 +++ new/Lib/sre_constants.py 2008-06-10 22:54:27 +0000 @@ -13,7 +13,7 @@ # update when constants are added or removed -MAGIC = 20031017 +MAGIC = 20080329 # max code word in this release @@ -54,6 +54,7 @@ MAX_UNTIL = "max_until" MIN_REPEAT = "min_repeat" MIN_UNTIL = "min_until" +POSSESSIVE_REPEAT = "possessive_repeat" NEGATE = "negate" NOT_LITERAL = "not_literal" NOT_LITERAL_IGNORE = "not_literal_ignore" @@ -62,6 +63,8 @@ REPEAT_ONE = "repeat_one" SUBPATTERN = "subpattern" MIN_REPEAT_ONE = "min_repeat_one" +ATOMIC_GROUP = "atomic_group" +POSSESSIVE_ONE = "possessive_one" # positions AT_BEGINNING = 
"at_beginning" @@ -97,6 +100,10 @@ CATEGORY_UNI_LINEBREAK = "category_uni_linebreak" CATEGORY_UNI_NOT_LINEBREAK = "category_uni_not_linebreak" +SRE_GROUP_IGNORE = "ignore_this_group" +SRE_GROUP_CAPTURE = "capture_this_group" +SRE_GROUP_NON_CAPTURE = "non_capturing_group" + OPCODES = [ # failure=0 success=1 (just because it looks better that way :-) @@ -123,7 +130,10 @@ REPEAT, REPEAT_ONE, SUBPATTERN, - MIN_REPEAT_ONE + MIN_REPEAT_ONE, + ATOMIC_GROUP, + POSSESSIVE_REPEAT, + POSSESSIVE_ONE ] === modified file 'Lib/sre_parse.py' --- old/Lib/sre_parse.py 2008-05-27 01:18:39 +0000 +++ new/Lib/sre_parse.py 2008-06-09 14:37:21 +0000 @@ -149,7 +149,7 @@ return self.width lo = hi = 0L UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY) - REPEATCODES = (MIN_REPEAT, MAX_REPEAT) + REPEATCODES = (MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT) for op, av in self.data: if op is BRANCH: i = sys.maxint @@ -168,6 +168,10 @@ i, j = av[1].getwidth() lo = lo + i hi = hi + j + elif op is ATOMIC_GROUP: + i, j = av.getwidth() + lo = lo + i + hi = hi + j elif op in REPEATCODES: i, j = av[2].getwidth() lo = lo + long(i) * av[0] @@ -380,7 +384,7 @@ _PATTERNENDERS = set("|)") _ASSERTCHARS = set("=!<") _LOOKBEHINDASSERTCHARS = set("=!") -_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT]) +_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT]) def _parse(source, state): # parse a simple pattern @@ -422,8 +426,6 @@ # character set set = [] setappend = set.append -## if sourcematch(":"): -## pass # handle character classes if sourcematch("^"): setappend((NEGATE, None)) # check remaining characters @@ -520,19 +522,25 @@ if item[0][0] in REPEATCODES: raise error, "multiple repeat" if sourcematch("?"): + # Non-Greedy Match subpattern[-1] = (MIN_REPEAT, (min, max, item)) + elif sourcematch("+"): + # Possessive Match (Always Greedy) + subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item)) else: + # Greedy Match subpattern[-1] = (MAX_REPEAT, (min, max, item)) elif this == ".": 
subpatternappend((ANY, None)) elif this == "(": - group = 1 + grouptype = SRE_GROUP_CAPTURE name = None condgroup = None + atomic = False if sourcematch("?"): - group = 0 + grouptype = SRE_GROUP_IGNORE # options if sourcematch("P"): # python extensions @@ -546,7 +554,7 @@ if char == ">": break name = name + char - group = 1 + grouptype = SRE_GROUP_CAPTURE if not isname(name): raise error, "bad character in group name" elif sourcematch("="): @@ -573,7 +581,7 @@ raise error, "unknown specifier: ?P%s" % char elif sourcematch(":"): # non-capturing group - group = 2 + grouptype = SRE_GROUP_NON_CAPTURE elif sourcematch("#"): # comment while 1: @@ -610,7 +618,7 @@ if char == ")": break condname = condname + char - group = 2 + grouptype = SRE_GROUP_NON_CAPTURE if isname(condname): condgroup = state.groupdict.get(condname) if condgroup is None: @@ -620,15 +628,19 @@ condgroup = int(condname) except ValueError: raise error, "bad character in group name" + elif sourcematch(">"): + # non-capturing, atomic group + grouptype = SRE_GROUP_NON_CAPTURE + atomic = True else: # flags if not source.next in FLAGS: raise error, "unexpected end of pattern" while source.next in FLAGS: state.flags = state.flags | FLAGS[sourceget()] - if group: + if grouptype != SRE_GROUP_IGNORE: # parse group contents - if group == 2: + if grouptype == SRE_GROUP_NON_CAPTURE: # anonymous group group = None else: @@ -641,7 +653,12 @@ raise error, "unbalanced parenthesis" if group is not None: state.closegroup(group) - subpatternappend((SUBPATTERN, (group, p))) + if atomic: + # TODO: Assert that group is always None in this + # case + subpatternappend((ATOMIC_GROUP, p)) + else: + subpatternappend((SUBPATTERN, (group, p))) else: while 1: char = sourceget() === modified file 'Lib/test/test_re.py' --- old/Lib/test/test_re.py 2008-01-10 21:59:42 +0000 +++ new/Lib/test/test_re.py 2008-06-14 13:56:51 +0000 @@ -35,6 +35,23 @@ self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3)) self.assertEqual(re.match('a+', 
'xxx'), None) + def test_branching(self): + """Test Branching + Test expressions using the OR ('|') operator.""" + self.assertEqual(re.match('(ab|ba)', 'ab').span(), (0, 2)) + self.assertEqual(re.match('(ab|ba)', 'ba').span(), (0, 2)) + self.assertEqual(re.match('(abc|bac|ca|cb)', 'abc').span(), + (0, 3)) + self.assertEqual(re.match('(abc|bac|ca|cb)', 'bac').span(), + (0, 3)) + self.assertEqual(re.match('(abc|bac|ca|cb)', 'ca').span(), + (0, 2)) + self.assertEqual(re.match('(abc|bac|ca|cb)', 'cb').span(), + (0, 2)) + self.assertEqual(re.match('((a)|(b)|(c))', 'a').span(), (0, 1)) + self.assertEqual(re.match('((a)|(b)|(c))', 'b').span(), (0, 1)) + self.assertEqual(re.match('((a)|(b)|(c))', 'c').span(), (0, 1)) + def bump_num(self, matchobj): int_value = int(matchobj.group(0)) return str(int_value + 1) @@ -644,8 +661,8 @@ def test_inline_flags(self): # Bug #1700 - upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Bellow - lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Bellow + upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Below + lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Below p = re.compile(upper_char, re.I | re.U) q = p.match(lower_char) @@ -672,7 +689,8 @@ self.assertNotEqual(q, None) def test_dollar_matches_twice(self): - "$ matches the end of string, and just before the terminating \n" + """Test that $ does not include \\n + $ matches the end of string, and just before the terminating \n""" pattern = re.compile('$') self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#') @@ -683,6 +701,62 @@ self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') self.assertEqual(pattern.sub('#', '\n'), '#\n#') + def test_possessive_qualifiers(self): + """Test Possessive Qualifiers + test qualifiers of the form @+ for some repetition operator @, + e.g. 
x{3,5}+ meaning match from 3 to 5 greedily and proceed + without creating a stack frame for rolling the stack back and + trying 1 or more fewer matches.""" + self.assertEqual(re.match('e*+e', 'eeee'), None) + self.assertEqual(re.match('e++a', 'eeea').group(0), 'eeea') + self.assertEqual(re.match('e?+a', 'ea').group(0), 'ea') + self.assertEqual(re.match('e{2,4}+a', 'eeea').group(0), 'eeea') + self.assertEqual(re.match('(.)++.', 'ee'), None) + self.assertEqual(re.match('(ae)*+a', 'aea').groups(), ('ae',)) + self.assertEqual(re.match('([ae][ae])?+a', 'aea').groups(), + ('ae',)) + self.assertEqual(re.match('(e?){2,4}+a', 'eeea').groups(), + ('',)) + self.assertEqual(re.match('()*+a', 'a').groups(), ('',)) + self.assertEqual(re.search('x*+', 'axx').span(0), (0, 0)) + self.assertEqual(re.search('x*+', 'axx').span(), (0, 0)) + self.assertEqual(re.search('x++', 'axx').span(0), (1, 3)) + self.assertEqual(re.search('x++', 'axx').span(), (1, 3)) + self.assertEqual(re.match('a*+', 'xxx').span(0), (0, 0)) + self.assertEqual(re.match('a*+', 'xxx').span(), (0, 0)) + self.assertEqual(re.match('x*+', 'xxxa').span(0), (0, 3)) + self.assertEqual(re.match('x*+', 'xxxa').span(), (0, 3)) + self.assertEqual(re.match('a++', 'xxx'), None) + self.assertEqual(re.match("^(\w){1}+$", "abc"), None) + self.assertEqual(re.match("^(\w){1,2}+$", "abc"), None) + + self.assertEqual(re.match("^(\w){3}+$", "abc").group(1), "c") + self.assertEqual(re.match("^(\w){1,3}+$", "abc").group(1), "c") + self.assertEqual(re.match("^(\w){1,4}+$", "abc").group(1), "c") + + self.assertEqual(re.match("^x{1}+$", "xxx"), None) + self.assertEqual(re.match("^x{1,2}+$", "xxx"), None) + + self.assertNotEqual(re.match("^x{3}+$", "xxx"), None) + self.assertNotEqual(re.match("^x{1,3}+$", "xxx"), None) + self.assertNotEqual(re.match("^x{1,4}+$", "xxx"), None) + + self.assertEqual(re.match("^x{}+$", "xxx"), None) + self.assertNotEqual(re.match("^x{}+$", "x{}"), None) + + def test_atomic_grouping(self): + """Test Atomic 
Grouping + test non-capturing groups of the form (?>...), which does + not maintain any stack point created within the group once the + group is finished being evaluated.""" + pattern1 = re.compile(r'a(?>bc|b)c') + self.assertEqual(pattern1.match('abc'), None) + self.assertNotEqual(pattern1.match('abcc'), None) + self.assertEqual(re.match(r'(?>.*).', 'abc'), None) + self.assertNotEqual(re.match(r'(?>x)++', 'xxx'), None) + self.assertNotEqual(re.match(r'(?>x++)', 'xxx'), None) + self.assertEqual(re.match(r'(?>x)++x', 'xxx'), None) + self.assertEqual(re.match(r'(?>x++)x', 'xxx'), None) def run_re_tests(): from test.re_tests import benchmarks, tests, SUCCEED, FAIL, SYNTAX_ERROR === modified file 'Modules/_sre.c' --- old/Modules/_sre.c 2008-06-09 04:58:54 +0000 +++ new/Modules/_sre.c 2008-06-09 14:37:21 +0000 @@ -55,8 +55,8 @@ #define SRE_PY_MODULE "re" -/* defining this one enables tracing */ -#undef VERBOSE +/* uncomment this define to enable tracing */ +/* #define VERBOSE_SRE_ENGINE */ #if PY_VERSION_HEX >= 0x01060000 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE) @@ -101,7 +101,7 @@ #define SRE_ERROR_MEMORY -9 /* out of memory */ #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */ -#if defined(VERBOSE) +#if defined(VERBOSE_SRE_ENGINE) #define TRACE(v) printf v #else #define TRACE(v) @@ -775,6 +775,9 @@ #define JUMP_BRANCH 11 #define JUMP_ASSERT 12 #define JUMP_ASSERT_NOT 13 +#define JUMP_POSS_REPEAT_1 14 +#define JUMP_POSS_REPEAT_2 15 +#define JUMP_ATOMIC_GROUP 16 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \ DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \ @@ -1162,6 +1165,57 @@ } RETURN_FAILURE; + case SRE_OP_POSSESSIVE_ONE: + /* match repeated sequence (maximizing regexp) without + backtracking */ + + /* this operator only works if the repeated item is + exactly one character wide, and we're not already + collecting backtracking points. 
for other cases, + use the MAX_REPEAT operator */ + + /* <1=min> <2=max> item + tail */ + + TRACE(("|%p|%p|POSSESSIVE_ONE %d %d\n", ctx->pattern, + ctx->ptr, ctx->pattern[1], ctx->pattern[2])); + + if (ctx->ptr + ctx->pattern[1] > end) { + RETURN_FAILURE; /* cannot match */ + } + + state->ptr = ctx->ptr; + + ret = SRE_COUNT(state, ctx->pattern + 3, ctx->pattern[2]); + RETURN_ON_ERROR(ret); + DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos); + ctx->count = ret; + ctx->ptr += ctx->count; + + /* when we arrive here, count contains the number of + matches, and ctx->ptr points to the tail of the target + string. check if the rest of the pattern matches, + and fail if not. */ + + /* Test for not enough repetitions in match */ + if (ctx->count < (Py_ssize_t) ctx->pattern[1]) { + RETURN_FAILURE; + } + + /* Update the pattern to point to the next op code */ + ctx->pattern += ctx->pattern[0]; + + /* Let the tail be evaluated separately and consider this + match successful. */ + if (*ctx->pattern == SRE_OP_SUCCESS) { + /* tail is empty. we're finished */ + state->ptr = ctx->ptr; + RETURN_SUCCESS; + } + + /* Attempt to match the rest of the string */ + break; + case SRE_OP_REPEAT: /* create repeat context. all the hard work is done by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ @@ -1317,10 +1371,141 @@ state->ptr = ctx->ptr; RETURN_FAILURE; + case SRE_OP_POSSESSIVE_REPEAT: + /* create possessive repeat contexts. */ + /* <1=min> <2=max> pattern + tail */ + TRACE(("|%p|%p|POSSESSIVE_REPEAT %d %d\n", ctx->pattern, + ctx->ptr, ctx->pattern[1], ctx->pattern[2])); + + /* Set the global Input pointer to this context's Input + pointer */ + state->ptr = ctx->ptr; + + /* Initialize Count to 0 */ + ctx->count = 0; + + /* Check for minimum required matches. 
*/ + while (ctx->count < (int)ctx->pattern[1]) { + /* not enough matches */ + DO_JUMP(JUMP_POSS_REPEAT_1, jump_poss_repeat_1, + &ctx->pattern[3]); + if (ret) { + RETURN_ON_ERROR(ret); + ctx->count++; + } + else { + state->ptr = ctx->ptr; + RETURN_FAILURE; + } + } + + /* Clear the context's Input stream pointer so that it + doesn't match the global state so that the while loop can + be entered. */ + ctx->ptr = NULL; + + /* Keep trying to parse the sub-pattern until the + end is reached, creating a new context each time. */ + while ((ctx->count < (int)ctx->pattern[2] || + (int)ctx->pattern[2] == 65535) && + state->ptr != ctx->ptr) { + /* Save the Capture Group Marker state into the current + Context and back up the current highest number + Capture Group marker. */ + LASTMARK_SAVE(); + MARK_PUSH(ctx->lastmark); + + /* zero-width match protection */ + /* Set the context's Input Stream pointer to be the + current Input Stream pointer from the global + state. When the loop reaches the next iteration, + the context will then store the last known good + position with the global state holding the + Input Stream position that has been updated with + the most recent match. Thus, if state's Input + stream remains the same as the one stored in the + current Context, we know we have successfully + matched an empty string and that all subsequent + matches will also be the empty string until the + maximum number of matches are counted, and because + of this, we could immediately stop at that point and + consider this match successful. */ + ctx->ptr = state->ptr; + + /* We have not reached the maximum matches, so try to + match once more. */ + DO_JUMP(JUMP_POSS_REPEAT_2, jump_poss_repeat_2, + &ctx->pattern[3]); + + /* Check to see if the last attempted match + succeeded. */ + if (ret) { + /* Drop the saved highest number Capture Group + marker saved above and use the newly updated + value. 
*/ + MARK_POP_DISCARD(ctx->lastmark); + RETURN_ON_ERROR(ret); + + /* Success, increment the count. */ + ctx->count++; + } + /* Last attempted match failed. */ + else { + /* Restore the previously saved highest number + Capture Group marker since the last iteration + did not match, then restore that to the global + state. */ + MARK_POP(ctx->lastmark); + LASTMARK_RESTORE(); + + /* We have sufficient matches, so exit loop. */ + break; + } + } + + /* Evaluate Tail */ + /* Jump to end of pattern indicated by skip, and then skip + the SUCCESS op code that follows it. */ + ctx->pattern += ctx->pattern[0] + 1; + ctx->ptr = state->ptr; + break; + + case SRE_OP_ATOMIC_GROUP: + /* Atomic Group Sub Pattern */ + /* pattern tail */ + TRACE(("|%p|%p|ATOMIC_GROUP\n", ctx->pattern, ctx->ptr)); + + /* Set the global Input pointer to this context's Input + pointer */ + state->ptr = ctx->ptr; + + /* Evaluate the Atomic Group in a new context, terminating + when the end of the group, represented by a SUCCESS op + code, is reached. */ + /* Group Pattern begins at an offset of 1 code. */ + DO_JUMP(JUMP_ATOMIC_GROUP, jump_atomic_group, + &ctx->pattern[1]); + + /* Test Exit Condition */ + RETURN_ON_ERROR(ret); + + if (ret == 0) { + /* Atomic Group failed to Match. */ + state->ptr = ctx->ptr; + RETURN_FAILURE; + } + + /* Evaluate Tail */ + /* Jump to end of pattern indicated by skip, and then skip + the SUCCESS op code that follows it. 
*/ + ctx->pattern += ctx->pattern[0]; + ctx->ptr = state->ptr; + break; + case SRE_OP_GROUPREF: /* match backreference */ - TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern, - ctx->ptr, ctx->pattern[0])); + TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern, ctx->ptr, ctx->pattern[0])); i = ctx->pattern[0]; { Py_ssize_t groupref = i+i; @@ -1459,6 +1644,12 @@ case JUMP_MIN_UNTIL_1: TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr)); goto jump_min_until_1; + case JUMP_POSS_REPEAT_1: + TRACE(("|%p|%p|JUMP_POSS_REPEAT_1\n", ctx->pattern, ctx->ptr)); + goto jump_poss_repeat_1; + case JUMP_POSS_REPEAT_2: + TRACE(("|%p|%p|JUMP_POSS_REPEAT_2\n", ctx->pattern, ctx->ptr)); + goto jump_poss_repeat_2; case JUMP_REPEAT: TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr)); goto jump_repeat; @@ -1471,6 +1662,9 @@ case JUMP_MIN_REPEAT_ONE: TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr)); goto jump_min_repeat_one; + case JUMP_ATOMIC_GROUP: + TRACE(("|%p|%p|JUMP_ATOMIC_GROUP\n", ctx->pattern, ctx->ptr)); + goto jump_atomic_group; case JUMP_ASSERT: TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr)); goto jump_assert; === modified file 'Modules/sre_constants.h' --- old/Modules/sre_constants.h 2003-10-17 22:13:16 +0000 +++ new/Modules/sre_constants.h 2008-06-04 19:22:10 +0000 @@ -11,7 +11,7 @@ * See the _sre.c file for information on usage and redistribution. */ -#define SRE_MAGIC 20031017 +#define SRE_MAGIC 20080329 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 @@ -44,6 +44,9 @@ #define SRE_OP_REPEAT_ONE 29 #define SRE_OP_SUBPATTERN 30 #define SRE_OP_MIN_REPEAT_ONE 31 +#define SRE_OP_ATOMIC_GROUP 32 +#define SRE_OP_POSSESSIVE_REPEAT 33 +#define SRE_OP_POSSESSIVE_ONE 34 #define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BEGINNING_STRING 2