=== modified file 'Doc/library/re.rst'
--- old/Doc/library/re.rst 2008-05-31 13:05:34 +0000
+++ new/Doc/library/re.rst 2008-06-09 14:37:21 +0000
@@ -127,6 +127,21 @@
characters as possible will be matched. Using ``.*?`` in the previous
expression will match only ``'<H1>'``.
+``*+``, ``++``, ``?+``
+ Like the ``'*'``, ``'+'``, and ``'?'`` qualifiers, those where ``'+'`` is
+ appended also match as many times as possible. However, unlike the true greedy
+ qualifiers, these do not allow back-tracking when the expression following them
+ fails to match. These are known as :dfn:`Possessive` qualifiers. For example,
+ ``a*a`` will match ``'aaaa'`` because the ``a*`` will first match all 4 ``'a'``\ s,
+ but, when the final ``'a'`` then fails to match, the expression is backtracked so
+ that in the end the ``a*`` matches 3 ``'a'``\ s in total, and the fourth ``'a'`` is
+ matched by the final ``'a'``. However, when ``a*+a`` is used to match ``'aaaa'``,
+ the ``a*+`` will match all 4 ``'a'``\ s, but when the final ``'a'`` fails to find
+ any more characters to match, the expression cannot be backtracked and will thus
+ fail to match.
+
+ .. versionadded:: 2.6
+
``{m}``
Specifies that exactly *m* copies of the previous RE should be matched; fewer
matches cause the entire RE not to match. For example, ``a{6}`` will match
@@ -148,6 +163,18 @@
6-character string ``'aaaaaa'``, ``a{3,5}`` will match 5 ``'a'`` characters,
while ``a{3,5}?`` will only match 3 characters.
+``{m,n}+``
+ Causes the resulting RE to match from *m* to *n* repetitions of the preceding
+ RE, attempting to match as many repetitions as possible *without* establishing any
+ backtracking points. This is the possessive version of the qualifier above. For
+ example, on the 6-character string ``'aaaaaa'``, ``a{3,5}+aa`` will attempt to match
+ 5 ``'a'`` characters, then, requiring 2 more ``'a'``\ s, will need more characters
+ than available and thus fail, while ``a{3,5}aa`` will match with ``a{3,5}`` capturing
+ 5, then 4 ``'a'``\ s by backtracking, and then the final 2 ``'a'``\ s are matched by
+ the final ``aa`` in the pattern.
+
+ .. versionadded:: 2.6
+
``'\'``
Either escapes special characters (permitting you to match characters like
``'*'``, ``'?'``, and so forth), or signals a special sequence; special
@@ -293,6 +320,20 @@
some fixed length. Patterns which start with negative lookbehind assertions may
match at the beginning of the string being searched.
+``(?>...)``
+ Attempts to match ``...`` as if it were a separate Regular Expression, and if
+ successful, continues to match the rest of the pattern following it. If the
+ subsequent pattern fails to match, the stack can only be unwound to a point
+ *before* the ``(?>...)`` because once exited, the expression, known as an
+ :dfn:`Atomic Group`, has thrown away all stack points within itself. Thus,
+ ``(?>.*).`` would never match anything because first the ``.*`` would match all
+ characters possible, then, having nothing left to match, the final ``.`` would
+ fail to match. Since there are no stack points saved in the Atomic Group, and
+ there is no stack point before it, the entire expression would thus fail to
+ match.
+
+ .. versionadded:: 2.6
+
``(?(id/name)yes-pattern|no-pattern)``
Will try to match with ``yes-pattern`` if the group with given *id* or *name*
exists, and with ``no-pattern`` if it doesn't. ``no-pattern`` is optional and
=== modified file 'Lib/re.py'
--- old/Lib/re.py 2008-05-20 07:49:57 +0000
+++ new/Lib/re.py 2008-05-24 16:05:21 +0000
@@ -235,7 +235,7 @@
if flags:
raise ValueError('Cannot process flags argument with a compiled pattern')
return pattern
- if not sre_compile.isstring(pattern):
+ if not isinstance(pattern, basestring):
raise TypeError, "first argument must be string or compiled pattern"
try:
p = sre_compile.compile(pattern, flags)
=== modified file 'Lib/sre_compile.py'
--- old/Lib/sre_compile.py 2008-04-08 21:27:42 +0000
+++ new/Lib/sre_compile.py 2008-06-04 19:22:10 +0000
@@ -11,7 +11,7 @@
"""Internal support module for sre"""
import _sre, sys
-import sre_parse
+
from sre_constants import *
assert _sre.MAGIC == MAGIC, "SRE module mismatch"
@@ -31,7 +31,8 @@
return s
_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
-_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
+_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT,
+ POSSESSIVE_REPEAT])
_SUCCESS_CODES = set([SUCCESS, FAILURE])
_ASSERT_CODES = set([ASSERT, ASSERT_NOT])
@@ -80,6 +81,8 @@
elif _simple(av) and op is not REPEAT:
if op is MAX_REPEAT:
emit(OPCODES[REPEAT_ONE])
+ elif op is POSSESSIVE_REPEAT:
+ emit(OPCODES[POSSESSIVE_ONE])
else:
emit(OPCODES[MIN_REPEAT_ONE])
skip = _len(code); emit(0)
@@ -88,6 +91,14 @@
_compile(code, av[2], flags)
emit(OPCODES[SUCCESS])
code[skip] = _len(code) - skip
+ elif op is POSSESSIVE_REPEAT:
+ emit(OPCODES[POSSESSIVE_REPEAT])
+ skip = _len(code); emit(0)
+ emit(av[0])
+ emit(av[1])
+ _compile(code, av[2], flags)
+ code[skip] = _len(code) - skip
+ emit(OPCODES[SUCCESS])
else:
emit(OPCODES[REPEAT])
skip = _len(code); emit(0)
@@ -95,6 +106,8 @@
emit(av[1])
_compile(code, av[2], flags)
code[skip] = _len(code) - skip
+ # TODO: What if op is REPEAT, not MIN_REPEAT;
+ # Default of MIN_UNTIL may be wrong
if op is MAX_REPEAT:
emit(OPCODES[MAX_UNTIL])
else:
@@ -108,6 +121,17 @@
if av[0]:
emit(OPCODES[MARK])
emit((av[0]-1)*2+1)
+ elif op is ATOMIC_GROUP:
+ # Atomic Groups are handled by starting with an Atomic
+ # Group op code, then putting in the atomic group pattern
+ # and finally a success op code to tell any repeat
+ # operations within the Atomic Group to stop eating and
+ # pop their stack if they reach it
+ emit(OPCODES[ATOMIC_GROUP])
+ skip = _len(code); emit(0)
+ _compile(code, av, flags)
+ emit(OPCODES[SUCCESS])
+ code[skip] = _len(code) - skip
elif op in SUCCESS_CODES:
emit(OPCODES[op])
elif op in ASSERT_CODES:
@@ -149,7 +173,7 @@
emit(OPCODES[JUMP])
tailappend(_len(code)); emit(0)
code[skip] = _len(code) - skip
- emit(0) # end of branch
+ emit(OPCODES[FAILURE]) # end of branch
for tail in tail:
code[tail] = _len(code) - tail
elif op is CATEGORY:
@@ -470,19 +494,6 @@
_compile_charset(charset, flags, code)
code[skip] = len(code) - skip
-try:
- unicode
-except NameError:
- STRING_TYPES = (type(""),)
-else:
- STRING_TYPES = (type(""), type(unicode("")))
-
-def isstring(obj):
- for tp in STRING_TYPES:
- if isinstance(obj, tp):
- return 1
- return 0
-
def _code(p, flags):
flags = p.pattern.flags | flags
@@ -501,7 +512,8 @@
def compile(p, flags=0):
# internal: convert pattern list to internal format
- if isstring(p):
+ if isinstance(p, basestring):
+ import sre_parse
pattern = p
p = sre_parse.parse(p, flags)
else:
=== modified file 'Lib/sre_constants.py'
--- old/Lib/sre_constants.py 2004-08-25 02:22:30 +0000
+++ new/Lib/sre_constants.py 2008-06-10 22:54:27 +0000
@@ -13,7 +13,7 @@
# update when constants are added or removed
-MAGIC = 20031017
+MAGIC = 20080329
# max code word in this release
@@ -54,6 +54,7 @@
MAX_UNTIL = "max_until"
MIN_REPEAT = "min_repeat"
MIN_UNTIL = "min_until"
+POSSESSIVE_REPEAT = "possessive_repeat"
NEGATE = "negate"
NOT_LITERAL = "not_literal"
NOT_LITERAL_IGNORE = "not_literal_ignore"
@@ -62,6 +63,8 @@
REPEAT_ONE = "repeat_one"
SUBPATTERN = "subpattern"
MIN_REPEAT_ONE = "min_repeat_one"
+ATOMIC_GROUP = "atomic_group"
+POSSESSIVE_ONE = "possessive_one"
# positions
AT_BEGINNING = "at_beginning"
@@ -97,6 +100,10 @@
CATEGORY_UNI_LINEBREAK = "category_uni_linebreak"
CATEGORY_UNI_NOT_LINEBREAK = "category_uni_not_linebreak"
+SRE_GROUP_IGNORE = "ignore_this_group"
+SRE_GROUP_CAPTURE = "capture_this_group"
+SRE_GROUP_NON_CAPTURE = "non_capturing_group"
+
OPCODES = [
# failure=0 success=1 (just because it looks better that way :-)
@@ -123,7 +130,10 @@
REPEAT,
REPEAT_ONE,
SUBPATTERN,
- MIN_REPEAT_ONE
+ MIN_REPEAT_ONE,
+ ATOMIC_GROUP,
+ POSSESSIVE_REPEAT,
+ POSSESSIVE_ONE
]
=== modified file 'Lib/sre_parse.py'
--- old/Lib/sre_parse.py 2008-05-27 01:18:39 +0000
+++ new/Lib/sre_parse.py 2008-06-09 14:37:21 +0000
@@ -149,7 +149,7 @@
return self.width
lo = hi = 0L
UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
- REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
+ REPEATCODES = (MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT)
for op, av in self.data:
if op is BRANCH:
i = sys.maxint
@@ -168,6 +168,10 @@
i, j = av[1].getwidth()
lo = lo + i
hi = hi + j
+ elif op is ATOMIC_GROUP:
+ i, j = av.getwidth()
+ lo = lo + i
+ hi = hi + j
elif op in REPEATCODES:
i, j = av[2].getwidth()
lo = lo + long(i) * av[0]
@@ -380,7 +384,7 @@
_PATTERNENDERS = set("|)")
_ASSERTCHARS = set("=!<")
_LOOKBEHINDASSERTCHARS = set("=!")
-_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT])
+_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT])
def _parse(source, state):
# parse a simple pattern
@@ -422,8 +426,6 @@
# character set
set = []
setappend = set.append
-## if sourcematch(":"):
-## pass # handle character classes
if sourcematch("^"):
setappend((NEGATE, None))
# check remaining characters
@@ -520,19 +522,25 @@
if item[0][0] in REPEATCODES:
raise error, "multiple repeat"
if sourcematch("?"):
+ # Non-Greedy Match
subpattern[-1] = (MIN_REPEAT, (min, max, item))
+ elif sourcematch("+"):
+ # Possessive Match (Always Greedy)
+ subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item))
else:
+ # Greedy Match
subpattern[-1] = (MAX_REPEAT, (min, max, item))
elif this == ".":
subpatternappend((ANY, None))
elif this == "(":
- group = 1
+ grouptype = SRE_GROUP_CAPTURE
name = None
condgroup = None
+ atomic = False
if sourcematch("?"):
- group = 0
+ grouptype = SRE_GROUP_IGNORE
# options
if sourcematch("P"):
# python extensions
@@ -546,7 +554,7 @@
if char == ">":
break
name = name + char
- group = 1
+ grouptype = SRE_GROUP_CAPTURE
if not isname(name):
raise error, "bad character in group name"
elif sourcematch("="):
@@ -573,7 +581,7 @@
raise error, "unknown specifier: ?P%s" % char
elif sourcematch(":"):
# non-capturing group
- group = 2
+ grouptype = SRE_GROUP_NON_CAPTURE
elif sourcematch("#"):
# comment
while 1:
@@ -610,7 +618,7 @@
if char == ")":
break
condname = condname + char
- group = 2
+ grouptype = SRE_GROUP_NON_CAPTURE
if isname(condname):
condgroup = state.groupdict.get(condname)
if condgroup is None:
@@ -620,15 +628,19 @@
condgroup = int(condname)
except ValueError:
raise error, "bad character in group name"
+ elif sourcematch(">"):
+ # non-capturing, atomic group
+ grouptype = SRE_GROUP_NON_CAPTURE
+ atomic = True
else:
# flags
if not source.next in FLAGS:
raise error, "unexpected end of pattern"
while source.next in FLAGS:
state.flags = state.flags | FLAGS[sourceget()]
- if group:
+ if grouptype != SRE_GROUP_IGNORE:
# parse group contents
- if group == 2:
+ if grouptype == SRE_GROUP_NON_CAPTURE:
# anonymous group
group = None
else:
@@ -641,7 +653,12 @@
raise error, "unbalanced parenthesis"
if group is not None:
state.closegroup(group)
- subpatternappend((SUBPATTERN, (group, p)))
+ if atomic:
+ # TODO: Assert that group is always None in this
+ # case
+ subpatternappend((ATOMIC_GROUP, p))
+ else:
+ subpatternappend((SUBPATTERN, (group, p)))
else:
while 1:
char = sourceget()
=== modified file 'Lib/test/test_re.py'
--- old/Lib/test/test_re.py 2008-01-10 21:59:42 +0000
+++ new/Lib/test/test_re.py 2008-06-14 13:56:51 +0000
@@ -35,6 +35,23 @@
self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
self.assertEqual(re.match('a+', 'xxx'), None)
+ def test_branching(self):
+ """Test Branching
+ Test expressions using the OR ('|') operator."""
+ self.assertEqual(re.match('(ab|ba)', 'ab').span(), (0, 2))
+ self.assertEqual(re.match('(ab|ba)', 'ba').span(), (0, 2))
+ self.assertEqual(re.match('(abc|bac|ca|cb)', 'abc').span(),
+ (0, 3))
+ self.assertEqual(re.match('(abc|bac|ca|cb)', 'bac').span(),
+ (0, 3))
+ self.assertEqual(re.match('(abc|bac|ca|cb)', 'ca').span(),
+ (0, 2))
+ self.assertEqual(re.match('(abc|bac|ca|cb)', 'cb').span(),
+ (0, 2))
+ self.assertEqual(re.match('((a)|(b)|(c))', 'a').span(), (0, 1))
+ self.assertEqual(re.match('((a)|(b)|(c))', 'b').span(), (0, 1))
+ self.assertEqual(re.match('((a)|(b)|(c))', 'c').span(), (0, 1))
+
def bump_num(self, matchobj):
int_value = int(matchobj.group(0))
return str(int_value + 1)
@@ -644,8 +661,8 @@
def test_inline_flags(self):
# Bug #1700
- upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Bellow
- lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Bellow
+ upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Below
+ lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Below
p = re.compile(upper_char, re.I | re.U)
q = p.match(lower_char)
@@ -672,7 +689,8 @@
self.assertNotEqual(q, None)
def test_dollar_matches_twice(self):
- "$ matches the end of string, and just before the terminating \n"
+ """Test that $ does not include \\n
+ $ matches the end of string, and just before the terminating \n"""
pattern = re.compile('$')
self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
@@ -683,6 +701,62 @@
self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
self.assertEqual(pattern.sub('#', '\n'), '#\n#')
+ def test_possessive_qualifiers(self):
+ """Test Possessive Qualifiers
+ test qualifiers of the form @+ for some repetition operator @,
+ e.g. x{3,5}+ meaning match from 3 to 5 greedily and proceed
+ without creating a stack frame for rolling the stack back and
+ trying 1 or more fewer matches."""
+ self.assertEqual(re.match('e*+e', 'eeee'), None)
+ self.assertEqual(re.match('e++a', 'eeea').group(0), 'eeea')
+ self.assertEqual(re.match('e?+a', 'ea').group(0), 'ea')
+ self.assertEqual(re.match('e{2,4}+a', 'eeea').group(0), 'eeea')
+ self.assertEqual(re.match('(.)++.', 'ee'), None)
+ self.assertEqual(re.match('(ae)*+a', 'aea').groups(), ('ae',))
+ self.assertEqual(re.match('([ae][ae])?+a', 'aea').groups(),
+ ('ae',))
+ self.assertEqual(re.match('(e?){2,4}+a', 'eeea').groups(),
+ ('',))
+ self.assertEqual(re.match('()*+a', 'a').groups(), ('',))
+ self.assertEqual(re.search('x*+', 'axx').span(0), (0, 0))
+ self.assertEqual(re.search('x*+', 'axx').span(), (0, 0))
+ self.assertEqual(re.search('x++', 'axx').span(0), (1, 3))
+ self.assertEqual(re.search('x++', 'axx').span(), (1, 3))
+ self.assertEqual(re.match('a*+', 'xxx').span(0), (0, 0))
+ self.assertEqual(re.match('a*+', 'xxx').span(), (0, 0))
+ self.assertEqual(re.match('x*+', 'xxxa').span(0), (0, 3))
+ self.assertEqual(re.match('x*+', 'xxxa').span(), (0, 3))
+ self.assertEqual(re.match('a++', 'xxx'), None)
+ self.assertEqual(re.match("^(\w){1}+$", "abc"), None)
+ self.assertEqual(re.match("^(\w){1,2}+$", "abc"), None)
+
+ self.assertEqual(re.match("^(\w){3}+$", "abc").group(1), "c")
+ self.assertEqual(re.match("^(\w){1,3}+$", "abc").group(1), "c")
+ self.assertEqual(re.match("^(\w){1,4}+$", "abc").group(1), "c")
+
+ self.assertEqual(re.match("^x{1}+$", "xxx"), None)
+ self.assertEqual(re.match("^x{1,2}+$", "xxx"), None)
+
+ self.assertNotEqual(re.match("^x{3}+$", "xxx"), None)
+ self.assertNotEqual(re.match("^x{1,3}+$", "xxx"), None)
+ self.assertNotEqual(re.match("^x{1,4}+$", "xxx"), None)
+
+ self.assertEqual(re.match("^x{}+$", "xxx"), None)
+ self.assertNotEqual(re.match("^x{}+$", "x{}"), None)
+
+ def test_atomic_grouping(self):
+ """Test Atomic Grouping
+ test non-capturing groups of the form (?>...), which do
+ not maintain any stack point created within the group once the
+ group is finished being evaluated."""
+ pattern1 = re.compile(r'a(?>bc|b)c')
+ self.assertEqual(pattern1.match('abc'), None)
+ self.assertNotEqual(pattern1.match('abcc'), None)
+ self.assertEqual(re.match(r'(?>.*).', 'abc'), None)
+ self.assertNotEqual(re.match(r'(?>x)++', 'xxx'), None)
+ self.assertNotEqual(re.match(r'(?>x++)', 'xxx'), None)
+ self.assertEqual(re.match(r'(?>x)++x', 'xxx'), None)
+ self.assertEqual(re.match(r'(?>x++)x', 'xxx'), None)
def run_re_tests():
from test.re_tests import benchmarks, tests, SUCCEED, FAIL, SYNTAX_ERROR
=== modified file 'Modules/_sre.c'
--- old/Modules/_sre.c 2008-06-09 04:58:54 +0000
+++ new/Modules/_sre.c 2008-06-09 14:37:21 +0000
@@ -55,8 +55,8 @@
#define SRE_PY_MODULE "re"
-/* defining this one enables tracing */
-#undef VERBOSE
+/* uncomment this define to enable tracing */
+/* #define VERBOSE_SRE_ENGINE */
#if PY_VERSION_HEX >= 0x01060000
#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
@@ -101,7 +101,7 @@
#define SRE_ERROR_MEMORY -9 /* out of memory */
#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
-#if defined(VERBOSE)
+#if defined(VERBOSE_SRE_ENGINE)
#define TRACE(v) printf v
#else
#define TRACE(v)
@@ -775,6 +775,9 @@
#define JUMP_BRANCH 11
#define JUMP_ASSERT 12
#define JUMP_ASSERT_NOT 13
+#define JUMP_POSS_REPEAT_1 14
+#define JUMP_POSS_REPEAT_2 15
+#define JUMP_ATOMIC_GROUP 16
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
@@ -1162,6 +1165,57 @@
}
RETURN_FAILURE;
+ case SRE_OP_POSSESSIVE_ONE:
+ /* match repeated sequence (maximizing regexp) without
+ backtracking */
+
+ /* this operator only works if the repeated item is
+ exactly one character wide, and we're not already
+ collecting backtracking points. for other cases,
+ use the POSSESSIVE_REPEAT operator */
+
+ /* <1=min> <2=max> item
+ tail */
+
+ TRACE(("|%p|%p|POSSESSIVE_ONE %d %d\n", ctx->pattern,
+ ctx->ptr, ctx->pattern[1], ctx->pattern[2]));
+
+ if (ctx->ptr + ctx->pattern[1] > end) {
+ RETURN_FAILURE; /* cannot match */
+ }
+
+ state->ptr = ctx->ptr;
+
+ ret = SRE_COUNT(state, ctx->pattern + 3, ctx->pattern[2]);
+ RETURN_ON_ERROR(ret);
+ DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
+ ctx->count = ret;
+ ctx->ptr += ctx->count;
+
+ /* when we arrive here, count contains the number of
+ matches, and ctx->ptr points to the tail of the target
+ string. check if the rest of the pattern matches,
+ and fail if not. */
+
+ /* Test for not enough repetitions in match */
+ if (ctx->count < (Py_ssize_t) ctx->pattern[1]) {
+ RETURN_FAILURE;
+ }
+
+ /* Update the pattern to point to the next op code */
+ ctx->pattern += ctx->pattern[0];
+
+ /* Let the tail be evaluated separately and consider this
+ match successful. */
+ if (*ctx->pattern == SRE_OP_SUCCESS) {
+ /* tail is empty. we're finished */
+ state->ptr = ctx->ptr;
+ RETURN_SUCCESS;
+ }
+
+ /* Attempt to match the rest of the string */
+ break;
+
case SRE_OP_REPEAT:
/* create repeat context. all the hard work is done
by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
@@ -1317,10 +1371,141 @@
state->ptr = ctx->ptr;
RETURN_FAILURE;
+ case SRE_OP_POSSESSIVE_REPEAT:
+ /* create possessive repeat contexts. */
+ /* <1=min> <2=max> pattern
+ tail */
+ TRACE(("|%p|%p|POSSESSIVE_REPEAT %d %d\n", ctx->pattern,
+ ctx->ptr, ctx->pattern[1], ctx->pattern[2]));
+
+ /* Set the global Input pointer to this context's Input
+ pointer */
+ state->ptr = ctx->ptr;
+
+ /* Initialize Count to 0 */
+ ctx->count = 0;
+
+ /* Check for minimum required matches. */
+ while (ctx->count < (int)ctx->pattern[1]) {
+ /* not enough matches */
+ DO_JUMP(JUMP_POSS_REPEAT_1, jump_poss_repeat_1,
+ &ctx->pattern[3]);
+ if (ret) {
+ RETURN_ON_ERROR(ret);
+ ctx->count++;
+ }
+ else {
+ state->ptr = ctx->ptr;
+ RETURN_FAILURE;
+ }
+ }
+
+ /* Clear the context's Input stream pointer so that it
+ doesn't match the global state so that the while loop can
+ be entered. */
+ ctx->ptr = NULL;
+
+ /* Keep trying to parse the sub-pattern until the
+ end is reached, creating a new context each time. */
+ while ((ctx->count < (int)ctx->pattern[2] ||
+ (int)ctx->pattern[2] == 65535) &&
+ state->ptr != ctx->ptr) {
+ /* Save the Capture Group Marker state into the current
+ Context and back up the current highest number
+ Capture Group marker. */
+ LASTMARK_SAVE();
+ MARK_PUSH(ctx->lastmark);
+
+ /* zero-width match protection */
+ /* Set the context's Input Stream pointer to be the
+ current Input Stream pointer from the global
+ state. When the loop reaches the next iteration,
+ the context will then store the last known good
+ position with the global state holding the
+ Input Stream position that has been updated with
+ the most recent match. Thus, if state's Input
+ stream remains the same as the one stored in the
+ current Context, we know we have successfully
+ matched an empty string and that all subsequent
+ matches will also be the empty string until the
+ maximum number of matches are counted, and because
+ of this, we could immediately stop at that point and
+ consider this match successful. */
+ ctx->ptr = state->ptr;
+
+ /* We have not reached the maximum matches, so try to
+ match once more. */
+ DO_JUMP(JUMP_POSS_REPEAT_2, jump_poss_repeat_2,
+ &ctx->pattern[3]);
+
+ /* Check to see if the last attempted match
+ succeeded. */
+ if (ret) {
+ /* Drop the saved highest number Capture Group
+ marker saved above and use the newly updated
+ value. */
+ MARK_POP_DISCARD(ctx->lastmark);
+ RETURN_ON_ERROR(ret);
+
+ /* Success, increment the count. */
+ ctx->count++;
+ }
+ /* Last attempted match failed. */
+ else {
+ /* Restore the previously saved highest number
+ Capture Group marker since the last iteration
+ did not match, then restore that to the global
+ state. */
+ MARK_POP(ctx->lastmark);
+ LASTMARK_RESTORE();
+
+ /* We have sufficient matches, so exit loop. */
+ break;
+ }
+ }
+
+ /* Evaluate Tail */
+ /* Jump to end of pattern indicated by skip, and then skip
+ the SUCCESS op code that follows it. */
+ ctx->pattern += ctx->pattern[0] + 1;
+ ctx->ptr = state->ptr;
+ break;
+
+ case SRE_OP_ATOMIC_GROUP:
+ /* Atomic Group Sub Pattern */
+ /* pattern tail */
+ TRACE(("|%p|%p|ATOMIC_GROUP\n", ctx->pattern, ctx->ptr));
+
+ /* Set the global Input pointer to this context's Input
+ pointer */
+ state->ptr = ctx->ptr;
+
+ /* Evaluate the Atomic Group in a new context, terminating
+ when the end of the group, represented by a SUCCESS op
+ code, is reached. */
+ /* Group Pattern begins at an offset of 1 code. */
+ DO_JUMP(JUMP_ATOMIC_GROUP, jump_atomic_group,
+ &ctx->pattern[1]);
+
+ /* Test Exit Condition */
+ RETURN_ON_ERROR(ret);
+
+ if (ret == 0) {
+ /* Atomic Group failed to Match. */
+ state->ptr = ctx->ptr;
+ RETURN_FAILURE;
+ }
+
+ /* Evaluate Tail */
+ /* Jump to the end of the group as indicated by skip; the skip
+ count already covers the group's closing SUCCESS op code. */
+ ctx->pattern += ctx->pattern[0];
+ ctx->ptr = state->ptr;
+ break;
+
case SRE_OP_GROUPREF:
/* match backreference */
- TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
- ctx->ptr, ctx->pattern[0]));
+ TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern, ctx->ptr, ctx->pattern[0]));
i = ctx->pattern[0];
{
Py_ssize_t groupref = i+i;
@@ -1459,6 +1644,12 @@
case JUMP_MIN_UNTIL_1:
TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
goto jump_min_until_1;
+ case JUMP_POSS_REPEAT_1:
+ TRACE(("|%p|%p|JUMP_POSS_REPEAT_1\n", ctx->pattern, ctx->ptr));
+ goto jump_poss_repeat_1;
+ case JUMP_POSS_REPEAT_2:
+ TRACE(("|%p|%p|JUMP_POSS_REPEAT_2\n", ctx->pattern, ctx->ptr));
+ goto jump_poss_repeat_2;
case JUMP_REPEAT:
TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
goto jump_repeat;
@@ -1471,6 +1662,9 @@
case JUMP_MIN_REPEAT_ONE:
TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
goto jump_min_repeat_one;
+ case JUMP_ATOMIC_GROUP:
+ TRACE(("|%p|%p|JUMP_ATOMIC_GROUP\n", ctx->pattern, ctx->ptr));
+ goto jump_atomic_group;
case JUMP_ASSERT:
TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
goto jump_assert;
=== modified file 'Modules/sre_constants.h'
--- old/Modules/sre_constants.h 2003-10-17 22:13:16 +0000
+++ new/Modules/sre_constants.h 2008-06-04 19:22:10 +0000
@@ -11,7 +11,7 @@
* See the _sre.c file for information on usage and redistribution.
*/
-#define SRE_MAGIC 20031017
+#define SRE_MAGIC 20080329
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
@@ -44,6 +44,9 @@
#define SRE_OP_REPEAT_ONE 29
#define SRE_OP_SUBPATTERN 30
#define SRE_OP_MIN_REPEAT_ONE 31
+#define SRE_OP_ATOMIC_GROUP 32
+#define SRE_OP_POSSESSIVE_REPEAT 33
+#define SRE_OP_POSSESSIVE_ONE 34
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BEGINNING_STRING 2