--- Lib/re.py Sat Sep 20 17:00:36 2008 +++ Lib/re.py Mon Sep 22 01:19:52 2008 @@ -46,7 +46,7 @@ "|" A|B, creates an RE that will match either A or B. (...) Matches the RE inside the parentheses. The contents can be retrieved or matched later in the string. - (?iLmsux) Set the I, L, M, S, U, or X flag for the RE (see below). + (?iLmrsux) Set the I, L, M, R, S, U, or X flag for the RE (see below). (?:...) Non-capturing version of regular parentheses. (?P...) The substring matched by the group is accessible by name. (?P=name) Matches the text matched earlier by the group named name. @@ -97,6 +97,7 @@ as well as the string. "$" matches the end of lines (before a newline) as well as the end of the string. + R REVERSE Search backwards, from the end to the start. S DOTALL "." matches any character at all, including the newline. X VERBOSE Ignore whitespace and comments for nicer looking RE's. U UNICODE Make \w, \W, \b, \B, dependent on the Unicode locale. @@ -113,7 +114,7 @@ __all__ = [ "match", "search", "sub", "subn", "split", "findall", "compile", "purge", "template", "escape", "I", "L", "M", "S", "X", "U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE", - "UNICODE", "error" ] + "UNICODE", "REVERSE", "error" ] __version__ = "2.2.2" @@ -122,6 +123,7 @@ L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline +R = REVERSE = sre_compile.SRE_FLAG_REVERSE # search backwards S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments --- Lib/sre_constants.py Sat Sep 20 17:01:18 2008 +++ Lib/sre_constants.py Sun Sep 21 19:45:44 2008 @@ -299,6 +299,7 @@ SRE_FLAG_UNICODE = 32 # use unicode locale SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments SRE_FLAG_DEBUG = 128 # debugging +SRE_FLAG_REVERSE = 256 # search backwards # flags for INFO primitive SRE_INFO_PREFIX = 1 # has prefix @@ -340,6 +341,7 @@ f.write("#define SRE_FLAG_DOTALL %d\n" % SRE_FLAG_DOTALL) f.write("#define SRE_FLAG_UNICODE %d\n" % SRE_FLAG_UNICODE) f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE) + f.write("#define SRE_FLAG_REVERSE %d\n" % SRE_FLAG_REVERSE) f.write("#define SRE_INFO_PREFIX %d\n" % SRE_INFO_PREFIX) f.write("#define SRE_INFO_LITERAL %d\n" % SRE_INFO_LITERAL) --- Lib/sre_parse.py Sat Sep 20 17:01:14 2008 +++ Lib/sre_parse.py Sun Sep 21 20:35:19 2008 @@ -58,6 +58,7 @@ "i": SRE_FLAG_IGNORECASE, "L": SRE_FLAG_LOCALE, "m": SRE_FLAG_MULTILINE, + "r": SRE_FLAG_REVERSE, "s": SRE_FLAG_DOTALL, "x": SRE_FLAG_VERBOSE, # extensions --- Lib/sre_compile.py Sun Sep 21 19:45:15 2008 +++ Lib/sre_compile.py Mon Sep 22 01:00:01 2008 @@ -565,7 +565,11 @@ info = Record() info.group_count = 0 info.repeat_count = 0 - _compile(code, p.data, flags, info) + if flags & SRE_FLAG_REVERSE: + dir = -1 + else: + dir = 1 + _compile(code, p.data, flags, info, dir) code.append(OPCODES[SUCCESS]) --- Modules/_sre.c Mon Sep 22 01:39:28 2008 +++ Modules/_sre.c Mon Sep 22 01:03:17 2008 @@ -4,24 +4,25 @@ * regular expression matching engine * * partial history: - * 1999-10-24 fl created (based on existing template matcher code) - * 2000-03-06 fl first alpha, sort of - * 2000-08-01 fl fixes for 1.6b1 - * 2000-08-07 fl use PyOS_CheckStack() if available - * 2000-09-20 fl added expand method - * 2001-03-20 fl lots of fixes for 2.1b2 - * 2001-04-15 fl export copyright as Python attribute, not global - * 2001-04-28 fl added __copy__ methods (work in progress) - * 2001-05-14 fl fixes for 1.5.2 compatibility - * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis) - * 2001-10-18 fl fixed group reset issue (from Matthew Mueller) - * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1 - * 2001-10-21 fl added sub/subn primitive - * 2001-10-24 fl added finditer primitive (for 2.2 only) - * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum) - * 2002-11-09 fl fixed empty sub/subn return type - * 2003-04-18 mvl fully support 4-byte codes - * 2003-10-17 gn implemented non recursive scheme + * 1999-10-24 fl created (based on existing template matcher code) + * 2000-03-06 fl first alpha, sort of + * 2000-08-01 fl fixes for 1.6b1 + * 2000-08-07 fl use PyOS_CheckStack() if available + * 2000-09-20 fl added expand method + * 2001-03-20 fl lots of fixes for 2.1b2 + * 2001-04-15 fl export copyright as Python attribute, not global + * 2001-04-28 fl added __copy__ methods (work in progress) + * 2001-05-14 fl fixes for 1.5.2 compatibility + * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis) + * 2001-10-18 fl fixed group reset issue (from Matthew Mueller) + * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1 + * 2001-10-21 fl added sub/subn primitive + * 2001-10-24 fl added finditer primitive (for 2.2 only) + * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum) + * 2002-11-09 fl fixed empty sub/subn return type + * 2003-04-18 mvl fully support 4-byte codes + * 2003-10-17 gn implemented non recursive scheme + * 2008-09-21 mrab major reworking * * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. * @@ -5124,17 +5125,33 @@ context.text_beginning = (SRE_CHAR *)state->beginning; context.text_start = (SRE_CHAR *)state->start; context.text_end = (SRE_CHAR *)state->end; - context.text_ptr = (SRE_CHAR *)state->start; - while (context.text_ptr <= context.text_end) { - TRACE(("|%p|%p|SEARCH\n", pattern, context.text_ptr)); - if (SRE_LOOK_AHEAD_ONE(&context, state, look_ahead)) { - state->start = state->ptr = context.text_ptr; - status = SRE_MATCH(state, state->pattern_code); - if (status != 0) - break; + if (state->reverse) { + context.text_ptr = (SRE_CHAR *)state->end; + + while (context.text_ptr >= context.text_start) { + TRACE(("|%p|%p|SEARCH\n", pattern, context.text_ptr)); + if (SRE_LOOK_AHEAD_ONE(&context, state, look_ahead)) { + state->end = state->ptr = context.text_ptr; + status = SRE_MATCH(state, state->pattern_code); + if (status != 0) + break; + } + context.text_ptr--; + } + } else { + context.text_ptr = (SRE_CHAR *)state->start; + + while (context.text_ptr <= context.text_end) { + TRACE(("|%p|%p|SEARCH\n", pattern, context.text_ptr)); + if (SRE_LOOK_AHEAD_ONE(&context, state, look_ahead)) { + state->start = state->ptr = context.text_ptr; + status = SRE_MATCH(state, state->pattern_code); + if (status != 0) + break; + } + context.text_ptr++; } - context.text_ptr++; } return status; @@ -5343,6 +5360,8 @@ else state->lower = sre_lower; + state->reverse = pattern->flags & SRE_FLAG_REVERSE; + return string; error: @@ -5464,7 +5483,7 @@ if (!string) return NULL; - state.ptr = state.start; + state.ptr = state.reverse ? state.end : state.start; TRACE(("|%p|%p|MATCH\n", pattern_code, state.ptr)); @@ -5654,7 +5673,7 @@ state_reset(&state); - state.ptr = state.start; + state.ptr = state.reverse ? state.end : state.start; if (state.charsize == 1) { status = sre_search(&state, state.pattern_code); @@ -5677,8 +5696,13 @@ /* don't bother to build a match object */ switch (self->groups) { case 0: - b = STATE_OFFSET(&state, state.start); - e = STATE_OFFSET(&state, state.ptr); + if (state.reverse) { + b = STATE_OFFSET(&state, state.ptr); + e = STATE_OFFSET(&state, state.end); + } else { + b = STATE_OFFSET(&state, state.start); + e = STATE_OFFSET(&state, state.ptr); + } item = PySequence_GetSlice(string, b, e); if (!item) goto error; @@ -5708,11 +5732,17 @@ if (status < 0) goto error; - if (state.ptr == state.start) - state.start = (void*) ((char*) state.ptr + state.charsize); - else - state.start = state.ptr; - + if (state.reverse) { + if (state.ptr == state.end) + state.end = (void*) ((char*) state.ptr - state.charsize); + else + state.end = state.ptr; + } else { + if (state.ptr == state.start) + state.start = (void*) ((char*) state.ptr + state.charsize); + else + state.start = state.ptr; + } } state_fini(&state); @@ -5781,13 +5811,13 @@ } n = 0; - last = state.start; + last = state.reverse ? state.end : state.start; while (!maxsplit || n < maxsplit) { state_reset(&state); - state.ptr = state.start; + state.ptr = state.reverse ? state.end : state.start; if (state.charsize == 1) { status = sre_search(&state, state.pattern_code); @@ -5807,19 +5837,36 @@ goto error; } - if (state.start == state.ptr) { - if (last == state.end) - break; - /* skip one character */ - state.start = (void*) ((char*) state.ptr + state.charsize); - continue; + if (state.reverse) { + if (state.end == state.ptr) { + if (last == state.start) + break; + /* skip one character */ + state.end = (void*) ((char*) state.ptr - state.charsize); + continue; + } + } else { + if (state.start == state.ptr) { + if (last == state.end) + break; + /* skip one character */ + state.start = (void*) ((char*) state.ptr + state.charsize); + continue; + } } /* get segment before this match */ - item = PySequence_GetSlice( - string, STATE_OFFSET(&state, last), - STATE_OFFSET(&state, state.start) - ); + if (state.reverse) { + item = PySequence_GetSlice( + string, STATE_OFFSET(&state, state.end), + STATE_OFFSET(&state, last) + ); + } else { + item = PySequence_GetSlice( + string, STATE_OFFSET(&state, last), + STATE_OFFSET(&state, state.start) + ); + } if (!item) goto error; status = PyList_Append(list, item); @@ -5840,14 +5887,24 @@ n = n + 1; - last = state.start = state.ptr; + last = state.ptr; + if (state.reverse) + state.end = state.ptr; + else + state.start = state.ptr; } /* get segment following last match (even if empty) */ - item = PySequence_GetSlice( - string, STATE_OFFSET(&state, last), state.endpos - ); + if (state.reverse) + item = PySequence_GetSlice( + string, state.pos, STATE_OFFSET(&state, last) + ); + else + item = PySequence_GetSlice( + string, STATE_OFFSET(&state, last), state.endpos + ); + if (!item) goto error; status = PyList_Append(list, item); @@ -5936,13 +5993,14 @@ return NULL; } - n = i = 0; + n = 0; + i = STATE_OFFSET(&state, state.reverse ? state.end : state.start); while (!count || n < count) { state_reset(&state); - state.ptr = state.start; + state.ptr = state.reverse ? state.end : state.start; if (state.charsize == 1) { status = sre_search(&state, state.pattern_code); @@ -5962,12 +6020,20 @@ goto error; } - b = STATE_OFFSET(&state, state.start); - e = STATE_OFFSET(&state, state.ptr); + if (state.reverse) { + b = STATE_OFFSET(&state, state.ptr); + e = STATE_OFFSET(&state, state.end); + } else { + b = STATE_OFFSET(&state, state.start); + e = STATE_OFFSET(&state, state.ptr); + } - if (i < b) { + if (state.reverse ? i > e : i < b) { /* get segment before this match */ - item = PySequence_GetSlice(string, i, b); + if (state.reverse) + item = PySequence_GetSlice(string, e, i); + else + item = PySequence_GetSlice(string, i, b); if (!item) goto error; status = PyList_Append(list, item); @@ -6008,21 +6074,30 @@ goto error; } - i = e; + i = state.reverse ? b : e; n = n + 1; next: /* move on */ - if (state.ptr == state.start) - state.start = (void*) ((char*) state.ptr + state.charsize); - else - state.start = state.ptr; - + if (state.reverse) { + if (state.ptr == state.end) + state.end = (void*) ((char*) state.ptr - state.charsize); + else + state.end = state.ptr; + } else { + if (state.ptr == state.start) + state.start = (void*) ((char*) state.ptr + state.charsize); + else + state.start = state.ptr; + } } /* get segment following last match */ - if (i < state.endpos) { - item = PySequence_GetSlice(string, i, state.endpos); + if (state.reverse ? i > state.pos : i < state.endpos) { + if (state.reverse) + item = PySequence_GetSlice(string, state.pos, i); + else + item = PySequence_GetSlice(string, i, state.endpos); if (!item) goto error; status = PyList_Append(list, item); @@ -7259,8 +7334,13 @@ base = (char*) state->beginning; n = state->charsize; - match->mark[0] = ((char*) state->start - base) / n; - match->mark[1] = ((char*) state->ptr - base) / n; + if (state->reverse) { + match->mark[0] = ((char*) state->ptr - base) / n; + match->mark[1] = ((char*) state->end - base) / n; + } else { + match->mark[0] = ((char*) state->start - base) / n; + match->mark[1] = ((char*) state->ptr - base) / n; + } for (i = j = 0; i < pattern->groups; i++, j+=2) if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) { @@ -7310,7 +7390,7 @@ state_reset(state); - state->ptr = state->start; + state->ptr = state->reverse ? state->end : state->start; memset(state->mark, 0, state->pattern_code[0] * sizeof(SRE_CHAR*)); if (state->charsize == 1) { @@ -7326,10 +7406,17 @@ match = pattern_new_match((PatternObject*) self->pattern, state, status); - if (status == 0 || state->ptr == state->start) - state->start = (void*) ((char*) state->ptr + state->charsize); - else - state->start = state->ptr; + if (state->reverse) { + if (status == 0 || state->ptr == state->end) + state->end = (void*) ((char*) state->ptr - state->charsize); + else + state->end = state->ptr; + } else { + if (status == 0 || state->ptr == state->start) + state->start = (void*) ((char*) state->ptr + state->charsize); + else + state->start = state->ptr; + } return match; } @@ -7344,7 +7431,7 @@ state_reset(state); - state->ptr = state->start; + state->ptr = state->reverse ? state->end : state->start; if (state->charsize == 1) { status = sre_search(state, state->pattern_code); @@ -7359,10 +7446,17 @@ match = pattern_new_match((PatternObject*) self->pattern, state, status); - if (status == 0 || state->ptr == state->start) - state->start = (void*) ((char*) state->ptr + state->charsize); - else - state->start = state->ptr; + if (state->reverse) { + if (status == 0 || state->ptr == state->end) + state->end = (void*) ((char*) state->ptr - state->charsize); + else + state->end = state->ptr; + } else { + if (status == 0 || state->ptr == state->start) + state->start = (void*) ((char*) state->ptr + state->charsize); + else + state->start = state->ptr; + } return match; }