Index: Lib/sre_compile.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/sre_compile.py,v retrieving revision 1.47 diff -u -r1.47 sre_compile.py --- Lib/sre_compile.py 19 Apr 2003 12:56:07 -0000 1.47 +++ Lib/sre_compile.py 22 Jun 2003 20:15:33 -0000 @@ -48,21 +48,18 @@ emit(OPCODES[ANY_ALL]) else: emit(OPCODES[ANY]) - elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT): + elif op in (MIN_REPEAT, MAX_REPEAT): if flags & SRE_FLAG_TEMPLATE: raise error, "internal: unsupported template operator" - emit(OPCODES[REPEAT]) + emit(OPCODES[MAX_REPEAT]) skip = len(code); emit(0) emit(av[0]) emit(av[1]) _compile(code, av[2], flags) emit(OPCODES[SUCCESS]) code[skip] = len(code) - skip - elif _simple(av) and op != REPEAT: - if op == MAX_REPEAT: - emit(OPCODES[REPEAT_ONE]) - else: - emit(OPCODES[MIN_REPEAT_ONE]) + elif op == MAX_REPEAT and _simple(av): + emit(OPCODES[MAX_REPEAT_ONE]) skip = len(code); emit(0) emit(av[0]) emit(av[1]) @@ -70,16 +67,13 @@ emit(OPCODES[SUCCESS]) code[skip] = len(code) - skip else: - emit(OPCODES[REPEAT]) + emit(OPCODES[op]) skip = len(code); emit(0) emit(av[0]) emit(av[1]) _compile(code, av[2], flags) + emit(OPCODES[REPEAT_END]) code[skip] = len(code) - skip - if op == MAX_REPEAT: - emit(OPCODES[MAX_UNTIL]) - else: - emit(OPCODES[MIN_UNTIL]) elif op is SUBPATTERN: if av[0]: emit(OPCODES[MARK]) @@ -145,6 +139,19 @@ else: emit(OPCODES[op]) emit(av-1) + elif op is GROUPREF_EXISTS: + emit(OPCODES[op]) + emit((av[0]-1)*2) + skipyes = len(code); emit(0) + _compile(code, av[1], flags) + if av[2]: + emit(OPCODES[JUMP]) + skipno = len(code); emit(0) + code[skipyes] = len(code) - skipyes + 1 + _compile(code, av[2], flags) + code[skipno] = len(code) - skipno + else: + code[skipyes] = len(code) - skipyes + 1 else: raise ValueError, ("unsupported operand type", op) Index: Lib/sre_constants.py =================================================================== RCS file: 
/cvsroot/python/python/dist/src/Lib/sre_constants.py,v retrieving revision 1.32 diff -u -r1.32 sre_constants.py --- Lib/sre_constants.py 19 Apr 2003 12:56:07 -0000 1.32 +++ Lib/sre_constants.py 22 Jun 2003 20:15:33 -0000 @@ -13,7 +13,7 @@ # update when constants are added or removed -MAGIC = 20030419 +MAGIC = 20030614 # max code word in this release @@ -42,6 +42,7 @@ CHARSET = "charset" GROUPREF = "groupref" GROUPREF_IGNORE = "groupref_ignore" +GROUPREF_EXISTS = "groupref_exists" IN = "in" IN_IGNORE = "in_ignore" INFO = "info" @@ -50,17 +51,16 @@ LITERAL_IGNORE = "literal_ignore" MARK = "mark" MAX_REPEAT = "max_repeat" -MAX_UNTIL = "max_until" +MAX_REPEAT_END = "max_repeat_end" +MAX_REPEAT_ONE = "max_repeat_one" MIN_REPEAT = "min_repeat" -MIN_UNTIL = "min_until" +MIN_REPEAT_END = "min_repeat" NEGATE = "negate" NOT_LITERAL = "not_literal" NOT_LITERAL_IGNORE = "not_literal_ignore" RANGE = "range" -REPEAT = "repeat" -REPEAT_ONE = "repeat_one" +REPEAT_END = "repeat_end" SUBPATTERN = "subpattern" -MIN_REPEAT_ONE = "min_repeat_one" # positions AT_BEGINNING = "at_beginning" @@ -108,21 +108,22 @@ CALL, CATEGORY, CHARSET, BIGCHARSET, - GROUPREF, GROUPREF_IGNORE, + GROUPREF, GROUPREF_EXISTS, GROUPREF_IGNORE, IN, IN_IGNORE, INFO, JUMP, LITERAL, LITERAL_IGNORE, MARK, - MAX_UNTIL, - MIN_UNTIL, + MAX_REPEAT, + MAX_REPEAT_END, + MAX_REPEAT_ONE, + MIN_REPEAT, + MIN_REPEAT_END, NOT_LITERAL, NOT_LITERAL_IGNORE, NEGATE, RANGE, - REPEAT, - REPEAT_ONE, + REPEAT_END, SUBPATTERN, - MIN_REPEAT_ONE ] Index: Lib/sre_parse.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/sre_parse.py,v retrieving revision 1.57 diff -u -r1.57 sre_parse.py --- Lib/sre_parse.py 19 Apr 2003 08:37:23 -0000 1.57 +++ Lib/sre_parse.py 22 Jun 2003 20:15:36 -0000 @@ -364,6 +364,20 @@ subpattern.append((BRANCH, (None, items))) return subpattern +def _parse_sub_cond(source, state, condgroup): + item_yes = _parse(source, state) + if source.match("|"): + 
item_no = _parse(source, state) + if source.match("|"): + raise error, "conditional backref with more than two branches" + else: + item_no = None + if source.next and not source.match(")", 0): + raise error, "pattern not properly closed" + subpattern = SubPattern(state) + subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) + return subpattern + def _parse(source, state): # parse a simple pattern @@ -499,6 +513,7 @@ elif this == "(": group = 1 name = None + condgroup = None if source.match("?"): group = 0 # options @@ -568,6 +583,26 @@ else: subpattern.append((ASSERT_NOT, (dir, p))) continue + elif source.match("("): + # conditional backreference group + condname = "" + while 1: + char = source.get() + if char is None: + raise error, "unterminated name" + if char == ")": + break + condname = condname + char + group = 2 + if isname(condname): + condgroup = state.groupdict.get(condname) + if condgroup is None: + raise error, "unknown group name" + else: + try: + condgroup = atoi(condname) + except ValueError: + raise error, "bad character in group name" else: # flags if not source.next in FLAGS: @@ -581,7 +616,10 @@ group = None else: group = state.opengroup(name) - p = _parse_sub(source, state) + if condgroup: + p = _parse_sub_cond(source, state, condgroup) + else: + p = _parse_sub(source, state) if not source.match(")"): raise error, "unbalanced parenthesis" if group is not None: Index: Lib/test/test_re.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/test/test_re.py,v retrieving revision 1.43 diff -u -r1.43 test_re.py --- Lib/test/test_re.py 20 Jun 2003 00:25:14 -0000 1.43 +++ Lib/test/test_re.py 22 Jun 2003 20:15:43 -0000 @@ -398,26 +398,22 @@ self.assertRaises(re.error, re.compile, 'foo[a-') def test_bug_418626(self): - # bugs 418626 at al. -- Testing Greg Chapman's addition of op code - # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of - # pattern '*?' 
on a long string. self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001) self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0), 20003) self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001) - # non-simple '*?' still recurses and hits the recursion limit - self.assertRaises(RuntimeError, re.search, '(a|b)*?c', 10000*'ab'+'cd') + self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001) def test_bug_612074(self): pat=u"["+re.escape(u"\u2039")+u"]" self.assertEqual(re.compile(pat) and 1, 1) def test_stack_overflow(self): - # nasty case that overflows the straightforward recursive + # nasty case that used to overflow the straightforward recursive # implementation of repeated groups. - self.assertRaises(RuntimeError, re.match, '(x)*', 50000*'x') - self.assertRaises(RuntimeError, re.match, '(x)*y', 50000*'x'+'y') - self.assertRaises(RuntimeError, re.match, '(x)*?y', 50000*'x'+'y') + self.assertEqual(re.match('(x)*', 50000*'x').end(0), 50000) + self.assertEqual(re.match('(x)*y', 50000*'x'+'y').end(0), 50001) + self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').end(0), 50001) def test_scanner(self): def s_ident(scanner, token): return token @@ -489,6 +485,7 @@ pass for t in tests: + #print t # XXX <- remove this sys.stdout.flush() pattern = s = outcome = repl = expected = None if len(t) == 5: Index: Modules/_sre.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Modules/_sre.c,v retrieving revision 2.98 diff -u -r2.98 _sre.c --- Modules/_sre.c 9 Jun 2003 08:22:11 -0000 2.98 +++ Modules/_sre.c 22 Jun 2003 20:15:57 -0000 @@ -52,6 +52,7 @@ /* defining this one enables tracing */ #undef VERBOSE +//#define VERBOSE 1 #if PY_VERSION_HEX >= 0x01060000 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE) @@ -275,67 +276,83 @@ /* helpers */ static void -mark_fini(SRE_STATE* state) +data_stack_dealloc(SRE_STATE* state) { - if (state->mark_stack) { - 
free(state->mark_stack); - state->mark_stack = NULL; + if (state->data_stack) { + free(state->data_stack); + state->data_stack = NULL; } - state->mark_stack_size = state->mark_stack_base = 0; + state->data_stack_size = state->data_stack_base = 0; } static int -mark_save(SRE_STATE* state, int lo, int hi, int *mark_stack_base) +data_stack_grow(SRE_STATE* state, int size) { + int minsize, cursize; void* stack; - int size; - int minsize, newsize; - if (hi <= lo) - return 0; + minsize = state->data_stack_base + size; + cursize = state->data_stack_size; - size = (hi - lo) + 1; - - newsize = state->mark_stack_size; - minsize = state->mark_stack_base + size; - - if (newsize < minsize) { + if (cursize < minsize) { /* create new stack */ - if (!newsize) { - newsize = 512; - if (newsize < minsize) - newsize = minsize; - TRACE(("allocate stack %d\n", newsize)); - stack = malloc(sizeof(void*) * newsize); + if (!cursize) { + cursize = 512; + if (cursize < minsize) + cursize = minsize; + TRACE(("allocate stack %d\n", cursize)); + stack = malloc(sizeof(void*) * cursize); } else { /* grow the stack */ - while (newsize < minsize) - newsize += newsize; - TRACE(("grow stack to %d\n", newsize)); - stack = realloc(state->mark_stack, sizeof(void*) * newsize); + cursize += 1024; + while (cursize < minsize) + cursize += cursize; + TRACE(("grow stack to %d\n", cursize)); + stack = realloc(state->data_stack, sizeof(void*) * cursize); } if (!stack) { - mark_fini(state); + data_stack_dealloc(state); return SRE_ERROR_MEMORY; } - state->mark_stack = stack; - state->mark_stack_size = newsize; + state->data_stack = stack; + state->data_stack_size = cursize; } - TRACE(("copy %d:%d to %d (%d)\n", lo, hi, state->mark_stack_base, size)); + return 0; +} + +static int +mark_save(SRE_STATE* state, int lo, int hi, int *data_stack_base) +{ + int size; + int i; + + if (hi <= lo) + return 0; + + size = (hi - lo) + 1; + + if (*data_stack_base != -1) + state->data_stack_base = *data_stack_base; + + i = 
data_stack_grow(state, size+1); + if (i) + return i; - memcpy(state->mark_stack + state->mark_stack_base, state->mark + lo, + TRACE(("copy %d:%d to %d (%d)\n", lo, hi, state->data_stack_base, size)); + + memcpy(state->data_stack + state->data_stack_base, state->mark + lo, size * sizeof(void*)); - state->mark_stack_base += size; + state->data_stack_base += size; - *mark_stack_base = state->mark_stack_base; + *data_stack_base = state->data_stack_base; return 0; } static int -mark_restore(SRE_STATE* state, int lo, int hi, int *mark_stack_base) +mark_restore(SRE_STATE* state, int lo, int hi, int *data_stack_base) { int size; @@ -344,16 +361,56 @@ size = (hi - lo) + 1; - state->mark_stack_base = *mark_stack_base - size; + *data_stack_base -= size; + state->data_stack_base = *data_stack_base; - TRACE(("copy %d:%d from %d\n", lo, hi, state->mark_stack_base)); + TRACE(("copy %d:%d from %d\n", lo, hi, state->data_stack_base)); - memcpy(state->mark + lo, state->mark_stack + state->mark_stack_base, + memcpy(state->mark + lo, state->data_stack + state->data_stack_base, size * sizeof(void*)); return 0; } +static int +data_save(SRE_STATE* state, void *data, int size, int *data_stack_base) +{ + int stacksize = (size+sizeof(void*)-1)/sizeof(void*); + int i; + + if (*data_stack_base != -1) + state->data_stack_base = *data_stack_base; + + i = data_stack_grow(state, stacksize); + if (i) + return i; + + TRACE(("copy data in %p to %d (%d/%d)\n", data, state->data_stack_base, stacksize, size)); + + memcpy(state->data_stack + state->data_stack_base, data, size); + + state->data_stack_base += stacksize; + + *data_stack_base = state->data_stack_base; + + return 0; +} + +static int +data_restore(SRE_STATE* state, void *data, int size, int *data_stack_base) +{ + int stacksize = (size+sizeof(void*)-1)/sizeof(void*); + + *data_stack_base -= stacksize; + state->data_stack_base = *data_stack_base; + + TRACE(("copy data to %p from %d (%d/%d)\n", data, state->data_stack_base, stacksize, size)); + 
+ memcpy(data, state->data_stack + state->data_stack_base, size); + + return 0; +} + /* generate 8-bit version */ #define SRE_CHAR unsigned char @@ -578,7 +635,7 @@ } } -LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level); +LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level, int tail); LOCAL(int) SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount, int level) @@ -651,7 +708,7 @@ /* repeated single character pattern */ TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr)); while ((SRE_CHAR*) state->ptr < end) { - i = SRE_MATCH(state, pattern, level); + i = SRE_MATCH(state, pattern, level, 0); if (i < 0) return i; if (!i) @@ -729,28 +786,44 @@ } while (0) #define LASTMARK_RESTORE() \ do { \ - if (state->lastmark > lastmark) { \ - memset(state->mark + lastmark + 1, 0, \ - (state->lastmark - lastmark) * sizeof(void*)); \ - state->lastmark = lastmark; \ - state->lastindex = lastindex; \ - } \ + state->lastmark = lastmark; \ + state->lastindex = lastindex; \ + } while (0) + +#define DATA_SAVE(s,x) data_save(state, (x), sizeof(x), &data_stack_base) +#define DATA_RESTORE(s,x) data_restore(state, (x), sizeof(x), &data_stack_base) + +#define STATE_SAVE() \ + do { \ + i = mark_save(state, 0, state->lastmark, &data_stack_base); \ + if (i < 0) return i; \ + i = data_save(state, state, sizeof(SRE_STATE_HEAD), &data_stack_base); \ + if (i < 0) return i; \ + } while (0) +#define STATE_RESTORE() \ + do { \ + i = data_restore(state, state, sizeof(SRE_STATE_HEAD), &data_stack_base); \ + if (i < 0) return i; \ + i = mark_restore(state, 0, state->lastmark, &data_stack_base); \ + if (i < 0) return i; \ } while (0) LOCAL(int) -SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) +SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level, int tail) { /* check if string matches the given pattern. 
returns <0 for error, 0 for failure, and 1 for success */ SRE_CHAR* end = state->end; SRE_CHAR* ptr = state->ptr; + SRE_CHAR* last_ptr; int i, count; - SRE_REPEAT* rp; - int lastmark, lastindex, mark_stack_base; + int lastmark, lastindex; + int data_stack_base = -1; SRE_CODE chr; - SRE_REPEAT rep; /* FIXME: allocate in STATE instead */ + SRE_REPEAT rep; + SRE_REPEAT *repp; TRACE(("|%p|%p|ENTER %d\n", pattern, ptr, level)); @@ -862,14 +935,19 @@ TRACE(("|%p|%p|GROUPREF %d\n", pattern, ptr, pattern[0])); i = pattern[0]; { - SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i]; - SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1]; - if (!p || !e || e < p) + int groupref = i+i; + if (groupref >= state->lastmark) { return 0; - while (p < e) { - if (ptr >= end || *ptr != *p) + } else { + SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; + SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; + if (!p || !e || e < p) return 0; - p++; ptr++; + while (p < e) { + if (ptr >= end || *ptr != *p) + return 0; + p++; ptr++; + } } } pattern++; @@ -880,20 +958,46 @@ TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", pattern, ptr, pattern[0])); i = pattern[0]; { - SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i]; - SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1]; - if (!p || !e || e < p) + int groupref = i+i; + if (groupref >= state->lastmark) { return 0; - while (p < e) { - if (ptr >= end || - state->lower(*ptr) != state->lower(*p)) + } else { + SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; + SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; + if (!p || !e || e < p) return 0; - p++; ptr++; + while (p < e) { + if (ptr >= end || + state->lower(*ptr) != state->lower(*p)) + return 0; + p++; ptr++; + } } } pattern++; break; + case SRE_OP_GROUPREF_EXISTS: + TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", pattern, ptr, pattern[0])); + /* codeyes codeno ... 
*/ + i = pattern[0]; + { + int groupref = i+i; + if (groupref >= state->lastmark) { + pattern += pattern[1]; + break; + } else { + SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; + SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; + if (!p || !e || e < p) { + pattern += pattern[1]; + break; + } + } + } + pattern += 2; + break; + case SRE_OP_LITERAL_IGNORE: TRACE(("|%p|%p|LITERAL_IGNORE %d\n", pattern, ptr, pattern[0])); if (ptr >= end || @@ -928,8 +1032,16 @@ i = pattern[0]; if (i & 1) state->lastindex = i/2 + 1; - if (i > state->lastmark) + if (i > state->lastmark) { + /* state->lastmark is the highest valid index in the + state->mark array. If it is increased by more than 1, + the intervening marks must be set to NULL to signal + that these marks have not been encountered. */ + int j = state->lastmark + 1; + while (j < i) + state->mark[j++] = NULL; state->lastmark = i; + } state->mark[i] = ptr; pattern++; break; @@ -944,12 +1056,12 @@ case SRE_OP_ASSERT: /* assert subpattern */ - /* */ + /* */ TRACE(("|%p|%p|ASSERT %d\n", pattern, ptr, pattern[1])); state->ptr = ptr - pattern[1]; if (state->ptr < state->beginning) return 0; - i = SRE_MATCH(state, pattern + 2, level + 1); + i = SRE_MATCH(state, pattern + 2, level + 1, 0); if (i <= 0) return i; pattern += pattern[0]; @@ -957,15 +1069,17 @@ case SRE_OP_ASSERT_NOT: /* assert not subpattern */ - /* */ + /* */ TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern, ptr, pattern[1])); - state->ptr = ptr - pattern[1]; - if (state->ptr >= state->beginning) { - i = SRE_MATCH(state, pattern + 2, level + 1); + if (((void*)ptr - pattern[1]) >= state->beginning) { + STATE_SAVE(); + state->ptr = ptr - pattern[1]; + i = SRE_MATCH(state, pattern + 2, level + 1, 0); if (i < 0) return i; if (i) return 0; + STATE_RESTORE(); } pattern += pattern[0]; break; @@ -974,33 +1088,31 @@ /* alternation */ /* <0=skip> code ... 
*/ TRACE(("|%p|%p|BRANCH\n", pattern, ptr)); - LASTMARK_SAVE(); - if (state->repeat) { - i = mark_save(state, 0, lastmark, &mark_stack_base); - if (i < 0) - return i; - } - for (; pattern[0]; pattern += pattern[0]) { - if (pattern[1] == SRE_OP_LITERAL && - (ptr >= end || (SRE_CODE) *ptr != pattern[2])) - continue; - if (pattern[1] == SRE_OP_IN && - (ptr >= end || !SRE_CHARSET(pattern + 3, (SRE_CODE) *ptr))) - continue; - state->ptr = ptr; - i = SRE_MATCH(state, pattern + 1, level + 1); - if (i) - return i; - if (state->repeat) { - i = mark_restore(state, 0, lastmark, &mark_stack_base); - if (i < 0) + { + int inside_repeat = (state->repeat != NULL); + LASTMARK_SAVE(); + for (; pattern[0]; pattern += pattern[0]) { + if (inside_repeat) + STATE_SAVE(); + if (pattern[1] == SRE_OP_LITERAL && + (ptr >= end || (SRE_CODE) *ptr != pattern[2])) + continue; + if (pattern[1] == SRE_OP_IN && + (ptr >= end || !SRE_CHARSET(pattern + 3, (SRE_CODE) *ptr))) + continue; + state->ptr = ptr; + i = SRE_MATCH(state, pattern + 1, level + 1, 1); + if (i) return i; + if (inside_repeat) + STATE_RESTORE(); + else + LASTMARK_RESTORE(); } - LASTMARK_RESTORE(); } return 0; - case SRE_OP_REPEAT_ONE: + case SRE_OP_MAX_REPEAT_ONE: /* match repeated sequence (maximizing regexp) */ /* this operator only works if the repeated item is @@ -1008,9 +1120,9 @@ collecting backtracking points. 
for other cases, use the MAX_REPEAT operator */ - /* <1=min> <2=max> item tail */ + /* <1=min> <2=max> item tail */ - TRACE(("|%p|%p|REPEAT_ONE %d %d\n", pattern, ptr, + TRACE(("|%p|%p|MAX_REPEAT_ONE %d %d\n", pattern, ptr, pattern[1], pattern[2])); if (ptr + pattern[1] > end) @@ -1053,7 +1165,7 @@ if (count < (int) pattern[1]) break; state->ptr = ptr; - i = SRE_MATCH(state, pattern + pattern[0], level + 1); + i = SRE_MATCH(state, pattern + pattern[0], level + 1, 1); if (i) return i; ptr--; @@ -1065,7 +1177,7 @@ /* general case */ while (count >= (int) pattern[1]) { state->ptr = ptr; - i = SRE_MATCH(state, pattern + pattern[0], level + 1); + i = SRE_MATCH(state, pattern + pattern[0], level + 1, 1); if (i) return i; ptr--; @@ -1075,198 +1187,325 @@ } return 0; - case SRE_OP_MIN_REPEAT_ONE: - /* match repeated sequence (minimizing regexp) */ + case SRE_OP_MIN_REPEAT: + /* non-greedy repeat */ + /* <1=min> <2=max> item tail */ + TRACE(("|%p|%p|MIN_REPEAT %p %d %d\n", pattern, ptr, + pattern, pattern[1], pattern[2])); - /* this operator only works if the repeated item is - exactly one character wide, and we're not already - collecting backtracking points. for other cases, - use the MIN_REPEAT operator */ - - /* <1=min> <2=max> item tail */ + rep.prev = state->repeat; + rep.count = 0; + rep.pattern = pattern; + rep.last_ptr = NULL; + + state->ptr = ptr; + state->repeat = &rep; - TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", pattern, ptr, - pattern[1], pattern[2])); + /* try to match the minimum */ + while (rep.count < pattern[1]) { + /* no need to check for infinite loops here, since + * the minimum count is always reach */ + i = SRE_MATCH(state, pattern + 3, level + 1, 0); + if (i <= 0) + return 0; + rep.count += 1; + } - if (ptr + pattern[1] > end) - return 0; /* cannot match */ + /* try to match the tail until the maximum */ + if (pattern[pattern[0]] == SRE_OP_LITERAL) { + /* tail starts with a literal. 
skip positions where + the rest of the pattern cannot possibly match */ + chr = pattern[pattern[0]+1]; + for (;;) { + STATE_SAVE(); - state->ptr = ptr; + if (chr == *(SRE_CHAR*)state->ptr) { + state->repeat = rep.prev; + i = SRE_MATCH(state, pattern + pattern[0], level + 1, 1); + if (i) + return i; + state->repeat = &rep; + } - if (pattern[1] == 0) - count = 0; - else { - /* count using pattern min as the maximum */ - count = SRE_COUNT(state, pattern + 3, pattern[1], level + 1); + if (rep.count >= pattern[2] && pattern[2] != 65535) + return 0; - if (count < 0) - return count; /* exception */ - if (count < (int) pattern[1]) - return 0; /* did not match minimum number of times */ - ptr += count; /* advance past minimum matches of repeat */ - } - - if (pattern[pattern[0]] == SRE_OP_SUCCESS) { - /* tail is empty. we're finished */ - state->ptr = ptr; - return 1; + STATE_RESTORE(); + /* try one more item */ + last_ptr = rep.last_ptr = state->ptr; + i = SRE_MATCH(state, pattern + 3, level + 1, 0); + if (i <= 0) + return 0; + /* no infinite loops, please */ + if (state->ptr == last_ptr) + return 0; + rep.count += 1; + } } else { - /* general case */ - int matchmax = ((int)pattern[2] == 65535); - int c; - LASTMARK_SAVE(); - while (matchmax || count <= (int) pattern[2]) { - state->ptr = ptr; - i = SRE_MATCH(state, pattern + pattern[0], level + 1); + /* generic case */ + for (;;) { + TRACE(("MIN_REPEAT trying tail\n")); + STATE_SAVE(); + + /* if the tail matches, we're done */ + ptr = state->ptr; + state->repeat = rep.prev; + i = SRE_MATCH(state, pattern + pattern[0], level + 1, 1); if (i) return i; - state->ptr = ptr; - c = SRE_COUNT(state, pattern+3, 1, level+1); - if (c < 0) - return c; - if (c == 0) - break; - assert(c == 1); - ptr++; - count++; - LASTMARK_RESTORE(); + state->repeat = &rep; + + STATE_RESTORE(); + + TRACE(("MIN_REPEAT tail failed\n")); + if (rep.count >= pattern[2] && pattern[2] != 65535) + return 0; + + TRACE(("MIN_REPEAT trying one more item\n")); + + 
/* try one more item */ + last_ptr = rep.last_ptr = state->ptr; + i = SRE_MATCH(state, pattern + 3, level + 1, 0); + TRACE(("MIN_REPEAT checking result\n")); + if (i <= 0) + return 0; + /* no infinite loops, please */ + if (state->ptr == last_ptr) + return 0; + rep.count += 1; + TRACE(("MIN_REPEAT item succeeded\n")); } } - return 0; - case SRE_OP_REPEAT: - /* create repeat context. all the hard work is done - by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ - /* <1=min> <2=max> item tail */ - TRACE(("|%p|%p|REPEAT %d %d\n", pattern, ptr, - pattern[1], pattern[2])); + return 0; - rep.count = -1; - rep.pattern = pattern; + case SRE_OP_MAX_REPEAT: + /* greedy repeat */ + /* <1=min> <2=max> item tail */ + TRACE(("|%p|%p|MAX_REPEAT %p %d %d\n", pattern, ptr, + pattern, pattern[1], pattern[2])); - /* install new repeat context */ rep.prev = state->repeat; - state->repeat = &rep; - + rep.count = 0; + rep.pattern = pattern; + rep.last_ptr = NULL; + state->ptr = ptr; - i = SRE_MATCH(state, pattern + pattern[0], level + 1); + state->repeat = &rep; - state->repeat = rep.prev; + /* try to match the minimum */ + TRACE(("MAX_REPEAT starting\n")); + while (rep.count < pattern[1]) { + /* no need to check for infinite loops here, since + * the minimum count is always reach */ + TRACE(("MAX_REPEAT trying one item for the minimium\n")); + i = SRE_MATCH(state, pattern + 3, level + 1, 0); + if (i <= 0) + return 0; + rep.count += 1; + } - return i; + STATE_SAVE(); - case SRE_OP_MAX_UNTIL: - /* maximizing repeat */ - /* <1=min> <2=max> item tail */ + TRACE(("MAX_REPEAT minimum succeeded\n")); - /* FIXME: we probably need to deal with zero-width - matches in here... 
*/ + /* try to match more */ + if (pattern[pattern[0]] == SRE_OP_LITERAL) { + /* tail starts with a literal */ + TRACE(("MAX_REPEAT will try to match more, with literal\n")); + DATA_SAVE(state, &rep.count); + chr = pattern[pattern[0]+1]; + while (rep.count < pattern[2] || pattern[2] == 65535) { + TRACE(("MAX_REPEAT trying to match more, with literal\n")); + last_ptr = rep.last_ptr = state->ptr; + i = SRE_MATCH(state, pattern + 3, level + 1, 0); + if (i < 0) + return i; + if (i == 0) + break; + /* don't save when we're sure the tail won't match */ + rep.count += 1; + if (*(SRE_CHAR*)state->ptr == chr) { + STATE_SAVE(); + DATA_SAVE(state, &rep.count); + } + /* no infinite loops, please */ + if (state->ptr == last_ptr) + break; + } + DATA_RESTORE(state, &rep.count); + } else { + /* generic */ + TRACE(("MAX_REPEAT will try to match more\n")); + while (rep.count < pattern[2] || pattern[2] == 65535) { + last_ptr = rep.last_ptr = state->ptr; + TRACE(("MAX_REPEAT trying to match more (%c)\n", *(SRE_CHAR*)state->ptr)); + i = SRE_MATCH(state, pattern + 3, level + 1, 0); + if (i < 0) + return i; + if (i == 0) + break; + STATE_SAVE(); + rep.count += 1; + /* no infinite loops, please */ + if (state->ptr == last_ptr) { + TRACE(("MAX_REPEAT would get in infinite loop while trying to match more (%c).\n", *(SRE_CHAR*)state->ptr)); + break; + } + } + } - rp = state->repeat; - if (!rp) - return SRE_ERROR_STATE; + STATE_RESTORE(); - state->ptr = ptr; + /* try to match the tail until the minimum */ + if (pattern[pattern[0]] == SRE_OP_LITERAL) { + /* tail starts with a literal */ + TRACE(("MAX_REPEAT will try to match tail, with literal\n")); + for (;;) { + /* if the tail matches, we're done */ + TRACE(("MAX_REPEAT trying to match tail, with literal\n")); + state->repeat = rep.prev; + i = SRE_MATCH(state, pattern + pattern[0], level + 1, 1); + if (i) + return i; + state->repeat = &rep; + TRACE(("MAX_REPEAT tail failed, with literal\n")); - count = rp->count + 1; + if (rep.count == 
pattern[1]) + return 0; + TRACE(("MAX_REPEAT count not exceeded, try with one less, with literal\n")); - TRACE(("|%p|%p|MAX_UNTIL %d\n", pattern, ptr, count)); + /* try with one less match */ + DATA_RESTORE(state, &rep.count); + STATE_RESTORE(); + } + } else { + /* generic case */ + for (;;) { + /* if the tail matches, we're done */ + state->repeat = rep.prev; + i = SRE_MATCH(state, pattern + pattern[0], level + 1, 1); + if (i) + return i; + state->repeat = &rep; - if (count < rp->pattern[1]) { - /* not enough matches */ - rp->count = count; - /* RECURSIVE */ - i = SRE_MATCH(state, rp->pattern + 3, level + 1); - if (i) - return i; - rp->count = count - 1; - state->ptr = ptr; - return 0; - } + if (rep.count == pattern[1]) + return 0; - if (count < rp->pattern[2] || rp->pattern[2] == 65535) { - /* we may have enough matches, but if we can - match another item, do so */ - rp->count = count; - LASTMARK_SAVE(); - i = mark_save(state, 0, lastmark, &mark_stack_base); - if (i < 0) - return i; - /* RECURSIVE */ - i = SRE_MATCH(state, rp->pattern + 3, level + 1); - if (i) - return i; - i = mark_restore(state, 0, lastmark, &mark_stack_base); - if (i < 0) - return i; - LASTMARK_RESTORE(); - rp->count = count - 1; - state->ptr = ptr; + /* try with one less match */ + STATE_RESTORE(); + rep.count -= 1; + } } - /* cannot match more repeated items here. 
make sure the - tail matches */ - state->repeat = rp->prev; - i = SRE_MATCH(state, pattern, level + 1); - if (i) - return i; - state->repeat = rp; - state->ptr = ptr; return 0; - case SRE_OP_MIN_UNTIL: - /* minimizing repeat */ - /* <1=min> <2=max> item tail */ + case SRE_OP_REPEAT_END: + /* repeat end */ + /* */ + + /* this is currently the only truly recursive opcode */ - rp = state->repeat; - if (!rp) + repp = state->repeat; + if (!repp) return SRE_ERROR_STATE; + TRACE(("|%p|%p|REPEAT_END %p %d %d\n", pattern, ptr, + repp->pattern, repp->pattern[1], repp->pattern[2])); + state->ptr = ptr; - count = rp->count + 1; + if (tail) { + STATE_SAVE(); + count = repp->count; + repp->count += 1; + + TRACE(("REPEAT_END will check tail (count=%d)\n", repp->count)); + + if (repp->count < repp->pattern[1]) { + + /* ensure the minimum is matched */ + + TRACE(("REPEAT_END trying to match minimum\n")); + i = SRE_MATCH(state, repp->pattern + 3, level + 1, 0); + if (i <= 0) { + TRACE(("REPEAT_END minimum not reached\n")); + repp->count = count; + STATE_RESTORE(); + return 0; + } - TRACE(("|%p|%p|MIN_UNTIL %d %p\n", pattern, ptr, count, - rp->pattern)); + } else { - if (count < rp->pattern[1]) { - /* not enough matches */ - rp->count = count; - /* RECURSIVE */ - i = SRE_MATCH(state, rp->pattern + 3, level + 1); - if (i) - return i; - rp->count = count-1; - state->ptr = ptr; - return 0; - } + /* check the tail */ - LASTMARK_SAVE(); + TRACE(("REPEAT_END trying to match tail\n")); + LASTMARK_SAVE(); - /* see if the tail matches */ - state->repeat = rp->prev; - i = SRE_MATCH(state, pattern, level + 1); - if (i) - return i; + state->repeat = repp->prev; + i = SRE_MATCH(state, repp->pattern + repp->pattern[0], level + 1, 1); + if (i < 0) + return i; + state->repeat = repp; - state->ptr = ptr; - state->repeat = rp; + if (i == 0) { + TRACE(("REPEAT_END tail not matched\n")); - if (count >= rp->pattern[2] && rp->pattern[2] != 65535) - return 0; + /* check maximum count and infinite recursion 
*/ + if (repp->count >= repp->pattern[2] && repp->pattern[2] != 65535) { + TRACE(("REPEAT_END count exceeded\n")); + repp->count = count; + STATE_RESTORE(); + return 0; + } + + if (repp->last_ptr >= state->ptr) { + TRACE(("REPEAT_END infinite recursion detected\n")); + repp->count = count; + STATE_RESTORE(); + return 0; + } + + LASTMARK_RESTORE(); + state->ptr = ptr; + + TRACE(("REPEAT_END trying one more item\n")); + + /* try one more item */ + + repp->last_ptr = state->ptr; + i = SRE_MATCH(state, repp->pattern + 3, level + 1, 0); + if (i <= 0) { + TRACE(("REPEAT_END one more item failed\n")); + repp->count = count; + STATE_RESTORE(); + return 0; + } + repp->count += 1; + TRACE(("REPEAT_END one more item succeeded\n")); + } - LASTMARK_RESTORE(); + // no need to test tail again, since if we got here, + // is because some item inside the repeated pattern + // is trying to check the tail, so it will check + // again when we execute the item above - rp->count = count; - /* RECURSIVE */ - i = SRE_MATCH(state, rp->pattern + 3, level + 1); - if (i) - return i; - rp->count = count - 1; - state->ptr = ptr; + } + TRACE(("REPEAT_END tail succeeded\n")); - return 0; + repp->count = count; + + /* we must restore here because we're going to + * return a success value below, but the match + * ended before the tail matched */ + STATE_RESTORE(); + + repp->last_ptr = state->ptr; + } + TRACE(("REPEAT_END exitting with success\n")); + + return 1; default: TRACE(("|%p|%p|UNKNOWN %d\n", pattern, ptr, pattern[-1])); @@ -1297,7 +1536,7 @@ flags = pattern[2]; - if (pattern[3] > 0) { + if (pattern[3] > 1) { /* adjust end point (but make sure we leave at least one character in there, so literal search will work) */ end -= pattern[3]-1; @@ -1344,7 +1583,7 @@ state->ptr = ptr + 1 - prefix_len + prefix_skip; if (flags & SRE_INFO_LITERAL) return 1; /* we got all of it */ - status = SRE_MATCH(state, pattern + 2*prefix_skip, 1); + status = SRE_MATCH(state, pattern + 2*prefix_skip, 1, 0); if 
(status != 0) return status; /* close but no cigar -- try again */ @@ -1375,7 +1614,7 @@ state->ptr = ++ptr; if (flags & SRE_INFO_LITERAL) return 1; /* we got all of it */ - status = SRE_MATCH(state, pattern + 2, 1); + status = SRE_MATCH(state, pattern + 2, 1, 0); if (status != 0) break; } @@ -1390,7 +1629,7 @@ TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr)); state->start = ptr; state->ptr = ptr; - status = SRE_MATCH(state, pattern, 1); + status = SRE_MATCH(state, pattern, 1, 0); if (status != 0) break; ptr++; @@ -1400,7 +1639,7 @@ while (ptr <= end) { TRACE(("|%p|%p|SEARCH\n", pattern, ptr)); state->start = state->ptr = ptr++; - status = SRE_MATCH(state, pattern, 1); + status = SRE_MATCH(state, pattern, 1, 0); if (status != 0) break; } @@ -1511,16 +1750,12 @@ LOCAL(void) state_reset(SRE_STATE* state) { - state->lastmark = 0; - - /* FIXME: dynamic! */ - memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE); - + state->lastmark = -1; state->lastindex = -1; state->repeat = NULL; - mark_fini(state); + data_stack_dealloc(state); } static void* @@ -1600,6 +1835,7 @@ memset(state, 0, sizeof(SRE_STATE)); + state->lastmark = -1; state->lastindex = -1; ptr = getstring(string, &length, &charsize); @@ -1647,7 +1883,7 @@ state_fini(SRE_STATE* state) { Py_XDECREF(state->string); - mark_fini(state); + data_stack_dealloc(state); } /* calculate offset from start of string */ @@ -1661,7 +1897,7 @@ index = (index - 1) * 2; - if (string == Py_None || !state->mark[index] || !state->mark[index+1]) { + if (index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) { if (empty) /* want empty string */ i = j = 0; @@ -1823,10 +2059,10 @@ TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr)); if (state.charsize == 1) { - status = sre_match(&state, PatternObject_GetCode(self), 1); + status = sre_match(&state, PatternObject_GetCode(self), 1, 0); } else { #if defined(HAVE_UNICODE) - status = sre_umatch(&state, PatternObject_GetCode(self), 1); + status = 
sre_umatch(&state, PatternObject_GetCode(self), 1, 0); #endif } @@ -3020,10 +3256,10 @@ state->ptr = state->start; if (state->charsize == 1) { - status = sre_match(state, PatternObject_GetCode(self->pattern), 1); + status = sre_match(state, PatternObject_GetCode(self->pattern), 1, 0); } else { #if defined(HAVE_UNICODE) - status = sre_umatch(state, PatternObject_GetCode(self->pattern), 1); + status = sre_umatch(state, PatternObject_GetCode(self->pattern), 1, 0); #endif } Index: Modules/sre.h =================================================================== RCS file: /cvsroot/python/python/dist/src/Modules/sre.h,v retrieving revision 2.22 diff -u -r2.22 sre.h --- Modules/sre.h 18 Mar 2002 18:46:14 -0000 2.22 +++ Modules/sre.h 22 Jun 2003 20:15:58 -0000 @@ -55,12 +55,26 @@ typedef struct SRE_REPEAT_T { int count; SRE_CODE* pattern; /* points to REPEAT operator arguments */ + void* last_ptr; /* helper to check for infinite loops */ struct SRE_REPEAT_T *prev; /* points to previous repeat context */ } SRE_REPEAT; +/* this is used to save the state, and must match the head + * of the structure below. *BE CAREFUL* not to change these + * structures individually. 
*/ typedef struct { + int lastindex; + int lastmark; + void* ptr; +} SRE_STATE_HEAD; + +typedef struct { + /* registers */ + int lastindex; + int lastmark; /* string pointers */ void* ptr; /* current position (also end of current slice) */ + /* SRE_STATE_HEAD END */ void* beginning; /* start of original string */ void* start; /* start of current slice */ void* end; /* end of original string */ @@ -69,15 +83,13 @@ int pos, endpos; /* character size */ int charsize; - /* registers */ - int lastindex; - int lastmark; void* mark[SRE_MARK_SIZE]; /* dynamically allocated stuff */ - void** mark_stack; - int mark_stack_size; - int mark_stack_base; - SRE_REPEAT *repeat; /* current repeat context */ + void** data_stack; + int data_stack_size; + int data_stack_base; + /* current repeat context */ + SRE_REPEAT *repeat; /* hooks */ SRE_TOLOWER_HOOK lower; } SRE_STATE; Index: Modules/sre_constants.h =================================================================== RCS file: /cvsroot/python/python/dist/src/Modules/sre_constants.h,v retrieving revision 2.15 diff -u -r2.15 sre_constants.h --- Modules/sre_constants.h 19 Apr 2003 12:56:08 -0000 2.15 +++ Modules/sre_constants.h 22 Jun 2003 20:15:58 -0000 @@ -11,7 +11,7 @@ * See the _sre.c file for information on usage and redistribution. 
*/ -#define SRE_MAGIC 20030419 +#define SRE_MAGIC 20030614 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 @@ -25,24 +25,25 @@ #define SRE_OP_CHARSET 10 #define SRE_OP_BIGCHARSET 11 #define SRE_OP_GROUPREF 12 -#define SRE_OP_GROUPREF_IGNORE 13 -#define SRE_OP_IN 14 -#define SRE_OP_IN_IGNORE 15 -#define SRE_OP_INFO 16 -#define SRE_OP_JUMP 17 -#define SRE_OP_LITERAL 18 -#define SRE_OP_LITERAL_IGNORE 19 -#define SRE_OP_MARK 20 -#define SRE_OP_MAX_UNTIL 21 -#define SRE_OP_MIN_UNTIL 22 -#define SRE_OP_NOT_LITERAL 23 -#define SRE_OP_NOT_LITERAL_IGNORE 24 -#define SRE_OP_NEGATE 25 -#define SRE_OP_RANGE 26 -#define SRE_OP_REPEAT 27 -#define SRE_OP_REPEAT_ONE 28 -#define SRE_OP_SUBPATTERN 29 -#define SRE_OP_MIN_REPEAT_ONE 30 +#define SRE_OP_GROUPREF_EXISTS 13 +#define SRE_OP_GROUPREF_IGNORE 14 +#define SRE_OP_IN 15 +#define SRE_OP_IN_IGNORE 16 +#define SRE_OP_INFO 17 +#define SRE_OP_JUMP 18 +#define SRE_OP_LITERAL 19 +#define SRE_OP_LITERAL_IGNORE 20 +#define SRE_OP_MARK 21 +#define SRE_OP_MAX_REPEAT 22 +#define SRE_OP_MAX_REPEAT_END 23 +#define SRE_OP_MAX_REPEAT_ONE 24 +#define SRE_OP_MIN_REPEAT 26 +#define SRE_OP_NOT_LITERAL 27 +#define SRE_OP_NOT_LITERAL_IGNORE 28 +#define SRE_OP_NEGATE 29 +#define SRE_OP_RANGE 30 +#define SRE_OP_REPEAT_END 31 +#define SRE_OP_SUBPATTERN 32 #define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BEGINNING_STRING 2 Index: Doc/lib/libre.tex =================================================================== RCS file: /cvsroot/python/python/dist/src/Doc/lib/libre.tex,v retrieving revision 1.100 diff -u -r1.100 libre.tex --- Doc/lib/libre.tex 13 May 2003 14:40:24 -0000 1.100 +++ Doc/lib/libre.tex 22 Jun 2003 20:15:29 -0000 @@ -297,6 +297,15 @@ fixed length. Patterns which start with negative lookbehind assertions may match at the beginning of the string being searched. 
+\item[\code{(?(\var{id/name})yes-pattern|no-pattern)}] Will try to match +with \regexp{yes-pattern} if the group with given \var{id} or \var{name} +exists, and with \regexp{no-pattern} if it doesn't. \regexp{|no-pattern} +is optional and can be omitted. For example, +\regexp{(<)?(\e w+@\e w+(?:\e .\e w+)+)(?(1)>)} is a poor email matching +pattern, which will match with \code{'<user@host.com>'} as well as +\code{'user@host.com'}, but not with \code{'