=== modified file Modules/_sre.c --- Modules/_sre.c 2008-09-26 17:38:03 +0000 +++ Modules/_sre.c 2008-09-30 23:06:24 +0000 @@ -388,6 +388,7 @@ #define SRE_SAVE_MARKS sre_save_marks #define SRE_RESTORE_MARKS sre_restore_marks #define SRE_DISCARD_SAVED_MARKS sre_discard_saved_marks +#define SRE_REFRESH_MARKS sre_refresh_marks #define SRE_DISCARD_UNTIL_OP sre_discard_until_op #define SRE_CLEANUP sre_cleanup #define SRE_LOOK_AHEAD_ONE sre_look_ahead_one @@ -407,6 +408,7 @@ #undef SRE_LOOK_AHEAD_ONE #undef SRE_CLEANUP #undef SRE_DISCARD_UNTIL_OP +#undef SRE_REFRESH_MARKS #undef SRE_DISCARD_SAVED_MARKS #undef SRE_RESTORE_MARKS #undef SRE_SAVE_MARKS @@ -441,6 +443,7 @@ #define SRE_SAVE_MARKS sre_usave_marks #define SRE_RESTORE_MARKS sre_urestore_marks #define SRE_DISCARD_SAVED_MARKS sre_udiscard_saved_marks +#define SRE_REFRESH_MARKS sre_urefresh_marks #define SRE_DISCARD_UNTIL_OP sre_udiscard_until_op #define SRE_CLEANUP sre_ucleanup #define SRE_LOOK_AHEAD_ONE sre_ulook_ahead_one @@ -693,6 +696,10 @@ } } +LOCAL(void) SRE_REFRESH_MARKS(SRE_CONTEXT* context) { + memmove(context->mark, &context->saved_marks_chunk->marks[context->saved_marks_chunk->count - context->mark_count], context->mark_count * sizeof(SRE_CHAR*)); +} + LOCAL(void) SRE_DISCARD_UNTIL_OP(SRE_CONTEXT* context, int op) { for (;;) { SRE_BACKTRACK_ITEM* backtrack_item = &context->backtrack_chunk->items[context->backtrack_chunk->count - 1]; @@ -713,7 +720,8 @@ case SRE_OP_REPEAT_MIN_REV: case SRE_OP_REPEAT_POSS: case SRE_OP_REPEAT_POSS_REV: - SRE_DISCARD_SAVED_MARKS(context); + if (context->mark_count > 0) + SRE_DISCARD_SAVED_MARKS(context); break; } SRE_DISCARD_BACKTRACK(context); @@ -740,6 +748,16 @@ LOCAL(int) SRE_LOOK_AHEAD_ONE(SRE_CONTEXT* context, SRE_STATE* state, SRE_CODE* look_literal) { switch (look_literal[0]) { + case SRE_OP_BOUNDARY: + return SRE_AT_BOUNDARY(context); + case SRE_OP_DIGIT: + return SRE_IS_DIGIT(context->text_ptr[0]); + case SRE_OP_END_OF_LINE: + return context->text_ptr >= context->text_end || SRE_IS_LINEBREAK(context->text_ptr[0]); + case SRE_OP_END_OF_STRING: + return context->text_ptr >= context->text_end; + case SRE_OP_END_OF_STRING_2: + return context->text_ptr >= context->text_end || context->text_ptr == context->final_linebreak; case SRE_OP_LITERAL: return context->text_ptr[0] == (SRE_CHAR)look_literal[1]; case SRE_OP_LITERAL_IGNORE: @@ -748,12 +766,40 @@ return context->text_ptr[0] == (SRE_CHAR)look_literal[2]; case SRE_OP_LITERAL_STRING_IGNORE: return state->lower(context->text_ptr[0]) == (SRE_CHAR)look_literal[2]; - case SRE_OP_BOUNDARY: - return SRE_AT_BOUNDARY(context); case SRE_OP_LOC_BOUNDARY: return SRE_LOC_AT_BOUNDARY(context); + case SRE_OP_LOC_NOT_WORD: + return ! SRE_LOC_IS_WORD(context->text_ptr[0]); + case SRE_OP_LOC_WORD: + return SRE_LOC_IS_WORD(context->text_ptr[0]); + case SRE_OP_NOT_DIGIT: + return !SRE_IS_DIGIT(context->text_ptr[0]); + case SRE_OP_NOT_WHITESPACE: + return !SRE_IS_WHITESPACE(context->text_ptr[0]); + case SRE_OP_NOT_WORD: + return ! SRE_IS_WORD(context->text_ptr[0]); + case SRE_OP_START_OF_LINE: + return context->text_ptr == context->text_beginning || SRE_IS_LINEBREAK(context->text_ptr[-1]); + case SRE_OP_START_OF_STRING: + return context->text_ptr == context->text_beginning; case SRE_OP_UNI_BOUNDARY: return SRE_UNI_AT_BOUNDARY(context); + case SRE_OP_UNI_DIGIT: + return SRE_UNI_IS_DIGIT(context->text_ptr[0]); + case SRE_OP_UNI_NOT_DIGIT: + return !SRE_UNI_IS_DIGIT(context->text_ptr[0]); + case SRE_OP_UNI_NOT_WHITESPACE: + return ! SRE_UNI_IS_WHITESPACE(context->text_ptr[0]); + case SRE_OP_UNI_NOT_WORD: + return ! SRE_UNI_IS_WORD(context->text_ptr[0]); + case SRE_OP_UNI_WHITESPACE: + return SRE_UNI_IS_WHITESPACE(context->text_ptr[0]); + case SRE_OP_UNI_WORD: + return SRE_UNI_IS_WORD(context->text_ptr[0]); + case SRE_OP_WHITESPACE: + return SRE_IS_WHITESPACE(context->text_ptr[0]); + case SRE_OP_WORD: + return SRE_IS_WORD(context->text_ptr[0]); default: return 1; } @@ -761,6 +807,16 @@ LOCAL(int) SRE_LOOK_AHEAD_ONE_REV(SRE_CONTEXT* context, SRE_STATE* state, SRE_CODE* look_literal) { switch (look_literal[0]) { + case SRE_OP_BOUNDARY: + return SRE_AT_BOUNDARY(context); + case SRE_OP_DIGIT: + return SRE_IS_DIGIT(context->text_ptr[-1]); + case SRE_OP_END_OF_LINE: + return context->text_ptr >= context->text_end || SRE_IS_LINEBREAK(context->text_ptr[0]); + case SRE_OP_END_OF_STRING: + return context->text_ptr >= context->text_end; + case SRE_OP_END_OF_STRING_2: + return context->text_ptr >= context->text_end || context->text_ptr == context->final_linebreak; case SRE_OP_LITERAL: return context->text_ptr[-1] == (SRE_CHAR)look_literal[1]; case SRE_OP_LITERAL_IGNORE: @@ -769,12 +825,40 @@ return context->text_ptr[-1] == (SRE_CHAR)look_literal[2]; case SRE_OP_LITERAL_STRING_IGNORE: return state->lower(context->text_ptr[-1]) == (SRE_CHAR)look_literal[2]; - case SRE_OP_BOUNDARY: - return SRE_AT_BOUNDARY(context); case SRE_OP_LOC_BOUNDARY: return SRE_LOC_AT_BOUNDARY(context); + case SRE_OP_LOC_NOT_WORD: + return ! SRE_LOC_IS_WORD(context->text_ptr[-1]); + case SRE_OP_LOC_WORD: + return SRE_LOC_IS_WORD(context->text_ptr[-1]); + case SRE_OP_NOT_DIGIT: + return !SRE_IS_DIGIT(context->text_ptr[-1]); + case SRE_OP_NOT_WHITESPACE: + return !SRE_IS_WHITESPACE(context->text_ptr[-1]); + case SRE_OP_NOT_WORD: + return ! SRE_IS_WORD(context->text_ptr[-1]); + case SRE_OP_START_OF_LINE: + return context->text_ptr == context->text_beginning || SRE_IS_LINEBREAK(context->text_ptr[-1]); + case SRE_OP_START_OF_STRING: + return context->text_ptr == context->text_beginning; case SRE_OP_UNI_BOUNDARY: return SRE_UNI_AT_BOUNDARY(context); + case SRE_OP_UNI_DIGIT: + return SRE_UNI_IS_DIGIT(context->text_ptr[-1]); + case SRE_OP_UNI_NOT_DIGIT: + return !SRE_UNI_IS_DIGIT(context->text_ptr[-1]); + case SRE_OP_UNI_NOT_WHITESPACE: + return ! SRE_UNI_IS_WHITESPACE(context->text_ptr[-1]); + case SRE_OP_UNI_NOT_WORD: + return ! SRE_UNI_IS_WORD(context->text_ptr[-1]); + case SRE_OP_UNI_WHITESPACE: + return SRE_UNI_IS_WHITESPACE(context->text_ptr[-1]); + case SRE_OP_UNI_WORD: + return SRE_UNI_IS_WORD(context->text_ptr[-1]); + case SRE_OP_WHITESPACE: + return SRE_IS_WHITESPACE(context->text_ptr[-1]); + case SRE_OP_WORD: + return SRE_IS_WORD(context->text_ptr[-1]); default: return 1; } @@ -921,9 +1005,11 @@ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_ASSERT, context.pattern_ptr, -1); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } context.text_start = state->beginning; context.pattern_ptr++; break; @@ -934,9 +1020,11 @@ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_ASSERT_NOT, context.pattern_ptr, -1); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } context.text_start = state->beginning; context.pattern_ptr++; break; @@ -947,9 +1035,11 @@ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_ATOMIC, NULL, -1); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } break; case SRE_OP_BIGCHARSET: // Match character in charset. @@ -995,17 +1085,33 @@ goto backtrack; break; case SRE_OP_BRANCH: + { // Alternation. // ... ... 0 + SRE_CODE* look_literal; TRACE(("|%p|%p|BRANCH\n", context.pattern_ptr, context.text_ptr)); +next_branch: + look_literal = context.pattern_ptr + 1; + while (look_literal[0] == SRE_OP_MARK) + look_literal += 2; + // Look ahead in the branch to avoid unnecessary backtracking. + if (! SRE_LOOK_AHEAD_ONE(&context, state, look_literal)) { + context.pattern_ptr += context.pattern_ptr[0]; + if (context.pattern_ptr[0] == 0) + goto backtrack; + goto next_branch; + } result = SRE_SAVE_BACKTRACK(&context, SRE_OP_BRANCH, context.pattern_ptr, -1); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } context.pattern_ptr++; break; + } case SRE_OP_CHARSET: // Match character in charset. // @@ -1069,7 +1175,8 @@ context.text_start = backtrack_item->text_start; context.text_ptr = backtrack_item->text_ptr; SRE_DISCARD_BACKTRACK(&context); - SRE_RESTORE_MARKS(&context); + if (context.mark_count > 0) + SRE_RESTORE_MARKS(&context); break; } case SRE_OP_END_ASSERT_NOT: @@ -1082,7 +1189,8 @@ backtrack_item = &context.backtrack_chunk->items[context.backtrack_chunk->count - 1]; context.text_start = backtrack_item->text_start; SRE_DISCARD_BACKTRACK(&context); - SRE_RESTORE_MARKS(&context); + if (context.mark_count > 0) + SRE_RESTORE_MARKS(&context); goto backtrack; } case SRE_OP_END_ATOMIC: @@ -1091,7 +1199,8 @@ // ... SRE_DISCARD_UNTIL_OP(&context, SRE_OP_ATOMIC); SRE_DISCARD_BACKTRACK(&context); - SRE_DISCARD_SAVED_MARKS(&context); + if (context.mark_count > 0) + SRE_DISCARD_SAVED_MARKS(&context); break; } case SRE_OP_END_OF_LINE: @@ -1147,9 +1256,11 @@ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_MAX, end_repeat_ptr, -1); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } } context.pattern_ptr = body; context.repeat_start[index] = context.text_ptr; @@ -1190,9 +1301,11 @@ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_MAX_REV, end_repeat_ptr, -1); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } } context.pattern_ptr = body; context.repeat_start[index] = context.text_ptr; @@ -1220,10 +1333,8 @@ else if (curr_repeats < repeat_min) { if (repeat_min - curr_repeats > limit) goto backtrack; - SRE_RESTORE_MARKS(&context); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) + SRE_REFRESH_MARKS(&context); context.pattern_ptr = body; context.repeat_start[index] = context.text_ptr; } else { @@ -1244,9 +1355,11 @@ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_MIN, end_repeat_ptr, -1); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } context.pattern_ptr = tail; } else { context.pattern_ptr = body; @@ -1280,10 +1393,8 @@ else if (curr_repeats < repeat_min) { if (repeat_min - curr_repeats > limit) goto backtrack; - SRE_RESTORE_MARKS(&context); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) + SRE_REFRESH_MARKS(&context); context.pattern_ptr = body; context.repeat_start[index] = context.text_ptr; } else { @@ -1304,9 +1415,11 @@ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_MIN, end_repeat_ptr, -1); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } context.pattern_ptr = tail; } else { context.pattern_ptr = body; @@ -1349,9 +1462,11 @@ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_POSS, end_repeat_ptr, -1); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } context.pattern_ptr = body; context.repeat_start[index] = context.text_ptr; } else @@ -1387,9 +1502,11 @@ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_POSS_REV, end_repeat_ptr, -1); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } context.pattern_ptr = body; context.repeat_start[index] = context.text_ptr; } else @@ -2052,9 +2169,11 @@ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_REPEAT_MAX, repeat_ptr, index); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } context.repeat_counter[index] = 0; if (repeat_min == 0) { int match; @@ -2070,9 +2189,11 @@ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_MAX, end_repeat_ptr, -1); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } } } context.pattern_ptr = body; @@ -2097,9 +2218,11 @@ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_REPEAT_MAX_REV, repeat_ptr, index); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } context.repeat_counter[index] = 0; if (repeat_min == 0) { int match; @@ -2115,9 +2238,11 @@ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_MAX_REV, end_repeat_ptr, -1); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } } } context.pattern_ptr = body; @@ -2142,9 +2267,11 @@ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_REPEAT_MIN, repeat_ptr, index); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } context.repeat_counter[index] = 0; if (repeat_min == 0) { int match; @@ -2160,9 +2287,11 @@ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_MIN, end_repeat_ptr, -1); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } context.pattern_ptr = tail; } else { context.pattern_ptr = body; @@ -2192,9 +2321,11 @@ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_REPEAT_MIN_REV, repeat_ptr, index); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } context.repeat_counter[index] = 0; if (repeat_min == 0) { int match; @@ -2210,9 +2341,11 @@ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_MIN_REV, end_repeat_ptr, -1); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } context.pattern_ptr = tail; } else { context.pattern_ptr = body; @@ -4121,17 +4254,21 @@ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_REPEAT_POSS, repeat_ptr, index); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } context.repeat_counter[index] = 0; if (repeat_min == 0) { result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_POSS, end_repeat_ptr, -1); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } } context.pattern_ptr = body; context.repeat_start[index] = context.text_ptr; @@ -4155,17 +4292,21 @@ result = SRE_SAVE_BACKTRACK(&context, SRE_OP_REPEAT_POSS_REV, repeat_ptr, index); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } context.repeat_counter[index] = 0; if (repeat_min == 0) { result = SRE_SAVE_BACKTRACK(&context, SRE_OP_END_REPEAT_POSS_REV, end_repeat_ptr, -1); if (result != 0) return SRE_CLEANUP(&context, state, result); - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); + if (context.mark_count > 0) { + result = SRE_SAVE_MARKS(&context); + if (result != 0) + return SRE_CLEANUP(&context, state, result); + } } context.pattern_ptr = body; context.repeat_start[index] = context.text_ptr; @@ -4188,9 +4329,16 @@ case SRE_OP_SUCCESS: { // End of pattern. + int zero_width; int m; SRE_CHAR* end_ptr = NULL; TRACE(("|%p|%p|SUCCESS\n", context.pattern_ptr, context.text_ptr)); + // Is the entire matched portion zero-width? + zero_width = context.text_ptr == context.text_start; + // Reject the match if it's zero-width and we aren't allowed to return it. + if (zero_width && state->reject_zero_width) + goto backtrack; + state->reject_zero_width = zero_width; // Find the mark which matched the furthest to the right. for (m = 1; m < context.mark_count; m += 2) { if (context.mark[m - 1] != NULL && context.mark[m] != NULL) { @@ -4370,7 +4518,8 @@ SRE_BACKTRACK_ITEM* backtrack_item = &context.backtrack_chunk->items[context.backtrack_chunk->count - 1]; TRACE(("ASSERT\n")); context.text_start = backtrack_item->text_start; - SRE_RESTORE_MARKS(&context); + if (context.mark_count > 0) + SRE_RESTORE_MARKS(&context); SRE_DISCARD_BACKTRACK(&context); goto backtrack; } @@ -4383,7 +4532,8 @@ context.pattern_ptr = backtrack_item->pattern_ptr; context.text_start = backtrack_item->text_start; context.text_ptr = backtrack_item->text_ptr; - SRE_RESTORE_MARKS(&context); + if (context.mark_count > 0) + SRE_RESTORE_MARKS(&context); SRE_DISCARD_BACKTRACK(&context); context.pattern_ptr += context.pattern_ptr[0]; goto advance; @@ -4392,7 +4542,8 @@ // Atomic subpattern. // ... TRACE(("ATOMIC\n")); - SRE_RESTORE_MARKS(&context); + if (context.mark_count > 0) + SRE_RESTORE_MARKS(&context); SRE_DISCARD_BACKTRACK(&context); goto backtrack; case SRE_OP_BRANCH: @@ -4400,19 +4551,27 @@ // Alternation. // ... ... 0 SRE_BACKTRACK_ITEM* backtrack_item = &context.backtrack_chunk->items[context.backtrack_chunk->count - 1]; + SRE_CODE* look_literal; TRACE(("BRANCH\n")); context.pattern_ptr = backtrack_item->pattern_ptr; +next_branch_backtrack: context.pattern_ptr += context.pattern_ptr[0]; - SRE_RESTORE_MARKS(&context); if (context.pattern_ptr[0] == 0) { + if (context.mark_count > 0) + SRE_RESTORE_MARKS(&context); SRE_DISCARD_BACKTRACK(&context); goto backtrack; } + context.text_ptr = backtrack_item->text_ptr; + look_literal = context.pattern_ptr + 1; + while (look_literal[0] == SRE_OP_MARK) + look_literal += 2; + // Look ahead in the branch to avoid unnecessary backtracking. + if (! SRE_LOOK_AHEAD_ONE(&context, state, look_literal)) + goto next_branch_backtrack; + if (context.mark_count > 0) + SRE_REFRESH_MARKS(&context); backtrack_item->pattern_ptr = context.pattern_ptr; - result = SRE_SAVE_MARKS(&context); - if (result != 0) - return SRE_CLEANUP(&context, state, result); - context.text_ptr = backtrack_item->text_ptr; context.pattern_ptr++; goto advance; } @@ -4425,7 +4584,8 @@ SRE_CODE* tail = end_repeat_ptr + 1; TRACE(("END_REPEAT_MAX\n")); context.text_ptr = backtrack_item->text_ptr; - SRE_RESTORE_MARKS(&context); + if (context.mark_count > 0) + SRE_RESTORE_MARKS(&context); SRE_DISCARD_BACKTRACK(&context); context.pattern_ptr = tail; goto advance; @@ -4439,7 +4599,8 @@ SRE_CODE* tail = end_repeat_ptr + 1; TRACE(("END_REPEAT_MAX_REV\n")); context.text_ptr = backtrack_item->text_ptr; - SRE_RESTORE_MARKS(&context); + if (context.mark_count > 0) + SRE_RESTORE_MARKS(&context); SRE_DISCARD_BACKTRACK(&context); context.pattern_ptr = tail; goto advance; @@ -4458,7 +4619,8 @@ Py_ssize_t limit = context.text_end - context.text_ptr; TRACE(("END_REPEAT_MIN\n")); context.text_ptr = backtrack_item->text_ptr; - SRE_RESTORE_MARKS(&context); + if (context.mark_count > 0) + SRE_RESTORE_MARKS(&context); SRE_DISCARD_BACKTRACK(&context); if (limit == 0) goto backtrack; @@ -4479,7 +4641,8 @@ Py_ssize_t limit = context.text_ptr - context.text_start; TRACE(("END_REPEAT_MIN_REV\n")); context.text_ptr = backtrack_item->text_ptr; - SRE_RESTORE_MARKS(&context); + if (context.mark_count > 0) + SRE_RESTORE_MARKS(&context); SRE_DISCARD_BACKTRACK(&context); if (limit == 0) goto backtrack; @@ -4496,7 +4659,8 @@ SRE_CODE* tail = end_repeat_ptr + 1; TRACE(("END_REPEAT_POSS\n")); context.text_ptr = backtrack_item->text_ptr; - SRE_RESTORE_MARKS(&context); + if (context.mark_count > 0) + SRE_RESTORE_MARKS(&context); SRE_DISCARD_BACKTRACK(&context); context.pattern_ptr = tail; goto advance; @@ -4511,20 +4675,23 @@ SRE_CODE* tail = end_repeat_ptr + 1; TRACE(("END_REPEAT_POSS_REV\n")); context.text_ptr = backtrack_item->text_ptr; - SRE_RESTORE_MARKS(&context); + if (context.mark_count > 0) + SRE_RESTORE_MARKS(&context); SRE_DISCARD_BACKTRACK(&context); context.pattern_ptr = tail; goto advance; } case SRE_OP_FAILURE: // Failed to match. + state->reject_zero_width = 0; return SRE_CLEANUP(&context, state, 0); case SRE_OP_REPEAT_MAX: { // Greedy repeat. // ... TRACE(("REPEAT_MAX\n")); - SRE_RESTORE_MARKS(&context); + if (context.mark_count > 0) + SRE_RESTORE_MARKS(&context); SRE_DISCARD_BACKTRACK(&context); goto backtrack; } @@ -4533,7 +4700,8 @@ // Greedy repeat. // ... TRACE(("REPEAT_MAX\n")); - SRE_RESTORE_MARKS(&context); + if (context.mark_count > 0) + SRE_RESTORE_MARKS(&context); SRE_DISCARD_BACKTRACK(&context); goto backtrack; } @@ -4542,7 +4710,8 @@ // Lazy repeat. // ... TRACE(("REPEAT_MIN\n")); - SRE_RESTORE_MARKS(&context); + if (context.mark_count > 0) + SRE_RESTORE_MARKS(&context); SRE_DISCARD_BACKTRACK(&context); goto backtrack; } @@ -4551,7 +4720,8 @@ // Lazy repeat. // ... TRACE(("REPEAT_MIN\n")); - SRE_RESTORE_MARKS(&context); + if (context.mark_count > 0) + SRE_RESTORE_MARKS(&context); SRE_DISCARD_BACKTRACK(&context); goto backtrack; } @@ -5077,7 +5247,8 @@ // ... SRE_BACKTRACK_ITEM* backtrack_item = &context.backtrack_chunk->items[context.backtrack_chunk->count - 1]; TRACE(("REPEAT_POSS\n")); - SRE_RESTORE_MARKS(&context); + if (context.mark_count > 0) + SRE_RESTORE_MARKS(&context); SRE_DISCARD_BACKTRACK(&context); goto backtrack; } @@ -5087,7 +5258,8 @@ // ... SRE_BACKTRACK_ITEM* backtrack_item = &context.backtrack_chunk->items[context.backtrack_chunk->count - 1]; TRACE(("REPEAT_POSS\n")); - SRE_RESTORE_MARKS(&context); + if (context.mark_count > 0) + SRE_RESTORE_MARKS(&context); SRE_DISCARD_BACKTRACK(&context); goto backtrack; } @@ -5134,21 +5306,26 @@ if (look_literal != NULL) look_ahead = look_literal; - context.text_start = (SRE_CHAR *)state->beginning; + context.text_beginning = (SRE_CHAR *)state->beginning; + context.text_start = (SRE_CHAR *)state->start; context.text_end = (SRE_CHAR *)state->end; + // Point to the final newline if it's the final character. + context.final_linebreak = context.text_beginning < context.text_end && SRE_IS_LINEBREAK(context.text_end[-1]) ? context.text_end - 1 : NULL; + if (state->reverse) { context.text_ptr = (SRE_CHAR *)state->end; while (context.text_ptr >= context.text_start) { TRACE(("|%p|%p|SEARCH\n", pattern, context.text_ptr)); if (SRE_LOOK_AHEAD_ONE(&context, state, look_ahead)) { - state->end = state->ptr = context.text_ptr; + state->end = state->ptr = context.text_ptr--; status = SRE_MATCH(state, state->pattern_code); if (status != 0) break; } - context.text_ptr--; + else + context.text_ptr--; } } else { context.text_ptr = (SRE_CHAR *)state->start; @@ -5156,12 +5333,13 @@ while (context.text_ptr <= context.text_end) { TRACE(("|%p|%p|SEARCH\n", pattern, context.text_ptr)); if (SRE_LOOK_AHEAD_ONE(&context, state, look_ahead)) { - state->start = state->ptr = context.text_ptr; + state->start = state->ptr = context.text_ptr++; status = SRE_MATCH(state, state->pattern_code); if (status != 0) break; } - context.text_ptr++; + else + context.text_ptr++; } } @@ -5355,6 +5533,8 @@ state->start = (void*) ((char*) ptr + start * state->charsize); state->end = (void*) ((char*) ptr + end * state->charsize); + state->reject_zero_width = 0; + Py_INCREF(string); state->string = string; state->pos = start; @@ -5743,17 +5923,10 @@ if (status < 0) goto error; - if (state.reverse) { - if (state.ptr == state.end) - state.end = (void*) ((char*) state.ptr - state.charsize); - else - state.end = state.ptr; - } else { - if (state.ptr == state.start) - state.start = (void*) ((char*) state.ptr + state.charsize); - else - state.start = state.ptr; - } + if (state.reverse) + state.end = state.ptr; + else + state.start = state.ptr; } state_fini(&state); @@ -5852,17 +6025,11 @@ if (state.end == state.ptr) { if (last == state.start) break; - /* skip one character */ - state.end = (void*) ((char*) state.ptr - state.charsize); - continue; } } else { if (state.start == state.ptr) { if (last == state.end) break; - /* skip one character */ - state.start = (void*) ((char*) state.ptr + state.charsize); - continue; } } @@ -6090,17 +6257,10 @@ next: /* move on */ - if (state.reverse) { - if (state.ptr == state.end) - state.end = (void*) ((char*) state.ptr - state.charsize); - else - state.end = state.ptr; - } else { - if (state.ptr == state.start) - state.start = (void*) ((char*) state.ptr + state.charsize); - else - state.start = state.ptr; - } + if (state.reverse) + state.end = state.ptr; + else + state.start = state.ptr; } /* get segment following last match */ @@ -7387,12 +7547,12 @@ state, status); if (state->reverse) { - if (status == 0 || state->ptr == state->end) + if (status == 0) state->end = (void*) ((char*) state->ptr - state->charsize); else state->end = state->ptr; } else { - if (status == 0 || state->ptr == state->start) + if (status == 0) state->start = (void*) ((char*) state->ptr + state->charsize); else state->start = state->ptr; @@ -7427,12 +7587,12 @@ state, status); if (state->reverse) { - if (status == 0 || state->ptr == state->end) + if (status == 0) state->end = (void*) ((char*) state->ptr - state->charsize); else state->end = state->ptr; } else { - if (status == 0 || state->ptr == state->start) + if (status == 0) state->start = (void*) ((char*) state->ptr + state->charsize); else state->start = state->ptr; === modified file Modules/sre.h --- Modules/sre.h 2008-09-26 17:38:03 +0000 +++ Modules/sre.h 2008-09-30 19:14:31 +0000 @@ -93,6 +93,7 @@ /* character size */ int charsize; int reverse; + int reject_zero_width; /* registers */ Py_ssize_t lastindex; Py_ssize_t lastmark;