--- Lib/sre_compile.py +++ Lib/sre_compile.py @@ -31,6 +31,7 @@ CATEGORY_CODES = [ (ANY, ANY, ANY), + (ANY_ALL, ANY_ALL, ANY_ALL), (BOUNDARY, LOC_BOUNDARY, UNI_BOUNDARY), (DIGIT, DIGIT, UNI_DIGIT), (NOT_BOUNDARY, LOC_NOT_BOUNDARY, UNI_NOT_BOUNDARY), @@ -56,11 +57,16 @@ (RANGE, RANGE_IGNORE), ] -LINE_CODES = [ - (END_OF_LINE, END_OF_STRING_2), - (END_OF_STRING, END_OF_STRING), - (START_OF_LINE, START_OF_STRING), - (START_OF_STRING, START_OF_STRING), +NOT_CODES = [ + (BIGCHARSET, NOT_BIGCHARSET), + (CHARSET, NOT_CHARSET), + (DIGIT, NOT_DIGIT), + (IN, NOT_IN), + (IN_IGNORE, NOT_IN_IGNORE), + (LITERAL, NOT_LITERAL), + (RANGE, NOT_RANGE), + (WHITESPACE, NOT_WHITESPACE), + (WORD, NOT_WORD), ] REPEAT_CODES = { @@ -70,21 +76,21 @@ } SINGLE_CHARACTER_CODES = set([ - ANY, - BIGCHARSET, - CHARSET, + ANY, ANY_ALL, + BIGCHARSET, BIGCHARSET_IGNORE, + CHARSET, CHARSET_IGNORE, DIGIT, - IN, - LITERAL, - NOT_BIGCHARSET, - NOT_CHARSET, + IN, IN_IGNORE, + LITERAL, LITERAL_IGNORE, + NOT_BIGCHARSET, NOT_BIGCHARSET_IGNORE, + NOT_CHARSET, NOT_CHARSET_IGNORE, NOT_DIGIT, - NOT_IN, - NOT_LITERAL, - NOT_RANGE, + NOT_IN, NOT_IN_IGNORE, + NOT_LITERAL, NOT_LITERAL_IGNORE, + NOT_RANGE, NOT_RANGE_IGNORE, NOT_WHITESPACE, NOT_WORD, - RANGE, + RANGE, RANGE_IGNORE, WHITESPACE, WORD, ]) @@ -172,49 +178,34 @@ REPEAT_POSS: REPEAT_ONE_POSS, } -BIGCHARSET_SET = set([BIGCHARSET, NOT_BIGCHARSET]) +BIGCHARSET_SET = set([BIGCHARSET, BIGCHARSET_IGNORE, NOT_BIGCHARSET, NOT_BIGCHARSET_IGNORE]) CATEGORY_SET = set([op for op, loc_op, uni_op in CATEGORY_CODES]) -CHARSET_SET = set([CHARSET, NOT_CHARSET]) -IN_SET = set([IN, NOT_IN]) -LINE_SET = set([END_OF_LINE, END_OF_STRING, START_OF_LINE, START_OF_STRING]) -LITERAL_SET = set([LITERAL, NOT_LITERAL]) -RANGE_SET = set([NOT_RANGE, RANGE]) +CHARSET_SET = set([CHARSET, CHARSET_IGNORE, NOT_CHARSET, NOT_CHARSET_IGNORE]) +GROUPREF_SET = set([GROUPREF, GROUPREF_IGNORE]) +IGNORE_SET = set([op_ignore for op, op_ignore in IGNORE_CODES]) +IN_SET = set([IN, IN_IGNORE, NOT_IN, NOT_IN_IGNORE]) +LINE_SET = set([END_OF_LINE, END_OF_STRING, END_OF_STRING_2, START_OF_LINE, START_OF_STRING]) +LITERAL_SET = set([LITERAL, LITERAL_IGNORE, NOT_LITERAL, NOT_LITERAL_IGNORE]) +NOT_SET = set([not_op for op, not_op in NOT_CODES]) +RANGE_SET = set([NOT_RANGE, NOT_RANGE_IGNORE, RANGE, RANGE_IGNORE]) REPEAT_SET = set([REPEAT_MAX, REPEAT_MIN, REPEAT_POSS]) def _compile(code, pattern, flags, info, dir=1): # internal: compile a (sub)pattern emit = code.append - literal_string = [] - fix_op = {} + literal_op, literal_string = None, [] + fix_encoding = {} if flags & SRE_FLAG_LOCALE: for op, loc_op, uni_op in CATEGORY_CODES: - fix_op[op] = loc_op + fix_encoding[op] = loc_op elif flags & SRE_FLAG_UNICODE: for op, loc_op, uni_op in CATEGORY_CODES: - fix_op[op] = uni_op + fix_encoding[op] = uni_op else: for op, loc_op, uni_op in CATEGORY_CODES: - fix_op[op] = op - if flags & SRE_FLAG_DOTALL: - fix_op[ANY] = ANY_ALL - else: - fix_op[ANY] = ANY - if flags & SRE_FLAG_IGNORECASE: - def fix_case(ch): - return _sre.getlower(ch, flags) - for op, op_ignore in IGNORE_CODES: - fix_op[op] = op_ignore - else: - def fix_case(ch): - return ch - for op, op_ignore in IGNORE_CODES: - fix_op[op] = op - if flags & SRE_FLAG_MULTILINE: - for line_op, op in LINE_CODES: - fix_op[line_op] = line_op - else: - for line_op, op in LINE_CODES: - fix_op[line_op] = op + fix_encoding[op] = op + def lower_case(ch): + return _sre.getlower(ch, flags) if dir < 0: fix_direction = dict((op, op_rev) for op, op_rev in REVERSE_CODES) else: @@ -224,198 +215,201 @@ pattern = reversed(pattern) for op, av in pattern: if op in IN_SET: - op, av = _optimize_in(op, av, flags) - if op is LITERAL: + op, av = _optimize_in(op, av, lower_case) + if op is literal_op: literal_string.append(av) else: if literal_string: if dir < 0: literal_string = literal_string[::-1] - emit_literal_string(emit, literal_string, fix_case, fix_op, fix_direction) - literal_string = [] - if op in ASSERT_CODES: - emit(OPCODES[op]) - skip = len(code); emit(0) - _compile(code, av[1], flags, info, av[0]) - emit(OPCODES[ASSERT_CODES[op]]) - code[skip] = len(code) - skip - elif op in BIGCHARSET_SET: - emit(OPCODES[fix_direction[fix_op[op]]]) - _compile_bigcharset(av, flags, code) - elif op is BRANCH: - emit(OPCODES[op]) - tail = [] - tailappend = tail.append - for av in av[1]: + emit_literal_string(code, literal_op, literal_string, lower_case, fix_direction) + if op in (LITERAL, LITERAL_IGNORE): + literal_op, literal_string = op, [av] + else: + literal_op, literal_string = None, [] + if op in ASSERT_CODES: + emit(OPCODES[op]) skip = len(code); emit(0) - _compile(code, av, flags, info, dir) - emit(OPCODES[JUMP]) - tailappend(len(code)); emit(0) + _compile(code, av[1], flags, info, av[0]) + emit(OPCODES[ASSERT_CODES[op]]) code[skip] = len(code) - skip - emit(0) # end of branchs - for tail in tail: - code[tail] = len(code) - tail - elif op is CALL: - emit(OPCODES[op]) - skip = len(code); emit(0) - _compile(code, av, flags, info, dir) - emit(OPCODES[SUCCESS]) - code[skip] = len(code) - skip - elif op in CHARSET_SET: - emit(OPCODES[fix_direction[fix_op[op]]]) - _compile_charset(av, flags, code) - elif op in CATEGORY_SET or op in LINE_SET: - emit(OPCODES[fix_direction[fix_op[op]]]) - elif op is GROUPREF: - emit(OPCODES[fix_direction[fix_op[op]]]) - emit(av - 1) - elif op is GROUPREF_EXISTS: - emit(OPCODES[op]) - emit(av[0] - 1) - skipyes = len(code); emit(0) - _compile(code, av[1], flags, info, dir) - if av[2]: - emit(OPCODES[JUMP]) - skipno = len(code); emit(0) - code[skipyes] = len(code) - skipyes + 1 - _compile(code, av[2], flags, info, dir) - code[skipno] = len(code) - skipno - else: - code[skipyes] = len(code) - skipyes + 1 - elif op in IN_SET: - emit(OPCODES[fix_direction[fix_op[op]]]) - _compile_in(av, flags, code, fix_direction) - elif op in LITERAL_SET: - emit(OPCODES[fix_direction[fix_op[op]]]) - emit(fix_case(av)) - elif op in RANGE_SET: - emit(OPCODES[fix_direction[fix_op[op]]]) - emit(fix_case(av[0])) - emit(fix_case(av[1])) - elif op in REPEAT_SET: - if flags & SRE_FLAG_TEMPLATE: - raise error, "internal: unsupported template operator" - else: - single = get_single_character(av[2]) - if single: - emit(OPCODES[fix_direction[REPEAT_ONE_CODES[op]]]) + elif op in BIGCHARSET_SET: + emit(OPCODES[fix_direction[op]]) + _compile_bigcharset(code, op, av, lower_case) + elif op is BRANCH: + emit(OPCODES[op]) + tail = [] + tailappend = tail.append + for av in av[1]: skip = len(code); emit(0) - emit(info.repeat_count) - info.repeat_count += 1 - emit(av[0]) - emit(av[1]) - _compile(code, single, flags, info, dir) + _compile(code, av, flags, info, dir) + emit(OPCODES[JUMP]) + tailappend(len(code)); emit(0) code[skip] = len(code) - skip + emit(0) # end of branchs + for tail in tail: + code[tail] = len(code) - tail + elif op is CALL: + emit(OPCODES[op]) + skip = len(code); emit(0) + _compile(code, av, flags, info, dir) + emit(OPCODES[SUCCESS]) + code[skip] = len(code) - skip + elif op in CHARSET_SET: + emit(OPCODES[fix_direction[op]]) + _compile_charset(code, op, av, lower_case) + elif op in CATEGORY_SET or op in LINE_SET: + emit(OPCODES[fix_direction[fix_encoding.get(op, op)]]) + elif op in GROUPREF_SET: + emit(OPCODES[fix_direction[op]]) + emit(av - 1) + elif op is GROUPREF_EXISTS: + emit(OPCODES[op]) + emit(av[0] - 1) + skipyes = len(code); emit(0) + _compile(code, av[1], flags, info, dir) + if av[2]: + emit(OPCODES[JUMP]) + skipno = len(code); emit(0) + code[skipyes] = len(code) - skipyes + 1 + _compile(code, av[2], flags, info, dir) + code[skipno] = len(code) - skipno else: - emit(OPCODES[fix_direction[op]]) - skip = len(code); emit(0) - emit(info.repeat_count) - info.repeat_count += 1 + code[skipyes] = len(code) - skipyes + 1 + elif op in IN_SET: + emit(OPCODES[fix_direction[op]]) + _compile_in(code, op, av, lower_case, fix_encoding) + elif op in LITERAL_SET: + emit(OPCODES[fix_direction[op]]) + if op in IGNORE_SET: + emit(lower_case(av)) + else: + emit(av) + elif op in RANGE_SET: + emit(OPCODES[fix_direction[op]]) + if op in IGNORE_SET: + emit(lower_case(av[0])) + emit(lower_case(av[1])) + else: emit(av[0]) emit(av[1]) - _compile(code, av[2], flags, info, dir) - emit(OPCODES[fix_direction[REPEAT_CODES[op]]]) - offset = len(code) - skip - code[skip] = offset - emit(offset) - elif op is SUBPATTERN: - if av[0]: - mark_1, mark_2 = av[0] * 2 - 2, av[0] * 2 - 1 - if dir < 0: - mark_1, mark_2 = mark_2, mark_1 - info.group_count = max(info.group_count, av[0]) - emit(OPCODES[MARK]) - emit(mark_1) - _compile(code, av[1], flags, info, dir) - if av[0]: - emit(OPCODES[MARK]) - emit(mark_2) - elif op is ATOMIC: - emit(OPCODES[ATOMIC]) - _compile(code, av[1], flags, info, dir) - emit(OPCODES[END_ATOMIC]) - else: - raise ValueError, ("unsupported operand type", op) + elif op in REPEAT_SET: + if flags & SRE_FLAG_TEMPLATE: + raise error, "internal: unsupported template operator" + else: + single = get_single_character(av[2]) + if single: + emit(OPCODES[fix_direction[REPEAT_ONE_CODES[op]]]) + skip = len(code); emit(0) + emit(info.repeat_count) + info.repeat_count += 1 + emit(av[0]) + emit(av[1]) + _compile(code, single, flags, info, dir) + code[skip] = len(code) - skip + else: + emit(OPCODES[fix_direction[op]]) + skip = len(code); emit(0) + emit(info.repeat_count) + info.repeat_count += 1 + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags, info, dir) + emit(OPCODES[fix_direction[REPEAT_CODES[op]]]) + offset = len(code) - skip + code[skip] = offset + emit(offset) + elif op is SUBPATTERN: + if av[0]: + mark_1, mark_2 = av[0] * 2 - 2, av[0] * 2 - 1 + if dir < 0: + mark_1, mark_2 = mark_2, mark_1 + info.group_count = max(info.group_count, av[0]) + emit(OPCODES[MARK]) + emit(mark_1) + _compile(code, av[1], flags, info, dir) + if av[0]: + emit(OPCODES[MARK]) + emit(mark_2) + elif op is ATOMIC: + emit(OPCODES[ATOMIC]) + _compile(code, av[1], flags, info, dir) + emit(OPCODES[END_ATOMIC]) + else: + raise ValueError, ("unsupported operand type", op) if literal_string: if dir < 0: literal_string = literal_string[::-1] - emit_literal_string(emit, literal_string, fix_case, fix_op, fix_direction) + emit_literal_string(code, literal_op, literal_string, lower_case, fix_direction) -def emit_literal_string(emit, literal_string, fix_case, fix_op, fix_direction): +def emit_literal_string(code, literal_op, literal_string, lower_case, fix_direction): + emit = code.append + if literal_op in IGNORE_SET: + literal_string = [lower_case(ch) for ch in literal_string] if len(literal_string) > 1: - emit(OPCODES[fix_direction[fix_op[LITERAL_STRING]]]) + if literal_op is LITERAL_IGNORE: + literal_op = LITERAL_STRING_IGNORE + else: + literal_op = LITERAL_STRING + emit(OPCODES[fix_direction[literal_op]]) emit(len(literal_string)) - for ch in literal_string: - emit(fix_case(ch)) else: - emit(OPCODES[fix_direction[fix_op[LITERAL]]]) - emit(fix_case(literal_string[0])) + emit(OPCODES[fix_direction[literal_op]]) + for ch in literal_string: + emit(ch) def get_single_character(pattern): if len(pattern) == 1 and pattern[0][0] in SINGLE_CHARACTER_CODES: return pattern return None -def _compile_in(charset, flags, code, fix_direction): +def _compile_in(code, op, charset, lower_case, fix_encoding): # compile charset subprogram emit = code.append - fix_op = {} - if flags & SRE_FLAG_LOCALE: - for op, loc_op, uni_op in CATEGORY_CODES: - fix_op[op] = loc_op - elif flags & SRE_FLAG_UNICODE: - for op, loc_op, uni_op in CATEGORY_CODES: - fix_op[op] = uni_op - else: - for op, loc_op, uni_op in CATEGORY_CODES: - fix_op[op] = op - if flags & SRE_FLAG_IGNORECASE: - def fix_case(ch): - return _sre.getlower(ch, flags) - for op, op_ignore in IGNORE_CODES: - fix_op[op] = op_ignore - else: - def fix_case(ch): - return ch - for op, op_ignore in IGNORE_CODES: - fix_op[op] = op skip = len(code); emit(0) for op, av in charset: - if op is BIGCHARSET: - emit(OPCODES[fix_direction[op]]) - _compile_bigcharset(av, flags, code) + if op in BIGCHARSET_SET: + emit(OPCODES[op]) + _compile_bigcharset(code, op, av, lower_case) elif op in CATEGORY_SET: - emit(OPCODES[fix_direction[fix_op[op]]]) - elif op is CHARSET: - emit(OPCODES[fix_direction[op]]) - _compile_charset(av, flags, code) - elif op is LITERAL: - emit(OPCODES[fix_direction[op]]) - emit(fix_case(av)) - elif op is RANGE: - emit(OPCODES[fix_direction[op]]) - emit(fix_case(av[0])) - emit(fix_case(av[1])) + emit(OPCODES[fix_encoding[op]]) + elif op in CHARSET_SET: + emit(OPCODES[op]) + _compile_charset(code, op, av, lower_case) + elif op in LITERAL_SET: + emit(OPCODES[op]) + if op in IGNORE_SET: + emit(lower_case(av)) + else: + emit(av) + elif op in RANGE_SET: + emit(OPCODES[op]) + if op in IGNORE_SET: + emit(lower_case(av[0])) + emit(lower_case(av[1])) + else: + emit(av[0]) + emit(av[1]) else: raise error, "internal: unsupported charset operator" code[skip] = len(code) - skip -def _compile_charset(av, flags, code): +def _compile_charset(code, op, charset, lower_case): emit = code.append for i in range(256 / _subchunk_size): - emit(av & _subchunk_mask) - av >>= _subchunk_size + emit(charset & _subchunk_mask) + charset >>= _subchunk_size -def _compile_bigcharset(av, flags, code): +def _compile_bigcharset(code, op, charset, lower_case): emit = code.append skip = len(code); emit(0) bits, shift = 0, 0 - for index in av[0]: + for index in charset[0]: bits |= index << shift; shift += 8 if shift == _subchunk_size: emit(bits) bits, shift = 0, 0 - for chunk in av[1]: + for chunk in charset[1]: for i in range(256 / _subchunk_size): emit(chunk & _subchunk_mask) chunk >>= _subchunk_size @@ -452,33 +446,13 @@ def _ones(n): return (1 << n) - 1 -NOT_CODES = [ - (BIGCHARSET, NOT_BIGCHARSET), - (CHARSET, NOT_CHARSET), - (DIGIT, NOT_DIGIT), - (LITERAL, NOT_LITERAL), - (RANGE, NOT_RANGE), - (WHITESPACE, NOT_WHITESPACE), - (WORD, NOT_WORD), -] - -def _optimize_in(op, charset, flags): +def _optimize_in(op, charset, lower_case): # internal: optimize character set - if flags & SRE_FLAG_IGNORECASE: - def fix_case(ch): - return _sre.getlower(ch, flags) + if op in IGNORE_SET: + fix_case = lower_case else: def fix_case(ch): return ch - fix_op = {} - if op is IN: - for o, not_o in NOT_CODES: - fix_op[o] = o - fix_op[not_o] = not_o - else: - for o, not_o in NOT_CODES: - fix_op[o] = not_o - fix_op[not_o] = o # consolidate the ranges ranges = [] categories = [] @@ -535,7 +509,11 @@ categories.append((BIGCHARSET, (index_table, [chunk for index, chunk in chunks]))) if len(categories) == 1: o, a = categories[0] - return (fix_op[o], a) + if op in NOT_SET: + o = dict(NOT_CODES)[o] + if op in IGNORE_SET: + o = dict(IGNORE_CODES).get(o, o) + return (o, a) return (op, categories) try: --- Lib/sre_parse.py +++ Lib/sre_parse.py @@ -66,6 +66,8 @@ "u": SRE_FLAG_UNICODE, } +_SCOPED_FLAGS_MASK = SRE_FLAG_IGNORECASE | SRE_FLAG_MULTILINE | SRE_FLAG_DOTALL + class Pattern: # master pattern object. keeps track of global attributes def __init__(self): @@ -89,9 +91,9 @@ def checkgroup(self, gid): return gid < self.groups and gid not in self.open -UNIT_CODES = set([ANY, DIGIT, IN, LITERAL, NOT_DIGIT, - NOT_IN, NOT_LITERAL, NOT_WHITESPACE, NOT_WORD, RANGE, - WHITESPACE, WORD]) +UNIT_CODES = set([ANY, ANY_ALL, DIGIT, IN, IN_IGNORE, LITERAL, LITERAL_IGNORE, + NOT_DIGIT, NOT_IN, NOT_IN_IGNORE, NOT_LITERAL, NOT_LITERAL_IGNORE, NOT_WHITESPACE, + NOT_WORD, RANGE, RANGE_IGNORE, WHITESPACE, WORD]) REPEAT_CODES = set([REPEAT_MIN, REPEAT_MAX, REPEAT_POSS]) @@ -108,12 +110,12 @@ seqtypes = type(()), type([]) for op, av in self.data: print level*" " + op,; nl = 0 - if op == IN: + if op in (IN, IN_IGNORE): # member sublanguage print; nl = 1 for op, a in av: print (level+1)*" " + op, a - elif op == NOT_IN: + elif op in (NOT_IN, NOT_IN_IGNORE): # member sublanguage print; nl = 1 for op, a in av: @@ -276,6 +278,12 @@ code = ESCAPES.get(escape) if code: return code + if state.flags & SRE_FLAG_IGNORECASE: + literal_op = LITERAL_IGNORE + groupref_op = GROUPREF_IGNORE + else: + literal_op = LITERAL + groupref_op = GROUPREF try: c = escape[1:2] if c == "x": @@ -284,12 +292,12 @@ escape = escape + source.get() if len(escape) != 4: raise ValueError - return LITERAL, int(escape[2:], 16) & 0xff + return literal_op, int(escape[2:], 16) & 0xff elif c == "0": # octal escape while source.next in OCTDIGITS and len(escape) < 4: escape = escape + source.get() - return LITERAL, int(escape[1:], 8) & 0xff + return literal_op, int(escape[1:], 8) & 0xff elif c in DIGITS: # octal escape *or* decimal group reference (sigh) if source.next in DIGITS: @@ -298,16 +306,16 @@ source.next in OCTDIGITS): # got three octal digits; this is an octal escape escape = escape + source.get() - return LITERAL, int(escape[1:], 8) & 0xff + return literal_op, int(escape[1:], 8) & 0xff # not an octal escape, so this is a group reference group = int(escape[1:]) if group < state.groups: if not state.checkgroup(group): raise error, "cannot refer to open group" - return GROUPREF, group + return groupref_op, group raise ValueError if len(escape) == 2: - return LITERAL, ord(escape[1]) + return literal_op, ord(escape[1]) except ValueError: pass raise error, "bogus escape: %s" % repr(escape) @@ -388,7 +396,7 @@ PATTERN_ENDERS = set("|)") ASSERT_CHARS = set("=!<") LOOKBEHIND_ASSERT_CHARS = set("=!") -POSITION_CODES = set([BOUNDARY, END_OF_LINE, END_OF_STRING, NOT_BOUNDARY, START_OF_LINE, START_OF_STRING]) +POSITION_CODES = set([BOUNDARY, END_OF_LINE, END_OF_STRING, END_OF_STRING_2, NOT_BOUNDARY, START_OF_LINE, START_OF_STRING]) REPEAT_CODES = set([REPEAT_MIN, REPEAT_MAX, REPEAT_POSS]) QUERY_GROUP = 0 CAPTURE_GROUP = 1 @@ -423,7 +431,10 @@ continue if this and this[0] not in SPECIAL_CHARS: - subpatternappend((LITERAL, ord(this))) + if state.flags & SRE_FLAG_IGNORECASE: + subpatternappend((LITERAL_IGNORE, ord(this))) + else: + subpatternappend((LITERAL, ord(this))) elif this == "[": # character set charset = [] @@ -466,9 +477,15 @@ else: setappend(code1) if negate: - subpatternappend((NOT_IN, charset)) + if state.flags & SRE_FLAG_IGNORECASE: + subpatternappend((NOT_IN_IGNORE, charset)) + else: + subpatternappend((NOT_IN, charset)) else: - subpatternappend((IN, charset)) + if state.flags & SRE_FLAG_IGNORECASE: + subpatternappend((IN_IGNORE, charset)) + else: + subpatternappend((IN, charset)) elif this and this[0] in REPEAT_CHARS: # repeat previous item if this == "?": @@ -516,13 +533,17 @@ else: subpattern[-1] = (REPEAT_MAX, (min, max, item)) elif this == ".": - subpatternappend((ANY, None)) + if state.flags & SRE_FLAG_DOTALL: + subpatternappend((ANY_ALL, None)) + else: + subpatternappend((ANY, None)) elif this == "(": group = CAPTURE_GROUP name = None condgroup = None if sourcematch("?"): group = QUERY_GROUP + scoped_flags = None # options if sourcematch("P"): # python extensions @@ -554,16 +575,16 @@ gid = state.groupdict.get(name) if gid is None: raise error, "unknown group name" - subpatternappend((GROUPREF, gid)) + if state.flags & SRE_FLAG_IGNORECASE: + subpatternappend((GROUPREF_IGNORE, gid)) + else: + subpatternappend((GROUPREF, gid)) continue else: char = sourceget() if char is None: raise error, "unexpected end of pattern" raise error, "unknown specifier: ?P%s" % char - elif sourcematch(":"): - # non-capturing group - group = NONCAPTURE_GROUP elif sourcematch(">"): # atomic group group = ATOMIC_GROUP @@ -585,7 +606,9 @@ raise error, "syntax error" dir = -1 # lookbehind char = sourceget() + saved_flags = state.flags p = _parse_sub(source, state) + state.flags = (state.flags & ~_SCOPED_FLAGS_MASK) | (saved_flags & _SCOPED_FLAGS_MASK) if not sourcematch(")"): raise error, "unbalanced parenthesis" if char == "=": @@ -614,11 +637,32 @@ except ValueError: raise error, "bad character in group name" else: - # flags - if not source.next in FLAGS: + # probably non-capturing group or flags + # might be scoped (set at start of group and local to group) + scoped_flags = state.flags + seen_on, seen_off = 0, 0 + while source.next in FLAGS: + scoped_flags |= FLAGS[sourceget()] + seen_on = 1 + if sourcematch("-"): + while source.next in FLAGS: + if (FLAGS[source.next] & _SCOPED_FLAGS_MASK) == 0: + raise error, "bad pattern flag" + scoped_flags &= ~FLAGS[sourceget()] + seen_off = 1 + if not seen_off: + raise error, "bad pattern flag" + # update just global flags + state.flags |= scoped_flags & ~_SCOPED_FLAGS_MASK + if sourcematch(":"): + # non-capturing group with scoped flags + group = NONCAPTURE_GROUP + elif seen_on or seen_off: + # not start of group, just setting flags + state.flags = scoped_flags + scoped_flags = None + else: raise error, "unexpected end of pattern" - while source.next in FLAGS: - state.flags = state.flags | FLAGS[sourceget()] if group: atomic = group == ATOMIC_GROUP # parse group contents @@ -627,10 +671,14 @@ group = None else: group = state.opengroup(name) + saved_flags = state.flags + if scoped_flags is not None: + state.flags = scoped_flags if condgroup: p = _parse_sub_cond(source, state, condgroup) else: p = _parse_sub(source, state) + state.flags = (state.flags & ~_SCOPED_FLAGS_MASK) | (saved_flags & _SCOPED_FLAGS_MASK) if not sourcematch(")"): raise error, "unbalanced parenthesis" if group is not None: @@ -648,9 +696,15 @@ break raise error, "unknown extension" elif this == "^": - subpatternappend((START_OF_LINE, None)) + if state.flags & SRE_FLAG_MULTILINE: + subpatternappend((START_OF_LINE, None)) + else: + subpatternappend((START_OF_STRING, None)) elif this == "$": - subpattern.append((END_OF_LINE, None)) + if state.flags & SRE_FLAG_MULTILINE: + subpattern.append((END_OF_LINE, None)) + else: + subpattern.append((END_OF_STRING_2, None)) elif this and this[0] == "\\": code = _escape(source, this, state) subpatternappend(code) --- Lib/test/re_tests.py +++ Lib/test/re_tests.py @@ -106,8 +106,8 @@ ('a.*b', 'acc\nccb', FAIL), ('a.{4,5}b', 'acc\nccb', FAIL), ('a.b', 'a\rb', SUCCEED, 'found', 'a\rb'), - ('a.b(?s)', 'a\nb', SUCCEED, 'found', 'a\nb'), - ('a.*(?s)b', 'acc\nccb', SUCCEED, 'found', 'acc\nccb'), + ('(?s)a.b', 'a\nb', SUCCEED, 'found', 'a\nb'), + ('(?s)a.*b', 'acc\nccb', SUCCEED, 'found', 'acc\nccb'), ('(?s)a.{4,5}b', 'acc\nccb', SUCCEED, 'found', 'acc\nccb'), ('(?s)a.b', 'a\nb', SUCCEED, 'found', 'a\nb'), @@ -563,7 +563,7 @@ # Check odd placement of embedded pattern modifiers # not an error under PCRE/PRE: - ('w(?i)', 'W', SUCCEED, 'found', 'W'), + ('(?i)w', 'W', SUCCEED, 'found', 'W'), # ('w(?i)', 'W', SYNTAX_ERROR), # Comments using the x embedded pattern modifier --- Modules/_sre.c +++ Modules/_sre.c @@ -632,6 +632,7 @@ backtrack_item = &context->backtrack_chunk->items[context->backtrack_chunk->count++]; backtrack_item->op = op; backtrack_item->pattern_ptr = pattern_ptr; + backtrack_item->text_start = context->text_start; backtrack_item->text_ptr = context->text_ptr; backtrack_item->index = index; backtrack_item->repeat_counter = context->repeat_counter[index]; @@ -923,6 +924,7 @@ result = SRE_SAVE_MARKS(&context); if (result != 0) return SRE_CLEANUP(&context, state, result); + context.text_start = state->beginning; context.pattern_ptr++; break; case SRE_OP_ASSERT_NOT: @@ -935,6 +937,7 @@ result = SRE_SAVE_MARKS(&context); if (result != 0) return SRE_CLEANUP(&context, state, result); + context.text_start = state->beginning; context.pattern_ptr++; break; case SRE_OP_ATOMIC: @@ -1063,6 +1066,7 @@ TRACE(("|%p|%p|END_ASSERT\n", context.pattern_ptr, context.text_ptr)); SRE_DISCARD_UNTIL_OP(&context, SRE_OP_ASSERT); backtrack_item = &context.backtrack_chunk->items[context.backtrack_chunk->count - 1]; + context.text_start = backtrack_item->text_start; context.text_ptr = backtrack_item->text_ptr; SRE_DISCARD_BACKTRACK(&context); SRE_RESTORE_MARKS(&context); @@ -1072,8 +1076,11 @@ { // Assert not subpattern. // ... + SRE_BACKTRACK_ITEM* backtrack_item; TRACE(("|%p|%p|END_ASSERT_NOT\n", context.pattern_ptr, context.text_ptr)); SRE_DISCARD_UNTIL_OP(&context, SRE_OP_ASSERT_NOT); + backtrack_item = &context.backtrack_chunk->items[context.backtrack_chunk->count - 1]; + context.text_start = backtrack_item->text_start; SRE_DISCARD_BACKTRACK(&context); SRE_RESTORE_MARKS(&context); goto backtrack; @@ -4357,12 +4364,16 @@ TRACE(("|%p|%p|BACKTRACK ", context.pattern_ptr, context.text_ptr)); switch(context.backtrack_chunk->items[context.backtrack_chunk->count - 1].op) { case SRE_OP_ASSERT: + { // Assert subpattern. // ... + SRE_BACKTRACK_ITEM* backtrack_item = &context.backtrack_chunk->items[context.backtrack_chunk->count - 1]; TRACE(("ASSERT\n")); + context.text_start = backtrack_item->text_start; SRE_RESTORE_MARKS(&context); SRE_DISCARD_BACKTRACK(&context); goto backtrack; + } case SRE_OP_ASSERT_NOT: { // Assert not subpattern. @@ -4370,6 +4381,7 @@ SRE_BACKTRACK_ITEM* backtrack_item = &context.backtrack_chunk->items[context.backtrack_chunk->count - 1]; TRACE(("ASSERT_NOT\n")); context.pattern_ptr = backtrack_item->pattern_ptr; + context.text_start = backtrack_item->text_start; context.text_ptr = backtrack_item->text_ptr; SRE_RESTORE_MARKS(&context); SRE_DISCARD_BACKTRACK(&context); @@ -5738,7 +5750,7 @@ else state.end = state.ptr; } else { - if (state.ptr == state.start) + if (state.ptr == state.start) state.start = (void*) ((char*) state.ptr + state.charsize); else state.start = state.ptr; @@ -6562,52 +6574,38 @@ Validates a single-character op. Returns a pointer to the following op if valid or NULL if invalid. */ -SRE_CODE* validate_one_pattern(SRE_CODE* pattern, SRE_CODE* end_ptr, int direction) { +SRE_CODE* validate_one_pattern(SRE_CODE* pattern, SRE_CODE* end_ptr, int* direction) { SRE_OpInfo* info_ptr; if (pattern[0] > SRE_MAX_OP) return NULL; + info_ptr = &op_info[*pattern++]; + if (*direction != 0 && *direction != info_ptr->direction) + return NULL; + switch (info_ptr->type) { case SRE_TYPE_CATEGORY: // - if (direction != 0 && direction != info_ptr->direction) - return NULL; - direction = info_ptr->direction; break; case SRE_TYPE_BIGCHARSET: // bigcharset - if (direction != 0 && direction != info_ptr->direction) - return NULL; - direction = info_ptr->direction; pattern = validate_bigcharset(pattern, end_ptr); if (pattern == NULL) return NULL; break; case SRE_TYPE_CHARSET: // charset - if (direction != 0 && direction != info_ptr->direction) - return NULL; - direction = info_ptr->direction; pattern = validate_charset(pattern, end_ptr); if (pattern == NULL) return NULL; break; case SRE_TYPE_IN: // set - if (direction != 0 && direction != info_ptr->direction) - return NULL; - direction = info_ptr->direction; pattern = validate_in(pattern, end_ptr); if (pattern == NULL) return NULL; break; case SRE_TYPE_LITERAL: // code - if (direction != 0 && direction != info_ptr->direction) - return NULL; - direction = info_ptr->direction; pattern++; break; case SRE_TYPE_RANGE: // min max - if (direction != 0 && direction != info_ptr->direction) - return NULL; - direction = info_ptr->direction; if (pattern[0] > pattern[1]) return NULL; pattern += 2; @@ -6616,7 +6614,12 @@ return NULL; } - return pattern > end_ptr ? NULL : pattern; + if (pattern > end_ptr) + return NULL; + + *direction = info_ptr->direction; + + return pattern; } SRE_CODE* validate_subpattern(SRE_CODE* pattern, SRE_CODE* end_ptr, int* direction, SRE_Validation* validation) { @@ -6630,12 +6633,13 @@ while (pattern < end_ptr) { if (pattern[0] > SRE_MAX_OP) return NULL; + info_ptr = &op_info[*pattern++]; + if (dir != 0 && info_ptr->direction != 0 && dir != info_ptr->direction) + return NULL; + switch (info_ptr->type) { case SRE_TYPE_CATEGORY: // - if (dir != 0 && dir != info_ptr->direction) - return NULL; - dir = info_ptr->direction; break; case SRE_TYPE_ASSERT: // ... skip_ptr = pattern + pattern[0]; @@ -6654,9 +6658,6 @@ pattern = ptr + 1; break; case SRE_TYPE_BIGCHARSET: // bigcharset - if (dir != 0 && dir != info_ptr->direction) - return NULL; - dir = info_ptr->direction; pattern = validate_bigcharset(pattern, end_ptr); if (pattern == NULL) return NULL; @@ -6688,17 +6689,11 @@ pattern++; break; case SRE_TYPE_CHARSET: // charset - if (dir != 0 && dir != info_ptr->direction) - return NULL; - dir = info_ptr->direction; pattern = validate_charset(pattern, end_ptr); if (pattern == NULL) return NULL; break; case SRE_TYPE_GROUPREF: // group_id - if (dir != 0 && dir != info_ptr->direction) - return NULL; - dir = info_ptr->direction; if (validation->max_group_ref < pattern[0]) validation->max_group_ref = pattern[0]; pattern++; @@ -6729,23 +6724,14 @@ pattern = skip_ptr; break; case SRE_TYPE_IN: // set - if (dir != 0 && dir != info_ptr->direction) - return NULL; - dir = info_ptr->direction; pattern = validate_in(pattern, end_ptr); if (pattern == NULL) return NULL; break; case SRE_TYPE_LITERAL: // code - if (dir != 0 && dir != info_ptr->direction) - return NULL; - dir = info_ptr->direction; pattern++; break; case SRE_TYPE_LITERAL_STRING: // length ... - if (dir != 0 && dir != info_ptr->direction) - return NULL; - dir = info_ptr->direction; if (pattern[0] == 0) return NULL; pattern += 1 + pattern[0]; @@ -6757,17 +6743,11 @@ pattern++; break; case SRE_TYPE_RANGE: // min max - if (dir != 0 && dir != info_ptr->direction) - return NULL; - dir = info_ptr->direction; if (pattern[0] > pattern[1]) return NULL; pattern += 2; break; case SRE_TYPE_REPEAT: // ... - if (dir != 0 && dir != info_ptr->direction) - return NULL; - dir = info_ptr->direction; if (pattern[0] < 6) return NULL; skip_ptr = pattern + pattern[0]; @@ -6785,9 +6765,6 @@ pattern = skip_ptr + 1; break; case SRE_TYPE_REPEAT_ONE: // ... - if (dir != 0 && dir != info_ptr->direction) - return NULL; - dir = info_ptr->direction; if (pattern[0] < 5) return NULL; skip_ptr = pattern + pattern[0]; @@ -6798,7 +6775,7 @@ validation->max_repeat = pattern[1]; if (pattern[2] > pattern[3]) return NULL; - if (validate_one_pattern(pattern + 4, skip_ptr, dir) != skip_ptr) + if (validate_one_pattern(pattern + 4, skip_ptr, &dir) != skip_ptr) return NULL; pattern = skip_ptr; break; @@ -6807,9 +6784,13 @@ *direction = dir; return pattern - 1; } + + if (info_ptr->direction != 0) + dir = info_ptr->direction; } *direction = dir; + return pattern > end_ptr ? NULL : pattern; } --- Modules/sre.h +++ Modules/sre.h @@ -56,19 +56,13 @@ /* FIXME: shouldn't be a constant, really... */ #define SRE_MARK_SIZE 200 -typedef struct SRE_REPEAT_T { - Py_ssize_t count; - SRE_CODE* pattern; /* points to REPEAT operator arguments */ - void* last_ptr; /* helper to check for infinite loops */ - struct SRE_REPEAT_T *prev; /* points to previous repeat context */ -} SRE_REPEAT; - #define SRE_BACKTRACK_CHUNK_SIZE 1024 #define SRE_SAVED_MARKS_CHUNK_SIZE 1024 typedef struct SRE_BACKTRACK_ITEM { int op; SRE_CODE* pattern_ptr; + void* text_start; void* text_ptr; int index; int repeat_counter; @@ -98,7 +92,7 @@ Py_ssize_t pos, endpos; /* character size */ int charsize; - int reverse; + int reverse; /* registers */ Py_ssize_t lastindex; Py_ssize_t lastmark;