=== modified file Lib/sre_constants.py --- Lib/sre_constants.py 2004-08-25 02:22:30 +0000 +++ Lib/sre_constants.py 2009-03-04 15:26:20 +0000 @@ -13,11 +13,22 @@ # update when constants are added or removed -MAGIC = 20031017 - -# max code word in this release - -MAXREPEAT = 65535 +MAGIC = 20081218 + +import operator +import unicodedata +from collections import defaultdict + +# size of code word in this release +BYTES_PER_CODE = 4 +BITS_PER_CODE = 8 * BYTES_PER_CODE +MAXCODE = (1 << BITS_PER_CODE) - 1 + +MAXREPEAT = MAXCODE + +DIGITS = set("0123456789") +OCTDIGITS = set("01234567") +HEXDIGITS = set("0123456789abcdefABCDEF") # SRE standard exception (access as sre.error) # should this really be here? @@ -25,181 +36,175 @@ class error(Exception): pass -# operators - -FAILURE = "failure" -SUCCESS = "success" - -ANY = "any" -ANY_ALL = "any_all" -ASSERT = "assert" -ASSERT_NOT = "assert_not" -AT = "at" -BIGCHARSET = "bigcharset" -BRANCH = "branch" -CALL = "call" -CATEGORY = "category" -CHARSET = "charset" -GROUPREF = "groupref" -GROUPREF_IGNORE = "groupref_ignore" -GROUPREF_EXISTS = "groupref_exists" -IN = "in" -IN_IGNORE = "in_ignore" -INFO = "info" -JUMP = "jump" -LITERAL = "literal" -LITERAL_IGNORE = "literal_ignore" -MARK = "mark" -MAX_REPEAT = "max_repeat" -MAX_UNTIL = "max_until" -MIN_REPEAT = "min_repeat" -MIN_UNTIL = "min_until" -NEGATE = "negate" -NOT_LITERAL = "not_literal" -NOT_LITERAL_IGNORE = "not_literal_ignore" -RANGE = "range" -REPEAT = "repeat" -REPEAT_ONE = "repeat_one" -SUBPATTERN = "subpattern" -MIN_REPEAT_ONE = "min_repeat_one" - -# positions -AT_BEGINNING = "at_beginning" -AT_BEGINNING_LINE = "at_beginning_line" -AT_BEGINNING_STRING = "at_beginning_string" -AT_BOUNDARY = "at_boundary" -AT_NON_BOUNDARY = "at_non_boundary" -AT_END = "at_end" -AT_END_LINE = "at_end_line" -AT_END_STRING = "at_end_string" -AT_LOC_BOUNDARY = "at_loc_boundary" -AT_LOC_NON_BOUNDARY = "at_loc_non_boundary" -AT_UNI_BOUNDARY = "at_uni_boundary" -AT_UNI_NON_BOUNDARY = 
"at_uni_non_boundary" - -# categories -CATEGORY_DIGIT = "category_digit" -CATEGORY_NOT_DIGIT = "category_not_digit" -CATEGORY_SPACE = "category_space" -CATEGORY_NOT_SPACE = "category_not_space" -CATEGORY_WORD = "category_word" -CATEGORY_NOT_WORD = "category_not_word" -CATEGORY_LINEBREAK = "category_linebreak" -CATEGORY_NOT_LINEBREAK = "category_not_linebreak" -CATEGORY_LOC_WORD = "category_loc_word" -CATEGORY_LOC_NOT_WORD = "category_loc_not_word" -CATEGORY_UNI_DIGIT = "category_uni_digit" -CATEGORY_UNI_NOT_DIGIT = "category_uni_not_digit" -CATEGORY_UNI_SPACE = "category_uni_space" -CATEGORY_UNI_NOT_SPACE = "category_uni_not_space" -CATEGORY_UNI_WORD = "category_uni_word" -CATEGORY_UNI_NOT_WORD = "category_uni_not_word" -CATEGORY_UNI_LINEBREAK = "category_uni_linebreak" -CATEGORY_UNI_NOT_LINEBREAK = "category_uni_not_linebreak" - -OPCODES = [ - - # failure=0 success=1 (just because it looks better that way :-) - FAILURE, SUCCESS, - - ANY, ANY_ALL, - ASSERT, ASSERT_NOT, - AT, - BRANCH, - CALL, - CATEGORY, - CHARSET, BIGCHARSET, - GROUPREF, GROUPREF_EXISTS, GROUPREF_IGNORE, - IN, IN_IGNORE, - INFO, - JUMP, - LITERAL, LITERAL_IGNORE, - MARK, - MAX_UNTIL, - MIN_UNTIL, - NOT_LITERAL, NOT_LITERAL_IGNORE, - NEGATE, - RANGE, - REPEAT, - REPEAT_ONE, - SUBPATTERN, - MIN_REPEAT_ONE - -] - -ATCODES = [ - AT_BEGINNING, AT_BEGINNING_LINE, AT_BEGINNING_STRING, AT_BOUNDARY, - AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING, - AT_LOC_BOUNDARY, AT_LOC_NON_BOUNDARY, AT_UNI_BOUNDARY, - AT_UNI_NON_BOUNDARY -] - -CHCODES = [ - CATEGORY_DIGIT, CATEGORY_NOT_DIGIT, CATEGORY_SPACE, - CATEGORY_NOT_SPACE, CATEGORY_WORD, CATEGORY_NOT_WORD, - CATEGORY_LINEBREAK, CATEGORY_NOT_LINEBREAK, CATEGORY_LOC_WORD, - CATEGORY_LOC_NOT_WORD, CATEGORY_UNI_DIGIT, CATEGORY_UNI_NOT_DIGIT, - CATEGORY_UNI_SPACE, CATEGORY_UNI_NOT_SPACE, CATEGORY_UNI_WORD, - CATEGORY_UNI_NOT_WORD, CATEGORY_UNI_LINEBREAK, - CATEGORY_UNI_NOT_LINEBREAK -] - -def makedict(list): - d = {} - i = 0 - for item in list: - d[item] = i - 
i = i + 1 - return d - -OPCODES = makedict(OPCODES) -ATCODES = makedict(ATCODES) -CHCODES = makedict(CHCODES) - -# replacement operations for "ignore case" mode -OP_IGNORE = { - GROUPREF: GROUPREF_IGNORE, - IN: IN_IGNORE, - LITERAL: LITERAL_IGNORE, - NOT_LITERAL: NOT_LITERAL_IGNORE -} - -AT_MULTILINE = { - AT_BEGINNING: AT_BEGINNING_LINE, - AT_END: AT_END_LINE -} - -AT_LOCALE = { - AT_BOUNDARY: AT_LOC_BOUNDARY, - AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY -} - -AT_UNICODE = { - AT_BOUNDARY: AT_UNI_BOUNDARY, - AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY -} - -CH_LOCALE = { - CATEGORY_DIGIT: CATEGORY_DIGIT, - CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT, - CATEGORY_SPACE: CATEGORY_SPACE, - CATEGORY_NOT_SPACE: CATEGORY_NOT_SPACE, - CATEGORY_WORD: CATEGORY_LOC_WORD, - CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD, - CATEGORY_LINEBREAK: CATEGORY_LINEBREAK, - CATEGORY_NOT_LINEBREAK: CATEGORY_NOT_LINEBREAK -} - -CH_UNICODE = { - CATEGORY_DIGIT: CATEGORY_UNI_DIGIT, - CATEGORY_NOT_DIGIT: CATEGORY_UNI_NOT_DIGIT, - CATEGORY_SPACE: CATEGORY_UNI_SPACE, - CATEGORY_NOT_SPACE: CATEGORY_UNI_NOT_SPACE, - CATEGORY_WORD: CATEGORY_UNI_WORD, - CATEGORY_NOT_WORD: CATEGORY_UNI_NOT_WORD, - CATEGORY_LINEBREAK: CATEGORY_UNI_LINEBREAK, - CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK -} +# list of all the operators +# the fields are: name, op_type, negative, directional, end_marker +# those with a negative form start with NOT_ +# those with a reverse directional form end with _REV +_OPERATOR_LIST = """ +FAILURE INVALID N N - +SUCCESS INVALID N N - +ANY SIMPLE_CATEGORY N Y - +ANY_ALL SIMPLE_CATEGORY N Y - +ASSERT ASSERT N N END_ASSERT +ASSERT_NOT ASSERT N N END_ASSERT_NOT +ATOMIC ATOMIC N N END_ATOMIC +BOUNDARY POSITION Y N - +BRANCH BRANCH N N - +CATEGORY CATEGORY Y Y - +CHARSET CHARSET Y Y - +CHARSET_IGNORE CHARSET Y Y - +END_OF_LINE POSITION N N - +END_OF_STRING POSITION N N - +END_OF_STRING_LN POSITION N N - +GROUPREF GROUPREF N Y - +GROUPREF_EXISTS GROUPREF_EXISTS N N - +GROUPREF_IGNORE GROUPREF N Y - +JUMP 
INVALID N N - +LITERAL LITERAL Y Y - +LITERAL_IGNORE LITERAL Y Y - +LITERAL_STRING LITERAL_STRING N Y - +LITERAL_STRING_IGNORE LITERAL_STRING N Y - +MARK MARK N N - +RANGE RANGE Y Y - +RANGE_IGNORE RANGE Y Y - +REPEAT_MAX REPEAT N Y END_REPEAT_MAX +REPEAT_MIN REPEAT N Y END_REPEAT_MIN +REPEAT_ONE_MAX REPEAT_ONE N Y - +REPEAT_ONE_MIN REPEAT_ONE N Y - +REPEAT_ONE_POSS REPEAT_ONE N Y - +REPEAT_POSS REPEAT N Y END_REPEAT_POSS +SET SET Y Y - +SET_IGNORE SET Y Y - +START_OF_LINE POSITION N N - +START_OF_SEARCH POSITION N N - +START_OF_STRING POSITION N N - +SUBPATTERN INVALID N N - +""" + +# expands the given table into (name, op_type, direction, end_marker) tuples +def _build_operator_list(OPERATOR_LIST): + neg_prefix = {"N": [""], "Y": ["", "NOT_"]} + dir_suffix = {"N": [(0, "")], "Y": [(1, ""), (-1, "_REV")]} + + operators = [] + for line in OPERATOR_LIST.splitlines(): + fields = line.split() + if not fields: + continue + + name, op_type, negative, directional, end_marker = fields + # some opcodes have a negative "NOT_x" form + for n in neg_prefix[negative]: + # some opcodes are directional; they have a reverse "x_REV" form + for d, r in dir_suffix[directional]: + operators.append((n + name + r, op_type, d, end_marker)) + if end_marker != "-": + operators.append((n + end_marker + r, "INVALID", d, "-")) + + return operators + +_operator_list = _build_operator_list(_OPERATOR_LIST) + +# builds a dict mapping each NOT_x opcode name to x and each x back to NOT_x +def _build_not_opcodes(operator_list): + not_opcodes = {} + for name, op_type, direction, end_marker in operator_list: + if name.startswith("NOT_"): + short_name = name[4 : ] + not_opcodes[name] = short_name + not_opcodes[short_name] = name + return not_opcodes + +_not_opcodes = _build_not_opcodes(_operator_list) + +# converts between positive/negative opcodes +def not_op(op): + return _not_opcodes[op[0]], op[1] + +# builds a dict of normal_case<->ignore_case opcodes +def _build_ignore_opcodes(operator_list): + ignore_opcodes = {} + for name, op_type, direction, end_marker in operator_list:
+ if name.endswith("_IGNORE"): + short_name = name[ : -7] + ignore_opcodes[name] = short_name + ignore_opcodes[short_name] = name + for op in ["CATEGORY", "NOT_CATEGORY"]: + ignore_opcodes[op] = op + return ignore_opcodes + +_ignore_opcodes = _build_ignore_opcodes(_operator_list) + +# converts between normal_case/ignore_case opcodes +def ignore_op(op): + return _ignore_opcodes[op[0]], op[1] + +# sorts the operators and assigns opcode numbers +def _sorted_operators(operator_list): + # FAILURE and SUCCESS are always first and second + sorted_operators = operator_list[ : 2] + sorted(operator_list[2 : ]) + + return [(name, number, op_type, direction, end_marker) for number, + (name, op_type, direction, end_marker) in enumerate(sorted_operators)] + +_operator_list = _sorted_operators(_operator_list) + +# build the OPCODES dict +OPCODES = dict((name, number) for name, number, op_type, direction, + end_marker in _operator_list) + +# collect the op_types +_op_types = set(op_type for name, number, op_type, direction, + end_marker in _operator_list) + +# create an attribute in OP for each operator +class _Record(object): + pass + +OP = _Record() +for _name in OPCODES: + setattr(OP, _name, _name) + +# unicode codepoint categories (property "\p{Lu}", etc) +# (these entries must have certain fixed values) +_UNI_CATEGORY_LIST = """- Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs + Co - Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So -""" + +# additional unicode categories (property "\p{Alpha}", etc) +_COMMON_CATEGORY_LIST = """Alpha Alnum ASCII Blank Cntrl Digit Graph LineBreak + Lower Print Punct Space Upper Word XDigit""" + +# builds the categories dict +def _build_categories(UNI_CATEGORY_LIST, COMMON_CATEGORY_LIST): + category_number = 0 + categories = {} + all_categories = 0 + for name in UNI_CATEGORY_LIST.split(): + if name != "-": + categories[name] = category_number + all_categories |= 1 << category_number + category_number += 1 + + assert category_number <= 0x20 + + category_number 
= 0x20 + + # add the unicode supercategories (properties "\p{L}", "\p{L&}", etc) + for name in UNI_CATEGORY_LIST.split(): + if name != "-" and name[0] not in categories: + categories[name[0]] = category_number + categories[name[0] + "&"] = category_number + category_number += 1 + + common_category_start = category_number + for name in COMMON_CATEGORY_LIST.split(): + categories[name] = category_number + category_number += 1 + + return categories, common_category_start, all_categories + +CATEGORIES, COMMON_CATEGORY_START, _ALL_CAT = _build_categories( + _UNI_CATEGORY_LIST, _COMMON_CATEGORY_LIST) # flags SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking) @@ -210,6 +215,8 @@ SRE_FLAG_UNICODE = 32 # use unicode locale SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments SRE_FLAG_DEBUG = 128 # debugging +SRE_FLAG_REVERSE = 256 # search backwards +SRE_FLAG_ZEROWIDTH = 512 # permit split on zero-width # flags for INFO primitive SRE_INFO_PREFIX = 1 # has prefix @@ -217,12 +224,10 @@ SRE_INFO_CHARSET = 4 # pattern starts with character from given set if __name__ == "__main__": - def dump(f, d, prefix): - items = d.items() - items.sort(key=lambda a: a[1]) - for k, v in items: - f.write("#define %s_%s %s\n" % (prefix, k.upper(), v)) - f = open("sre_constants.h", "w") + # generate the sre_constants.h header file + f = open("sre_constants.h", "wb") + + # the title comment f.write("""\ /* * Secret Labs' Regular Expression Engine @@ -239,23 +244,121 @@ """) + # the magic value f.write("#define SRE_MAGIC %d\n" % MAGIC) - dump(f, OPCODES, "SRE_OP") - dump(f, ATCODES, "SRE") - dump(f, CHCODES, "SRE") - - f.write("#define SRE_FLAG_TEMPLATE %d\n" % SRE_FLAG_TEMPLATE) - f.write("#define SRE_FLAG_IGNORECASE %d\n" % SRE_FLAG_IGNORECASE) - f.write("#define SRE_FLAG_LOCALE %d\n" % SRE_FLAG_LOCALE) - f.write("#define SRE_FLAG_MULTILINE %d\n" % SRE_FLAG_MULTILINE) - f.write("#define SRE_FLAG_DOTALL %d\n" % SRE_FLAG_DOTALL) - f.write("#define SRE_FLAG_UNICODE %d\n" % 
SRE_FLAG_UNICODE) - f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE) - - f.write("#define SRE_INFO_PREFIX %d\n" % SRE_INFO_PREFIX) - f.write("#define SRE_INFO_LITERAL %d\n" % SRE_INFO_LITERAL) - f.write("#define SRE_INFO_CHARSET %d\n" % SRE_INFO_CHARSET) + # the codeword definition + f.write("\n") + f.write("/* size of a code word (must be unsigned short or larger, and\n") + f.write(" large enough to hold a Py_UNICODE character) */\n") + if BYTES_PER_CODE == 4: + f.write("typedef unsigned int SRE_CODE;\n") + else: + f.write("typedef unsigned short SRE_CODE;\n") + + # the codeword size + f.write("\n") + f.write("#define SRE_BYTES_PER_CODE %d\n" % BYTES_PER_CODE) + f.write("#define SRE_BITS_PER_CODE %d\n" % BITS_PER_CODE) + + # the constant for unlimited repeats + f.write("#define SRE_UNLIMITED_REPEATS 0x%X\n" % MAXREPEAT) + + # the opcodes + f.write("\n") + for name, number, op_type, direction, end_marker in _operator_list: + f.write("#define SRE_OP_%s %d\n" % (name, number)) + f.write("#define SRE_MAX_OP %d\n" % (len(_operator_list) - 1)) + + # the regex flags + f.write("\n") + f.write("#define SRE_FLAG_TEMPLATE 0x%X\n" % SRE_FLAG_TEMPLATE) + f.write("#define SRE_FLAG_IGNORECASE 0x%X\n" % SRE_FLAG_IGNORECASE) + f.write("#define SRE_FLAG_LOCALE 0x%X\n" % SRE_FLAG_LOCALE) + f.write("#define SRE_FLAG_MULTILINE 0x%X\n" % SRE_FLAG_MULTILINE) + f.write("#define SRE_FLAG_DOTALL 0x%X\n" % SRE_FLAG_DOTALL) + f.write("#define SRE_FLAG_UNICODE 0x%X\n" % SRE_FLAG_UNICODE) + f.write("#define SRE_FLAG_VERBOSE 0x%X\n" % SRE_FLAG_VERBOSE) + f.write("#define SRE_FLAG_REVERSE 0x%X\n" % SRE_FLAG_REVERSE) + f.write("#define SRE_FLAG_ZEROWIDTH 0x%X\n" % SRE_FLAG_ZEROWIDTH) + + # the info constants + f.write("\n") + f.write("#define SRE_INFO_PREFIX 0x%X\n" % SRE_INFO_PREFIX) + f.write("#define SRE_INFO_LITERAL 0x%X\n" % SRE_INFO_LITERAL) + f.write("#define SRE_INFO_CHARSET 0x%X\n" % SRE_INFO_CHARSET) + + # the unicode categories and supercategories + f.write("\n") + 
categories = sorted(CATEGORIES.items(), key=operator.itemgetter(1)) + for name, value in categories: + # include "L" but exclude "L&" when making the names + if value < COMMON_CATEGORY_START and name.isalnum(): + f.write("#define SRE_UNI_CAT_%s 0x%X\n" % (name, value)) + + # the common categories + f.write("\n") + for name, value in categories: + if value >= COMMON_CATEGORY_START: + f.write("#define SRE_CAT_%s 0x%X\n" % (name, value)) + + # build the supercategories ("L&") + f.write("\n") + groups = defaultdict(int) + for name, value in categories: + if value < COMMON_CATEGORY_START and len(name) == 2 and name.isalpha(): + groups[name[ : 1]] |= 1 << value + + # the supercategories + for name, value in sorted(groups.items()): + f.write("#define SRE_CAT_MASK_%s 0x%08X\n" % (name, value)) + + f.write("\n") + f.write("#define SRE_CAT_MASK_Alnum 0x%08X\n" % (groups["L"] | + (1 << CATEGORIES["Nd"]))) + f.write("#define SRE_CAT_MASK_Alpha 0x%08X\n" % groups["L"]) + f.write("#define SRE_CAT_MASK_Graph 0x%08X\n" % ((groups["Z"] | groups["C"]) + ^ _ALL_CAT)) + f.write("#define SRE_CAT_MASK_Print 0x%08X\n" % (groups["C"] ^ _ALL_CAT)) + f.write("#define SRE_CAT_MASK_Punct 0x%08X\n" % (groups["P"] | groups["S"])) + f.write("#define SRE_CAT_MASK_Word 0x%08X\n" % (groups["L"] | groups["N"] | + groups["M"] | (1 << CATEGORIES["Pc"]))) + + # the opcode type info + f.write(""" +// info for operator validation +typedef struct SRE_OpInfo { + char* name; + int type; + int direction; + int end_marker; +} SRE_OpInfo; + +""") + + # sort the op_types (putting "INVALID" first) and assign numbers + _op_types = sorted(_op_types, key=lambda name: + ("" if name == "INVALID" else name)) + _op_types = [(name, number) for number, name in enumerate(_op_types)] + for name, number in _op_types: + f.write("#define SRE_TYPE_%s %d\n" % (name, number)) + + # the opcode type info + _op_types = dict(_op_types) + f.write(""" +static SRE_OpInfo sre_op_info[] = { +""") + for name, number, op_type, direction, 
end_marker in _operator_list: + if end_marker == "-": + end_marker = "0" + else: + end_marker = "SRE_OP_%s" % end_marker + f.write(" {\"%s\", %s, %s, %s},\n" % (name, _op_types[op_type], + direction, end_marker)) + f.write("};\n") f.close() print "done" +else: + # make all the names lowercase so we can be case-insensitive when parsing + CATEGORIES = dict((n.lower(), v) for n, v in CATEGORIES.items()) === modified file Lib/sre_compile.py --- Lib/sre_compile.py 2008-10-14 22:37:18 +0000 +++ Lib/sre_compile.py 2009-03-06 18:05:22 +0000 @@ -11,275 +11,286 @@ """Internal support module for sre""" import _sre, sys -import sre_parse + from sre_constants import * assert _sre.MAGIC == MAGIC, "SRE module mismatch" -if _sre.CODESIZE == 2: - MAXCODE = 65535 -else: - MAXCODE = 0xFFFFFFFFL - -def _identityfunction(x): - return x - -_LITERAL_CODES = set([LITERAL, NOT_LITERAL]) -_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT]) -_SUCCESS_CODES = set([SUCCESS, FAILURE]) -_ASSERT_CODES = set([ASSERT, ASSERT_NOT]) - -def _compile(code, pattern, flags): +ASSERT_OP_CODES = { + OP.ASSERT: OP.END_ASSERT, + OP.ASSERT_NOT: OP.END_ASSERT_NOT, +} + +REPEAT_OP_CODES = { + OP.REPEAT_MAX: OP.END_REPEAT_MAX, + OP.REPEAT_MIN: OP.END_REPEAT_MIN, + OP.REPEAT_POSS: OP.END_REPEAT_POSS, +} + +SINGLE_CHAR_OP_CODES = set([ + OP.ANY, OP.ANY_ALL, + OP.CATEGORY, OP.NOT_CATEGORY, + OP.CHARSET, OP.CHARSET_IGNORE, OP.NOT_CHARSET, OP.NOT_CHARSET_IGNORE, + OP.LITERAL, OP.LITERAL_IGNORE, OP.NOT_LITERAL, OP.NOT_LITERAL_IGNORE, + OP.RANGE, OP.RANGE_IGNORE, + OP.SET, OP.NOT_SET, +]) + +NORMAL_OP_CODES, REVERSE_OP_CODES = {}, {} +for op in dir(OP): + if not op.startswith("_"): + NORMAL_OP_CODES[op] = op + if op.endswith("_REV"): + REVERSE_OP_CODES[op[ : -4]] = op + else: + REVERSE_OP_CODES.setdefault(op, op) + +REPEAT_ONE_OP_CODES = { + OP.REPEAT_MAX: OP.REPEAT_ONE_MAX, + OP.REPEAT_MIN: OP.REPEAT_ONE_MIN, + OP.REPEAT_POSS: OP.REPEAT_ONE_POSS, +} + +CATEGORY_OP_SET = set([OP.CATEGORY, OP.NOT_CATEGORY]) 
+CHARSET_OP_SET = set([OP.CHARSET, OP.CHARSET_IGNORE, OP.NOT_CHARSET, + OP.NOT_CHARSET_IGNORE]) +GROUPREF_OP_SET = set([OP.GROUPREF, OP.GROUPREF_IGNORE]) +LITERAL_OP_SET = set([OP.LITERAL, OP.LITERAL_IGNORE, OP.NOT_LITERAL, + OP.NOT_LITERAL_IGNORE]) +POSITION_OP_SET = set([OP.BOUNDARY, OP.END_OF_LINE, OP.END_OF_STRING, + OP.END_OF_STRING_LN, OP.NOT_BOUNDARY, OP.START_OF_LINE, OP.START_OF_SEARCH, + OP.START_OF_STRING]) +RANGE_OP_SET = set([OP.NOT_RANGE, OP.NOT_RANGE_IGNORE, OP.RANGE, + OP.RANGE_IGNORE]) +REPEAT_OP_SET = set([OP.REPEAT_MAX, OP.REPEAT_MIN, OP.REPEAT_POSS]) +SET_OP_SET = set([OP.SET, OP.SET_IGNORE, OP.NOT_SET, OP.NOT_SET_IGNORE]) +SIMPLE_CATEGORY_OP_SET = set([OP.ANY, OP.ANY_ALL]) + +def _compile(code, pattern, flags, info, dir=1): # internal: compile a (sub)pattern emit = code.append - _len = len - LITERAL_CODES = _LITERAL_CODES - REPEATING_CODES = _REPEATING_CODES - SUCCESS_CODES = _SUCCESS_CODES - ASSERT_CODES = _ASSERT_CODES + literal_op, literal_string = None, [] + if dir < 0: + fix_direction = REVERSE_OP_CODES + else: + fix_direction = NORMAL_OP_CODES + if dir < 0: + # Within lookbehind, so reverse the order of the matching + pattern = reversed(pattern) + def flush_literal(): + if literal_string: + emit_literal_string(code, literal_op, literal_string[ : : dir], + fix_direction) for op, av in pattern: - if op in LITERAL_CODES: - if flags & SRE_FLAG_IGNORECASE: - emit(OPCODES[OP_IGNORE[op]]) - emit(_sre.getlower(av, flags)) + if op in SET_OP_SET: + op, av = _optimize_set(op, av, flags) + if op == literal_op: + literal_string.append(av) + else: + flush_literal() + if op in (OP.LITERAL, OP.LITERAL_IGNORE): + literal_op, literal_string = op, [av] else: - emit(OPCODES[op]) - emit(av) - elif op is IN: - if flags & SRE_FLAG_IGNORECASE: - emit(OPCODES[OP_IGNORE[op]]) - def fixup(literal, flags=flags): - return _sre.getlower(literal, flags) - else: - emit(OPCODES[op]) - fixup = _identityfunction - skip = _len(code); emit(0) - _compile_charset(av, flags, 
code, fixup) - code[skip] = _len(code) - skip - elif op is ANY: - if flags & SRE_FLAG_DOTALL: - emit(OPCODES[ANY_ALL]) - else: - emit(OPCODES[ANY]) - elif op in REPEATING_CODES: - if flags & SRE_FLAG_TEMPLATE: - raise error, "internal: unsupported template operator" - emit(OPCODES[REPEAT]) - skip = _len(code); emit(0) - emit(av[0]) - emit(av[1]) - _compile(code, av[2], flags) - emit(OPCODES[SUCCESS]) - code[skip] = _len(code) - skip - elif _simple(av) and op is not REPEAT: - if op is MAX_REPEAT: - emit(OPCODES[REPEAT_ONE]) + literal_op, literal_string = None, [] + if op in ASSERT_OP_CODES: + # ... + emit(OPCODES[op]) + skip = len(code); emit(0) + _compile(code, av[1], flags, info, av[0]) + emit(OPCODES[ASSERT_OP_CODES[op]]) + code[skip] = len(code) - skip + elif op == OP.ATOMIC: + # ... + emit(OPCODES[OP.ATOMIC]) + _compile(code, av[1], flags, info, dir) + emit(OPCODES[OP.END_ATOMIC]) + elif op == OP.BRANCH: + # + # + # ... + # + # + # ... + # + # 0 + emit(OPCODES[op]) + tail = [] + tailappend = tail.append + for av in av[1]: + skip = len(code); emit(0) + _compile(code, av, flags, info, dir) + emit(OPCODES[OP.JUMP]) + tailappend(len(code)); emit(0) + code[skip] = len(code) - skip + emit(0) # end of branchs + for tail in tail: + code[tail] = len(code) - tail + elif op in CATEGORY_OP_SET: + # category + emit(OPCODES[fix_direction[op]]) + emit(av) + elif op in CHARSET_OP_SET: + # skip charset + emit(OPCODES[fix_direction[op]]) + skip = len(code); emit(0) + _compile_charset(code, av) + code[skip] = len(code) - skip + elif op in GROUPREF_OP_SET: + # group_id + emit(OPCODES[fix_direction[op]]) + emit(av - 1) + elif op == OP.GROUPREF_EXISTS: + # group_id + # + # code_yes + # + # code_no + emit(OPCODES[op]) + emit(av[0] - 1) + skipyes = len(code); emit(0) + _compile(code, av[1], flags, info, dir) + if av[2]: + emit(OPCODES[OP.JUMP]) + skipno = len(code); emit(0) + code[skipyes] = len(code) - skipyes + 1 + _compile(code, av[2], flags, info, dir) + code[skipno] = len(code) - 
skipno + else: + code[skipyes] = len(code) - skipyes + 1 + elif op in LITERAL_OP_SET: + # code + emit(OPCODES[fix_direction[op]]) + emit(av) + elif op in POSITION_OP_SET: + # + emit(OPCODES[op]) + elif op in RANGE_OP_SET: + # min max + emit(OPCODES[fix_direction[op]]) + emit(av[0]) + emit(av[1]) + elif op in REPEAT_OP_SET: + if flags & SRE_FLAG_TEMPLATE: + raise error("internal: unsupported template operator") + else: + single = get_single_character(av[2]) + if single: + # ... + emit(OPCODES[fix_direction[REPEAT_ONE_OP_CODES[op]]]) + skip = len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, single, flags, info, dir) + code[skip] = len(code) - skip + else: + # + # ... + # + emit(OPCODES[fix_direction[op]]) + skip = len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags, info, dir) + emit(OPCODES[fix_direction[REPEAT_OP_CODES[op]]]) + offset = len(code) - skip + code[skip] = offset + emit(offset) + elif op in SET_OP_SET: + # set + emit(OPCODES[fix_direction[op]]) + _compile_set(code, av) + elif op in SIMPLE_CATEGORY_OP_SET: + # + emit(OPCODES[fix_direction[op]]) + elif op == OP.SUBPATTERN: + if av[0]: + number_id, name_id = av[0] + info.group_count += 1 + number_start_mark, number_end_mark = (number_id * 2 - 2, + number_id * 2 - 1) + name_start_mark, name_end_mark = (name_id * 2 - 2, + name_id * 2 - 1) + if dir < 0: + number_start_mark, number_end_mark = (number_end_mark, + number_start_mark) + name_start_mark, name_end_mark = (name_end_mark, + name_start_mark) + # + emit(OPCODES[OP.MARK]) + emit(number_start_mark) + emit(name_start_mark) + _compile(code, av[1], flags, info, dir) + if av[0]: + # + emit(OPCODES[OP.MARK]) + emit(number_end_mark) + emit(name_end_mark) else: - emit(OPCODES[MIN_REPEAT_ONE]) - skip = _len(code); emit(0) - emit(av[0]) - emit(av[1]) - _compile(code, av[2], flags) - emit(OPCODES[SUCCESS]) - code[skip] = _len(code) - skip - else: - emit(OPCODES[REPEAT]) - skip = _len(code); emit(0) - emit(av[0]) - emit(av[1]) 
- _compile(code, av[2], flags) - code[skip] = _len(code) - skip - if op is MAX_REPEAT: - emit(OPCODES[MAX_UNTIL]) - else: - emit(OPCODES[MIN_UNTIL]) - elif op is SUBPATTERN: - if av[0]: - emit(OPCODES[MARK]) - emit((av[0]-1)*2) - # _compile_info(code, av[1], flags) - _compile(code, av[1], flags) - if av[0]: - emit(OPCODES[MARK]) - emit((av[0]-1)*2+1) - elif op in SUCCESS_CODES: + raise ValueError("unsupported operand type: %s" % op) + flush_literal() + +def emit_literal_string(code, literal_op, literal_string, fix_direction): + emit = code.append + if len(literal_string) > 1: + # a string + if literal_op == OP.LITERAL_IGNORE: + # length ... + emit(OPCODES[fix_direction[OP.LITERAL_STRING_IGNORE]]) + else: + # length ... + emit(OPCODES[fix_direction[OP.LITERAL_STRING]]) + emit(len(literal_string)) + code.extend(literal_string) + else: + # code + # a single character + emit(OPCODES[fix_direction[literal_op]]) + emit(literal_string[0]) + +def get_single_character(pattern): + if len(pattern) == 1 and pattern[0][0] in SINGLE_CHAR_OP_CODES: + return pattern + return None + +def _compile_set(code, charset): + emit = code.append + skip_set = len(code); emit(0) + for op, av in charset: + if op in CHARSET_OP_SET: + # skip charset emit(OPCODES[op]) - elif op in ASSERT_CODES: + skip = len(code); emit(0) + _compile_charset(code, av) + code[skip] = len(code) - skip + elif op in CATEGORY_OP_SET: + # category emit(OPCODES[op]) - skip = _len(code); emit(0) - if av[0] >= 0: - emit(0) # look ahead - else: - lo, hi = av[1].getwidth() - if lo != hi: - raise error, "look-behind requires fixed-width pattern" - emit(lo) # look behind - _compile(code, av[1], flags) - emit(OPCODES[SUCCESS]) - code[skip] = _len(code) - skip - elif op is CALL: + emit(av) + elif op == OP.LITERAL: + # code emit(OPCODES[op]) - skip = _len(code); emit(0) - _compile(code, av, flags) - emit(OPCODES[SUCCESS]) - code[skip] = _len(code) - skip - elif op is AT: + emit(av) + elif op == OP.RANGE: + # min max 
emit(OPCODES[op]) - if flags & SRE_FLAG_MULTILINE: - av = AT_MULTILINE.get(av, av) - if flags & SRE_FLAG_LOCALE: - av = AT_LOCALE.get(av, av) - elif flags & SRE_FLAG_UNICODE: - av = AT_UNICODE.get(av, av) - emit(ATCODES[av]) - elif op is BRANCH: + emit(av[0]) + emit(av[1]) + elif op in SIMPLE_CATEGORY_OP_SET: + # emit(OPCODES[op]) - tail = [] - tailappend = tail.append - for av in av[1]: - skip = _len(code); emit(0) - # _compile_info(code, av, flags) - _compile(code, av, flags) - emit(OPCODES[JUMP]) - tailappend(_len(code)); emit(0) - code[skip] = _len(code) - skip - emit(0) # end of branch - for tail in tail: - code[tail] = _len(code) - tail - elif op is CATEGORY: - emit(OPCODES[op]) - if flags & SRE_FLAG_LOCALE: - av = CH_LOCALE[av] - elif flags & SRE_FLAG_UNICODE: - av = CH_UNICODE[av] - emit(CHCODES[av]) - elif op is GROUPREF: - if flags & SRE_FLAG_IGNORECASE: - emit(OPCODES[OP_IGNORE[op]]) - else: - emit(OPCODES[op]) - emit(av-1) - elif op is GROUPREF_EXISTS: - emit(OPCODES[op]) - emit(av[0]-1) - skipyes = _len(code); emit(0) - _compile(code, av[1], flags) - if av[2]: - emit(OPCODES[JUMP]) - skipno = _len(code); emit(0) - code[skipyes] = _len(code) - skipyes + 1 - _compile(code, av[2], flags) - code[skipno] = _len(code) - skipno - else: - code[skipyes] = _len(code) - skipyes + 1 else: - raise ValueError, ("unsupported operand type", op) - -def _compile_charset(charset, flags, code, fixup=None): - # compile charset subprogram - emit = code.append - if fixup is None: - fixup = _identityfunction - for op, av in _optimize_charset(charset, fixup): - emit(OPCODES[op]) - if op is NEGATE: - pass - elif op is LITERAL: - emit(fixup(av)) - elif op is RANGE: - emit(fixup(av[0])) - emit(fixup(av[1])) - elif op is CHARSET: - code.extend(av) - elif op is BIGCHARSET: - code.extend(av) - elif op is CATEGORY: - if flags & SRE_FLAG_LOCALE: - emit(CHCODES[CH_LOCALE[av]]) - elif flags & SRE_FLAG_UNICODE: - emit(CHCODES[CH_UNICODE[av]]) - else: - emit(CHCODES[av]) - else: - raise 
error, "internal: unsupported set operator" - emit(OPCODES[FAILURE]) - -def _optimize_charset(charset, fixup): - # internal: optimize character set - out = [] - outappend = out.append - charmap = [0]*256 - try: - for op, av in charset: - if op is NEGATE: - outappend((op, av)) - elif op is LITERAL: - charmap[fixup(av)] = 1 - elif op is RANGE: - for i in range(fixup(av[0]), fixup(av[1])+1): - charmap[i] = 1 - elif op is CATEGORY: - # XXX: could append to charmap tail - return charset # cannot compress - except IndexError: - # character set contains unicode characters - return _optimize_unicode(charset, fixup) - # compress character map - i = p = n = 0 - runs = [] - runsappend = runs.append - for c in charmap: - if c: - if n == 0: - p = i - n = n + 1 - elif n: - runsappend((p, n)) - n = 0 - i = i + 1 - if n: - runsappend((p, n)) - if len(runs) <= 2: - # use literal/range - for p, n in runs: - if n == 1: - outappend((LITERAL, p)) - else: - outappend((RANGE, (p, p+n-1))) - if len(out) < len(charset): - return out - else: - # use bitmap - data = _mk_bitmap(charmap) - outappend((CHARSET, data)) - return out - return charset - -def _mk_bitmap(bits): - data = [] - dataappend = data.append - if _sre.CODESIZE == 2: - start = (1, 0) - else: - start = (1L, 0L) - m, v = start - for c in bits: - if c: - v = v + m - m = m + m - if m > MAXCODE: - dataappend(v) - m, v = start - return data - -# To represent a big charset, first a bitmap of all characters in the + raise error("internal: unsupported set member: %s" % op) + code[skip_set] = len(code) - skip_set + +# The characters may be mapped to a bitmap. + +# To represent a charset, first a bitmap of all characters in the # set is constructed. Then, this bitmap is sliced into chunks of 256 # characters, duplicate chunks are eliminated, and each chunk is # given a number. 
In the compiled expression, the charset is -# represented by a 16-bit word sequence, consisting of one word for -# the number of different chunks, a sequence of 256 bytes (128 words) -# of chunk numbers indexed by their original chunk position, and a -# sequence of chunks (16 words each). +# represented by a codeword sequence, consisting of one codeword for +# the maximum character code, a sequence of chunk numbers +# (2 per codeword), and a sequence of chunks (8 codewords each). # Compression is normally good: in a typical charset, large ranges of # Unicode will be either completely excluded (e.g. if only cyrillic @@ -287,217 +298,148 @@ # subranges of Kanji match). These ranges will be represented by # chunks of all one-bits or all zero-bits. -# Matching can be also done efficiently: the more significant byte of +# Matching can be also done efficiently: the most significant bits of # the Unicode character is an index into the chunk number, and the -# less significant byte is a bit index in the chunk (just like the -# CHARSET matching). - -# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets -# of the basic multilingual plane; an efficient representation -# for all of UTF-16 has not yet been developed. This means, -# in particular, that negated charsets cannot be represented as -# bigcharsets. 
- -def _optimize_unicode(charset, fixup): - try: - import array - except ImportError: - return charset - charmap = [0]*65536 - negate = 0 - try: - for op, av in charset: - if op is NEGATE: - negate = 1 - elif op is LITERAL: - charmap[fixup(av)] = 1 - elif op is RANGE: - for i in xrange(fixup(av[0]), fixup(av[1])+1): - charmap[i] = 1 - elif op is CATEGORY: - # XXX: could expand category - return charset # cannot compress - except IndexError: - # non-BMP characters - return charset - if negate: - if sys.maxunicode != 65535: - # XXX: negation does not work with big charsets - return charset - for i in xrange(65536): - charmap[i] = not charmap[i] - comps = {} - mapping = [0]*256 - block = 0 - data = [] - for i in xrange(256): - chunk = tuple(charmap[i*256:(i+1)*256]) - new = comps.setdefault(chunk, block) - mapping[i] = new - if new == block: - block = block + 1 - data = data + _mk_bitmap(chunk) - header = [block] - if _sre.CODESIZE == 2: - code = 'H' +# least significant byte is a bit index into the chunk. 
+ +# A charset is a 3-tuple, consisting of the maximum character code, +# a list of indexes and a list of 256-bit bitsets +def _compile_charset(code, charset): + # the maximum character code + code.append(charset[0]) + # pack the 16-bit indexes into 32-bit codewords + # (adding an extra index ensures that zip() doesn't drop + # the last one if there are an odd number of them) + for lo, hi in zip(charset[1][0 : : 2], charset[1][1 : : 2] + [0]): + code.append(lo | (hi << 16)) + # pack the 256-bit bitsets to 32-bit codewords + for chunk in charset[2]: + for i in range(256 // BITS_PER_CODE): + code.append(chunk & MAXCODE) + chunk >>= BITS_PER_CODE + +def _ones(n): + return (1 << n) - 1 + +def _optimize_set(set_op, set_members, flags): + # consolidate the ranges (the bounds are inclusive) + charset = set() + categories = [] + for o, a in set_members: + if o == OP.LITERAL: + charset.add(a) + elif o == OP.RANGE: + for c in xrange(a[0], a[1] + 1): + charset.add(c) + else: + categories.append((o, a)) + categories = sorted(set(categories)) + # convert charset to list of ranges + ranges = [] + start, end = None, None + for c in sorted(charset): + try: + if c == end + 1: + end = c + else: + ranges.append((start, end)) + start, end = c, c + except TypeError: + start, end = c, c + if start is not None: + ranges.append((start, end)) + # try to optimise the set + if len(ranges) <= 1: + # only a few ranges + for r in ranges: + if r[0] == r[1]: + # a range of 1 character! 
+ categories.append((OP.LITERAL, r[0])) + else: + categories.append((OP.RANGE, r)) else: - code = 'I' - # Convert block indices to byte array of 256 bytes - mapping = array.array('b', mapping).tostring() - # Convert byte array to word array - mapping = array.array(code, mapping) - assert mapping.itemsize == _sre.CODESIZE - header = header + mapping.tolist() - data[0:0] = header - return [(BIGCHARSET, data)] - -def _simple(av): - # check if av is a "simple" operator - lo, hi = av[2].getwidth() - if lo == 0 and hi == MAXREPEAT: - raise error, "nothing to repeat" - return lo == hi == 1 and av[2][0][0] != SUBPATTERN - -def _compile_info(code, pattern, flags): - # internal: compile an info block. in the current version, - # this contains min/max pattern width, and an optional literal - # prefix or a character map - lo, hi = pattern.getwidth() - if lo == 0: - return # not worth it - # look for a literal prefix - prefix = [] - prefixappend = prefix.append - prefix_skip = 0 - charset = [] # not used - charsetappend = charset.append - if not (flags & SRE_FLAG_IGNORECASE): - # look for literal prefix - for op, av in pattern.data: - if op is LITERAL: - if len(prefix) == prefix_skip: - prefix_skip = prefix_skip + 1 - prefixappend(av) - elif op is SUBPATTERN and len(av[1]) == 1: - op, av = av[1][0] - if op is LITERAL: - prefixappend(av) - else: - break - else: - break - # if no prefix, look for charset prefix - if not prefix and pattern.data: - op, av = pattern.data[0] - if op is SUBPATTERN and av[1]: - op, av = av[1][0] - if op is LITERAL: - charsetappend((op, av)) - elif op is BRANCH: - c = [] - cappend = c.append - for p in av[1]: - if not p: - break - op, av = p[0] - if op is LITERAL: - cappend((op, av)) - else: - break - else: - charset = c - elif op is BRANCH: - c = [] - cappend = c.append - for p in av[1]: - if not p: - break - op, av = p[0] - if op is LITERAL: - cappend((op, av)) - else: - break - else: - charset = c - elif op is IN: - charset = av -## if prefix: -## 
print "*** PREFIX", prefix, prefix_skip -## if charset: -## print "*** CHARSET", charset - # add an info block - emit = code.append - emit(OPCODES[INFO]) - skip = len(code); emit(0) - # literal flag - mask = 0 - if prefix: - mask = SRE_INFO_PREFIX - if len(prefix) == prefix_skip == len(pattern.data): - mask = mask + SRE_INFO_LITERAL - elif charset: - mask = mask + SRE_INFO_CHARSET - emit(mask) - # pattern length - if lo < MAXCODE: - emit(lo) - else: - emit(MAXCODE) - prefix = prefix[:MAXCODE] - if hi < MAXCODE: - emit(hi) - else: - emit(0) - # add literal prefix - if prefix: - emit(len(prefix)) # length - emit(prefix_skip) # skip - code.extend(prefix) - # generate overlap table - table = [-1] + ([0]*len(prefix)) - for i in xrange(len(prefix)): - table[i+1] = table[i]+1 - while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]: - table[i+1] = table[table[i+1]-1]+1 - code.extend(table[1:]) # don't store first entry - elif charset: - _compile_charset(charset, flags, code) - code[skip] = len(code) - skip - -try: - unicode -except NameError: - STRING_TYPES = (type(""),) -else: - STRING_TYPES = (type(""), type(unicode(""))) - -def isstring(obj): - for tp in STRING_TYPES: - if isinstance(obj, tp): - return 1 - return 0 + # many ranges, so use a charset instead + max_char = ranges[-1][1] + subset_list = [0] * (max_char // 256 + 1) + for lo, hi in ranges: + base = lo - lo % 256 + while lo <= hi: + subset_list[base // 256] |= (_ones(min(hi - base + 1, 256)) ^ + _ones(lo % 256)) + base += 256 + lo = base + # build the index and chunks, consolidating duplicate subsets/chunks + index_list, chunk_list = [], [] + for subset in subset_list: + try: + index_list.append(chunk_list.index(subset)) + except ValueError: + index_list.append(len(chunk_list)) + chunk_list.append(subset) + categories.append((OP.CHARSET, (max_char, index_list, chunk_list))) + if len(categories) == 1: + # only 1 test in the set, so don't use a set + cat = categories[0] + if set_op.startswith("NOT_"): + cat 
= not_op(cat) + if set_op.endswith("_IGNORE"): + cat = ignore_op(cat) + return cat + return set_op, categories + +def create_charset(iterable): + # (UNUSED) + # enumerate the characters and create the subsets + subset_list = [] + max_code = 0 + for ch in iterable: + ch = ord(ch) + max_code = max(max_code, ch) + hi, lo = divmod(ch, 256) + mask = 1 << lo + try: + subset_list[hi] |= mask + except IndexError: + subset_list.extend([0] * (hi - len(subset_list))) + subset_list.append(mask) + # optimise the subsets + index_list, chunk_list = [], [] + for subset in subset_list: + try: + index_list.append(chunk_list.index(subset)) + except ValueError: + index_list.append(len(chunk_list)) + chunk_list.append(subset) + return max_code, index_list, chunk_list def _code(p, flags): - flags = p.pattern.flags | flags code = [] # compile info block - _compile_info(code, p, flags) + #_compile_info(code, p, flags) # compile the pattern - _compile(code, p.data, flags) - - code.append(OPCODES[SUCCESS]) + class Record(object): + pass + info = Record() + info.group_count = 0 + if flags & SRE_FLAG_REVERSE: + dir = -1 + else: + dir = 1 + _compile(code, p.data, flags, info, dir) + code.append(OPCODES[OP.SUCCESS]) return code -def compile(p, flags=0): +def compile(p, flags=0, scanner=0): # internal: convert pattern list to internal format - if isstring(p): + if isinstance(p, basestring): + import sre_parse pattern = p - p = sre_parse.parse(p, flags) + p = sre_parse.parse(p, flags, scanner=scanner) else: pattern = None @@ -505,20 +447,12 @@ # print code - # XXX: get rid of this limitation! 
- if p.pattern.groups > 100: - raise AssertionError( - "sorry, but this version only supports 100 named groups" - ) - # map in either direction - groupindex = p.pattern.groupdict - indexgroup = [None] * p.pattern.groups - for k, i in groupindex.items(): - indexgroup[i] = k - - return _sre.compile( - pattern, flags | p.pattern.flags, code, - p.pattern.groups-1, - groupindex, indexgroup - ) + groupindex = p.pattern.named_groups + indexgroup = [None] * (max(groupindex.values() + [-1]) + 1) + + for name, index in groupindex.items(): + indexgroup[index] = name + + return _sre.compile(pattern, flags | p.pattern.flags, code, + p.pattern.groups, groupindex, indexgroup) === modified file Lib/sre_parse.py --- Lib/sre_parse.py 2008-10-14 22:37:18 +0000 +++ Lib/sre_parse.py 2009-03-05 03:00:47 +0000 @@ -15,75 +15,68 @@ import sys from sre_constants import * - -SPECIAL_CHARS = ".\\[{()*+?^$|" -REPEAT_CHARS = "*+?{" - -DIGITS = set("0123456789") - -OCTDIGITS = set("01234567") -HEXDIGITS = set("0123456789abcdefABCDEF") - -WHITESPACE = set(" \t\n\r\v\f") +import unicodedata + +SPECIAL_CHARS = set(".\\[{()*+?^$|") +REPEAT_CHARS = set("*+?{") +WHITESPACE_CHARS = set(" \t\n\r\v\f") ESCAPES = { - r"\a": (LITERAL, ord("\a")), - r"\b": (LITERAL, ord("\b")), - r"\f": (LITERAL, ord("\f")), - r"\n": (LITERAL, ord("\n")), - r"\r": (LITERAL, ord("\r")), - r"\t": (LITERAL, ord("\t")), - r"\v": (LITERAL, ord("\v")), - r"\\": (LITERAL, ord("\\")) + r"\a": (OP.LITERAL, ord("\a")), + r"\b": (OP.LITERAL, ord("\b")), + r"\f": (OP.LITERAL, ord("\f")), + r"\n": (OP.LITERAL, ord("\n")), + r"\r": (OP.LITERAL, ord("\r")), + r"\t": (OP.LITERAL, ord("\t")), + r"\v": (OP.LITERAL, ord("\v")), + r"\\": (OP.LITERAL, ord("\\")), } -CATEGORIES = { - r"\A": (AT, AT_BEGINNING_STRING), # start of string - r"\b": (AT, AT_BOUNDARY), - r"\B": (AT, AT_NON_BOUNDARY), - r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), - r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), - r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), - r"\S": (IN, 
[(CATEGORY, CATEGORY_NOT_SPACE)]), - r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), - r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), - r"\Z": (AT, AT_END_STRING), # end of string +POSITIONS = { + r"\A": (OP.START_OF_STRING, None), + r"\b": (OP.BOUNDARY, None), + r"\B": (OP.NOT_BOUNDARY, None), + r"\G": (OP.START_OF_SEARCH, None), + r"\Z": (OP.END_OF_STRING, None), } +STD_CATEGORIES = { + r"\d": (OP.CATEGORY, CATEGORIES["digit"]), + r"\D": (OP.NOT_CATEGORY, CATEGORIES["digit"]), + r"\s": (OP.CATEGORY, CATEGORIES["space"]), + r"\S": (OP.NOT_CATEGORY, CATEGORIES["space"]), + r"\w": (OP.CATEGORY, CATEGORIES["word"]), + r"\W": (OP.NOT_CATEGORY, CATEGORIES["word"]), +} + FLAGS = { - # standard flags "i": SRE_FLAG_IGNORECASE, "L": SRE_FLAG_LOCALE, "m": SRE_FLAG_MULTILINE, + "r": SRE_FLAG_REVERSE, "s": SRE_FLAG_DOTALL, "x": SRE_FLAG_VERBOSE, - # extensions "t": SRE_FLAG_TEMPLATE, "u": SRE_FLAG_UNICODE, + "z": SRE_FLAG_ZEROWIDTH, } +SCOPED_FLAGS_MASK = (SRE_FLAG_IGNORECASE | SRE_FLAG_MULTILINE | + SRE_FLAG_DOTALL | SRE_FLAG_VERBOSE) + class Pattern: - # master pattern object. keeps track of global attributes + # master pattern object. 
keeps track of global attributes def __init__(self): self.flags = 0 - self.open = [] - self.groups = 1 - self.groupdict = {} - def opengroup(self, name=None): - gid = self.groups - self.groups = gid + 1 + self.groups = 0 + self.named_groups = {} + self.fix_list = [] + def new_group(self, name=None): + self.groups += 1 + group_number = self.groups if name is not None: - ogid = self.groupdict.get(name, None) - if ogid is not None: - raise error, ("redefinition of group name %s as group %d; " - "was group %d" % (repr(name), gid, ogid)) - self.groupdict[name] = gid - self.open.append(gid) - return gid - def closegroup(self, gid): - self.open.remove(gid) - def checkgroup(self, gid): - return gid < self.groups and gid not in self.open + self.named_groups.setdefault(name, len(self.named_groups)) + return group_number, name class SubPattern: # a subpattern, in intermediate form @@ -93,33 +86,37 @@ data = [] self.data = data self.width = None + self._inv_categories = dict((value, name) for name, value in + CATEGORIES.items()) def dump(self, level=0): - nl = 1 - seqtypes = type(()), type([]) + nl = True + seqtypes = tuple, list for op, av in self.data: - print level*" " + op,; nl = 0 - if op == "in": + print level * " " + op,; nl = False + if "SET" in op: # member sublanguage - print; nl = 1 + print; nl = True for op, a in av: - print (level+1)*" " + op, a - elif op == "branch": - print; nl = 1 + print (level + 1) * " " + op, a + elif op == OP.BRANCH: + print; nl = True i = 0 for a in av[1]: if i > 0: - print level*" " + "or" - a.dump(level+1); nl = 1 - i = i + 1 - elif type(av) in seqtypes: + print level * " " + "or" + a.dump(level + 1); nl = True + i += 1 + elif isinstance(av, seqtypes): for a in av: if isinstance(a, SubPattern): if not nl: print - a.dump(level+1); nl = 1 + a.dump(level + 1); nl = True else: - print a, ; nl = 0 - else: - print av, ; nl = 0 + print a, ; nl = False + elif "CATEGORY" in op: + print self._inv_categories[av], ; nl = False + else: + print av, ; 
nl = False if not nl: print def __repr__(self): return repr(self.data) @@ -137,363 +134,465 @@ self.data.insert(index, code) def append(self, code): self.data.append(code) - def getwidth(self): - # determine the width (min, max) for this subpattern - if self.width: - return self.width - lo = hi = 0L - UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY) - REPEATCODES = (MIN_REPEAT, MAX_REPEAT) - for op, av in self.data: - if op is BRANCH: - i = sys.maxint - j = 0 - for av in av[1]: - l, h = av.getwidth() - i = min(i, l) - j = max(j, h) - lo = lo + i - hi = hi + j - elif op is CALL: - i, j = av.getwidth() - lo = lo + i - hi = hi + j - elif op is SUBPATTERN: - i, j = av[1].getwidth() - lo = lo + i - hi = hi + j - elif op in REPEATCODES: - i, j = av[2].getwidth() - lo = lo + long(i) * av[0] - hi = hi + long(j) * av[1] - elif op in UNITCODES: - lo = lo + 1 - hi = hi + 1 - elif op == SUCCESS: - break - self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint)) - return self.width + def extend(self, code): + self.data.extend(code) class Tokenizer: def __init__(self, string): self.string = string self.index = 0 - self.__next() - def __next(self): - if self.index >= len(self.string): + self._next() + def _next(self): + try: + char = self.string[self.index] + if char == "\\": + try: + char += self.string[self.index + 1] + except IndexError: + raise error("bad escape (end of line)") + self.index += len(char) + self.next = char + except IndexError: self.next = None - return - char = self.string[self.index] - if char[0] == "\\": - try: - c = self.string[self.index + 1] - except IndexError: - raise error, "bogus escape (end of line)" - char = char + c - self.index = self.index + len(char) - self.next = char - def match(self, char, skip=1): - if char == self.next: - if skip: - self.__next() - return 1 - return 0 + def match(self, char, skip=True): + if char != self.next: + return False + if skip: + self._next() + return True def get(self): this = self.next - 
self.__next() + self._next() return this def tell(self): return self.index, self.next def seek(self, index): self.index, self.next = index -def isident(char): - return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_" - -def isdigit(char): - return "0" <= char <= "9" - -def isname(name): +def is_name(name): # check that group name is a valid string - if not isident(name[0]): - return False - for char in name[1:]: - if not isident(char) and not isdigit(char): - return False - return True - -def _class_escape(source, escape): + return (name[0] == "_" or name[0].isalpha()) and all(char == "_" or + char.isalnum() for char in name[1 : ]) + +# names can be delimited in a number of ways +NAME_DELIMITERS = {"<": ">", "{": "}"} + +def hex_escape(source, escape, max_digits): + # hexadecimal escape + digits = "" + while source.next in HEXDIGITS and len(digits) < max_digits: + digits += source.get() + if len(digits) != max_digits: + raise error("bad escape: %s" % (escape + digits)) + return int(digits, 16) + +def oct_escape(source, escape, digits): + # octal escape + while source.next in OCTDIGITS and len(digits) < 3: + digits += source.get() + try: + return int(digits, 8) & 0xFF + except ValueError: + raise error("bad escape: %s" % (escape + digits)) + +def parse_name(source, terminator, name_type, prefix): + name = "" + while True: + char = source.get() + if char is None: + raise error("unterminated %s name: %s" % (name_type, prefix)) + if char == terminator: + break + name += char + return name + +HEX_ESCAPE_LENGTH = {"x": 2, "u": 4, "U": 8} + +def class_escape(source, escape): # handle escape code inside character class - code = ESCAPES.get(escape) - if code: - return code - code = CATEGORIES.get(escape) + code = STD_CATEGORIES.get(escape) or ESCAPES.get(escape) if code: return code try: - c = escape[1:2] - if c == "x": - # hexadecimal escape (exactly two digits) - while source.next in HEXDIGITS and len(escape) < 4: - escape = escape + source.get() - escape = 
escape[2:] - if len(escape) != 2: - raise error, "bogus escape: %s" % repr("\\" + escape) - return LITERAL, int(escape, 16) & 0xff + c = escape[1 : 2] + if c in HEX_ESCAPE_LENGTH: + # hex escape + return OP.LITERAL, hex_escape(source, escape, HEX_ESCAPE_LENGTH[c]) elif c in OCTDIGITS: - # octal escape (up to three digits) - while source.next in OCTDIGITS and len(escape) < 4: - escape = escape + source.get() - escape = escape[1:] - return LITERAL, int(escape, 8) & 0xff + # octal escape + return OP.LITERAL, oct_escape(source, escape[ : 1], c) elif c in DIGITS: - raise error, "bogus escape: %s" % repr(escape) - if len(escape) == 2: - return LITERAL, ord(escape[1]) + raise error("bad escape: %s" % escape) + elif c == "N": + # named character + if source.next not in NAME_DELIMITERS: + raise error("missing character name: %s" % escape) + delimiter = source.get() + name = parse_name(source, NAME_DELIMITERS[delimiter], "character", + escape + delimiter) + try: + return OP.LITERAL, ord(unicodedata.lookup(name)) + except KeyError: + raise error("bad character name: %s" % name) + elif c == "p": + # character property + if source.next not in NAME_DELIMITERS: + raise error("missing property name: %s" % escape) + delimiter = source.get() + name = parse_name(source, NAME_DELIMITERS[delimiter], "property", + escape + delimiter) + try: + return OP.CATEGORY, CATEGORIES[name.lower()] + except KeyError: + raise error("bad property name: %s" % name) + else: + return OP.LITERAL, ord(c) except ValueError: pass - raise error, "bogus escape: %s" % repr(escape) - -def _escape(source, escape, state): + raise error("bad escape: %s" % escape) + +def posix_class(source): + if not source.match(":"): + return None + name = parse_name(source, ":", "class", ":") + try: + cat = CATEGORIES[name.lower()] + if cat < COMMON_CATEGORY_START: + raise error("bad class name: %s" % name) + except KeyError: + raise error("bad class name: %s" % name) + if not source.match(":") or not source.match("]"): + raise 
error("unterminated class name: %s" % name) + return OP.CATEGORY, cat + +# group references can be delimited in a number of ways +GROUP_DELIMITERS = {"<": ">", "{": "}", "'": "'", '"': '"'} + +# group references can be relative +GROUP_DIRECTION = {"+": 1, "-": -1} + +def escape(source, escape, state): # handle escape code in expression - code = CATEGORIES.get(escape) + # group references returned as list instead of tuple so that they can be + # fixed later + code = POSITIONS.get(escape) or STD_CATEGORIES.get(escape) or ESCAPES.get(escape) if code: return code - code = ESCAPES.get(escape) - if code: - return code + if state.flags & SRE_FLAG_IGNORECASE: + literal_op, groupref_op = OP.LITERAL_IGNORE, OP.GROUPREF_IGNORE + else: + literal_op, groupref_op = OP.LITERAL, OP.GROUPREF try: - c = escape[1:2] - if c == "x": - # hexadecimal escape - while source.next in HEXDIGITS and len(escape) < 4: - escape = escape + source.get() - if len(escape) != 4: - raise ValueError - return LITERAL, int(escape[2:], 16) & 0xff + c = escape[1 : 2] + if c in HEX_ESCAPE_LENGTH: + # hex escape + return literal_op, hex_escape(source, escape, HEX_ESCAPE_LENGTH[c]) elif c == "0": # octal escape - while source.next in OCTDIGITS and len(escape) < 4: - escape = escape + source.get() - return LITERAL, int(escape[1:], 8) & 0xff + return literal_op, oct_escape(source, escape[ : 1], c) elif c in DIGITS: # octal escape *or* decimal group reference (sigh) if source.next in DIGITS: - escape = escape + source.get() - if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and - source.next in OCTDIGITS): + escape += source.get() + if set(escape[1 : ]) <= OCTDIGITS and source.next in OCTDIGITS: # got three octal digits; this is an octal escape - escape = escape + source.get() - return LITERAL, int(escape[1:], 8) & 0xff + escape += source.get() + return literal_op, int(escape[1 : ], 8) & 0xFF # not an octal escape, so this is a group reference - group = int(escape[1:]) - if group < state.groups: - if not 
state.checkgroup(group): - raise error, "cannot refer to open group" - return GROUPREF, group - raise ValueError - if len(escape) == 2: - return LITERAL, ord(escape[1]) + ref = [groupref_op, escape[1 : ]] + state.fix_list.append(ref) + return ref + elif c == "g": + # group reference + if source.next in GROUP_DELIMITERS: + # delimited group reference + delimiter = source.get() + name = parse_name(source, GROUP_DELIMITERS[delimiter], "group", + escape + delimiter) + if name[0] in GROUP_DIRECTION and name[1 : ].isdigit(): + # relative group reference, so convert to absolute + name = str(state.groups + GROUP_DIRECTION[name[0]] * + int(name[1 : ])) + if not name.isdigit() and not is_name(name): + raise error("bad group name: %s" % name) + # return the group reference + ref = [groupref_op, name] + state.fix_list.append(ref) + return ref + elif source.next in DIGITS: + # non-delimited group reference (single digit) + ref = [groupref_op, source.get()] + state.fix_list.append(ref) + return ref + else: + raise error("missing group name: %s" % escape) + elif c == "k": + # named group reference + if source.next in GROUP_DELIMITERS: + # delimited group reference + delimiter = source.get() + name = parse_name(source, GROUP_DELIMITERS[delimiter], "group", + escape + delimiter) + if not is_name(name): + raise error("bad group name: %s" % name) + ref = [groupref_op, name] + state.fix_list.append(ref) + return ref + else: + # non-delimited group reference; invalid for \k + raise error("missing group name: %s" % escape) + elif c == "N": + # named character + if source.next not in NAME_DELIMITERS: + raise error("missing character name: %s" % escape) + delimiter = source.get() + name = parse_name(source, NAME_DELIMITERS[delimiter], "character", + escape + delimiter) + try: + return literal_op, ord(unicodedata.lookup(name)) + except KeyError: + raise error("bad character name: %s" % name) + elif c in "pP": + # character property + if source.next not in NAME_DELIMITERS: + raise 
error("missing property name: %s" % escape) + delimiter = source.get() + name = parse_name(source, NAME_DELIMITERS[delimiter], "property", + escape + delimiter) + try: + op = OP.CATEGORY, CATEGORIES[name.lower()] + if c == "P": + op = not_op(op) + return op + except KeyError: + raise error("bad property name: %s" % name) + else: + return literal_op, ord(c) except ValueError: pass - raise error, "bogus escape: %s" % repr(escape) - -def _parse_sub(source, state, nested=1): + raise error("bad escape: %s" % escape) + +def _parse_sub(source, state, named_groups, reuse): # parse an alternation: a|b|c + # + # group names can be duplicated if they are mutually exclusive + # + # group numbers can be duplicated in mutually-exclusive branches if 'reuse' + # is True + initial_groups = state.groups + max_groups = state.groups + named_groups_out = named_groups.copy() items = [] - itemsappend = items.append - sourcematch = source.match - while 1: - itemsappend(_parse(source, state)) - if sourcematch("|"): - continue - if not nested: + while True: + # parse the branch + # returns the parsed items and the set of named groups + i, n = _parse(source, state, named_groups) + items.append(i) + named_groups_out |= n + max_groups = max(max_groups, state.groups) + if not source.match("|"): break - if not source.next or sourcematch(")", 0): - break - else: - raise error, "pattern not properly closed" + + # do we want to reuse group numbers, ie start all the branches at the + # same group number? 
+ if reuse: + state.groups = initial_groups + + # the next group number should be higher than all previous ones + state.groups = max_groups if len(items) == 1: - return items[0] + return items[0], named_groups_out subpattern = SubPattern(state) - subpatternappend = subpattern.append - - # check if all items share a common prefix - while 1: - prefix = None - for item in items: - if not item: + + # check whether all branches share a common prefix + # (the prefix shouldn't contain a capture group) + index = 0 + try: + while all(items[0][index] == item[index] for item in items[1 : ]): + if is_capture(items[0][index]): break - if prefix is None: - prefix = item[0] - elif item[0] != prefix: - break - else: - # all subitems start with a common "prefix". - # move it out of the branch - for item in items: - del item[0] - subpatternappend(prefix) - continue # check next one - break - - # check if the branch can be replaced by a character set - for item in items: - if len(item) != 1 or item[0][0] != LITERAL: - break + index += 1 + except IndexError: + pass + + if index > 0: + subpattern.extend(items[0][ : index]) + items = [item[index : ] for item in items] + + # check whether the alternation can be replaced by a character set + if all(len(item) == 1 and item[0][0] == OP.LITERAL for item in items): + # we can store this as a set instead of a + # branch (the compiler may optimize this even more) + subpattern.append((OP.SET, [item[0] for item in items])) else: - # we can store this as a character set instead of a - # branch (the compiler may optimize this even more) - set = [] - setappend = set.append - for item in items: - setappend(item[0]) - subpatternappend((IN, set)) - return subpattern - - subpattern.append((BRANCH, (None, items))) - return subpattern - -def _parse_sub_cond(source, state, condgroup): - item_yes = _parse(source, state) + subpattern.append((OP.BRANCH, (None, items))) + + return subpattern, named_groups_out + +def is_capture(pattern): + if not pattern: + 
return False + o, a = pattern + if o in [OP.ASSERT, OP.ASSERT_NOT, OP.ATOMIC]: + return has_capture(a[1]) + elif o == OP.BRANCH: + return any(has_capture(i) for i in a[1]) + elif o == OP.GROUPREF_EXISTS: + return any(has_capture(i) for i in a[1 : 3]) + elif o in [OP.REPEAT_MAX, OP.REPEAT_MIN, OP.REPEAT_POSS]: + return has_capture(a[2]) + elif o in [OP.REPEAT_ONE_MAX, OP.REPEAT_ONE_MIN, OP.REPEAT_ONE_POSS]: + return is_capture(a[2]) + elif o == OP.SUBPATTERN: + return a[0] is not None or has_capture(a[1]) + else: + return False + +def has_capture(pattern): + if not pattern: + return False + return any(i for i in pattern) + +def _parse_sub_cond(source, state, named_groups, condgroup): + item_yes, n_yes = _parse(source, state, named_groups) if source.match("|"): - item_no = _parse(source, state) + item_no, n_no = _parse(source, state, named_groups) if source.match("|"): - raise error, "conditional backref with more than two branches" + raise error("conditional reference with more than two branches") else: - item_no = None - if source.next and not source.match(")", 0): - raise error, "pattern not properly closed" + item_no, n_no = None, named_groups + if source.next and not source.match(")", False): + raise error("pattern not properly closed") subpattern = SubPattern(state) - subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) - return subpattern - -_PATTERNENDERS = set("|)") -_ASSERTCHARS = set("=!<") -_LOOKBEHINDASSERTCHARS = set("=!") -_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT]) - -def _parse(source, state): + ref = (OP.GROUPREF_EXISTS, [condgroup, item_yes, item_no]) + state.fix_list.append(ref) + subpattern.append(ref) + return subpattern, n_yes | n_no + +PATTERN_ENDERS = set("|)") +ASSERT_CHARS = set("=!<") +LOOKBEHIND_ASSERT_CHARS = set("=!") +POSITION_CODES = set([OP.BOUNDARY, OP.END_OF_LINE, OP.END_OF_STRING, + OP.END_OF_STRING_LN, OP.NOT_BOUNDARY, OP.START_OF_LINE, OP.START_OF_SEARCH, + OP.START_OF_STRING]) +QUERY_GROUP = 0 +CAPTURE_GROUP = 
1 +NONCAPTURE_GROUP = 2 +ATOMIC_GROUP = 3 + +def _parse(source, state, named_groups): # parse a simple pattern subpattern = SubPattern(state) - - # precompute constants into local variables - subpatternappend = subpattern.append - sourceget = source.get - sourcematch = source.match - _len = len - PATTERNENDERS = _PATTERNENDERS - ASSERTCHARS = _ASSERTCHARS - LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS - REPEATCODES = _REPEATCODES - - while 1: - - if source.next in PATTERNENDERS: + named_groups = named_groups.copy() + + while True: + if state.flags & SRE_FLAG_VERBOSE: + # skip whitespace and comments + while source.next in WHITESPACE_CHARS: + source.get() + if source.next == "#": + while source.next not in (None, "\n"): + source.get() + source.get() + continue + + if source.next in PATTERN_ENDERS: break # end of subpattern - this = sourceget() + + this = source.get() + if this is None: break # end of pattern - if state.flags & SRE_FLAG_VERBOSE: - # skip whitespace and comments - if this in WHITESPACE: - continue - if this == "#": - while 1: - this = sourceget() - if this in (None, "\n"): - break - continue - - if this and this[0] not in SPECIAL_CHARS: - subpatternappend((LITERAL, ord(this))) - + if this[0] not in SPECIAL_CHARS: + if state.flags & SRE_FLAG_IGNORECASE: + subpattern.append((OP.LITERAL_IGNORE, ord(this))) + else: + subpattern.append((OP.LITERAL, ord(this))) elif this == "[": # character set - set = [] - setappend = set.append -## if sourcematch(":"): -## pass # handle character classes - if sourcematch("^"): - setappend((NEGATE, None)) + char_set = [] + negate = source.match("^") # check remaining characters - start = set[:] - while 1: - this = sourceget() - if this == "]" and set != start: + while True: + this = source.get() + if not this: + raise error("unexpected end of pattern") + if this == "]" and char_set: + # terminating ] break - elif this and this[0] == "\\": - code1 = _class_escape(source, this) - elif this: - code1 = LITERAL, ord(this) - 
else: - raise error, "unexpected end of regular expression" - if sourcematch("-"): + elif this[0] == "\\": + code1 = class_escape(source, this) + elif this[0] == "[": + code1 = posix_class(source) + if not code1: + code1 = OP.LITERAL, ord(this) + else: + code1 = OP.LITERAL, ord(this) + if source.match("-"): # potential range - this = sourceget() + this = source.get() + if not this: + raise error("unexpected end of pattern") if this == "]": - if code1[0] is IN: - code1 = code1[1][0] - setappend(code1) - setappend((LITERAL, ord("-"))) + # at end of pattern, so literal char and "-" + char_set.append(code1) + char_set.append((OP.LITERAL, ord("-"))) break - elif this: - if this[0] == "\\": - code2 = _class_escape(source, this) - else: - code2 = LITERAL, ord(this) - if code1[0] != LITERAL or code2[0] != LITERAL: - raise error, "bad character range" - lo = code1[1] - hi = code2[1] - if hi < lo: - raise error, "bad character range" - setappend((RANGE, (lo, hi))) + if this[0] == "\\": + code2 = class_escape(source, this) + elif this[0] == "[": + code2 = posix_class(source) + if not code2: + code2 = OP.LITERAL, ord(this) else: - raise error, "unexpected end of regular expression" - else: - if code1[0] is IN: - code1 = code1[1][0] - setappend(code1) - - # XXX: should move set optimization to compiler! 
- if _len(set)==1 and set[0][0] is LITERAL: - subpatternappend(set[0]) # optimization - elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: - subpatternappend((NOT_LITERAL, set[1][1])) # optimization - else: - # XXX: should add charmap optimization here - subpatternappend((IN, set)) - - elif this and this[0] in REPEAT_CHARS: + code2 = OP.LITERAL, ord(this) + if code1[0] != OP.LITERAL or code2[0] != OP.LITERAL: + raise error("bad character range") + lo = code1[1] + hi = code2[1] + if hi < lo: + raise error("bad character range") + char_set.append((OP.RANGE, (lo, hi))) + else: + char_set.append(code1) + if negate: + if state.flags & SRE_FLAG_IGNORECASE: + subpattern.append((OP.NOT_SET_IGNORE, char_set)) + else: + subpattern.append((OP.NOT_SET, char_set)) + else: + if state.flags & SRE_FLAG_IGNORECASE: + subpattern.append((OP.SET_IGNORE, char_set)) + else: + subpattern.append((OP.SET, char_set)) + elif this[0] in REPEAT_CHARS: # repeat previous item if this == "?": min, max = 0, 1 elif this == "*": min, max = 0, MAXREPEAT - elif this == "+": min, max = 1, MAXREPEAT elif this == "{": if source.next == "}": - subpatternappend((LITERAL, ord(this))) + subpattern.append((OP.LITERAL, ord(this))) continue here = source.tell() min, max = 0, MAXREPEAT lo = hi = "" while source.next in DIGITS: - lo = lo + source.get() - if sourcematch(","): + lo += source.get() + if source.match(","): while source.next in DIGITS: - hi = hi + sourceget() + hi += source.get() else: hi = lo - if not sourcematch("}"): - subpatternappend((LITERAL, ord(this))) + if not source.match("}"): + subpattern.append((OP.LITERAL, ord(this))) source.seek(here) continue if lo: @@ -501,166 +600,245 @@ if hi: max = int(hi) if max < min: - raise error, "bad repeat interval" - else: - raise error, "not supported" + raise error("bad repeat interval") + else: + raise error("not supported") # figure out which item to repeat - if subpattern: - item = subpattern[-1:] - else: - item = None - if not item or 
(_len(item) == 1 and item[0][0] == AT): - raise error, "nothing to repeat" - if item[0][0] in REPEATCODES: - raise error, "multiple repeat" - if sourcematch("?"): - subpattern[-1] = (MIN_REPEAT, (min, max, item)) - else: - subpattern[-1] = (MAX_REPEAT, (min, max, item)) - + item = subpattern[-1 : ] + if not item or len(item) == 1 and item[0][0] in POSITION_CODES: + raise error("nothing to repeat") + if source.match("?"): + subpattern[-1] = (OP.REPEAT_MIN, (min, max, item)) + elif source.match("+"): + subpattern[-1] = (OP.REPEAT_POSS, (min, max, item)) + else: + subpattern[-1] = (OP.REPEAT_MAX, (min, max, item)) elif this == ".": - subpatternappend((ANY, None)) - + if state.flags & SRE_FLAG_DOTALL: + subpattern.append((OP.ANY_ALL, None)) + else: + subpattern.append((OP.ANY, None)) elif this == "(": - group = 1 + group = CAPTURE_GROUP name = None condgroup = None - if sourcematch("?"): - group = 0 + scoped_flags = None + reuse = False + if source.match("?"): + group = QUERY_GROUP # options - if sourcematch("P"): + if source.match("P"): # python extensions - if sourcematch("<"): - # named group: skip forward to end of name - name = "" - while 1: - char = sourceget() - if char is None: - raise error, "unterminated name" - if char == ">": - break - name = name + char - group = 1 - if not isname(name): - raise error, "bad character in group name" - elif sourcematch("="): - # named backreference - name = "" - while 1: - char = sourceget() - if char is None: - raise error, "unterminated name" - if char == ")": - break - name = name + char - if not isname(name): - raise error, "bad character in group name" - gid = state.groupdict.get(name) - if gid is None: - raise error, "unknown group name" - subpatternappend((GROUPREF, gid)) + if source.match("<"): + # named group + name = parse_name(source, ">", "group", "(?P<") + group = CAPTURE_GROUP + if not is_name(name): + raise error("bad group name: %s" % name) + if name in named_groups: + raise error("duplicate group name: %s" % 
name) + named_groups.add(name) + elif source.match("="): + # named group reference + # group reference stored as list instead of tuple so + # that it can be fixed later + name = parse_name(source, ")", "group", "(?P=") + if not is_name(name): + raise error("bad group name: %s" % name) + if state.flags & SRE_FLAG_IGNORECASE: + ref = [OP.GROUPREF_IGNORE, name] + else: + ref = [OP.GROUPREF, name] + state.fix_list.append(ref) + subpattern.append(ref) continue else: - char = sourceget() + char = source.get() if char is None: - raise error, "unexpected end of pattern" - raise error, "unknown specifier: ?P%s" % char - elif sourcematch(":"): - # non-capturing group - group = 2 - elif sourcematch("#"): + raise error("unexpected end of pattern") + raise error("unknown specifier: (?P%s" % char) + elif source.match("<"): + # named group or look-behind + if source.next in LOOKBEHIND_ASSERT_CHARS: + # lookbehind assertion + dir = -1 # lookbehind + char = source.get() + saved_flags = state.flags + p, named_groups = _parse_sub(source, state, named_groups, False) + state.flags = ((state.flags & ~SCOPED_FLAGS_MASK) | + (saved_flags & SCOPED_FLAGS_MASK)) + if not source.match(")"): + raise error("unbalanced parenthesis") + if char == "=": + subpattern.append((OP.ASSERT, (dir, p))) + else: + subpattern.append((OP.ASSERT_NOT, (dir, p))) + continue + # named group + name = parse_name(source, ">", "group", "(?<") + group = CAPTURE_GROUP + if not is_name(name): + raise error("bad group name: %s" % name) + if name in named_groups: + raise error("duplicate group name: %s" % name) + named_groups.add(name) + elif source.match(">"): + # atomic group + group = ATOMIC_GROUP + elif source.match("#"): # comment - while 1: - if source.next is None or source.next == ")": + while True: + if source.next in (None, ")"): break - sourceget() - if not sourcematch(")"): - raise error, "unbalanced parenthesis" + source.get() + if not source.match(")"): + raise error("unbalanced parenthesis") continue - elif 
source.next in ASSERTCHARS: + elif source.next in ASSERT_CHARS: # lookahead assertions - char = sourceget() + char = source.get() dir = 1 if char == "<": - if source.next not in LOOKBEHINDASSERTCHARS: - raise error, "syntax error" + if source.next not in LOOKBEHIND_ASSERT_CHARS: + raise error("syntax error: (?%s" % char) dir = -1 # lookbehind - char = sourceget() - p = _parse_sub(source, state) - if not sourcematch(")"): - raise error, "unbalanced parenthesis" + char = source.get() + saved_flags = state.flags + p, named_groups = _parse_sub(source, state, named_groups, False) + state.flags = ((state.flags & ~SCOPED_FLAGS_MASK) | + (saved_flags & SCOPED_FLAGS_MASK)) + if not source.match(")"): + raise error("unbalanced parenthesis") if char == "=": - subpatternappend((ASSERT, (dir, p))) + subpattern.append((OP.ASSERT, (dir, p))) else: - subpatternappend((ASSERT_NOT, (dir, p))) + subpattern.append((OP.ASSERT_NOT, (dir, p))) continue - elif sourcematch("("): + elif source.match("("): # conditional backreference group - condname = "" - while 1: - char = sourceget() - if char is None: - raise error, "unterminated name" - if char == ")": - break - condname = condname + char - group = 2 - if isname(condname): - condgroup = state.groupdict.get(condname) - if condgroup is None: - raise error, "unknown group name" + condgroup = parse_name(source, ")", "group", "(?(") + group = NONCAPTURE_GROUP + if not is_name(condgroup) and not condgroup.isdigit(): + raise error("bad group name: %s" % condgroup) + elif source.match("|"): + # reuse group numbers for multiple branches + group = NONCAPTURE_GROUP + reuse = True + else: + # probably non-capturing group or flags + # might be scoped (set at start of group and local to group) + scoped_flags = state.flags + seen_on, seen_off = False, False + while source.next in FLAGS: + scoped_flags |= FLAGS[source.get()] + seen_on = True + if source.match("-"): + while source.next in FLAGS: + if (FLAGS[source.next] & SCOPED_FLAGS_MASK) == 0: + 
raise error("bad pattern flag: %s" % + source.next) + scoped_flags &= ~FLAGS[source.get()] + seen_off = True + if not seen_off: + raise error("bad pattern flag") + # update just global flags + state.flags |= scoped_flags & ~SCOPED_FLAGS_MASK + if source.match(":"): + # non-capturing group with scoped flags + group = NONCAPTURE_GROUP + elif seen_on or seen_off: + # not start of group, just setting flags + state.flags = scoped_flags + scoped_flags = None else: - try: - condgroup = int(condname) - except ValueError: - raise error, "bad character in group name" - else: - # flags - if not source.next in FLAGS: - raise error, "unexpected end of pattern" - while source.next in FLAGS: - state.flags = state.flags | FLAGS[sourceget()] + raise error("unexpected end of pattern") if group: + atomic = group == ATOMIC_GROUP # parse group contents - if group == 2: + if group in [NONCAPTURE_GROUP, ATOMIC_GROUP]: # anonymous group group = None else: - group = state.opengroup(name) + group = state.new_group(name) + saved_flags = state.flags + if scoped_flags is not None: + state.flags = scoped_flags if condgroup: - p = _parse_sub_cond(source, state, condgroup) - else: - p = _parse_sub(source, state) - if not sourcematch(")"): - raise error, "unbalanced parenthesis" - if group is not None: - state.closegroup(group) - subpatternappend((SUBPATTERN, (group, p))) - else: - while 1: - char = sourceget() + p, named_groups = _parse_sub_cond(source, state, named_groups, condgroup) + else: + p, named_groups = _parse_sub(source, state, named_groups, reuse) + state.flags = ((state.flags & ~SCOPED_FLAGS_MASK) | + (saved_flags & SCOPED_FLAGS_MASK)) + if not source.match(")"): + raise error("unbalanced parenthesis") + if atomic: + subpattern.append((OP.ATOMIC, (group, p))) + else: + if group is None: + subpattern.append((OP.SUBPATTERN, (None, p))) + else: + # group reference stored as list instead of tuple so + # that it can be fixed later + ref = OP.SUBPATTERN, (list(group), p) + 
state.fix_list.append(ref) + subpattern.append(ref) + else: + while True: + char = source.get() if char is None: - raise error, "unexpected end of pattern" + raise error("unexpected end of pattern") if char == ")": break - raise error, "unknown extension" - + raise error("unknown extension") elif this == "^": - subpatternappend((AT, AT_BEGINNING)) - + if state.flags & SRE_FLAG_MULTILINE: + subpattern.append((OP.START_OF_LINE, None)) + else: + subpattern.append((OP.START_OF_STRING, None)) elif this == "$": - subpattern.append((AT, AT_END)) - - elif this and this[0] == "\\": - code = _escape(source, this, state) - subpatternappend(code) - + if state.flags & SRE_FLAG_MULTILINE: + subpattern.append((OP.END_OF_LINE, None)) + else: + subpattern.append((OP.END_OF_STRING_LN, None)) + elif this[0] == "\\": + code = escape(source, this, state) + subpattern.append(code) else: - raise error, "parser error" - - return subpattern - -def parse(str, flags=0, pattern=None): + raise error("parser error: %s" % this) + + return subpattern, named_groups + +def fix_ref(ref, index, state): + if ref[index].isdigit(): + ref[index] = int(ref[index]) + if not (1 <= ref[index] <= state.groups): + raise error("invalid group reference: %s" % ref[index]) + else: + try: + ref[index] = state.named_groups[ref[index]] + except KeyError: + raise error("invalid group reference: %s" % ref[index]) + +def fix_grouprefs(p, state): + for name, value in state.named_groups.items(): + state.named_groups[name] = state.groups + 1 + value + GROUPREF_SET = set([OP.GROUPREF, OP.GROUPREF_IGNORE]) + for ref in state.fix_list: + if ref[0] in GROUPREF_SET: + fix_ref(ref, 1, state) + elif ref[0] == OP.GROUPREF_EXISTS: + fix_ref(ref[1], 0, state) + elif ref[0] == OP.SUBPATTERN: + ref = ref[1][0] + if ref[1] is None: + ref[1] = ref[0] + else: + try: + ref[1] = state.named_groups[ref[1]] + except KeyError: + raise error("invalid group reference: %s" % ref[1]) + +def parse(str, flags=0, pattern=None, scanner=False): # 
parse 're' pattern into list of (opcode, argument) tuples source = Tokenizer(str) @@ -669,122 +847,202 @@ pattern = Pattern() pattern.flags = flags pattern.str = str - - p = _parse_sub(source, pattern, 0) - - tail = source.get() - if tail == ")": - raise error, "unbalanced parenthesis" - elif tail: - raise error, "bogus characters at end of regular expression" + pattern.group_count = 0 + + p, named_groups = _parse_sub(source, pattern, set(), False) + + if source.match(")"): + raise error("unbalanced parenthesis") + + if source.next is not None: + raise error("bad characters at end of pattern") + + fix_grouprefs(p, pattern) + + if scanner: + # check that the scanner pattern doesn't have any extra capture groups + # or group references + # (actually, plain capture groups are turned into non-capture groups) + _validate_scanner(p) if flags & SRE_FLAG_DEBUG: p.dump() - if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: - # the VERBOSE flag was switched on inside the pattern. to be - # on the safe side, we'll parse the whole thing again... 
- return parse(str, p.pattern.flags) - return p def parse_template(source, pattern): - # parse 're' replacement string into list of literals and - # group references + # parse 're' replacement string into list of literals and group references + sep = source[ : 0] + char_type = unichr if isinstance(sep, unicode) else chr s = Tokenizer(source) - sget = s.get - p = [] - a = p.append - def literal(literal, p=p, pappend=a): - if p and p[-1][0] is LITERAL: - p[-1] = LITERAL, p[-1][1] + literal - else: - pappend((LITERAL, literal)) - sep = source[:0] - if type(sep) is type(""): - makechar = chr - else: - makechar = unichr - while 1: - this = sget() + literals, groups = [], [] + current_literal = [] + def add_literal(char_code): + current_literal.append(char_type(char_code)) + def flush_literal(): + if current_literal: + literals.append(sep.join(current_literal)) + current_literal[:] = [] + def add_group(index): + flush_literal() + groups.append((index, len(literals))) + literals.append(None) + while True: + this = s.get() if this is None: break # end of replacement string - if this and this[0] == "\\": - # group - c = this[1:2] - if c == "g": - name = "" - if s.match("<"): - while 1: - char = sget() - if char is None: - raise error, "unterminated group name" - if char == ">": - break - name = name + char - if not name: - raise error, "bad group name" - try: + if this[0] == "\\": + c = this[1 : 2] + if c in HEX_ESCAPE_LENGTH: + # hex escape + add_literal(hex_escape(s, escape, HEX_ESCAPE_LENGTH[c])) + elif c == "0": + add_literal(oct_escape(s, this[0], this[1 : ])) + elif c in DIGITS: + if s.next in DIGITS: + this += s.get() + if set(this[1 : ]) <= OCTDIGITS and s.next in OCTDIGITS: + this += s.get() + add_literal(int(this[1 : ], 8) & 0xFF) + else: + index = int(this[1 : ]) + if index > pattern.groups: + raise error("invalid group reference: %s" % index) + add_group(index) + else: + index = int(this[1 : ]) + if index > pattern.groups: + raise error("invalid group reference: 
%s" % index) + add_group(index) + elif c == "g": + # group reference + if s.next in GROUP_DELIMITERS: + # delimited group reference + delimiter = s.get() + name = parse_name(s, GROUP_DELIMITERS[delimiter], "group", + this + delimiter) + elif s.next in DIGITS: + # non-delimited group reference (single digit) + name = s.get() + else: + raise error("missing group name: %s" % this) + if name.isdigit(): index = int(name) - if index < 0: - raise error, "negative group number" - except ValueError: - if not isname(name): - raise error, "bad character in group name" + if not (0 <= index <= pattern.groups): + raise error("invalid group reference: %s" % index) + elif is_name(name): try: index = pattern.groupindex[name] except KeyError: - raise IndexError, "unknown group name" - a((MARK, index)) - elif c == "0": - if s.next in OCTDIGITS: - this = this + sget() - if s.next in OCTDIGITS: - this = this + sget() - literal(makechar(int(this[1:], 8) & 0xff)) - elif c in DIGITS: - isoctal = False - if s.next in DIGITS: - this = this + sget() - if (c in OCTDIGITS and this[2] in OCTDIGITS and - s.next in OCTDIGITS): - this = this + sget() - isoctal = True - literal(makechar(int(this[1:], 8) & 0xff)) - if not isoctal: - a((MARK, int(this[1:]))) - else: + raise error("invalid group reference: %s" % name) + else: + raise error("bad group name: %s" % name) + add_group(index) + elif c == "k": + # named group reference + if s.next in GROUP_DELIMITERS: + # delimited group reference + delimiter = s.get() + name = parse_name(s, GROUP_DELIMITERS[delimiter], "group", + this + delimiter) + else: + # non-delimited group reference; invalid for \k + raise error("missing group name: %s" % this) + if is_name(name): + try: + index = pattern.groupindex[name] + except KeyError: + raise error("invalid group reference: %s" % name) + else: + raise error("bad group name: %s" % name) + add_group(index) + elif c == "N": + # named character + if not s.match("{"): + raise error("missing character name: %s" %
this) + name = parse_name(s, "}", "character", this + "{") try: - this = makechar(ESCAPES[this][1]) + add_literal(ord(unicodedata.lookup(name))) except KeyError: - pass - literal(this) + raise error("bad character name: %s" % name) + else: + try: + add_literal(ESCAPES[this][1]) + except KeyError: + add_literal(ord(this[0])) + add_literal(ord(this[1])) else: - literal(this) - # convert template to groups and literals lists - i = 0 - groups = [] - groupsappend = groups.append - literals = [None] * len(p) - for c, s in p: - if c is MARK: - groupsappend((i, s)) - # literal[i] is already None - else: - literals[i] = s - i = i + 1 - return groups, literals - -def expand_template(template, match): - g = match.group - sep = match.string[:0] - groups, literals = template + add_literal(ord(this)) + flush_literal() + return literals, groups + +def expand_template(template, match, unmatched_as_empty=False): + g = match._internal_group + sep = match.string[ : 0] + literals, groups = template literals = literals[:] try: - for index, group in groups: - literals[index] = s = g(group) + for index, pos in groups: + s = g(index) if s is None: - raise error, "unmatched group" + if unmatched_as_empty: + s = sep + else: + raise error("unmatched group") + literals[pos] = s except IndexError: - raise error, "invalid group reference" + raise error("invalid group reference: %s" % index) return sep.join(literals) + +def _validate_scanner(pattern): + # checks that the scanner pattern doesn't have any extra capture groups + # or group references + # (actually, plain capture groups are turned into non-capture groups) + + # pattern must be an alternative + if len(pattern) != 1 or pattern[0][0] != OP.BRANCH: + raise error("invalid scanner pattern") + + # each alternative must be a capture group + items = pattern[0][1][1] + for index, item in enumerate(items, start=1): + if len(item) != 1 or item[0][0] != OP.SUBPATTERN: + raise error("invalid scanner pattern") + + # ensure that the capture groups are
numbered consecutively + op, av = item[0] + marks, subitems = av + marks = [index, index] + av = marks, _validate_scanner_items(subitems) + item[0] = op, av + +def _validate_scanner_items(items): + # validates the scanner items + new_items = [] + for item in items: + op, av = item + if op in (OP.ASSERT, OP.ASSERT_NOT, OP.ATOMIC, OP.BRANCH): + av = av[0], _validate_scanner_items(av[1]) + item = op, av + elif op in (OP.GROUPREF, OP.GROUPREF_EXISTS, OP.GROUPREF_IGNORE): + # reject group references + raise error("capture group reference in scanner pattern") + elif op == OP.SUBPATTERN: + # turn plain capture groups into non-capture groups and + # reject named capture groups + marks, subitems = av + if marks: + # it's a capture group + if marks[0] != marks[1]: + # it's a named capture group + raise error("capture group in scanner pattern") + marks = None + subitems = _validate_scanner_items(subitems) + if not marks and len(subitems) == 1: + # it's a non-capture group containing one item, so promote it + op, av = subitems[0] + else: + av = marks, subitems + item = op, av + new_items.append(item) + return new_items === modified file Lib/re.py --- Lib/re.py 2009-01-01 12:00:19 +0000 +++ Lib/re.py 2009-03-05 21:30:48 +0000 @@ -27,52 +27,86 @@ concatenate ordinary characters, so last matches the string 'last'. The special characters are: - "." Matches any character except a newline. - "^" Matches the start of the string. - "$" Matches the end of the string or just before the newline at - the end of the string. - "*" Matches 0 or more (greedy) repetitions of the preceding RE. - Greedy means that it will match as many repetitions as possible. - "+" Matches 1 or more (greedy) repetitions of the preceding RE. - "?" Matches 0 or 1 (greedy) of the preceding RE. - *?,+?,?? Non-greedy versions of the previous three special characters. - {m,n} Matches from m to n repetitions of the preceding RE. - {m,n}? Non-greedy version of the above.
- "\\" Either escapes special characters or signals a special sequence. - [] Indicates a set of characters. - A "^" as the first character indicates a complementing set. - "|" A|B, creates an RE that will match either A or B. - (...) Matches the RE inside the parentheses. - The contents can be retrieved or matched later in the string. - (?iLmsux) Set the I, L, M, S, U, or X flag for the RE (see below). - (?:...) Non-grouping version of regular parentheses. - (?P...) The substring matched by the group is accessible by name. - (?P=name) Matches the text matched earlier by the group named name. - (?#...) A comment; ignored. - (?=...) Matches if ... matches next, but doesn't consume the string. - (?!...) Matches if ... doesn't match next. - (?<=...) Matches if preceded by ... (must be fixed length). - (?...) The substring matched by the group is accessible by + name. + (?...) The substring matched by the group is accessible by + name. + (?#...) A comment; ignored. + (?>...) Atomic group. Like (?:...) but won't retry the RE + within the parentheses. + (?=...) Matches if ... matches next, but doesn't consume + the string. + (?!...) Matches if ... doesn't match next. + (?<=...) Matches if preceded by .... + (? Matches the text matched by the group named name. + \g Matches the contents of the group of the same number. + \g<+number> Matches the contents of the group of the relative number. + \g<-number> Matches the contents of the group of the relative number. + \G Matches the empty string, but only at the place where the + previous match ended. + \k Matches the text matched earlier by the group named name. + \N{name} Matches named Unicode character. + \p{name} Matches any character having the named property. + \P{name} Matches any character not having the named property. + \s Matches any whitespace character; equivalent to + [ \t\n\r\f\v]. + \S Matches any non-whitespace character; equivalent to + [^ \t\n\r\f\v]. 
+ \w Matches any alphanumeric character; equivalent to + [a-zA-Z0-9_]. With LOCALE, it will match the set + [0-9_] plus characters defined as letters for the current + locale. + \W Matches the complement of \w. + \Z Matches only at the end of the string. + \\ Matches a literal backslash. This module exports the following functions: match Match a regular expression pattern to the beginning of a string. @@ -84,18 +118,20 @@ finditer Return an iterator yielding a match object for each match. compile Compile a pattern into a RegexObject. purge Clear the regular expression cache. - escape Backslash all non-alphanumerics in a string. + escape Backslash all non-alphanumerics and underscores in a string. Some of the functions in this module takes flags as optional parameters: - I IGNORECASE Perform case-insensitive matching. - L LOCALE Make \w, \W, \b, \B, dependent on the current locale. - M MULTILINE "^" matches the beginning of lines (after a newline) - as well as the string. - "$" matches the end of lines (before a newline) as well - as the end of the string. - S DOTALL "." matches any character at all, including the newline. - X VERBOSE Ignore whitespace and comments for nicer looking RE's. - U UNICODE Make \w, \W, \b, \B, dependent on the Unicode locale. + I IGNORECASE Perform case-insensitive matching. + L LOCALE Make \w, \W, \b, \B, dependent on the current locale. + M MULTILINE "^" matches the beginning of lines (after a newline) as + well as the string. + "$" matches the end of lines (before a newline) as well + as the end of the string. + R REVERSE Search backwards, from the end to the start. + S DOTALL "." matches any character at all, including the newline. + X VERBOSE Ignore whitespace and comments for nicer looking RE's. + U UNICODE Make \w, \W, \b, \B, dependent on the Unicode locale. + Z ZEROWIDTH Permit splitting on zero-width separators. This module also defines an exception 'error'. 
@@ -109,24 +145,27 @@ __all__ = [ "match", "search", "sub", "subn", "split", "findall", "compile", "purge", "template", "escape", "I", "L", "M", "S", "X", "U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE", - "UNICODE", "error" ] - -__version__ = "2.2.1" + "UNICODE", "REVERSE", "error" ] + +__version__ = "2.2.2" # flags I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale +M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline +R = REVERSE = sre_compile.SRE_FLAG_REVERSE # search backwards +S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale -M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline -S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments - +Z = ZEROWIDTH = sre_compile.SRE_FLAG_ZEROWIDTH # permit splitting on zero-width + # separators. # sre extensions (experimental, don't rely on these) T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE # disable backtracking DEBUG = sre_compile.SRE_FLAG_DEBUG # dump pattern after compilation # sre exception -error = sre_compile.error +class error(Exception): + pass # -------------------------------------------------------------------- # public interface @@ -141,16 +180,16 @@ a match object, or None if no match was found.""" return _compile(pattern, flags).search(string) -def sub(pattern, repl, string, count=0): +def sub(pattern, repl, string, count=0, flags=0): """Return the string obtained by replacing the leftmost non-overlapping occurrences of the pattern in string by the replacement repl. repl can be either a string or a callable; if a string, backslash escapes in it are processed. 
If it is a callable, it's passed the match object and must return a replacement string to be used.""" - return _compile(pattern, 0).sub(repl, string, count) - -def subn(pattern, repl, string, count=0): + return _compile(pattern, flags).sub(repl, string, count) + +def subn(pattern, repl, string, count=0, flags=0): """Return a 2-tuple containing (new_string, number). new_string is the string obtained by replacing the leftmost non-overlapping occurrences of the pattern in the source @@ -159,12 +198,12 @@ callable; if a string, backslash escapes in it are processed. If it is a callable, it's passed the match object and must return a replacement string to be used.""" - return _compile(pattern, 0).subn(repl, string, count) - -def split(pattern, string, maxsplit=0): + return _compile(pattern, flags).subn(repl, string, count) + +def split(pattern, string, maxsplit=0, flags=0): """Split the source string by the occurrences of the pattern, returning a list containing the resulting substrings.""" - return _compile(pattern, 0).split(string, maxsplit) + return _compile(pattern, flags).split(string, maxsplit) def findall(pattern, string, flags=0): """Return a list of all non-overlapping matches in the string. @@ -198,23 +237,18 @@ "Compile a template pattern, returning a pattern object" return _compile(pattern, flags|T) -_alphanum = {} -for c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890': - _alphanum[c] = 1 -del c +_nonescaped = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_') def escape(pattern): "Escape all non-alphanumeric characters in pattern." 
s = list(pattern) - alphanum = _alphanum - for i in range(len(pattern)): - c = pattern[i] - if c not in alphanum: + for i, c in enumerate(s): + if c not in _nonescaped: if c == "\000": s[i] = "\\000" else: s[i] = "\\" + c - return pattern[:0].join(s) + return pattern[ : 0].join(s) # -------------------------------------------------------------------- # internals @@ -224,7 +258,7 @@ _pattern_type = type(sre_compile.compile("", 0)) -_MAXCACHE = 100 +_MAXCACHE = 256 def _compile(*key): # internal: compile pattern @@ -237,12 +271,12 @@ if flags: raise ValueError('Cannot process flags argument with a compiled pattern') return pattern - if not sre_compile.isstring(pattern): - raise TypeError, "first argument must be string or compiled pattern" + if not isinstance(pattern, (str, unicode)): + raise TypeError("First argument must be string or compiled pattern") try: p = sre_compile.compile(pattern, flags) - except error, v: - raise error, v # invalid expression + except sre_compile.error, v: + raise error(v) # invalid expression if len(_cache) >= _MAXCACHE: _cache.clear() _cache[cachekey] = p @@ -256,8 +290,8 @@ repl, pattern = key try: p = sre_parse.parse_template(repl, pattern) - except error, v: - raise error, v # invalid expression + except sre_compile.error, v: + raise error(v) # invalid expression if len(_cache_repl) >= _MAXCACHE: _cache_repl.clear() _cache_repl[key] = p @@ -266,7 +300,7 @@ def _expand(pattern, match, template): # internal: match.expand implementation hook template = sre_parse.parse_template(template, pattern) - return sre_parse.expand_template(template, match) + return sre_parse.expand_template(template, match, True) def _subx(pattern, template): # internal: pattern.sub/subn implementation helper @@ -275,7 +309,7 @@ # literal replacement return template[1][0] def filter(match, template=template): - return sre_parse.expand_template(template, match) + return sre_parse.expand_template(template, match, True) return filter # register myself for pickling 
@@ -292,36 +326,48 @@ class Scanner: def __init__(self, lexicon, flags=0): - from sre_constants import BRANCH, SUBPATTERN self.lexicon = lexicon # combine phrases into a compound pattern - p = [] - s = sre_parse.Pattern() - s.flags = flags - for phrase, action in lexicon: - p.append(sre_parse.SubPattern(s, [ - (SUBPATTERN, (len(p)+1, sre_parse.parse(phrase, flags))), - ])) - s.groups = len(p)+1 - p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) - self.scanner = sre_compile.compile(p) + string_type = type(lexicon[0][0]) + sep, template = string_type("|"), string_type("(%s)") + regex = sep.join(template % phrase for phrase, action in lexicon) + # compile pattern, specifying that it's for a scanner, which is + # an alternation of capture groups with no other capture groups + self.scanner = sre_compile.compile(regex, flags, scanner=1) def scan(self, string): result = [] - append = result.append match = self.scanner.scanner(string).match - i = 0 - while 1: + pos = 0 + while True: m = match() if not m: break - j = m.end() - if i == j: + end_pos = m.end() + if pos == end_pos: break - action = self.lexicon[m.lastindex-1][1] + action = self.lexicon[m.lastindex - 1][1] if hasattr(action, '__call__'): self.match = m action = action(self, m.group()) if action is not None: - append(action) - i = j - return result, string[i:] + result.append(action) + pos = end_pos + return result, string[pos : ] + def scaniter(self, string): + match = self.scanner.scanner(string).match + pos = 0 + while True: + m = match() + if not m: + break + end_pos = m.end() + if pos == end_pos: + break + action = self.lexicon[m.lastindex - 1][1] + if hasattr(action, '__call__'): + self.match = m + action = action(self, m.group()) + if action is not None: + yield action + pos = end_pos + self.string = string[pos : ] === modified file Lib/test/re_tests.py --- Lib/test/re_tests.py 2003-04-20 07:35:44 +0000 +++ Lib/test/re_tests.py 2009-02-03 18:18:47 +0000 @@ -87,7 +87,7 @@ (r'[\a][\b][\f][\n][\r][\t][\v]', 
'\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'), # NOTE: not an error under PCRE/PRE: # (r'\u', '', SYNTAX_ERROR), # A Perl escape - (r'\c\e\g\h\i\j\k\m\o\p\q\y\z', 'ceghijkmopqyz', SUCCEED, 'found', 'ceghijkmopqyz'), + (r'\c\e\h\i\j\m\q\y\z', 'cehijmqyz', SUCCEED, 'found', 'cehijmqyz'), (r'\xff', '\377', SUCCEED, 'found', chr(255)), # new \x semantics (r'\x00ffffffffffffff', '\377', FAIL, 'found', chr(255)), @@ -106,8 +106,8 @@ ('a.*b', 'acc\nccb', FAIL), ('a.{4,5}b', 'acc\nccb', FAIL), ('a.b', 'a\rb', SUCCEED, 'found', 'a\rb'), - ('a.b(?s)', 'a\nb', SUCCEED, 'found', 'a\nb'), - ('a.*(?s)b', 'acc\nccb', SUCCEED, 'found', 'acc\nccb'), + ('(?s)a.b', 'a\nb', SUCCEED, 'found', 'a\nb'), + ('(?s)a.*b', 'acc\nccb', SUCCEED, 'found', 'acc\nccb'), ('(?s)a.{4,5}b', 'acc\nccb', SUCCEED, 'found', 'acc\nccb'), ('(?s)a.b', 'a\nb', SUCCEED, 'found', 'a\nb'), @@ -563,7 +563,7 @@ # Check odd placement of embedded pattern modifiers # not an error under PCRE/PRE: - ('w(?i)', 'W', SUCCEED, 'found', 'W'), + ('(?i)w', 'W', SUCCEED, 'found', 'W'), # ('w(?i)', 'W', SYNTAX_ERROR), # Comments using the x embedded pattern modifier @@ -607,8 +607,8 @@ # new \x semantics (r'\x00ff', '\377', FAIL), # (r'\x00ff', '\377', SUCCEED, 'found', chr(255)), - (r'\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'), - ('\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'), + (r'\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', '\t\n\v\r\f\a'), + ('\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', '\t\n\v\r\f\a'), (r'\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)), (r'[\t][\n][\v][\r][\f][\b]', '\t\n\v\r\f\b', SUCCEED, 'found', '\t\n\v\r\f\b'), @@ -627,7 +627,7 @@ # bug 114033: nothing to repeat (r'(x?)?', 'x', SUCCEED, 'found', 'x'), # bug 115040: rescan if flags are modified inside pattern - (r' (?x)foo ', 'foo', SUCCEED, 'found', 'foo'), + (r'(?x) foo ', 'foo', SUCCEED, 'found', 'foo'), # bug 115618: negative lookahead 
(r'(?x)', '\g', 'xx') self.assertRaises(re.error, re.sub, '(?Px)', '\g', 'xx') self.assertRaises(re.error, re.sub, '(?Px)', '\g<1a1>', 'xx') - self.assertRaises(IndexError, re.sub, '(?Px)', '\g', 'xx') - self.assertRaises(re.error, re.sub, '(?Px)|(?Py)', '\g', 'xx') - self.assertRaises(re.error, re.sub, '(?Px)|(?Py)', '\\2', 'xx') + self.assertRaises(re.error, re.sub, '(?Px)', '\g', 'xx') + self.assertEqual(re.sub('(?Px)|(?Py)', '\g', 'xx'), '') + self.assertEqual(re.sub('(?Px)|(?Py)', '\\2', 'xx'), '') self.assertRaises(re.error, re.sub, '(?Px)', '\g<-1>', 'xx') def test_re_subn(self): @@ -208,6 +209,7 @@ None, '::', 'c']) self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"), ['', 'a', '', '', 'c']) + self.assertEqual(re.split("(?z):*", ":a:b::c"), ['', 'a', 'b', 'c', '']) def test_qualified_re_split(self): self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c']) @@ -685,6 +687,102 @@ self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') self.assertEqual(pattern.sub('#', '\n'), '#\n#') + def test_atomic(self): + pattern = re.compile(r'a(?>bc|b)c') + self.assertEqual(pattern.match('abc'), None) + self.assertNotEqual(pattern.match('abcc'), None) + self.assertEqual(re.match(r'(?>.*).', 'abc'), None) + self.assertNotEqual(re.match(r'(?>x)++', 'xxx'), None) + self.assertNotEqual(re.match(r'(?>x++)', 'xxx'), None) + self.assertEqual(re.match(r'(?>x)++x', 'xxx'), None) + self.assertEqual(re.match(r'(?>x++)x', 'xxx'), None) + + def test_bug_2537(self): + "nested repeat" + self.assertEqual(re.sub('((x|y)*)*', '(\\1, \\2)', 'xyyzy', 1), '(, y)zy') + self.assertEqual(re.sub('((x|y+)*)*', '(\\1, \\2)', 'xyyzy', 1), '(, yy)zy') + + def test_named_groups(self): + self.assertEqual(re.match(r"(?Pa)|(?Pb)", "a").groups(), ('a', None)) + self.assertEqual(re.match(r"(?Pa)|(?Pb)", "a").groupdict(), {'a': 'a', 'b': None}) + self.assertEqual(re.match(r"(?Pa)|(?Pb)", "b").groups(), (None, 'b')) + self.assertEqual(re.match(r"(?Pa)|(?Pb)", "b").groupdict(), {'a': None, 'b': 
'b'}) + self.assertEqual(re.match(r"(?Pa)|(?Pb)", "a").groups(), ('a', None)) + self.assertEqual(re.match(r"(?Pa)|(?Pb)", "a").groupdict(), {'a': 'a'}) + self.assertEqual(re.match(r"(?Pa)|(?Pb)", "b").groups(), (None, 'b')) + self.assertEqual(re.match(r"(?Pa)|(?Pb)", "b").groupdict(), {'a': 'b'}) + + def test_duplicate_groups(self): + self.assertEqual(re.match(r"(?:(a)|(b))", "a").groups(), ('a', None)) + self.assertEqual(re.match(r"(?|(a)|(b))", "a").groups(), ('a',)) + + def test_search_anchor(self): + self.assertEqual(re.findall(r"\w", "abc def"), ['a', 'b', 'c', 'd', 'e', 'f']) + self.assertEqual(re.findall(r"\G\w", "abc def"), ['a', 'b', 'c']) + self.assertEqual(re.findall(r"\G\w", " abc def"), []) + + def test_word_chars(self): + word_chars, all_chars = [], [] + accept_set = set(['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Nd', 'No', 'Mc', 'Me', 'Mn', 'Pc']) + for i in range(sys.maxunicode): + c = unichr(i) + if c == '_' or unicodedata.category(c) in accept_set: + word_chars.append(c) + all_chars.append(c) + word_chars = u''.join(word_chars) + found_chars = u''.join(re.findall(r'(?u)(\w)', u''.join(all_chars))) + self.assertEqual(found_chars, word_chars) + + def test_digit_chars(self): + digit_chars, all_chars = [], [] + accept_set = set(['Nd']) + for i in range(sys.maxunicode): + c = unichr(i) + if unicodedata.category(c) in accept_set: + digit_chars.append(c) + all_chars.append(c) + digit_chars = u''.join(digit_chars) + found_chars = u''.join(re.findall(r'(?u)(\d)', u''.join(all_chars))) + self.assertEqual(found_chars, digit_chars) + + def test_named_chars(self): + self.assertNotEqual(re.match(r"\N{LATIN CAPITAL LETTER A}", u"A"), None) + self.assertNotEqual(re.match(r"[\N{LATIN CAPITAL LETTER A}]", u"A"), None) + self.assertEqual(re.match(r"\N{LATIN CAPITAL LETTER A}", u"B"), None) + self.assertEqual(re.match(r"[\N{LATIN CAPITAL LETTER A}]", u"a"), None) + + def test_unicode_properties(self): + self.assertNotEqual(re.match(r"\p{Lu}", u"A"), None) + 
self.assertEqual(re.match(r"\p{Lu}", u"a"), None) + self.assertNotEqual(re.match(r"\p{L&}", u"A"), None) + + ascii_chars = "".join(chr(c) for c in range(0x0, 0x80)) + charsets = r""" +\p{Alnum} [\p{L&}\p{Nd}] [a-zA-Z0-9] +\p{Alpha} \p{L&} [a-zA-Z] +\p{ASCII} [\x00-\x7F] +\p{Blank} [\p{Zs}\t] [ \t] +\p{Cntrl} \p{Cc} [\x00-\x1F\x7F] +\p{Digit} \p{Nd} \d [0-9] +\p{Graph} [^\p{Z}\p{C}] [\x21-\x7E] +\p{Lower} \p{Ll} [a-z] +\p{Print} \P{C} [\x20-\x7E] +\p{Punct} [\p{P}\p{S}] [!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~] +\p{Space} [\p{Z}\t\r\n\v\f] \s [ \t\r\n\v\f] +\p{Upper} \p{Lu} [A-Z] + [\p{L}\p{N}\p{Pc}] \w [A-Za-z0-9_] +\p{XDigit} [A-Fa-f0-9] +""" + for line in charsets.splitlines(): + parts = [p.strip() for p in line.split(" ")] + parts = [p for p in parts if p] + if parts: + matched = [re.findall(p, ascii_chars, re.U) for p in parts] + self.assertEqual(self.all_same(matched), True) + + def all_same(self, items): + first = items[0] + return all(i == first for i in items[1 : ]) def run_re_tests(): from test.re_tests import benchmarks, tests, SUCCEED, FAIL, SYNTAX_ERROR === modified file Modules/_sre.c --- Modules/_sre.c 2008-09-10 14:27:00 +0000 +++ Modules/_sre.c 2009-03-07 02:44:17 +0000 @@ -4,24 +4,25 @@ * regular expression matching engine * * partial history: - * 1999-10-24 fl created (based on existing template matcher code) - * 2000-03-06 fl first alpha, sort of - * 2000-08-01 fl fixes for 1.6b1 - * 2000-08-07 fl use PyOS_CheckStack() if available - * 2000-09-20 fl added expand method - * 2001-03-20 fl lots of fixes for 2.1b2 - * 2001-04-15 fl export copyright as Python attribute, not global - * 2001-04-28 fl added __copy__ methods (work in progress) - * 2001-05-14 fl fixes for 1.5.2 compatibility - * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis) - * 2001-10-18 fl fixed group reset issue (from Matthew Mueller) - * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1 - * 2001-10-21 fl added sub/subn primitive - * 2001-10-24 fl added 
finditer primitive (for 2.2 only) - * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum) - * 2002-11-09 fl fixed empty sub/subn return type - * 2003-04-18 mvl fully support 4-byte codes - * 2003-10-17 gn implemented non recursive scheme + * 1999-10-24 fl created (based on existing template matcher code) + * 2000-03-06 fl first alpha, sort of + * 2000-08-01 fl fixes for 1.6b1 + * 2000-08-07 fl use PyOS_CheckStack() if available + * 2000-09-20 fl added expand method + * 2001-03-20 fl lots of fixes for 2.1b2 + * 2001-04-15 fl export copyright as Python attribute, not global + * 2001-04-28 fl added __copy__ methods (work in progress) + * 2001-05-14 fl fixes for 1.5.2 compatibility + * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis) + * 2001-10-18 fl fixed group reset issue (from Matthew Mueller) + * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1 + * 2001-10-21 fl added sub/subn primitive + * 2001-10-24 fl added finditer primitive (for 2.2 only) + * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum) + * 2002-11-09 fl fixed empty sub/subn return type + * 2003-04-18 mvl fully support 4-byte codes + * 2003-10-17 gn implemented non recursive scheme + * 2008-09-21 mrab major reworking * * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. 
* @@ -37,9 +38,7 @@ #ifndef SRE_RECURSIVE static char copyright[] = - " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB "; - -#define PY_SSIZE_T_CLEAN + " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB "; #include "Python.h" #include "structmember.h" /* offsetof */ @@ -55,11 +54,11 @@ #define SRE_PY_MODULE "re" -/* defining this one enables tracing */ -#undef VERBOSE +/* uncomment this define to enable tracing */ +/* #define VERBOSE_SRE_ENGINE */ #if PY_VERSION_HEX >= 0x01060000 -#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE) +#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE) /* defining this enables unicode support (default under 1.6a1 and later) */ #define HAVE_UNICODE #endif @@ -67,9 +66,6 @@ /* -------------------------------------------------------------------- */ /* optional features */ - -/* enables fast searching */ -#define USE_FAST_SEARCH /* enables aggressive inlining (always on for Visual C) */ #undef USE_INLINE @@ -95,13 +91,13 @@ #endif /* error codes */ -#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */ -#define SRE_ERROR_STATE -2 /* illegal state */ +#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */ +#define SRE_ERROR_STATE -2 /* illegal state */ #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */ -#define SRE_ERROR_MEMORY -9 /* out of memory */ -#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */ - -#if defined(VERBOSE) +#define SRE_ERROR_MEMORY -9 /* out of memory */ +#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */ + +#if defined(VERBOSE_SRE_ENGINE) #define TRACE(v) printf v #else #define TRACE(v) @@ -110,219 +106,586 @@ /* -------------------------------------------------------------------- */ /* search engine state */ -/* default character predicates (run sre_chars.py to regenerate tables) */ - -#define SRE_DIGIT_MASK 1 -#define SRE_SPACE_MASK 2 -#define SRE_LINEBREAK_MASK 4 -#define SRE_ALNUM_MASK 8 -#define SRE_WORD_MASK 16 - -/* FIXME: this assumes ASCII. 
create tables in init_sre() instead */ - -static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, -2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25, -25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, -24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, -0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, -24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 }; - -static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, -27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, -44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, -61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, -108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, -122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, -106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, -120, 121, 122, 123, 124, 125, 126, 127 }; - -#define SRE_IS_DIGIT(ch)\ - ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0) -#define SRE_IS_SPACE(ch)\ - ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0) -#define SRE_IS_LINEBREAK(ch)\ - ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0) -#define SRE_IS_ALNUM(ch)\ - ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0) -#define SRE_IS_WORD(ch)\ - ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0) - -static unsigned int sre_lower(unsigned int ch) -{ - return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch); -} - -/* locale-specific character predicates */ -/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids - * warnings when c's type supports only numbers < N+1 */ -#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0) -#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? 
isspace((ch)) : 0) -#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n') -#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0) -#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_') - -static unsigned int sre_lower_locale(unsigned int ch) -{ - return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch); -} - -/* unicode-specific character predicates */ - -#if defined(HAVE_UNICODE) - -#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch)) -#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch)) -#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch)) -#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch)) -#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_') - -static unsigned int sre_lower_unicode(unsigned int ch) -{ - return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch)); -} - -#endif - -LOCAL(int) -sre_category(SRE_CODE category, unsigned int ch) -{ +/* + The following structs and function are copied from unicodedata.c. + + Ideally the functionality would be available directly from that 'module'. 
+ */ + +typedef struct { + const unsigned char category; /* index into + PyUnicode_CategoryNames */ + const unsigned char combining; /* combining class value 0 - 255 */ + const unsigned char bidirectional; /* index into + PyUnicode_BidirectionalNames */ + const unsigned char mirrored; /* true if mirrored in bidir mode */ + const unsigned char east_asian_width; /* index into + PyUnicode_EastAsianWidth */ +} _PyUnicode_DatabaseRecord; + +typedef struct change_record { + const unsigned char bidir_changed; + const unsigned char category_changed; + const unsigned char decimal_changed; + const unsigned char mirrored_changed; + const int numeric_changed; +} change_record; + +#include "unicodedata_db.h" + +static const unsigned char get_unicode_category(Py_UCS4 code) { + int index; + if (code >= 0x110000) + index = 0; + else { + index = index1[(code >> SHIFT)]; + index = index2[(index << SHIFT) + (code & ((1 << SHIFT) - 1))]; + } + + return _PyUnicode_Database_Records[index].category; +} + +/* ASCII-specific */ + +/* The maximum ASCII character. */ +#define SRE_ASCII_MAX 0x7F + +/* Bit-masks for the character categories. */ +#define SRE_BLANK_MASK 0x001 +#define SRE_DIGIT_MASK 0x002 +#define SRE_GRAPH_MASK 0x004 +#define SRE_LOWER_MASK 0x008 +#define SRE_PRINT_MASK 0x010 +#define SRE_PUNCT_MASK 0x020 +#define SRE_UNDERSCORE_MASK 0x040 +#define SRE_UPPER_MASK 0x080 +#define SRE_XDIGIT_MASK 0x100 +#define SRE_WHITESPACE_MASK 0x200 + +#define SRE_ALPHA_MASK (SRE_LOWER_MASK | SRE_UPPER_MASK) + +/* The categories of the characters. 
*/ +static short sre_ascii_info[SRE_ASCII_MAX + 1] = { + 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, + 0x000, 0x201, 0x200, 0x200, 0x200, 0x200, 0x000, 0x000, + 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, + 0x000, 0x000, 0x000, 0x000, 0x200, 0x200, 0x200, 0x200, + 0x211, 0x034, 0x034, 0x034, 0x034, 0x034, 0x034, 0x034, + 0x034, 0x034, 0x034, 0x034, 0x034, 0x034, 0x034, 0x034, + 0x116, 0x116, 0x116, 0x116, 0x116, 0x116, 0x116, 0x116, + 0x116, 0x116, 0x034, 0x034, 0x034, 0x034, 0x034, 0x034, + 0x034, 0x194, 0x194, 0x194, 0x194, 0x194, 0x194, 0x094, + 0x094, 0x094, 0x094, 0x094, 0x094, 0x094, 0x094, 0x094, + 0x094, 0x094, 0x094, 0x094, 0x094, 0x094, 0x094, 0x094, + 0x094, 0x094, 0x094, 0x034, 0x034, 0x034, 0x034, 0x074, + 0x034, 0x11C, 0x11C, 0x11C, 0x11C, 0x11C, 0x11C, 0x01C, + 0x01C, 0x01C, 0x01C, 0x01C, 0x01C, 0x01C, 0x01C, 0x01C, + 0x01C, 0x01C, 0x01C, 0x01C, 0x01C, 0x01C, 0x01C, 0x01C, + 0x01C, 0x01C, 0x01C, 0x034, 0x034, 0x034, 0x034, 0x000, +}; + +/* Checks whether a ASCII character is in the given category. */ +static BOOL ascii_in_category(SRE_CODE category, Py_UCS4 ch) { + if (ch > SRE_ASCII_MAX) + /* Not ASCII. 
*/ + return FALSE; + + switch(category) { + case SRE_CAT_Alnum: + return (sre_ascii_info[ch] & (SRE_DIGIT_MASK | SRE_ALPHA_MASK)) != 0; + case SRE_CAT_Alpha: + return (sre_ascii_info[ch] & SRE_ALPHA_MASK) != 0; + case SRE_CAT_ASCII: + return TRUE; + case SRE_CAT_Blank: + return (sre_ascii_info[ch] & SRE_BLANK_MASK) != 0; + case SRE_CAT_Cntrl: + return (sre_ascii_info[ch] & SRE_PRINT_MASK) == 0; + case SRE_CAT_Digit: + return (sre_ascii_info[ch] & SRE_DIGIT_MASK) != 0; + case SRE_CAT_Graph: + return (sre_ascii_info[ch] & SRE_GRAPH_MASK) != 0; + case SRE_CAT_LineBreak: + return ch == '\n'; + case SRE_CAT_Lower: + return (sre_ascii_info[ch] & SRE_LOWER_MASK) != 0; + case SRE_CAT_Print: + return (sre_ascii_info[ch] & SRE_PRINT_MASK) != 0; + case SRE_CAT_Punct: + return (sre_ascii_info[ch] & SRE_PUNCT_MASK) != 0; + case SRE_CAT_Space: + return (sre_ascii_info[ch] & SRE_WHITESPACE_MASK) != 0; + case SRE_CAT_Upper: + return (sre_ascii_info[ch] & SRE_UPPER_MASK) != 0; + case SRE_CAT_Word: + return (sre_ascii_info[ch] & + (SRE_DIGIT_MASK | SRE_ALPHA_MASK | SRE_UNDERSCORE_MASK)) != 0; + case SRE_CAT_XDigit: + return (sre_ascii_info[ch] & SRE_XDIGIT_MASK) != 0; + default: + /* Not a known category for ASCII. */ + return FALSE; + } +} + +/* Converts an ASCII character to lowercase. */ +static Py_UCS4 ascii_lower(Py_UCS4 ch) { + if (ch <= SRE_ASCII_MAX && (sre_ascii_info[ch] & SRE_UPPER_MASK) != 0) + /* The character is ASCII and uppercase. */ + return ch ^ 0x20; + + return ch; +} + +/* Converts an ASCII character to uppercase. */ +static Py_UCS4 ascii_upper(Py_UCS4 ch) { + if (ch <= SRE_ASCII_MAX && (sre_ascii_info[ch] & SRE_LOWER_MASK) != 0) + /* The character is ASCII and lowercase. */ + return ch ^ 0x20; + + return ch; +} + +/* The handlers for ASCII characters. */ +static SRE_ENCODING_TABLE ascii_encoding = { + ascii_in_category, + ascii_lower, + ascii_upper, + ascii_upper, /* Titlecase for ASCII is the same as uppercase. 
*/ +}; + +/* Locale-specific */ + +/* The maximum locale character. */ +#define SRE_LOC_MAX 0xFF + +/* Checks whether a locale character is in the given category. */ +static BOOL loc_in_category(SRE_CODE category, Py_UCS4 ch) { + if (ch > SRE_LOC_MAX) + return FALSE; + switch (category) { - - case SRE_CATEGORY_DIGIT: - return SRE_IS_DIGIT(ch); - case SRE_CATEGORY_NOT_DIGIT: - return !SRE_IS_DIGIT(ch); - case SRE_CATEGORY_SPACE: - return SRE_IS_SPACE(ch); - case SRE_CATEGORY_NOT_SPACE: - return !SRE_IS_SPACE(ch); - case SRE_CATEGORY_WORD: - return SRE_IS_WORD(ch); - case SRE_CATEGORY_NOT_WORD: - return !SRE_IS_WORD(ch); - case SRE_CATEGORY_LINEBREAK: - return SRE_IS_LINEBREAK(ch); - case SRE_CATEGORY_NOT_LINEBREAK: - return !SRE_IS_LINEBREAK(ch); - - case SRE_CATEGORY_LOC_WORD: - return SRE_LOC_IS_WORD(ch); - case SRE_CATEGORY_LOC_NOT_WORD: - return !SRE_LOC_IS_WORD(ch); - -#if defined(HAVE_UNICODE) - case SRE_CATEGORY_UNI_DIGIT: - return SRE_UNI_IS_DIGIT(ch); - case SRE_CATEGORY_UNI_NOT_DIGIT: - return !SRE_UNI_IS_DIGIT(ch); - case SRE_CATEGORY_UNI_SPACE: - return SRE_UNI_IS_SPACE(ch); - case SRE_CATEGORY_UNI_NOT_SPACE: - return !SRE_UNI_IS_SPACE(ch); - case SRE_CATEGORY_UNI_WORD: - return SRE_UNI_IS_WORD(ch); - case SRE_CATEGORY_UNI_NOT_WORD: - return !SRE_UNI_IS_WORD(ch); - case SRE_CATEGORY_UNI_LINEBREAK: - return SRE_UNI_IS_LINEBREAK(ch); - case SRE_CATEGORY_UNI_NOT_LINEBREAK: - return !SRE_UNI_IS_LINEBREAK(ch); -#else - case SRE_CATEGORY_UNI_DIGIT: - return SRE_IS_DIGIT(ch); - case SRE_CATEGORY_UNI_NOT_DIGIT: - return !SRE_IS_DIGIT(ch); - case SRE_CATEGORY_UNI_SPACE: - return SRE_IS_SPACE(ch); - case SRE_CATEGORY_UNI_NOT_SPACE: - return !SRE_IS_SPACE(ch); - case SRE_CATEGORY_UNI_WORD: - return SRE_LOC_IS_WORD(ch); - case SRE_CATEGORY_UNI_NOT_WORD: - return !SRE_LOC_IS_WORD(ch); - case SRE_CATEGORY_UNI_LINEBREAK: - return SRE_IS_LINEBREAK(ch); - case SRE_CATEGORY_UNI_NOT_LINEBREAK: - return !SRE_IS_LINEBREAK(ch); -#endif - } - return 0; -} - -/* helpers */ - 
-static void -data_stack_dealloc(SRE_STATE* state) -{ - if (state->data_stack) { - PyMem_FREE(state->data_stack); - state->data_stack = NULL; - } - state->data_stack_size = state->data_stack_base = 0; -} - -static int -data_stack_grow(SRE_STATE* state, Py_ssize_t size) -{ - Py_ssize_t minsize, cursize; - minsize = state->data_stack_base+size; - cursize = state->data_stack_size; - if (cursize < minsize) { - void* stack; - cursize = minsize+minsize/4+1024; - TRACE(("allocate/grow stack %d\n", cursize)); - stack = PyMem_REALLOC(state->data_stack, cursize); - if (!stack) { - data_stack_dealloc(state); - return SRE_ERROR_MEMORY; - } - state->data_stack = (char *)stack; - state->data_stack_size = cursize; - } - return 0; -} - -/* generate 8-bit version */ + case SRE_CAT_Alnum: + return isalnum(ch); + case SRE_CAT_Alpha: + return isalpha(ch); + case SRE_CAT_ASCII: + return ch <= SRE_ASCII_MAX; + case SRE_CAT_Blank: + return ch == '\t' || ch == ' '; + case SRE_CAT_Cntrl: + return !isprint(ch); + case SRE_CAT_Digit: + return isdigit(ch); + case SRE_CAT_Graph: + return isgraph(ch); + case SRE_CAT_LineBreak: + return ch == '\n'; + case SRE_CAT_Lower: + return islower(ch); + case SRE_CAT_Print: + return isprint(ch); + case SRE_CAT_Punct: + return ispunct(ch); + case SRE_CAT_Space: + return isspace(ch); + case SRE_CAT_Upper: + return isupper(ch); + case SRE_CAT_Word: + return ch == '_' || isalnum(ch); + case SRE_CAT_XDigit: + return isxdigit(ch); + default: + /* Not a known category for locale. */ + return FALSE; + } +} + +/* Converts a locale character to lowercase. */ +static Py_UCS4 loc_lower(Py_UCS4 ch) { + if (ch <= SRE_LOC_MAX) + /* The character is locale. */ + return (Py_UCS4)tolower(ch); + + return ch; +} + +/* Converts a locale character to uppercase. */ +static Py_UCS4 loc_upper(Py_UCS4 ch) { + if (ch <= SRE_LOC_MAX) + /* The character is locale. */ + return (Py_UCS4)toupper(ch); + + return ch; +} + +/* The handlers for locale characters. 
*/ +static SRE_ENCODING_TABLE locale_encoding = { + loc_in_category, + loc_lower, + loc_upper, + loc_upper, /* Titlecase for locale is the same as uppercase (probably!). */ +}; + +/* Unicode-specific */ + +/* Checks whether a Unicode character is in the given category. */ +static BOOL uni_in_category(SRE_CODE category, Py_UCS4 ch) { + unsigned char cat = get_unicode_category(ch); + + /* Are we checking for a Unicode category (eg "Lu")? */ + if (category < 0x20) + return cat == category; + + switch (category) { + case SRE_UNI_CAT_L: /* Category "L&" or "L". */ + return (SRE_CAT_MASK_L & (1 << cat)) != 0; + case SRE_UNI_CAT_M: /* Category "M&" or "M". */ + return (SRE_CAT_MASK_M & (1 << cat)) != 0; + case SRE_UNI_CAT_N: /* Category "N&" or "N". */ + return (SRE_CAT_MASK_N & (1 << cat)) != 0; + case SRE_UNI_CAT_Z: /* Category "Z&" or "Z". */ + return (SRE_CAT_MASK_Z & (1 << cat)) != 0; + case SRE_UNI_CAT_C: /* Category "C&" or "C". */ + return (SRE_CAT_MASK_C & (1 << cat)) != 0; + case SRE_UNI_CAT_P: /* Category "P&" or "P". */ + return (SRE_CAT_MASK_P & (1 << cat)) != 0; + case SRE_UNI_CAT_S: /* Category "S&" or "S". 
*/ + return (SRE_CAT_MASK_S & (1 << cat)) != 0; + case SRE_CAT_Alnum: + return (SRE_CAT_MASK_Alnum & (1 << cat)) != 0; + case SRE_CAT_Alpha: + return (SRE_CAT_MASK_Alpha & (1 << cat)) != 0; + case SRE_CAT_ASCII: + return ch <= SRE_ASCII_MAX; + case SRE_CAT_Blank: + return ch == '\t' || cat == SRE_UNI_CAT_Zs; + case SRE_CAT_Cntrl: + return cat == SRE_UNI_CAT_Cc; + case SRE_CAT_Digit: + return cat == SRE_UNI_CAT_Nd; + case SRE_CAT_Graph: + return (SRE_CAT_MASK_Graph & (1 << cat)) != 0; + case SRE_CAT_LineBreak: + return ch == '\n'; + case SRE_CAT_Lower: + return cat == SRE_UNI_CAT_Ll; + case SRE_CAT_Print: + return (SRE_CAT_MASK_Print & (1 << cat)) != 0; + case SRE_CAT_Punct: + return (SRE_CAT_MASK_Punct & (1 << cat)) != 0; + case SRE_CAT_Space: + return ch == '\t' || ch == '\r' || ch == '\n' || ch == '\v' || + ch == '\f' || (SRE_CAT_MASK_Z & (1 << cat)) != 0; + case SRE_CAT_Upper: + return cat == SRE_UNI_CAT_Lu; + case SRE_CAT_Word: + return (SRE_CAT_MASK_Word & (1 << cat)) != 0; + case SRE_CAT_XDigit: + return ch >= '0' && ch <= '9' || ch >= 'A' && ch <= 'F' || + ch >= 'a' && ch <= 'f'; + default: + /* Not a known category for Unicode. */ + return FALSE; + } +} + +/* Converts a Unicode character to lowercase. */ +static Py_UCS4 uni_lower(Py_UCS4 ch) { + return (Py_UCS4)Py_UNICODE_TOLOWER((Py_UNICODE)ch); +} + +/* Converts a Unicode character to uppercase. */ +static Py_UCS4 uni_upper(Py_UCS4 ch) { + return (Py_UCS4)Py_UNICODE_TOUPPER((Py_UNICODE)ch); +} + +/* Converts a Unicode character to titlecase. */ +static Py_UCS4 uni_title(Py_UCS4 ch) { + return (Py_UCS4)Py_UNICODE_TOTITLE((Py_UNICODE)ch); +} + +/* The handlers for Unicode characters. */ +static SRE_ENCODING_TABLE unicode_encoding = { + uni_in_category, + uni_lower, + uni_upper, + uni_title, +}; + +/* Returns the minimum of 2 numbers. */ +LOCAL(unsigned int) unsigned_min(unsigned int x, unsigned int y) { + return x <= y ? x : y; +} + +/* Returns the maximum of 2 numbers. 
*/ +LOCAL(unsigned int) unsigned_max(unsigned int x, unsigned int y) { + return x >= y ? x : y; +} + +/* Returns TRUE if the op is a repeat-one code. */ +LOCAL(BOOL) is_repeat_one(SRE_CODE op) { + switch(op) { + case SRE_OP_REPEAT_ONE_MAX: + case SRE_OP_REPEAT_ONE_MAX_REV: + case SRE_OP_REPEAT_ONE_MIN: + case SRE_OP_REPEAT_ONE_MIN_REV: + case SRE_OP_REPEAT_ONE_POSS: + case SRE_OP_REPEAT_ONE_POSS_REV: + return TRUE; + default: + return FALSE; + } +} + +/* Checks whether a character is in a charset. */ +LOCAL(BOOL) in_charset(SRE_CODE* charset, Py_UCS4 ch) { + /* + Charset format: max_char indexes... chunks... + + The charset format is based on that of BIGCHARSET written by + Martin von Loewis. + + The characters may be mapped to a bitmap. + + To represent a charset, first a bitmap of all characters in the set is + constructed. Then, this bitmap is sliced into chunks of 256 characters, + duplicate chunks are eliminated, and each chunk is given a number. In the + compiled expression, the charset is represented by a codeword sequence, + consisting of one codeword for the maximum character code, a sequence of + chunk numbers (2 per codeword), and a sequence of chunks (8 codewords + each). + + Compression is normally good: in a typical charset, large ranges of Unicode + will be either completely excluded (e.g. if only Cyrillic letters are to be + matched), or completely included (e.g. if large subranges of Kanji match). + These ranges will be represented by chunks of all one-bits or all + zero-bits. + + Matching can be also done efficiently: the most significant bits of the + Unicode character is an index into the chunk number, and the least + significant byte is a bit index into the chunk. + + This format is used even for 8-bit character sets. + + The entire charset is an array of SRE_CODE, so endianness isn't a problem. + */ + Py_ssize_t hi_bytes = ch / 256; /* Split the character code into the */ + Py_ssize_t lo_byte = ch % 256; /* upper and lower bits. 
*/ + Py_ssize_t index; + SRE_CODE* chunk; + SRE_CODE bitmask; + + /* Check against the maximum character code in the charset. */ + if (ch > charset[0]) + /* Definitely not in the charset. */ + return FALSE; + + /* Get the chunk index (2 x 16-bit indexes in each 32-bit codeword). */ + index = (charset[1 + hi_bytes / 2] >> ((hi_bytes % 2) * 16)) & 0xFFFF; + + /* + Point to the chunk. The number of chunk indexes depends on the maximum + character code of the charset, so that needs to be taken into account. + */ + chunk = charset + 2 + charset[0] / 512 + index * (256 / SRE_BITS_PER_CODE); + + /* Check the bit in the chunk. */ + bitmask = 1 << (lo_byte % SRE_BITS_PER_CODE); + return (chunk[lo_byte / SRE_BITS_PER_CODE] & bitmask) != 0; +} + +/* Checks whether a character is in a charset, ignoring the case. */ +LOCAL(BOOL) in_charset_ignore(SRE_STATE* state, SRE_CODE* charset, Py_UCS4 ch) { + /* + Unfortunately we need to check for all 3 possible cases (lower, upper and + title). + + As a example of the problem, normally: + + 'I' <-> 'i' + + but in Turkish: + + uppercase dotless 'I' <-> lowercase dotless 'i' + uppercase dotted 'I' <-> lowercase dotted 'i' + + We therefore adopt the tactic of retaining the case of the character in + the charset and checking whether the character we wish to check is + equivalent to it. This does, however, mean that some characters might be + treated as equivalent when ideally they shouldn't be, eg uppercase + dotless 'I' <-> lowercase dotted 'i' in Turkish. + + I hope that in the future Unicode strings will gain some locale-specific + methods; the regex code can then be improved to become more + locale-friendly. + */ + SRE_ENCODING_TABLE* encoding = state->encoding; + + return in_charset(charset, encoding->lower(ch)) || + in_charset(charset, encoding->upper(ch)) || + in_charset(charset, encoding->title(ch)); +} + +/* Checks whether a character is in a range. 
*/ +LOCAL(BOOL) in_range(SRE_CODE lower, SRE_CODE upper, Py_UCS4 ch) { + return lower <= ch && ch <= upper; +} + +/* Checks whether a character is in a range, ignoring the case. */ +LOCAL(BOOL) in_range_ignore(SRE_STATE* state, SRE_CODE lower, SRE_CODE upper, + Py_UCS4 ch) { + /* + Unfortunately we need to check for all 3 possible cases. + + (Look at in_charset_ignore() for an explanation.) + */ + SRE_ENCODING_TABLE* encoding = state->encoding; + + return in_range(lower, upper, encoding->lower(ch)) || + in_range(lower, upper, encoding->upper(ch)) || + in_range(lower, upper, encoding->title(ch)); +} + +/* Checks whether a character is in a set. */ +LOCAL(BOOL) in_set(SRE_STATE* state, SRE_CODE* charset, Py_UCS4 ch) { + SRE_CODE* charset_end = charset + charset[0]; + SRE_ENCODING_TABLE* encoding = state->encoding; + + charset++; + + do { + switch (charset[0]) { + case SRE_OP_CATEGORY: + /* Character in a certain category. */ + /* */ + if (encoding->in_category(charset[1], ch)) + return TRUE; + charset += 2; + break; + case SRE_OP_CHARSET: + /* Character in a charset. */ + /* */ + if (in_charset(charset + 2, ch)) + return TRUE; + charset += 1 + charset[1]; + break; + case SRE_OP_LITERAL: + /* Character is this literal. */ + /* */ + if (ch == charset[1]) + return TRUE; + charset += 2; + break; + case SRE_OP_RANGE: + /* Character in range. */ + /* */ + if (in_range(charset[1], charset[2], ch)) + return TRUE; + charset += 3; + break; + default: + /* internal error -- there's not much we can do about it + here, so let's just pretend it didn't match... */ + return FALSE; + } + } while (charset < charset_end); + + return FALSE; +} + +/* Checks whether a character is in a set, ignoring case. */ +LOCAL(BOOL) in_set_ignore(SRE_STATE* state, SRE_CODE* charset, Py_UCS4 ch) { + /* + Unfortunately we need to check for all 3 possible cases. + + (Look at in_charset_ignore() for an explanation.) 
+ */ + SRE_ENCODING_TABLE* encoding = state->encoding; + + return in_set(state, charset, encoding->lower(ch)) || + in_set(state, charset, encoding->upper(ch)) || + in_set(state, charset, encoding->title(ch)); +} + +/* Checks whether 2 characters are equivalent, ignoring case. */ +LOCAL(BOOL) same_char_ignore(SRE_STATE* state, Py_UCS4 ch_1, Py_UCS4 ch_2) { + /* Unfortunately we need to check for all 3 possible cases. + + (Look at in_charset_ignore() for an explanation.) + */ + SRE_ENCODING_TABLE* encoding = state->encoding; + + return encoding->lower(ch_1) == encoding->lower(ch_2) || + encoding->upper(ch_1) == encoding->upper(ch_2) || + encoding->title(ch_1) == encoding->title(ch_2); +} + +/* generate bytestring version */ #define SRE_CHAR unsigned char -#define SRE_AT sre_at -#define SRE_COUNT sre_count -#define SRE_CHARSET sre_charset -#define SRE_INFO sre_info -#define SRE_MATCH sre_match -#define SRE_MATCH_CONTEXT sre_match_context -#define SRE_SEARCH sre_search -#define SRE_LITERAL_TEMPLATE sre_literal_template +#define SRE_MATCH sre_bmatch +#define SRE_SEARCH sre_bsearch +#define SRE_LITERAL_TEMPLATE sre_bliteral_template +#define SRE_AT_BOUNDARY sre_bat_boundary +#define SRE_CONTEXT sre_bcontext +#define SRE_SAVE_BACKTRACK sre_bsave_backtrack +#define SRE_DISCARD_BACKTRACK sre_bdiscard_backtrack +#define SRE_REFRESH_MARKS sre_brefresh_marks +#define SRE_DISCARD_UNTIL sre_bdiscard_until +#define SRE_CLEANUP sre_bcleanup +#define SRE_POSSIBLE_MATCH_AHEAD sre_bpossible_match_ahead +#define SRE_MATCH_MANY sre_bmatch_many +#define SRE_MATCH_UNTIL_TAIL sre_bmatch_until_tail +#define SRE_MATCH_MANY_UNTIL_TAIL sre_bmatch_many_until_tail +#define SRE_UNMATCH_UNTIL_TAIL sre_bunmatch_until_tail +#define SRE_UNMATCH_UNTIL_TAIL_REV sre_bunmatch_until_tail_rev +#define SRE_PRINT_TEXT sre_bprint_text #if defined(HAVE_UNICODE) #define SRE_RECURSIVE #include "_sre.c" +#undef SRE_PRINT_TEXT +#undef SRE_UNMATCH_UNTIL_TAIL_REV +#undef SRE_UNMATCH_UNTIL_TAIL +#undef 
SRE_MATCH_MANY_UNTIL_TAIL +#undef SRE_MATCH_UNTIL_TAIL +#undef SRE_MATCH_MANY +#undef SRE_POSSIBLE_MATCH_AHEAD +#undef SRE_CLEANUP +#undef SRE_DISCARD_UNTIL +#undef SRE_REFRESH_MARKS +#undef SRE_DISCARD_BACKTRACK +#undef SRE_SAVE_BACKTRACK +#undef SRE_CONTEXT #undef SRE_RECURSIVE - +#undef SRE_AT_BOUNDARY #undef SRE_LITERAL_TEMPLATE #undef SRE_SEARCH #undef SRE_MATCH -#undef SRE_MATCH_CONTEXT -#undef SRE_INFO -#undef SRE_CHARSET -#undef SRE_COUNT -#undef SRE_AT #undef SRE_CHAR -/* generate 16-bit unicode version */ +/* generate unicode version */ #define SRE_CHAR Py_UNICODE -#define SRE_AT sre_uat -#define SRE_COUNT sre_ucount -#define SRE_CHARSET sre_ucharset -#define SRE_INFO sre_uinfo #define SRE_MATCH sre_umatch -#define SRE_MATCH_CONTEXT sre_umatch_context #define SRE_SEARCH sre_usearch #define SRE_LITERAL_TEMPLATE sre_uliteral_template +#define SRE_AT_BOUNDARY sre_uat_boundary +#define SRE_CONTEXT sre_ucontext +#define SRE_SAVE_BACKTRACK sre_usave_backtrack +#define SRE_DISCARD_BACKTRACK sre_udiscard_backtrack +#define SRE_REFRESH_MARKS sre_urefresh_marks +#define SRE_DISCARD_UNTIL sre_udiscard_until +#define SRE_CLEANUP sre_ucleanup +#define SRE_POSSIBLE_MATCH_AHEAD sre_upossible_match_ahead +#define SRE_MATCH_MANY sre_umatch_many +#define SRE_MATCH_UNTIL_TAIL sre_umatch_until_tail +#define SRE_MATCH_MANY_UNTIL_TAIL sre_umatch_many_until_tail +#define SRE_UNMATCH_UNTIL_TAIL sre_uunmatch_until_tail +#define SRE_UNMATCH_UNTIL_TAIL_REV sre_uunmatch_until_tail_rev +#define SRE_PRINT_TEXT sre_uprint_text #endif #endif /* SRE_RECURSIVE */ @@ -333,1295 +696,4411 @@ /* the following section is compiled twice, with different character settings */ -LOCAL(int) -SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) -{ - /* check if pointer is at given position */ - - Py_ssize_t thisp, thatp; - - switch (at) { - - case SRE_AT_BEGINNING: - case SRE_AT_BEGINNING_STRING: - return ((void*) ptr == state->beginning); - - case SRE_AT_BEGINNING_LINE: - return ((void*) ptr == 
state->beginning || - SRE_IS_LINEBREAK((int) ptr[-1])); - - case SRE_AT_END: - return (((void*) (ptr+1) == state->end && - SRE_IS_LINEBREAK((int) ptr[0])) || - ((void*) ptr == state->end)); - - case SRE_AT_END_LINE: - return ((void*) ptr == state->end || - SRE_IS_LINEBREAK((int) ptr[0])); - - case SRE_AT_END_STRING: - return ((void*) ptr == state->end); - - case SRE_AT_BOUNDARY: - if (state->beginning == state->end) - return 0; - thatp = ((void*) ptr > state->beginning) ? - SRE_IS_WORD((int) ptr[-1]) : 0; - thisp = ((void*) ptr < state->end) ? - SRE_IS_WORD((int) ptr[0]) : 0; - return thisp != thatp; - - case SRE_AT_NON_BOUNDARY: - if (state->beginning == state->end) - return 0; - thatp = ((void*) ptr > state->beginning) ? - SRE_IS_WORD((int) ptr[-1]) : 0; - thisp = ((void*) ptr < state->end) ? - SRE_IS_WORD((int) ptr[0]) : 0; - return thisp == thatp; - - case SRE_AT_LOC_BOUNDARY: - if (state->beginning == state->end) - return 0; - thatp = ((void*) ptr > state->beginning) ? - SRE_LOC_IS_WORD((int) ptr[-1]) : 0; - thisp = ((void*) ptr < state->end) ? - SRE_LOC_IS_WORD((int) ptr[0]) : 0; - return thisp != thatp; - - case SRE_AT_LOC_NON_BOUNDARY: - if (state->beginning == state->end) - return 0; - thatp = ((void*) ptr > state->beginning) ? - SRE_LOC_IS_WORD((int) ptr[-1]) : 0; - thisp = ((void*) ptr < state->end) ? - SRE_LOC_IS_WORD((int) ptr[0]) : 0; - return thisp == thatp; - -#if defined(HAVE_UNICODE) - case SRE_AT_UNI_BOUNDARY: - if (state->beginning == state->end) - return 0; - thatp = ((void*) ptr > state->beginning) ? - SRE_UNI_IS_WORD((int) ptr[-1]) : 0; - thisp = ((void*) ptr < state->end) ? - SRE_UNI_IS_WORD((int) ptr[0]) : 0; - return thisp != thatp; - - case SRE_AT_UNI_NON_BOUNDARY: - if (state->beginning == state->end) - return 0; - thatp = ((void*) ptr > state->beginning) ? - SRE_UNI_IS_WORD((int) ptr[-1]) : 0; - thisp = ((void*) ptr < state->end) ? 
- SRE_UNI_IS_WORD((int) ptr[0]) : 0; - return thisp == thatp; -#endif - - } +/* + The 'context' is the current position within the text and pattern and any + associated info. + */ +typedef struct SRE_CONTEXT { + SRE_STATE* state; /* The state struct. */ + SRE_CHAR* text_ptr; /* Current position within the text. */ + SRE_CHAR* text_beginning; /* True start of the text. */ + SRE_CHAR* text_start; /* Start of the text to search/match. */ + SRE_CHAR* text_end; /* End of the text to search/match; treated as + the true end even if it isn't really + (inherited behaviour). */ + SRE_CHAR* search_ptr; /* Start of the search (used by \G). */ + SRE_CHAR* final_linebreak; /* Position of the final linebreak if it's + the last character, otherwise NULL. */ + SRE_CODE* pattern_ptr; /* Current position within the pattern. */ + SRE_CHAR** marks; /* All the numbered and named marks (start and + end of numbered and named groups). */ + Py_ssize_t marks_size; /* Total size of the numbered and named text + mark pointers. */ + SRE_BACKTRACK_CHUNK* backtrack_chunk; /* Most recent chunk of backtrack + items. */ + SRE_BACKTRACK_ITEM* backtrack_item; /* Current backtrack item. */ +} SRE_CONTEXT; + +/* + Cleans up the context when the match code wants to return its result. + + The result is passed in, the cleanup is done, and then the result is returned. + This makes it a bit tidier in the main match code. + */ +LOCAL(int) SRE_CLEANUP(SRE_CONTEXT* context, int result) { + SRE_BACKTRACK_CHUNK* current; + SRE_BACKTRACK_ITEM* max_item; + SRE_BACKTRACK_ITEM* item; + + /* + Discard all but the first backtrack chunks. + + The first chunk is reused to reduce the overhead. + */ + current = context->backtrack_chunk; + while (current->previous != NULL) { + SRE_BACKTRACK_CHUNK* previous = current->previous; + + /* Discard any stored marks in the chunk. 
*/ + max_item = current->items + current->count; + for(item = current->items; item < max_item; item++) { + if (item->marks != NULL) + PyMem_FREE(item->marks); + } + + PyMem_FREE(current); + current = previous; + } + + /* Discard any stored marks in the first chunk. */ + max_item = current->items + current->count; + for(item = current->items; item < max_item; item++) { + if (item->marks != NULL) + PyMem_FREE(item->marks); + } + + /* Re-initialise the first chunk. */ + current->count = 0; + + context->backtrack_chunk = current; + context->state->backtrack_chunk = current; + + return result; +} + +/* + Saves a backtrack position. + + This saves just the opcode, and the marks if required. + */ +LOCAL(int) SRE_SAVE_BACKTRACK(SRE_CONTEXT* context, SRE_CODE op, + BOOL save_marks) { + SRE_BACKTRACK_CHUNK* chunk = context->backtrack_chunk; + SRE_BACKTRACK_ITEM* item; + + /* Is there an empty slot in the current chunk? */ + if (chunk->count >= SRE_BACKTRACK_CHUNK_SIZE) { + /* Create a new chunk. */ + SRE_BACKTRACK_CHUNK* new_chunk = + (SRE_BACKTRACK_CHUNK*)PyMem_MALLOC(sizeof(SRE_BACKTRACK_CHUNK)); + if (new_chunk == NULL) + return SRE_ERROR_MEMORY; + + /* Link the new chunk at the head of the list. */ + new_chunk->previous = chunk; + new_chunk->count = 0; + context->backtrack_chunk = new_chunk; + chunk = new_chunk; + } + + /* Store the opcode. */ + item = &chunk->items[chunk->count++]; + item->op = op; + + /* Save the marks? */ + if (save_marks && context->marks_size > 0) { + item->marks = PyMem_MALLOC(context->marks_size); + if (item->marks == NULL) + return SRE_ERROR_MEMORY; + + /* Save the marks. */ + memmove(item->marks, context->marks, context->marks_size); + } else + /* No marks. */ + item->marks = NULL; + + /* This is now the current backtrack item. */ + context->backtrack_item = item; return 0; } -LOCAL(int) -SRE_CHARSET(SRE_CODE* set, SRE_CODE ch) -{ - /* check if character is a member of the given set */ - - int ok = 1; +/* Discards the last backtrack item. 
*/ +LOCAL(void) SRE_DISCARD_BACKTRACK(SRE_CONTEXT* context) { + SRE_BACKTRACK_CHUNK* chunk = context->backtrack_chunk; + SRE_BACKTRACK_ITEM* item = &chunk->items[--chunk->count]; + + /* Discard the saved marks, if any. */ + if (item->marks != NULL) + PyMem_FREE(item->marks); + + /* + Are all the slots in the chunk now empty? + + Empty chunks are discarded, except for the first one, which is reused to + reduce the overhead. + */ + if (chunk->count == 0 && chunk->previous != NULL) { + SRE_BACKTRACK_CHUNK* previous = chunk->previous; + PyMem_FREE(chunk); + context->backtrack_chunk = previous; + } +} + +/* Discards all backtrack items until it finds one with a given opcode. */ +LOCAL(void) SRE_DISCARD_UNTIL(SRE_CONTEXT* context, SRE_CODE op) { + SRE_BACKTRACK_ITEM* item; for (;;) { - switch (*set++) { - - case SRE_OP_FAILURE: - return !ok; - + SRE_BACKTRACK_CHUNK* chunk = context->backtrack_chunk; + item = &chunk->items[chunk->count - 1]; + if (item->op == op) + /* Found it! */ + break; + SRE_DISCARD_BACKTRACK(context); + } + + /* Record as the current backtrack item. */ + context->backtrack_item = item; +} + +/* Returns whether the current text position is at a word boundary. */ +LOCAL(BOOL) SRE_AT_BOUNDARY(SRE_CONTEXT* context) { + SRE_ENCODING_TABLE* encoding = context->state->encoding; + + /* Is the previous character part of a word? */ + BOOL before = context->text_ptr > context->text_beginning && + encoding->in_category(SRE_CAT_Word, context->text_ptr[-1]); + + /* Is the current character part of a word? */ + BOOL after = context->text_ptr < context->text_end && + encoding->in_category(SRE_CAT_Word, context->text_ptr[0]); + + /* We're at a word boundary if they're different. */ + return before != after; +} + +/* The 'MARK' operator is 3 codewords long. */ +#define SRE_MARK_OP_SIZE 3 + +/* + Looks ahead to see whether the tail of the pattern _could_ match. + + This is used to avoid creating backtrack points unnecessarily. + + For forwards or backwards searching. 
+ + Some of the code might actually look outside the text, but the worse that + could happen is that it could say that the tail _could_ match, which is the + default result. +*/ +LOCAL(BOOL) SRE_POSSIBLE_MATCH_AHEAD(SRE_CONTEXT* context, SRE_CODE* tail) { + SRE_STATE* state = context->state; + SRE_ENCODING_TABLE* encoding = context->state->encoding; + + /* Skip over any marks. */ + while (tail[0] == SRE_OP_MARK) + tail += SRE_MARK_OP_SIZE; + + switch (tail[0]) { + case SRE_OP_ANY: + /* Any character except a newline (forwards). */ + return !encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[0]); + case SRE_OP_ANY_ALL: + case SRE_OP_ANY_ALL_REV: + /* Any character at all. */ + return TRUE; + case SRE_OP_ANY_REV: + /* Any character except a newline (backwards). */ + return !encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[-1]); + case SRE_OP_BOUNDARY: + /* Boundary between word and non-word. */ + return SRE_AT_BOUNDARY(context); + case SRE_OP_CATEGORY: + /* Character in a certain category (forwards). */ + return encoding->in_category(tail[1], context->text_ptr[0]); + case SRE_OP_CATEGORY_REV: + /* Character in a certain category (backwards). */ + return encoding->in_category(tail[1], context->text_ptr[-1]); + case SRE_OP_CHARSET: + /* Character in a charset (forwards). */ + return in_charset(tail + 2, context->text_ptr[0]); + case SRE_OP_CHARSET_IGNORE: + /* Character in a charset, ignoring case (forwards). */ + return in_charset_ignore(state, tail + 2, context->text_ptr[0]); + case SRE_OP_CHARSET_IGNORE_REV: + /* Character in a charset, ignoring case (backwards). */ + return in_charset_ignore(state, tail + 2, context->text_ptr[-1]); + case SRE_OP_CHARSET_REV: + /* Character in a charset (backwards). */ + return in_charset(tail + 2, context->text_ptr[-1]); + case SRE_OP_END_OF_LINE: + /* End of line. 
*/ + return context->text_ptr >= context->text_end || + encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[0]); + case SRE_OP_END_OF_STRING: + /* End of string. */ + return context->text_ptr >= context->text_end; + case SRE_OP_END_OF_STRING_LN: + /* End of string or final line. */ + return context->text_ptr >= context->text_end || + context->text_ptr == context->final_linebreak; + case SRE_OP_LITERAL: + /* Character is this literal (forwards). */ + return context->text_ptr[0] == (SRE_CHAR)tail[1]; + case SRE_OP_LITERAL_IGNORE: + /* Character is this literal, ignoring case (forwards). */ + return same_char_ignore(state, context->text_ptr[0], tail[1]); + case SRE_OP_LITERAL_IGNORE_REV: + /* Character is this literal, ignoring case (backwards). */ + return same_char_ignore(state, context->text_ptr[-1], tail[1]); + case SRE_OP_LITERAL_REV: + /* Character is this literal (backwards). */ + return context->text_ptr[-1] == (SRE_CHAR)tail[1]; + case SRE_OP_LITERAL_STRING: + /* Literal string (forwards). */ + return context->text_ptr[0] == (SRE_CHAR)tail[2]; + case SRE_OP_LITERAL_STRING_IGNORE: + /* Literal string, ignoring case (forwards). */ + return same_char_ignore(state, context->text_ptr[0], tail[2]); + case SRE_OP_LITERAL_STRING_IGNORE_REV: + /* + Literal string, ignoring case (backwards). + + It's a little bit harder to locate the first character of the literal + string when searching backwards. + */ + return same_char_ignore(state, + context->text_ptr[-(int)tail[1]], tail[2]); + case SRE_OP_LITERAL_STRING_REV: + /* + Literal string (backwards). + + It's a little bit harder to locate the first character of the + literal string when searching backwards. + */ + return context->text_ptr[-(int)tail[1]] == (SRE_CHAR)tail[2]; + case SRE_OP_NOT_BOUNDARY: + /* Not a boundary between word and non-word. */ + return !SRE_AT_BOUNDARY(context); + case SRE_OP_NOT_CATEGORY: + /* Character not in a certain category (forwards). 
*/ + return !encoding->in_category(tail[1], context->text_ptr[0]); + case SRE_OP_NOT_CATEGORY_REV: + /* Character not in a certain category (backwards). */ + return !encoding->in_category(tail[1], context->text_ptr[-1]); + case SRE_OP_NOT_CHARSET: + /* Character not in a charset (forwards). */ + return !in_charset(tail + 2, context->text_ptr[0]); + case SRE_OP_NOT_CHARSET_IGNORE: + /* Character not in a charset,ignoring case (forwards). */ + return !in_charset_ignore(state, tail + 2, context->text_ptr[0]); + case SRE_OP_NOT_CHARSET_IGNORE_REV: + /* Character not in a charset,ignoring case (backwards). */ + return !in_charset_ignore(state, tail + 2, context->text_ptr[-1]); + case SRE_OP_NOT_CHARSET_REV: + /* Character not in a charset (backwards). */ + return !in_charset(tail + 2, context->text_ptr[-1]); + case SRE_OP_NOT_LITERAL: + /* Character is not this literal (forwards). */ + return context->text_ptr[0] != (SRE_CHAR)tail[1]; + case SRE_OP_NOT_LITERAL_IGNORE: + /* Character is not this literal, ignoring case (forwards). */ + return !same_char_ignore(state, context->text_ptr[0], tail[1]); + case SRE_OP_NOT_LITERAL_IGNORE_REV: + /* Character is not this literal, ignoring case (backwards). */ + return !same_char_ignore(state, context->text_ptr[-1], tail[1]); + case SRE_OP_NOT_LITERAL_REV: + /* Character is not this literal (backwards). */ + return context->text_ptr[-1] != (SRE_CHAR)tail[1]; + case SRE_OP_NOT_RANGE: + /* Character not in range (forwards). */ + return !in_range(tail[1], tail[2], context->text_ptr[0]); + case SRE_OP_NOT_RANGE_IGNORE: + /* Character not in range, ignoring case (forwards). */ + return !in_range_ignore(state, tail[1], tail[2], context->text_ptr[0]); + case SRE_OP_NOT_RANGE_IGNORE_REV: + /* Character not in range, ignoring case (backwards). */ + return !in_range_ignore(state, tail[1], tail[2], context->text_ptr[-1]); + case SRE_OP_NOT_RANGE_REV: + /* Character not in range (backwards). 
*/ + return !in_range(tail[1], tail[2], context->text_ptr[-1]); + case SRE_OP_NOT_SET: + /* Character not in set (forwards). */ + return !in_set(state, tail + 1, context->text_ptr[0]); + case SRE_OP_NOT_SET_IGNORE: + /* Character not in set, ignoring case (forwards). */ + return !in_set_ignore(state, tail + 1, context->text_ptr[0]); + case SRE_OP_NOT_SET_IGNORE_REV: + /* Character not in set, ignoring case (backwards). */ + return !in_set_ignore(state, tail + 1, context->text_ptr[-1]); + case SRE_OP_NOT_SET_REV: + /* Character not in set (backwards). */ + return !in_set(state, tail + 1, context->text_ptr[-1]); + case SRE_OP_RANGE: + /* Character in range (forwards). */ + return in_range(tail[1], tail[2], context->text_ptr[0]); + case SRE_OP_RANGE_IGNORE: + /* Character in range, ignoring case (forwards). */ + return in_range_ignore(state, tail[1], tail[2], context->text_ptr[0]); + case SRE_OP_RANGE_IGNORE_REV: + /* Character in range, ignoring case (backwards). */ + return in_range_ignore(state, tail[1], tail[2], context->text_ptr[-1]); + case SRE_OP_RANGE_REV: + /* Character in range (backwards). */ + return in_range(tail[1], tail[2], context->text_ptr[-1]); + case SRE_OP_SET: + /* Character in set (forwards). */ + return in_set(state, tail + 1, context->text_ptr[0]); + case SRE_OP_SET_IGNORE: + /* Character in set, ignoring case (forwards). */ + return in_set_ignore(state, tail + 1, context->text_ptr[0]); + case SRE_OP_SET_IGNORE_REV: + /* Character in set, ignoring case (backwards). */ + return in_set_ignore(state, tail + 1, context->text_ptr[-1]); + case SRE_OP_SET_REV: + /* Character in set (backwards). */ + return in_set(state, tail + 1, context->text_ptr[-1]); + case SRE_OP_START_OF_LINE: + /* Start of line. */ + return context->text_ptr == context->text_beginning || + encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[-1]); + case SRE_OP_START_OF_SEARCH: + /* Start of search. 
*/ + return context->text_ptr == context->search_ptr; + case SRE_OP_START_OF_STRING: + /* Start of string. */ + return context->text_ptr == context->text_beginning; + default: + /* Anything else we'll assume could match. */ + return TRUE; + } +} + +/* + Matches single characters up to a maximum. + + This is used for matching a repeated single-character pattern. It's more + efficient that the general multi-character repeat. + + For forwards or backwards searching. + */ +LOCAL(void) SRE_MATCH_MANY(SRE_CONTEXT* context, SRE_CHAR* max_ptr, + SRE_CODE* body) { + SRE_STATE* state = context->state; + SRE_ENCODING_TABLE* encoding = state->encoding; + + switch (body[0]) { + case SRE_OP_ANY: + /* Any character except a newline (forwards). */ + while (context->text_ptr < max_ptr && + !encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_ANY_ALL: + /* Any character at all (forwards). */ + if (context->text_ptr < max_ptr) + context->text_ptr = max_ptr; + break; + case SRE_OP_ANY_ALL_REV: + /* Any character at all (backwards). */ + if (context->text_ptr > max_ptr) + context->text_ptr = max_ptr; + break; + case SRE_OP_ANY_REV: + /* Any character except a newline (backwards). */ + while (context->text_ptr > max_ptr && + !encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_CATEGORY: + /* Character in a certain category (forwards). */ + while (context->text_ptr < max_ptr && + encoding->in_category(body[1], context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_CATEGORY_REV: + /* Character in a certain category (backwards). */ + while (context->text_ptr > max_ptr && + encoding->in_category(body[1], context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_CHARSET: + /* Character in a charset (forwards). 
*/ + while (context->text_ptr < max_ptr && + in_charset(body + 2, context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_CHARSET_IGNORE: + /* Character in a charset, ignoring case (forwards). */ + while (context->text_ptr < max_ptr && + in_charset_ignore(state, body + 2, context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_CHARSET_IGNORE_REV: + /* Character in a charset, ignoring case (backwards). */ + while (context->text_ptr > max_ptr && + in_charset_ignore(state, body + 2, context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_CHARSET_REV: + /* Character in a charset (backwards). */ + while (context->text_ptr > max_ptr && + in_charset(body + 2, context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_LITERAL: + /* Character is this literal (forwards). */ + while (context->text_ptr < max_ptr && + context->text_ptr[0] == (SRE_CHAR)body[1]) + context->text_ptr++; + break; + case SRE_OP_LITERAL_IGNORE: + /* Character is this literal, ignoring case (forwards). */ + while (context->text_ptr < max_ptr && + same_char_ignore(state, context->text_ptr[0], body[1])) + context->text_ptr++; + break; + case SRE_OP_LITERAL_IGNORE_REV: + /* + Character is this literal, ignoring case (backwards). + + Steps back while the preceding character equals the literal, ignoring + case; this is the mirror image of the forwards case above. + */ + while (context->text_ptr > max_ptr && + same_char_ignore(state, context->text_ptr[-1], body[1])) + context->text_ptr--; + break; + case SRE_OP_LITERAL_REV: + /* Character is this literal (backwards). */ + while (context->text_ptr > max_ptr && + context->text_ptr[-1] == (SRE_CHAR)body[1]) + context->text_ptr--; + break; + case SRE_OP_NOT_CATEGORY: + /* Character not in a certain category (forwards). */ + while (context->text_ptr < max_ptr && + !encoding->in_category(body[1], context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_NOT_CATEGORY_REV: + /* Character not in a certain category (backwards).
*/ + while (context->text_ptr > max_ptr && + !encoding->in_category(body[1], context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_NOT_CHARSET: + /* Character not in a charset (forwards). */ + while (context->text_ptr < max_ptr && + !in_charset(body + 2, context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_NOT_CHARSET_IGNORE: + /* Character not in a charset,ignoring case (forwards). */ + while (context->text_ptr < max_ptr && + !in_charset_ignore(state, body + 2, context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_NOT_CHARSET_IGNORE_REV: + /* Character not in a charset,ignoring case (backwards). */ + while (context->text_ptr > max_ptr && + !in_charset_ignore(state, body + 2, context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_NOT_CHARSET_REV: + /* Character not in a charset (backwards). */ + while (context->text_ptr > max_ptr && + !in_charset(body + 2, context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_NOT_LITERAL: + /* Character is not this literal (forwards). */ + while (context->text_ptr < max_ptr && + context->text_ptr[0] != (SRE_CHAR)body[1]) + context->text_ptr++; + break; + case SRE_OP_NOT_LITERAL_IGNORE: + /* Character is not this literal, ignoring case (forwards). */ + while (context->text_ptr < max_ptr && + !same_char_ignore(state, context->text_ptr[0], body[1])) + context->text_ptr++; + break; + case SRE_OP_NOT_LITERAL_IGNORE_REV: + /* Character is not this literal, ignoring case (backwards). */ + while (context->text_ptr > max_ptr && + !same_char_ignore(state, context->text_ptr[-1], body[1])) + context->text_ptr--; + break; + case SRE_OP_NOT_LITERAL_REV: + /* Character is not this literal (backwards). */ + while (context->text_ptr > max_ptr && + context->text_ptr[-1] != (SRE_CHAR)body[1]) + context->text_ptr--; + break; + case SRE_OP_NOT_RANGE: + /* Character not in range (forwards). 
*/ + while (context->text_ptr < max_ptr && + !in_range(body[1], body[2], context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_NOT_RANGE_IGNORE: + /* Character not in range, ignoring case (forwards). */ + while (context->text_ptr < max_ptr && + !in_range_ignore(state, body[1], body[2], context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_NOT_RANGE_IGNORE_REV: + /* Character not in range, ignoring case (backwards). */ + while (context->text_ptr > max_ptr && + !in_range_ignore(state, body[1], body[2], context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_NOT_RANGE_REV: + /* Character not in range (backwards). */ + while (context->text_ptr > max_ptr && + !in_range(body[1], body[2], context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_NOT_SET: + /* Character not in set (forwards). */ + while (context->text_ptr < max_ptr && + !in_set(state, body + 1, context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_NOT_SET_IGNORE: + /* Character not in set, ignoring case (forwards). */ + while (context->text_ptr < max_ptr && + !in_set_ignore(state, body + 1, context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_NOT_SET_IGNORE_REV: + /* Character not in set, ignoring case (backwards). */ + while (context->text_ptr > max_ptr && + !in_set_ignore(state, body + 1, context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_NOT_SET_REV: + /* Character not in set (backwards). */ + while (context->text_ptr > max_ptr && + !in_set(state, body + 1, context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_RANGE: + /* Character in range (forwards). */ + while (context->text_ptr < max_ptr && + in_range(body[1], body[2], context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_RANGE_IGNORE: + /* Character in range, ignoring case (forwards). 
*/ + while (context->text_ptr < max_ptr && + in_range_ignore(state, body[1], body[2], context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_RANGE_IGNORE_REV: + /* Character in range, ignoring case (backwards). */ + while (context->text_ptr > max_ptr && + in_range_ignore(state, body[1], body[2], context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_RANGE_REV: + /* Character in range (backwards). */ + while (context->text_ptr > max_ptr && + in_range(body[1], body[2], context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_SET: + /* Character in set (forwards). */ + while (context->text_ptr < max_ptr && + in_set(state, body + 1, context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_SET_IGNORE: + /* Character in set, ignoring case (forwards). */ + while (context->text_ptr < max_ptr && + in_set_ignore(state, body + 1, context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_SET_IGNORE_REV: + /* Character in set, ignoring case (backwards). */ + while (context->text_ptr > max_ptr && + in_set_ignore(state, body + 1, context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_SET_REV: + /* Character in set (backwards). */ + while (context->text_ptr > max_ptr && + in_set(state, body + 1, context->text_ptr[-1])) + context->text_ptr--; + break; + } +} + +/* + 'Unmatches' single characters down to a minimum or until the tail _could_ + match. + + Returns FALSE if the minimum is reached but the tail still couldn't match. + + This is used for 'unmatching' a repeated single-character pattern. It's more + efficient that the general multi-character repeat. + + For forwards searching only. + */ +LOCAL(BOOL) SRE_UNMATCH_UNTIL_TAIL(SRE_CONTEXT* context, SRE_CHAR* min_ptr, + SRE_CODE* tail) { + SRE_STATE* state = context->state; + SRE_ENCODING_TABLE* encoding = state->encoding; + + /* Skip over any marks. 
*/ + while (tail[0] == SRE_OP_MARK) + tail += SRE_MARK_OP_SIZE; + + switch (tail[0]) { + case SRE_OP_ANY: + /* Any character except a newline. */ + while (context->text_ptr >= min_ptr && + encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_ANY_ALL: + /* Any character at all. */ + break; + case SRE_OP_BOUNDARY: + /* Boundary between word and non-word. */ + while (context->text_ptr >= min_ptr && + !SRE_AT_BOUNDARY(context)) + context->text_ptr--; + break; + case SRE_OP_CATEGORY: + /* Character in a certain category. */ + while (context->text_ptr >= min_ptr && + !encoding->in_category(tail[1], context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_CHARSET: + /* Character in a charset. */ + while (context->text_ptr >= min_ptr && + !in_charset(tail + 2, context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_CHARSET_IGNORE: + /* Character in a charset, ignoring case. */ + while (context->text_ptr >= min_ptr && + !in_charset_ignore(state, tail + 2, context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_END_OF_LINE: + /* End of line. */ + if (context->text_ptr < context->text_end) { + while (context->text_ptr >= min_ptr && + !encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[0])) + context->text_ptr--; + } + break; + case SRE_OP_END_OF_STRING: + /* End of string. */ + if (context->text_ptr >= min_ptr && + context->text_ptr < context->text_end) + context->text_ptr = min_ptr - 1; + break; + case SRE_OP_END_OF_STRING_LN: + /* End of string or final line. */ + if (context->text_ptr < context->text_end && + context->text_ptr != context->final_linebreak && + context->text_ptr >= min_ptr) + context->text_ptr = min_ptr - 1; + break; + case SRE_OP_LITERAL: + /* Character is this literal. 
*/ + while (context->text_ptr >= min_ptr && + context->text_ptr[0] != (SRE_CHAR)tail[1]) + context->text_ptr--; + break; + case SRE_OP_LITERAL_IGNORE: + /* Character is this literal, ignoring case. */ + while (context->text_ptr >= min_ptr && + !same_char_ignore(state, context->text_ptr[0], tail[1])) + context->text_ptr--; + break; + case SRE_OP_LITERAL_STRING: + /* Literal string. */ + while (context->text_ptr >= min_ptr && + context->text_ptr[0] != (SRE_CHAR)tail[2]) + context->text_ptr--; + break; + case SRE_OP_LITERAL_STRING_IGNORE: + /* Literal string, ignoring case. */ + while (context->text_ptr >= min_ptr && + !same_char_ignore(state, context->text_ptr[0], tail[2])) + context->text_ptr--; + break; + case SRE_OP_NOT_BOUNDARY: + /* Not a boundary between word and non-word. */ + while (context->text_ptr >= min_ptr && + SRE_AT_BOUNDARY(context)) + context->text_ptr--; + break; + case SRE_OP_NOT_CATEGORY: + /* Character not in a certain category. */ + while (context->text_ptr >= min_ptr && + encoding->in_category(tail[1], context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_NOT_CHARSET: + /* Character not in a charset. */ + while (context->text_ptr >= min_ptr && + in_charset(tail + 2, context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_NOT_CHARSET_IGNORE: + /* Character not in a charset,ignoring case. */ + while (context->text_ptr >= min_ptr && + in_charset_ignore(state, tail + 2, context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_NOT_LITERAL: + /* Character is not this literal. */ + while (context->text_ptr >= min_ptr && + context->text_ptr[0] == (SRE_CHAR)tail[1]) + context->text_ptr--; + break; + case SRE_OP_NOT_LITERAL_IGNORE: + /* Character is not this literal, ignoring case. */ + while (context->text_ptr >= min_ptr && + same_char_ignore(state, context->text_ptr[0], tail[1])) + context->text_ptr--; + break; + case SRE_OP_NOT_RANGE: + /* Character not in range. 
*/ + while (context->text_ptr >= min_ptr && + in_range(tail[1], tail[2], context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_NOT_RANGE_IGNORE: + /* Character not in range, ignoring case. */ + while (context->text_ptr >= min_ptr && + in_range_ignore(state, tail[1], tail[2], context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_NOT_SET: + /* Character not in set. */ + while (context->text_ptr >= min_ptr && + in_set(state, tail + 1, context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_NOT_SET_IGNORE: + /* Character not in set, ignoring case. */ + while (context->text_ptr >= min_ptr && + in_set_ignore(state, tail + 1, context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_RANGE: + /* Character in range. */ + while (context->text_ptr >= min_ptr && + !in_range(tail[1], tail[2], context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_RANGE_IGNORE: + /* Character in range, ignoring case. */ + while (context->text_ptr >= min_ptr && + !in_range_ignore(state, tail[1], tail[2], context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_SET: + /* Character in set. */ + while (context->text_ptr >= min_ptr && + !in_set(state, tail + 1, context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_SET_IGNORE: + /* Character in set, ignoring case. */ + while (context->text_ptr >= min_ptr && + !in_set_ignore(state, tail + 1, context->text_ptr[0])) + context->text_ptr--; + break; + case SRE_OP_START_OF_LINE: + /* + Start of line: the tail can match at the very beginning or just + after a line break, so keep unmatching while neither holds. + */ + while (context->text_ptr >= min_ptr && + context->text_ptr != context->text_beginning && + !encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[-1])) + context->text_ptr--; + break; + case SRE_OP_START_OF_SEARCH: + /* Start of search. */ + while (context->text_ptr >= min_ptr && + context->text_ptr != context->search_ptr) + context->text_ptr--; + break; + case SRE_OP_START_OF_STRING: + /* Start of string.
*/ + if (context->text_ptr >= min_ptr && + context->text_ptr > context->text_beginning) { + /* + Only the very start of the text can match. It is still reachable + when min_ptr is the start of the text; otherwise fail. + */ + if (min_ptr > context->text_beginning) + context->text_ptr = min_ptr - 1; + else + context->text_ptr = context->text_beginning; + } + break; + } + + return context->text_ptr >= min_ptr; +} + +/* + 'Unmatches' single characters down to a minimum or until the tail _could_ + match. Here unmatching moves context->text_ptr forwards, so min_ptr is the + highest position allowed. + + Returns FALSE if the minimum is reached but the tail still couldn't match. + + This is used for 'unmatching' a repeated single-character pattern. It's more + efficient than the general multi-character repeat. + + For backwards searching only. + */ +LOCAL(BOOL) SRE_UNMATCH_UNTIL_TAIL_REV(SRE_CONTEXT* context, SRE_CHAR* min_ptr, + SRE_CODE* tail) { + SRE_STATE* state = context->state; + SRE_ENCODING_TABLE* encoding = state->encoding; + + /* Skip over any marks. */ + while (tail[0] == SRE_OP_MARK) + tail += SRE_MARK_OP_SIZE; + + switch (tail[0]) { + case SRE_OP_ANY_ALL_REV: + /* Any character at all. */ + break; + case SRE_OP_ANY_REV: + /* Any character except a newline. */ + while (context->text_ptr <= min_ptr && + encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_BOUNDARY: + /* Boundary between word and non-word. */ + while (context->text_ptr <= min_ptr && + !SRE_AT_BOUNDARY(context)) + context->text_ptr++; + break; + case SRE_OP_CATEGORY_REV: + /* Character in a certain category. */ + while (context->text_ptr <= min_ptr && + !encoding->in_category(tail[1], context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_CHARSET_IGNORE_REV: + /* Character in a charset, ignoring case. */ + while (context->text_ptr <= min_ptr && + !in_charset_ignore(state, tail + 2, context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_CHARSET_REV: + /* Character in a charset. */ + while (context->text_ptr <= min_ptr && + !in_charset(tail + 2, context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_END_OF_LINE: + /* End of line.
*/ + /* + A position is an end of line when the character AT it is a line + break, so test text_ptr[0] (in bounds because text_ptr < text_end). + */ + while (context->text_ptr <= min_ptr && + context->text_ptr < context->text_end && + !encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[0])) + context->text_ptr++; + break; + case SRE_OP_END_OF_STRING: + /* End of string. */ + while (context->text_ptr <= min_ptr && + context->text_ptr < context->text_end) + context->text_ptr++; + break; + case SRE_OP_END_OF_STRING_LN: + /* End of string or final line. */ + while (context->text_ptr <= min_ptr && + context->text_ptr < context->text_end && + context->text_ptr != context->final_linebreak) + context->text_ptr++; + break; + case SRE_OP_LITERAL_IGNORE_REV: + /* Character is this literal, ignoring case. */ + while (context->text_ptr <= min_ptr && + !same_char_ignore(state, context->text_ptr[-1], tail[1])) + context->text_ptr++; + break; + case SRE_OP_LITERAL_REV: + /* Character is this literal. */ + while (context->text_ptr <= min_ptr && + context->text_ptr[-1] != (SRE_CHAR)tail[1]) + context->text_ptr++; + break; + case SRE_OP_LITERAL_STRING_IGNORE_REV: + /* Literal string, ignoring case. */ + while (context->text_ptr <= min_ptr && + !same_char_ignore(state, context->text_ptr[-(int)tail[1]], tail[2])) + context->text_ptr++; + break; + case SRE_OP_LITERAL_STRING_REV: + /* Literal string. */ + while (context->text_ptr <= min_ptr && + context->text_ptr[-(int)tail[1]] != (SRE_CHAR)tail[2]) + context->text_ptr++; + break; + case SRE_OP_NOT_BOUNDARY: + /* Not a boundary between word and non-word. */ + while (context->text_ptr <= min_ptr && + SRE_AT_BOUNDARY(context)) + context->text_ptr++; + break; + case SRE_OP_NOT_CATEGORY_REV: + /* Character not in a certain category. */ + while (context->text_ptr <= min_ptr && + encoding->in_category(tail[1], context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_NOT_CHARSET_IGNORE_REV: + /* Character not in a charset, ignoring case.
*/ + while (context->text_ptr <= min_ptr && + in_charset_ignore(state, tail + 2, context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_NOT_CHARSET_REV: + /* Character not in a charset. */ + while (context->text_ptr <= min_ptr && + in_charset(tail + 2, context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_NOT_LITERAL_IGNORE_REV: + /* Character is not this literal, ignoring case. */ + while (context->text_ptr <= min_ptr && + same_char_ignore(state, context->text_ptr[-1], tail[1])) + context->text_ptr++; + break; + case SRE_OP_NOT_LITERAL_REV: + /* Character is not this literal. */ + while (context->text_ptr <= min_ptr && + context->text_ptr[-1] == (SRE_CHAR)tail[1]) + context->text_ptr++; + break; + case SRE_OP_NOT_RANGE_IGNORE_REV: + /* Character not in range, ignoring case. */ + while (context->text_ptr <= min_ptr && + in_range_ignore(state, tail[1], tail[2], context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_NOT_RANGE_REV: + /* Character not in range. */ + while (context->text_ptr <= min_ptr && + in_range(tail[1], tail[2], context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_NOT_SET_IGNORE_REV: + /* Character not in set, ignoring case. */ + while (context->text_ptr <= min_ptr && + in_set_ignore(state, tail + 1, context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_NOT_SET_REV: + /* Character not in set. */ + while (context->text_ptr <= min_ptr && + in_set(state, tail + 1, context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_RANGE_IGNORE_REV: + /* Character in range, ignoring case. */ + while (context->text_ptr <= min_ptr && + !in_range_ignore(state, tail[1], tail[2], context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_RANGE_REV: + /* Character in range. */ + while (context->text_ptr <= min_ptr && + !in_range(tail[1], tail[2], context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_SET_IGNORE_REV: + /* Character in set, ignoring case. 
*/ + while (context->text_ptr <= min_ptr && + !in_set_ignore(state, tail + 1, context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_SET_REV: + /* Character in set. */ + while (context->text_ptr <= min_ptr && + !in_set(state, tail + 1, context->text_ptr[-1])) + context->text_ptr++; + break; + case SRE_OP_START_OF_LINE: + /* + Start of line: the tail can match at the very beginning or just + after a line break, so keep unmatching while neither holds. + */ + if (context->text_ptr > context->text_beginning) { + while (context->text_ptr <= min_ptr && + !encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[-1])) + context->text_ptr++; + } + break; + case SRE_OP_START_OF_SEARCH: + /* Start of search. */ + while (context->text_ptr <= min_ptr && + context->text_ptr != context->search_ptr) + context->text_ptr++; + break; + case SRE_OP_START_OF_STRING: + /* Start of string. */ + if (context->text_ptr > context->text_beginning) + context->text_ptr = min_ptr + 1; + break; + } + + return context->text_ptr <= min_ptr; +} + +/* + Matches single characters until the tail _could_ match, up to a maximum. + Returns FALSE if the body stops matching or the maximum is reached while the + tail still couldn't match. + + This is used for matching a repeated single-character pattern. It's more + efficient than the general multi-character repeat. + + For forwards or backwards searching. + */ +LOCAL(BOOL) SRE_MATCH_UNTIL_TAIL(SRE_CONTEXT* context, SRE_CHAR* max_ptr, + SRE_CODE* body, SRE_CODE* tail) { + SRE_STATE* state = context->state; + SRE_ENCODING_TABLE* encoding = state->encoding; + + /* Skip over any marks. */ + while (tail[0] == SRE_OP_MARK) + tail += SRE_MARK_OP_SIZE; + + switch (body[0]) { + case SRE_OP_ANY: + /* Any character except a newline (forwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr >= max_ptr || + encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } + break; + case SRE_OP_ANY_ALL: + /* Any character at all (forwards).
*/ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr >= max_ptr) + return FALSE; + context->text_ptr++; + } + break; + case SRE_OP_ANY_ALL_REV: + /* Any character at all (backwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr <= max_ptr) + return FALSE; + context->text_ptr--; + } + break; + case SRE_OP_ANY_REV: + /* Any character except a newline (backwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr <= max_ptr || + encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } + break; + case SRE_OP_CATEGORY: + /* Character in a certain category (forwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr >= max_ptr || + !encoding->in_category(body[1], context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } + break; + case SRE_OP_CHARSET: + /* Character in a charset (forwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr >= max_ptr || + !in_charset(body + 2, context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } + break; + case SRE_OP_CHARSET_IGNORE: + /* Character in a charset, ignoring case (forwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr >= max_ptr || + !in_charset_ignore(state, body + 2, context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } + break; + case SRE_OP_CHARSET_IGNORE_REV: + /* Character in a charset, ignoring case (backwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr <= max_ptr || + !in_charset_ignore(state, body + 2, context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } + break; + case SRE_OP_CHARSET_REV: + /* Character in a charset (backwards). 
*/ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr <= max_ptr || + !in_charset(body + 2, context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } + break; + case SRE_OP_LITERAL: + /* Character is this literal (forwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr >= max_ptr || + context->text_ptr[0] != (SRE_CHAR)body[1]) + return FALSE; + context->text_ptr++; + } + break; + case SRE_OP_LITERAL_IGNORE: + /* Character is this literal, ignoring case (forwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr >= max_ptr || + !same_char_ignore(state, context->text_ptr[0], body[1])) + return FALSE; + context->text_ptr++; + } + break; + case SRE_OP_LITERAL_IGNORE_REV: + /* Character is this literal, ignoring case (backwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr <= max_ptr || + !same_char_ignore(state, context->text_ptr[-1], body[1])) + return FALSE; + context->text_ptr--; + } + break; + case SRE_OP_LITERAL_REV: + /* Character is this literal (backwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr <= max_ptr || + context->text_ptr[-1] != (SRE_CHAR)body[1]) + return FALSE; + context->text_ptr--; + } + break; + case SRE_OP_NOT_CATEGORY: + /* Character not in a certain category (forwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr >= max_ptr || + encoding->in_category(body[1], context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } + break; + case SRE_OP_NOT_CHARSET: + /* Character not in a charset (forwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr >= max_ptr || + in_charset(body + 2, context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } + break; + case SRE_OP_NOT_CHARSET_IGNORE: + /* Character not in a charset,ignoring case (forwards). 
*/ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr >= max_ptr || + in_charset_ignore(state, body + 2, context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } + break; + case SRE_OP_NOT_CHARSET_IGNORE_REV: + /* Character not in a charset,ignoring case (backwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr <= max_ptr || + in_charset_ignore(state, body + 2, context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } + break; + case SRE_OP_NOT_CHARSET_REV: + /* Character not in a charset (backwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr <= max_ptr || + in_charset(body + 2, context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } + break; + case SRE_OP_NOT_LITERAL: + /* Character is not this literal (forwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr >= max_ptr || + context->text_ptr[0] == (SRE_CHAR)body[1]) + return FALSE; + context->text_ptr++; + } + break; + case SRE_OP_NOT_LITERAL_IGNORE: + /* Character is not this literal, ignoring case (forwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr >= max_ptr || + same_char_ignore(state, context->text_ptr[0], body[1])) + return FALSE; + context->text_ptr++; + } + break; + case SRE_OP_NOT_LITERAL_IGNORE_REV: + /* Character is not this literal, ignoring case (backwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr <= max_ptr || + same_char_ignore(state, context->text_ptr[-1], body[1])) + return FALSE; + context->text_ptr--; + } + break; + case SRE_OP_NOT_LITERAL_REV: + /* Character is not this literal (backwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr <= max_ptr || + context->text_ptr[-1] == (SRE_CHAR)body[1]) + return FALSE; + context->text_ptr--; + } + break; + case SRE_OP_NOT_RANGE: + /* Character not in range (forwards). 
*/ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr >= max_ptr || + in_range(body[1], body[2], context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } + break; + case SRE_OP_NOT_RANGE_IGNORE: + /* Character not in range, ignoring case (forwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr >= max_ptr || + in_range_ignore(state, body[1], body[2], context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } + break; + case SRE_OP_NOT_RANGE_IGNORE_REV: + /* Character not in range, ignoring case (backwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr <= max_ptr || + in_range_ignore(state, body[1], body[2], context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } + break; + case SRE_OP_NOT_RANGE_REV: + /* Character not in range (backwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr <= max_ptr || + in_range(body[1], body[2], context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } + break; + case SRE_OP_NOT_SET: + /* Character not in set (forwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr >= max_ptr || + in_set(state, body + 1, context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } + break; + case SRE_OP_NOT_SET_IGNORE: + /* Character not in set, ignoring case (forwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr >= max_ptr || + in_set_ignore(state, body + 1, context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } + break; + case SRE_OP_NOT_SET_IGNORE_REV: + /* Character not in set, ignoring case (backwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr <= max_ptr || + in_set_ignore(state, body + 1, context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } + break; + case SRE_OP_NOT_SET_REV: + /* Character not in set (backwards). 
*/ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr <= max_ptr || + in_set(state, body + 1, context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } + break; + case SRE_OP_RANGE: + /* Character in range (forwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr >= max_ptr || + !in_range(body[1], body[2], context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } + break; + case SRE_OP_RANGE_IGNORE: + /* Character in range, ignoring case (forwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr >= max_ptr || + !in_range_ignore(state, body[1], body[2], context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } + break; + case SRE_OP_RANGE_IGNORE_REV: + /* Character in range, ignoring case (backwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr <= max_ptr || + !in_range_ignore(state, body[1], body[2], context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } + break; + case SRE_OP_RANGE_REV: + /* Character in range (backwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr <= max_ptr || + !in_range(body[1], body[2], context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } + break; + case SRE_OP_SET: + /* Character in set (forwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr >= max_ptr || + !in_set(state, body + 1, context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } + break; + case SRE_OP_SET_IGNORE: + /* Character in set, ignoring case (forwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr >= max_ptr || + !in_set_ignore(state, body + 1, context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } + break; + case SRE_OP_SET_IGNORE_REV: + /* Character in set, ignoring case (backwards). 
*/ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr <= max_ptr || + !in_set_ignore(state, body + 1, context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } + break; + case SRE_OP_SET_REV: + /* Character in set (backwards). */ + while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)) { + if (context->text_ptr <= max_ptr || + !in_set(state, body + 1, context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } + break; + } + + return TRUE; +} + +/* + Matches at least one single character, then further ones until the tail + _could_ match, up to a maximum. + Returns FALSE if the body stops matching or the maximum is reached while the + tail still couldn't match. + + This is used for matching a repeated single-character pattern. It's more + efficient than the general multi-character repeat. + + For forwards or backwards searching. + */ +LOCAL(BOOL) SRE_MATCH_MANY_UNTIL_TAIL(SRE_CONTEXT* context, SRE_CHAR* max_ptr, + SRE_CODE* body, SRE_CODE* tail) { + SRE_STATE* state = context->state; + SRE_ENCODING_TABLE* encoding = state->encoding; + + /* Skip over any marks. */ + while (tail[0] == SRE_OP_MARK) + tail += SRE_MARK_OP_SIZE; + + switch (body[0]) { + case SRE_OP_ANY: + /* Any character except a newline (forwards). */ + do { + if (context->text_ptr >= max_ptr || + encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_ANY_ALL: + /* Any character at all (forwards). */ + do { + if (context->text_ptr >= max_ptr) + return FALSE; + context->text_ptr++; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_ANY_ALL_REV: + /* Any character at all (backwards). */ + do { + if (context->text_ptr <= max_ptr) + return FALSE; + context->text_ptr--; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_ANY_REV: + /* Any character except a newline (backwards).
*/ + do { + if (context->text_ptr <= max_ptr || + encoding->in_category(SRE_CAT_LineBreak, context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_CATEGORY: + /* Character in a certain category (forwards). */ + do { + if (context->text_ptr >= max_ptr || + !encoding->in_category(body[1], context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_CHARSET: + /* Character in a charset (forwards). */ + do { + if (context->text_ptr >= max_ptr || + !in_charset(body + 2, context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_CHARSET_IGNORE: + /* Character in a charset, ignoring case (forwards). */ + do { + if (context->text_ptr >= max_ptr || + !in_charset_ignore(state, body + 2, context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_CHARSET_IGNORE_REV: + /* Character in a charset, ignoring case (backwards). */ + do { + if (context->text_ptr <= max_ptr || + !in_charset_ignore(state, body + 2, context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_CHARSET_REV: + /* Character in a charset (backwards). */ + do { + if (context->text_ptr <= max_ptr || + !in_charset(body + 2, context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_LITERAL: + /* Character is this literal (forwards). */ + do { + if (context->text_ptr >= max_ptr || + context->text_ptr[0] != (SRE_CHAR)body[1]) + return FALSE; + context->text_ptr++; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_LITERAL_IGNORE: + /* Character is this literal, ignoring case (forwards). 
*/ + do { + if (context->text_ptr >= max_ptr || + !same_char_ignore(state, context->text_ptr[0], body[1])) + return FALSE; + context->text_ptr++; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_LITERAL_IGNORE_REV: + /* Character is this literal, ignoring case (backwards). */ + do { + if (context->text_ptr <= max_ptr || + !same_char_ignore(state, context->text_ptr[-1], body[1])) + return FALSE; + context->text_ptr--; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_LITERAL_REV: + /* Character is this literal (backwards). */ + do { + if (context->text_ptr <= max_ptr || + context->text_ptr[-1] != (SRE_CHAR)body[1]) + return FALSE; + context->text_ptr--; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_NOT_CATEGORY: + /* Character not in a certain category (forwards). */ + do { + if (context->text_ptr >= max_ptr || + encoding->in_category(body[1], context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_NOT_CHARSET: + /* Character not in a charset (forwards). */ + do { + if (context->text_ptr >= max_ptr || + in_charset(body + 2, context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_NOT_CHARSET_IGNORE: + /* Character not in a charset,ignoring case (forwards). */ + do { + if (context->text_ptr >= max_ptr || + in_charset_ignore(state, body + 2, context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_NOT_CHARSET_IGNORE_REV: + /* Character not in a charset,ignoring case (backwards). 
*/ + do { + if (context->text_ptr <= max_ptr || + in_charset_ignore(state, body + 2, context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_NOT_CHARSET_REV: + /* Character not in a charset (backwards). */ + do { + if (context->text_ptr <= max_ptr || + in_charset(body + 2, context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_NOT_LITERAL: + /* Character is not this literal (forwards). */ + do { + if (context->text_ptr >= max_ptr || + context->text_ptr[0] == (SRE_CHAR)body[1]) + return FALSE; + context->text_ptr++; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_NOT_LITERAL_IGNORE: + /* Character is not this literal, ignoring case (forwards). */ + do { + if (context->text_ptr >= max_ptr || + same_char_ignore(state, context->text_ptr[0], body[1])) + return FALSE; + context->text_ptr++; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_NOT_LITERAL_IGNORE_REV: + /* Character is not this literal, ignoring case (backwards). */ + do { + if (context->text_ptr <= max_ptr || + same_char_ignore(state, context->text_ptr[-1], body[1])) + return FALSE; + context->text_ptr--; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_NOT_LITERAL_REV: + /* Character is not this literal (backwards). */ + do { + if (context->text_ptr <= max_ptr || + context->text_ptr[-1] == (SRE_CHAR)body[1]) + return FALSE; + context->text_ptr--; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_NOT_RANGE: + /* Character not in range (forwards). */ + do { + if (context->text_ptr >= max_ptr || + in_range(body[1], body[2], context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_NOT_RANGE_IGNORE: + /* Character not in range, ignoring case (forwards). 
*/ + do { + if (context->text_ptr >= max_ptr || + in_range_ignore(state, body[1], body[2], context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_NOT_RANGE_IGNORE_REV: + /* Character not in range, ignoring case (backwards). */ + do { + if (context->text_ptr <= max_ptr || + in_range_ignore(state, body[1], body[2], context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_NOT_RANGE_REV: + /* Character not in range (backwards). */ + do { + if (context->text_ptr <= max_ptr || + in_range(body[1], body[2], context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_NOT_SET: + /* Character not in set (forwards). */ + do { + if (context->text_ptr >= max_ptr || + in_set(state, body + 1, context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_NOT_SET_IGNORE: + /* Character not in set, ignoring case (forwards). */ + do { + if (context->text_ptr >= max_ptr || + in_set_ignore(state, body + 1, context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_NOT_SET_IGNORE_REV: + /* Character not in set, ignoring case (backwards). */ + do { + if (context->text_ptr <= max_ptr || + in_set_ignore(state, body + 1, context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_NOT_SET_REV: + /* Character not in set (backwards). */ + do { + if (context->text_ptr <= max_ptr || + in_set(state, body + 1, context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_RANGE: + /* Character in range (forwards). 
*/ + do { + if (context->text_ptr >= max_ptr || + !in_range(body[1], body[2], context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_RANGE_IGNORE: + /* Character in range, ignoring case (forwards). */ + do { + if (context->text_ptr >= max_ptr || + !in_range_ignore(state, body[1], body[2], context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_RANGE_IGNORE_REV: + /* Character in range, ignoring case (backwards). */ + do { + if (context->text_ptr <= max_ptr || + !in_range_ignore(state, body[1], body[2], context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_RANGE_REV: + /* Character in range (backwards). */ + do { + if (context->text_ptr <= max_ptr || + !in_range(body[1], body[2], context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_SET: + /* Character in set (forwards). */ + do { + if (context->text_ptr >= max_ptr || + !in_set(state, body + 1, context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_SET_IGNORE: + /* Character in set, ignoring case (forwards). */ + do { + if (context->text_ptr >= max_ptr || + !in_set_ignore(state, body + 1, context->text_ptr[0])) + return FALSE; + context->text_ptr++; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_SET_IGNORE_REV: + /* Character in set, ignoring case (backwards). */ + do { + if (context->text_ptr <= max_ptr || + !in_set_ignore(state, body + 1, context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + case SRE_OP_SET_REV: + /* Character in set (backwards). 
*/ + do { + if (context->text_ptr <= max_ptr || + !in_set(state, body + 1, context->text_ptr[-1])) + return FALSE; + context->text_ptr--; + } while (!SRE_POSSIBLE_MATCH_AHEAD(context, tail)); + break; + } + + return TRUE; +} + +/* + Checks if the string matches the given pattern. Returns <0 for + error, 0 for failure, and 1 for success. + */ +LOCAL(int) SRE_MATCH(SRE_STATE* state) { + SRE_CONTEXT context; /* 'context' is the info for the pattern matching. */ + SRE_BACKTRACK_ITEM* current_loop = NULL; /* Points to the backtrack item + for the current loop, if any. */ + unsigned int sigcount = 0; + int result; + SRE_ENCODING_TABLE* encoding = state->encoding; + SRE_CODE op; + + /* Initialise the context. */ + context.state = state; + context.text_beginning = (SRE_CHAR *)state->beginning; + context.text_ptr = state->ptr; + context.text_start = (SRE_CHAR *)state->start; + context.text_end = (SRE_CHAR *)state->end; + context.search_ptr = state->search_ptr; + context.pattern_ptr = state->pattern_code; + context.marks = (SRE_CHAR**)state->mark; + context.marks_size = (state->numbered_mark_count + + state->named_mark_count) * sizeof(context.marks[0]); + context.backtrack_chunk = state->backtrack_chunk; + + /* Point to the final newline if it's the final character. */ + if (context.text_beginning < context.text_end && + encoding->in_category(SRE_CAT_LineBreak, context.text_end[-1])) + context.final_linebreak = context.text_end - 1; + else + context.final_linebreak = NULL; + + TRACE(("|%p|%p|ENTER\n", context.pattern_ptr, context.text_ptr)); + + /* + Store a backtrack item for failure. This takes effect if the entire + pattern fails to match. + */ + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_FAILURE, 0); + if (result != 0) + return SRE_CLEANUP(&context, result); + + /* Clear the text marks. */ + memset(context.marks, 0, context.marks_size); + + /* The main matching loop. */ +advance: + for (;;) { + /* Cancel the matching? 
*/ + ++sigcount; + if ((0 == (sigcount & 0xFFF)) && PyErr_CheckSignals()) + return SRE_CLEANUP(&context, SRE_ERROR_INTERRUPTED); + + /* Try to match the next operator against the text. */ + op = context.pattern_ptr[0]; + switch (op) { + case SRE_OP_ANY: + /* Any character except a newline (forwards). */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr >= context.text_end || + encoding->in_category(SRE_CAT_LineBreak, context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr++; + break; + case SRE_OP_ANY_ALL: + /* Any character at all (forwards). */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr >= context.text_end) + goto backtrack; + context.text_ptr++; + context.pattern_ptr++; + break; + case SRE_OP_ANY_ALL_REV: + /* Any character at all (backwards). */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr <= context.text_start) + goto backtrack; + context.text_ptr--; + context.pattern_ptr++; + break; + case SRE_OP_ANY_REV: + /* Any character except a newline (backwards). */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr <= context.text_start || + encoding->in_category(SRE_CAT_LineBreak, context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr++; + break; + case SRE_OP_ASSERT: + /* Assert subpattern (+ve look-ahead/look-behind). */ + /* ... */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + /* + If the subpattern succeeds then we'll discard the enclosed + backtrack info, including any marks, so we need to save the marks + here. + + If the subpattern fails then the marks will be restored + automatically. 
+ */ + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_ASSERT, TRUE); + if (result != 0) + return SRE_CLEANUP(&context, result); + /* Save the context. */ + context.backtrack_item->assert.text_start = context.text_start; + context.backtrack_item->assert.text_ptr = context.text_ptr; + context.backtrack_item->assert.pattern_ptr = context.pattern_ptr; + /* The assert can look at the entire text. */ + context.text_start = state->beginning; + context.pattern_ptr += 2; + break; + case SRE_OP_ASSERT_NOT: + /* Assert not subpattern (-ve look-ahead/look-behind). */ + /* ... */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + /* + If the subpattern succeeds then we'll discard the enclosed + backtrack info, including any marks, so we need to save the marks + here. + + If the subpattern fails then the marks will be restored + automatically. + */ + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_ASSERT_NOT, TRUE); + if (result != 0) + return SRE_CLEANUP(&context, result); + /* Save the context. */ + context.backtrack_item->assert.text_start = context.text_start; + context.backtrack_item->assert.text_ptr = context.text_ptr; + context.backtrack_item->assert.pattern_ptr = context.pattern_ptr; + /* The assert can look at the entire text. */ + context.text_start = state->beginning; + context.pattern_ptr += 2; + break; + case SRE_OP_ATOMIC: + /* Atomic subpattern. */ + /* ... */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + /* + If the subpattern succeeds then we'll discard the enclosed + backtrack info, including any marks, so we need to save the marks + here. + */ + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_ATOMIC, TRUE); + if (result != 0) + return SRE_CLEANUP(&context, result); + context.pattern_ptr++; + break; + case SRE_OP_BOUNDARY: + /* Boundary between word and non-word. 
*/ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (!SRE_AT_BOUNDARY(&context)) + goto backtrack; + context.pattern_ptr++; + break; + case SRE_OP_BRANCH: + { + /* Alternation. */ + /* + + + ... + + + ... + + 0 + */ + SRE_CODE* skip_ptr = context.pattern_ptr + 1; + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + /* Look ahead in the branch to avoid unnecessary backtracking. */ + while (! SRE_POSSIBLE_MATCH_AHEAD(&context, skip_ptr + 1)) { + skip_ptr += skip_ptr[0]; + /* Is there another branch? */ + if (skip_ptr[0] == 0) + goto backtrack; + } + /* Try this branch. */ + context.pattern_ptr = skip_ptr + 1; + /* Save the next branch, if any. */ + skip_ptr += skip_ptr[0]; + if (skip_ptr[0] != 0) { + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_BRANCH, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, result); + /* Save the context for trying the next branch. */ + context.backtrack_item->branch.text_ptr = context.text_ptr; + context.backtrack_item->branch.pattern_ptr = skip_ptr; + } + break; + } + case SRE_OP_CATEGORY: + /* Character in a certain category (forwards). */ + /* */ + TRACE(("|%p|%p|%s 0x%X\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1])); + + if (context.text_ptr >= context.text_end || + !encoding->in_category(context.pattern_ptr[1], + context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 2; + break; + case SRE_OP_CATEGORY_REV: + /* Character in a certain category (backwards). */ + /* */ + TRACE(("|%p|%p|%s 0x%X\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1])); + + if (context.text_ptr <= context.text_start || + !encoding->in_category(context.pattern_ptr[1], + context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 2; + break; + case SRE_OP_CHARSET: + /* Character in a charset (forwards). 
*/ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr >= context.text_end || + !in_charset(context.pattern_ptr + 2, context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 1 + context.pattern_ptr[1]; + break; + case SRE_OP_CHARSET_IGNORE: + /* Character in a charset, ignoring case (forwards). */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr >= context.text_end || + !in_charset_ignore(state, context.pattern_ptr + 2, + context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 1 + context.pattern_ptr[1]; + break; + case SRE_OP_CHARSET_IGNORE_REV: + /* Character in a charset, ignoring case (backwards). */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr <= context.text_start || + !in_charset_ignore(state, context.pattern_ptr + 2, + context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 1 + context.pattern_ptr[1]; + break; + case SRE_OP_CHARSET_REV: + /* Character in a charset (backwards). */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr <= context.text_start || + !in_charset(context.pattern_ptr + 2, context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 1 + context.pattern_ptr[1]; + break; + case SRE_OP_END_ASSERT: + /* End of assert subpattern (+ve look-ahead/look-behind). */ + /* ... */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + /* + The subpattern has succeeded, so discard all backtrack info in the + assertion. + */ + SRE_DISCARD_UNTIL(&context, SRE_OP_ASSERT); + /* Restore the marks and context and continue matching. 
*/ + memmove(context.marks, context.backtrack_item->marks, + context.marks_size); + context.text_start = context.backtrack_item->assert.text_start; + context.text_ptr = context.backtrack_item->assert.text_ptr; + SRE_DISCARD_BACKTRACK(&context); + context.pattern_ptr++; + break; + case SRE_OP_END_ASSERT_NOT: + /* End of assert not subpattern (-ve look-ahead/look-behind). */ + /* ... */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + /* + The subpattern has succeeded, so discard all backtrack info in the + assertion. + */ + SRE_DISCARD_UNTIL(&context, SRE_OP_ASSERT_NOT); + /* Restore the context and marks and backtrack. */ + memmove(context.marks, context.backtrack_item->marks, + context.marks_size); + context.text_start = context.backtrack_item->assert.text_start; + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + case SRE_OP_END_ATOMIC: + /* Atomic subpattern. */ + /* ... */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + /* + The subpattern has succeeded, so discard all backtrack info in the + subpattern. + */ + SRE_DISCARD_UNTIL(&context, SRE_OP_ATOMIC); + /* + Modify the backtrack info so that the marks will be restored if the + tail fails. + */ + context.backtrack_item->op = SRE_OP_END_ATOMIC; + context.pattern_ptr++; + break; + case SRE_OP_END_OF_LINE: + /* End of line. */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr < context.text_end && + !encoding->in_category(SRE_CAT_LineBreak, context.text_ptr[0])) + goto backtrack; + context.pattern_ptr++; + break; + case SRE_OP_END_OF_STRING: + /* End of string. */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr < context.text_end) + goto backtrack; + context.pattern_ptr++; + break; + case SRE_OP_END_OF_STRING_LN: + /* End of string or final line. 
*/ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr < context.text_end && + context.text_ptr != context.final_linebreak) + goto backtrack; + context.pattern_ptr++; + break; + case SRE_OP_END_REPEAT_MAX: + case SRE_OP_END_REPEAT_MAX_REV: + { + /* End of greedy repeat. */ + /* + + ... + + */ + BOOL forward = op == SRE_OP_END_REPEAT_MAX; + SRE_CODE* repeat_ptr; + SRE_CODE* end_repeat_ptr; + SRE_CODE* body; + SRE_CODE* tail; + Py_ssize_t available; + Py_ssize_t max_rep; + BOOL consumed; + BOOL try_body; + BOOL try_tail; + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + /* Point to the repeat and end-repeat operators. */ + end_repeat_ptr = context.pattern_ptr; + repeat_ptr = end_repeat_ptr - end_repeat_ptr[1]; + + /* Point to the body of the repeat and the tail of the pattern. */ + body = repeat_ptr + 4; + tail = end_repeat_ptr + 2; + + /* We've just matched the body again. */ + ++current_loop->repeat.repeat_counter; + + /* How many characters are still available? */ + if (forward) + available = context.text_end - context.text_ptr; + else + available = context.text_ptr - context.text_start; + + /* How many times can we repeat the body? */ + max_rep = unsigned_min(current_loop->repeat.repeat_max, + current_loop->repeat.repeat_counter + available); + + /* Has the body consumed any characters this time? */ + consumed = context.text_ptr != current_loop->repeat.repeat_start; + + /* + If the body hasn't consumed any characters then it could continue + to repeat up to the maximum and then the tail could be tried. + + If that's the case then we can just skip the pointless repeats and + go straight to the tail. + */ + /* Should the body be tried again? */ + try_body = consumed && current_loop->repeat.repeat_counter < + max_rep; + /* Should the tail be tried? 
*/ + try_tail = (consumed || current_loop->repeat.repeat_counter >= + current_loop->repeat.repeat_min) && + SRE_POSSIBLE_MATCH_AHEAD(&context, tail); + if (try_body) { + if (try_tail) { + /* + Both the body and the tail should be tried. + + The body takes precedence, so create a backtrack point for + the tail. + */ + result = SRE_SAVE_BACKTRACK(&context, op, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, result); + /* Save the context for trying the tail. */ + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.pattern_ptr = tail; + context.backtrack_item->repeat.loop = current_loop; + } + /* Try the body. */ + current_loop->repeat.repeat_start = context.text_ptr; + context.pattern_ptr = body; + } else { + if (try_tail) { + /* + Only the tail should be tried, so do that. + + Now that we're about to try the tail we also need to make + the enclosing loop the 'current' one. + */ + current_loop = current_loop->repeat.loop; + context.pattern_ptr = tail; + } else + /* + Neither the body and the tail should be tried, so + backtrack. + */ + goto backtrack; + } + break; + } + case SRE_OP_END_REPEAT_MIN: + case SRE_OP_END_REPEAT_MIN_REV: + { + /* End of lazy repeat. */ + /* + + ... + + */ + BOOL forward = op == SRE_OP_END_REPEAT_MIN; + SRE_CODE* repeat_ptr; + SRE_CODE* end_repeat_ptr; + SRE_CODE* body; + SRE_CODE* tail; + Py_ssize_t available; + Py_ssize_t max_rep; + BOOL consumed; + BOOL try_body; + BOOL try_tail; + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + /* Point to the repeat and end-repeat operators. */ + end_repeat_ptr = context.pattern_ptr; + repeat_ptr = end_repeat_ptr - end_repeat_ptr[1]; + + /* Point to the body of the repeat and the tail of the pattern. */ + body = repeat_ptr + 4; + tail = end_repeat_ptr + 2; + + /* We've just matched the body again. */ + ++current_loop->repeat.repeat_counter; + + /* How many characters are still available? 
*/ + if (forward) + available = context.text_end - context.text_ptr; + else + available = context.text_ptr - context.text_start; + + /* How many times can we repeat the body? */ + max_rep = unsigned_min(current_loop->repeat.repeat_max, + current_loop->repeat.repeat_counter + available); + + /* Has the body consumed any characters this time? */ + consumed = context.text_ptr != current_loop->repeat.repeat_start; + + /* + If the body hasn't consumed any characters then it could continue + to repeat up to the minimum and then the tail could be tried. + + If that's the case then we can just skip the pointless repeats and + go straight to the tail. + */ + /* Should the body be tried again? */ + try_body = consumed && current_loop->repeat.repeat_counter < + max_rep; + /* Should the tail be tried? */ + try_tail = (consumed || current_loop->repeat.repeat_counter >= + current_loop->repeat.repeat_min) && + SRE_POSSIBLE_MATCH_AHEAD(&context, tail); + if (try_body) { + if (try_tail) { + /* + Both the body and the tail should be tried. + + The tail takes precedence, so create a backtrack point for + the body. + */ + result = SRE_SAVE_BACKTRACK(&context, op, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, result); + /* Save the context for trying the body. */ + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.pattern_ptr = body; + context.backtrack_item->repeat.loop = current_loop; + /* + Now that we're about to try the tail we also need to make + the enclosing loop the 'current' one. + */ + context.pattern_ptr = tail; + current_loop = current_loop->repeat.loop; + } else { + /* Only the body should be tried, so do that. */ + current_loop->repeat.repeat_start = context.text_ptr; + context.pattern_ptr = body; + } + } else { + if (try_tail) { + /* + Only the tail should be tried, so do that. + + Now that we're about to try the tail we also need to make + the enclosing loop the 'current' one. 
+ */ + current_loop = current_loop->repeat.loop; + context.pattern_ptr = tail; + } else + /* + Neither the body nor the tail should be tried, so + backtrack. + */ + goto backtrack; + } + break; + } + case SRE_OP_END_REPEAT_POSS: + case SRE_OP_END_REPEAT_POSS_REV: + { + /* End of possessive repeat. */ + /* + + ... + + */ + BOOL forward = op == SRE_OP_END_REPEAT_POSS; + SRE_CODE* repeat_ptr; + SRE_CODE* end_repeat_ptr; + SRE_CODE* body; + SRE_CODE* tail; + Py_ssize_t available; + Py_ssize_t max_rep; + BOOL consumed; + BOOL try_body; + BOOL try_tail; + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + /* Point to the repeat and end-repeat operators. */ + end_repeat_ptr = context.pattern_ptr; + repeat_ptr = end_repeat_ptr - end_repeat_ptr[1]; + + /* Point to the body of the repeat and the tail of the pattern. */ + body = repeat_ptr + 4; + tail = end_repeat_ptr + 2; + + /* We've just matched the body again. */ + ++current_loop->repeat.repeat_counter; + + /* Discard all the backtrack info in the body we've just matched. */ + SRE_DISCARD_UNTIL(&context, repeat_ptr[0]); + + /* How many characters are still available? */ + if (forward) + available = context.text_end - context.text_ptr; + else + available = context.text_ptr - context.text_start; + + /* How many times can we repeat the body? */ + max_rep = unsigned_min(current_loop->repeat.repeat_max, + current_loop->repeat.repeat_counter + available); + + /* Has the body consumed any characters this time? */ + consumed = context.text_ptr != current_loop->repeat.repeat_start; + + /* + If the body hasn't consumed any characters then it could continue + to repeat up to the maximum and then the tail could be tried. + + If that's the case then we can just skip the pointless repeats and + go straight to the tail. + */ + /* Should the body be tried again? */ + try_body = consumed && current_loop->repeat.repeat_counter < + max_rep; + /* Should the tail be tried? 
*/ + try_tail = (consumed || current_loop->repeat.repeat_counter >= + current_loop->repeat.repeat_min) && + SRE_POSSIBLE_MATCH_AHEAD(&context, tail); + if (try_body) { + if (try_tail) { + /* + Both the body and the tail should be tried. + + The body takes precedence, so create a backtrack point for + the tail. + */ + result = SRE_SAVE_BACKTRACK(&context, op, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, result); + /* Save the context for trying the tail. */ + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.pattern_ptr = tail; + context.backtrack_item->repeat.loop = current_loop; + } + /* Try the body. */ + current_loop->repeat.repeat_start = context.text_ptr; + context.pattern_ptr = body; + } else { + if (try_tail) { + /* + Only the tail should be tried, so do that. + + Now that we're about to try the tail we also need to make + the enclosing loop the 'current' one. + */ + current_loop = current_loop->repeat.loop; + context.pattern_ptr = tail; + } else + /* + Neither the body and the tail should be tried, so + backtrack. + */ + goto backtrack; + } + break; + } + case SRE_OP_GROUPREF: + case SRE_OP_GROUPREF_REV: + { + /* Match against capture group. */ + /* */ + BOOL forward = op == SRE_OP_GROUPREF; + unsigned int group; + SRE_CHAR* group_start; + SRE_CHAR* group_end; + Py_ssize_t length; + Py_ssize_t available; + Py_ssize_t i; + TRACE(("|%p|%p|%s %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1])); + + /* + Get the captured group. + + Here we're using a zero-based index. Note that _externally_ + group 0 is the entire matched string. + */ + group = context.pattern_ptr[1]; + group_start = context.marks[group * 2]; + group_end = context.marks[group * 2 + 1]; + /* Is this group valid? */ + if (group_start == NULL || group_start > group_end) + goto backtrack; + + /* How long is the captured group? 
*/ + length = group_end - group_start; + /* Are there enough characters available for a match? */ + if (forward) + available = context.text_end - context.text_ptr; + else + available = context.text_ptr - context.text_start; + if (length > available) + goto backtrack; + + if (!forward) + /* Skip to the start of the text we're about to match. */ + context.text_ptr -= length; + + /* Check whether the text here matches the group. */ + i = 0; + while (i < length) { + if (context.text_ptr[i] != group_start[i]) + goto backtrack; + i++; + } + + if (forward) + /* Skip over the text we've just matched. */ + context.text_ptr += length; + + context.pattern_ptr += 2; + break; + } + case SRE_OP_GROUPREF_IGNORE: + case SRE_OP_GROUPREF_IGNORE_REV: + { + /* Match against capture group, ignoring case. */ + /* */ + BOOL forward = op == SRE_OP_GROUPREF_IGNORE; + unsigned int group; + SRE_CHAR* group_start; + SRE_CHAR* group_end; + Py_ssize_t length; + Py_ssize_t available; + Py_ssize_t i; + TRACE(("|%p|%p|%s %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1])); + + /* + Get the captured group. + + Here we're using a zero-based index. Note that _externally_ + group 0 is the entire matched string. + */ + group = context.pattern_ptr[1]; + group_start = context.marks[group * 2]; + group_end = context.marks[group * 2 + 1]; + /* Is this group valid? */ + if (group_start == NULL || group_start > group_end) + goto backtrack; + + /* How long is the captured group? */ + length = group_end - group_start; + /* Are there enough characters available for a match? */ + if (forward) + available = context.text_end - context.text_ptr; + else + available = context.text_ptr - context.text_start; + if (length > available) + goto backtrack; + + if (!forward) + /* Skip to the start of the text we're about to match. */ + context.text_ptr -= length; + + /* Check whether the text here matches the group. 
*/ + i = 0; + while (i < length) { + if (!same_char_ignore(state, context.text_ptr[i], + group_start[i])) + goto backtrack; + i++; + } + + if (forward) + /* Skip over the text we've just matched. */ + context.text_ptr += length; + + context.pattern_ptr += 2; + break; + } + case SRE_OP_GROUPREF_EXISTS: + { + /* Whether a capture group exists. */ + /* + + code_yes + + code_no + */ + unsigned int group; + SRE_CHAR* group_start; + SRE_CHAR* group_end; + TRACE(("|%p|%p|%s %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1])); + + /* + Get the captured group. + + Here we're using a zero-based index. Note that _externally_ + group 0 is the entire matched string. + */ + group = context.pattern_ptr[1]; + group_start = context.marks[group * 2]; + group_end = context.marks[group * 2 + 1]; + /* Is this group valid? */ + if (group_start == NULL || group_start > group_end) + /* + Skip to code_no, which might actually be the tail of the + pattern. + */ + context.pattern_ptr += 1 + context.pattern_ptr[2]; + else + /* Skip to code_yes. */ + context.pattern_ptr += 3; + break; + } + case SRE_OP_JUMP: + /* Jump forward in the pattern. */ + /* */ + TRACE(("|%p|%p|%s %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1])); + + context.pattern_ptr += 1 + context.pattern_ptr[1]; + break; case SRE_OP_LITERAL: - /* */ - if (ch == set[0]) - return ok; - set++; - break; - - case SRE_OP_CATEGORY: - /* */ - if (sre_category(set[0], (int) ch)) - return ok; - set += 1; - break; - - case SRE_OP_CHARSET: - if (sizeof(SRE_CODE) == 2) { - /* (16 bits per code word) */ - if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15)))) - return ok; - set += 16; + /* Character is this literal (forwards).
*/ + /* */ + TRACE(("|%p|%p|%s %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1])); + + if (context.text_ptr >= context.text_end || + context.text_ptr[0] != (SRE_CHAR)context.pattern_ptr[1]) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 2; + break; + case SRE_OP_LITERAL_IGNORE: + /* Character is this literal, ignoring case (forwards). */ + /* */ + TRACE(("|%p|%p|%s %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1])); + + if (context.text_ptr >= context.text_end || + !same_char_ignore(state, context.text_ptr[0], + context.pattern_ptr[1])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 2; + break; + case SRE_OP_LITERAL_IGNORE_REV: + /* Character is this literal, ignoring case (backwards). */ + /* */ + TRACE(("|%p|%p|%s %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1])); + + if (context.text_ptr <= context.text_start || + !same_char_ignore(state, context.text_ptr[-1], + context.pattern_ptr[1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 2; + break; + case SRE_OP_LITERAL_REV: + /* Character is this literal (backwards). */ + /* */ + TRACE(("|%p|%p|%s %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1])); + + if (context.text_ptr <= context.text_start || + context.text_ptr[-1] != (SRE_CHAR)context.pattern_ptr[1]) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 2; + break; + case SRE_OP_LITERAL_STRING: + case SRE_OP_LITERAL_STRING_REV: + { + /* Literal string. */ + /* ... */ + BOOL forward = op == SRE_OP_LITERAL_STRING; + Py_ssize_t available; + Py_ssize_t length = context.pattern_ptr[1]; + SRE_CODE* literal = context.pattern_ptr + 2; + Py_ssize_t i; + TRACE(("|%p|%p|%s %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1])); + + /* Are there enough characters available? 
*/ + if (forward) + available = context.text_end - context.text_ptr; + else + available = context.text_ptr - context.text_start; + if (length > available) + goto backtrack; + + if (!forward) + /* Skip to the start of the text we're about to match. */ + context.text_ptr -= length; + + /* Check whether the text here matches the literal. */ + i = 0; + do { + if (context.text_ptr[i] != (SRE_CHAR)literal[i]) + goto backtrack; + i++; } - else { - /* (32 bits per code word) */ - if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31)))) - return ok; - set += 8; + while (i < length); + + if (forward) + /* Skip over the text we've just matched. */ + context.text_ptr += length; + + context.pattern_ptr = literal + length; + break; + } + case SRE_OP_LITERAL_STRING_IGNORE: + case SRE_OP_LITERAL_STRING_IGNORE_REV: + { + /* Literal string, ignoring case. */ + /* ... */ + BOOL forward = op == SRE_OP_LITERAL_STRING_IGNORE; + Py_ssize_t available; + Py_ssize_t length = context.pattern_ptr[1]; + SRE_CODE* literal = context.pattern_ptr + 2; + Py_ssize_t i; + TRACE(("|%p|%p|%s %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1])); + + /* Are there enough characters available? */ + if (forward) + available = context.text_end - context.text_ptr; + else + available = context.text_ptr - context.text_start; + if (length > available) + goto backtrack; + + if (!forward) + /* Skip to the start of the text we're about to match. */ + context.text_ptr -= length; + + /* Check whether the text here matches the literal. */ + i = 0; + do { + if (!same_char_ignore(state, context.text_ptr[i], literal[i])) + goto backtrack; + i++; } - break; - + while (i < length); + + if (forward) + /* Skip over the text we've just matched. */ + context.text_ptr += length; + + context.pattern_ptr = literal + length; + break; + } + case SRE_OP_MARK: + { + /* Text mark. 
*/ + /* */ + unsigned int numbered_index = context.pattern_ptr[1]; + unsigned int named_index = context.pattern_ptr[2]; + SRE_BACKTRACK_ITEM* item; + TRACE(("|%p|%p|%s %u %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, numbered_index, named_index)); + + /* + Save the current position of the mark and set the mark to the + current text position. + + Every capture group has a number; some also have names, which + might occur more than once in a regex. Therefore the marks have 2 + ids, one for the number and the other for the name. If the mark is + for an unnamed group then the name id is the same as the number id. + */ + result = SRE_SAVE_BACKTRACK(&context, SRE_OP_MARK, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, result); + item = context.backtrack_item; + /* The number id. */ + item->mark.numbered_index = numbered_index; + item->mark.numbered_mark_ptr = context.marks[numbered_index]; + context.marks[numbered_index] = context.text_ptr; + /* The name id. */ + item->mark.named_index = named_index; + item->mark.named_mark_ptr = context.marks[named_index]; + context.marks[named_index] = context.text_ptr; + + context.pattern_ptr += 3; + break; + } + case SRE_OP_NOT_BOUNDARY: + /* Not a boundary between word and non-word. */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (SRE_AT_BOUNDARY(&context)) + goto backtrack; + context.pattern_ptr++; + break; + case SRE_OP_NOT_CATEGORY: + /* Character not in a certain category (forwards). */ + /* */ + TRACE(("|%p|%p|%s 0x%X\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1])); + + if (context.text_ptr >= context.text_end || + state->encoding->in_category(context.pattern_ptr[1], + context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 2; + break; + case SRE_OP_NOT_CATEGORY_REV: + /* Character not in a certain category (backwards). 
*/ + /* */ + TRACE(("|%p|%p|%s 0x%X\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1])); + + if (context.text_ptr <= context.text_start || + state->encoding->in_category(context.pattern_ptr[1], + context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 2; + break; + case SRE_OP_NOT_CHARSET: + /* Character not in a charset (forwards). */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr >= context.text_end || + in_charset(context.pattern_ptr + 2, context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 1 + context.pattern_ptr[1]; + break; + case SRE_OP_NOT_CHARSET_IGNORE: + /* Character not in a charset, ignoring case (forwards). */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr >= context.text_end || + in_charset_ignore(state, context.pattern_ptr + 2, + context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 1 + context.pattern_ptr[1]; + break; + case SRE_OP_NOT_CHARSET_IGNORE_REV: + /* Character not in a charset, ignoring case (backwards). */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr <= context.text_start || + in_charset_ignore(state, context.pattern_ptr + 2, + context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 1 + context.pattern_ptr[1]; + break; + case SRE_OP_NOT_CHARSET_REV: + /* Character not in a charset (backwards). 
*/ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr <= context.text_start || + in_charset(context.pattern_ptr + 2, context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 1 + context.pattern_ptr[1]; + break; + case SRE_OP_NOT_LITERAL: + /* Character is not this literal (forwards). */ + /* */ + TRACE(("|%p|%p|%s %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1])); + + if (context.text_ptr >= context.text_end || + context.text_ptr[0] == (SRE_CHAR)context.pattern_ptr[1]) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 2; + break; + case SRE_OP_NOT_LITERAL_IGNORE: + /* Character is not this literal, ignoring case (forwards). */ + /* */ + TRACE(("|%p|%p|%s %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1])); + + if (context.text_ptr >= context.text_end || + same_char_ignore(state, context.text_ptr[0], + context.pattern_ptr[1])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 2; + break; + case SRE_OP_NOT_LITERAL_IGNORE_REV: + /* Character is not this literal, ignoring case (backwards). */ + /* */ + TRACE(("|%p|%p|%s %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1])); + + if (context.text_ptr <= context.text_start || + same_char_ignore(state, context.text_ptr[-1], + context.pattern_ptr[1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 2; + break; + case SRE_OP_NOT_LITERAL_REV: + /* Character is not this literal (backwards). 
*/ + /* */ + TRACE(("|%p|%p|%s %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1])); + + if (context.text_ptr <= context.text_start || + context.text_ptr[-1] == (SRE_CHAR)context.pattern_ptr[1]) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 2; + break; + case SRE_OP_NOT_RANGE: + /* Character not in range (forwards). */ + /* */ + TRACE(("|%p|%p|%s %u %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1], + context.pattern_ptr[2])); + + if (context.text_ptr >= context.text_end || + in_range(context.pattern_ptr[1], context.pattern_ptr[2], + context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 3; + break; + case SRE_OP_NOT_RANGE_IGNORE: + /* Character not in range, ignoring case (forwards). */ + /* */ + TRACE(("|%p|%p|%s %u %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1], + context.pattern_ptr[2])); + + if (context.text_ptr >= context.text_end || + in_range_ignore(state, context.pattern_ptr[1], + context.pattern_ptr[2], context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 3; + break; + case SRE_OP_NOT_RANGE_IGNORE_REV: + /* Character not in range, ignoring case (backwards). */ + /* */ + TRACE(("|%p|%p|%s %u %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1], + context.pattern_ptr[2])); + + if (context.text_ptr <= context.text_start || + in_range_ignore(state, context.pattern_ptr[1], + context.pattern_ptr[2], context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 3; + break; + case SRE_OP_NOT_RANGE_REV: + /* Character not in range (backwards). 
*/ + /* */ + TRACE(("|%p|%p|%s %u %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1], + context.pattern_ptr[2])); + + if (context.text_ptr <= context.text_start || + in_range(context.pattern_ptr[1], context.pattern_ptr[2], + context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 3; + break; + case SRE_OP_NOT_SET: + /* Character not in set (forwards). */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr >= context.text_end || + in_set(state, context.pattern_ptr + 1, context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr = context.pattern_ptr + 1 + + context.pattern_ptr[1]; + break; + case SRE_OP_NOT_SET_IGNORE: + /* Character not in set, ignoring case (forwards). */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr >= context.text_end || + in_set_ignore(state, context.pattern_ptr + 1, + context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr = context.pattern_ptr + 1 + + context.pattern_ptr[1]; + break; + case SRE_OP_NOT_SET_IGNORE_REV: + /* Character not in set, ignoring case (backwards). */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr <= context.text_start || + in_set_ignore(state, context.pattern_ptr + 1, + context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr = context.pattern_ptr + 1 + + context.pattern_ptr[1]; + break; + case SRE_OP_NOT_SET_REV: + /* Character not in set (backwards). 
*/ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr <= context.text_start || + in_set(state, context.pattern_ptr + 1, context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr = context.pattern_ptr + 1 + + context.pattern_ptr[1]; + break; case SRE_OP_RANGE: + /* Character in range (forwards). */ /* */ - if (set[0] <= ch && ch <= set[1]) - return ok; - set += 2; - break; - - case SRE_OP_NEGATE: - ok = !ok; - break; - - case SRE_OP_BIGCHARSET: - /* <256 blockindices> */ + TRACE(("|%p|%p|%s %u %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1], + context.pattern_ptr[2])); + + if (context.text_ptr >= context.text_end || + !in_range(context.pattern_ptr[1], context.pattern_ptr[2], + context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 3; + break; + case SRE_OP_RANGE_IGNORE: + /* Character in range, ignoring case (forwards). */ + /* */ + TRACE(("|%p|%p|%s %u %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1], + context.pattern_ptr[2])); + + if (context.text_ptr >= context.text_end || + !in_range_ignore(state, context.pattern_ptr[1], + context.pattern_ptr[2], context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr += 3; + break; + case SRE_OP_RANGE_IGNORE_REV: + /* Character in range, ignoring case (backwards). */ + /* */ + TRACE(("|%p|%p|%s %u %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1], + context.pattern_ptr[2])); + + if (context.text_ptr <= context.text_start || + !in_range_ignore(state, context.pattern_ptr[1], + context.pattern_ptr[2], context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 3; + break; + case SRE_OP_RANGE_REV: + /* Character in range (backwards). 
*/ + /* */ + TRACE(("|%p|%p|%s %u %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, context.pattern_ptr[1], + context.pattern_ptr[2])); + + if (context.text_ptr <= context.text_start || + !in_range(context.pattern_ptr[1], context.pattern_ptr[2], + context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr += 3; + break; + case SRE_OP_REPEAT_MAX: + case SRE_OP_REPEAT_MAX_REV: { - Py_ssize_t count, block; - count = *(set++); - - if (sizeof(SRE_CODE) == 2) { - block = ((unsigned char*)set)[ch >> 8]; - set += 128; - if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15))) - return ok; - set += count*16; + /* Greedy repeat. */ + /* + + ... + + */ + BOOL forward = op == SRE_OP_REPEAT_MAX; + SRE_CODE* repeat_ptr; + SRE_CODE* end_repeat_ptr; + SRE_CODE* body; + SRE_CODE* tail; + Py_ssize_t available; + Py_ssize_t max_rep; + SRE_BACKTRACK_ITEM* new_loop; + BOOL try_body; + BOOL try_tail; + + /* Point to the repeat and end-repeat operators. */ + repeat_ptr = context.pattern_ptr; + end_repeat_ptr = repeat_ptr + repeat_ptr[1]; + + TRACE(("|%p|%p|%s %u %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, repeat_ptr[2], repeat_ptr[3])); + + /* Point to the body of the repeat and the tail of the pattern. */ + body = repeat_ptr + 4; + tail = end_repeat_ptr + 2; + + /* How many characters are still available? */ + if (forward) + available = context.text_end - context.text_ptr; + else + available = context.text_ptr - context.text_start; + + /* + Are there enough characters available for the repeat? + + The repeat should consume at least one character per iteration and + must iterate a minimum number of times. + */ + if ((Py_ssize_t)repeat_ptr[2] > available) + goto backtrack; + + /* How many times can we repeat the body? 
*/ + if (repeat_ptr[3] == SRE_UNLIMITED_REPEATS) + max_rep = available; + else + max_rep = unsigned_min(repeat_ptr[3], available); + + /* + Save the context and initialise the repeat info for the new repeat. + */ + result = SRE_SAVE_BACKTRACK(&context, op, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, result); + new_loop = context.backtrack_item; + new_loop->repeat.repeat_min = repeat_ptr[2]; + new_loop->repeat.repeat_max = max_rep; + new_loop->repeat.repeat_counter = 0; + new_loop->repeat.loop = current_loop; + new_loop->repeat.repeat_start = context.text_ptr; + + /* Should the body be tried? */ + try_body = available > 0; + /* Should the tail be tried? */ + try_tail = new_loop->repeat.repeat_min == 0 && + SRE_POSSIBLE_MATCH_AHEAD(&context, tail); + if (try_body) { + if (try_tail) { + /* + Both the body and the tail should be tried. + + The body takes precedence, so create a backtrack point for + the tail. + */ + result = SRE_SAVE_BACKTRACK(&context, end_repeat_ptr[0], + FALSE); + if (result != 0) + return SRE_CLEANUP(&context, result); + /* Save the context for trying the tail. */ + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.pattern_ptr = tail; + context.backtrack_item->repeat.loop = new_loop; + } + /* Try the body. */ + current_loop = new_loop; + context.pattern_ptr = body; + } else { + if (try_tail) + /* Only the tail should be tried, so do that. */ + context.pattern_ptr = tail; + else + /* + Neither the body and the tail should be tried, so + backtrack. + */ + goto backtrack; } - else { - /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids - * warnings when c's type supports only numbers < N+1 */ - if (!(ch & ~65535)) - block = ((unsigned char*)set)[ch >> 8]; + break; + } + case SRE_OP_REPEAT_MIN: + case SRE_OP_REPEAT_MIN_REV: + { + /* Lazy repeat. */ + /* + + ... 
+ + */ + BOOL forward = op == SRE_OP_REPEAT_MIN; + SRE_CODE* repeat_ptr; + SRE_CODE* end_repeat_ptr; + SRE_CODE* body; + SRE_CODE* tail; + Py_ssize_t available; + Py_ssize_t max_rep; + SRE_BACKTRACK_ITEM* new_loop; + BOOL try_body; + BOOL try_tail; + + /* Point to the repeat and end-repeat operators. */ + repeat_ptr = context.pattern_ptr; + end_repeat_ptr = repeat_ptr + repeat_ptr[1]; + + TRACE(("|%p|%p|%s %u %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, repeat_ptr[2], repeat_ptr[3])); + + /* Point to the body of the repeat and the tail of the pattern. */ + body = repeat_ptr + 4; + tail = end_repeat_ptr + 2; + + /* How many characters are still available? */ + if (forward) + available = context.text_end - context.text_ptr; + else + available = context.text_ptr - context.text_start; + + /* + Are there enough characters available for the repeat? + + The repeat should consume at least one character per iteration and + must iterate a minimum number of times. + */ + if ((Py_ssize_t)repeat_ptr[2] > available) + goto backtrack; + + /* How many times can we repeat the body? */ + if (repeat_ptr[3] == SRE_UNLIMITED_REPEATS) + max_rep = available; + else + max_rep = unsigned_min(repeat_ptr[3], available); + + /* + Save the context and initialise the repeat info for the new repeat. + */ + result = SRE_SAVE_BACKTRACK(&context, op, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, result); + new_loop = context.backtrack_item; + new_loop->repeat.repeat_min = repeat_ptr[2]; + new_loop->repeat.repeat_max = max_rep; + new_loop->repeat.repeat_counter = 0; + new_loop->repeat.loop = current_loop; + new_loop->repeat.repeat_start = context.text_ptr; + + /* Should the body be tried? */ + try_body = available > 0; + /* Should the tail be tried? */ + try_tail = new_loop->repeat.repeat_min == 0 && + SRE_POSSIBLE_MATCH_AHEAD(&context, tail); + if (try_body) { + if (try_tail) { + /* + Both the body and the tail should be tried. 
+ + The tail takes precedence, so create a backtrack point for + the body. + */ + result = SRE_SAVE_BACKTRACK(&context, end_repeat_ptr[0], + FALSE); + if (result != 0) + return SRE_CLEANUP(&context, result); + /* Save the context for trying the body. */ + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.pattern_ptr = body; + context.backtrack_item->repeat.loop = new_loop; + + /* Try the tail. */ + context.pattern_ptr = tail; + } else { + /* Try the body. */ + current_loop = new_loop; + context.pattern_ptr = body; + } + } else { + if (try_tail) + /* Only the tail should be tried, so do that. */ + context.pattern_ptr = tail; else - block = -1; - set += 64; - if (block >=0 && - (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31)))) - return ok; - set += count*8; + /* + Neither the body and the tail should be tried, so + backtrack. + */ + goto backtrack; } break; } - - default: - /* internal error -- there's not much we can do about it - here, so let's just pretend it didn't match... */ - return 0; - } - } -} - -LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern); - -LOCAL(Py_ssize_t) -SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) -{ - SRE_CODE chr; - SRE_CHAR* ptr = (SRE_CHAR *)state->ptr; - SRE_CHAR* end = (SRE_CHAR *)state->end; - Py_ssize_t i; - - /* adjust end */ - if (maxcount < end - ptr && maxcount != 65535) - end = ptr + maxcount; - - switch (pattern[0]) { - - case SRE_OP_IN: - /* repeated set */ - TRACE(("|%p|%p|COUNT IN\n", pattern, ptr)); - while (ptr < end && SRE_CHARSET(pattern + 2, *ptr)) - ptr++; - break; - - case SRE_OP_ANY: - /* repeated dot wildcard. */ - TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr)); - while (ptr < end && !SRE_IS_LINEBREAK(*ptr)) - ptr++; - break; - - case SRE_OP_ANY_ALL: - /* repeated dot wildcard. 
skip to the end of the target - string, and backtrack from there */ - TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr)); - ptr = end; - break; - - case SRE_OP_LITERAL: - /* repeated literal */ - chr = pattern[1]; - TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr)); - while (ptr < end && (SRE_CODE) *ptr == chr) - ptr++; - break; - - case SRE_OP_LITERAL_IGNORE: - /* repeated literal */ - chr = pattern[1]; - TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr)); - while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr) - ptr++; - break; - - case SRE_OP_NOT_LITERAL: - /* repeated non-literal */ - chr = pattern[1]; - TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr)); - while (ptr < end && (SRE_CODE) *ptr != chr) - ptr++; - break; - - case SRE_OP_NOT_LITERAL_IGNORE: - /* repeated non-literal */ - chr = pattern[1]; - TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr)); - while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr) - ptr++; - break; - - default: - /* repeated single character pattern */ - TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr)); - while ((SRE_CHAR*) state->ptr < end) { - i = SRE_MATCH(state, pattern); - if (i < 0) - return i; - if (!i) - break; - } - TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, - (SRE_CHAR*) state->ptr - ptr)); - return (SRE_CHAR*) state->ptr - ptr; - } - - TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr)); - return ptr - (SRE_CHAR*) state->ptr; -} - -#if 0 /* not used in this release */ -LOCAL(int) -SRE_INFO(SRE_STATE* state, SRE_CODE* pattern) -{ - /* check if an SRE_OP_INFO block matches at the current position. 
- returns the number of SRE_CODE objects to skip if successful, 0 - if no match */ - - SRE_CHAR* end = state->end; - SRE_CHAR* ptr = state->ptr; - Py_ssize_t i; - - /* check minimal length */ - if (pattern[3] && (end - ptr) < pattern[3]) - return 0; - - /* check known prefix */ - if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) { - /* */ - for (i = 0; i < pattern[5]; i++) - if ((SRE_CODE) ptr[i] != pattern[7 + i]) - return 0; - return pattern[0] + 2 * pattern[6]; - } - return pattern[0]; -} -#endif - -/* The macros below should be used to protect recursive SRE_MATCH() - * calls that *failed* and do *not* return immediately (IOW, those - * that will backtrack). Explaining: - * - * - Recursive SRE_MATCH() returned true: that's usually a success - * (besides atypical cases like ASSERT_NOT), therefore there's no - * reason to restore lastmark; - * - * - Recursive SRE_MATCH() returned false but the current SRE_MATCH() - * is returning to the caller: If the current SRE_MATCH() is the - * top function of the recursion, returning false will be a matching - * failure, and it doesn't matter where lastmark is pointing to. - * If it's *not* the top function, it will be a recursive SRE_MATCH() - * failure by itself, and the calling SRE_MATCH() will have to deal - * with the failure by the same rules explained here (it will restore - * lastmark by itself if necessary); - * - * - Recursive SRE_MATCH() returned false, and will continue the - * outside 'for' loop: must be protected when breaking, since the next - * OP could potentially depend on lastmark; - * - * - Recursive SRE_MATCH() returned false, and will be called again - * inside a local for/while loop: must be protected between each - * loop iteration, since the recursive SRE_MATCH() could do anything, - * and could potentially depend on lastmark. - * - * For more information, check the discussion at SF patch #712900. 
- */ -#define LASTMARK_SAVE() \ - do { \ - ctx->lastmark = state->lastmark; \ - ctx->lastindex = state->lastindex; \ - } while (0) -#define LASTMARK_RESTORE() \ - do { \ - state->lastmark = ctx->lastmark; \ - state->lastindex = ctx->lastindex; \ - } while (0) - -#define RETURN_ERROR(i) do { return i; } while(0) -#define RETURN_FAILURE do { ret = 0; goto exit; } while(0) -#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0) - -#define RETURN_ON_ERROR(i) \ - do { if (i < 0) RETURN_ERROR(i); } while (0) -#define RETURN_ON_SUCCESS(i) \ - do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0) -#define RETURN_ON_FAILURE(i) \ - do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0) - -#define SFY(x) #x - -#define DATA_STACK_ALLOC(state, type, ptr) \ -do { \ - alloc_pos = state->data_stack_base; \ - TRACE(("allocating %s in %d (%d)\n", \ - SFY(type), alloc_pos, sizeof(type))); \ - if (state->data_stack_size < alloc_pos+sizeof(type)) { \ - int j = data_stack_grow(state, sizeof(type)); \ - if (j < 0) return j; \ - if (ctx_pos != -1) \ - DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \ - } \ - ptr = (type*)(state->data_stack+alloc_pos); \ - state->data_stack_base += sizeof(type); \ -} while (0) - -#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \ -do { \ - TRACE(("looking up %s at %d\n", SFY(type), pos)); \ - ptr = (type*)(state->data_stack+pos); \ -} while (0) - -#define DATA_STACK_PUSH(state, data, size) \ -do { \ - TRACE(("copy data in %p to %d (%d)\n", \ - data, state->data_stack_base, size)); \ - if (state->data_stack_size < state->data_stack_base+size) { \ - int j = data_stack_grow(state, size); \ - if (j < 0) return j; \ - if (ctx_pos != -1) \ - DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \ - } \ - memcpy(state->data_stack+state->data_stack_base, data, size); \ - state->data_stack_base += size; \ -} while (0) - -#define DATA_STACK_POP(state, data, size, discard) \ -do { \ - TRACE(("copy data to %p from %d 
(%d)\n", \ - data, state->data_stack_base-size, size)); \ - memcpy(data, state->data_stack+state->data_stack_base-size, size); \ - if (discard) \ - state->data_stack_base -= size; \ -} while (0) - -#define DATA_STACK_POP_DISCARD(state, size) \ -do { \ - TRACE(("discard data from %d (%d)\n", \ - state->data_stack_base-size, size)); \ - state->data_stack_base -= size; \ -} while(0) - -#define DATA_PUSH(x) \ - DATA_STACK_PUSH(state, (x), sizeof(*(x))) -#define DATA_POP(x) \ - DATA_STACK_POP(state, (x), sizeof(*(x)), 1) -#define DATA_POP_DISCARD(x) \ - DATA_STACK_POP_DISCARD(state, sizeof(*(x))) -#define DATA_ALLOC(t,p) \ - DATA_STACK_ALLOC(state, t, p) -#define DATA_LOOKUP_AT(t,p,pos) \ - DATA_STACK_LOOKUP_AT(state,t,p,pos) - -#define MARK_PUSH(lastmark) \ - do if (lastmark > 0) { \ - i = lastmark; /* ctx->lastmark may change if reallocated */ \ - DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \ - } while (0) -#define MARK_POP(lastmark) \ - do if (lastmark > 0) { \ - DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \ - } while (0) -#define MARK_POP_KEEP(lastmark) \ - do if (lastmark > 0) { \ - DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \ - } while (0) -#define MARK_POP_DISCARD(lastmark) \ - do if (lastmark > 0) { \ - DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \ - } while (0) - -#define JUMP_NONE 0 -#define JUMP_MAX_UNTIL_1 1 -#define JUMP_MAX_UNTIL_2 2 -#define JUMP_MAX_UNTIL_3 3 -#define JUMP_MIN_UNTIL_1 4 -#define JUMP_MIN_UNTIL_2 5 -#define JUMP_MIN_UNTIL_3 6 -#define JUMP_REPEAT 7 -#define JUMP_REPEAT_ONE_1 8 -#define JUMP_REPEAT_ONE_2 9 -#define JUMP_MIN_REPEAT_ONE 10 -#define JUMP_BRANCH 11 -#define JUMP_ASSERT 12 -#define JUMP_ASSERT_NOT 13 - -#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \ - DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \ - nextctx->last_ctx_pos = ctx_pos; \ - nextctx->jump = jumpvalue; \ - nextctx->pattern = nextpattern; \ - ctx_pos = alloc_pos; \ - ctx = nextctx; \ - goto 
entrance; \ - jumplabel: \ - while (0) /* gcc doesn't like labels at end of scopes */ \ - -typedef struct { - Py_ssize_t last_ctx_pos; - Py_ssize_t jump; - SRE_CHAR* ptr; - SRE_CODE* pattern; - Py_ssize_t count; - Py_ssize_t lastmark; - Py_ssize_t lastindex; - union { - SRE_CODE chr; - SRE_REPEAT* rep; - } u; -} SRE_MATCH_CONTEXT; - -/* check if string matches the given pattern. returns <0 for - error, 0 for failure, and 1 for success */ -LOCAL(Py_ssize_t) -SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) -{ - SRE_CHAR* end = (SRE_CHAR *)state->end; - Py_ssize_t alloc_pos, ctx_pos = -1; - Py_ssize_t i, ret = 0; - Py_ssize_t jump; - unsigned int sigcount=0; - - SRE_MATCH_CONTEXT* ctx; - SRE_MATCH_CONTEXT* nextctx; - - TRACE(("|%p|%p|ENTER\n", pattern, state->ptr)); - - DATA_ALLOC(SRE_MATCH_CONTEXT, ctx); - ctx->last_ctx_pos = -1; - ctx->jump = JUMP_NONE; - ctx->pattern = pattern; - ctx_pos = alloc_pos; - -entrance: - - ctx->ptr = (SRE_CHAR *)state->ptr; - - if (ctx->pattern[0] == SRE_OP_INFO) { - /* optimization info block */ - /* <1=skip> <2=flags> <3=min> ... */ - if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) { - TRACE(("reject (got %d chars, need %d)\n", - (end - ctx->ptr), ctx->pattern[3])); - RETURN_FAILURE; - } - ctx->pattern += ctx->pattern[1] + 1; - } - - for (;;) { - ++sigcount; - if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals()) - RETURN_ERROR(SRE_ERROR_INTERRUPTED); - - switch (*ctx->pattern++) { - - case SRE_OP_MARK: - /* set mark */ - /* */ - TRACE(("|%p|%p|MARK %d\n", ctx->pattern, - ctx->ptr, ctx->pattern[0])); - i = ctx->pattern[0]; - if (i & 1) - state->lastindex = i/2 + 1; - if (i > state->lastmark) { - /* state->lastmark is the highest valid index in the - state->mark array. If it is increased by more than 1, - the intervening marks must be set to NULL to signal - that these marks have not been encountered. 
*/ - Py_ssize_t j = state->lastmark + 1; - while (j < i) - state->mark[j++] = NULL; - state->lastmark = i; + case SRE_OP_REPEAT_ONE_MAX: + case SRE_OP_REPEAT_ONE_MAX_REV: + { + /* Greedy repeat. */ + /* + + ... + */ + BOOL forward = op == SRE_OP_REPEAT_ONE_MAX; + SRE_CODE* repeat_ptr; + SRE_CODE* body; + SRE_CODE* tail; + Py_ssize_t available; + Py_ssize_t max_rep; + SRE_CHAR* start_ptr; + SRE_CHAR* min_ptr; + SRE_CHAR* max_ptr; + BOOL ok; + + /* Point to the repeat operator. */ + repeat_ptr = context.pattern_ptr; + + TRACE(("|%p|%p|%s %u %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, repeat_ptr[2], repeat_ptr[3])); + + /* Point to the body of the repeat and the tail of the pattern. */ + body = repeat_ptr + 4; + tail = repeat_ptr + 1 + repeat_ptr[1]; + + /* How many characters are still available? */ + if (forward) + available = context.text_end - context.text_ptr; + else + available = context.text_ptr - context.text_start; + + /* + Are there enough characters available for the repeat? + + The repeat should consume one character per iteration and must + iterate a minimum number of times. + */ + if ((Py_ssize_t)repeat_ptr[2] > available) + goto backtrack; + + /* How many times can we repeat the body? 
*/ + if (repeat_ptr[3] == SRE_UNLIMITED_REPEATS) + max_rep = available; + else + max_rep = unsigned_min(repeat_ptr[3], available); + + start_ptr = context.text_ptr; + if (forward) { + min_ptr = start_ptr + repeat_ptr[2]; + max_ptr = start_ptr + max_rep; + } else { + min_ptr = start_ptr - repeat_ptr[2]; + max_ptr = start_ptr - max_rep; } - state->mark[i] = ctx->ptr; - ctx->pattern++; - break; - - case SRE_OP_LITERAL: - /* match literal string */ - /* */ - TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern, - ctx->ptr, *ctx->pattern)); - if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0]) - RETURN_FAILURE; - ctx->pattern++; - ctx->ptr++; - break; - - case SRE_OP_NOT_LITERAL: - /* match anything that is not literal character */ - /* */ - TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern, - ctx->ptr, *ctx->pattern)); - if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0]) - RETURN_FAILURE; - ctx->pattern++; - ctx->ptr++; - break; - + + /* Match up to the maximum. */ + SRE_MATCH_MANY(&context, max_ptr, body); + + /* Unmatch down to the minimum until the tail could match. */ + if (forward) + ok = SRE_UNMATCH_UNTIL_TAIL(&context, min_ptr, tail); + else + ok = SRE_UNMATCH_UNTIL_TAIL_REV(&context, min_ptr, tail); + if (!ok) + /* Reached the minimum and the tail still couldn't match. */ + goto backtrack; + + /* + Save the context and initialise the repeat info for the new repeat + unless we're already at the minimum. 
+ */ + if (context.text_ptr != min_ptr) { + SRE_BACKTRACK_ITEM* new_loop; + + result = SRE_SAVE_BACKTRACK(&context, op, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, result); + new_loop = context.backtrack_item; + new_loop->repeat.repeat_min = repeat_ptr[2]; + new_loop->repeat.repeat_max = max_rep; + if (forward) + new_loop->repeat.repeat_counter = context.text_ptr - + start_ptr; + else + new_loop->repeat.repeat_counter = start_ptr - + context.text_ptr; + new_loop->repeat.pattern_ptr = repeat_ptr; + new_loop->repeat.loop = current_loop; + new_loop->repeat.repeat_start = start_ptr; + } + + /* Now match the tail. */ + context.pattern_ptr = tail; + break; + } + case SRE_OP_REPEAT_ONE_MIN: + case SRE_OP_REPEAT_ONE_MIN_REV: + { + /* Lazy repeat. */ + /* + + ... + */ + BOOL forward = op == SRE_OP_REPEAT_ONE_MIN; + SRE_CODE* repeat_ptr; + SRE_CODE* body; + SRE_CODE* tail; + Py_ssize_t available; + Py_ssize_t max_rep; + SRE_CHAR* start_ptr; + SRE_CHAR* min_ptr; + SRE_CHAR* max_ptr; + BOOL ok; + + /* Point to the repeat operator. */ + repeat_ptr = context.pattern_ptr; + + TRACE(("|%p|%p|%s %u %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, repeat_ptr[2], repeat_ptr[3])); + + /* Point to the body of the repeat and the tail of the pattern. */ + body = repeat_ptr + 4; + tail = repeat_ptr + 1 + repeat_ptr[1]; + + /* How many characters are still available? */ + if (forward) + available = context.text_end - context.text_ptr; + else + available = context.text_ptr - context.text_start; + + /* + Are there enough characters available for the repeat? + + The repeat should consume one character per iteration and must + iterate a minimum number of times. + */ + if ((Py_ssize_t)repeat_ptr[2] > available) + goto backtrack; + + /* How many times can we repeat the body? 
*/ + if (repeat_ptr[3] == SRE_UNLIMITED_REPEATS) + max_rep = available; + else + max_rep = unsigned_min(repeat_ptr[3], available); + + start_ptr = context.text_ptr; + if (forward) + min_ptr = start_ptr + repeat_ptr[2]; + else + min_ptr = start_ptr - repeat_ptr[2]; + + /* Match up to the minimum. */ + SRE_MATCH_MANY(&context, min_ptr, body); + + /* Matched at least the minimum? */ + if (forward) + ok = context.text_ptr >= min_ptr; + else + ok = context.text_ptr <= min_ptr; + if (!ok) + goto backtrack; + + /* Match until the tail could match, up to the maximum. */ + if (forward) + max_ptr = start_ptr + max_rep; + else + max_ptr = start_ptr - max_rep; + if(!SRE_MATCH_UNTIL_TAIL(&context, max_ptr, body, tail)) + /* Reached the maximum and the tail still couldn't match. */ + goto backtrack; + + /* + Save the context and initialise the repeat info for the new repeat + unless we're already at the maximum. + */ + if (context.text_ptr != max_ptr) + { + SRE_BACKTRACK_ITEM* new_loop; + + result = SRE_SAVE_BACKTRACK(&context, op, FALSE); + if (result != 0) + return SRE_CLEANUP(&context, result); + new_loop = context.backtrack_item; + new_loop->repeat.repeat_min = repeat_ptr[2]; + new_loop->repeat.repeat_max = max_rep; + if (forward) + new_loop->repeat.repeat_counter = context.text_ptr - + start_ptr; + else + new_loop->repeat.repeat_counter = start_ptr - + context.text_ptr; + new_loop->repeat.pattern_ptr = repeat_ptr; + new_loop->repeat.loop = current_loop; + new_loop->repeat.repeat_start = start_ptr; + } + + /* Now match the tail. */ + context.pattern_ptr = tail; + break; + } + case SRE_OP_REPEAT_ONE_POSS: + case SRE_OP_REPEAT_ONE_POSS_REV: + { + /* Possessive repeat. */ + /* + + ... + */ + BOOL forward = op == SRE_OP_REPEAT_ONE_POSS; + SRE_CODE* repeat_ptr; + SRE_CODE* body; + SRE_CODE* tail; + Py_ssize_t available; + Py_ssize_t max_rep; + SRE_CHAR* start_ptr; + SRE_CHAR* min_ptr; + SRE_CHAR* max_ptr; + BOOL ok; + + /* Point to the repeat operator. 
*/ + repeat_ptr = context.pattern_ptr; + + TRACE(("|%p|%p|%s %u %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, repeat_ptr[2], repeat_ptr[3])); + + /* Point to the body of the repeat and the tail of the pattern. */ + body = repeat_ptr + 4; + tail = repeat_ptr + 1 + repeat_ptr[1]; + + /* How many characters are still available? */ + if (forward) + available = context.text_end - context.text_ptr; + else + available = context.text_ptr - context.text_start; + + /* + Are there enough characters available for the repeat? + + The repeat should consume one character per iteration and must + iterate a minimum number of times. + */ + if ((Py_ssize_t)repeat_ptr[2] > available) + goto backtrack; + + /* How many times can we repeat the body? */ + if (repeat_ptr[3] == SRE_UNLIMITED_REPEATS) + max_rep = available; + else + max_rep = unsigned_min(repeat_ptr[3], available); + + start_ptr = context.text_ptr; + if (forward) { + min_ptr = start_ptr + repeat_ptr[2]; + max_ptr = start_ptr + max_rep; + } else { + min_ptr = start_ptr - repeat_ptr[2]; + max_ptr = start_ptr - max_rep; + } + + /* Match up to the maximum. */ + SRE_MATCH_MANY(&context, max_ptr, body); + + /* Matched at least the minimum? */ + if (forward) + ok = context.text_ptr >= min_ptr; + else + ok = context.text_ptr <= min_ptr; + if (!ok) + goto backtrack; + + /* Now match the tail. */ + context.pattern_ptr = tail; + break; + } + case SRE_OP_REPEAT_POSS: + case SRE_OP_REPEAT_POSS_REV: + { + /* Possessive repeat. */ + /* + + ... + + */ + BOOL forward = op == SRE_OP_REPEAT_POSS; + SRE_CODE* repeat_ptr; + SRE_CODE* end_repeat_ptr; + SRE_CODE* body; + SRE_CODE* tail; + Py_ssize_t available; + Py_ssize_t max_rep; + SRE_BACKTRACK_ITEM* new_loop; + BOOL try_body; + BOOL try_tail; + + /* Point to the repeat and end-repeat operators. 
*/ + repeat_ptr = context.pattern_ptr; + end_repeat_ptr = repeat_ptr + repeat_ptr[1]; + + TRACE(("|%p|%p|%s %u %u\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name, repeat_ptr[2], repeat_ptr[3])); + + /* Point to the body of the repeat and the tail of the pattern. */ + body = repeat_ptr + 4; + tail = end_repeat_ptr + 2; + + /* How many characters are still available? */ + if (forward) + available = context.text_end - context.text_ptr; + else + available = context.text_ptr - context.text_start; + + /* + Are there enough characters available for the repeat? + + The repeat should consume at least one character per iteration and + must iterate a minimum number of times. + */ + if ((Py_ssize_t)repeat_ptr[2] > available) + goto backtrack; + + /* How many times can we repeat the body? */ + if (repeat_ptr[3] == SRE_UNLIMITED_REPEATS) + max_rep = available; + else + max_rep = unsigned_min(repeat_ptr[3], available); + + /* + Save the context and initialise the repeat info for the new repeat. + + If the body succeeds then we'll discard its backtrack info, + including any marks, so we need to save the marks here in case the + tail fails. + */ + result = SRE_SAVE_BACKTRACK(&context, op, TRUE); + if (result != 0) + return SRE_CLEANUP(&context, result); + new_loop = context.backtrack_item; + new_loop->repeat.repeat_min = repeat_ptr[2]; + new_loop->repeat.repeat_max = max_rep; + new_loop->repeat.repeat_counter = 0; + new_loop->repeat.loop = current_loop; + new_loop->repeat.repeat_start = context.text_ptr; + + /* Should the body be tried? */ + try_body = available > 0; + /* Should the tail be tried? */ + try_tail = new_loop->repeat.repeat_min == 0 && + SRE_POSSIBLE_MATCH_AHEAD(&context, tail); + if (try_body) { + if (try_tail) { + /* + Both the body and the tail should be tried. + + The body takes precedence, so create a backtrack point for + the tail. 
+ */ + result = SRE_SAVE_BACKTRACK(&context, end_repeat_ptr[0], + FALSE); + if (result != 0) + return SRE_CLEANUP(&context, result); + /* Save the context for trying the tail. */ + context.backtrack_item->repeat.text_ptr = context.text_ptr; + context.backtrack_item->repeat.pattern_ptr = tail; + context.backtrack_item->repeat.loop = new_loop; + } + /* Try the body. */ + current_loop = new_loop; + context.pattern_ptr = body; + } else { + if (try_tail) + /* Only the tail should be tried, so do that. */ + context.pattern_ptr = tail; + else + /* + Neither the body nor the tail should be tried, so + backtrack. + */ + goto backtrack; + } + break; + } + case SRE_OP_SET: + /* Character in set (forwards). */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr >= context.text_end || + !in_set(state, context.pattern_ptr + 1, context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr = context.pattern_ptr + 1 + + context.pattern_ptr[1]; + break; + case SRE_OP_SET_IGNORE: + /* Character in set, ignoring case (forwards). */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr >= context.text_end || + !in_set_ignore(state, context.pattern_ptr + 1, + context.text_ptr[0])) + goto backtrack; + context.text_ptr++; + context.pattern_ptr = context.pattern_ptr + 1 + + context.pattern_ptr[1]; + break; + case SRE_OP_SET_IGNORE_REV: + /* Character in set, ignoring case (backwards). */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr <= context.text_start || + !in_set_ignore(state, context.pattern_ptr + 1, + context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr = context.pattern_ptr + 1 + + context.pattern_ptr[1]; + break; + case SRE_OP_SET_REV: + /* Character in set (backwards). 
*/ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr <= context.text_start || + !in_set(state, context.pattern_ptr + 1, context.text_ptr[-1])) + goto backtrack; + context.text_ptr--; + context.pattern_ptr = context.pattern_ptr + 1 + + context.pattern_ptr[1]; + break; + case SRE_OP_START_OF_LINE: + /* Start of line. */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr > context.text_beginning && + !state->encoding->in_category(SRE_CAT_LineBreak, + context.text_ptr[-1])) + goto backtrack; + context.pattern_ptr++; + break; + case SRE_OP_START_OF_SEARCH: + /* Start of search. */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr != context.search_ptr) + goto backtrack; + context.pattern_ptr++; + break; + case SRE_OP_START_OF_STRING: + /* Start of string. */ + /* */ + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + if (context.text_ptr > context.text_beginning) + goto backtrack; + context.pattern_ptr++; + break; case SRE_OP_SUCCESS: - /* end of pattern */ - TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr)); - state->ptr = ctx->ptr; - RETURN_SUCCESS; - - case SRE_OP_AT: - /* match at given position */ - /* */ - TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern)); - if (!SRE_AT(state, ctx->ptr, *ctx->pattern)) - RETURN_FAILURE; - ctx->pattern++; - break; - - case SRE_OP_CATEGORY: - /* match at given category */ - /* */ - TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern, - ctx->ptr, *ctx->pattern)); - if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0])) - RETURN_FAILURE; - ctx->pattern++; - ctx->ptr++; - break; - - case SRE_OP_ANY: - /* match anything (except a newline) */ - /* */ - TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr)); - if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0])) - 
RETURN_FAILURE; - ctx->ptr++; - break; - - case SRE_OP_ANY_ALL: - /* match anything */ - /* */ - TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr)); - if (ctx->ptr >= end) - RETURN_FAILURE; - ctx->ptr++; - break; - - case SRE_OP_IN: - /* match set member (or non_member) */ - /* */ - TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr)); - if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr)) - RETURN_FAILURE; - ctx->pattern += ctx->pattern[0]; - ctx->ptr++; - break; - - case SRE_OP_LITERAL_IGNORE: - TRACE(("|%p|%p|LITERAL_IGNORE %d\n", - ctx->pattern, ctx->ptr, ctx->pattern[0])); - if (ctx->ptr >= end || - state->lower(*ctx->ptr) != state->lower(*ctx->pattern)) - RETURN_FAILURE; - ctx->pattern++; - ctx->ptr++; - break; - - case SRE_OP_NOT_LITERAL_IGNORE: - TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", - ctx->pattern, ctx->ptr, *ctx->pattern)); - if (ctx->ptr >= end || - state->lower(*ctx->ptr) == state->lower(*ctx->pattern)) - RETURN_FAILURE; - ctx->pattern++; - ctx->ptr++; - break; - - case SRE_OP_IN_IGNORE: - TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr)); - if (ctx->ptr >= end - || !SRE_CHARSET(ctx->pattern+1, - (SRE_CODE)state->lower(*ctx->ptr))) - RETURN_FAILURE; - ctx->pattern += ctx->pattern[0]; - ctx->ptr++; - break; - - case SRE_OP_JUMP: - case SRE_OP_INFO: - /* jump forward */ - /* */ - TRACE(("|%p|%p|JUMP %d\n", ctx->pattern, - ctx->ptr, ctx->pattern[0])); - ctx->pattern += ctx->pattern[0]; - break; - - case SRE_OP_BRANCH: - /* alternation */ - /* <0=skip> code ... 
*/ - TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr)); - LASTMARK_SAVE(); - ctx->u.rep = state->repeat; - if (ctx->u.rep) - MARK_PUSH(ctx->lastmark); - for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) { - if (ctx->pattern[1] == SRE_OP_LITERAL && - (ctx->ptr >= end || - (SRE_CODE) *ctx->ptr != ctx->pattern[2])) - continue; - if (ctx->pattern[1] == SRE_OP_IN && - (ctx->ptr >= end || - !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr))) - continue; - state->ptr = ctx->ptr; - DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1); - if (ret) { - if (ctx->u.rep) - MARK_POP_DISCARD(ctx->lastmark); - RETURN_ON_ERROR(ret); - RETURN_SUCCESS; - } - if (ctx->u.rep) - MARK_POP_KEEP(ctx->lastmark); - LASTMARK_RESTORE(); - } - if (ctx->u.rep) - MARK_POP_DISCARD(ctx->lastmark); - RETURN_FAILURE; - - case SRE_OP_REPEAT_ONE: - /* match repeated sequence (maximizing regexp) */ - - /* this operator only works if the repeated item is - exactly one character wide, and we're not already - collecting backtracking points. for other cases, - use the MAX_REPEAT operator */ - - /* <1=min> <2=max> item tail */ - - TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, - ctx->pattern[1], ctx->pattern[2])); - - if (ctx->ptr + ctx->pattern[1] > end) - RETURN_FAILURE; /* cannot match */ - - state->ptr = ctx->ptr; - - ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]); - RETURN_ON_ERROR(ret); - DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos); - ctx->count = ret; - ctx->ptr += ctx->count; - - /* when we arrive here, count contains the number of - matches, and ctx->ptr points to the tail of the target - string. check if the rest of the pattern matches, - and backtrack if not. */ - - if (ctx->count < (Py_ssize_t) ctx->pattern[1]) - RETURN_FAILURE; - - if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) { - /* tail is empty. 
we're finished */ - state->ptr = ctx->ptr; - RETURN_SUCCESS; - } - - LASTMARK_SAVE(); - - if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) { - /* tail starts with a literal. skip positions where - the rest of the pattern cannot possibly match */ - ctx->u.chr = ctx->pattern[ctx->pattern[0]+1]; - for (;;) { - while (ctx->count >= (Py_ssize_t) ctx->pattern[1] && - (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) { - ctx->ptr--; - ctx->count--; - } - if (ctx->count < (Py_ssize_t) ctx->pattern[1]) - break; - state->ptr = ctx->ptr; - DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1, - ctx->pattern+ctx->pattern[0]); - if (ret) { - RETURN_ON_ERROR(ret); - RETURN_SUCCESS; - } - - LASTMARK_RESTORE(); - - ctx->ptr--; - ctx->count--; - } - - } else { - /* general case */ - while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) { - state->ptr = ctx->ptr; - DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2, - ctx->pattern+ctx->pattern[0]); - if (ret) { - RETURN_ON_ERROR(ret); - RETURN_SUCCESS; - } - ctx->ptr--; - ctx->count--; - LASTMARK_RESTORE(); - } - } - RETURN_FAILURE; - - case SRE_OP_MIN_REPEAT_ONE: - /* match repeated sequence (minimizing regexp) */ - - /* this operator only works if the repeated item is - exactly one character wide, and we're not already - collecting backtracking points. 
for other cases, - use the MIN_REPEAT operator */ - - /* <1=min> <2=max> item tail */ - - TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, - ctx->pattern[1], ctx->pattern[2])); - - if (ctx->ptr + ctx->pattern[1] > end) - RETURN_FAILURE; /* cannot match */ - - state->ptr = ctx->ptr; - - if (ctx->pattern[1] == 0) - ctx->count = 0; - else { - /* count using pattern min as the maximum */ - ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]); - RETURN_ON_ERROR(ret); - DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos); - if (ret < (Py_ssize_t) ctx->pattern[1]) - /* didn't match minimum number of times */ - RETURN_FAILURE; - /* advance past minimum matches of repeat */ - ctx->count = ret; - ctx->ptr += ctx->count; - } - - if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) { - /* tail is empty. we're finished */ - state->ptr = ctx->ptr; - RETURN_SUCCESS; - - } else { - /* general case */ - LASTMARK_SAVE(); - while ((Py_ssize_t)ctx->pattern[2] == 65535 - || ctx->count <= (Py_ssize_t)ctx->pattern[2]) { - state->ptr = ctx->ptr; - DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one, - ctx->pattern+ctx->pattern[0]); - if (ret) { - RETURN_ON_ERROR(ret); - RETURN_SUCCESS; - } - state->ptr = ctx->ptr; - ret = SRE_COUNT(state, ctx->pattern+3, 1); - RETURN_ON_ERROR(ret); - DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos); - if (ret == 0) - break; - assert(ret == 1); - ctx->ptr++; - ctx->count++; - LASTMARK_RESTORE(); - } - } - RETURN_FAILURE; - - case SRE_OP_REPEAT: - /* create repeat context. 
all the hard work is done - by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ - /* <1=min> <2=max> item tail */ - TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr, - ctx->pattern[1], ctx->pattern[2])); - - /* install new repeat context */ - ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep)); - if (!ctx->u.rep) { - PyErr_NoMemory(); - RETURN_FAILURE; - } - ctx->u.rep->count = -1; - ctx->u.rep->pattern = ctx->pattern; - ctx->u.rep->prev = state->repeat; - ctx->u.rep->last_ptr = NULL; - state->repeat = ctx->u.rep; - - state->ptr = ctx->ptr; - DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]); - state->repeat = ctx->u.rep->prev; - PyObject_FREE(ctx->u.rep); - - if (ret) { - RETURN_ON_ERROR(ret); - RETURN_SUCCESS; - } - RETURN_FAILURE; - - case SRE_OP_MAX_UNTIL: - /* maximizing repeat */ - /* <1=min> <2=max> item tail */ - - /* FIXME: we probably need to deal with zero-width - matches in here... */ - - ctx->u.rep = state->repeat; - if (!ctx->u.rep) - RETURN_ERROR(SRE_ERROR_STATE); - - state->ptr = ctx->ptr; - - ctx->count = ctx->u.rep->count+1; - - TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern, - ctx->ptr, ctx->count)); - - if (ctx->count < ctx->u.rep->pattern[1]) { - /* not enough matches */ - ctx->u.rep->count = ctx->count; - DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1, - ctx->u.rep->pattern+3); - if (ret) { - RETURN_ON_ERROR(ret); - RETURN_SUCCESS; - } - ctx->u.rep->count = ctx->count-1; - state->ptr = ctx->ptr; - RETURN_FAILURE; - } - - if ((ctx->count < ctx->u.rep->pattern[2] || - ctx->u.rep->pattern[2] == 65535) && - state->ptr != ctx->u.rep->last_ptr) { - /* we may have enough matches, but if we can - match another item, do so */ - ctx->u.rep->count = ctx->count; - LASTMARK_SAVE(); - MARK_PUSH(ctx->lastmark); - /* zero-width match protection */ - DATA_PUSH(&ctx->u.rep->last_ptr); - ctx->u.rep->last_ptr = state->ptr; - DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2, - ctx->u.rep->pattern+3); - DATA_POP(&ctx->u.rep->last_ptr); - if (ret) { 
- MARK_POP_DISCARD(ctx->lastmark); - RETURN_ON_ERROR(ret); - RETURN_SUCCESS; - } - MARK_POP(ctx->lastmark); - LASTMARK_RESTORE(); - ctx->u.rep->count = ctx->count-1; - state->ptr = ctx->ptr; - } - - /* cannot match more repeated items here. make sure the - tail matches */ - state->repeat = ctx->u.rep->prev; - DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern); - RETURN_ON_SUCCESS(ret); - state->repeat = ctx->u.rep; - state->ptr = ctx->ptr; - RETURN_FAILURE; - - case SRE_OP_MIN_UNTIL: - /* minimizing repeat */ - /* <1=min> <2=max> item tail */ - - ctx->u.rep = state->repeat; - if (!ctx->u.rep) - RETURN_ERROR(SRE_ERROR_STATE); - - state->ptr = ctx->ptr; - - ctx->count = ctx->u.rep->count+1; - - TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern, - ctx->ptr, ctx->count, ctx->u.rep->pattern)); - - if (ctx->count < ctx->u.rep->pattern[1]) { - /* not enough matches */ - ctx->u.rep->count = ctx->count; - DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1, - ctx->u.rep->pattern+3); - if (ret) { - RETURN_ON_ERROR(ret); - RETURN_SUCCESS; - } - ctx->u.rep->count = ctx->count-1; - state->ptr = ctx->ptr; - RETURN_FAILURE; - } - - LASTMARK_SAVE(); - - /* see if the tail matches */ - state->repeat = ctx->u.rep->prev; - DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern); - if (ret) { - RETURN_ON_ERROR(ret); - RETURN_SUCCESS; - } - - state->repeat = ctx->u.rep; - state->ptr = ctx->ptr; - - LASTMARK_RESTORE(); - - if (ctx->count >= ctx->u.rep->pattern[2] - && ctx->u.rep->pattern[2] != 65535) - RETURN_FAILURE; - - ctx->u.rep->count = ctx->count; - DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3, - ctx->u.rep->pattern+3); - if (ret) { - RETURN_ON_ERROR(ret); - RETURN_SUCCESS; - } - ctx->u.rep->count = ctx->count-1; - state->ptr = ctx->ptr; - RETURN_FAILURE; - - case SRE_OP_GROUPREF: - /* match backreference */ - TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern, - ctx->ptr, ctx->pattern[0])); - i = ctx->pattern[0]; - { - Py_ssize_t groupref = i+i; - if (groupref >= state->lastmark) { - 
RETURN_FAILURE; - } else { - SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; - SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; - if (!p || !e || e < p) - RETURN_FAILURE; - while (p < e) { - if (ctx->ptr >= end || *ctx->ptr != *p) - RETURN_FAILURE; - p++; ctx->ptr++; + { + /* Success at the end of the pattern. */ + /* */ + BOOL zero_width; + unsigned int m; + SRE_CHAR* end_ptr; + unsigned int max_mark; + TRACE(("|%p|%p|%s\n", context.pattern_ptr, context.text_ptr, + sre_op_info[op].name)); + + /* Is the entire matched portion zero-width? */ + zero_width = context.text_ptr == context.text_start; + + /* + Reject the match if it's zero-width and we aren't allowed to + return zero-width matches. + */ + if (zero_width && state->reject_zero_width) + goto backtrack; + + /* + Find the numbered mark which matched the furthest to the right. + */ + end_ptr = NULL; + for (m = 1; m < state->numbered_mark_count; m += 2) { + if (context.marks[m - 1] != NULL && + context.marks[m] >= context.marks[m - 1]) { + state->lastmark = m; + if (end_ptr < context.marks[m]) { + state->lastindex = 1 + m / 2; + end_ptr = context.marks[m]; } } } - ctx->pattern++; - break; - - case SRE_OP_GROUPREF_IGNORE: - /* match backreference */ - TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern, - ctx->ptr, ctx->pattern[0])); - i = ctx->pattern[0]; - { - Py_ssize_t groupref = i+i; - if (groupref >= state->lastmark) { - RETURN_FAILURE; - } else { - SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; - SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; - if (!p || !e || e < p) - RETURN_FAILURE; - while (p < e) { - if (ctx->ptr >= end || - state->lower(*ctx->ptr) != state->lower(*p)) - RETURN_FAILURE; - p++; ctx->ptr++; + + /* Find the named mark which matched the furthest to the right. 
*/ + end_ptr = NULL; + max_mark = state->numbered_mark_count + state->named_mark_count; + for (m = state->numbered_mark_count + 1; m < max_mark; m += 2) { + if (context.marks[m - 1] != NULL && + context.marks[m] >= context.marks[m - 1]) { + if (end_ptr < context.marks[m]) { + state->last_named_index = 1 + m / 2; + end_ptr = context.marks[m]; } } } - ctx->pattern++; - break; - - case SRE_OP_GROUPREF_EXISTS: - TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern, - ctx->ptr, ctx->pattern[0])); - /* codeyes codeno ... */ - i = ctx->pattern[0]; - { - Py_ssize_t groupref = i+i; - if (groupref >= state->lastmark) { - ctx->pattern += ctx->pattern[1]; + + /* Record where the match finished. */ + state->ptr = context.text_ptr; + return SRE_CLEANUP(&context, 1); + } + default: + /* Unknown opcode. */ + TRACE(("|%p|%p|UNKNOWN %u\n", context.pattern_ptr, context.text_ptr, + context.pattern_ptr[0])); + return SRE_CLEANUP(&context, SRE_ERROR_ILLEGAL); + } + } + +backtrack: + /* Handle the backtracking. */ + TRACE(("|%p|%p|BACKTRACK ", context.pattern_ptr, context.text_ptr)); + + /* Fetch the backtracking info. */ + context.backtrack_item = context.backtrack_chunk->items + + (context.backtrack_chunk->count - 1); + + op = context.backtrack_item->op; + switch (op) { + case SRE_OP_ASSERT: + /* Assert subpattern (+ve look-ahead/look-behind). */ + /* ... */ + TRACE(("%s\n", sre_op_info[op].name)); + + /* + The subpattern has failed, so the marks have already been restored. + + Restore the context and continue backtracking. + */ + context.text_start = context.backtrack_item->assert.text_start; + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + case SRE_OP_ASSERT_NOT: + /* Assert not subpattern (-ve look-ahead/look-behind). */ + /* ... */ + TRACE(("%s\n", sre_op_info[op].name)); + + /* + The subpattern has failed, so the marks have already been restored. + + Restore the context and continue matching. 
+ */ + context.text_start = context.backtrack_item->assert.text_start; + context.text_ptr = context.backtrack_item->assert.text_ptr; + context.pattern_ptr = context.backtrack_item->assert.pattern_ptr; + SRE_DISCARD_BACKTRACK(&context); + + context.pattern_ptr += 1 + context.pattern_ptr[1]; + goto advance; + case SRE_OP_ATOMIC: + /* Atomic subpattern. */ + /* ... */ + TRACE(("%s\n", sre_op_info[op].name)); + + /* + The subpattern has failed, so the marks have already been restored. + + Continue backtracking. + */ + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + case SRE_OP_BRANCH: + { + /* Alternation. */ + /* + + + ... + + + ... + + 0 + */ + SRE_CODE* skip_ptr; + TRACE(("%s\n", sre_op_info[op].name)); + + /* Fetch the next branch. */ + skip_ptr = context.backtrack_item->branch.pattern_ptr; + + /* Restore the context for the next branch. */ + context.text_ptr = context.backtrack_item->branch.text_ptr; + + /* Look ahead in the branch to avoid unnecessary backtracking. */ + while (! SRE_POSSIBLE_MATCH_AHEAD(&context, skip_ptr + 1)) { + /* This branch can't match, so advance to the next one. */ + skip_ptr += skip_ptr[0]; + + /* Is there another branch? */ + if (skip_ptr[0] == 0) { + /* No more branches, so backtrack. */ + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + } + } + + /* Try this branch. */ + context.pattern_ptr = skip_ptr + 1; + + /* + Is there another branch? + + There's no need to save the context if this is the last branch. + */ + skip_ptr += skip_ptr[0]; + if (skip_ptr[0] == 0) + /* No more branches after this one. */ + SRE_DISCARD_BACKTRACK(&context); + else + /* Save the next branch for backtracking. */ + context.backtrack_item->branch.pattern_ptr = skip_ptr; + goto advance; + } + case SRE_OP_END_ATOMIC: + /* Atomic subpattern. */ + /* ... */ + TRACE(("%s\n", sre_op_info[op].name)); + + /* + The tail has failed, so restore the marks and continue backtracking. 
+ */ + memmove(context.marks, context.backtrack_item->marks, + context.marks_size); + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + case SRE_OP_END_REPEAT_MAX: + case SRE_OP_END_REPEAT_MAX_REV: + /* End of greedy repeat. */ + /* + + ... + + */ + TRACE(("%s\n", sre_op_info[op].name)); + + /* Restore the context. */ + context.text_ptr = context.backtrack_item->repeat.text_ptr; + context.pattern_ptr = context.backtrack_item->repeat.pattern_ptr; + + /* + REPEAT_MAX prefers trying the body to trying the tail. + + We've tried the body, so now we need to try the tail. + + The tail expects the current loop to be this one's enclosing (outer) + loop. + */ + current_loop = context.backtrack_item->repeat.loop->repeat.loop; + SRE_DISCARD_BACKTRACK(&context); + goto advance; + case SRE_OP_END_REPEAT_MIN: + case SRE_OP_END_REPEAT_MIN_REV: + /* Lazy repeat. */ + /* + + ... + + */ + TRACE(("%s\n", sre_op_info[op].name)); + + /* Restore the context. */ + context.text_ptr = context.backtrack_item->repeat.text_ptr; + context.pattern_ptr = context.backtrack_item->repeat.pattern_ptr; + + /* + REPEAT_MIN prefers trying the tail to trying the body. + + We've tried the tail, so now we need to try the body. + + The body expects the current loop to be this one's. + */ + current_loop = context.backtrack_item->repeat.loop; + SRE_DISCARD_BACKTRACK(&context); + goto advance; + case SRE_OP_END_REPEAT_POSS: + case SRE_OP_END_REPEAT_POSS_REV: + /* End of possessive repeat. */ + /* + + ... + + */ + TRACE(("%s\n", sre_op_info[op].name)); + + /* Restore the context. */ + context.text_ptr = context.backtrack_item->repeat.text_ptr; + context.pattern_ptr = context.backtrack_item->repeat.pattern_ptr; + + /* + REPEAT_POSS prefers trying the body to trying the tail. + + We've tried the body, so now we need to try the tail. + + The tail expects the current loop to be this one's enclosing (outer) + loop. 
+ */ + current_loop = context.backtrack_item->repeat.loop->repeat.loop; + SRE_DISCARD_BACKTRACK(&context); + goto advance; + case SRE_OP_FAILURE: + /* Failed to match. */ + TRACE(("%s\n", sre_op_info[op].name)); + + /* Permit a zero-width match next time. */ + state->reject_zero_width = FALSE; + return SRE_CLEANUP(&context, 0); + case SRE_OP_MARK: + { + /* Text mark. */ + /* */ + SRE_BACKTRACK_ITEM* item; + TRACE(("%s\n", sre_op_info[op].name)); + + /* + The number and name ids need to be restored in the opposite order to + which they were saved. This is because the name id might be the same + as the number id. + */ + item = context.backtrack_item; + /* The name id. */ + context.marks[item->mark.named_index] = item->mark.named_mark_ptr; + /* The number id. */ + context.marks[item->mark.numbered_index] = item->mark.numbered_mark_ptr; + + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + } + case SRE_OP_REPEAT_MAX: + case SRE_OP_REPEAT_MAX_REV: + /* Greedy repeat. */ + /* + + ... + + */ + TRACE(("%s\n", sre_op_info[op].name)); + + /* + REPEAT_MAX failed. + + Restore 'current' loop to the enclosing loop and backtrack. + */ + current_loop = context.backtrack_item->repeat.loop; + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + case SRE_OP_REPEAT_MIN: + case SRE_OP_REPEAT_MIN_REV: + /* Lazy repeat. */ + /* + + ... + + */ + TRACE(("%s\n", sre_op_info[op].name)); + + /* + REPEAT_MIN failed. + + Restore 'current' loop to the enclosing loop and backtrack. + */ + current_loop = context.backtrack_item->repeat.loop; + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + case SRE_OP_REPEAT_ONE_MAX: + case SRE_OP_REPEAT_ONE_MAX_REV: + { + /* Greedy repeat. */ + /* + + ... + */ + BOOL forward = op == SRE_OP_REPEAT_ONE_MAX; + SRE_CODE* repeat_ptr; + SRE_CODE* tail; + SRE_BACKTRACK_ITEM* loop; + SRE_CHAR* start_ptr; + BOOL ok; + TRACE(("%s\n", sre_op_info[op].name)); + + /* Point to the repeat operator. 
*/ + repeat_ptr = context.backtrack_item->repeat.pattern_ptr; + + /* Point to the tail of the pattern. */ + tail = repeat_ptr + 1 + repeat_ptr[1]; + + /* The loop info is stored in the backtrack info. */ + loop = context.backtrack_item; + + /* Restore the context. */ + start_ptr = loop->repeat.repeat_start; + if (forward) + context.text_ptr = start_ptr + loop->repeat.repeat_counter; + else + context.text_ptr = start_ptr - loop->repeat.repeat_counter; + + /* + Release the last character we matched in the body and then unmatch down + to the minimum, until the tail could match. + */ + if (forward) { + context.text_ptr--; + ok = SRE_UNMATCH_UNTIL_TAIL(&context, start_ptr + + loop->repeat.repeat_min, tail); + } else { + context.text_ptr++; + ok = SRE_UNMATCH_UNTIL_TAIL_REV(&context, start_ptr - + loop->repeat.repeat_min, tail); + } + if(!ok) { + /* Reached the minimum and the tail still couldn't match. */ + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + } + + /* How many times has the body matched? */ + if (forward) + loop->repeat.repeat_counter = context.text_ptr - start_ptr; + else + loop->repeat.repeat_counter = start_ptr - context.text_ptr; + + /* + Now match the tail. + + The tail expects the 'current' loop to be the enclosing one. + */ + current_loop = loop->repeat.loop; + context.pattern_ptr = tail; + goto advance; + } + case SRE_OP_REPEAT_ONE_MIN: + case SRE_OP_REPEAT_ONE_MIN_REV: + { + /* Lazy repeat. */ + /* + + ... + */ + BOOL forward = op == SRE_OP_REPEAT_ONE_MIN; + SRE_CODE* repeat_ptr; + SRE_CODE* body; + SRE_CODE* tail; + SRE_BACKTRACK_ITEM* loop; + SRE_CHAR* start_ptr; + SRE_CHAR* max_ptr; + TRACE(("%s\n", sre_op_info[op].name)); + + /* Point to the repeat operator. */ + repeat_ptr = context.backtrack_item->repeat.pattern_ptr; + + /* Point to the body of the repeat and the tail of the pattern. */ + body = repeat_ptr + 4; + tail = repeat_ptr + 1 + repeat_ptr[1]; + + /* The loop info is stored in the backtrack info. 
*/ + loop = context.backtrack_item; + + /* Restore the context. */ + start_ptr = loop->repeat.repeat_start; + if (forward) + context.text_ptr = start_ptr + loop->repeat.repeat_counter; + else + context.text_ptr = start_ptr - loop->repeat.repeat_counter; + + /* Match up to the maximum, until the tail could match. */ + if (forward) + max_ptr = start_ptr + context.backtrack_item->repeat.repeat_max; + else + max_ptr = start_ptr - context.backtrack_item->repeat.repeat_max; + if (!SRE_MATCH_MANY_UNTIL_TAIL(&context, max_ptr, body, tail)) { + /* Reached the maximum and the tail still couldn't match. */ + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + } + + /* How many times has the body matched? */ + if (forward) + loop->repeat.repeat_counter = context.text_ptr - start_ptr; + else + loop->repeat.repeat_counter = start_ptr - context.text_ptr; + + /* + Now match the tail. + + The tail expects the 'current' loop to be the enclosing one. + */ + current_loop = loop->repeat.loop; + context.pattern_ptr = tail; + goto advance; + } + case SRE_OP_REPEAT_POSS: + case SRE_OP_REPEAT_POSS_REV: + /* Possessive repeat. */ + /* + + ... + + */ + TRACE(("%s\n", sre_op_info[op].name)); + + /* + REPEAT_POSS failed. + + Restore the marks, restore 'current' loop to the enclosing loop and + backtrack. + */ + memmove(context.marks, context.backtrack_item->marks, + context.marks_size); + current_loop = context.backtrack_item->repeat.loop; + SRE_DISCARD_BACKTRACK(&context); + goto backtrack; + default: + /* Unknown opcode. 
*/ + TRACE(("UNKNOWN %u\n", context.backtrack_item->op)); + return SRE_CLEANUP(&context, SRE_ERROR_ILLEGAL); + } + + return 0; +} + +LOCAL(int) SRE_SEARCH(SRE_STATE* state) { + SRE_CODE* repeat_ptr; + SRE_CODE* tail; + SRE_CONTEXT context; + int status = 0; + + /* + If a pattern starts with "c{m,n}" where "c" matches a single character and + the pattern fails, then advancing by only one character before retrying + could be inefficient (if m < n and it failed to match m...n times when it + would certainly fail to match m...n-1 times!). + */ + repeat_ptr = state->pattern_code; + if (is_repeat_one(repeat_ptr[0]) && repeat_ptr[3] == SRE_UNLIMITED_REPEATS) + repeat_ptr += 4; + else + repeat_ptr = NULL; + + /* Skip over any marks. */ + tail = state->pattern_code; + while (tail[0] == SRE_OP_MARK) + tail += SRE_MARK_OP_SIZE; + + /* + If the pattern is anchored at the start of the string or the start of the + search then try a match instead of searching. + */ + if (tail[0] == SRE_OP_START_OF_STRING || + tail[0] == SRE_OP_START_OF_SEARCH) { + /* Where should we start the match? */ + state->ptr = state->reverse ? (SRE_CHAR *)state->end : + (SRE_CHAR *)state->start; + state->search_ptr = state->ptr; + return SRE_MATCH(state); + } + + /* + Initialise the context. + + It's more efficient to do it here instead of each time we try a match. + */ + context.state = state; + context.text_beginning = (SRE_CHAR *)state->beginning; + context.text_start = (SRE_CHAR *)state->start; + context.text_end = (SRE_CHAR *)state->end; + + /* Point to the final newline if it's the final character. */ + context.final_linebreak = context.text_beginning < context.text_end && + state->encoding->in_category(SRE_CAT_LineBreak, context.text_end[-1]) ? + context.text_end - 1 : NULL; + + /* + state->reject_zero_width might initially be set to reject an initial zero- + width match. + + If there's no match initially then state->reject_zero_width will be + cleared to allow a zero-width match subsequently. 
+ */ + if (state->reverse) { + /* + We want to search backwards. + + Where should we start the match? + */ + context.text_ptr = (SRE_CHAR *)state->end; + + /* Try a match at each position until we're successful. */ + while (context.text_ptr >= context.text_start) { + TRACE(("|%p|%p|SEARCH\n", state->pattern_code, context.text_ptr)); + + /* Could the pattern match here? */ + if (SRE_POSSIBLE_MATCH_AHEAD(&context, tail)) { + /* Try a match. */ + state->end = state->ptr = context.text_ptr; + status = SRE_MATCH(state); + if (status != 0) break; - } else { - SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; - SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; - if (!p || !e || e < p) { - ctx->pattern += ctx->pattern[1]; - break; - } - } - } - ctx->pattern += 2; - break; - - case SRE_OP_ASSERT: - /* assert subpattern */ - /* */ - TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern, - ctx->ptr, ctx->pattern[1])); - state->ptr = ctx->ptr - ctx->pattern[1]; - if (state->ptr < state->beginning) - RETURN_FAILURE; - DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2); - RETURN_ON_FAILURE(ret); - ctx->pattern += ctx->pattern[0]; - break; - - case SRE_OP_ASSERT_NOT: - /* assert not subpattern */ - /* */ - TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern, - ctx->ptr, ctx->pattern[1])); - state->ptr = ctx->ptr - ctx->pattern[1]; - if (state->ptr >= state->beginning) { - DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2); - if (ret) { - RETURN_ON_ERROR(ret); - RETURN_FAILURE; - } - } - ctx->pattern += ctx->pattern[0]; - break; - - case SRE_OP_FAILURE: - /* immediate failure */ - TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr)); - RETURN_FAILURE; - - default: - TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr, - ctx->pattern[-1])); - RETURN_ERROR(SRE_ERROR_ILLEGAL); - } - } - -exit: - ctx_pos = ctx->last_ctx_pos; - jump = ctx->jump; - DATA_POP_DISCARD(ctx); - if (ctx_pos == -1) - return ret; - DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos); - - switch (jump) { - case 
JUMP_MAX_UNTIL_2: - TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr)); - goto jump_max_until_2; - case JUMP_MAX_UNTIL_3: - TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr)); - goto jump_max_until_3; - case JUMP_MIN_UNTIL_2: - TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr)); - goto jump_min_until_2; - case JUMP_MIN_UNTIL_3: - TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr)); - goto jump_min_until_3; - case JUMP_BRANCH: - TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr)); - goto jump_branch; - case JUMP_MAX_UNTIL_1: - TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr)); - goto jump_max_until_1; - case JUMP_MIN_UNTIL_1: - TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr)); - goto jump_min_until_1; - case JUMP_REPEAT: - TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr)); - goto jump_repeat; - case JUMP_REPEAT_ONE_1: - TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr)); - goto jump_repeat_one_1; - case JUMP_REPEAT_ONE_2: - TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr)); - goto jump_repeat_one_2; - case JUMP_MIN_REPEAT_ONE: - TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr)); - goto jump_min_repeat_one; - case JUMP_ASSERT: - TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr)); - goto jump_assert; - case JUMP_ASSERT_NOT: - TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr)); - goto jump_assert_not; - case JUMP_NONE: - TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret)); - break; - } - - return ret; /* should never get here */ -} - -LOCAL(Py_ssize_t) -SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) -{ - SRE_CHAR* ptr = (SRE_CHAR *)state->start; - SRE_CHAR* end = (SRE_CHAR *)state->end; - Py_ssize_t status = 0; - Py_ssize_t prefix_len = 0; - Py_ssize_t prefix_skip = 0; - SRE_CODE* prefix = NULL; - SRE_CODE* charset = NULL; - SRE_CODE* overlap = NULL; - int flags = 0; - - if (pattern[0] == SRE_OP_INFO) { - /* optimization info block */ - /* 
<1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ - - flags = pattern[2]; - - if (pattern[3] > 1) { - /* adjust end point (but make sure we leave at least one - character in there, so literal search will work) */ - end -= pattern[3]-1; - if (end <= ptr) - end = ptr+1; - } - - if (flags & SRE_INFO_PREFIX) { - /* pattern starts with a known prefix */ - /* */ - prefix_len = pattern[5]; - prefix_skip = pattern[6]; - prefix = pattern + 7; - overlap = prefix + prefix_len - 1; - } else if (flags & SRE_INFO_CHARSET) - /* pattern starts with a character from a known set */ - /* */ - charset = pattern + 5; - - pattern += 1 + pattern[1]; - } - - TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip)); - TRACE(("charset = %p\n", charset)); - -#if defined(USE_FAST_SEARCH) - if (prefix_len > 1) { - /* pattern starts with a known prefix. use the overlap - table to skip forward as fast as we possibly can */ - Py_ssize_t i = 0; - end = (SRE_CHAR *)state->end; - while (ptr < end) { - for (;;) { - if ((SRE_CODE) ptr[0] != prefix[i]) { - if (!i) - break; + + /* Is there an initial repeat? */ + if (repeat_ptr != NULL) { + /* + How many characters could the initial repeat match if + unlimited? + */ + SRE_CHAR * max_ptr = context.text_ptr - repeat_ptr[3]; + SRE_MATCH_MANY(&context, context.text_start, + repeat_ptr); + + if (context.text_ptr >= max_ptr) + /* + The initial repeat could have consumed all those + available, but it still failed to match, so discard + all of those, advance by one, and try again. 
+ */ + context.text_ptr--; else - i = overlap[i]; - } else { - if (++i == prefix_len) { - /* found a potential match */ - TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr)); - state->start = ptr + 1 - prefix_len; - state->ptr = ptr + 1 - prefix_len + prefix_skip; - if (flags & SRE_INFO_LITERAL) - return 1; /* we got all of it */ - status = SRE_MATCH(state, pattern + 2*prefix_skip); - if (status != 0) - return status; - /* close but no cigar -- try again */ - i = overlap[i]; - } + /* + The initial repeat couldn't have consumed all those + available, so discard until it /could/ consume all + those available and try again. + */ + context.text_ptr += repeat_ptr[3]; + } else + /* Advance and try again. */ + context.text_ptr--; + } else + /* Advance and try again. */ + context.text_ptr--; + + state->reject_zero_width = FALSE; + } + } else { + /* + We want to search forwards. + + Where should we start the match? + */ + context.text_ptr = (SRE_CHAR *)state->start; + + /* Try a match at each position until we're successful. */ + while (context.text_ptr <= context.text_end) { + TRACE(("|%p|%p|SEARCH\n", state->pattern_code, context.text_ptr)); + + /* Could the pattern match here? */ + if (SRE_POSSIBLE_MATCH_AHEAD(&context, tail)) { + /* Try a match. */ + state->start = state->ptr = context.text_ptr; + status = SRE_MATCH(state); + if (status != 0) break; - } - } - ptr++; - } - return 0; - } -#endif - - if (pattern[0] == SRE_OP_LITERAL) { - /* pattern starts with a literal character.
this is used - for short prefixes, and if fast search is disabled */ - SRE_CODE chr = pattern[1]; - end = (SRE_CHAR *)state->end; - for (;;) { - while (ptr < end && (SRE_CODE) ptr[0] != chr) - ptr++; - if (ptr >= end) - return 0; - TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr)); - state->start = ptr; - state->ptr = ++ptr; - if (flags & SRE_INFO_LITERAL) - return 1; /* we got all of it */ - status = SRE_MATCH(state, pattern + 2); - if (status != 0) - break; - } - } else if (charset) { - /* pattern starts with a character from a known set */ - end = (SRE_CHAR *)state->end; - for (;;) { - while (ptr < end && !SRE_CHARSET(charset, ptr[0])) - ptr++; - if (ptr >= end) - return 0; - TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr)); - state->start = ptr; - state->ptr = ptr; - status = SRE_MATCH(state, pattern); - if (status != 0) - break; - ptr++; - } - } else - /* general case */ - while (ptr <= end) { - TRACE(("|%p|%p|SEARCH\n", pattern, ptr)); - state->start = state->ptr = ptr++; - status = SRE_MATCH(state, pattern); - if (status != 0) - break; - } + + /* Is there an initial repeat? */ + if (repeat_ptr != NULL) { + /* + How many characters could the initial repeat match if + unlimited? + */ + SRE_CHAR * max_ptr = context.text_ptr + repeat_ptr[3]; + SRE_MATCH_MANY(&context, context.text_end, + repeat_ptr); + + if (context.text_ptr <= max_ptr) + /* + The initial repeat could have consumed all those + available, but it still failed to match, so discard + all of those, advance by one, and try again. + */ + context.text_ptr++; + else + /* + The initial repeat couldn't have consumed all those + available, so discard until it /could/ consume all + those available and try again. + */ + context.text_ptr -= repeat_ptr[3]; + } else + /* Advance and try again. */ + context.text_ptr++; + } else + /* Advance and try again.
*/ + context.text_ptr++; + + state->reject_zero_width = FALSE; + } + } return status; } -LOCAL(int) -SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len) -{ +LOCAL(BOOL) SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len) { /* check if given string is a literal template (i.e. no escapes) */ while (len-- > 0) if (*ptr++ == '\\') return 0; - return 1; + return TRUE; } #if !defined(SRE_RECURSIVE) @@ -1630,49 +5109,70 @@ /* factories and destructors */ /* see sre.h for object declarations */ -static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int); -static PyObject*pattern_scanner(PatternObject*, PyObject*); - -static PyObject * -sre_codesize(PyObject* self, PyObject *unused) -{ +static PyObject* pattern_new_match(PatternObject*, SRE_STATE*, int); +static PyObject* pattern_scanner(PatternObject*, PyObject*); + +static PyObject* sre_codesize(PyObject* self, PyObject *unused) { return Py_BuildValue("l", sizeof(SRE_CODE)); } -static PyObject * -sre_getlower(PyObject* self, PyObject* args) -{ +/* Exported function to convert a character to lowercase. */ +static PyObject* sre_getlower(PyObject* self, PyObject* args) { int character, flags; if (!PyArg_ParseTuple(args, "ii", &character, &flags)) return NULL; if (flags & SRE_FLAG_LOCALE) - return Py_BuildValue("i", sre_lower_locale(character)); + return Py_BuildValue("i", loc_lower(character)); if (flags & SRE_FLAG_UNICODE) #if defined(HAVE_UNICODE) - return Py_BuildValue("i", sre_lower_unicode(character)); + return Py_BuildValue("i", uni_lower(character)); #else - return Py_BuildValue("i", sre_lower_locale(character)); + return Py_BuildValue("i", loc_lower(character)); #endif - return Py_BuildValue("i", sre_lower(character)); -} - -LOCAL(void) -state_reset(SRE_STATE* state) -{ - /* FIXME: dynamic! */ - /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/ - + return Py_BuildValue("i", ascii_lower(character)); +} + +/* Exported function to convert a character to uppercase. 
*/ +static PyObject* sre_getupper(PyObject* self, PyObject* args) { + int character, flags; + if (!PyArg_ParseTuple(args, "ii", &character, &flags)) + return NULL; + if (flags & SRE_FLAG_LOCALE) + return Py_BuildValue("i", loc_upper(character)); + if (flags & SRE_FLAG_UNICODE) +#if defined(HAVE_UNICODE) + return Py_BuildValue("i", uni_upper(character)); +#else + return Py_BuildValue("i", loc_upper(character)); +#endif + return Py_BuildValue("i", ascii_upper(character)); +} + +/* Exported function to convert a character to titlecase. */ +static PyObject* sre_gettitle(PyObject* self, PyObject* args) { + int character, flags; + if (!PyArg_ParseTuple(args, "ii", &character, &flags)) + return NULL; + if (flags & SRE_FLAG_LOCALE) + return Py_BuildValue("i", loc_upper(character)); + if (flags & SRE_FLAG_UNICODE) +#if defined(HAVE_UNICODE) + return Py_BuildValue("i", uni_title(character)); +#else + return Py_BuildValue("i", loc_upper(character)); +#endif + return Py_BuildValue("i", ascii_upper(character)); +} + +/* Resets the state. */ +LOCAL(void) state_reset(SRE_STATE* state) { state->lastmark = -1; state->lastindex = -1; - - state->repeat = NULL; - - data_stack_dealloc(state); -} - -static void* -getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize) -{ + state->last_named_index = -1; +} + +static void* getstring(PyObject* string, Py_ssize_t* p_length, + int* p_charsize) { /* given a python object, return a data pointer, a length (in characters), and a character size. 
return NULL if the object is not a string (or not compatible) */ @@ -1694,7 +5194,7 @@ #endif /* get pointer to string buffer */ - buffer = Py_TYPE(string)->tp_as_buffer; + buffer = string->ob_type->tp_as_buffer; if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount || buffer->bf_getsegcount(string, NULL) != 1) { PyErr_SetString(PyExc_TypeError, "expected string or buffer"); @@ -1736,10 +5236,9 @@ return ptr; } -LOCAL(PyObject*) -state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, - Py_ssize_t start, Py_ssize_t end) -{ +/* Initialises the state. */ +LOCAL(PyObject*) state_init(SRE_STATE* state, PatternObject* pattern, + PyObject* string, Py_ssize_t start, Py_ssize_t end, SRE_CODE* pattern_code) { /* prepare state object */ Py_ssize_t length; @@ -1748,12 +5247,34 @@ memset(state, 0, sizeof(SRE_STATE)); + /* Store the pattern. */ + state->pattern_code = pattern_code; + + /* Create the first chunk of backtracking items. */ + state->backtrack_chunk = + (SRE_BACKTRACK_CHUNK*)PyMem_MALLOC(sizeof(SRE_BACKTRACK_CHUNK)); + if (state->backtrack_chunk == NULL) + goto error; + + state->backtrack_chunk->previous = NULL; + state->backtrack_chunk->count = 0; + + /* + Calculate how many numbered and named marks there are. + + All capture groups are numbered. Some also have a name, but there can be + multiple groups with the same name, so there are name ids too. + */ + state->numbered_mark_count = 2 * pattern->groups; + state->named_mark_count = 2 * (pattern->internal_groups - pattern->groups); + state->lastmark = -1; state->lastindex = -1; + state->last_named_index = -1; ptr = getstring(string, &length, &charsize); if (!ptr) - return NULL; + goto error; /* adjust boundaries */ if (start < 0) @@ -1769,48 +5290,67 @@ state->charsize = charsize; state->beginning = ptr; - state->start = (void*) ((char*) ptr + start * state->charsize); state->end = (void*) ((char*) ptr + end * state->charsize); + + /* Whether to reject zero-width matches. 
*/ + state->reject_zero_width = FALSE; Py_INCREF(string); state->string = string; state->pos = start; state->endpos = end; - if (pattern->flags & SRE_FLAG_LOCALE) - state->lower = sre_lower_locale; - else if (pattern->flags & SRE_FLAG_UNICODE) -#if defined(HAVE_UNICODE) - state->lower = sre_lower_unicode; -#else - state->lower = sre_lower_locale; -#endif + /* + What is the encoding of the text? + + The term "encoding" might not be correct: here it means whether the text + is ASCII, locale-specific 8-bit, or Unicode. + */ + if ((pattern->flags & SRE_FLAG_UNICODE) || state->charsize > 1) + /* We'll assume that non-8-bit text is Unicode. */ + state->encoding = &unicode_encoding; + else if (pattern->flags & SRE_FLAG_LOCALE) + /* Locale-specific 8-bit. */ + state->encoding = &locale_encoding; else - state->lower = sre_lower; + /* ASCII. */ + state->encoding = &ascii_encoding; + + /* Whether to search backwards. */ + state->reverse = pattern->flags & SRE_FLAG_REVERSE; return string; -} - -LOCAL(void) -state_fini(SRE_STATE* state) -{ + +error: + PyMem_FREE(state->backtrack_chunk); + return NULL; +} + +LOCAL(void) state_fini(SRE_STATE* state) { + /* + There are actually 2 versions of backtrack_chunk, 8-bit and Unicode. This + shouldn't be a problem because they have the same format and contain + pointers and an int, which are always the same size. 
+ */ + PyMem_FREE(state->backtrack_chunk); + state->backtrack_chunk = NULL; + Py_XDECREF(state->string); - data_stack_dealloc(state); } /* calculate offset from start of string */ -#define STATE_OFFSET(state, member)\ +#define STATE_OFFSET(state, member) \ (((char*)(member) - (char*)(state)->beginning) / (state)->charsize) -LOCAL(PyObject*) -state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty) -{ +LOCAL(PyObject*) state_getslice(SRE_STATE* state, Py_ssize_t index, + PyObject* string, int empty) { Py_ssize_t i, j; index = (index - 1) * 2; - if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) { + if (string == Py_None || index >= state->lastmark || !state->mark[index] || + !state->mark[index + 1]) { if (empty) /* want empty string */ i = j = 0; @@ -1820,15 +5360,13 @@ } } else { i = STATE_OFFSET(state, state->mark[index]); - j = STATE_OFFSET(state, state->mark[index+1]); + j = STATE_OFFSET(state, state->mark[index + 1]); } return PySequence_GetSlice(string, i, j); } -static void -pattern_error(int status) -{ +static void pattern_error(int status) { switch (status) { case SRE_ERROR_RECURSION_LIMIT: PyErr_SetString( @@ -1851,23 +5389,20 @@ } } -static void -pattern_dealloc(PatternObject* self) -{ +static void pattern_dealloc(PatternObject* self) { if (self->weakreflist != NULL) - PyObject_ClearWeakRefs((PyObject *) self); + PyObject_ClearWeakRefs((PyObject*)self); Py_XDECREF(self->pattern); Py_XDECREF(self->groupindex); Py_XDECREF(self->indexgroup); PyObject_DEL(self); } -static PyObject* -pattern_match(PatternObject* self, PyObject* args, PyObject* kw) -{ +static PyObject* pattern_match(PatternObject* self, PyObject* args, + PyObject* kw) { SRE_STATE state; int status; - + SRE_CODE* pattern_code; PyObject* string; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; @@ -1876,23 +5411,26 @@ &string, &start, &end)) return NULL; - string = state_init(&state, self, string, start, end); + 
pattern_code = PatternObject_GetCode(self); + + string = state_init(&state, self, string, start, end, pattern_code); if (!string) return NULL; - state.ptr = state.start; - - TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr)); + state.ptr = state.reverse ? state.end : state.start; + state.search_ptr = state.ptr; + + TRACE(("|%p|%p|MATCH\n", pattern_code, state.ptr)); if (state.charsize == 1) { - status = sre_match(&state, PatternObject_GetCode(self)); + status = sre_bmatch(&state); } else { #if defined(HAVE_UNICODE) - status = sre_umatch(&state, PatternObject_GetCode(self)); + status = sre_umatch(&state); #endif } - TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); + TRACE(("|%p|%p|END\n", pattern_code, state.ptr)); if (PyErr_Occurred()) return NULL; @@ -1901,12 +5439,11 @@ return pattern_new_match(self, &state, status); } -static PyObject* -pattern_search(PatternObject* self, PyObject* args, PyObject* kw) -{ +static PyObject* pattern_search(PatternObject* self, PyObject* args, + PyObject* kw) { SRE_STATE state; int status; - + SRE_CODE* pattern_code; PyObject* string; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; @@ -1915,21 +5452,23 @@ &string, &start, &end)) return NULL; - string = state_init(&state, self, string, start, end); + pattern_code = PatternObject_GetCode(self); + + string = state_init(&state, self, string, start, end, pattern_code); if (!string) return NULL; - TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr)); + TRACE(("|%p|%p|SEARCH\n", pattern_code, state.ptr)); if (state.charsize == 1) { - status = sre_search(&state, PatternObject_GetCode(self)); + status = sre_bsearch(&state); } else { #if defined(HAVE_UNICODE) - status = sre_usearch(&state, PatternObject_GetCode(self)); + status = sre_usearch(&state); #endif } - TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); + TRACE(("|%p|%p|END\n", pattern_code, state.ptr)); state_fini(&state); @@ -1939,9 +5478,7 @@ return 
pattern_new_match(self, &state, status); } -static PyObject* -call(char* module, char* function, PyObject* args) -{ +static PyObject* call(char* module, char* function, PyObject* args) { PyObject* name; PyObject* mod; PyObject* func; @@ -1967,9 +5504,7 @@ } #ifdef USE_BUILTIN_COPY -static int -deepcopy(PyObject** object, PyObject* memo) -{ +static int deepcopy(PyObject** object, PyObject* memo) { PyObject* copy; copy = call( @@ -1986,9 +5521,7 @@ } #endif -static PyObject* -join_list(PyObject* list, PyObject* string) -{ +static PyObject* join_list(PyObject* list, PyObject* string) { /* join list elements */ PyObject* joiner; @@ -2034,14 +5567,13 @@ return result; } -static PyObject* -pattern_findall(PatternObject* self, PyObject* args, PyObject* kw) -{ +static PyObject* pattern_findall(PatternObject* self, PyObject* args, + PyObject* kw) { SRE_STATE state; PyObject* list; int status; Py_ssize_t i, b, e; - + SRE_CODE* pattern_code; PyObject* string; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; @@ -2050,7 +5582,9 @@ &string, &start, &end)) return NULL; - string = state_init(&state, self, string, start, end); + pattern_code = PatternObject_GetCode(self); + + string = state_init(&state, self, string, start, end, pattern_code); if (!string) return NULL; @@ -2061,23 +5595,22 @@ } while (state.start <= state.end) { - PyObject* item; state_reset(&state); - state.ptr = state.start; + state.ptr = state.reverse ? 
state.end : state.start; if (state.charsize == 1) { - status = sre_search(&state, PatternObject_GetCode(self)); + status = sre_bsearch(&state); } else { #if defined(HAVE_UNICODE) - status = sre_usearch(&state, PatternObject_GetCode(self)); + status = sre_usearch(&state); #endif } - if (PyErr_Occurred()) - goto error; + if (PyErr_Occurred()) + goto error; if (status <= 0) { if (status == 0) @@ -2089,8 +5622,13 @@ /* don't bother to build a match object */ switch (self->groups) { case 0: - b = STATE_OFFSET(&state, state.start); - e = STATE_OFFSET(&state, state.ptr); + if (state.reverse) { + b = STATE_OFFSET(&state, state.ptr); + e = STATE_OFFSET(&state, state.end); + } else { + b = STATE_OFFSET(&state, state.start); + e = STATE_OFFSET(&state, state.ptr); + } item = PySequence_GetSlice(string, b, e); if (!item) goto error; @@ -2105,7 +5643,7 @@ if (!item) goto error; for (i = 0; i < self->groups; i++) { - PyObject* o = state_getslice(&state, i+1, string, 1); + PyObject* o = state_getslice(&state, i + 1, string, 1); if (!o) { Py_DECREF(item); goto error; @@ -2120,11 +5658,15 @@ if (status < 0) goto error; - if (state.ptr == state.start) - state.start = (void*) ((char*) state.ptr + state.charsize); + /* + Continue the search from where we left off. Forbid another zero-width + match at the same start position. 
+ */ + if (state.reverse) + state.end = state.ptr; else state.start = state.ptr; - + state.reject_zero_width = TRUE; } state_fini(&state); @@ -2138,9 +5680,7 @@ } #if PY_VERSION_HEX >= 0x02020000 -static PyObject* -pattern_finditer(PatternObject* pattern, PyObject* args) -{ +static PyObject* pattern_finditer(PatternObject* pattern, PyObject* args) { PyObject* scanner; PyObject* search; PyObject* iterator; @@ -2161,16 +5701,17 @@ } #endif -static PyObject* -pattern_split(PatternObject* self, PyObject* args, PyObject* kw) -{ +static PyObject* pattern_split(PatternObject* self, PyObject* args, + PyObject* kw) { SRE_STATE state; PyObject* list; PyObject* item; int status; + SRE_CODE* pattern_code; Py_ssize_t n; Py_ssize_t i; void* last; + BOOL zero_width; PyObject* string; Py_ssize_t maxsplit = 0; @@ -2179,7 +5720,9 @@ &string, &maxsplit)) return NULL; - string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX); + pattern_code = PatternObject_GetCode(self); + + string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX, pattern_code); if (!string) return NULL; @@ -2189,46 +5732,69 @@ return NULL; } + zero_width = (self->flags & SRE_FLAG_ZEROWIDTH) != 0; + n = 0; - last = state.start; + + /* Where did the last match end? */ + last = state.reverse ? state.end : state.start; while (!maxsplit || n < maxsplit) { - state_reset(&state); - state.ptr = state.start; + /* Where should the search start? */ + state.ptr = state.reverse ? state.end : state.start; if (state.charsize == 1) { - status = sre_search(&state, PatternObject_GetCode(self)); + status = sre_bsearch(&state); } else { #if defined(HAVE_UNICODE) - status = sre_usearch(&state, PatternObject_GetCode(self)); + status = sre_usearch(&state); #endif } - if (PyErr_Occurred()) - goto error; + if (PyErr_Occurred()) + goto error; if (status <= 0) { + /* The search failed. 
*/ if (status == 0) break; pattern_error(status); goto error; } - if (state.start == state.ptr) { - if (last == state.end) - break; - /* skip one character */ - state.start = (void*) ((char*) state.ptr + state.charsize); - continue; - } - - /* get segment before this match */ - item = PySequence_GetSlice( - string, STATE_OFFSET(&state, last), - STATE_OFFSET(&state, state.start) - ); + if (state.reverse) { + /* Zero-width match? */ + if (state.ptr == state.end) { + /* Are we permitted to split on zero-width? */ + if (!zero_width) { + state.end = (void*) ((char*) state.ptr - state.charsize); + continue; + } + } + + /* get segment before this match */ + item = PySequence_GetSlice( + string, STATE_OFFSET(&state, state.end), + STATE_OFFSET(&state, last) + ); + } else { + /* Zero-width match? */ + if (state.ptr == state.start) { + /* Are we permitted to split on zero-width? */ + if (!zero_width) { + state.start = (void*) ((char*) state.ptr + state.charsize); + continue; + } + } + + /* get segment before this match */ + item = PySequence_GetSlice( + string, STATE_OFFSET(&state, last), + STATE_OFFSET(&state, state.start) + ); + } if (!item) goto error; status = PyList_Append(list, item); @@ -2238,7 +5804,7 @@ /* add groups (if any) */ for (i = 0; i < self->groups; i++) { - item = state_getslice(&state, i+1, string, 0); + item = state_getslice(&state, i + 1, string, 0); if (!item) goto error; status = PyList_Append(list, item); @@ -2249,14 +5815,52 @@ n = n + 1; - last = state.start = state.ptr; - + /* Remember where the search finished. */ + last = state.ptr; + + /* + Continue the search from where we left off. + + Legacy code won't split on a zero-width match; it'll simply ignore the + match, advance, and try again. + + Newer code with the ZEROWIDTH flag set can split on a zero-width match; + when it tries the next match it'll forbid another zero-width match at + the same start position. 
+ */ + if (state.reverse) { + if (zero_width) { + state.end = state.ptr; + state.reject_zero_width = TRUE; + } else { + if (state.ptr == state.end) + state.end = (void*) ((char*) state.ptr - state.charsize); + else + state.end = state.ptr; + } + } else { + if (zero_width) { + state.start = state.ptr; + state.reject_zero_width = TRUE; + } else { + if(state.ptr == state.start) + state.start = (void*) ((char*) state.ptr + state.charsize); + else + state.start = state.ptr; + } + } } /* get segment following last match (even if empty) */ - item = PySequence_GetSlice( - string, STATE_OFFSET(&state, last), state.endpos - ); + if (state.reverse) + item = PySequence_GetSlice( + string, state.pos, STATE_OFFSET(&state, last) + ); + else + item = PySequence_GetSlice( + string, STATE_OFFSET(&state, last), state.endpos + ); + if (!item) goto error; status = PyList_Append(list, item); @@ -2276,8 +5880,7 @@ static PyObject* pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string, - Py_ssize_t count, Py_ssize_t subn) -{ + Py_ssize_t count, Py_ssize_t subn) { SRE_STATE state; PyObject* list; PyObject* item; @@ -2287,9 +5890,11 @@ void* ptr; int status; Py_ssize_t n; - Py_ssize_t i, b, e; + Py_ssize_t b; int bint; int filter_is_callable; + SRE_CODE* pattern_code; + void* last; if (PyCallable_Check(ptemplate)) { /* sub/subn takes either a function or a template */ @@ -2303,10 +5908,10 @@ b = bint; if (ptr) { if (b == 1) { - literal = sre_literal_template((unsigned char *)ptr, n); + literal = sre_bliteral_template((unsigned char*)ptr, n); } else { #if defined(HAVE_UNICODE) - literal = sre_uliteral_template((Py_UNICODE *)ptr, n); + literal = sre_uliteral_template((Py_UNICODE*)ptr, n); #endif } } else { @@ -2329,7 +5934,9 @@ } } - string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX); + pattern_code = PatternObject_GetCode(self); + + string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX, pattern_code); if (!string) { Py_DECREF(filter); return NULL; @@ 
-2342,48 +5949,54 @@ return NULL; } - n = i = 0; + n = 0; + + /* Where did the last match end? */ + last = state.reverse ? state.end : state.start; while (!count || n < count) { - state_reset(&state); - state.ptr = state.start; + /* Where should the search start? */ + state.ptr = state.reverse ? state.end : state.start; if (state.charsize == 1) { - status = sre_search(&state, PatternObject_GetCode(self)); + status = sre_bsearch(&state); } else { #if defined(HAVE_UNICODE) - status = sre_usearch(&state, PatternObject_GetCode(self)); + status = sre_usearch(&state); #endif } - if (PyErr_Occurred()) - goto error; + if (PyErr_Occurred()) + goto error; if (status <= 0) { + /* The search failed. */ if (status == 0) break; pattern_error(status); goto error; } - b = STATE_OFFSET(&state, state.start); - e = STATE_OFFSET(&state, state.ptr); - - if (i < b) { - /* get segment before this match */ - item = PySequence_GetSlice(string, i, b); - if (!item) - goto error; - status = PyList_Append(list, item); - Py_DECREF(item); - if (status < 0) - goto error; - - } else if (i == b && i == e && n > 0) - /* ignore empty match on latest position */ - goto next; + /* get segment before this match */ + if (state.reverse) { + item = PySequence_GetSlice( + string, STATE_OFFSET(&state, state.end), + STATE_OFFSET(&state, last) + ); + } else { + item = PySequence_GetSlice( + string, STATE_OFFSET(&state, last), + STATE_OFFSET(&state, state.start) + ); + } + if (!item) + goto error; + status = PyList_Append(list, item); + Py_DECREF(item); + if (status < 0) + goto error; if (filter_is_callable) { /* pass match object through filter */ @@ -2414,28 +6027,35 @@ goto error; } - i = e; n = n + 1; -next: - /* move on */ - if (state.ptr == state.start) - state.start = (void*) ((char*) state.ptr + state.charsize); + /* Remember where the search finished. */ + last = state.ptr; + + /* + Continue the search from where we left off. Forbid another zero-width + match at the same start position. 
+ */ + if (state.reverse) + state.end = state.ptr; else state.start = state.ptr; - + state.reject_zero_width = TRUE; } /* get segment following last match */ - if (i < state.endpos) { - item = PySequence_GetSlice(string, i, state.endpos); - if (!item) - goto error; - status = PyList_Append(list, item); - Py_DECREF(item); - if (status < 0) - goto error; - } + if (state.reverse) + item = PySequence_GetSlice(string, state.pos, STATE_OFFSET(&state, + last)); + else + item = PySequence_GetSlice(string, STATE_OFFSET(&state, last), + state.endpos); + if (!item) + goto error; + status = PyList_Append(list, item); + Py_DECREF(item); + if (status < 0) + goto error; state_fini(&state); @@ -2460,9 +6080,8 @@ } -static PyObject* -pattern_sub(PatternObject* self, PyObject* args, PyObject* kw) -{ +static PyObject* pattern_sub(PatternObject* self, PyObject* args, + PyObject* kw) { PyObject* ptemplate; PyObject* string; Py_ssize_t count = 0; @@ -2474,9 +6093,8 @@ return pattern_subx(self, ptemplate, string, count, 0); } -static PyObject* -pattern_subn(PatternObject* self, PyObject* args, PyObject* kw) -{ +static PyObject* pattern_subn(PatternObject* self, PyObject* args, + PyObject* kw) { PyObject* ptemplate; PyObject* string; Py_ssize_t count = 0; @@ -2488,9 +6106,7 @@ return pattern_subx(self, ptemplate, string, count, 1); } -static PyObject* -pattern_copy(PatternObject* self, PyObject *unused) -{ +static PyObject* pattern_copy(PatternObject* self, PyObject *unused) { #ifdef USE_BUILTIN_COPY PatternObject* copy; int offset; @@ -2516,9 +6132,7 @@ #endif } -static PyObject* -pattern_deepcopy(PatternObject* self, PyObject* memo) -{ +static PyObject* pattern_deepcopy(PatternObject* self, PyObject* memo) { #ifdef USE_BUILTIN_COPY PatternObject* copy; @@ -2577,20 +6191,20 @@ static PyMethodDef pattern_methods[] = { {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS, - pattern_match_doc}, + pattern_match_doc}, {"search", (PyCFunction) pattern_search, 
METH_VARARGS|METH_KEYWORDS, - pattern_search_doc}, + pattern_search_doc}, {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS, - pattern_sub_doc}, + pattern_sub_doc}, {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS, - pattern_subn_doc}, + pattern_subn_doc}, {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS, - pattern_split_doc}, + pattern_split_doc}, {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS, - pattern_findall_doc}, + pattern_findall_doc}, #if PY_VERSION_HEX >= 0x02020000 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS, - pattern_finditer_doc}, + pattern_finditer_doc}, #endif {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS}, {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS}, @@ -2598,9 +6212,7 @@ {NULL, NULL} }; -static PyObject* -pattern_getattr(PatternObject* self, char* name) -{ +static PyObject* pattern_getattr(PatternObject* self, char* name) { PyObject* res; res = Py_FindMethod(pattern_methods, (PyObject*) self, name); @@ -2631,38 +6243,49 @@ return NULL; } -statichere PyTypeObject Pattern_Type = { +static int _validate(PatternObject *self); /* Forward reference. 
*/ + +static Py_ssize_t match_length(MatchObject* self) +{ + return self->groups; +} + +static PyObject* match_subscript(MatchObject* self, PyObject* group); + +static PyMappingMethods match_as_mapping = { + (lenfunc)match_length, /*mp_length*/ + (binaryfunc)match_subscript, /*mp_subscript*/ + 0, /*mp_ass_subscript*/ +}; + +static PyTypeObject Pattern_Type = { PyObject_HEAD_INIT(NULL) 0, "_" SRE_MODULE ".SRE_Pattern", sizeof(PatternObject), sizeof(SRE_CODE), (destructor)pattern_dealloc, /*tp_dealloc*/ - 0, /*tp_print*/ + 0, /*tp_print*/ (getattrfunc)pattern_getattr, /*tp_getattr*/ - 0, /* tp_setattr */ - 0, /* tp_compare */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - 0, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ - Py_TPFLAGS_HAVE_WEAKREFS, /* tp_flags */ - pattern_doc, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_HAVE_WEAKREFS, /* tp_flags */ + pattern_doc, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */ }; -static int _validate(PatternObject *self); /* Forward */ - -static PyObject * -_compile(PyObject* self_, PyObject* args) -{ +static PyObject* _compile(PyObject* self_, PyObject* args) { /* "compile" pattern descriptor to pattern object */ PatternObject* self; @@ -2763,454 +6386,652 @@ #define VTRACE(v) #endif -/* Report failure */ -#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0) - -/* Extract opcode, argument, or skip count from code array 
*/ -#define GET_OP \ - do { \ - VTRACE(("%p: ", code)); \ - if (code >= end) FAIL; \ - op = *code++; \ - VTRACE(("%lu (op)\n", (unsigned long)op)); \ - } while (0) -#define GET_ARG \ - do { \ - VTRACE(("%p= ", code)); \ - if (code >= end) FAIL; \ - arg = *code++; \ - VTRACE(("%lu (arg)\n", (unsigned long)arg)); \ - } while (0) -#define GET_SKIP_ADJ(adj) \ - do { \ - VTRACE(("%p= ", code)); \ - if (code >= end) FAIL; \ - skip = *code; \ - VTRACE(("%lu (skip to %p)\n", \ - (unsigned long)skip, code+skip)); \ - if (code+skip-adj < code || code+skip-adj > end)\ - FAIL; \ - code++; \ - } while (0) -#define GET_SKIP GET_SKIP_ADJ(0) - -static int -_validate_charset(SRE_CODE *code, SRE_CODE *end) -{ - /* Some variables are manipulated by the macros above */ - SRE_CODE op; - SRE_CODE arg; - SRE_CODE offset; - int i; - - while (code < end) { - GET_OP; - switch (op) { - - case SRE_OP_NEGATE: - break; - - case SRE_OP_LITERAL: - GET_ARG; - break; - - case SRE_OP_RANGE: - GET_ARG; - GET_ARG; - break; - - case SRE_OP_CHARSET: - offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */ - if (code+offset < code || code+offset > end) - FAIL; - code += offset; - break; - - case SRE_OP_BIGCHARSET: - GET_ARG; /* Number of blocks */ - offset = 256/sizeof(SRE_CODE); /* 256-byte table */ - if (code+offset < code || code+offset > end) - FAIL; - /* Make sure that each byte points to a valid block */ - for (i = 0; i < 256; i++) { - if (((unsigned char *)code)[i] >= arg) - FAIL; +/* The info for validating a pattern. */ +typedef struct SRE_Validation { + unsigned int numbered_mark_count; + unsigned int named_mark_count; + unsigned int min_numbered_mark; + unsigned int max_numbered_mark; + unsigned int min_named_mark; + unsigned int max_named_mark; + unsigned int group_ref_count; + unsigned int max_group_ref; +} SRE_Validation; + +/* + Validates a charset. Returns a pointer to the following op if valid or NULL if + invalid. 
+ + The charset might look valid yet extend off the end of the pattern; the caller + will check whether that's the case. +*/ +static SRE_CODE* validate_charset(SRE_CODE* charset) { + /* + The format of a charset is explained in in_charset(). + + charset[0] contains the maximum character code in the charset. + */ + Py_ssize_t hi_bytes = charset[0] / 256; + Py_ssize_t max_index = 0; + Py_ssize_t index; + + /* Check each of the chunk indexes. */ + for (index = 0; index <= hi_bytes; index ++) { + /* Get the chunk index (2 x 16-bit indexes in each 32-bit codeword). */ + Py_ssize_t i = (charset[1 + index / 2] >> ((index % 2) * 16)) & 0xFFFF; + + /* + If a chunk is identical to a previous one then its index is the same as + that one's. + + If a chunk is different from any previous ones then its index is 1+ the + previous maximum index. + + This is guaranteed. + */ + if (i > max_index + 1) + /* Definitely invalid. */ + return NULL; + if (i > max_index) + /* It's different from any previous ones. */ + max_index = i; + } + + /* + Return a pointer to the end of the charset. The number of chunk indexes + depends on the maximum character code of the charset. + */ + return charset + 2 + hi_bytes / 2 + (max_index + 1) * (256 / + SRE_BITS_PER_CODE); +} + +/* + Validates a set. Returns a pointer to the following op if valid or NULL if + invalid. +*/ +static SRE_CODE* validate_set(SRE_CODE* pattern, SRE_CODE* end_ptr) { + /* Check that the set doesn't extend off the end of the pattern. */ + SRE_CODE* set_end = pattern + pattern[0]; + if (pattern[0] < 1 || set_end > end_ptr) + return NULL; + + pattern++; + + do { + SRE_OpInfo* info_ptr; + + if (pattern[0] > SRE_MAX_OP) + /* Invalid opcode. */ + return NULL; + + /* Get the info about the opcode. 
*/ + info_ptr = &sre_op_info[pattern[0]]; + + switch (info_ptr->type) { + case SRE_TYPE_CATEGORY: + /* category */ + VTRACE(("%s\n", info_ptr->name)); + pattern += 2; + break; + case SRE_TYPE_CHARSET: + { + /* skip charset */ + /* + Check that the charset doesn't extend off the end of the pattern. + */ + SRE_CODE* end_charset = pattern + 1 + pattern[1]; + VTRACE(("%s\n", info_ptr->name)); + if (end_charset > end_ptr) + return NULL; + + pattern = validate_charset(pattern + 2); + if (pattern != end_charset) + return NULL; + break; + } + case SRE_TYPE_LITERAL: + /* code */ + VTRACE(("%s\n", info_ptr->name)); + pattern += 2; + break; + case SRE_TYPE_RANGE: + /* min max */ + /* The minimum shouldn't be greater than the maximum. */ + VTRACE(("%s\n", info_ptr->name)); + if (pattern[1] > pattern[2]) + return NULL; + + pattern += 3; + break; + default: + /* Unknown opcode type. */ + VTRACE(("UNKNOWN\n")); + return NULL; + } + } while (pattern < set_end); + + return pattern > set_end ? NULL : pattern; +} + +/* + Validates a single-character op. Returns a pointer to the following op if valid + or NULL if invalid. +*/ +static SRE_CODE* validate_one_pattern(SRE_CODE* pattern, SRE_CODE* end_ptr, + int* direction) { + SRE_OpInfo* info_ptr; + + if (pattern[0] > SRE_MAX_OP) + /* Invalid opcode. */ + return NULL; + + /* Get the info about the opcode. */ + info_ptr = &sre_op_info[pattern[0]]; + + /* + Is the direction correct? We'll reject a forwards opcode when the current + direction is backwards, and vice versa. + */ + if (*direction != 0 && *direction != info_ptr->direction) + return NULL; + + switch (info_ptr->type) { + case SRE_TYPE_CATEGORY: + /* category */ + VTRACE(("%s\n", info_ptr->name)); + pattern += 2; + break; + case SRE_TYPE_CHARSET: + { + /* skip charset */ + /* + Check that the charset doesn't extend off the end of the pattern. 
+ */ + SRE_CODE* end_charset = pattern + 1 + pattern[1]; + VTRACE(("%s\n", info_ptr->name)); + if (end_charset > end_ptr) + return NULL; + + pattern = validate_charset(pattern + 2); + if (pattern != end_charset) + return NULL; + break; + } + case SRE_TYPE_LITERAL: + /* code */ + VTRACE(("%s\n", info_ptr->name)); + pattern += 2; + break; + case SRE_TYPE_RANGE: + /* min max */ + /* The minimum shouldn't be greater than the maximum. */ + VTRACE(("%s\n", info_ptr->name)); + if (pattern[1] > pattern[2]) + return NULL; + + pattern += 3; + break; + case SRE_TYPE_SET: + /* set */ + VTRACE(("%s\n", info_ptr->name)); + pattern = validate_set(pattern + 1, end_ptr); + if (pattern == NULL) + return NULL; + break; + case SRE_TYPE_SIMPLE_CATEGORY: + /* */ + VTRACE(("%s\n", info_ptr->name)); + pattern++; + break; + default: + /* Unknown opcode type. */ + return NULL; + } + + if (pattern > end_ptr) + return NULL; + + /* Set the current direction. */ + *direction = info_ptr->direction; + + return pattern; +} + +/* + Validates a subpattern. Returns a pointer to the following op if valid or NULL + if invalid. +*/ +static SRE_CODE* validate_subpattern(SRE_CODE* pattern, SRE_CODE* end_ptr, + int* direction, SRE_Validation* validation) { + /* The current direction (forwards/backwards). */ + int dir = *direction; + + while (pattern < end_ptr) { + SRE_OpInfo* info_ptr; + + VTRACE(("op %d\n", pattern[0])); + if (pattern[0] > SRE_MAX_OP) + /* Invalid opcode. */ + return NULL; + + /* Get the info about the opcode. */ + info_ptr = &sre_op_info[pattern[0]]; + VTRACE(("type %d\n", info_ptr->type)); + + /* + Is the direction correct? We'll reject a forwards opcode when the + current direction is backwards, and vice versa. + */ + if (dir != 0 && info_ptr->direction != 0 && dir != info_ptr->direction) { + VTRACE(("wrong direction\n")); + return NULL; + } + + switch (info_ptr->type) { + case SRE_TYPE_ASSERT: + { + /* ... 
*/ + SRE_CODE* tail_ptr = pattern + 1 + pattern[1]; + int subdir = 0; + VTRACE(("%s\n", info_ptr->name)); + /* + Validate the parameters. + + We also check that the 'skip' points to the assert's end marker. + */ + if (pattern[1] < 2 || tail_ptr > end_ptr || tail_ptr[-1] != + info_ptr->end_marker) + return NULL; + + /* + Validate the subpattern within the assert and check that it ends + in the right place. + */ + if (validate_subpattern(pattern + 2, tail_ptr - 1, &subdir, + validation) != tail_ptr - 1) + return NULL; + + pattern = tail_ptr; + break; + } + case SRE_TYPE_ATOMIC: + { + /* ... */ + /* + Validate the subpattern within the atomic group. + + The call should return a pointer to the END_ATOMIC, which it + doesn't understand. + */ + SRE_CODE* ptr; + VTRACE(("%s\n", info_ptr->name)); + ptr = validate_subpattern(pattern + 1, end_ptr, &dir, validation); + if (ptr == NULL || ptr >= end_ptr || ptr[0] != info_ptr->end_marker) + return NULL; + + pattern = ptr + 1; + break; + } + case SRE_TYPE_BRANCH: + { + /* + + + ... + + + ... + + 0 + */ + /* All the jumps should end in the same place. */ + SRE_CODE* skip_end_ptr = NULL; + VTRACE(("%s\n", info_ptr->name)); + + pattern++; + + do { + SRE_CODE* next_ptr = pattern + pattern[0]; + SRE_CODE* ptr; + /* The offset to the next alternative's offset. */ + if (pattern[0] < 3 || next_ptr >= end_ptr) + return NULL; + + /* Validate this alternative, which stops at the jump. */ + ptr = validate_subpattern(pattern + 1, next_ptr - 2, &dir, + validation); + if (ptr != next_ptr - 2 || ptr[0] != SRE_OP_JUMP || ptr[1] < 1) + return NULL; + + /* The jump to the end. 
*/ + ptr += 1 + ptr[1]; + if (skip_end_ptr == NULL) + skip_end_ptr = ptr; + else if (ptr != skip_end_ptr) + return NULL; + + pattern = next_ptr; + } while (pattern[0] != 0); + pattern++; + break; + } + case SRE_TYPE_CATEGORY: + /* category */ + VTRACE(("%s\n", info_ptr->name)); + pattern += 2; + break; + case SRE_TYPE_CHARSET: + { + /* skip charset */ + /* Point to the end of the charset. */ + SRE_CODE* end_charset = pattern + 1 + pattern[1]; + VTRACE(("%s\n", info_ptr->name)); + if (end_charset > end_ptr) + return NULL; + + /* Validate the charset. */ + pattern = validate_charset(pattern + 2); + if (pattern != end_charset) + return NULL; + break; + } + case SRE_TYPE_GROUPREF: + /* group_id */ + VTRACE(("%s\n", info_ptr->name)); + validation->group_ref_count++; + validation->max_group_ref = unsigned_max(validation->max_group_ref, + pattern[1]); + pattern += 2; + break; + case SRE_TYPE_GROUPREF_EXISTS: + { + /* + group_id + code_yes + + code_no + */ + SRE_CODE* skip_ptr = pattern + 1 + pattern[2]; + SRE_CODE* ptr; + VTRACE(("%s\n", info_ptr->name)); + /* Locate code_no. */ + if (pattern[2] < 2 || skip_ptr > end_ptr) + return NULL; + + /* code_yes lies between the 'skip' and code_no. */ + ptr = validate_subpattern(pattern + 3, skip_ptr, &dir, validation); + + /* + 'ptr' will point after code_yes and at the jump, if present. + + (The jump will have been rejected by the call.) + */ + /* Validate code_no, if present. */ + if (ptr == skip_ptr - 2) { + if (ptr[0] != SRE_OP_JUMP || ptr[1] < 1) + return NULL; + + skip_ptr = ptr + 1 + ptr[1]; + if (skip_ptr > end_ptr) + return NULL; + + /* + code_yes lies between the 'skip' and the end of the subpattern. 
+ */ + ptr = validate_subpattern(ptr + 2, skip_ptr, &dir, validation); + if (ptr < skip_ptr) + return NULL; + } else if (ptr != skip_ptr) + return NULL; + + validation->group_ref_count++; + validation->max_group_ref = unsigned_max(validation->max_group_ref, + pattern[1]); + pattern = skip_ptr; + break; + } + case SRE_TYPE_LITERAL: + /* code */ + VTRACE(("%s\n", info_ptr->name)); + pattern += 2; + break; + case SRE_TYPE_LITERAL_STRING: + /* length ... */ + VTRACE(("%s\n", info_ptr->name)); + if (pattern[1] == 0) + return NULL; + pattern += 2 + pattern[1]; + break; + case SRE_TYPE_MARK: + /* */ + /* + The the capture groups are numbered. Some also have names. + + The name ids are all higher than the number ids. + */ + VTRACE(("%s\n", info_ptr->name)); + if (pattern[1] > pattern[2]) + /* Number id not higher than name id. */ + return NULL; + + /* Found another mark. */ + validation->numbered_mark_count++; + + /* The highest number id. */ + validation->min_numbered_mark = + unsigned_min(validation->min_numbered_mark, pattern[1]); + validation->max_numbered_mark = + unsigned_max(validation->max_numbered_mark, pattern[1]); + + if (pattern[2] > pattern[1]) { + /* The mark has a name id (it's higher then the number id). */ + validation->named_mark_count++; + + /* The highest name id. 
*/ + validation->min_named_mark = + unsigned_min(validation->min_named_mark, pattern[2]); + validation->max_named_mark = + unsigned_max(validation->max_named_mark, pattern[2]); } - code += offset; - offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */ - if (code+offset < code || code+offset > end) - FAIL; - code += offset; - break; - - case SRE_OP_CATEGORY: - GET_ARG; - switch (arg) { - case SRE_CATEGORY_DIGIT: - case SRE_CATEGORY_NOT_DIGIT: - case SRE_CATEGORY_SPACE: - case SRE_CATEGORY_NOT_SPACE: - case SRE_CATEGORY_WORD: - case SRE_CATEGORY_NOT_WORD: - case SRE_CATEGORY_LINEBREAK: - case SRE_CATEGORY_NOT_LINEBREAK: - case SRE_CATEGORY_LOC_WORD: - case SRE_CATEGORY_LOC_NOT_WORD: - case SRE_CATEGORY_UNI_DIGIT: - case SRE_CATEGORY_UNI_NOT_DIGIT: - case SRE_CATEGORY_UNI_SPACE: - case SRE_CATEGORY_UNI_NOT_SPACE: - case SRE_CATEGORY_UNI_WORD: - case SRE_CATEGORY_UNI_NOT_WORD: - case SRE_CATEGORY_UNI_LINEBREAK: - case SRE_CATEGORY_UNI_NOT_LINEBREAK: - break; - default: - FAIL; - } - break; - + pattern += 3; + break; + case SRE_TYPE_POSITION: + /* */ + VTRACE(("%s\n", info_ptr->name)); + pattern++; + break; + case SRE_TYPE_RANGE: + /* min max */ + /* The minimum shouldn't be greater than the maximum. */ + VTRACE(("%s\n", info_ptr->name)); + if (pattern[1] > pattern[2]) + return NULL; + + pattern += 3; + break; + case SRE_TYPE_REPEAT: + { + /* + + ... + + */ + SRE_CODE* skip_end_ptr; + VTRACE(("%s\n", info_ptr->name)); + /* Validate the parameters. */ + if (pattern[1] < 4 || pattern[2] > pattern[3]) + return NULL; + + /* Check that the 'skip' points to the repeat's end marker. */ + skip_end_ptr = pattern + pattern[1]; + if (skip_end_ptr + 2 > end_ptr || skip_end_ptr[0] != + info_ptr->end_marker || skip_end_ptr[1] != pattern[1]) + return NULL; + + /* Validate the subpattern within the repeat. 
*/ + if (validate_subpattern(pattern + 4, skip_end_ptr, &dir, + validation) != skip_end_ptr) + return NULL; + + pattern = skip_end_ptr + 2; + break; + } + case SRE_TYPE_REPEAT_ONE: + { + /* ... */ + SRE_CODE* tail_ptr; + VTRACE(("%s\n", info_ptr->name)); + /* Validate the parameters. */ + if (pattern[1] < 4 || pattern[2] > pattern[3]) + return NULL; + + /* + Check that the repeat doesn't extend off the end of the + pattern. + */ + tail_ptr = pattern + 1 + pattern[1]; + if (tail_ptr > end_ptr) + return NULL; + + /* Validate the opcode within the repeat. */ + if (validate_one_pattern(pattern + 4, tail_ptr, &dir) != tail_ptr) + return NULL; + pattern = tail_ptr; + break; + } + case SRE_TYPE_SET: + /* set */ + /* Validate the set. */ + VTRACE(("%s\n", info_ptr->name)); + pattern = validate_set(pattern + 1, end_ptr); + if (pattern == NULL) + return NULL; + break; + case SRE_TYPE_SIMPLE_CATEGORY: + /* */ + VTRACE(("%s\n", info_ptr->name)); + pattern++; + break; default: - FAIL; - - } - } - + /* Anything else might be meaningful to the caller. */ + *direction = dir; + return pattern; + } + + /* Record the direction. */ + if (info_ptr->direction != 0) + dir = info_ptr->direction; + } + + *direction = dir; + + return pattern > end_ptr ? NULL : pattern; +} + +/* Validates the pattern. */ +static int _validate(PatternObject* self) { + SRE_Validation validation; + int direction = 0; + SRE_CODE* end_ptr = self->code + self->codesize; + + /* Initialise the valdiation info. */ + validation.numbered_mark_count = 0; + validation.min_numbered_mark = ~(unsigned int)0; + validation.max_numbered_mark = 0; + validation.named_mark_count = 0; + validation.min_named_mark = ~(unsigned int)0; + validation.max_named_mark = 0; + validation.group_ref_count = 0; + validation.max_group_ref = 0; + + /* + _validate_subpattern will return a pointer to the first op it doesn't + understand or NULL if the pattern is invalid. 
+ + It doesn't understand SRE_OP_SUCCESS (which occurs only at the end of the + pattern), so the result should be a pointer to that. + */ + if (self->codesize < 1 || end_ptr[-1] != SRE_OP_SUCCESS || + validate_subpattern(self->code, end_ptr, &direction, &validation) != + end_ptr - 1) + goto error; + + /* There should be an even number of marks (start and end of a group). */ + if (validation.numbered_mark_count % 2 != 0 || + validation.named_mark_count % 2 != 0) + goto error; + + /* + The numbered marks should be in the range 0 .. numbered_mark_count - 1. + + Note that it's possible for several marks to have the same number, + so we might need to correct numbered_mark_count. + */ + if (validation.numbered_mark_count > 0) { + if (validation.min_numbered_mark > 0 || + validation.max_numbered_mark >= validation.numbered_mark_count) + goto error; + + validation.numbered_mark_count = validation.max_numbered_mark + 1; + } + + /* + All the named marks should be in the range numbered_mark_count .. + numbered_mark_count + named_mark_count - 1. + + Note that it's possible for several marks to have the same number. + */ + if (validation.named_mark_count > 0) { + if (validation.min_named_mark != validation.numbered_mark_count || + validation.max_named_mark >= validation.min_named_mark + + validation.named_mark_count) + goto error; + + validation.named_mark_count = validation.max_named_mark - + validation.max_numbered_mark; + } + + /* + All the group refs should be in the range 0 .. numbered_mark_count + + named_mark_count - 1. + */ + if (validation.group_ref_count > 0 && validation.max_group_ref * 2 >= + validation.numbered_mark_count + validation.named_mark_count) + goto error; + + /* Calculate the number of capture groups. */ + self->groups = validation.numbered_mark_count / 2; + + /* Calculate the number of capture groups + named capture groups. 
*/ + self->internal_groups = (validation.numbered_mark_count + + validation.named_mark_count) / 2; + + VTRACE(("Success!\n")); return 1; -} - -static int -_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) -{ - /* Some variables are manipulated by the macros above */ - SRE_CODE op; - SRE_CODE arg; - SRE_CODE skip; - - VTRACE(("code=%p, end=%p\n", code, end)); - - if (code > end) - FAIL; - - while (code < end) { - GET_OP; - switch (op) { - - case SRE_OP_MARK: - /* We don't check whether marks are properly nested; the - sre_match() code is robust even if they don't, and the worst - you can get is nonsensical match results. */ - GET_ARG; - if (arg > 2*groups+1) { - VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups)); - FAIL; - } - break; - - case SRE_OP_LITERAL: - case SRE_OP_NOT_LITERAL: - case SRE_OP_LITERAL_IGNORE: - case SRE_OP_NOT_LITERAL_IGNORE: - GET_ARG; - /* The arg is just a character, nothing to check */ - break; - - case SRE_OP_SUCCESS: - case SRE_OP_FAILURE: - /* Nothing to check; these normally end the matching process */ - break; - - case SRE_OP_AT: - GET_ARG; - switch (arg) { - case SRE_AT_BEGINNING: - case SRE_AT_BEGINNING_STRING: - case SRE_AT_BEGINNING_LINE: - case SRE_AT_END: - case SRE_AT_END_LINE: - case SRE_AT_END_STRING: - case SRE_AT_BOUNDARY: - case SRE_AT_NON_BOUNDARY: - case SRE_AT_LOC_BOUNDARY: - case SRE_AT_LOC_NON_BOUNDARY: - case SRE_AT_UNI_BOUNDARY: - case SRE_AT_UNI_NON_BOUNDARY: - break; - default: - FAIL; - } - break; - - case SRE_OP_ANY: - case SRE_OP_ANY_ALL: - /* These have no operands */ - break; - - case SRE_OP_IN: - case SRE_OP_IN_IGNORE: - GET_SKIP; - /* Stop 1 before the end; we check the FAILURE below */ - if (!_validate_charset(code, code+skip-2)) - FAIL; - if (code[skip-2] != SRE_OP_FAILURE) - FAIL; - code += skip-1; - break; - - case SRE_OP_INFO: - { - /* A minimal info field is - <1=skip> <2=flags> <3=min> <4=max>; - If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags, - more follows. 
*/ - SRE_CODE flags, min, max, i; - SRE_CODE *newcode; - GET_SKIP; - newcode = code+skip-1; - GET_ARG; flags = arg; - GET_ARG; min = arg; - GET_ARG; max = arg; - /* Check that only valid flags are present */ - if ((flags & ~(SRE_INFO_PREFIX | - SRE_INFO_LITERAL | - SRE_INFO_CHARSET)) != 0) - FAIL; - /* PREFIX and CHARSET are mutually exclusive */ - if ((flags & SRE_INFO_PREFIX) && - (flags & SRE_INFO_CHARSET)) - FAIL; - /* LITERAL implies PREFIX */ - if ((flags & SRE_INFO_LITERAL) && - !(flags & SRE_INFO_PREFIX)) - FAIL; - /* Validate the prefix */ - if (flags & SRE_INFO_PREFIX) { - SRE_CODE prefix_len, prefix_skip; - GET_ARG; prefix_len = arg; - GET_ARG; prefix_skip = arg; - /* Here comes the prefix string */ - if (code+prefix_len < code || code+prefix_len > newcode) - FAIL; - code += prefix_len; - /* And here comes the overlap table */ - if (code+prefix_len < code || code+prefix_len > newcode) - FAIL; - /* Each overlap value should be < prefix_len */ - for (i = 0; i < prefix_len; i++) { - if (code[i] >= prefix_len) - FAIL; - } - code += prefix_len; - } - /* Validate the charset */ - if (flags & SRE_INFO_CHARSET) { - if (!_validate_charset(code, newcode-1)) - FAIL; - if (newcode[-1] != SRE_OP_FAILURE) - FAIL; - code = newcode; - } - else if (code != newcode) { - VTRACE(("code=%p, newcode=%p\n", code, newcode)); - FAIL; - } - } - break; - - case SRE_OP_BRANCH: - { - SRE_CODE *target = NULL; - for (;;) { - GET_SKIP; - if (skip == 0) - break; - /* Stop 2 before the end; we check the JUMP below */ - if (!_validate_inner(code, code+skip-3, groups)) - FAIL; - code += skip-3; - /* Check that it ends with a JUMP, and that each JUMP - has the same target */ - GET_OP; - if (op != SRE_OP_JUMP) - FAIL; - GET_SKIP; - if (target == NULL) - target = code+skip-1; - else if (code+skip-1 != target) - FAIL; - } - } - break; - - case SRE_OP_REPEAT_ONE: - case SRE_OP_MIN_REPEAT_ONE: - { - SRE_CODE min, max; - GET_SKIP; - GET_ARG; min = arg; - GET_ARG; max = arg; - if (min > max) - 
FAIL; -#ifdef Py_UNICODE_WIDE - if (max > 65535) - FAIL; -#endif - if (!_validate_inner(code, code+skip-4, groups)) - FAIL; - code += skip-4; - GET_OP; - if (op != SRE_OP_SUCCESS) - FAIL; - } - break; - - case SRE_OP_REPEAT: - { - SRE_CODE min, max; - GET_SKIP; - GET_ARG; min = arg; - GET_ARG; max = arg; - if (min > max) - FAIL; -#ifdef Py_UNICODE_WIDE - if (max > 65535) - FAIL; -#endif - if (!_validate_inner(code, code+skip-3, groups)) - FAIL; - code += skip-3; - GET_OP; - if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL) - FAIL; - } - break; - - case SRE_OP_GROUPREF: - case SRE_OP_GROUPREF_IGNORE: - GET_ARG; - if (arg >= groups) - FAIL; - break; - - case SRE_OP_GROUPREF_EXISTS: - /* The regex syntax for this is: '(?(group)then|else)', where - 'group' is either an integer group number or a group name, - 'then' and 'else' are sub-regexes, and 'else' is optional. */ - GET_ARG; - if (arg >= groups) - FAIL; - GET_SKIP_ADJ(1); - code--; /* The skip is relative to the first arg! */ - /* There are two possibilities here: if there is both a 'then' - part and an 'else' part, the generated code looks like: - - GROUPREF_EXISTS - - - ...then part... - JUMP - - ( jumps here) - ...else part... - ( jumps here) - - If there is only a 'then' part, it looks like: - - GROUPREF_EXISTS - - - ...then part... - ( jumps here) - - There is no direct way to decide which it is, and we don't want - to allow arbitrary jumps anywhere in the code; so we just look - for a JUMP opcode preceding our skip target. 
- */ - if (skip >= 3 && code+skip-3 >= code && - code[skip-3] == SRE_OP_JUMP) - { - VTRACE(("both then and else parts present\n")); - if (!_validate_inner(code+1, code+skip-3, groups)) - FAIL; - code += skip-2; /* Position after JUMP, at */ - GET_SKIP; - if (!_validate_inner(code, code+skip-1, groups)) - FAIL; - code += skip-1; - } - else { - VTRACE(("only a then part present\n")); - if (!_validate_inner(code+1, code+skip-1, groups)) - FAIL; - code += skip-1; - } - break; - - case SRE_OP_ASSERT: - case SRE_OP_ASSERT_NOT: - GET_SKIP; - GET_ARG; /* 0 for lookahead, width for lookbehind */ - code--; /* Back up over arg to simplify math below */ - if (arg & 0x80000000) - FAIL; /* Width too large */ - /* Stop 1 before the end; we check the SUCCESS below */ - if (!_validate_inner(code+1, code+skip-2, groups)) - FAIL; - code += skip-2; - GET_OP; - if (op != SRE_OP_SUCCESS) - FAIL; - break; - - default: - FAIL; - - } - } - - VTRACE(("okay\n")); - return 1; -} - -static int -_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) -{ - if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS) - FAIL; - if (groups == 0) /* fix for simplejson */ - groups = 100; /* 100 groups should always be safe */ - return _validate_inner(code, end-1, groups); -} - -static int -_validate(PatternObject *self) -{ - if (!_validate_outer(self->code, self->code+self->codesize, self->groups)) - { - PyErr_SetString(PyExc_RuntimeError, "invalid SRE code"); - return 0; - } - else - VTRACE(("Success!\n")); - return 1; + +error: + PyErr_SetString(PyExc_RuntimeError, "invalid SRE code"); + return 0; } /* -------------------------------------------------------------------- */ /* match methods */ -static void -match_dealloc(MatchObject* self) -{ +static void match_dealloc(MatchObject* self) { Py_XDECREF(self->regs); Py_XDECREF(self->string); Py_DECREF(self->pattern); PyObject_DEL(self); } -static PyObject* -match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* 
def) -{ - if (index < 0 || index >= self->groups) { +static PyObject* match_getslice_by_index(MatchObject* self, Py_ssize_t index, + PyObject* def, BOOL include_internal) { + /* + Internally we can access the named groups by their name id; externally we + can't. + */ + Py_ssize_t groups = include_internal ? self->internal_groups : self->groups; + if (index < 0 || index >= groups) { /* raise IndexError if we were given a bad group number */ PyErr_SetString( PyExc_IndexError, @@ -3228,17 +7049,25 @@ } return PySequence_GetSlice( - self->string, self->mark[index], self->mark[index+1] + self->string, self->mark[index], self->mark[index + 1] ); } -static Py_ssize_t -match_getindex(MatchObject* self, PyObject* index) -{ +static Py_ssize_t match_getindex(MatchObject* self, PyObject* index, + BOOL include_internal) { Py_ssize_t i; if (PyInt_Check(index)) - return PyInt_AsSsize_t(index); + { + /* + Internally we can access the named groups by their name id; externally + we can't. + */ + Py_ssize_t groups = include_internal ? self->internal_groups : + self->groups; + i = PyInt_AsSsize_t(index); + return i >= groups ? 
-1 : i; + } i = -1; @@ -3255,15 +7084,13 @@ return i; } -static PyObject* -match_getslice(MatchObject* self, PyObject* index, PyObject* def) -{ - return match_getslice_by_index(self, match_getindex(self, index), def); -} - -static PyObject* -match_expand(MatchObject* self, PyObject* ptemplate) -{ +static PyObject* match_getslice(MatchObject* self, PyObject* index, + PyObject* def, BOOL include_internal) { + return match_getslice_by_index(self, match_getindex(self, index, + include_internal), def, TRUE); +} + +static PyObject* match_expand(MatchObject* self, PyObject* ptemplate) { /* delegate to Python code */ return call( SRE_PY_MODULE, "_expand", @@ -3271,9 +7098,8 @@ ); } -static PyObject* -match_group(MatchObject* self, PyObject* args) -{ +static PyObject* sre_get_match_group(MatchObject* self, PyObject* args, + BOOL include_internal) { PyObject* result; Py_ssize_t i, size; @@ -3281,10 +7107,11 @@ switch (size) { case 0: - result = match_getslice(self, Py_False, Py_None); + result = match_getslice(self, Py_False, Py_None, include_internal); break; case 1: - result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None); + result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None, + include_internal); break; default: /* fetch multiple items */ @@ -3293,7 +7120,7 @@ return NULL; for (i = 0; i < size; i++) { PyObject* item = match_getslice( - self, PyTuple_GET_ITEM(args, i), Py_None + self, PyTuple_GET_ITEM(args, i), Py_None, include_internal ); if (!item) { Py_DECREF(result); @@ -3306,9 +7133,15 @@ return result; } -static PyObject* -match_groups(MatchObject* self, PyObject* args, PyObject* kw) -{ +static PyObject* match_group(MatchObject* self, PyObject* args) { + return sre_get_match_group(self, args, FALSE); +} + +static PyObject* match_internal_group(MatchObject* self, PyObject* args) { + return sre_get_match_group(self, args, TRUE); +} + +static PyObject* match_groups(MatchObject* self, PyObject* args, PyObject* kw) { PyObject* result; Py_ssize_t 
index; @@ -3317,26 +7150,25 @@ if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def)) return NULL; - result = PyTuple_New(self->groups-1); + result = PyTuple_New(self->groups - 1); if (!result) return NULL; for (index = 1; index < self->groups; index++) { PyObject* item; - item = match_getslice_by_index(self, index, def); + item = match_getslice_by_index(self, index, def, FALSE); if (!item) { Py_DECREF(result); return NULL; } - PyTuple_SET_ITEM(result, index-1, item); + PyTuple_SET_ITEM(result, index - 1, item); } return result; } -static PyObject* -match_groupdict(MatchObject* self, PyObject* args, PyObject* kw) -{ +static PyObject* match_groupdict(MatchObject* self, PyObject* args, + PyObject* kw) { PyObject* result; PyObject* keys; Py_ssize_t index; @@ -3361,7 +7193,7 @@ key = PyList_GET_ITEM(keys, index); if (!key) goto failed; - value = match_getslice(self, key, def); + value = match_getslice(self, key, def, FALSE); if (!value) { Py_DECREF(key); goto failed; @@ -3382,18 +7214,16 @@ return NULL; } -static PyObject* -match_start(MatchObject* self, PyObject* args) -{ +static PyObject* match_start(MatchObject* self, PyObject* args) { Py_ssize_t index; PyObject* index_ = Py_False; /* zero */ if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_)) return NULL; - index = match_getindex(self, index_); - - if (index < 0 || index >= self->groups) { + index = match_getindex(self, index_, FALSE); + + if (index < 0 || index >= self->internal_groups) { PyErr_SetString( PyExc_IndexError, "no such group" @@ -3402,21 +7232,19 @@ } /* mark is -1 if group is undefined */ - return Py_BuildValue("i", self->mark[index*2]); -} - -static PyObject* -match_end(MatchObject* self, PyObject* args) -{ + return Py_BuildValue("i", self->mark[index * 2]); +} + +static PyObject* match_end(MatchObject* self, PyObject* args) { Py_ssize_t index; PyObject* index_ = Py_False; /* zero */ if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_)) return NULL; - index = match_getindex(self, 
index_); - - if (index < 0 || index >= self->groups) { + index = match_getindex(self, index_, FALSE); + + if (index < 0 || index >= self->internal_groups) { PyErr_SetString( PyExc_IndexError, "no such group" @@ -3425,12 +7253,10 @@ } /* mark is -1 if group is undefined */ - return Py_BuildValue("i", self->mark[index*2+1]); -} - -LOCAL(PyObject*) -_pair(Py_ssize_t i1, Py_ssize_t i2) -{ + return Py_BuildValue("i", self->mark[index * 2 + 1]); +} + +LOCAL(PyObject*) _pair(Py_ssize_t i1, Py_ssize_t i2) { PyObject* pair; PyObject* item; @@ -3450,23 +7276,21 @@ return pair; - error: +error: Py_DECREF(pair); return NULL; } -static PyObject* -match_span(MatchObject* self, PyObject* args) -{ +static PyObject* match_span(MatchObject* self, PyObject* args) { Py_ssize_t index; PyObject* index_ = Py_False; /* zero */ if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_)) return NULL; - index = match_getindex(self, index_); - - if (index < 0 || index >= self->groups) { + index = match_getindex(self, index_, FALSE); + + if (index < 0 || index >= self->internal_groups) { PyErr_SetString( PyExc_IndexError, "no such group" @@ -3475,12 +7299,10 @@ } /* marks are -1 if group is undefined */ - return _pair(self->mark[index*2], self->mark[index*2+1]); -} - -static PyObject* -match_regs(MatchObject* self) -{ + return _pair(self->mark[index * 2], self->mark[index * 2 + 1]); +} + +static PyObject* match_regs(MatchObject* self) { PyObject* regs; PyObject* item; Py_ssize_t index; @@ -3490,7 +7312,7 @@ return NULL; for (index = 0; index < self->groups; index++) { - item = _pair(self->mark[index*2], self->mark[index*2+1]); + item = _pair(self->mark[index * 2], self->mark[index * 2 + 1]); if (!item) { Py_DECREF(regs); return NULL; @@ -3504,14 +7326,12 @@ return regs; } -static PyObject* -match_copy(MatchObject* self, PyObject *unused) -{ +static PyObject* match_copy(MatchObject* self, PyObject* unused) { #ifdef USE_BUILTIN_COPY MatchObject* copy; Py_ssize_t slots, offset; - slots = 2 * 
(self->pattern->groups+1); + slots = 2 * (self->pattern->groups + 1); copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots); if (!copy) @@ -3525,23 +7345,21 @@ Py_XINCREF(self->string); Py_XINCREF(self->regs); - memcpy((char*) copy + offset, (char*) self + offset, + memcpy((char*)copy + offset, (char*)self + offset, sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset); - return (PyObject*) copy; + return (PyObject*)copy; #else PyErr_SetString(PyExc_TypeError, "cannot copy this match object"); return NULL; #endif } -static PyObject* -match_deepcopy(MatchObject* self, PyObject* memo) -{ +static PyObject* match_deepcopy(MatchObject* self, PyObject* memo) { #ifdef USE_BUILTIN_COPY MatchObject* copy; - copy = (MatchObject*) match_copy(self); + copy = (MatchObject*)match_copy(self); if (!copy) return NULL; @@ -3558,7 +7376,53 @@ #endif } +static PyObject* match_subscript(MatchObject* self, PyObject* item) { + if (PyIndex_Check(item) || PyString_Check(item) || PyUnicode_Check(item)) + /* integer or string subscript */ + return match_getslice(self, item, Py_None, FALSE); + else if (PySlice_Check(item)) { + /* slice subscript */ + Py_ssize_t start, stop, step, slicelength; + PyTupleObject *result; + Py_ssize_t from, to; + + /* get the slice info */ + if (PySlice_GetIndicesEx((PySliceObject*)item, self->groups, + &start, &stop, &step, &slicelength) < 0) { + return NULL; + } + + /* empty slice? 
*/ + if (slicelength <= 0) + return PyTuple_New(0); + + /* create the result tuple */ + result = (PyTupleObject *)PyTuple_New(slicelength); + if (result== NULL) + return NULL; + + /* get the captures */ + for (from = start, to = 0; from >= 0 && from < self->groups; from += step, to++) { + PyObject* item = match_getslice_by_index(self, from, Py_None, TRUE); + if (item == NULL) { + Py_DECREF(result); + return NULL; + } + PyTuple_SET_ITEM(result, to, item); + } + + return (PyObject*)result; + } else { + /* invalid subscript type */ + PyErr_Format(PyExc_TypeError, + "match indices must be integers or strings, not %.200s", + item->ob_type->tp_name); + return NULL; + } +} + static PyMethodDef match_methods[] = { + {"__getitem__", (PyCFunction)match_subscript, METH_O|METH_COEXIST}, {"group", (PyCFunction) match_group, METH_VARARGS}, {"start", (PyCFunction) match_start, METH_VARARGS}, {"end", (PyCFunction) match_end, METH_VARARGS}, @@ -3568,15 +7432,14 @@ {"expand", (PyCFunction) match_expand, METH_O}, {"__copy__", (PyCFunction) match_copy, METH_NOARGS}, {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O}, + {"_internal_group", (PyCFunction) match_internal_group, METH_VARARGS}, {NULL, NULL} }; -static PyObject* -match_getattr(MatchObject* self, char* name) -{ +static PyObject* match_getattr(MatchObject* self, char* name) { PyObject* res; - res = Py_FindMethod(match_methods, (PyObject*) self, name); + res = Py_FindMethod(match_methods, (PyObject*)self, name); if (res) return res; @@ -3590,9 +7453,9 @@ } if (!strcmp(name, "lastgroup")) { - if (self->pattern->indexgroup && self->lastindex >= 0) { + if (self->pattern->indexgroup && self->last_named_index >= 0) { PyObject* result = PySequence_GetItem( - self->pattern->indexgroup, self->lastindex + self->pattern->indexgroup, self->last_named_index ); if (result) return result; @@ -3638,31 +7501,49 @@ /* FIXME: implement setattr("string", None) as a special case (to detach the associated string, if any */ -statichere 
PyTypeObject Match_Type = { +static PyTypeObject Match_Type = { PyObject_HEAD_INIT(NULL) 0, "_" SRE_MODULE ".SRE_Match", sizeof(MatchObject), sizeof(Py_ssize_t), - (destructor)match_dealloc, /*tp_dealloc*/ - 0, /*tp_print*/ - (getattrfunc)match_getattr /*tp_getattr*/ + (destructor)match_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + (getattrfunc)match_getattr, /*tp_getattr*/ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + &match_as_mapping, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_HAVE_INDEX, /* tp_flags */ + 0, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + match_methods, /* tp_methods */ }; -static PyObject* -pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status) -{ +static PyObject* pattern_new_match(PatternObject* pattern, SRE_STATE* state, + int status) { /* create match object (from state object) */ - - MatchObject* match; - Py_ssize_t i, j; - char* base; - int n; - if (status > 0) { + MatchObject* match; + char* base = (char*) state->beginning; + Py_ssize_t mark_index; + int charsize = state->charsize; /* create match object (with room for extra group marks) */ /* coverity[ampersand_in_size] */ - match = PyObject_NEW_VAR(MatchObject, &Match_Type, - 2*(pattern->groups+1)); + match = PyObject_NEW_VAR(MatchObject, &Match_Type, 2 * + (pattern->internal_groups + 1)); if (!match) return NULL; @@ -3673,36 +7554,42 @@ match->string = state->string; match->regs = NULL; - match->groups = pattern->groups+1; + match->groups = pattern->groups + 1; + match->internal_groups = pattern->internal_groups + 1; /* fill in group slices */ - - base = (char*) state->beginning; - n = state->charsize; - - match->mark[0] = ((char*) state->start - base) / n; - match->mark[1] = 
((char*) state->ptr - base) / n; - - for (i = j = 0; i < pattern->groups; i++, j+=2) - if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) { - match->mark[j+2] = ((char*) state->mark[j] - base) / n; - match->mark[j+3] = ((char*) state->mark[j+1] - base) / n; + if (state->reverse) { + match->mark[0] = ((char*) state->ptr - base) / charsize; + match->mark[1] = ((char*) state->end - base) / charsize; + } else { + match->mark[0] = ((char*) state->start - base) / charsize; + match->mark[1] = ((char*) state->ptr - base) / charsize; + } + + for (mark_index = 0; mark_index < pattern->internal_groups * 2; + mark_index += 2) { + if (state->mark[mark_index] != NULL && state->mark[mark_index] <= + state->mark[mark_index + 1]) { + match->mark[mark_index + 2] = + ((char*) state->mark[mark_index] - base) / charsize; + match->mark[mark_index + 3] = + ((char*) state->mark[mark_index + 1] - base) / charsize; } else - match->mark[j+2] = match->mark[j+3] = -1; /* undefined */ + match->mark[mark_index + 2] = + match->mark[mark_index + 3] = -1; /* unmatched */ + } match->pos = state->pos; match->endpos = state->endpos; match->lastindex = state->lastindex; - - return (PyObject*) match; - + match->last_named_index = state->last_named_index; + + return (PyObject*)match; } else if (status == 0) { - /* no match */ Py_INCREF(Py_None); return Py_None; - } /* internal error */ @@ -3714,30 +7601,31 @@ /* -------------------------------------------------------------------- */ /* scanner methods (experimental) */ -static void -scanner_dealloc(ScannerObject* self) -{ +static void scanner_dealloc(ScannerObject* self) { state_fini(&self->state); Py_DECREF(self->pattern); PyObject_DEL(self); } -static PyObject* -scanner_match(ScannerObject* self, PyObject *unused) -{ +static PyObject* scanner_match(ScannerObject* self, PyObject* unused) { SRE_STATE* state = &self->state; PyObject* match; int status; state_reset(state); - state->ptr = state->start; + /* Where should we start the match? 
*/ + state->ptr = state->reverse ? state->end : state->start; + state->search_ptr = state->ptr; + + /* Clear the marks. */ + memset(state->mark, 0, state->pattern_code[0] * sizeof(SRE_CHAR*)); if (state->charsize == 1) { - status = sre_match(state, PatternObject_GetCode(self->pattern)); + status = sre_bmatch(state); } else { #if defined(HAVE_UNICODE) - status = sre_umatch(state, PatternObject_GetCode(self->pattern)); + status = sre_umatch(state); #endif } if (PyErr_Occurred()) @@ -3746,43 +7634,56 @@ match = pattern_new_match((PatternObject*) self->pattern, state, status); - if (status == 0 || state->ptr == state->start) - state->start = (void*) ((char*) state->ptr + state->charsize); + if (state->reverse) { + if (status == 0 || state->ptr == state->end) + state->end = (void*) ((char*) state->ptr - state->charsize); + else + state->end = state->ptr; + } else { + if (status == 0 || state->ptr == state->start) + state->start = (void*) ((char*) state->ptr + state->charsize); + else + state->start = state->ptr; + } + + return match; +} + + +static PyObject* scanner_search(ScannerObject* self, PyObject* unused) { + SRE_STATE* state = &self->state; + void * start_ptr; + PyObject* match; + int status; + + state_reset(state); + + /* Where should we start the match? */ + state->ptr = state->reverse ? state->end : state->start; + start_ptr = state->ptr; + + if (state->charsize == 1) { + status = sre_bsearch(state); + } else { +#if defined(HAVE_UNICODE) + status = sre_usearch(state); +#endif + } + if (PyErr_Occurred()) + return NULL; + + match = pattern_new_match((PatternObject*) self->pattern, + state, status); + + /* + Continue the search from where we left off. Forbid another zero-width + match at the same start position. 
+ */ + if (state->reverse) + state->end = state->ptr; else state->start = state->ptr; - - return match; -} - - -static PyObject* -scanner_search(ScannerObject* self, PyObject *unused) -{ - SRE_STATE* state = &self->state; - PyObject* match; - int status; - - state_reset(state); - - state->ptr = state->start; - - if (state->charsize == 1) { - status = sre_search(state, PatternObject_GetCode(self->pattern)); - } else { -#if defined(HAVE_UNICODE) - status = sre_usearch(state, PatternObject_GetCode(self->pattern)); -#endif - } - if (PyErr_Occurred()) - return NULL; - - match = pattern_new_match((PatternObject*) self->pattern, - state, status); - - if (status == 0 || state->ptr == state->start) - state->start = (void*) ((char*) state->ptr + state->charsize); - else - state->start = state->ptr; + state->reject_zero_width = state->ptr == start_ptr; return match; } @@ -3793,12 +7694,10 @@ {NULL, NULL} }; -static PyObject* -scanner_getattr(ScannerObject* self, char* name) -{ +static PyObject* scanner_getattr(ScannerObject* self, char* name) { PyObject* res; - res = Py_FindMethod(scanner_methods, (PyObject*) self, name); + res = Py_FindMethod(scanner_methods, (PyObject*)self, name); if (res) return res; @@ -3814,7 +7713,7 @@ return NULL; } -statichere PyTypeObject Scanner_Type = { +static PyTypeObject Scanner_Type = { PyObject_HEAD_INIT(NULL) 0, "_" SRE_MODULE ".SRE_Scanner", sizeof(ScannerObject), 0, @@ -3823,9 +7722,7 @@ (getattrfunc)scanner_getattr, /*tp_getattr*/ }; -static PyObject* -pattern_scanner(PatternObject* pattern, PyObject* args) -{ +static PyObject* pattern_scanner(PatternObject* pattern, PyObject* args) { /* create search state object */ ScannerObject* self; @@ -3833,6 +7730,8 @@ PyObject* string; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; + SRE_CODE* pattern_code; + if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end)) return NULL; @@ -3841,22 +7740,27 @@ if (!self) return NULL; - string = state_init(&self->state, pattern, string, 
start, end); + pattern_code = PatternObject_GetCode(pattern); + + string = state_init(&self->state, pattern, string, start, end, + pattern_code); if (!string) { PyObject_DEL(self); return NULL; } Py_INCREF(pattern); - self->pattern = (PyObject*) pattern; - - return (PyObject*) self; + self->pattern = (PyObject*)pattern; + + return (PyObject*)self; } static PyMethodDef _functions[] = { {"compile", _compile, METH_VARARGS}, {"getcodesize", sre_codesize, METH_NOARGS}, {"getlower", sre_getlower, METH_VARARGS}, + {"getupper", sre_getupper, METH_VARARGS}, + {"gettitle", sre_gettitle, METH_VARARGS}, {NULL, NULL} }; @@ -3876,7 +7780,7 @@ m = Py_InitModule("_" SRE_MODULE, _functions); if (m == NULL) - return; + return; d = PyModule_GetDict(m); x = PyInt_FromLong(SRE_MAGIC); === modified file Modules/sre.h --- Modules/sre.h 2006-06-12 03:05:40 +0000 +++ Modules/sre.h 2009-03-05 19:14:43 +0000 @@ -11,19 +11,16 @@ #ifndef SRE_INCLUDED #define SRE_INCLUDED +typedef int BOOL; +enum BOOL {FALSE, TRUE}; + #include "sre_constants.h" - -/* size of a code word (must be unsigned short or larger, and - large enough to hold a Py_UNICODE character) */ -#ifdef Py_UNICODE_WIDE -#define SRE_CODE Py_UCS4 -#else -#define SRE_CODE unsigned short -#endif typedef struct { PyObject_VAR_HEAD Py_ssize_t groups; /* must be first! 
*/ + Py_ssize_t internal_groups; /* both numbered and named (all named are + numbered) */ PyObject* groupindex; PyObject* indexgroup; /* compatibility */ @@ -36,6 +33,7 @@ } PatternObject; #define PatternObject_GetCode(o) (((PatternObject*)(o))->code) +#define PatternObject_GetCodeSize(o) (((PatternObject*)(o))->codesize) typedef struct { PyObject_VAR_HEAD @@ -43,8 +41,13 @@ PyObject* regs; /* cached list of matching spans */ PatternObject* pattern; /* link to the regex (pattern) object */ Py_ssize_t pos, endpos; /* current target slice */ - Py_ssize_t lastindex; /* last index marker seen by the engine (-1 if none) */ + Py_ssize_t lastindex; /* last index marker seen by the engine + (-1 if none) */ + Py_ssize_t last_named_index; /* last named index marker seen by the engine + (-1 if none) */ Py_ssize_t groups; /* number of groups (start/end marks) */ + Py_ssize_t internal_groups; /* number of groups, both numbered and named + (all named are also numbered) */ Py_ssize_t mark[1]; } MatchObject; @@ -53,12 +56,53 @@ /* FIXME: shouldn't be a constant, really... */ #define SRE_MARK_SIZE 200 -typedef struct SRE_REPEAT_T { +#define SRE_BACKTRACK_CHUNK_SIZE 1024 + +typedef struct SRE_BACKTRACK_ITEM { + SRE_CODE op; + union + { + struct { + void* text_start; + void* text_ptr; + SRE_CODE* pattern_ptr; + } assert; + struct { + void* text_ptr; + SRE_CODE* pattern_ptr; + } branch; + struct { + Py_ssize_t numbered_index; + void* numbered_mark_ptr; + Py_ssize_t named_index; + void* named_mark_ptr; + } mark; + struct { + void* text_ptr; + Py_ssize_t repeat_min; + Py_ssize_t repeat_max; + Py_ssize_t repeat_counter; + void* repeat_start; + struct SRE_BACKTRACK_ITEM* loop; /* Outer loop for REPEAT, parent + loop for END_REPEAT. */ + SRE_CODE* pattern_ptr; + } repeat; + }; + void* marks; // Numbered and named marks. 
+} SRE_BACKTRACK_ITEM; + +typedef struct SRE_BACKTRACK_CHUNK { + struct SRE_BACKTRACK_CHUNK* previous; + SRE_BACKTRACK_ITEM items[SRE_BACKTRACK_CHUNK_SIZE]; Py_ssize_t count; - SRE_CODE* pattern; /* points to REPEAT operator arguments */ - void* last_ptr; /* helper to check for infinite loops */ - struct SRE_REPEAT_T *prev; /* points to previous repeat context */ -} SRE_REPEAT; +} SRE_BACKTRACK_CHUNK; + +typedef struct SRE_ENCODING_TABLE { + BOOL (*in_category)(SRE_CODE category, Py_UCS4 ch); + Py_UCS4 (*lower)(Py_UCS4 ch); + Py_UCS4 (*upper)(Py_UCS4 ch); + Py_UCS4 (*title)(Py_UCS4 ch); +} SRE_ENCODING_TABLE; typedef struct { /* string pointers */ @@ -66,23 +110,26 @@ void* beginning; /* start of original string */ void* start; /* start of current slice */ void* end; /* end of original string */ + void* search_ptr; /* start of search (used by \G) */ /* attributes for the match object */ PyObject* string; Py_ssize_t pos, endpos; /* character size */ int charsize; + BOOL reverse; + BOOL reject_zero_width; /* registers */ Py_ssize_t lastindex; Py_ssize_t lastmark; + Py_ssize_t last_named_index; void* mark[SRE_MARK_SIZE]; /* dynamically allocated stuff */ - char* data_stack; - size_t data_stack_size; - size_t data_stack_base; - /* current repeat context */ - SRE_REPEAT *repeat; + SRE_BACKTRACK_CHUNK* backtrack_chunk; + unsigned int numbered_mark_count; + unsigned int named_mark_count; + SRE_CODE* pattern_code; /* hooks */ - SRE_TOLOWER_HOOK lower; + SRE_ENCODING_TABLE* encoding; } SRE_STATE; typedef struct { === modified file Modules/sre_constants.h --- Modules/sre_constants.h 2003-10-17 22:13:16 +0000 +++ Modules/sre_constants.h 2009-03-04 15:26:22 +0000 @@ -11,76 +11,299 @@ * See the _sre.c file for information on usage and redistribution. 
*/ -#define SRE_MAGIC 20031017 +#define SRE_MAGIC 20081218 + +/* size of a code word (must be unsigned short or larger, and + large enough to hold a Py_UNICODE character) */ +typedef unsigned int SRE_CODE; + +#define SRE_BYTES_PER_CODE 4 +#define SRE_BITS_PER_CODE 32 +#define SRE_UNLIMITED_REPEATS 0xFFFFFFFF + #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 #define SRE_OP_ANY_ALL 3 -#define SRE_OP_ASSERT 4 -#define SRE_OP_ASSERT_NOT 5 -#define SRE_OP_AT 6 -#define SRE_OP_BRANCH 7 -#define SRE_OP_CALL 8 -#define SRE_OP_CATEGORY 9 -#define SRE_OP_CHARSET 10 -#define SRE_OP_BIGCHARSET 11 -#define SRE_OP_GROUPREF 12 -#define SRE_OP_GROUPREF_EXISTS 13 -#define SRE_OP_GROUPREF_IGNORE 14 -#define SRE_OP_IN 15 -#define SRE_OP_IN_IGNORE 16 -#define SRE_OP_INFO 17 -#define SRE_OP_JUMP 18 -#define SRE_OP_LITERAL 19 -#define SRE_OP_LITERAL_IGNORE 20 -#define SRE_OP_MARK 21 -#define SRE_OP_MAX_UNTIL 22 -#define SRE_OP_MIN_UNTIL 23 -#define SRE_OP_NOT_LITERAL 24 -#define SRE_OP_NOT_LITERAL_IGNORE 25 -#define SRE_OP_NEGATE 26 -#define SRE_OP_RANGE 27 -#define SRE_OP_REPEAT 28 -#define SRE_OP_REPEAT_ONE 29 -#define SRE_OP_SUBPATTERN 30 -#define SRE_OP_MIN_REPEAT_ONE 31 -#define SRE_AT_BEGINNING 0 -#define SRE_AT_BEGINNING_LINE 1 -#define SRE_AT_BEGINNING_STRING 2 -#define SRE_AT_BOUNDARY 3 -#define SRE_AT_NON_BOUNDARY 4 -#define SRE_AT_END 5 -#define SRE_AT_END_LINE 6 -#define SRE_AT_END_STRING 7 -#define SRE_AT_LOC_BOUNDARY 8 -#define SRE_AT_LOC_NON_BOUNDARY 9 -#define SRE_AT_UNI_BOUNDARY 10 -#define SRE_AT_UNI_NON_BOUNDARY 11 -#define SRE_CATEGORY_DIGIT 0 -#define SRE_CATEGORY_NOT_DIGIT 1 -#define SRE_CATEGORY_SPACE 2 -#define SRE_CATEGORY_NOT_SPACE 3 -#define SRE_CATEGORY_WORD 4 -#define SRE_CATEGORY_NOT_WORD 5 -#define SRE_CATEGORY_LINEBREAK 6 -#define SRE_CATEGORY_NOT_LINEBREAK 7 -#define SRE_CATEGORY_LOC_WORD 8 -#define SRE_CATEGORY_LOC_NOT_WORD 9 -#define SRE_CATEGORY_UNI_DIGIT 10 -#define SRE_CATEGORY_UNI_NOT_DIGIT 11 -#define SRE_CATEGORY_UNI_SPACE 
12 -#define SRE_CATEGORY_UNI_NOT_SPACE 13 -#define SRE_CATEGORY_UNI_WORD 14 -#define SRE_CATEGORY_UNI_NOT_WORD 15 -#define SRE_CATEGORY_UNI_LINEBREAK 16 -#define SRE_CATEGORY_UNI_NOT_LINEBREAK 17 -#define SRE_FLAG_TEMPLATE 1 -#define SRE_FLAG_IGNORECASE 2 -#define SRE_FLAG_LOCALE 4 -#define SRE_FLAG_MULTILINE 8 -#define SRE_FLAG_DOTALL 16 -#define SRE_FLAG_UNICODE 32 -#define SRE_FLAG_VERBOSE 64 -#define SRE_INFO_PREFIX 1 -#define SRE_INFO_LITERAL 2 -#define SRE_INFO_CHARSET 4 +#define SRE_OP_ANY_ALL_REV 4 +#define SRE_OP_ANY_REV 5 +#define SRE_OP_ASSERT 6 +#define SRE_OP_ASSERT_NOT 7 +#define SRE_OP_ATOMIC 8 +#define SRE_OP_BOUNDARY 9 +#define SRE_OP_BRANCH 10 +#define SRE_OP_CATEGORY 11 +#define SRE_OP_CATEGORY_REV 12 +#define SRE_OP_CHARSET 13 +#define SRE_OP_CHARSET_IGNORE 14 +#define SRE_OP_CHARSET_IGNORE_REV 15 +#define SRE_OP_CHARSET_REV 16 +#define SRE_OP_END_ASSERT 17 +#define SRE_OP_END_ASSERT_NOT 18 +#define SRE_OP_END_ATOMIC 19 +#define SRE_OP_END_OF_LINE 20 +#define SRE_OP_END_OF_STRING 21 +#define SRE_OP_END_OF_STRING_LN 22 +#define SRE_OP_END_REPEAT_MAX 23 +#define SRE_OP_END_REPEAT_MAX_REV 24 +#define SRE_OP_END_REPEAT_MIN 25 +#define SRE_OP_END_REPEAT_MIN_REV 26 +#define SRE_OP_END_REPEAT_POSS 27 +#define SRE_OP_END_REPEAT_POSS_REV 28 +#define SRE_OP_GROUPREF 29 +#define SRE_OP_GROUPREF_EXISTS 30 +#define SRE_OP_GROUPREF_IGNORE 31 +#define SRE_OP_GROUPREF_IGNORE_REV 32 +#define SRE_OP_GROUPREF_REV 33 +#define SRE_OP_JUMP 34 +#define SRE_OP_LITERAL 35 +#define SRE_OP_LITERAL_IGNORE 36 +#define SRE_OP_LITERAL_IGNORE_REV 37 +#define SRE_OP_LITERAL_REV 38 +#define SRE_OP_LITERAL_STRING 39 +#define SRE_OP_LITERAL_STRING_IGNORE 40 +#define SRE_OP_LITERAL_STRING_IGNORE_REV 41 +#define SRE_OP_LITERAL_STRING_REV 42 +#define SRE_OP_MARK 43 +#define SRE_OP_NOT_BOUNDARY 44 +#define SRE_OP_NOT_CATEGORY 45 +#define SRE_OP_NOT_CATEGORY_REV 46 +#define SRE_OP_NOT_CHARSET 47 +#define SRE_OP_NOT_CHARSET_IGNORE 48 +#define SRE_OP_NOT_CHARSET_IGNORE_REV 49 +#define 
SRE_OP_NOT_CHARSET_REV 50 +#define SRE_OP_NOT_LITERAL 51 +#define SRE_OP_NOT_LITERAL_IGNORE 52 +#define SRE_OP_NOT_LITERAL_IGNORE_REV 53 +#define SRE_OP_NOT_LITERAL_REV 54 +#define SRE_OP_NOT_RANGE 55 +#define SRE_OP_NOT_RANGE_IGNORE 56 +#define SRE_OP_NOT_RANGE_IGNORE_REV 57 +#define SRE_OP_NOT_RANGE_REV 58 +#define SRE_OP_NOT_SET 59 +#define SRE_OP_NOT_SET_IGNORE 60 +#define SRE_OP_NOT_SET_IGNORE_REV 61 +#define SRE_OP_NOT_SET_REV 62 +#define SRE_OP_RANGE 63 +#define SRE_OP_RANGE_IGNORE 64 +#define SRE_OP_RANGE_IGNORE_REV 65 +#define SRE_OP_RANGE_REV 66 +#define SRE_OP_REPEAT_MAX 67 +#define SRE_OP_REPEAT_MAX_REV 68 +#define SRE_OP_REPEAT_MIN 69 +#define SRE_OP_REPEAT_MIN_REV 70 +#define SRE_OP_REPEAT_ONE_MAX 71 +#define SRE_OP_REPEAT_ONE_MAX_REV 72 +#define SRE_OP_REPEAT_ONE_MIN 73 +#define SRE_OP_REPEAT_ONE_MIN_REV 74 +#define SRE_OP_REPEAT_ONE_POSS 75 +#define SRE_OP_REPEAT_ONE_POSS_REV 76 +#define SRE_OP_REPEAT_POSS 77 +#define SRE_OP_REPEAT_POSS_REV 78 +#define SRE_OP_SET 79 +#define SRE_OP_SET_IGNORE 80 +#define SRE_OP_SET_IGNORE_REV 81 +#define SRE_OP_SET_REV 82 +#define SRE_OP_START_OF_LINE 83 +#define SRE_OP_START_OF_SEARCH 84 +#define SRE_OP_START_OF_STRING 85 +#define SRE_OP_SUBPATTERN 86 +#define SRE_MAX_OP 86 + +#define SRE_FLAG_TEMPLATE 0x1 +#define SRE_FLAG_IGNORECASE 0x2 +#define SRE_FLAG_LOCALE 0x4 +#define SRE_FLAG_MULTILINE 0x8 +#define SRE_FLAG_DOTALL 0x10 +#define SRE_FLAG_UNICODE 0x20 +#define SRE_FLAG_VERBOSE 0x40 +#define SRE_FLAG_REVERSE 0x100 +#define SRE_FLAG_ZEROWIDTH 0x200 + +#define SRE_INFO_PREFIX 0x1 +#define SRE_INFO_LITERAL 0x2 +#define SRE_INFO_CHARSET 0x4 + +#define SRE_UNI_CAT_Lu 0x1 +#define SRE_UNI_CAT_Ll 0x2 +#define SRE_UNI_CAT_Lt 0x3 +#define SRE_UNI_CAT_Mn 0x4 +#define SRE_UNI_CAT_Mc 0x5 +#define SRE_UNI_CAT_Me 0x6 +#define SRE_UNI_CAT_Nd 0x7 +#define SRE_UNI_CAT_Nl 0x8 +#define SRE_UNI_CAT_No 0x9 +#define SRE_UNI_CAT_Zs 0xA +#define SRE_UNI_CAT_Zl 0xB +#define SRE_UNI_CAT_Zp 0xC +#define SRE_UNI_CAT_Cc 0xD +#define 
SRE_UNI_CAT_Cf 0xE +#define SRE_UNI_CAT_Cs 0xF +#define SRE_UNI_CAT_Co 0x10 +#define SRE_UNI_CAT_Lm 0x12 +#define SRE_UNI_CAT_Lo 0x13 +#define SRE_UNI_CAT_Pc 0x14 +#define SRE_UNI_CAT_Pd 0x15 +#define SRE_UNI_CAT_Ps 0x16 +#define SRE_UNI_CAT_Pe 0x17 +#define SRE_UNI_CAT_Pi 0x18 +#define SRE_UNI_CAT_Pf 0x19 +#define SRE_UNI_CAT_Po 0x1A +#define SRE_UNI_CAT_Sm 0x1B +#define SRE_UNI_CAT_Sc 0x1C +#define SRE_UNI_CAT_Sk 0x1D +#define SRE_UNI_CAT_So 0x1E +#define SRE_UNI_CAT_L 0x20 +#define SRE_UNI_CAT_M 0x21 +#define SRE_UNI_CAT_N 0x22 +#define SRE_UNI_CAT_Z 0x23 +#define SRE_UNI_CAT_C 0x24 +#define SRE_UNI_CAT_P 0x25 +#define SRE_UNI_CAT_S 0x26 + +#define SRE_CAT_Alpha 0x27 +#define SRE_CAT_Alnum 0x28 +#define SRE_CAT_ASCII 0x29 +#define SRE_CAT_Blank 0x2A +#define SRE_CAT_Cntrl 0x2B +#define SRE_CAT_Digit 0x2C +#define SRE_CAT_Graph 0x2D +#define SRE_CAT_LineBreak 0x2E +#define SRE_CAT_Lower 0x2F +#define SRE_CAT_Print 0x30 +#define SRE_CAT_Punct 0x31 +#define SRE_CAT_Space 0x32 +#define SRE_CAT_Upper 0x33 +#define SRE_CAT_Word 0x34 +#define SRE_CAT_XDigit 0x35 + +#define SRE_CAT_MASK_C 0x0001E000 +#define SRE_CAT_MASK_L 0x000C000E +#define SRE_CAT_MASK_M 0x00000070 +#define SRE_CAT_MASK_N 0x00000380 +#define SRE_CAT_MASK_P 0x07F00000 +#define SRE_CAT_MASK_S 0x78000000 +#define SRE_CAT_MASK_Z 0x00001C00 + +#define SRE_CAT_MASK_Alnum 0x000C008E +#define SRE_CAT_MASK_Alpha 0x000C000E +#define SRE_CAT_MASK_Graph 0x7FFC03FE +#define SRE_CAT_MASK_Print 0x7FFC1FFE +#define SRE_CAT_MASK_Punct 0x7FF00000 +#define SRE_CAT_MASK_Word 0x001C03FE + +// info for operator validation +typedef struct SRE_OpInfo { + char* name; + int type; + int direction; + int end_marker; +} SRE_OpInfo; + +#define SRE_TYPE_INVALID 0 +#define SRE_TYPE_ASSERT 1 +#define SRE_TYPE_ATOMIC 2 +#define SRE_TYPE_BRANCH 3 +#define SRE_TYPE_CATEGORY 4 +#define SRE_TYPE_CHARSET 5 +#define SRE_TYPE_GROUPREF 6 +#define SRE_TYPE_GROUPREF_EXISTS 7 +#define SRE_TYPE_LITERAL 8 +#define SRE_TYPE_LITERAL_STRING 9 
+#define SRE_TYPE_MARK 10 +#define SRE_TYPE_POSITION 11 +#define SRE_TYPE_RANGE 12 +#define SRE_TYPE_REPEAT 13 +#define SRE_TYPE_REPEAT_ONE 14 +#define SRE_TYPE_SET 15 +#define SRE_TYPE_SIMPLE_CATEGORY 16 + +static SRE_OpInfo sre_op_info[] = { + {"FAILURE", 0, 0, 0}, + {"SUCCESS", 0, 0, 0}, + {"ANY", 16, 1, 0}, + {"ANY_ALL", 16, 1, 0}, + {"ANY_ALL_REV", 16, -1, 0}, + {"ANY_REV", 16, -1, 0}, + {"ASSERT", 1, 0, SRE_OP_END_ASSERT}, + {"ASSERT_NOT", 1, 0, SRE_OP_END_ASSERT_NOT}, + {"ATOMIC", 2, 0, SRE_OP_END_ATOMIC}, + {"BOUNDARY", 11, 0, 0}, + {"BRANCH", 3, 0, 0}, + {"CATEGORY", 4, 1, 0}, + {"CATEGORY_REV", 4, -1, 0}, + {"CHARSET", 5, 1, 0}, + {"CHARSET_IGNORE", 5, 1, 0}, + {"CHARSET_IGNORE_REV", 5, -1, 0}, + {"CHARSET_REV", 5, -1, 0}, + {"END_ASSERT", 0, 0, 0}, + {"END_ASSERT_NOT", 0, 0, 0}, + {"END_ATOMIC", 0, 0, 0}, + {"END_OF_LINE", 11, 0, 0}, + {"END_OF_STRING", 11, 0, 0}, + {"END_OF_STRING_LN", 11, 0, 0}, + {"END_REPEAT_MAX", 0, 1, 0}, + {"END_REPEAT_MAX_REV", 0, -1, 0}, + {"END_REPEAT_MIN", 0, 1, 0}, + {"END_REPEAT_MIN_REV", 0, -1, 0}, + {"END_REPEAT_POSS", 0, 1, 0}, + {"END_REPEAT_POSS_REV", 0, -1, 0}, + {"GROUPREF", 6, 1, 0}, + {"GROUPREF_EXISTS", 7, 0, 0}, + {"GROUPREF_IGNORE", 6, 1, 0}, + {"GROUPREF_IGNORE_REV", 6, -1, 0}, + {"GROUPREF_REV", 6, -1, 0}, + {"JUMP", 0, 0, 0}, + {"LITERAL", 8, 1, 0}, + {"LITERAL_IGNORE", 8, 1, 0}, + {"LITERAL_IGNORE_REV", 8, -1, 0}, + {"LITERAL_REV", 8, -1, 0}, + {"LITERAL_STRING", 9, 1, 0}, + {"LITERAL_STRING_IGNORE", 9, 1, 0}, + {"LITERAL_STRING_IGNORE_REV", 9, -1, 0}, + {"LITERAL_STRING_REV", 9, -1, 0}, + {"MARK", 10, 0, 0}, + {"NOT_BOUNDARY", 11, 0, 0}, + {"NOT_CATEGORY", 4, 1, 0}, + {"NOT_CATEGORY_REV", 4, -1, 0}, + {"NOT_CHARSET", 5, 1, 0}, + {"NOT_CHARSET_IGNORE", 5, 1, 0}, + {"NOT_CHARSET_IGNORE_REV", 5, -1, 0}, + {"NOT_CHARSET_REV", 5, -1, 0}, + {"NOT_LITERAL", 8, 1, 0}, + {"NOT_LITERAL_IGNORE", 8, 1, 0}, + {"NOT_LITERAL_IGNORE_REV", 8, -1, 0}, + {"NOT_LITERAL_REV", 8, -1, 0}, + {"NOT_RANGE", 12, 1, 0}, + 
{"NOT_RANGE_IGNORE", 12, 1, 0}, + {"NOT_RANGE_IGNORE_REV", 12, -1, 0}, + {"NOT_RANGE_REV", 12, -1, 0}, + {"NOT_SET", 15, 1, 0}, + {"NOT_SET_IGNORE", 15, 1, 0}, + {"NOT_SET_IGNORE_REV", 15, -1, 0}, + {"NOT_SET_REV", 15, -1, 0}, + {"RANGE", 12, 1, 0}, + {"RANGE_IGNORE", 12, 1, 0}, + {"RANGE_IGNORE_REV", 12, -1, 0}, + {"RANGE_REV", 12, -1, 0}, + {"REPEAT_MAX", 13, 1, SRE_OP_END_REPEAT_MAX}, + {"REPEAT_MAX_REV", 13, -1, SRE_OP_END_REPEAT_MAX}, + {"REPEAT_MIN", 13, 1, SRE_OP_END_REPEAT_MIN}, + {"REPEAT_MIN_REV", 13, -1, SRE_OP_END_REPEAT_MIN}, + {"REPEAT_ONE_MAX", 14, 1, 0}, + {"REPEAT_ONE_MAX_REV", 14, -1, 0}, + {"REPEAT_ONE_MIN", 14, 1, 0}, + {"REPEAT_ONE_MIN_REV", 14, -1, 0}, + {"REPEAT_ONE_POSS", 14, 1, 0}, + {"REPEAT_ONE_POSS_REV", 14, -1, 0}, + {"REPEAT_POSS", 13, 1, SRE_OP_END_REPEAT_POSS}, + {"REPEAT_POSS_REV", 13, -1, SRE_OP_END_REPEAT_POSS}, + {"SET", 15, 1, 0}, + {"SET_IGNORE", 15, 1, 0}, + {"SET_IGNORE_REV", 15, -1, 0}, + {"SET_REV", 15, -1, 0}, + {"START_OF_LINE", 11, 0, 0}, + {"START_OF_SEARCH", 11, 0, 0}, + {"START_OF_STRING", 11, 0, 0}, + {"SUBPATTERN", 0, 0, 0}, +};