Index: PythonSourceLibs/sre_constants.py =================================================================== --- PythonSourceLibs/sre_constants.py (revision 62045) +++ PythonSourceLibs/sre_constants.py (working copy) @@ -54,6 +54,7 @@ MAX_UNTIL = "max_until" MIN_REPEAT = "min_repeat" MIN_UNTIL = "min_until" +POSSESSIVE_REPEAT = "possessive_repeat" NEGATE = "negate" NOT_LITERAL = "not_literal" NOT_LITERAL_IGNORE = "not_literal_ignore" @@ -61,6 +62,7 @@ REPEAT = "repeat" REPEAT_ONE = "repeat_one" SUBPATTERN = "subpattern" +ATOMIC_GROUP = "atomic_group" MIN_REPEAT_ONE = "min_repeat_one" # positions @@ -97,6 +99,10 @@ CATEGORY_UNI_LINEBREAK = "category_uni_linebreak" CATEGORY_UNI_NOT_LINEBREAK = "category_uni_not_linebreak" +SRE_GROUP_IGNORE = "ignore_this_group" +SRE_GROUP_CAPTURE = "capture_this_group" +SRE_GROUP_NON_CAPTURE = "non_capturing_group" + OPCODES = [ # failure=0 success=1 (just because it looks better that way :-) Index: PythonSourceLibs/sre_parse.py =================================================================== --- PythonSourceLibs/sre_parse.py (revision 62045) +++ PythonSourceLibs/sre_parse.py (working copy) @@ -424,8 +424,6 @@ # character set set = [] setappend = set.append -## if sourcematch(":"): -## pass # handle character classes if sourcematch("^"): setappend((NEGATE, None)) # check remaining characters @@ -522,19 +520,25 @@ if item[0][0] in REPEATCODES: raise error, "multiple repeat" if sourcematch("?"): + # Non-Greedy Match subpattern[-1] = (MIN_REPEAT, (min, max, item)) + elif sourcematch("+"): + # Possessive Match (Always Greedy) + subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item)) else: + # Greedy Match subpattern[-1] = (MAX_REPEAT, (min, max, item)) elif this == ".": subpatternappend((ANY, None)) elif this == "(": - group = 1 + grouptype = SRE_GROUP_CAPTURE name = None condgroup = None + atomic = False if sourcematch("?"): - group = 0 + grouptype = SRE_GROUP_IGNORE # options if sourcematch("P"): # python extensions @@ -548,7 +552,7 @@ if char == ">": break name = name + char - group = 1 + grouptype = SRE_GROUP_CAPTURE if not isname(name): raise error, "bad character in group name" elif sourcematch("="): @@ -568,6 +572,27 @@ raise error, "unknown group name" subpatternappend((GROUPREF, gid)) continue + elif sourcematch("#"): + # Python-Specific Comment -- allows for nested + # paren + depth = 1 + while 1: + if sourcematch("\\"): + # Ignore escaped characters + if not source.next: + break + elif source.next == "(": + depth += 1 + elif source.next == ")": + depth -= 1 + if not depth: + break + if source.next is None: + break + sourceget() + if not sourcematch(")"): + raise error, "unbalanced parenthesis" + continue else: char = sourceget() if char is None: @@ -575,7 +600,7 @@ raise error, "unknown specifier: ?P%s" % char elif sourcematch(":"): # non-capturing group - group = 2 + grouptype = SRE_GROUP_NON_CAPTURE elif sourcematch("#"): # comment while 1: @@ -612,7 +637,7 @@ if char == ")": break condname = condname + char - group = 2 + grouptype = SRE_GROUP_NON_CAPTURE if isname(condname): condgroup = state.groupdict.get(condname) if condgroup is None: @@ -622,15 +647,19 @@ condgroup = int(condname) except ValueError: raise error, "bad character in group name" + elif sourcematch(">"): + # non-capturing, atomic group + grouptype = SRE_GROUP_NON_CAPTURE + atomic = True else: # flags if not source.next in FLAGS: raise error, "unexpected end of pattern" while source.next in FLAGS: state.flags = state.flags | FLAGS[sourceget()] - if group: + if grouptype != SRE_GROUP_IGNORE: # parse group contents - if group == 2: + if grouptype == SRE_GROUP_NON_CAPTURE: # anonymous group group = None else: @@ -643,7 +672,10 @@ raise error, "unbalanced parenthesis" if group is not None: state.closegroup(group) - subpatternappend((SUBPATTERN, (group, p))) + if atomic: + subpatternappend((ATOMIC_GROUP, (group, p))) + else: + subpatternappend((SUBPATTERN, (group, p))) else: while 1: char = sourceget()