diff -r 068365acbe73 Lib/re.py --- a/Lib/re.py Wed Mar 25 21:03:47 2015 +0200 +++ b/Lib/re.py Wed Mar 25 22:53:11 2015 +0200 @@ -353,7 +353,7 @@ class Scanner: for phrase, action in lexicon: gid = s.opengroup() p.append(sre_parse.SubPattern(s, [ - (SUBPATTERN, (gid, sre_parse.parse(phrase, flags))), + (SUBPATTERN, (gid, 0, 0, sre_parse.parse(phrase, flags))), ])) s.closegroup(gid, p[-1]) p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) diff -r 068365acbe73 Lib/sre_compile.py --- a/Lib/sre_compile.py Wed Mar 25 21:03:47 2015 +0200 +++ b/Lib/sre_compile.py Wed Mar 25 22:53:11 2015 +0200 @@ -71,7 +71,8 @@ def _compile(code, pattern, flags): ASSERT_CODES = _ASSERT_CODES if (flags & SRE_FLAG_IGNORECASE and not (flags & SRE_FLAG_LOCALE) and - flags & SRE_FLAG_UNICODE): + flags & SRE_FLAG_UNICODE and + not (flags & SRE_FLAG_ASCII)): fixes = _ignorecase_fixes else: fixes = None @@ -137,14 +138,15 @@ def _compile(code, pattern, flags): else: emit(MIN_UNTIL) elif op is SUBPATTERN: - if av[0]: + group, add_flags, del_flags, p = av + if group: emit(MARK) - emit((av[0]-1)*2) - # _compile_info(code, av[1], flags) - _compile(code, av[1], flags) - if av[0]: + emit((group-1)*2) + # _compile_info(code, p, (flags | add_flags) & ~del_flags) + _compile(code, p, (flags | add_flags) & ~del_flags) + if group: emit(MARK) - emit((av[0]-1)*2+1) + emit((group-1)*2+1) elif op in SUCCESS_CODES: emit(op) elif op in ASSERT_CODES: @@ -172,7 +174,7 @@ def _compile(code, pattern, flags): av = AT_MULTILINE.get(av, av) if flags & SRE_FLAG_LOCALE: av = AT_LOCALE.get(av, av) - elif flags & SRE_FLAG_UNICODE: + elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII): av = AT_UNICODE.get(av, av) emit(av) elif op is BRANCH: @@ -193,7 +195,7 @@ def _compile(code, pattern, flags): emit(op) if flags & SRE_FLAG_LOCALE: av = CH_LOCALE[av] - elif flags & SRE_FLAG_UNICODE: + elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII): av = CH_UNICODE[av] emit(av) elif op is GROUPREF: @@ -237,7 +239,7 @@ def _compile_charset(charset, flags, cod elif op is CATEGORY: if flags & SRE_FLAG_LOCALE: emit(CH_LOCALE[av]) - elif flags & SRE_FLAG_UNICODE: + elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII): emit(CH_UNICODE[av]) else: emit(av) @@ -432,8 +434,8 @@ def _compile_info(code, pattern, flags): if len(prefix) == prefix_skip: prefix_skip = prefix_skip + 1 prefixappend(av) - elif op is SUBPATTERN and len(av[1]) == 1: - op, av = av[1][0] + elif op is SUBPATTERN and len(av[3]) == 1: + op, av = av[3][0] if op is LITERAL: prefixappend(av) else: @@ -443,8 +445,8 @@ def _compile_info(code, pattern, flags): # if no prefix, look for charset prefix if not prefix and pattern.data: op, av = pattern.data[0] - if op is SUBPATTERN and av[1]: - op, av = av[1][0] + if op is SUBPATTERN and av[3]: + op, av = av[3][0] if op is LITERAL: charsetappend((op, av)) elif op is BRANCH: diff -r 068365acbe73 Lib/sre_parse.py --- a/Lib/sre_parse.py Wed Mar 25 21:03:47 2015 +0200 +++ b/Lib/sre_parse.py Wed Mar 25 22:53:11 2015 +0200 @@ -65,6 +65,12 @@ FLAGS = { "u": SRE_FLAG_UNICODE, } +GLOBAL_FLAGS = (SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE | + SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE) + +class Verbose(Exception): + pass + class Pattern: # master pattern object. keeps track of global attributes def __init__(self): @@ -184,7 +190,7 @@ class SubPattern: lo = lo + i hi = hi + j elif op is SUBPATTERN: - i, j = av[1].getwidth() + i, j = av[3].getwidth() lo = lo + i hi = hi + j elif op in _REPEATCODES: @@ -426,7 +432,7 @@ def _escape(source, escape, state): pass raise source.error("bad escape %s" % escape, len(escape)) -def _parse_sub(source, state, nested=True): +def _parse_sub(source, state, verbose, nested=True): # parse an alternation: a|b|c items = [] @@ -434,7 +440,7 @@ def _parse_sub(source, state, nested=Tru sourcematch = source.match start = source.tell() while True: - itemsappend(_parse(source, state)) + itemsappend(_parse(source, state, verbose)) if not sourcematch("|"): break @@ -476,10 +482,10 @@ def _parse_sub(source, state, nested=Tru subpattern.append((BRANCH, (None, items))) return subpattern -def _parse_sub_cond(source, state, condgroup): - item_yes = _parse(source, state) +def _parse_sub_cond(source, state, condgroup, verbose): + item_yes = _parse(source, state, verbose) if source.match("|"): - item_no = _parse(source, state) + item_no = _parse(source, state, verbose) if source.next == "|": raise source.error("conditional backref with more than two branches") else: @@ -488,7 +494,7 @@ def _parse_sub_cond(source, state, condg subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) return subpattern -def _parse(source, state): +def _parse(source, state, verbose): # parse a simple pattern subpattern = SubPattern(state) @@ -498,7 +504,6 @@ def _parse(source, state): sourcematch = source.match _len = len _ord = ord - verbose = state.flags & SRE_FLAG_VERBOSE while True: @@ -652,6 +657,8 @@ def _parse(source, state): group = True name = None condgroup = None + add_flags = 0 + del_flags = 0 if sourcematch("?"): # options char = sourceget() @@ -710,7 +717,7 @@ def _parse(source, state): lookbehindgroups = state.lookbehindgroups if lookbehindgroups is None: state.lookbehindgroups = state.groups - p = _parse_sub(source, state) + p = _parse_sub(source, state, verbose) if dir < 0: if lookbehindgroups is None: state.lookbehindgroups = None @@ -746,19 +753,13 @@ def _parse(source, state): raise source.error("invalid group reference", len(condname) + 1) state.checklookbehindgroup(condgroup, source) - elif char in FLAGS: + elif char in FLAGS or char == "-": # flags - while True: - state.flags |= FLAGS[char] - char = sourceget() - if char is None: - raise source.error("missing )") - if char == ")": - break - if char not in FLAGS: - raise source.error("unknown flag", len(char)) - verbose = state.flags & SRE_FLAG_VERBOSE - continue + flags = _parse_flags(source, state, char) + if flags is None: # global flags + continue + add_flags, del_flags = flags + group = None else: raise source.error("unknown extension ?" + char, len(char) + 1) @@ -770,15 +771,17 @@ def _parse(source, state): except error as err: raise source.error(err.msg, len(name) + 1) from None if condgroup: - p = _parse_sub_cond(source, state, condgroup) + p = _parse_sub_cond(source, state, condgroup, verbose) else: - p = _parse_sub(source, state) + sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and + not (del_flags & SRE_FLAG_VERBOSE)) + p = _parse_sub(source, state, sub_verbose) if not source.match(")"): raise source.error("missing ), unterminated subpattern", source.tell() - start) if group is not None: state.closegroup(group, p) - subpatternappend((SUBPATTERN, (group, p))) + subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p))) elif this == "^": subpatternappend((AT, AT_BEGINNING)) @@ -791,6 +794,53 @@ def _parse(source, state): return subpattern +def _parse_flags(source, state, char): + sourceget = source.get + add_flags = 0 + del_flags = 0 + if char != "-": + while True: + add_flags |= FLAGS[char] + char = sourceget() + if char is None: + raise source.error("missing -, : or )") + if char in ")-:": + break + if char not in FLAGS: + msg = "unknown flag" if char.isalpha() else "missing -, : or )" + raise source.error(msg, len(char)) + if char == ")": + if ((add_flags & SRE_FLAG_VERBOSE) and + not (state.flags & SRE_FLAG_VERBOSE)): + raise Verbose + state.flags |= add_flags + return None + if add_flags & GLOBAL_FLAGS: + raise source.error("bad inline flags: cannot turn on global flag", 1) + if char == "-": + char = sourceget() + if char is None: + raise source.error("missing flag") + if char not in FLAGS: + msg = "unknown flag" if char.isalpha() else "missing flag" + raise source.error(msg, len(char)) + while True: + del_flags |= FLAGS[char] + char = sourceget() + if char is None: + raise source.error("missing :") + if char == ":": + break + if char not in FLAGS: + msg = "unknown flag" if char.isalpha() else "missing :" + raise source.error(msg, len(char)) + assert char == ":" + if del_flags & GLOBAL_FLAGS: + raise source.error("bad inline flags: cannot turn off global flag", 1) + if add_flags & del_flags: + raise source.error("bad inline flags: flag turned on and off", 1) + return add_flags, del_flags + def fix_flags(src, flags): # Check and fix flags according to the type of pattern (str or bytes) if isinstance(src, str): @@ -823,7 +873,16 @@ def parse(str, flags=0, pattern=None): pattern.flags = flags pattern.str = str - p = _parse_sub(source, pattern, 0) + try: + p = _parse_sub(source, pattern, flags & SRE_FLAG_VERBOSE, False) + except Verbose: + ## the VERBOSE flag was switched on inside the pattern. to be + ## on the safe side, we'll parse the whole thing again... + pattern = Pattern() + pattern.flags = flags | SRE_FLAG_VERBOSE + pattern.str = str + p = _parse_sub(source, pattern, True, False) + p.pattern.flags = fix_flags(str, p.pattern.flags) if source.next is not None: @@ -833,11 +892,6 @@ def parse(str, flags=0, pattern=None): if flags & SRE_FLAG_DEBUG: p.dump() - if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: - # the VERBOSE flag was switched on inside the pattern. to be - # on the safe side, we'll parse the whole thing again... - return parse(str, p.pattern.flags) - return p def parse_template(source, pattern): diff -r 068365acbe73 Lib/test/test_re.py --- a/Lib/test/test_re.py Wed Mar 25 21:03:47 2015 +0200 +++ b/Lib/test/test_re.py Wed Mar 25 22:53:11 2015 +0200 @@ -1360,6 +1360,38 @@ class ReTests(unittest.TestCase): self.assertWarns(DeprecationWarning, re.compile, b'(?a)', re.LOCALE) self.assertWarns(DeprecationWarning, re.compile, b'(?aL)') + def test_scoped_flags(self): + self.assertTrue(re.match(r'(?i:a)b', 'Ab')) + self.assertIsNone(re.match(r'(?i:a)b', 'aB')) + self.assertIsNone(re.match(r'(?-i:a)b', 'Ab', re.IGNORECASE)) + self.assertTrue(re.match(r'(?-i:a)b', 'aB', re.IGNORECASE)) + self.assertIsNone(re.match(r'(?i:(?-i:a)b)', 'Ab')) + self.assertTrue(re.match(r'(?i:(?-i:a)b)', 'aB')) + + self.assertTrue(re.match(r'(?x: a) b', 'a b')) + self.assertIsNone(re.match(r'(?x: a) b', ' a b')) + self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE)) + self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE)) + + self.checkPatternError(r'(?a:\w)', + 'bad inline flags: cannot turn on global flag', 3) + self.checkPatternError(r'(?a)(?-a:\w)', + 'bad inline flags: cannot turn off global flag', 8) + self.checkPatternError(r'(?i-i:a)', + 'bad inline flags: flag turned on and off', 5) + + self.checkPatternError(r'(?-', 'missing flag', 3) + self.checkPatternError(r'(?-+', 'missing flag', 3) + self.checkPatternError(r'(?-z', 'unknown flag', 3) + self.checkPatternError(r'(?-i', 'missing :', 4) + self.checkPatternError(r'(?-i)', 'missing :', 4) + self.checkPatternError(r'(?-i+', 'missing :', 4) + self.checkPatternError(r'(?-iz', 'unknown flag', 4) + self.checkPatternError(r'(?i:', 'missing ), unterminated subpattern', 0) + self.checkPatternError(r'(?i', 'missing -, : or )', 3) + self.checkPatternError(r'(?i+', 'missing -, : or )', 3) + self.checkPatternError(r'(?iz', 'unknown flag', 3) + def test_bug_6509(self): # Replacement strings of both types must parse properly. # all strings @@ -1529,9 +1561,9 @@ class ReTests(unittest.TestCase): with captured_stdout() as out: re.compile(pat, re.DEBUG) dump = '''\ -SUBPATTERN 1 +SUBPATTERN 1 0 0 LITERAL 46 -SUBPATTERN None +SUBPATTERN None 0 0 BRANCH IN LITERAL 99 @@ -1539,7 +1571,7 @@ SUBPATTERN None OR LITERAL 112 LITERAL 121 -SUBPATTERN None +SUBPATTERN None 0 0 GROUPREF_EXISTS 1 AT AT_END ELSE @@ -1655,7 +1687,7 @@ SUBPATTERN None self.checkPatternError(r'(?P', 'unexpected end of pattern', 3) self.checkPatternError(r'(?z)', 'unknown extension ?z', 1) self.checkPatternError(r'(?iz)', 'unknown flag', 3) - self.checkPatternError(r'(?i', 'missing )', 3) + self.checkPatternError(r'(?i', 'missing -, : or )', 3) self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0) self.checkPatternError(r'(?<', 'unexpected end of pattern', 3) self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1)