diff -r 4f7c5349e801 Lib/re.py
--- a/Lib/re.py	Wed Feb 18 08:05:05 2015 +0200
+++ b/Lib/re.py	Thu Feb 19 21:48:42 2015 +0200
@@ -352,7 +352,7 @@ class Scanner:
         s.flags = flags
         for phrase, action in lexicon:
             p.append(sre_parse.SubPattern(s, [
-                (SUBPATTERN, (len(p)+1, sre_parse.parse(phrase, flags))),
+                (SUBPATTERN, (len(p)+1, 0, 0, sre_parse.parse(phrase, flags))),
                 ]))
         s.groups = len(p)+1
         p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
diff -r 4f7c5349e801 Lib/sre_compile.py
--- a/Lib/sre_compile.py	Wed Feb 18 08:05:05 2015 +0200
+++ b/Lib/sre_compile.py	Thu Feb 19 21:48:42 2015 +0200
@@ -71,7 +71,8 @@ def _compile(code, pattern, flags):
     ASSERT_CODES = _ASSERT_CODES
     if (flags & SRE_FLAG_IGNORECASE and
             not (flags & SRE_FLAG_LOCALE) and
-            flags & SRE_FLAG_UNICODE):
+            flags & SRE_FLAG_UNICODE and
+            not (flags & SRE_FLAG_ASCII)):
         fixes = _ignorecase_fixes
     else:
         fixes = None
@@ -137,14 +138,15 @@ def _compile(code, pattern, flags):
                 else:
                     emit(MIN_UNTIL)
         elif op is SUBPATTERN:
-            if av[0]:
+            group, add_flags, del_flags, p = av
+            if group:
                 emit(MARK)
-                emit((av[0]-1)*2)
-            # _compile_info(code, av[1], flags)
-            _compile(code, av[1], flags)
-            if av[0]:
+                emit((group-1)*2)
+            # _compile_info(code, p, (flags | add_flags) & ~del_flags)
+            _compile(code, p, (flags | add_flags) & ~del_flags)
+            if group:
                 emit(MARK)
-                emit((av[0]-1)*2+1)
+                emit((group-1)*2+1)
         elif op in SUCCESS_CODES:
             emit(op)
         elif op in ASSERT_CODES:
@@ -172,7 +174,7 @@ def _compile(code, pattern, flags):
                 av = AT_MULTILINE.get(av, av)
             if flags & SRE_FLAG_LOCALE:
                 av = AT_LOCALE.get(av, av)
-            elif flags & SRE_FLAG_UNICODE:
+            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                 av = AT_UNICODE.get(av, av)
             emit(av)
         elif op is BRANCH:
@@ -193,7 +195,7 @@ def _compile(code, pattern, flags):
             emit(op)
             if flags & SRE_FLAG_LOCALE:
                 av = CH_LOCALE[av]
-            elif flags & SRE_FLAG_UNICODE:
+            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                 av = CH_UNICODE[av]
             emit(av)
         elif op is GROUPREF:
@@ -237,7 +239,7 @@ def _compile_charset(charset, flags, cod
         elif op is CATEGORY:
             if flags & SRE_FLAG_LOCALE:
                 emit(CH_LOCALE[av])
-            elif flags & SRE_FLAG_UNICODE:
+            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                 emit(CH_UNICODE[av])
             else:
                 emit(av)
@@ -432,8 +434,11 @@ def _compile_info(code, pattern, flags):
                 if len(prefix) == prefix_skip:
                     prefix_skip = prefix_skip + 1
                 prefixappend(av)
-            elif op is SUBPATTERN and len(av[1]) == 1:
-                op, av = av[1][0]
+            elif (op is SUBPATTERN and len(av[3]) == 1 and
+                  not (av[1] & SRE_FLAG_IGNORECASE)):
+                # av[1] is add_flags.  NOTE(review): a group that turns
+                # IGNORECASE on presumably must not feed the prefix; confirm
+                op, av = av[3][0]
                 if op is LITERAL:
                     prefixappend(av)
                 else:
@@ -443,8 +448,10 @@ def _compile_info(code, pattern, flags):
         # if no prefix, look for charset prefix
         if not prefix and pattern.data:
             op, av = pattern.data[0]
-            if op is SUBPATTERN and av[1]:
-                op, av = av[1][0]
+            if (op is SUBPATTERN and av[3] and
+                    not (av[1] & SRE_FLAG_IGNORECASE)):
+                # same guard as for the literal prefix (av[1] is add_flags)
+                op, av = av[3][0]
                 if op is LITERAL:
                     charsetappend((op, av))
                 elif op is BRANCH:
@@ -539,7 +546,8 @@ def compile(p, flags=0):
 
     code = _code(p, flags)
 
-    # print(code)
+    if p.pattern.flags & SRE_FLAG_DEBUG:
+        print(code)
 
     # map in either direction
     groupindex = p.pattern.groupdict
diff -r 4f7c5349e801 Lib/sre_parse.py
--- a/Lib/sre_parse.py	Wed Feb 18 08:05:05 2015 +0200
+++ b/Lib/sre_parse.py	Thu Feb 19 21:48:42 2015 +0200
@@ -64,6 +64,12 @@ FLAGS = {
     "u": SRE_FLAG_UNICODE,
 }
 
+GLOBAL_FLAGS = (SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE |
+                SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE)
+
+class Verbose(Exception):
+    pass
+
 class Pattern:
     # master pattern object.  keeps track of global attributes
     def __init__(self):
@@ -173,7 +179,7 @@ class SubPattern:
                 lo = lo + i
                 hi = hi + j
             elif op is SUBPATTERN:
-                i, j = av[1].getwidth()
+                i, j = av[3].getwidth()
                 lo = lo + i
                 hi = hi + j
             elif op in _REPEATCODES:
@@ -387,14 +393,14 @@ def _escape(source, escape, state):
         pass
     raise source.error("bogus escape: %r" % escape, len(escape))
 
-def _parse_sub(source, state, nested=True):
+def _parse_sub(source, state, verbose, nested=True):
     # parse an alternation: a|b|c
 
     items = []
     itemsappend = items.append
     sourcematch = source.match
     while True:
-        itemsappend(_parse(source, state))
+        itemsappend(_parse(source, state, verbose))
         if not sourcematch("|"):
             break
     if nested and source.next is not None and source.next != ")":
@@ -438,10 +444,10 @@ def _parse_sub(source, state, nested=Tru
     subpattern.append((BRANCH, (None, items)))
     return subpattern
 
-def _parse_sub_cond(source, state, condgroup):
-    item_yes = _parse(source, state)
+def _parse_sub_cond(source, state, condgroup, verbose):
+    item_yes = _parse(source, state, verbose)
     if source.match("|"):
-        item_no = _parse(source, state)
+        item_no = _parse(source, state, verbose)
         if source.next == "|":
             raise source.error("conditional backref with more than two branches")
     else:
@@ -452,7 +458,7 @@ def _parse_sub_cond(source, state, condg
     subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
     return subpattern
 
-def _parse(source, state):
+def _parse(source, state, verbose):
     # parse a simple pattern
     subpattern = SubPattern(state)
 
@@ -462,7 +468,6 @@ def _parse(source, state):
     sourcematch = source.match
     _len = len
     _ord = ord
-    verbose = state.flags & SRE_FLAG_VERBOSE
 
     while True:
 
@@ -610,6 +615,8 @@ def _parse(source, state):
             group = 1
             name = None
             condgroup = None
+            add_flags = 0
+            del_flags = 0
             if sourcematch("?"):
                 group = 0
                 # options
@@ -668,7 +675,7 @@ def _parse(source, state):
                         if char is None or char not in "=!":
                             raise source.error("syntax error")
                         dir = -1 # lookbehind
-                    p = _parse_sub(source, state)
+                    p = _parse_sub(source, state, verbose)
                     if not sourcematch(")"):
                         raise source.error("unbalanced parenthesis")
                     if char == "=":
@@ -701,12 +708,13 @@ def _parse(source, state):
                         if condgroup >= MAXGROUPS:
                             raise source.error("the group number is too large",
                                                len(condname) + 1)
-                elif char in FLAGS:
+                elif char in FLAGS or char == "-":
                     # flags
-                    state.flags |= FLAGS[char]
-                    while source.next in FLAGS:
-                        state.flags |= FLAGS[sourceget()]
-                    verbose = state.flags & SRE_FLAG_VERBOSE
+                    flags = _parse_flags(source, state, char)
+                    if flags is None:
+                        continue
+                    add_flags, del_flags = flags
+                    group = 2
                 else:
                     raise source.error("unexpected end of pattern")
             if group:
@@ -720,14 +728,16 @@ def _parse(source, state):
                     except error as err:
                         raise source.error(err.msg, len(name) + 1)
                 if condgroup:
-                    p = _parse_sub_cond(source, state, condgroup)
+                    p = _parse_sub_cond(source, state, condgroup, verbose)
                 else:
-                    p = _parse_sub(source, state)
+                    sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE))
+                                   and not (del_flags & SRE_FLAG_VERBOSE))
+                    p = _parse_sub(source, state, sub_verbose)
                 if not sourcematch(")"):
                     raise source.error("unbalanced parenthesis")
                 if group is not None:
                     state.closegroup(group)
-                subpatternappend((SUBPATTERN, (group, p)))
+                subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
             else:
                 while True:
                     char = sourceget()
@@ -748,6 +758,64 @@ def _parse(source, state):
 
     return subpattern
 
+def _parse_flags(source, state, char):
+    # parse inline flags: "(?flags)", "(?flags:", "(?flags-flags:" or
+    # "(?-flags:".  char is the flag letter or "-" the caller already read.
+    # global form: merges the flags into state.flags and returns None
+    # (raises Verbose if that turns verbose mode on for the first time).
+    # scoped form: returns (add_flags, del_flags); ":" has been consumed.
+    sourceget = source.get
+    add_flags = 0
+    del_flags = 0
+    if char != "-":
+        while True:
+            add_flags |= FLAGS[char]
+            char = sourceget()
+            if char is None:
+                raise source.error("missing -, : or )")
+            if char in ")-:":
+                break
+            if char not in FLAGS:
+                if char.isalpha():
+                    raise source.error("unknown flag", len(char))
+                else:
+                    raise source.error("missing -, : or )", len(char))
+    if char == ")":
+        if ((add_flags & SRE_FLAG_VERBOSE) and
+            not (state.flags & SRE_FLAG_VERBOSE)):
+            raise Verbose
+        state.flags |= add_flags
+        return None
+    if add_flags & GLOBAL_FLAGS:
+        raise source.error("bad inline flags: cannot turn on global flag", 1)
+    if char == "-":
+        char = sourceget()
+        if char is None:
+            raise source.error("missing flag")
+        if char not in FLAGS:
+            if char.isalpha():
+                raise source.error("unknown flag", len(char))
+            else:
+                raise source.error("missing flag", len(char))
+        while True:
+            del_flags |= FLAGS[char]
+            char = sourceget()
+            if char is None:
+                raise source.error("missing :")
+            if char == ":":
+                break
+            if char not in FLAGS:
+                if char.isalpha():
+                    raise source.error("unknown flag", len(char))
+                else:
+                    raise source.error("missing :", len(char))
+    assert char == ":"
+    if del_flags & GLOBAL_FLAGS:
+        raise source.error("bad inline flags: cannot turn off global flag", 1)
+    if add_flags & del_flags:
+        raise source.error("bad inline flags: flag turned on and off", 1)
+    return add_flags, del_flags
+
 def fix_flags(src, flags):
     # Check and fix flags according to the type of pattern (str or bytes)
     if isinstance(src, str):
@@ -780,7 +848,19 @@ def parse(str, flags=0, pattern=None):
     pattern.flags = flags
     pattern.str = str
 
-    p = _parse_sub(source, pattern, 0)
+    try:
+        p = _parse_sub(source, pattern, flags & SRE_FLAG_VERBOSE, False)
+    except Verbose:
+        ## the VERBOSE flag was switched on inside the pattern.  to be
+        ## on the safe side, we'll parse the whole thing again...
+        pattern = Pattern()
+        pattern.flags = flags | SRE_FLAG_VERBOSE
+        pattern.str = str
+        ## rewind first: the aborted pass already consumed part of source.
+        ## NOTE(review): assumes Tokenizer provides seek() -- confirm.
+        source.seek(0)
+        p = _parse_sub(source, pattern, True, False)
+
     p.pattern.flags = fix_flags(str, p.pattern.flags)
 
     if source.next is not None:
@@ -793,11 +873,6 @@ def parse(str, flags=0, pattern=None):
     if flags & SRE_FLAG_DEBUG:
         p.dump()
 
-    if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
-        # the VERBOSE flag was switched on inside the pattern.  to be
-        # on the safe side, we'll parse the whole thing again...
-        return parse(str, p.pattern.flags)
-
     return p
 
 def parse_template(source, pattern):
diff -r 4f7c5349e801 Lib/test/test_re.py
--- a/Lib/test/test_re.py	Wed Feb 18 08:05:05 2015 +0200
+++ b/Lib/test/test_re.py	Thu Feb 19 21:48:42 2015 +0200
@@ -28,6 +28,15 @@ class B(bytes):
 
 class ReTests(unittest.TestCase):
 
+    def checkPatternError(self, pattern, errmsg, pos=None):
+        with self.assertRaises(re.error) as cm:
+            re.compile(pattern)
+        with self.subTest(pattern=pattern):
+            err = cm.exception
+            self.assertEqual(err.msg, errmsg)
+            if pos is not None:
+                self.assertEqual(err.pos, pos)
+
     def assertTypedEqual(self, actual, expect, msg=None):
         self.assertEqual(actual, expect, msg)
         def recurse(actual, expect):
@@ -1212,6 +1221,37 @@ class ReTests(unittest.TestCase):
         self.assertWarns(DeprecationWarning, re.compile, b'(?a)', re.LOCALE)
         self.assertWarns(DeprecationWarning, re.compile, b'(?aL)')
 
+    def test_scoped_flags(self):
+        # Issue #433028
+        self.assertTrue(re.match(r'(?i:a)b', 'Ab'))
+        self.assertIsNone(re.match(r'(?i:a)b', 'aB'))
+        self.assertIsNone(re.match(r'(?-i:a)b', 'Ab', re.IGNORECASE))
+        self.assertTrue(re.match(r'(?-i:a)b', 'aB', re.IGNORECASE))
+        self.assertIsNone(re.match(r'(?i:(?-i:a)b)', 'Ab'))
+        self.assertTrue(re.match(r'(?i:(?-i:a)b)', 'aB'))
+        # scoped IGNORECASE must hold for search() as well as match()
+        self.assertTrue(re.search(r'(?i:a)b', 'Ab'))
+        self.assertIsNone(re.search(r'(?i:a)b', 'aB'))
+
+        self.assertTrue(re.match(r'(?x: a) b', 'a b'))
+        self.assertIsNone(re.match(r'(?x: a) b', ' a b'))
+        self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE))
+        self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE))
+        # a global (?x) after the start of the pattern forces a re-parse
+        self.assertTrue(re.match(r'a(?x) b', 'ab'))
+
+        self.checkPatternError(r'(?-', 'missing flag', 3)
+        self.checkPatternError(r'(?-+', 'missing flag', 3)
+        self.checkPatternError(r'(?-z', 'unknown flag', 3)
+        self.checkPatternError(r'(?-i', 'missing :', 4)
+        self.checkPatternError(r'(?-i)', 'missing :', 4)
+        self.checkPatternError(r'(?-i+', 'missing :', 4)
+        self.checkPatternError(r'(?i:', 'unbalanced parenthesis', 4)
+        self.checkPatternError(r'(?i+', 'missing -, : or )', 3)
+        self.checkPatternError(r'(?iz', 'unknown flag', 3)
+        self.checkPatternError(r'(?i-i:a)',
+                               'bad inline flags: flag turned on and off')
+
     def test_bug_6509(self):
         # Replacement strings of both types must parse properly.
         # all strings