diff -r 8b1ac1a3d007 Doc/library/re.rst --- a/Doc/library/re.rst Wed Oct 08 22:32:50 2014 +0300 +++ b/Doc/library/re.rst Thu Oct 09 18:08:01 2014 +0300 @@ -520,7 +520,10 @@ form. current locale. The use of this flag is discouraged as the locale mechanism is very unreliable, and it only handles one "culture" at a time anyway; you should use Unicode matching instead, which is the default in Python 3 - for Unicode (str) patterns. + for Unicode (str) patterns. This flag makes sense only with bytes patterns. + + .. deprecated-removed:: 3.5 3.6 + Deprecated the use of re.LOCALE with re.ASCII or str patterns. .. data:: M diff -r 8b1ac1a3d007 Lib/sre_parse.py --- a/Lib/sre_parse.py Wed Oct 08 22:32:50 2014 +0300 +++ b/Lib/sre_parse.py Thu Oct 09 18:08:01 2014 +0300 @@ -754,6 +754,11 @@ def _parse(source, state): def fix_flags(src, flags): # Check and fix flags according to the type of pattern (str or bytes) if isinstance(src, str): + if flags & SRE_FLAG_LOCALE: + import warnings + warnings.warn("LOCALE flag with a str pattern is deprecated. " + "Will be an error in 3.6", + DeprecationWarning, stacklevel=6) if not flags & SRE_FLAG_ASCII: flags |= SRE_FLAG_UNICODE elif flags & SRE_FLAG_UNICODE: @@ -761,6 +766,11 @@ def fix_flags(src, flags): else: if flags & SRE_FLAG_UNICODE: raise ValueError("can't use UNICODE flag with a bytes pattern") + if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII: + import warnings + warnings.warn("ASCII and LOCALE flags are incompatible. " + "Will be an error in 3.6", + DeprecationWarning, stacklevel=6) return flags def parse(str, flags=0, pattern=None): diff -r 8b1ac1a3d007 Lib/test/test_re.py --- a/Lib/test/test_re.py Wed Oct 08 22:32:50 2014 +0300 +++ b/Lib/test/test_re.py Thu Oct 09 18:08:01 2014 +0300 @@ -495,10 +495,6 @@ class ReTests(unittest.TestCase): "abcd abc bcd bx", re.ASCII).group(1), "bx") self.assertEqual(re.search(r"\B(b.)\B", "abc bcd bc abxd", re.ASCII).group(1), "bx") - self.assertEqual(re.search(r"\b(b.)\b", - "abcd abc bcd bx", re.LOCALE).group(1), "bx") - self.assertEqual(re.search(r"\B(b.)\B", - "abc bcd bc abxd", re.LOCALE).group(1), "bx") self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc") self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc") self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M)) @@ -519,8 +515,6 @@ class ReTests(unittest.TestCase): b"1aa! a").group(0), b"1aa! a") self.assertEqual(re.search(r"\d\D\w\W\s\S", "1aa! a", re.ASCII).group(0), "1aa! a") - self.assertEqual(re.search(r"\d\D\w\W\s\S", - "1aa! a", re.LOCALE).group(0), "1aa! a") self.assertEqual(re.search(br"\d\D\w\W\s\S", b"1aa! a", re.LOCALE).group(0), b"1aa! a") @@ -602,9 +596,12 @@ class ReTests(unittest.TestCase): self.assertEqual(_sre.getlower(ord('A'), 0), ord('a')) self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a')) self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a')) + self.assertEqual(_sre.getlower(ord('A'), re.ASCII), ord('a')) self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC") + self.assertEqual(re.match("abc", "ABC", re.I|re.A).group(0), "ABC") + self.assertEqual(re.match(b"abc", b"ABC", re.I|re.L).group(0), b"ABC") def test_not_literal(self): self.assertEqual(re.search("\s([^a])", " b").group(1), "b") @@ -689,8 +686,10 @@ class ReTests(unittest.TestCase): self.assertEqual(re.X, re.VERBOSE) def test_flags(self): - for flag in [re.I, re.M, re.X, re.S, re.L]: + for flag in [re.I, re.M, re.X, re.S, re.A, re.U]: self.assertTrue(re.compile('^pattern$', flag)) + for flag in [re.I, re.M, re.X, re.S, re.A, re.L]: + self.assertTrue(re.compile(b'^pattern$', flag)) def test_sre_character_literals(self): for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]: @@ -1055,6 +1054,52 @@ class ReTests(unittest.TestCase): self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE) self.assertRaises(ValueError, re.compile, '(?au)\w') + def test_locale_flag(self): + import locale + _, enc = locale.getlocale(locale.LC_CTYPE) + # Search non-ASCII letter + for i in range(128, 256): + try: + c = bytes([i]).decode(enc) + sletter = c.lower() + if sletter == c: continue + bletter = sletter.encode(enc) + if len(bletter) != 1: continue + if bletter.decode(enc) != sletter: continue + bpat = re.escape(bytes([i])) + break + except (UnicodeError, TypeError): + pass + else: + bletter = None + bpat = b'A' + # Bytes patterns + pat = re.compile(bpat, re.LOCALE | re.IGNORECASE) + if bletter: + self.assertTrue(pat.match(bletter)) + pat = re.compile(b'(?L)' + bpat, re.IGNORECASE) + if bletter: + self.assertTrue(pat.match(bletter)) + pat = re.compile(bpat, re.IGNORECASE) + if bletter: + self.assertIsNone(pat.match(bletter)) + pat = re.compile(b'\w', re.LOCALE) + if bletter: + self.assertTrue(pat.match(bletter)) + pat = re.compile(b'(?L)\w') + if bletter: + self.assertTrue(pat.match(bletter)) + pat = re.compile(b'\w') + if bletter: + self.assertIsNone(pat.match(bletter)) + # Incompatibilities + self.assertWarns(DeprecationWarning, re.compile, '', re.LOCALE) + self.assertWarns(DeprecationWarning, re.compile, '(?L)') + self.assertWarns(DeprecationWarning, re.compile, b'', re.LOCALE | re.ASCII) + self.assertWarns(DeprecationWarning, re.compile, b'(?L)', re.ASCII) + self.assertWarns(DeprecationWarning, re.compile, b'(?a)', re.LOCALE) + self.assertWarns(DeprecationWarning, re.compile, b'(?aL)') + def test_bug_6509(self): # Replacement strings of both types must parse properly. # all strings @@ -1314,6 +1359,10 @@ class PatternReprTests(unittest.TestCase self.check_flags(b'bytes pattern', re.A, "re.compile(b'bytes pattern', re.ASCII)") + def test_locale(self): + self.check_flags(b'bytes pattern', re.L, + "re.compile(b'bytes pattern', re.LOCALE)") + def test_quotes(self): self.check('random "double quoted" pattern', '''re.compile('random "double quoted" pattern')''') @@ -1431,16 +1480,23 @@ def run_re_tests(): pass else: try: - bpat = re.compile(bpat) + obj = re.compile(bpat) except Exception: print('=== Fails on bytes pattern compile', t) if verbose: traceback.print_exc(file=sys.stdout) else: - bytes_result = bpat.search(bs) + bytes_result = obj.search(bs) if bytes_result is None: print('=== Fails on bytes pattern match', t) + # Try the match with LOCALE enabled, and check that it + # still succeeds. + obj = re.compile(bpat, re.LOCALE) + result = obj.search(bs) + if result is None: + print('=== Fails on locale-sensitive match', t) + # Try the match with the search area limited to the extent # of the match and see if it still succeeds. \B will # break (because it won't match at the end or start of a @@ -1460,14 +1516,6 @@ def run_re_tests(): if result is None: print('=== Fails on case-insensitive match', t) - # Try the match with LOCALE enabled, and check that it - # still succeeds. - if '(?u)' not in pattern: - obj = re.compile(pattern, re.LOCALE) - result = obj.search(s) - if result is None: - print('=== Fails on locale-sensitive match', t) - # Try the match with UNICODE locale enabled, and check # that it still succeeds. obj = re.compile(pattern, re.UNICODE)