=== modified file 'Doc/library/re.rst' --- Doc/library/re.rst 2008-05-09 06:36:07 +0000 +++ Doc/library/re.rst 2008-05-29 18:49:52 +0000 @@ -229,6 +229,14 @@ *cannot* be retrieved after performing a match or referenced later in the pattern. +``(?P...)`` + Regular expressions of this form indicate a Python-specific + extension to the general Regular Expression syntax. The ``(?P...)`` + form is reserved for the Python programming language by agreement + between Larry Wall and Guido van Rossum that Perl shall never + implement any new extension to the Regular Expression syntax that is + of the form ``(?P...)``. + ``(?P<name>...)`` Similar to regular parentheses, but the substring matched by the group is accessible via the symbolic group name *name*. Group names must be valid Python @@ -248,6 +256,19 @@ ``(?#...)`` A comment; the contents of the parentheses are simply ignored. + .. note:: + + The first closing parenthesis encountered in the commented + expression will be interpreted as the comment's closing + parenthesis. For example, not only would + ``He(?# 2 (TWO) ls)llo`` **not** match the string expression + ``Hello``, the expression itself is not even a valid regular + expression as the comment would consist of the characters + `` 2 (TWO`` and then terminate with the first closing + parenthesis, and then, after the `` ls`` the compiler would + encounter another closing parenthesis which would not be balanced + and cause the regular expression engine to generate an error. + ``(?=...)`` Matches if ``...`` matches next, but doesn't consume any of the string. This is called a lookahead assertion. 
For example, ``Isaac (?=Asimov)`` will match === modified file 'Lib/re.py' --- Lib/re.py 2008-05-20 07:49:57 +0000 +++ Lib/re.py 2008-05-29 18:49:52 +0000 @@ -222,34 +222,54 @@ _pattern_type = type(sre_compile.compile("", 0)) -_MAXCACHE = 100 +_MAXCACHE = 256 +_CACHE_CLEAR = _MAXCACHE >> 1 + +_CACHE_IDX = -1 +_CACHE_REPL_IDX = -1 def _compile(*key): # internal: compile pattern + global _CACHE_IDX + _CACHE_IDX += 1 cachekey = (type(key[0]),) + key - p = _cache.get(cachekey) - if p is not None: + entry = _cache.get(cachekey) + if entry is not None: + p = entry[1] + _cache[cachekey] = (_CACHE_IDX, p) return p pattern, flags = key if isinstance(pattern, _pattern_type): if flags: raise ValueError('Cannot process flags argument with a compiled pattern') return pattern - if not sre_compile.isstring(pattern): + if not isinstance(pattern, basestring): raise TypeError, "first argument must be string or compiled pattern" try: p = sre_compile.compile(pattern, flags) except error, v: raise error, v # invalid expression if len(_cache) >= _MAXCACHE: + cache_items = _cache.items() _cache.clear() - _cache[cachekey] = p + + # Sort by index + cache_items.sort(key = lambda x: x[1][0]) + + # Reinsert the last + _cache.update(cache_items[-_CACHE_CLEAR:]) + + _cache[cachekey] = (_CACHE_IDX, p) return p def _compile_repl(*key): # internal: compile replacement pattern - p = _cache_repl.get(key) - if p is not None: + global _CACHE_REPL_IDX + _CACHE_REPL_IDX += 1 + entry = _cache_repl.get(key) + if entry is not None: + p = entry[1] + _cache_repl[key] = (_CACHE_REPL_IDX, p) return p repl, pattern = key try: @@ -257,8 +277,16 @@ except error, v: raise error, v # invalid expression if len(_cache_repl) >= _MAXCACHE: + cache_items = _cache_repl.items() _cache_repl.clear() - _cache_repl[key] = p + + # Sort by index + cache_items.sort(key = lambda x: x[1][0]) + + # Reinsert the last + _cache_repl.update(cache_items[-_CACHE_CLEAR:]) + + _cache_repl[key] = (_CACHE_REPL_IDX, p) return p def 
_expand(pattern, match, template): === modified file 'Lib/sre_compile.py' --- Lib/sre_compile.py 2008-04-08 21:27:42 +0000 +++ Lib/sre_compile.py 2008-05-24 21:31:18 +0000 @@ -11,7 +11,7 @@ """Internal support module for sre""" import _sre, sys -import sre_parse + from sre_constants import * assert _sre.MAGIC == MAGIC, "SRE module mismatch" @@ -149,7 +149,7 @@ emit(OPCODES[JUMP]) tailappend(_len(code)); emit(0) code[skip] = _len(code) - skip - emit(0) # end of branch + emit(OPCODES[FAILURE]) # end of branch for tail in tail: code[tail] = _len(code) - tail elif op is CATEGORY: @@ -470,19 +470,6 @@ _compile_charset(charset, flags, code) code[skip] = len(code) - skip -try: - unicode -except NameError: - STRING_TYPES = (type(""),) -else: - STRING_TYPES = (type(""), type(unicode(""))) - -def isstring(obj): - for tp in STRING_TYPES: - if isinstance(obj, tp): - return 1 - return 0 - def _code(p, flags): flags = p.pattern.flags | flags @@ -501,7 +488,8 @@ def compile(p, flags=0): # internal: convert pattern list to internal format - if isstring(p): + if isinstance(p, basestring): + import sre_parse pattern = p p = sre_parse.parse(p, flags) else: === modified file 'Lib/sre_constants.py' --- Lib/sre_constants.py 2004-08-25 02:22:30 +0000 +++ Lib/sre_constants.py 2008-05-24 21:31:18 +0000 @@ -13,7 +13,7 @@ # update when constants are added or removed -MAGIC = 20031017 +MAGIC = 20080329 # max code word in this release === modified file 'Lib/sre_parse.py' --- Lib/sre_parse.py 2006-12-19 08:17:50 +0000 +++ Lib/sre_parse.py 2008-05-29 18:49:52 +0000 @@ -424,8 +424,6 @@ # character set set = [] setappend = set.append -## if sourcematch(":"): -## pass # handle character classes if sourcematch("^"): setappend((NEGATE, None)) # check remaining characters === modified file 'Lib/test/test_re.py' --- Lib/test/test_re.py 2008-01-10 21:59:42 +0000 +++ Lib/test/test_re.py 2008-05-29 18:50:07 +0000 @@ -644,8 +644,8 @@ def test_inline_flags(self): # Bug #1700 - upper_char = unichr(0x1ea0) # 
Latin Capital Letter A with Dot Bellow - lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Bellow + upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Below + lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Below p = re.compile(upper_char, re.I | re.U) q = p.match(lower_char) @@ -672,7 +672,8 @@ self.assertNotEqual(q, None) def test_dollar_matches_twice(self): - "$ matches the end of string, and just before the terminating \n" + """Test that $ does not include \\n + $ matches the end of string, and just before the terminating \n""" pattern = re.compile('$') self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#') @@ -683,6 +684,111 @@ self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') self.assertEqual(pattern.sub('#', '\n'), '#\n#') + def test_caching_mechinism(self): + """Testing Caches + Verifies the following aspects of the Regular Expression Cache: + 1) The Cache invalidates half its entries when full + 2) The most recent half of the cache entries are kept on invalidation + 3) Verify that initially the cache entry does not exist + 4) Verify that after execution, a cache entry now exists + 5) Perform test on both compile and sub templates + 6) Test purging of the caches + """ + def as_expr(n): + return '(Regexp%03d)' % n + def as_strn(n): + return 'Regexp%03d' % n + def as_tmpl(n): + return r'\1-%03d-\1' % n + def as_resp(n): + return as_strn(n) + ('-%03d-' % n) + as_strn(n) + + # Test purging + # Force at least 1 entry in each cache + self.assertEqual(re.sub('(\w)', r'\1\1', 'a'), 'aa') + + # Verify there is at least one entry in each cache + self.assertNotEqual(len(re._cache), 0) + self.assertNotEqual(len(re._cache_repl), 0) + + # Purge and verify + re.purge() + self.assertEqual(len(re._cache), 0) + self.assertEqual(len(re._cache_repl), 0) + + # Used for testing when the cache overflows + expressions = [ ] + templates = [ ] + compiled_expr = [ ] + compiled_tmpl 
= [ ] + + # Check that items are cached + for x in range(re._MAXCACHE): + # Get the Corresponding Regular Expression + expr = as_expr(x) + expressions.append((str, expr, 0)) + + # Verify the cache is empty + self.assertFalse(expressions[-1] in re._cache) + + # Create a new expression + pattern = re.compile(expr) + + # Verify the expression is now in the cache + self.assertTrue(expressions[-1] in re._cache) + compiled_expr.append(re._cache.get(expressions[-1])) + + # Get the template expression, tmpl + tmpl = as_tmpl(x) + templates.append((tmpl, pattern)) + + # Verify the replacement cache is empty + self.assertFalse(templates[-1] in re._cache_repl) + + # Perform a new substitution, wherein the template will be + # compiled + pattern.sub(tmpl, "") + + # Verify the expression is now in the cache + self.assertTrue(templates[-1] in re._cache_repl) + compiled_tmpl.append(re._cache_repl.get(templates[-1])) + + # Get the reference source Strings for testing + strn = as_strn(x) + resp = as_resp(x) + + # Verify the expressions were generated correctly + self.assertEqual(pattern.sub(tmpl, strn), resp) + + # Force one more element to be added, clearing half the cache + self.assertEqual(re.sub('(\w)', r'\1\1', 'a'), 'aa') + + # Verify the oldest re._MAXCACHE - re._CACHE_CLEAR items have + # been removed + for idx in range(re._MAXCACHE - re._CACHE_CLEAR): + # Verify + self.assertEqual(re._cache.get(expressions[idx]), None) + self.assertEqual(re._cache_repl.get(templates[idx]), None) + + # Verify the newest re._CACHE_CLEAR items still exist + for idx in range(re._MAXCACHE - re._CACHE_CLEAR, re._MAXCACHE): + # Get the expected value for the Compiled Expression + if compiled_expr[idx] == None: + self.fail() + e = None + else: + e = compiled_expr[idx][1] + + # Get the expected value for the Compiled Template + if compiled_tmpl[idx] == None: + self.fail() + t = None + else: + t = compiled_tmpl[idx][1] + + # Verify + self.assertEqual(re._cache.get(expressions[idx])[1], e) + 
self.assertEqual(re._cache_repl.get(templates[idx])[1], t) def run_re_tests(): from test.re_tests import benchmarks, tests, SUCCEED, FAIL, SYNTAX_ERROR === modified file 'Modules/sre_constants.h' --- Modules/sre_constants.h 2003-10-17 22:13:16 +0000 +++ Modules/sre_constants.h 2008-05-24 21:31:18 +0000 @@ -11,7 +11,7 @@ * See the _sre.c file for information on usage and redistribution. */ -#define SRE_MAGIC 20031017 +#define SRE_MAGIC 20080329 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2