=== modified file 'Doc/library/re.rst' --- Doc/library/re.rst 2008-05-09 06:36:07 +0000 +++ Doc/library/re.rst 2008-05-24 21:32:48 +0000 @@ -229,6 +229,14 @@ *cannot* be retrieved after performing a match or referenced later in the pattern. +``(?P...)`` + Regular expressions of this form indicate a Python-Specific + extension to the general Regular Expression syntax. The ``(?P...)`` + form is reserved for the Python programming language by agreement + between Larry Wall and Guido van Rossum that Perl shall never + implement any new extension to the Regular Expression syntax that is + of the form ``(?P...)``. + ``(?P<name>...)`` Similar to regular parentheses, but the substring matched by the group is accessible via the symbolic group name *name*. Group names must be valid Python @@ -248,6 +256,29 @@ ``(?#...)`` A comment; the contents of the parentheses are simply ignored. + .. note:: + + The first closing parenthesis encountered in the commented + expression will be interpreted as the comment's closing + parenthesis. For example, not only would + ``He(?# 2 (TWO) ls)llo`` **not** match the string expression + ``Hello``, the expression itself is not even a valid regular + expression as the comment would consist of the characters + `` 2 (TWO`` and then terminate with the first closing + parenthesis, and then, after the `` ls`` the compiler would + encounter another closing parenthesis which would not be balanced + and cause the regular expression engine to generate an error. + +``(?P#...)`` + A Parentheses-balanced comment. Like the standard comment, text + between the parentheses is ignored, but in addition to this, if + there are balanced parentheses within the commented expression, + these too will be ignored until the balancing closing parenthesis + is encountered. Also, an escaped closing parenthesis is ignored + as part of the sequence of balancing parentheses. 
For example, + ``(?P# 6\) There is no rule SIX (6))`` would be a well-formed + regular expression that was a complete comment. + ``(?=...)`` Matches if ``...`` matches next, but doesn't consume any of the string. This is called a lookahead assertion. For example, ``Isaac (?=Asimov)`` will match === modified file 'Lib/re.py' --- Lib/re.py 2008-05-20 07:49:57 +0000 +++ Lib/re.py 2008-05-24 18:56:21 +0000 @@ -235,7 +235,7 @@ if flags: raise ValueError('Cannot process flags argument with a compiled pattern') return pattern - if not sre_compile.isstring(pattern): + if not isinstance(pattern, basestring): raise TypeError, "first argument must be string or compiled pattern" try: p = sre_compile.compile(pattern, flags) === modified file 'Lib/sre_compile.py' --- Lib/sre_compile.py 2008-04-08 21:27:42 +0000 +++ Lib/sre_compile.py 2008-05-24 21:31:18 +0000 @@ -11,7 +11,7 @@ """Internal support module for sre""" import _sre, sys -import sre_parse + from sre_constants import * assert _sre.MAGIC == MAGIC, "SRE module mismatch" @@ -149,7 +149,7 @@ emit(OPCODES[JUMP]) tailappend(_len(code)); emit(0) code[skip] = _len(code) - skip - emit(0) # end of branch + emit(OPCODES[FAILURE]) # end of branch for tail in tail: code[tail] = _len(code) - tail elif op is CATEGORY: @@ -470,19 +470,6 @@ _compile_charset(charset, flags, code) code[skip] = len(code) - skip -try: - unicode -except NameError: - STRING_TYPES = (type(""),) -else: - STRING_TYPES = (type(""), type(unicode(""))) - -def isstring(obj): - for tp in STRING_TYPES: - if isinstance(obj, tp): - return 1 - return 0 - def _code(p, flags): flags = p.pattern.flags | flags @@ -501,7 +488,8 @@ def compile(p, flags=0): # internal: convert pattern list to internal format - if isstring(p): + if isinstance(p, basestring): + import sre_parse pattern = p p = sre_parse.parse(p, flags) else: === modified file 'Lib/sre_constants.py' --- Lib/sre_constants.py 2004-08-25 02:22:30 +0000 +++ Lib/sre_constants.py 2008-05-24 21:31:18 +0000 @@ -13,7 +13,7 
@@ # update when constants are added or removed -MAGIC = 20031017 +MAGIC = 20080329 # max code word in this release === modified file 'Lib/sre_parse.py' --- Lib/sre_parse.py 2006-12-19 08:17:50 +0000 +++ Lib/sre_parse.py 2008-05-24 21:32:48 +0000 @@ -424,8 +424,6 @@ # character set set = [] setappend = set.append -## if sourcematch(":"): -## pass # handle character classes if sourcematch("^"): setappend((NEGATE, None)) # check remaining characters @@ -568,6 +566,27 @@ raise error, "unknown group name" subpatternappend((GROUPREF, gid)) continue + elif sourcematch("#"): + # Python-Specific Comment -- allows for nested + # paren + depth = 1 + while 1: + if sourcematch("\\"): + # Ignore escaped characters + if not source.next: + break + elif source.next == "(": + depth += 1 + elif source.next == ")": + depth -= 1 + if not depth: + break + if source.next is None: + break + sourceget() + if not sourcematch(")"): + raise error, "unbalanced parenthesis" + continue else: char = sourceget() if char is None: === modified file 'Lib/test/test_re.py' --- Lib/test/test_re.py 2008-01-10 21:59:42 +0000 +++ Lib/test/test_re.py 2008-05-24 21:32:48 +0000 @@ -644,8 +644,8 @@ def test_inline_flags(self): # Bug #1700 - upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Bellow - lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Bellow + upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Below + lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Below p = re.compile(upper_char, re.I | re.U) q = p.match(lower_char) @@ -683,6 +683,33 @@ self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') self.assertEqual(pattern.sub('#', '\n'), '#\n#') + def test_nested_parenthesis_in_comments(self): + """Verify that 'Hello' matches + 'Hell(?P# not the (really) bad place)o' but not + 'Hell(?# not the (really) bad place)o' (which is invalid).""" + self.assertRaises(re.error, re.compile, + 'Hell(?# not the (really) bad place)o') + + goodHello = 'Hello' + 
badHello = 'Hell bad place)o' + + patPyComment = re.compile('Hell(?P# not the (really) bad place)o') + self.assertEqual(patPyComment.match(goodHello).group(0), goodHello) + self.assertEqual(patPyComment.match(badHello), None) + + goodWorld = 'Hello World!' + badWorld = 'Hello ((Planet)))World!' + + self.assertRaises(re.error, re.compile, + r'Hello (?# 3\) ((Planet)))World!') + #patNumComment = re.compile(r'Hello (?# 3\) ((Planet)))World!') + #self.assertEqual(patNumComment.match(goodWorld), None) + #self.assertEqual(patNumComment.match(badWorld).group(0), badWorld) + + patNumPyComment = re.compile(r'Hello (?P# 3\) ((Planet)))World!') + self.assertEqual(patNumPyComment.match(goodWorld).group(0), goodWorld) + self.assertEqual(patNumPyComment.match(badWorld), None) + def run_re_tests(): from test.re_tests import benchmarks, tests, SUCCEED, FAIL, SYNTAX_ERROR === modified file 'Modules/sre_constants.h' --- Modules/sre_constants.h 2003-10-17 22:13:16 +0000 +++ Modules/sre_constants.h 2008-05-24 21:31:18 +0000 @@ -11,7 +11,7 @@ * See the _sre.c file for information on usage and redistribution. */ -#define SRE_MAGIC 20031017 +#define SRE_MAGIC 20080329 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2