Add \uXXXX and \UXXXXXXXX escapes to the regular expression parser.

* Doc/library/re.rst: list \u and \U among the accepted escapes.
* Lib/sre_parse.py:
  - _class_escape: \u takes exactly four hex digits, \U exactly eight;
    a \U value above sys.maxunicode is rejected as a bogus escape.
  - _escape: same digit counts; a \U value above 0x10FFFF is rejected.
  - _parse: a LITERAL above sys.maxunicode returned by _escape is emitted
    as two LITERALs (high and low surrogate); character-range bounds above
    sys.maxunicode are rejected.
* Lib/test/test_re.py: tests for the new escapes.

diff -r 8ddf40f68def Doc/library/re.rst
--- a/Doc/library/re.rst	Fri Jun 01 00:07:28 2012 -0500
+++ b/Doc/library/re.rst	Fri Jun 01 09:39:49 2012 +0300
@@ -414,8 +414,8 @@
 accepted by the regular expression parser::
 
    \a      \b      \f      \n
-   \r      \t      \v      \x
-   \\
+   \r      \t      \u      \U
+   \v      \x      \\
 
 (Note that ``\b`` is used to represent word boundaries, and means "backspace"
 only inside character classes.)
diff -r 8ddf40f68def Lib/sre_parse.py
--- a/Lib/sre_parse.py	Fri Jun 01 00:07:28 2012 -0500
+++ b/Lib/sre_parse.py	Fri Jun 01 09:39:49 2012 +0300
@@ -247,6 +247,23 @@
             if len(escape) != 2:
                 raise error("bogus escape: %s" % repr("\\" + escape))
             return LITERAL, int(escape, 16) & 0xff
+        elif c == "u":
+            # unicode escape (exactly four digits)
+            while source.next in HEXDIGITS and len(escape) < 6:
+                escape = escape + source.get()
+            if len(escape) != 6:
+                raise error("bogus escape: %s" % repr(escape))
+            return LITERAL, int(escape[2:], 16)
+        elif c == "U":
+            # unicode escape (exactly eight digits)
+            while source.next in HEXDIGITS and len(escape) < 10:
+                escape = escape + source.get()
+            if len(escape) != 10:
+                raise error("bogus escape: %s" % repr(escape))
+            c = int(escape[2:], 16)
+            if c > sys.maxunicode:
+                raise error("bogus escape: %s" % repr(escape))
+            return LITERAL, c
         elif c in OCTDIGITS:
             # octal escape (up to three digits)
             while source.next in OCTDIGITS and len(escape) < 4:
@@ -278,6 +295,23 @@
             if len(escape) != 4:
                 raise ValueError
             return LITERAL, int(escape[2:], 16) & 0xff
+        elif c == "u":
+            # unicode escape (exactly four digits)
+            while source.next in HEXDIGITS and len(escape) < 6:
+                escape = escape + source.get()
+            if len(escape) != 6:
+                raise ValueError
+            return LITERAL, int(escape[2:], 16)
+        elif c == "U":
+            # unicode escape (exactly eight digits)
+            while source.next in HEXDIGITS and len(escape) < 10:
+                escape = escape + source.get()
+            if len(escape) != 10:
+                raise ValueError
+            c = int(escape[2:], 16)
+            if c > 0x10FFFF:
+                raise ValueError
+            return LITERAL, c
         elif c == "0":
             # octal escape
             while source.next in OCTDIGITS and len(escape) < 4:
@@ -457,6 +491,9 @@
                                 raise error("bad character range")
                             lo = code1[1]
                             hi = code2[1]
+
+                            if lo > sys.maxunicode or hi > sys.maxunicode:
+                                raise error("bad character range")
                             if hi < lo:
                                 raise error("bad character range")
                             setappend((RANGE, (lo, hi)))
@@ -660,7 +697,18 @@
 
         elif this and this[0] == "\\":
             code = _escape(source, this, state)
-            subpatternappend(code)
+            if code[0] == LITERAL and code[1] > sys.maxunicode:
+                # translate from 10000..10FFFF to 0..FFFF
+                c = code[1] - 0x10000
+                # high surrogate = top 10 bits added to D800
+                c1 = 0xD800 + (c >> 10)
+                # low surrogate = bottom 10 bits added to DC00
+                c2 = 0xDC00 + (c & 0x03FF)
+
+                subpatternappend((LITERAL, c1))
+                subpatternappend((LITERAL, c2))
+            else:
+                subpatternappend(code)
 
         else:
             raise error("parser error")
diff -r 8ddf40f68def Lib/test/test_re.py
--- a/Lib/test/test_re.py	Fri Jun 01 00:07:28 2012 -0500
+++ b/Lib/test/test_re.py	Fri Jun 01 09:39:49 2012 +0300
@@ -533,8 +533,12 @@
             self.assertNotEqual(re.match(r"\x%02x" % i, chr(i)), None)
             self.assertNotEqual(re.match(r"\x%02x0" % i, chr(i)+"0"), None)
             self.assertNotEqual(re.match(r"\x%02xz" % i, chr(i)+"z"), None)
+        self.assertNotEqual(re.match(r"\u1234", "\u1234"), None)
+        self.assertNotEqual(re.match(r"\U00001234", "\u1234"), None)
         self.assertRaises(re.error, re.match, "\911", "")
+        self.assertRaises(re.error, re.match, r"\U00110000", "")
+        self.assertEqual(re.match(r"\U00100000", "\U00100000").group(), "\U00100000")
 
     def test_sre_character_class_literals(self):
         for i in [0, 8, 16, 32, 64, 127, 128, 255]:
             self.assertNotEqual(re.match(r"[\%03o]" % i, chr(i)), None)
@@ -543,8 +547,14 @@
             self.assertNotEqual(re.match(r"[\x%02x]" % i, chr(i)), None)
             self.assertNotEqual(re.match(r"[\x%02x0]" % i, chr(i)), None)
             self.assertNotEqual(re.match(r"[\x%02xz]" % i, chr(i)), None)
+        self.assertNotEqual(re.match(r"[\u1234-\u1236]", "\u1235"), None)
+        self.assertNotEqual(re.match(r"[\U00001234]", "\u1234"), None)
         self.assertRaises(re.error, re.match, "[\911]", "")
+        self.assertRaises(re.error, re.match, r"[\U0011ffff]", "")
+        if sys.maxunicode <= 0xffff:
+            self.assertRaises(re.error, re.match, r"[\U00100000]", "")
+            self.assertRaises(re.error, re.match, r"[\U00100000-\U001000FF]", "")
 
     def test_bug_113254(self):
         self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
         self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)