Add \uXXXX and \UXXXXXXXX escapes to the regular expression parser.

* Doc/library/re.rst: list \u and \U among the escapes the parser accepts.
* Lib/test/test_re.py: cover the new escapes as plain literals and inside
  character classes, including code points that are out of range.
* Lib/sre_parse.py: parse the escapes in _class_escape() and _escape().
  Inside a character class a code point above sys.maxunicode is rejected;
  outside a class it is emitted as a UTF-16 surrogate pair (two LITERALs)
  so that narrow builds can still match it.

Index: Doc/library/re.rst
===================================================================
--- Doc/library/re.rst	(revision 82756)
+++ Doc/library/re.rst	(working copy)
@@ -400,8 +400,8 @@
 accepted by the regular expression parser::
 
    \a      \b      \f      \n
-   \r      \t      \v      \x
-   \\
+   \r      \t      \u      \U
+   \v      \x      \\
 
 Octal escapes are included in a limited form: If the first digit is a 0, or if
 there are three octal digits, it is considered an octal escape. Otherwise, it is
Index: Lib/test/test_re.py
===================================================================
--- Lib/test/test_re.py	(revision 82756)
+++ Lib/test/test_re.py	(working copy)
@@ -462,8 +462,12 @@
             self.assertNotEqual(re.match(r"\x%02x" % i, chr(i)), None)
             self.assertNotEqual(re.match(r"\x%02x0" % i, chr(i)+"0"), None)
             self.assertNotEqual(re.match(r"\x%02xz" % i, chr(i)+"z"), None)
+        self.assertNotEqual(re.match(r"\u1234", "\u1234"), None)
+        self.assertNotEqual(re.match(r"\U00001234", "\u1234"), None)
         self.assertRaises(re.error, re.match, "\911", "")
-
+        self.assertRaises(re.error, re.match, r"\U00110000", "")
+        self.assertEqual(re.match(r"\U00100000", "\U00100000").group(), "\U00100000")
+
     def test_sre_character_class_literals(self):
         for i in [0, 8, 16, 32, 64, 127, 128, 255]:
             self.assertNotEqual(re.match(r"[\%03o]" % i, chr(i)), None)
@@ -472,8 +476,14 @@
             self.assertNotEqual(re.match(r"[\x%02x]" % i, chr(i)), None)
             self.assertNotEqual(re.match(r"[\x%02x0]" % i, chr(i)), None)
             self.assertNotEqual(re.match(r"[\x%02xz]" % i, chr(i)), None)
+        self.assertNotEqual(re.match(r"[\u1234-\u1236]", "\u1235"), None)
+        self.assertNotEqual(re.match(r"[\U00001234]", "\u1234"), None)
         self.assertRaises(re.error, re.match, "[\911]", "")
-
+        self.assertRaises(re.error, re.match, r"[\U0011ffff]", "")
+        if sys.maxunicode <= 0xffff:
+            self.assertRaises(re.error, re.match, r"[\U00100000]", "")
+            self.assertRaises(re.error, re.match, r"[\U00100000-\U001000FF]", "")
+
     def test_bug_113254(self):
         self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
         self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
Index: Lib/sre_parse.py
===================================================================
--- Lib/sre_parse.py	(revision 82756)
+++ Lib/sre_parse.py	(working copy)
@@ -247,6 +247,23 @@
             if len(escape) != 2:
                 raise error("bogus escape: %s" % repr("\\" + escape))
             return LITERAL, int(escape, 16) & 0xff
+        elif c == "u":
+            # unicode escape (exactly four digits)
+            while source.next in HEXDIGITS and len(escape) < 6:
+                escape = escape + source.get()
+            if len(escape) != 6:
+                raise error("bogus escape: %s" % repr(escape))
+            return LITERAL, int(escape[2:], 16)
+        elif c == "U":
+            # unicode escape (exactly eight digits)
+            while source.next in HEXDIGITS and len(escape) < 10:
+                escape = escape + source.get()
+            if len(escape) != 10:
+                raise error("bogus escape: %s" % repr(escape))
+            c = int(escape[2:], 16)
+            if c > sys.maxunicode:
+                raise error("bogus escape: %s" % repr(escape))
+            return LITERAL, c
         elif c in OCTDIGITS:
             # octal escape (up to three digits)
             while source.next in OCTDIGITS and len(escape) < 4:
@@ -278,6 +295,23 @@
             if len(escape) != 4:
                 raise ValueError
             return LITERAL, int(escape[2:], 16) & 0xff
+        elif c == "u":
+            # unicode escape (exactly four digits)
+            while source.next in HEXDIGITS and len(escape) < 6:
+                escape = escape + source.get()
+            if len(escape) != 6:
+                raise ValueError
+            return LITERAL, int(escape[2:], 16)
+        elif c == "U":
+            # unicode escape (exactly eight digits)
+            while source.next in HEXDIGITS and len(escape) < 10:
+                escape = escape + source.get()
+            if len(escape) != 10:
+                raise ValueError
+            c = int(escape[2:], 16)
+            if c > 0x10FFFF:
+                raise ValueError
+            return LITERAL, c
         elif c == "0":
             # octal escape
             while source.next in OCTDIGITS and len(escape) < 4:
@@ -457,6 +491,9 @@
                             raise error("bad character range")
                         lo = code1[1]
                         hi = code2[1]
+
+                        if lo > sys.maxunicode or hi > sys.maxunicode:
+                            raise error("bad character range")
                         if hi < lo:
                             raise error("bad character range")
                         setappend((RANGE, (lo, hi)))
@@ -660,8 +697,19 @@
 
         elif this and this[0] == "\\":
             code = _escape(source, this, state)
-            subpatternappend(code)
+            if code[0] == LITERAL and code[1] > sys.maxunicode:
+                # translate from 10000..10FFFF to 0..FFFFF
+                c = code[1] - 0x10000
+                # high surrogate = top 10 bits added to D800
+                c1 = 0xD800 + (c >> 10)
+                # low surrogate = bottom 10 bits added to DC00
+                c2 = 0xDC00 + (c & 0x03FF)
+                subpatternappend((LITERAL, c1))
+                subpatternappend((LITERAL, c2))
+            else:
+                subpatternappend(code)
+
 
         else:
             raise error("parser error")