diff -r 5b02d622d625 Lib/re.py --- a/Lib/re.py Thu Jan 24 07:23:34 2013 -0800 +++ b/Lib/re.py Thu Jan 24 22:14:22 2013 +0200 @@ -233,7 +233,8 @@ alphanum = _alphanum_str s = list(pattern) for i, c in enumerate(pattern): - if c not in alphanum: + if c not in alphanum and not ('\U00010000'[1:] and + 0xdc00 <= ord(c) < 0xe000): if c == "\000": s[i] = "\\000" else: diff -r 5b02d622d625 Lib/sre_parse.py --- a/Lib/sre_parse.py Thu Jan 24 07:23:34 2013 -0800 +++ b/Lib/sre_parse.py Thu Jan 24 22:14:22 2013 +0200 @@ -177,26 +177,32 @@ class Tokenizer: def __init__(self, string): - self.string = string + if isinstance(string, bytes): + self.string = string.decode('latin1') + else: + self.string = string self.index = 0 self.__next() def __next(self): if self.index >= len(self.string): self.next = None return - char = self.string[self.index:self.index+1] - # Special case for the str8, since indexing returns a integer - # XXX This is only needed for test_bug_926075 in test_re.py - if char and isinstance(char, bytes): - char = chr(char[0]) + char = self.string[self.index] if char == "\\": try: c = self.string[self.index + 1] except IndexError: raise error("bogus escape (end of line)") - if isinstance(self.string, bytes): - c = chr(c) char = char + c + else: + c = char + if '\U00010000'[1:] and 0xd800 <= ord(c) < 0xdc00: + try: + c2 = self.string[self.index + len(char)] + if 0xdc00 <= ord(c2) < 0xe000: + char += c2 + except IndexError: + pass self.index = self.index + len(char) self.next = char def match(self, char, skip=1): @@ -238,7 +244,7 @@ if code and code[0] == IN: return code try: - c = escape[1:2] + c = escape[1:] if c == "x": # hexadecimal escape (exactly two digits) while source.next in HEXDIGITS and len(escape) < 4: @@ -255,8 +261,8 @@ return LITERAL, int(escape, 8) & 0xff elif c in DIGITS: raise error("bogus escape: %s" % repr(escape)) - if len(escape) == 2: - return LITERAL, ord(escape[1]) + if c: + return LITERAL, ord(c) except ValueError: pass raise error("bogus escape: %s" % repr(escape)) @@ -270,7 +276,7 @@ if code: return code try: - c = escape[1:2] + c = escape[1:] if c == "x": # hexadecimal escape while source.next in HEXDIGITS and len(escape) < 4: @@ -299,8 +305,8 @@ raise error("cannot refer to open group") return GROUPREF, group raise ValueError - if len(escape) == 2: - return LITERAL, ord(escape[1]) + if c: + return LITERAL, ord(c) except ValueError: pass raise error("bogus escape: %s" % repr(escape)) @@ -458,7 +464,7 @@ lo = code1[1] hi = code2[1] if hi < lo: - raise error("bad character range") + raise error("bad character range %x %x" % (lo, hi)) setappend((RANGE, (lo, hi))) else: raise error("unexpected end of regular expression") @@ -704,6 +710,7 @@ elif tail: raise error("bogus characters at end of regular expression") + #flags |= SRE_FLAG_DEBUG if flags & SRE_FLAG_DEBUG: p.dump() diff -r 5b02d622d625 Lib/test/test_re.py --- a/Lib/test/test_re.py Thu Jan 24 07:23:34 2013 -0800 +++ b/Lib/test/test_re.py Thu Jan 24 22:14:22 2013 +0200 @@ -522,12 +522,23 @@ self.assertMatch(re.escape(p), p) def test_re_escape_non_ascii(self): - s = 'xxx\u2620\u2620\u2620xxx' + #s = 'xxx\u2620\u2620\u2620xxx' + #s_escaped = re.escape(s) + #self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx') + #self.assertMatch(s_escaped, s) + #self.assertMatch('.%s+.' % re.escape('\u2620'), s, + #'x\u2620\u2620\u2620x', (2, 7), re.search) + print('*********') + s = 'xxx\U0001d11e\U0001d11e\U0001d11exxx' s_escaped = re.escape(s) - self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx') + p = re.compile('.%s+.' % re.escape('\U0001d11e'), re.DEBUG) + print(p.code) + m = re.search('.%s+.' % re.escape('\U0001d11e'), s) + self.assertEqual(m.group(), 'x\U0001d11e\U0001d11e\U0001d11ex') + self.assertEqual(s_escaped, 'xxx\\\U0001d11e\\\U0001d11e\\\U0001d11exxx') self.assertMatch(s_escaped, s) - self.assertMatch('.%s+.' % re.escape('\u2620'), s, - 'x\u2620\u2620\u2620x', (2, 7), re.search) + self.assertMatch('.%s+.' % re.escape('\U0001d11e'), s, + 'x\U0001d11e\U0001d11e\U0001d11ex', (2, len(s) - 2), re.search) def test_re_escape_non_ascii_bytes(self): b = 'y\u2620y\u2620y'.encode('utf-8') diff -r 5b02d622d625 Objects/unicodeobject.c --- a/Objects/unicodeobject.c Thu Jan 24 07:23:34 2013 -0800 +++ b/Objects/unicodeobject.c Thu Jan 24 22:14:22 2013 +0200 @@ -98,18 +98,30 @@ Another way to look at this is that to say that the actual reference count of a string is: s->ob_refcnt + (s->state ? 2 : 0) */ -static PyObject *interned; +static PyObject *interned = NULL; /* Free list for Unicode objects */ -static PyUnicodeObject *free_list; -static int numfree; +static PyUnicodeObject *free_list = NULL; +static int numfree = 0; /* The empty Unicode object is shared to improve performance. */ -static PyUnicodeObject *unicode_empty; +static PyUnicodeObject *unicode_empty = NULL; + +#define _Py_RETURN_UNICODE_EMPTY() \ + do { \ + if (unicode_empty != NULL) \ + Py_INCREF(unicode_empty); \ + else { \ + unicode_empty = _PyUnicode_New(0); \ + if (unicode_empty != NULL) \ + Py_INCREF(unicode_empty); \ + } \ + return (PyObject *)unicode_empty; \ + } while (0) /* Single character Unicode strings in the Latin-1 range are being shared as well. */ -static PyUnicodeObject *unicode_latin1[256]; +static PyUnicodeObject *unicode_latin1[256] = {NULL}; /* Fast detection of the most frequent whitespace characters */ const unsigned char _Py_ascii_whitespace[] = { @@ -214,7 +226,7 @@ #define BLOOM_MASK unsigned long -static BLOOM_MASK bloom_linebreak; +static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) @@ -479,10 +491,8 @@ if (u != NULL) { /* Optimization for empty strings */ - if (size == 0 && unicode_empty != NULL) { - Py_INCREF(unicode_empty); - return (PyObject *)unicode_empty; - } + if (size == 0) + _Py_RETURN_UNICODE_EMPTY(); /* Single character Unicode objects in the Latin-1 range are shared when using this constructor */ @@ -528,10 +538,8 @@ if (u != NULL) { /* Optimization for empty strings */ - if (size == 0 && unicode_empty != NULL) { - Py_INCREF(unicode_empty); - return (PyObject *)unicode_empty; - } + if (size == 0) + _Py_RETURN_UNICODE_EMPTY(); /* Single characters are shared when using this constructor. Restrict to ASCII, since the input must be UTF-8. */ @@ -1393,15 +1401,11 @@ /* Decoding bytes objects is the most common case and should be fast */ if (PyBytes_Check(obj)) { - if (PyBytes_GET_SIZE(obj) == 0) { - Py_INCREF(unicode_empty); - v = (PyObject *) unicode_empty; - } - else { - v = PyUnicode_Decode( - PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), - encoding, errors); - } + if (PyBytes_GET_SIZE(obj) == 0) + _Py_RETURN_UNICODE_EMPTY(); + v = PyUnicode_Decode( + PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), + encoding, errors); return v; } @@ -1421,12 +1425,11 @@ } if (buffer.len == 0) { - Py_INCREF(unicode_empty); - v = (PyObject *) unicode_empty; - } - else - v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); - + PyBuffer_Release(&buffer); + _Py_RETURN_UNICODE_EMPTY(); + } + + v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); PyBuffer_Release(&buffer); return v; } @@ -8323,10 +8326,8 @@ Py_ssize_t nchars; size_t nbytes; - if (len < 1) { - Py_INCREF(unicode_empty); - return (PyObject *)unicode_empty; - } + if (len < 1) + _Py_RETURN_UNICODE_EMPTY(); if (len == 1 && PyUnicode_CheckExact(str)) { /* no repeat, return original string */ @@ -10056,8 +10057,6 @@ void _PyUnicode_Init(void) { - int i; - /* XXX - move this array to unicodectype.c ? */ Py_UNICODE linebreak[] = { 0x000A, /* LINE FEED */ @@ -10071,14 +10070,10 @@ }; /* Init the implementation */ - free_list = NULL; - numfree = 0; unicode_empty = _PyUnicode_New(0); if (!unicode_empty) return; - for (i = 0; i < 256; i++) - unicode_latin1[i] = NULL; if (PyType_Ready(&PyUnicode_Type) < 0) Py_FatalError("Can't initialize 'unicode'"); @@ -10123,15 +10118,11 @@ { int i; - Py_XDECREF(unicode_empty); - unicode_empty = NULL; - - for (i = 0; i < 256; i++) { - if (unicode_latin1[i]) { - Py_DECREF(unicode_latin1[i]); - unicode_latin1[i] = NULL; - } - } + Py_CLEAR(unicode_empty); + + for (i = 0; i < 256; i++) + Py_CLEAR(unicode_latin1[i]); + (void)PyUnicode_ClearFreeList(); } @@ -10250,8 +10241,7 @@ "mortal/immortal\n", mortal_size, immortal_size); Py_DECREF(keys); PyDict_Clear(interned); - Py_DECREF(interned); - interned = NULL; + Py_CLEAR(interned); }