NOTE: line breaks and indentation in this patch were reconstructed from a
whitespace-collapsed copy; apply it with "patch -p1 -l" so that whitespace
differences against the target files are ignored.

diff -r 6040015cb439 -r 0562d189d57c Include/unicodeobject.h
--- a/Include/unicodeobject.h	Thu Jan 11 17:43:23 2007 -0800
+++ b/Include/unicodeobject.h	Thu Jan 11 19:47:31 2007 -0800
@@ -380,15 +380,37 @@ extern "C" {
 
 /* --- Unicode Type ------------------------------------------------------- */
 
+/*
+ * if set, this is a concatenation object (PyUnicodeConcatenationObject),
+ * not a conventional unicode string (PyUnicodeObject)
+ */
+#define PYUNICODE_FLAG_IS_CONCATENATION (1)
+
+#define PyUnicodeObject_HEAD \
+    PyObject_HEAD \
+    Py_ssize_t length;      /* Length of raw Unicode data in buffer */ \
+    Py_UNICODE *str;        /* Raw Unicode buffer */ \
+    long hash;              /* Hash value; -1 if not set */ \
+    unsigned long flags;    /* see PYUNICODE_FLAG_ definitions */ \
+    PyObject *defenc        /* (Default) Encoded version as Python */ \
+                            /* string, or NULL; this is used for */ \
+                            /* implementing the buffer protocol */ \
+
+
 typedef struct {
-    PyObject_HEAD
-    Py_ssize_t length;      /* Length of raw Unicode data in buffer */
-    Py_UNICODE *str;        /* Raw Unicode buffer */
-    long hash;              /* Hash value; -1 if not set */
-    PyObject *defenc;       /* (Default) Encoded version as Python
-                               string, or NULL; this is used for
-                               implementing the buffer protocol */
+    PyUnicodeObject_HEAD;
 } PyUnicodeObject;
+
+
+#define PYUNICODE_CONCATENATIONS (8)
+#define PYUNICODE_RIGHTRECURSIONDEPTH (16384)
+
+typedef struct {
+    PyUnicodeObject_HEAD;
+    unsigned short rightRecursionDepth; /* rendering is forced once this reaches PYUNICODE_RIGHTRECURSIONDEPTH */
+    unsigned short stringsIndex;        /* number of entries of strings[] in use; 0 once rendered */
+    PyUnicodeObject *strings[PYUNICODE_CONCATENATIONS]; /* operands, leftmost first */
+} PyUnicodeConcatenationObject;
 
 PyAPI_DATA(PyTypeObject) PyUnicode_Type;
 
@@ -400,10 +422,20 @@ PyAPI_DATA(PyTypeObject) PyUnicode_Type;
         (((PyUnicodeObject *)(op))->length)
 #define PyUnicode_GET_DATA_SIZE(op) \
         (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
+#define PyUnicode_GET_FLAGS(op) \
+        (((PyUnicodeObject *)(op))->flags)
+#define PyUnicode_CHECK_CONCATENATED(op) \
+        (PyUnicode_GET_FLAGS(op) & PYUNICODE_FLAG_IS_CONCATENATION)
+#define PyUnicode_AS_UNICODE_DIRECT(op) \
+        (((PyUnicodeObject *)(op))->str)
 #define PyUnicode_AS_UNICODE(op) \
-        (((PyUnicodeObject *)(op))->str)
+        ( PyUnicode_CHECK_CONCATENATED(op) ? PyUnicode_AsUnicode((PyObject *)(op)) : PyUnicode_AS_UNICODE_DIRECT(op))
+#define PyUnicode_AS_DATA_DIRECT(op) \
+        ((const char *)PyUnicode_AS_UNICODE_DIRECT(op))
 #define PyUnicode_AS_DATA(op) \
-        ((const char *)((PyUnicodeObject *)(op))->str)
+        ((const char *)PyUnicode_AS_UNICODE(op))
+
+#define PyUnicode_GET_RIGHT_RECURSION_DEPTH(op) (PyUnicode_CHECK_CONCATENATED(op) ? (((PyUnicodeConcatenationObject *)(op))->rightRecursionDepth) : 0)
 
 /* --- Constants ---------------------------------------------------------- */
 
diff -r 6040015cb439 -r 0562d189d57c Objects/unicodeobject.c
--- a/Objects/unicodeobject.c	Thu Jan 11 17:43:23 2007 -0800
+++ b/Objects/unicodeobject.c	Thu Jan 11 19:47:31 2007 -0800
@@ -48,6 +48,14 @@ OF OR IN CONNECTION WITH THE USE OR PERF
 #ifdef MS_WINDOWS
 #include <windows.h>
 #endif
+
+#ifndef min
+#define min(a, b) ( (a) < (b) ? (a) : (b) )
+#endif /* min */
+
+#ifndef max
+#define max(a, b) ( (a) > (b) ? (a) : (b) )
+#endif /* max */
 
 /* Limit for the Unicode object free list */
 
@@ -281,6 +289,7 @@ PyUnicodeObject *_PyUnicode_New(Py_ssize
     unicode->length = length;
     unicode->hash = -1;
     unicode->defenc = NULL;
+    unicode->flags = 0;
     return unicode;
 
  onError:
@@ -289,30 +298,75 @@ PyUnicodeObject *_PyUnicode_New(Py_ssize
     return NULL;
 }
 
+
+/*
+ * *Carefully* deallocate the recursive tree of concatenation objects,
+ * being careful to *iterate* (*not* recurse) down the left-hand side.
+ */
+static void unicode_recursive_dealloc(PyUnicodeConcatenationObject *concat)
+{
+    for (;;) {
+        PyUnicodeConcatenationObject *next;
+
+        if (concat == NULL)
+            return;
+
+        if ((concat->ob_refcnt == 1) && PyUnicode_CHECK_CONCATENATED(concat) && (concat->stringsIndex)) {
+            next = (PyUnicodeConcatenationObject *)*concat->strings;
+            *concat->strings = NULL;    /* detach leftmost child so unicode_dealloc skips it */
+        }
+        else
+            next = NULL;
+
+        Py_DECREF(concat);
+        concat = next;
+    }
+}
+
 static
 void unicode_dealloc(register PyUnicodeObject *unicode)
 {
     if (PyUnicode_CheckExact(unicode) &&
+        !PyUnicode_CHECK_CONCATENATED(unicode) &&
         unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
-        /* Keep-Alive optimization */
-        if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
-            PyMem_DEL(unicode->str);
-            unicode->str = NULL;
-            unicode->length = 0;
-        }
-        if (unicode->defenc) {
-            Py_DECREF(unicode->defenc);
-            unicode->defenc = NULL;
-        }
-        /* Add to free list */
-        *(PyUnicodeObject **)unicode = unicode_freelist;
-        unicode_freelist = unicode;
-        unicode_freelist_size++;
-    }
-    else {
-        PyMem_DEL(unicode->str);
-        Py_XDECREF(unicode->defenc);
-        unicode->ob_type->tp_free((PyObject *)unicode);
+        /* Keep-Alive optimization */
+        if (unicode->str == NULL)
+            unicode->length = 0;
+        else if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
+            PyMem_DEL(unicode->str);
+            unicode->str = NULL;
+            unicode->length = 0;
+        }
+        if (unicode->defenc) {
+            Py_DECREF(unicode->defenc);
+            unicode->defenc = NULL;
+        }
+        /* Add to free list */
+        *(PyUnicodeObject **)unicode = unicode_freelist;
+        unicode_freelist = unicode;
+        unicode_freelist_size++;
+    }
+    else {
+        if (unicode->str)
+            PyMem_DEL(unicode->str);
+        Py_XDECREF(unicode->defenc);
+
+        if (PyUnicode_CHECK_CONCATENATED(unicode)) {
+            PyUnicodeConcatenationObject *concat = (PyUnicodeConcatenationObject *)unicode;
+            register PyUnicodeObject **i;
+            if (concat->stringsIndex) {
+                for (i = concat->strings + concat->stringsIndex - 1; i > concat->strings; i--) {
+                    if (*i) {
+                        Py_DECREF(*i);
+                    }
+                }
+
+                if (*i) {   /* i == concat->strings here: release the leftmost child iteratively */
+                    unicode_recursive_dealloc((PyUnicodeConcatenationObject *)*i);
+                }
+            }
+        }
+        unicode->ob_type->tp_free((PyObject *)unicode);
     }
 }
 
@@ -339,7 +393,7 @@ int PyUnicode_Resize(PyObject **unicode,
         PyUnicodeObject *w = _PyUnicode_New(length);
         if (w == NULL)
             return -1;
-        Py_UNICODE_COPY(w->str, v->str,
+        Py_UNICODE_COPY(w->str, PyUnicode_AS_UNICODE(v),
                         length < v->length ? length : v->length);
         Py_DECREF(*unicode);
         *unicode = (PyObject *)w;
@@ -443,7 +497,7 @@ Py_ssize_t PyUnicode_AsWideChar(PyUnicod
         size = PyUnicode_GET_SIZE(unicode) + 1;
 
 #ifdef HAVE_USABLE_WCHAR_T
-    memcpy(w, unicode->str, size * sizeof(wchar_t));
+    memcpy(w, PyUnicode_AS_UNICODE(unicode), size * sizeof(wchar_t));
 #else
     {
         register Py_UNICODE *u;
@@ -737,16 +791,76 @@ PyObject *_PyUnicode_AsDefaultEncodedStr
     return v;
 }
 
-Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
-{
-    if (!PyUnicode_Check(unicode)) {
+
+
+static void unicode_recursive_concatenate(Py_UNICODE *buffer, Py_ssize_t length, PyUnicodeConcatenationObject *s) {
+    register PyUnicodeObject **i;
+
+    for (;;) {
+        /*
+         * optimized for the general case of 'a'+'b'+'c'+'d'+'e':
+         * in this case, we will never actually recurse, we will iterate
+         */
+
+        if (s->str != NULL) {
+            Py_UNICODE_COPY(buffer, s->str, s->length);
+            return;
+        }
+
+        for (i = s->strings + s->stringsIndex - 1; i >= s->strings + 1; i--) {
+            PyUnicodeObject *child = *i;
+            Py_UNICODE *childDestination;
+            length -= child->length;    /* fill the buffer from the right-hand end */
+            childDestination = buffer + length;
+            if (child->str != NULL)
+                Py_UNICODE_COPY(childDestination, child->str, child->length);
+            else
+                unicode_recursive_concatenate(childDestination, child->length, (PyUnicodeConcatenationObject *)child);
+        }
+
+        s = (PyUnicodeConcatenationObject *)*s->strings;    /* iterate (not recurse) into the leftmost child */
+    }
+}
+
+
+Py_UNICODE *PyUnicode_AsUnicode(PyObject *object)
+{
+    register PyUnicodeConcatenationObject *unicode;
+    if (!PyUnicode_Check(object)) {
         PyErr_BadArgument();
-        goto onError;
-    }
-    return PyUnicode_AS_UNICODE(unicode);
-
- onError:
-    return NULL;
+        return NULL;
+    }
+
+    /* an unrendered concatenation is flattened into a single buffer on first access */
+    unicode = (PyUnicodeConcatenationObject *)object;
+    if ((unicode->str == NULL) && PyUnicode_CHECK_CONCATENATED(unicode)) {
+        register PyUnicodeObject **i;
+        register Py_UNICODE *string;
+
+        string = (Py_UNICODE *)PyMem_NEW(Py_UNICODE, unicode->length + 1);
+        if (string == NULL) {
+            PyErr_NoMemory();
+            return NULL;
+        }
+
+        unicode_recursive_concatenate(string, unicode->length, unicode);
+
+        string[unicode->length] = 0;
+
+        for (i = unicode->strings + unicode->stringsIndex - 1; i >= unicode->strings; i--) {
+            Py_DECREF(*i);
+#ifdef Py_DEBUG
+            *i = NULL;
+#endif /* Py_DEBUG */
+        }
+
+        unicode->str = string;
+        unicode->stringsIndex = 0;
+        unicode->rightRecursionDepth = 0;
+    }
+
+
+    return PyUnicode_AS_UNICODE_DIRECT(object);
 }
 
 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
@@ -4304,7 +4418,7 @@ Py_ssize_t PyUnicode_Count(PyObject *str
     FIX_START_END(str_obj);
 
     result = stringlib_count(
-        str_obj->str + start, end - start, sub_obj->str, sub_obj->length
+        PyUnicode_AS_UNICODE(str_obj) + start, end - start, PyUnicode_AS_UNICODE(sub_obj), sub_obj->length
         );
 
     Py_DECREF(sub_obj);
@@ -4365,6 +4479,7 @@ int tailmatch(PyUnicodeObject *self,
     if (end < start)
         return 0;
 
+    PyUnicode_AS_UNICODE(self); /* force render */
     if (direction > 0) {
         if (Py_UNICODE_MATCH(self, end, substring))
             return 1;
@@ -4415,7 +4530,7 @@ PyObject *fixup(PyUnicodeObject *self,
     if (u == NULL)
         return NULL;
 
-    Py_UNICODE_COPY(u->str, self->str, self->length);
+    Py_UNICODE_COPY(u->str, PyUnicode_AS_UNICODE(self), self->length);
 
     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
         /* fixfct should return TRUE if it modified the buffer. If
If @@ -4432,7 +4547,7 @@ int fixupper(PyUnicodeObject *self) int fixupper(PyUnicodeObject *self) { Py_ssize_t len = self->length; - Py_UNICODE *s = self->str; + Py_UNICODE *s = PyUnicode_AS_UNICODE(self); int status = 0; while (len-- > 0) { @@ -4453,7 +4568,7 @@ int fixlower(PyUnicodeObject *self) int fixlower(PyUnicodeObject *self) { Py_ssize_t len = self->length; - Py_UNICODE *s = self->str; + Py_UNICODE *s = PyUnicode_AS_UNICODE(self); int status = 0; while (len-- > 0) { @@ -4474,7 +4589,7 @@ int fixswapcase(PyUnicodeObject *self) int fixswapcase(PyUnicodeObject *self) { Py_ssize_t len = self->length; - Py_UNICODE *s = self->str; + Py_UNICODE *s = PyUnicode_AS_UNICODE(self); int status = 0; while (len-- > 0) { @@ -4495,7 +4610,7 @@ int fixcapitalize(PyUnicodeObject *self) int fixcapitalize(PyUnicodeObject *self) { Py_ssize_t len = self->length; - Py_UNICODE *s = self->str; + Py_UNICODE *s = PyUnicode_AS_UNICODE(self); int status = 0; if (len == 0) @@ -4724,7 +4839,7 @@ PyUnicodeObject *pad(PyUnicodeObject *se if (u) { if (left) Py_UNICODE_FILL(u->str, fill, left); - Py_UNICODE_COPY(u->str + left, self->str, self->length); + Py_UNICODE_COPY(u->str + left, PyUnicode_AS_UNICODE(self), self->length); if (right) Py_UNICODE_FILL(u->str + left + self->length, fill, right); } @@ -4753,6 +4868,7 @@ PyObject *split_whitespace(PyUnicodeObje Py_ssize_t len = self->length; PyObject *str; + PyUnicode_AS_UNICODE(self); /* force render */ for (i = j = 0; i < len; ) { /* find a token */ while (i < len && Py_UNICODE_ISSPACE(self->str[i])) @@ -4844,6 +4960,7 @@ PyObject *split_char(PyUnicodeObject *se Py_ssize_t len = self->length; PyObject *str; + PyUnicode_AS_UNICODE(self); /* force render */ for (i = j = 0; i < len; ) { if (self->str[i] == ch) { if (maxcount-- <= 0) @@ -4875,6 +4992,7 @@ PyObject *split_substring(PyUnicodeObjec Py_ssize_t sublen = substring->length; PyObject *str; + PyUnicode_AS_UNICODE(self); /* force render */ for (i = j = 0; i <= len - sublen; ) { if 
(Py_UNICODE_MATCH(self, i, substring)) { if (maxcount-- <= 0) @@ -4904,6 +5022,7 @@ PyObject *rsplit_whitespace(PyUnicodeObj Py_ssize_t len = self->length; PyObject *str; + PyUnicode_AS_UNICODE(self); /* force render */ for (i = j = len - 1; i >= 0; ) { /* find a token */ while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i])) @@ -4943,6 +5062,7 @@ PyObject *rsplit_char(PyUnicodeObject *s Py_ssize_t len = self->length; PyObject *str; + PyUnicode_AS_UNICODE(self); /* force render */ for (i = j = len - 1; i >= 0; ) { if (self->str[i] == ch) { if (maxcount-- <= 0) @@ -4976,6 +5096,7 @@ PyObject *rsplit_substring(PyUnicodeObje Py_ssize_t sublen = substring->length; PyObject *str; + PyUnicode_AS_UNICODE(self); /* force render */ for (i = len - sublen, j = len; i >= 0; ) { if (Py_UNICODE_MATCH(self, i, substring)) { if (maxcount-- <= 0) @@ -5069,20 +5190,23 @@ PyObject *replace(PyUnicodeObject *self, if (maxcount < 0) maxcount = PY_SSIZE_T_MAX; + PyUnicode_AS_UNICODE(self); /* force render */ + PyUnicode_AS_UNICODE(str1); /* force render */ + PyUnicode_AS_UNICODE(str2); /* force render */ if (str1->length == str2->length) { /* same length */ Py_ssize_t i; if (str1->length == 1) { /* replace characters */ Py_UNICODE u1, u2; - if (!findchar(self->str, self->length, str1->str[0])) + if (!findchar(PyUnicode_AS_UNICODE_DIRECT(self), self->length, PyUnicode_AS_UNICODE_DIRECT(str1)[0])) goto nothing; u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); if (!u) return NULL; Py_UNICODE_COPY(u->str, self->str, self->length); u1 = str1->str[0]; - u2 = str2->str[0]; + u2 = PyUnicode_AS_UNICODE_DIRECT(str2)[0]; for (i = 0; i < u->length; i++) if (u->str[i] == u1) { if (--maxcount < 0) @@ -5091,7 +5215,7 @@ PyObject *replace(PyUnicodeObject *self, } } else { i = fastsearch( - self->str, self->length, str1->str, str1->length, FAST_SEARCH + PyUnicode_AS_UNICODE_DIRECT(self), self->length, PyUnicode_AS_UNICODE_DIRECT(str1), str1->length, FAST_SEARCH ); if (i < 0) goto nothing; @@ 
-5099,6 +5223,7 @@ PyObject *replace(PyUnicodeObject *self, if (!u) return NULL; Py_UNICODE_COPY(u->str, self->str, self->length); + PyUnicode_AS_UNICODE(str2); /* force render */ while (i <= self->length - str1->length) if (Py_UNICODE_MATCH(self, i, str1)) { if (--maxcount < 0) @@ -5115,7 +5240,7 @@ PyObject *replace(PyUnicodeObject *self, Py_UNICODE *p; /* replace strings */ - n = stringlib_count(self->str, self->length, str1->str, str1->length); + n = stringlib_count(PyUnicode_AS_UNICODE_DIRECT(self), self->length, PyUnicode_AS_UNICODE_DIRECT(str1), str1->length); if (n > maxcount) n = maxcount; if (n == 0) @@ -5334,8 +5459,8 @@ unicode_compare(PyUnicodeObject *str1, P { Py_ssize_t len1, len2; - Py_UNICODE *s1 = str1->str; - Py_UNICODE *s2 = str2->str; + Py_UNICODE *s1 = PyUnicode_AS_UNICODE(str1); + Py_UNICODE *s2 = PyUnicode_AS_UNICODE(str2); len1 = str1->length; len2 = str2->length; @@ -5368,8 +5493,8 @@ unicode_compare(PyUnicodeObject *str1, P { register Py_ssize_t len1, len2; - Py_UNICODE *s1 = str1->str; - Py_UNICODE *s2 = str2->str; + Py_UNICODE *s1 = PyUnicode_AS_UNICODE(str1); + Py_UNICODE *s2 = PyUnicode_AS_UNICODE(str2); len1 = str1->length; len2 = str2->length; @@ -5530,10 +5655,18 @@ int PyUnicode_Contains(PyObject *contain /* Concat to string or Unicode object giving a new Unicode object. 
*/ +static void +unicode_render_if_too_deep(register PyUnicodeConcatenationObject *op) +{ + if (PyUnicode_GET_RIGHT_RECURSION_DEPTH(op) >= PYUNICODE_RIGHTRECURSIONDEPTH) + PyUnicode_AsUnicode((PyObject *)op); +} + PyObject *PyUnicode_Concat(PyObject *left, PyObject *right) { - PyUnicodeObject *u = NULL, *v = NULL, *w; + PyUnicodeObject *u = NULL, *v = NULL; + PyUnicodeConcatenationObject *unicode; /* Coerce the two arguments */ u = (PyUnicodeObject *)PyUnicode_FromObject(left); @@ -5553,8 +5686,72 @@ PyObject *PyUnicode_Concat(PyObject *lef return (PyObject *)v; } - /* Concat the two Unicode strings */ - w = _PyUnicode_New(u->length + v->length); + /* Concat the two Unicode strings */ + +#if 1 + /* lch */ + /* if left side is already a concatenation object, and hasn't been rendered yet, and only has one reference, and has room, just append to it */ + if (PyUnicode_CHECK_CONCATENATED(u) && (u->str == NULL) && (u->ob_refcnt == 1)) { + unicode = (PyUnicodeConcatenationObject *)u; + if (unicode->stringsIndex < PYUNICODE_CONCATENATIONS) { + Py_INCREF(u); + unicode->strings[unicode->stringsIndex++] = v; + unicode->length += v->length; + + unicode->rightRecursionDepth = max(unicode->rightRecursionDepth, + PyUnicode_GET_RIGHT_RECURSION_DEPTH(v) + 1); + unicode_render_if_too_deep(unicode); + + Py_INCREF(unicode); + return (PyObject *)unicode; + } + } + /* if right side is already a concatenation object, and hasn't been rendered yet, and only has one reference, and has room, just prepend to it */ + if (PyUnicode_CHECK_CONCATENATED(v) && (v->str == NULL) && (v->ob_refcnt == 1)) { + unicode = (PyUnicodeConcatenationObject *)v; + if (unicode->stringsIndex < PYUNICODE_CONCATENATIONS) { + memmove(unicode->strings + 1, unicode->strings, unicode->stringsIndex * sizeof(PyUnicodeObject *)); + Py_INCREF(v); + unicode->strings[0] = u; + unicode->stringsIndex++; + unicode->length += u->length; + + unicode->rightRecursionDepth = max(unicode->rightRecursionDepth, + 
PyUnicode_GET_RIGHT_RECURSION_DEPTH(unicode->strings[1]) + 1); + unicode_render_if_too_deep(unicode); + + Py_INCREF(unicode); + return (PyObject *)unicode; + } + } + + /* lch */ + unicode = (PyUnicodeConcatenationObject *)PyObject_MALLOC(sizeof(PyUnicodeConcatenationObject)); + if (unicode == NULL) + return PyErr_NoMemory(); + PyObject_INIT(unicode, &PyUnicode_Type); + + unicode->str = NULL; + unicode->length = u->length + v->length; + unicode->hash = -1; + unicode->defenc = NULL; + unicode->strings[0] = u; + unicode->strings[1] = v; +#ifdef Py_DEBUG + memset(unicode->strings + 2, 0xee, sizeof(unicode->strings) - (sizeof(unicode->strings[0]) * 2)); +#endif /* Py_DEBUG */ + unicode->stringsIndex = 2; + unicode->flags = PYUNICODE_FLAG_IS_CONCATENATION; + unicode->rightRecursionDepth = PyUnicode_GET_RIGHT_RECURSION_DEPTH(v) + 1; + + Py_INCREF(u); + Py_INCREF(v); + + unicode_render_if_too_deep(unicode); + return (PyObject *)unicode; +#else + { + PyUnicodeObject *w = _PyUnicode_New(u->length + v->length); if (w == NULL) goto onError; Py_UNICODE_COPY(w->str, u->str, u->length); @@ -5563,6 +5760,8 @@ PyObject *PyUnicode_Concat(PyObject *lef Py_DECREF(u); Py_DECREF(v); return (PyObject *)w; + } +#endif onError: Py_XDECREF(u); @@ -5597,8 +5796,8 @@ unicode_count(PyUnicodeObject *self, PyO FIX_START_END(self); result = PyInt_FromSsize_t( - stringlib_count(self->str + start, end - start, - substring->str, substring->length) + stringlib_count(PyUnicode_AS_UNICODE(self) + start, end - start, + PyUnicode_AS_UNICODE(substring), substring->length) ); Py_DECREF(substring); @@ -5699,7 +5898,7 @@ unicode_expandtabs(PyUnicodeObject *self /* First pass: determine size of output string */ i = j = 0; - e = self->str + self->length; + e = PyUnicode_AS_UNICODE(self) + self->length; for (p = self->str; p < e; p++) if (*p == '\t') { if (tabsize > 0) @@ -5719,7 +5918,7 @@ unicode_expandtabs(PyUnicodeObject *self return NULL; j = 0; - q = u->str; + q = PyUnicode_AS_UNICODE(u); for (p = 
self->str; p < e; p++) if (*p == '\t') { @@ -5783,6 +5982,7 @@ unicode_getitem(PyUnicodeObject *self, P return NULL; } + PyUnicode_AS_UNICODE(self); return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); } @@ -6389,17 +6589,17 @@ unicode_repeat(PyUnicodeObject *str, Py_ p = u->str; if (str->length == 1 && len > 0) { - Py_UNICODE_FILL(p, str->str[0], len); + Py_UNICODE_FILL(p, PyUnicode_AS_UNICODE(str)[0], len); } else { Py_ssize_t done = 0; /* number of characters copied this far */ if (done < nchars) { - Py_UNICODE_COPY(p, str->str, str->length); + Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(str), str->length); done = str->length; - } - while (done < nchars) { - int n = (done <= nchars-done) ? done : nchars-done; - Py_UNICODE_COPY(p+done, p, n); - done += n; + while (done < nchars) { + int n = (done <= nchars-done) ? done : nchars-done; + Py_UNICODE_COPY(p+done, p, n); + done += n; + } } } @@ -6592,7 +6792,7 @@ unicode_slice(PyUnicodeObject *self, Py_ if (start > end) start = end; /* copy slice */ - return (PyObject*) PyUnicode_FromUnicode(self->str + start, + return (PyObject*) PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self) + start, end - start); } @@ -6824,7 +7024,7 @@ static PyObject* static PyObject* unicode_translate(PyUnicodeObject *self, PyObject *table) { - return PyUnicode_TranslateCharmap(self->str, + return PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(self), self->length, table, "ignore"); @@ -6989,8 +7189,26 @@ static PyObject * static PyObject * unicode_getnewargs(PyUnicodeObject *v) { - return Py_BuildValue("(u#)", v->str, v->length); -} + return Py_BuildValue("(u#)", PyUnicode_AS_UNICODE(v), v->length); +} + + +PyDoc_STRVAR(simplify__doc__, +"S.simplify() -> S\n\ +\n\ +Simplifies the internal representation of S,\n\ +forcing it to \"render\" it if it is a concatenation.\n\ +Returns a new reference to S.\n\ +Only necessary for memory/reference tuning."); + +static PyObject* +unicode_simplify(PyObject *self) +{ + PyUnicode_AsUnicode(self); + 
Py_INCREF(self); + return (PyObject *)self; +} + static PyMethodDef unicode_methods[] = { @@ -7048,6 +7266,7 @@ static PyMethodDef unicode_methods[] = { #endif {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, + {"simplify", (PyCFunction) unicode_simplify, METH_NOARGS, simplify__doc__}, {NULL, NULL} }; @@ -7140,7 +7359,7 @@ unicode_buffer_getreadbuf(PyUnicodeObjec "accessing non-existent unicode segment"); return -1; } - *ptr = (void *) self->str; + *ptr = (void *) PyUnicode_AS_UNICODE(self); return PyUnicode_GET_DATA_SIZE(self); } @@ -7932,6 +8151,7 @@ unicode_subtype_new(PyTypeObject *type, Py_UNICODE_COPY(pnew->str, tmp->str, n+1); pnew->length = n; pnew->hash = tmp->hash; + pnew->flags = 0; Py_DECREF(tmp); return (PyObject *)pnew; }