diff -r 6040015cb439 -r 5e654a623a53 Include/unicodeobject.h --- a/Include/unicodeobject.h Thu Jan 11 17:43:23 2007 -0800 +++ b/Include/unicodeobject.h Thu Jan 11 19:48:11 2007 -0800 @@ -367,28 +367,60 @@ typedef PY_UNICODE_TYPE Py_UNICODE; for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\ } while (0) -/* check if substring matches at given offset. the offset must be - valid, and the substring must not be empty */ -#define Py_UNICODE_MATCH(string, offset, substring) \ - ((*((string)->str + (offset)) == *((substring)->str)) && \ - ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \ - !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE))) - #ifdef __cplusplus extern "C" { #endif /* --- Unicode Type ------------------------------------------------------- */ +/* + * if set, this is a concatenation object (PyUnicodeConcatenationObject), + * not a conventional unicode string (PyUnicodeObject) + */ +#define PYUNICODE_FLAG_IS_CONCATENATION (1) +/* + * if set, this is a slice object (PyUnicodeSliceObject), + * not a conventional unicode string (PyUnicodeObject) + */ +#define PYUNICODE_FLAG_IS_SLICE (2) + +#define PyUnicodeObject_HEAD \ + PyObject_HEAD \ + Py_ssize_t length; /* Length of raw Unicode data in buffer */ \ + Py_UNICODE *str; /* Raw Unicode buffer */ \ + long hash; /* Hash value; -1 if not set */ \ + unsigned long flags; /* see PYUNICODE_FLAG_ definitions */ \ + PyObject *defenc /* (Default) Encoded version as Python */ \ + /* string, or NULL; this is used for */ \ + /* implementing the buffer protocol */ \ + + typedef struct { - PyObject_HEAD - Py_ssize_t length; /* Length of raw Unicode data in buffer */ - Py_UNICODE *str; /* Raw Unicode buffer */ - long hash; /* Hash value; -1 if not set */ - PyObject *defenc; /* (Default) Encoded version as Python - string, or NULL; this is used for - implementing the buffer protocol */ + PyUnicodeObject_HEAD; } PyUnicodeObject; + + 
+#define PYUNICODE_CONCATENATIONS (8) +#define PYUNICODE_RIGHTRECURSIONDEPTH (16384) + +typedef struct { + PyUnicodeObject_HEAD; + unsigned short rightRecursionDepth; + unsigned short stringsIndex; + PyUnicodeObject *strings[PYUNICODE_CONCATENATIONS]; +} PyUnicodeConcatenationObject; + +#define PYUNICODE_MINIMUM_SIZE_FOR_SLICE_OBJECT (20) + +typedef struct { + PyUnicodeObject_HEAD; + unsigned short rightRecursionDepth; /* this object matches a PyUnicodeConcatenationObject to this point */ + PyUnicodeObject *child; + Py_ssize_t start; + Py_ssize_t end; +} PyUnicodeSliceObject; + + PyAPI_DATA(PyTypeObject) PyUnicode_Type; @@ -400,10 +432,24 @@ PyAPI_DATA(PyTypeObject) PyUnicode_Type; (((PyUnicodeObject *)(op))->length) #define PyUnicode_GET_DATA_SIZE(op) \ (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)) +#define PyUnicode_GET_FLAGS(op) \ + (((PyUnicodeObject *)(op))->flags) +#define PyUnicode_CHECK_CONCATENATED(op) \ + (PyUnicode_GET_FLAGS(op) & PYUNICODE_FLAG_IS_CONCATENATION) +#define PyUnicode_CHECK_SLICE(op) \ + (PyUnicode_GET_FLAGS(op) & PYUNICODE_FLAG_IS_SLICE) +#define PyUnicode_CHECK_SLICE_OR_CONCATENATED(op) \ + (PyUnicode_GET_FLAGS(op) & (PYUNICODE_FLAG_IS_CONCATENATION | PYUNICODE_FLAG_IS_SLICE)) +#define PyUnicode_AS_UNICODE_DIRECT(op) \ + (((PyUnicodeObject *)(op))->str) #define PyUnicode_AS_UNICODE(op) \ - (((PyUnicodeObject *)(op))->str) + ( PyUnicode_CHECK_SLICE_OR_CONCATENATED(op) ? PyUnicode_AsUnicode((PyObject *)op) : PyUnicode_AS_UNICODE_DIRECT(op)) +#define PyUnicode_AS_DATA_DIRECT(op) \ + ((const char *)PyUnicode_AS_UNICODE_DIRECT(op)) #define PyUnicode_AS_DATA(op) \ - ((const char *)((PyUnicodeObject *)(op))->str) + ((const char *)PyUnicode_AS_UNICODE(op)) + +#define PyUnicode_GET_RIGHT_RECURSION_DEPTH(op) (PyUnicode_CHECK_CONCATENATED(op) ? 
(((PyUnicodeConcatenationObject *)(op))->rightRecursionDepth) : 0) /* --- Constants ---------------------------------------------------------- */ diff -r 6040015cb439 -r 5e654a623a53 Objects/stringlib/README.txt --- a/Objects/stringlib/README.txt Thu Jan 11 17:43:23 2007 -0800 +++ b/Objects/stringlib/README.txt Thu Jan 11 19:48:11 2007 -0800 @@ -32,3 +32,8 @@ STRINGLIB_CHAR* STRINGLIB_STR(PyObject*) returns the pointer to the character data for the given string object (which must be of the right type) + +PyObject* STRINGLIB_SLICE(PyObject*, Py_ssize_t start, Py_ssize_t end) + + creates a new slice object from the existing string object + diff -r 6040015cb439 -r 5e654a623a53 Objects/stringlib/partition.h --- a/Objects/stringlib/partition.h Thu Jan 11 17:43:23 2007 -0800 +++ b/Objects/stringlib/partition.h Thu Jan 11 19:48:11 2007 -0800 @@ -37,11 +37,11 @@ stringlib_partition( return out; } - PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, pos)); + PyTuple_SET_ITEM(out, 0, STRINGLIB_SLICE(str_obj, 0, pos)); Py_INCREF(sep_obj); PyTuple_SET_ITEM(out, 1, sep_obj); pos += sep_len; - PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str + pos, str_len - pos)); + PyTuple_SET_ITEM(out, 2, STRINGLIB_SLICE(str_obj, pos, str_len)); if (PyErr_Occurred()) { Py_DECREF(out); @@ -87,11 +87,11 @@ stringlib_rpartition( return out; } - PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, pos)); + PyTuple_SET_ITEM(out, 0, STRINGLIB_SLICE(str_obj, 0, pos)); Py_INCREF(sep_obj); PyTuple_SET_ITEM(out, 1, sep_obj); pos += sep_len; - PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str + pos, str_len - pos)); + PyTuple_SET_ITEM(out, 2, STRINGLIB_SLICE(str_obj, pos, str_len)); if (PyErr_Occurred()) { Py_DECREF(out); diff -r 6040015cb439 -r 5e654a623a53 Objects/stringobject.c --- a/Objects/stringobject.c Thu Jan 11 17:43:23 2007 -0800 +++ b/Objects/stringobject.c Thu Jan 11 19:48:11 2007 -0800 @@ -778,6 +778,33 @@ PyString_AsStringAndSize(register PyObje #define STRINGLIB_EMPTY nullstring +#define STRINGLIB_SLICE(s, i, j) 
string_slice((PyStringObject *)(s), (i), (j)) + +/* String slice a[i:j] consists of characters a[i] ... a[j-1] */ + +static PyObject * +string_slice(register PyStringObject *a, register Py_ssize_t i, + register Py_ssize_t j) + /* j -- may be negative! */ +{ + if (i < 0) + i = 0; + if (j < 0) + j = 0; /* Avoid signed/unsigned bug in next line */ + if (j > a->ob_size) + j = a->ob_size; + if (i == 0 && j == a->ob_size && PyString_CheckExact(a)) { + /* It's the same as a */ + Py_INCREF(a); + return (PyObject *)a; + } + if (j < i) + j = i; + return PyString_FromStringAndSize(a->ob_sval + i, j-i); +} + + + #include "stringlib/fastsearch.h" #include "stringlib/count.h" @@ -1037,29 +1064,6 @@ string_repeat(register PyStringObject *a i += j; } return (PyObject *) op; -} - -/* String slice a[i:j] consists of characters a[i] ... a[j-1] */ - -static PyObject * -string_slice(register PyStringObject *a, register Py_ssize_t i, - register Py_ssize_t j) - /* j -- may be negative! */ -{ - if (i < 0) - i = 0; - if (j < 0) - j = 0; /* Avoid signed/unsigned bug in next line */ - if (j > a->ob_size) - j = a->ob_size; - if (i == 0 && j == a->ob_size && PyString_CheckExact(a)) { - /* It's the same as a */ - Py_INCREF(a); - return (PyObject *)a; - } - if (j < i) - j = i; - return PyString_FromStringAndSize(a->ob_sval + i, j-i); } static int diff -r 6040015cb439 -r 5e654a623a53 Objects/unicodeobject.c --- a/Objects/unicodeobject.c Thu Jan 11 17:43:23 2007 -0800 +++ b/Objects/unicodeobject.c Thu Jan 11 19:48:11 2007 -0800 @@ -49,6 +49,14 @@ OF OR IN CONNECTION WITH THE USE OR PERF #include #endif +#ifndef min +#define min(a, b) ( (a) < (b) ? (a) : (b) ) +#endif /* min */ + +#ifndef max +#define max(a, b) ( (a) > (b) ? 
(a) : (b) ) +#endif /* max */ + /* Limit for the Unicode object free list */ #define MAX_UNICODE_FREELIST_SIZE 1024 @@ -170,6 +178,41 @@ Py_LOCAL_INLINE(int) unicode_member(Py_U BLOOM(mask, chr) && unicode_member(chr, set, setlen) /* --- Unicode Object ----------------------------------------------------- */ + +/* + * Internal, used only by unicodeobject.c. + * Call this when it's okay if the string is not zero-terminated + * (i.e. need not end with a '\0'). This means we don't need to render string + * slice objects. + * + * Note: use the macro, don't use the function directly. + * + * Note: in release builds, this function blindly assumes the + * object you passed in *is* some kind of PyUnicodeObject *! + * + * Note: if the string is a concatenation object, this *will* + * render it. + * + * Note: if the string is zero length, you always *will* + * get a terminating zero. (Zero-length string slices don't + * bother using the slice object.) + */ +#define PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(x) (PyUnicode_AS_UNICODE_DIRECT(x) ? 
PyUnicode_AS_UNICODE_DIRECT(x) : __pyunicode_as_unterminated_unicode_string((PyUnicodeSliceObject *)x) ) +Py_LOCAL_INLINE(Py_UNICODE *) +__pyunicode_as_unterminated_unicode_string(register PyUnicodeSliceObject *slice) +{ +#ifdef Py_DEBUG + assert(PyUnicode_Check(slice)); +#endif /* Py_DEBUG */ +/* + if (PyUnicode_AS_UNICODE_DIRECT(slice)) + return PyUnicode_AS_UNICODE_DIRECT(slice); +*/ + if (PyUnicode_CHECK_SLICE(slice) && slice->child != NULL) + return PyUnicode_AS_UNICODE(slice->child) + slice->start; + return PyUnicode_AsUnicode((PyObject *)slice); +} + static int unicode_resize(register PyUnicodeObject *unicode, @@ -281,6 +324,7 @@ PyUnicodeObject *_PyUnicode_New(Py_ssize unicode->length = length; unicode->hash = -1; unicode->defenc = NULL; + unicode->flags = 0; return unicode; onError: @@ -289,30 +333,78 @@ PyUnicodeObject *_PyUnicode_New(Py_ssize return NULL; } + +/* + * *Carefully* deallocate the recursive tree of concatenation objects, + * being careful to *iterate* (*not* recurse) down the left-hand side. 
+ */ +static void unicode_recursive_dealloc(PyUnicodeConcatenationObject *concat) +{ + for (;;) { + PyUnicodeConcatenationObject *next; + + if (concat == NULL) + return; + + if ((concat->ob_refcnt == 1) && PyUnicode_CHECK_CONCATENATED(concat) && (concat->stringsIndex)) { + next = (PyUnicodeConcatenationObject *)*concat->strings; + *concat->strings = NULL; + } + else + next = NULL; + + Py_DECREF(concat); + concat = next; + } +} + static void unicode_dealloc(register PyUnicodeObject *unicode) { if (PyUnicode_CheckExact(unicode) && + !PyUnicode_CHECK_SLICE_OR_CONCATENATED(unicode) && unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) { - /* Keep-Alive optimization */ - if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { - PyMem_DEL(unicode->str); - unicode->str = NULL; - unicode->length = 0; - } - if (unicode->defenc) { - Py_DECREF(unicode->defenc); - unicode->defenc = NULL; - } - /* Add to free list */ - *(PyUnicodeObject **)unicode = unicode_freelist; - unicode_freelist = unicode; - unicode_freelist_size++; - } - else { - PyMem_DEL(unicode->str); - Py_XDECREF(unicode->defenc); - unicode->ob_type->tp_free((PyObject *)unicode); + /* Keep-Alive optimization */ + if (unicode->str == NULL) + unicode->length = 0; + else if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { + PyMem_DEL(unicode->str); + unicode->str = NULL; + unicode->length = 0; + } + if (unicode->defenc) { + Py_DECREF(unicode->defenc); + unicode->defenc = NULL; + } + /* Add to free list */ + *(PyUnicodeObject **)unicode = unicode_freelist; + unicode_freelist = unicode; + unicode_freelist_size++; + } + else { + if (unicode->str) + PyMem_DEL(unicode->str); + Py_XDECREF(unicode->defenc); + + if (PyUnicode_CHECK_SLICE(unicode)) { + PyUnicodeSliceObject *slice = (PyUnicodeSliceObject *)unicode; + Py_XDECREF(slice->child); + } else if (PyUnicode_CHECK_CONCATENATED(unicode)) { + PyUnicodeConcatenationObject *concat = (PyUnicodeConcatenationObject *)unicode; + register PyUnicodeObject **i; + if (concat->stringsIndex) { + for 
(i = concat->strings + concat->stringsIndex - 1; i > concat->strings; i--) { + if (*i) { + Py_DECREF(*i); + } + } + + if (*i) { + unicode_recursive_dealloc((PyUnicodeConcatenationObject *)*i); + } + } + } + unicode->ob_type->tp_free((PyObject *)unicode); } } @@ -339,7 +431,7 @@ int PyUnicode_Resize(PyObject **unicode, PyUnicodeObject *w = _PyUnicode_New(length); if (w == NULL) return -1; - Py_UNICODE_COPY(w->str, v->str, + Py_UNICODE_COPY(w->str, PyUnicode_AS_UNICODE(v), length < v->length ? length : v->length); Py_DECREF(*unicode); *unicode = (PyObject *)w; @@ -443,7 +535,7 @@ Py_ssize_t PyUnicode_AsWideChar(PyUnicod size = PyUnicode_GET_SIZE(unicode) + 1; #ifdef HAVE_USABLE_WCHAR_T - memcpy(w, unicode->str, size * sizeof(wchar_t)); + memcpy(w, PyUnicode_AS_UNICODE(unicode), size * sizeof(wchar_t)); #else { register Py_UNICODE *u; @@ -737,16 +829,97 @@ PyObject *_PyUnicode_AsDefaultEncodedStr return v; } -Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) -{ - if (!PyUnicode_Check(unicode)) { + + +static void unicode_recursive_concatenate(Py_UNICODE *buffer, Py_ssize_t length, PyUnicodeConcatenationObject *s) { + register PyUnicodeObject **i; + + for (;;) { + /* + * optimized for the general case of 'a'+'b'+'c'+'d'+'e': + * in this case, we will never actually recurse, we will iterate + */ + + if (s->str || (s->flags == PYUNICODE_FLAG_IS_SLICE)) { + Py_UNICODE_COPY(buffer, PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(s), s->length); + return; + } + + for (i = s->strings + s->stringsIndex - 1; i >= s->strings + 1; i--) { + PyUnicodeObject *child = *i; + Py_UNICODE *childDestination; + length -= child->length; + childDestination = buffer + length; + if (child->str || (child->flags == PYUNICODE_FLAG_IS_SLICE)) + Py_UNICODE_COPY(childDestination, PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(child), child->length); + else + unicode_recursive_concatenate(childDestination, child->length, (PyUnicodeConcatenationObject *)child); + } + + s = (PyUnicodeConcatenationObject 
*)*s->strings; + } +} + + +Py_UNICODE *PyUnicode_AsUnicode(PyObject *object) +{ + register PyUnicodeConcatenationObject *unicode; + if (!PyUnicode_Check(object)) { PyErr_BadArgument(); - goto onError; - } - return PyUnicode_AS_UNICODE(unicode); - - onError: - return NULL; + return NULL; + } + + /* lch */ + unicode = (PyUnicodeConcatenationObject *)object; + if (unicode->str == NULL) { + if (PyUnicode_CHECK_SLICE(object)) { + register PyUnicodeSliceObject *s = (PyUnicodeSliceObject *)object; + Py_ssize_t length = s->end - s->start; + Py_UNICODE *string = (Py_UNICODE *)PyMem_NEW(Py_UNICODE, s->length + 1); + if (string == NULL) { + PyErr_NoMemory(); + return NULL; + } + + Py_UNICODE_COPY(string, PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(s), s->length); + string[length] = 0; + s->str = string; + + Py_DECREF(s->child); + s->child = NULL; + + s->rightRecursionDepth = 0; + } else { + register PyUnicodeObject **i; + register Py_UNICODE *string; + + assert(PyUnicode_CHECK_CONCATENATED(unicode)); + + string = (Py_UNICODE *)PyMem_NEW(Py_UNICODE, unicode->length + 1); + if (string == NULL) { + PyErr_NoMemory(); + return NULL; + } + + unicode_recursive_concatenate(string, unicode->length, unicode); + + string[unicode->length] = 0; + + for (i = unicode->strings + unicode->stringsIndex - 1; i >= unicode->strings; i--) { + Py_DECREF(*i); + #ifdef Py_DEBUG + *i = NULL; + #endif /* Py_DEBUG */ + } + + unicode->str = string; + unicode->stringsIndex = 0; + unicode->rightRecursionDepth = 0; + } + } + + + return PyUnicode_AS_UNICODE_DIRECT(object); } Py_ssize_t PyUnicode_GetSize(PyObject *unicode) @@ -4254,6 +4427,82 @@ int PyUnicode_EncodeDecimal(Py_UNICODE * #define STRINGLIB_NEW PyUnicode_FromUnicode #define STRINGLIB_STR PyUnicode_AS_UNICODE +#define STRINGLIB_SLICE(self, start, end) unicode_slice((PyUnicodeObject *)(self), (start), (end)) + + +/* check if substring matches at given offset. 
the offset must be +valid, and the substring must not be empty */ +#define Py_UNICODE_MATCH(string, offset, substring) \ + ((*(PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(string) + (offset)) == *PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(substring)) && \ + ((*(PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(string) + (offset) + (substring)->length-1) == *(PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(substring) + (substring)->length-1))) && \ + !memcmp(PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(string) + (offset), PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(substring), (substring)->length*sizeof(Py_UNICODE))) + +static PyObject* +unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end) +{ + /* standard clamping */ + if (start < 0) + start = 0; + if (end < 0) + end = 0; + if (end > self->length) + end = self->length; + if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) { + /* full slice, return original string */ + Py_INCREF(self); + return (PyObject*) self; + } + if (start > end) + start = end; +/* copy slice */ +#if 1 + { + PyUnicodeSliceObject *slice; + if (!self->str && PyUnicode_CHECK_SLICE(self)) { + /* + * if you take the slice of an unrendered slice, + * just slice further into the original. + * (a rendered slice will have dropped its reference + * to the grandparent.) 
+ */ + PyUnicodeSliceObject *child = (PyUnicodeSliceObject *)self; + PyUnicodeObject *grandchild = child->child; + start += child->start; + start = min(start, grandchild->length); + end += child->start; + end = min(end, grandchild->length); + self = grandchild; + } + if (start == end) + return (PyObject *)_PyUnicode_New(0); + + if ( ((end - start) < PYUNICODE_MINIMUM_SIZE_FOR_SLICE_OBJECT) + || ((PyUnicode_GET_RIGHT_RECURSION_DEPTH(self) + 1) >= PYUNICODE_RIGHTRECURSIONDEPTH) ) + return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self) + start, end - start); + + slice = (PyUnicodeSliceObject *)PyObject_MALLOC(sizeof(PyUnicodeSliceObject)); + if (slice == NULL) + return PyErr_NoMemory(); + PyObject_INIT(slice, &PyUnicode_Type); + slice->hash = -1; + slice->str = NULL; + slice->defenc = NULL; + slice->length = end - start; + slice->flags = PYUNICODE_FLAG_IS_SLICE; + + slice->start = start; + slice->end = end; + slice->child = self; + slice->rightRecursionDepth = PyUnicode_GET_RIGHT_RECURSION_DEPTH(self) + 1; + Py_INCREF(self); + return (PyObject *)slice; + } +#else + return (PyObject*)PyUnicode_FromUnicode( + PyUnicode_AS_UNICODE(self) + start, end - start); +#endif +} + Py_LOCAL_INLINE(int) STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len) { @@ -4304,7 +4553,7 @@ Py_ssize_t PyUnicode_Count(PyObject *str FIX_START_END(str_obj); result = stringlib_count( - str_obj->str + start, end - start, sub_obj->str, sub_obj->length + PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(str_obj) + start, end - start, PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(sub_obj), sub_obj->length ); Py_DECREF(sub_obj); @@ -4332,14 +4581,14 @@ Py_ssize_t PyUnicode_Find(PyObject *str, if (direction > 0) result = stringlib_find_slice( - PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), - PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), + PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(str), PyUnicode_GET_SIZE(str), + PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(sub), 
PyUnicode_GET_SIZE(sub), start, end ); else result = stringlib_rfind_slice( - PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), - PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), + PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(str), PyUnicode_GET_SIZE(str), + PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(sub), PyUnicode_GET_SIZE(sub), start, end ); @@ -4415,7 +4664,7 @@ PyObject *fixup(PyUnicodeObject *self, if (u == NULL) return NULL; - Py_UNICODE_COPY(u->str, self->str, self->length); + Py_UNICODE_COPY(u->str, PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self), self->length); if (!fixfct(u) && PyUnicode_CheckExact(self)) { /* fixfct should return TRUE if it modified the buffer. If @@ -4432,7 +4681,7 @@ int fixupper(PyUnicodeObject *self) int fixupper(PyUnicodeObject *self) { Py_ssize_t len = self->length; - Py_UNICODE *s = self->str; + Py_UNICODE *s = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); int status = 0; while (len-- > 0) { @@ -4453,7 +4702,7 @@ int fixlower(PyUnicodeObject *self) int fixlower(PyUnicodeObject *self) { Py_ssize_t len = self->length; - Py_UNICODE *s = self->str; + Py_UNICODE *s = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); int status = 0; while (len-- > 0) { @@ -4474,7 +4723,7 @@ int fixswapcase(PyUnicodeObject *self) int fixswapcase(PyUnicodeObject *self) { Py_ssize_t len = self->length; - Py_UNICODE *s = self->str; + Py_UNICODE *s = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); int status = 0; while (len-- > 0) { @@ -4495,7 +4744,7 @@ int fixcapitalize(PyUnicodeObject *self) int fixcapitalize(PyUnicodeObject *self) { Py_ssize_t len = self->length; - Py_UNICODE *s = self->str; + Py_UNICODE *s = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); int status = 0; if (len == 0) @@ -4518,7 +4767,7 @@ static static int fixtitle(PyUnicodeObject *self) { - register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); + register Py_UNICODE *p = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); register Py_UNICODE *e; int previous_is_cased; @@ -4608,7 +4857,7 @@ 
PyUnicode_Join(PyObject *separator, PyOb internal_separator = PyUnicode_FromObject(separator); if (internal_separator == NULL) goto onError; - sep = PyUnicode_AS_UNICODE(internal_separator); + sep = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(internal_separator); seplen = PyUnicode_GET_SIZE(internal_separator); /* In case PyUnicode_FromObject() mutated seq. */ seqlen = PySequence_Fast_GET_SIZE(fseq); @@ -4619,7 +4868,7 @@ PyUnicode_Join(PyObject *separator, PyOb res = _PyUnicode_New(res_alloc); if (res == NULL) goto onError; - res_p = PyUnicode_AS_UNICODE(res); + res_p = PyUnicode_AS_UNICODE_DIRECT(res); res_used = 0; for (i = 0; i < seqlen; ++i) { @@ -4664,11 +4913,11 @@ PyUnicode_Join(PyObject *separator, PyOb Py_DECREF(item); goto onError; } - res_p = PyUnicode_AS_UNICODE(res) + res_used; + res_p = PyUnicode_AS_UNICODE_DIRECT(res) + res_used; } /* Copy item, and maybe the separator. */ - Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); + Py_UNICODE_COPY(res_p, PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(item), itemlen); res_p += itemlen; if (i < seqlen - 1) { Py_UNICODE_COPY(res_p, sep, seplen); @@ -4723,17 +4972,17 @@ PyUnicodeObject *pad(PyUnicodeObject *se u = _PyUnicode_New(left + self->length + right); if (u) { if (left) - Py_UNICODE_FILL(u->str, fill, left); - Py_UNICODE_COPY(u->str + left, self->str, self->length); + Py_UNICODE_FILL(PyUnicode_AS_UNICODE_DIRECT(u), fill, left); + Py_UNICODE_COPY(u->str + left, PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self), self->length); if (right) - Py_UNICODE_FILL(u->str + left + self->length, fill, right); + Py_UNICODE_FILL(PyUnicode_AS_UNICODE_DIRECT(u) + left + self->length, fill, right); } return u; } #define SPLIT_APPEND(data, left, right) \ - str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \ + str = unicode_slice((data), (left), (right)); \ if (!str) \ goto onError; \ if (PyList_Append(list, str)) { \ @@ -4752,25 +5001,26 @@ PyObject *split_whitespace(PyUnicodeObje register Py_ssize_t j; 
Py_ssize_t len = self->length; PyObject *str; + Py_UNICODE *s = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); for (i = j = 0; i < len; ) { /* find a token */ - while (i < len && Py_UNICODE_ISSPACE(self->str[i])) + while (i < len && Py_UNICODE_ISSPACE(s[i])) i++; j = i; - while (i < len && !Py_UNICODE_ISSPACE(self->str[i])) + while (i < len && !Py_UNICODE_ISSPACE(s[i])) i++; if (j < i) { if (maxcount-- <= 0) break; - SPLIT_APPEND(self->str, j, i); - while (i < len && Py_UNICODE_ISSPACE(self->str[i])) + SPLIT_APPEND(self, j, i); + while (i < len && Py_UNICODE_ISSPACE(s[i])) i++; j = i; } } if (j < len) { - SPLIT_APPEND(self->str, j, len); + SPLIT_APPEND(self, j, len); } return list; @@ -4792,7 +5042,7 @@ PyObject *PyUnicode_Splitlines(PyObject string = PyUnicode_FromObject(string); if (string == NULL) return NULL; - data = PyUnicode_AS_UNICODE(string); + data = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(string); len = PyUnicode_GET_SIZE(string); list = PyList_New(0); @@ -4817,11 +5067,11 @@ PyObject *PyUnicode_Splitlines(PyObject if (keepends) eol = i; } - SPLIT_APPEND(data, j, eol); + SPLIT_APPEND((PyUnicodeObject *)string, j, eol); j = i; } if (j < len) { - SPLIT_APPEND(data, j, len); + SPLIT_APPEND((PyUnicodeObject *)string, j, len); } Py_DECREF(string); @@ -4843,18 +5093,19 @@ PyObject *split_char(PyUnicodeObject *se register Py_ssize_t j; Py_ssize_t len = self->length; PyObject *str; + Py_UNICODE *s = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); for (i = j = 0; i < len; ) { - if (self->str[i] == ch) { + if (s[i] == ch) { if (maxcount-- <= 0) break; - SPLIT_APPEND(self->str, j, i); + SPLIT_APPEND(self, j, i); i = j = i + 1; } else i++; } if (j <= len) { - SPLIT_APPEND(self->str, j, len); + SPLIT_APPEND(self, j, len); } return list; @@ -4879,13 +5130,13 @@ PyObject *split_substring(PyUnicodeObjec if (Py_UNICODE_MATCH(self, i, substring)) { if (maxcount-- <= 0) break; - SPLIT_APPEND(self->str, j, i); + SPLIT_APPEND(self, j, i); i = j = i + sublen; } else i++; } 
if (j <= len) { - SPLIT_APPEND(self->str, j, len); + SPLIT_APPEND(self, j, len); } return list; @@ -4903,25 +5154,26 @@ PyObject *rsplit_whitespace(PyUnicodeObj register Py_ssize_t j; Py_ssize_t len = self->length; PyObject *str; + Py_UNICODE *s = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); for (i = j = len - 1; i >= 0; ) { /* find a token */ - while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i])) + while (i >= 0 && Py_UNICODE_ISSPACE(s[i])) i--; j = i; - while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i])) + while (i >= 0 && !Py_UNICODE_ISSPACE(s[i])) i--; if (j > i) { if (maxcount-- <= 0) break; - SPLIT_APPEND(self->str, i + 1, j + 1); - while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i])) + SPLIT_APPEND(self, i + 1, j + 1); + while (i >= 0 && Py_UNICODE_ISSPACE(s[i])) i--; j = i; } } if (j >= 0) { - SPLIT_APPEND(self->str, 0, j + 1); + SPLIT_APPEND(self, 0, j + 1); } if (PyList_Reverse(list) < 0) goto onError; @@ -4942,18 +5194,19 @@ PyObject *rsplit_char(PyUnicodeObject *s register Py_ssize_t j; Py_ssize_t len = self->length; PyObject *str; + Py_UNICODE *s = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); for (i = j = len - 1; i >= 0; ) { - if (self->str[i] == ch) { + if (s[i] == ch) { if (maxcount-- <= 0) break; - SPLIT_APPEND(self->str, i + 1, j + 1); + SPLIT_APPEND(self, i + 1, j + 1); j = i = i - 1; } else i--; } if (j >= -1) { - SPLIT_APPEND(self->str, 0, j + 1); + SPLIT_APPEND(self, 0, j + 1); } if (PyList_Reverse(list) < 0) goto onError; @@ -4980,14 +5233,14 @@ PyObject *rsplit_substring(PyUnicodeObje if (Py_UNICODE_MATCH(self, i, substring)) { if (maxcount-- <= 0) break; - SPLIT_APPEND(self->str, i + sublen, j); + SPLIT_APPEND(self, i + sublen, j); j = i; i -= sublen; } else i--; } if (j >= 0) { - SPLIT_APPEND(self->str, 0, j); + SPLIT_APPEND(self, 0, j); } if (PyList_Reverse(list) < 0) goto onError; @@ -5006,6 +5259,7 @@ PyObject *split(PyUnicodeObject *self, Py_ssize_t maxcount) { PyObject *list; + Py_UNICODE *s; if (maxcount < 0) maxcount = 
PY_SSIZE_T_MAX; @@ -5015,17 +5269,18 @@ PyObject *split(PyUnicodeObject *self, return NULL; if (substring == NULL) - return split_whitespace(self,list,maxcount); - - else if (substring->length == 1) - return split_char(self,list,substring->str[0],maxcount); - - else if (substring->length == 0) { - Py_DECREF(list); - PyErr_SetString(PyExc_ValueError, "empty separator"); - return NULL; - } - else + return split_whitespace(self,list,maxcount); + + s = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(substring); + if (substring->length == 1) + return split_char(self,list,s[0],maxcount); + + if (substring->length == 0) { + Py_DECREF(list); + PyErr_SetString(PyExc_ValueError, "empty separator"); + return NULL; + } + return split_substring(self,list,substring,maxcount); } @@ -5035,6 +5290,7 @@ PyObject *rsplit(PyUnicodeObject *self, Py_ssize_t maxcount) { PyObject *list; + Py_UNICODE *s; if (maxcount < 0) maxcount = PY_SSIZE_T_MAX; @@ -5044,17 +5300,18 @@ PyObject *rsplit(PyUnicodeObject *self, return NULL; if (substring == NULL) - return rsplit_whitespace(self,list,maxcount); - - else if (substring->length == 1) - return rsplit_char(self,list,substring->str[0],maxcount); - - else if (substring->length == 0) { - Py_DECREF(list); - PyErr_SetString(PyExc_ValueError, "empty separator"); - return NULL; - } - else + return rsplit_whitespace(self,list,maxcount); + + s = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(substring); + if (substring->length == 1) + return rsplit_char(self,list,s[0],maxcount); + + if (substring->length == 0) { + Py_DECREF(list); + PyErr_SetString(PyExc_ValueError, "empty separator"); + return NULL; + } + return rsplit_substring(self,list,substring,maxcount); } @@ -5065,6 +5322,9 @@ PyObject *replace(PyUnicodeObject *self, Py_ssize_t maxcount) { PyUnicodeObject *u; + Py_UNICODE *self_str = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); + Py_UNICODE *str1_str = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(str1); + Py_UNICODE *str2_str = 
PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(str2); if (maxcount < 0) maxcount = PY_SSIZE_T_MAX; @@ -5075,14 +5335,14 @@ PyObject *replace(PyUnicodeObject *self, if (str1->length == 1) { /* replace characters */ Py_UNICODE u1, u2; - if (!findchar(self->str, self->length, str1->str[0])) + if (!findchar(self_str, self->length, str1_str[0])) goto nothing; u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); if (!u) return NULL; - Py_UNICODE_COPY(u->str, self->str, self->length); - u1 = str1->str[0]; - u2 = str2->str[0]; + Py_UNICODE_COPY(u->str, self_str, self->length); + u1 = str1_str[0]; + u2 = str2_str[0]; for (i = 0; i < u->length; i++) if (u->str[i] == u1) { if (--maxcount < 0) @@ -5091,19 +5351,19 @@ PyObject *replace(PyUnicodeObject *self, } } else { i = fastsearch( - self->str, self->length, str1->str, str1->length, FAST_SEARCH + self_str, self->length, str1_str, str1->length, FAST_SEARCH ); if (i < 0) goto nothing; u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); if (!u) return NULL; - Py_UNICODE_COPY(u->str, self->str, self->length); + Py_UNICODE_COPY(u->str, self_str, self->length); while (i <= self->length - str1->length) if (Py_UNICODE_MATCH(self, i, str1)) { if (--maxcount < 0) break; - Py_UNICODE_COPY(u->str+i, str2->str, str2->length); + Py_UNICODE_COPY(u->str+i, str2_str, str2->length); i += str1->length; } else i++; @@ -5115,28 +5375,25 @@ PyObject *replace(PyUnicodeObject *self, Py_UNICODE *p; /* replace strings */ - n = stringlib_count(self->str, self->length, str1->str, str1->length); + n = stringlib_count(self_str, self->length, str1_str, str1->length); if (n > maxcount) n = maxcount; if (n == 0) goto nothing; /* new_size = self->length + n * (str2->length - str1->length)); */ delta = (str2->length - str1->length); - if (delta == 0) { - new_size = self->length; - } else { - product = n * (str2->length - str1->length); - if ((product / (str2->length - str1->length)) != n) { - PyErr_SetString(PyExc_OverflowError, - 
"replace string is too long"); - return NULL; - } - new_size = self->length + product; - if (new_size < 0) { - PyErr_SetString(PyExc_OverflowError, - "replace string is too long"); - return NULL; - } + assert(delta != 0); + product = n * delta; + if ((product / delta) != n) { + PyErr_SetString(PyExc_OverflowError, + "replace string is too long"); + return NULL; + } + new_size = self->length + product; + if (new_size < 0) { + PyErr_SetString(PyExc_OverflowError, + "replace string is too long"); + return NULL; } u = _PyUnicode_New(new_size); if (!u) @@ -5153,33 +5410,33 @@ PyObject *replace(PyUnicodeObject *self, break; j++; } - if (j > i) { + if (j > i) { if (j > e) break; /* copy unchanged part [i:j] */ - Py_UNICODE_COPY(p, self->str+i, j-i); + Py_UNICODE_COPY(p, self_str+i, j-i); p += j - i; } /* copy substitution string */ if (str2->length > 0) { - Py_UNICODE_COPY(p, str2->str, str2->length); + Py_UNICODE_COPY(p, str2_str, str2->length); p += str2->length; } i = j + str1->length; } if (i < self->length) /* copy tail [i:] */ - Py_UNICODE_COPY(p, self->str+i, self->length-i); + Py_UNICODE_COPY(p, self_str+i, self->length-i); } else { /* interleave */ while (n > 0) { - Py_UNICODE_COPY(p, str2->str, str2->length); + Py_UNICODE_COPY(p, str2_str, str2->length); p += str2->length; if (--n <= 0) break; - *p++ = self->str[i++]; + *p++ = self_str[i++]; } - Py_UNICODE_COPY(p, self->str+i, self->length-i); + Py_UNICODE_COPY(p, self_str+i, self->length-i); } } return (PyObject *) u; @@ -5190,7 +5447,7 @@ nothing: Py_INCREF(self); return (PyObject *) self; } - return PyUnicode_FromUnicode(self->str, self->length); + return PyUnicode_FromUnicode(self_str, self->length); } /* --- Unicode Object Methods --------------------------------------------- */ @@ -5278,7 +5535,7 @@ convert_uc(PyObject *obj, void *addr) Py_DECREF(uniobj); return 0; } - unistr = PyUnicode_AS_UNICODE(uniobj); + unistr = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(uniobj); *fillcharloc = unistr[0]; 
Py_DECREF(uniobj); return 1; @@ -5334,8 +5591,8 @@ unicode_compare(PyUnicodeObject *str1, P { Py_ssize_t len1, len2; - Py_UNICODE *s1 = str1->str; - Py_UNICODE *s2 = str2->str; + Py_UNICODE *s1 = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(str1); + Py_UNICODE *s2 = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(str2); len1 = str1->length; len2 = str2->length; @@ -5368,8 +5625,8 @@ unicode_compare(PyUnicodeObject *str1, P { register Py_ssize_t len1, len2; - Py_UNICODE *s1 = str1->str; - Py_UNICODE *s2 = str2->str; + Py_UNICODE *s1 = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(str1); + Py_UNICODE *s2 = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(str2); len1 = str1->length; len2 = str2->length; @@ -5530,10 +5787,18 @@ int PyUnicode_Contains(PyObject *contain /* Concat to string or Unicode object giving a new Unicode object. */ +static void +unicode_render_if_too_deep(register PyUnicodeConcatenationObject *op) +{ + if (PyUnicode_GET_RIGHT_RECURSION_DEPTH(op) >= PYUNICODE_RIGHTRECURSIONDEPTH) + PyUnicode_AsUnicode((PyObject *)op); +} + PyObject *PyUnicode_Concat(PyObject *left, PyObject *right) { - PyUnicodeObject *u = NULL, *v = NULL, *w; + PyUnicodeObject *u = NULL, *v = NULL; + PyUnicodeConcatenationObject *unicode; /* Coerce the two arguments */ u = (PyUnicodeObject *)PyUnicode_FromObject(left); @@ -5553,8 +5818,72 @@ PyObject *PyUnicode_Concat(PyObject *lef return (PyObject *)v; } - /* Concat the two Unicode strings */ - w = _PyUnicode_New(u->length + v->length); + /* Concat the two Unicode strings */ + +#if 1 + /* lch */ + /* if left side is already a concatenation object, and hasn't been rendered yet, and only has one reference, and has room, just append to it */ + if (PyUnicode_CHECK_CONCATENATED(u) && (u->str == NULL) && (u->ob_refcnt == 1)) { + unicode = (PyUnicodeConcatenationObject *)u; + if (unicode->stringsIndex < PYUNICODE_CONCATENATIONS) { + Py_INCREF(u); + unicode->strings[unicode->stringsIndex++] = v; + unicode->length += v->length; + + unicode->rightRecursionDepth = 
max(unicode->rightRecursionDepth, + PyUnicode_GET_RIGHT_RECURSION_DEPTH(v) + 1); + unicode_render_if_too_deep(unicode); + + Py_INCREF(unicode); + return (PyObject *)unicode; + } + } + /* if right side is already a concatenation object, and hasn't been rendered yet, and only has one reference, and has room, just prepend to it */ + if (PyUnicode_CHECK_CONCATENATED(v) && (v->str == NULL) && (v->ob_refcnt == 1)) { + unicode = (PyUnicodeConcatenationObject *)v; + if (unicode->stringsIndex < PYUNICODE_CONCATENATIONS) { + memmove(unicode->strings + 1, unicode->strings, unicode->stringsIndex * sizeof(PyUnicodeObject *)); + Py_INCREF(v); + unicode->strings[0] = u; + unicode->stringsIndex++; + unicode->length += u->length; + + unicode->rightRecursionDepth = max(unicode->rightRecursionDepth, + PyUnicode_GET_RIGHT_RECURSION_DEPTH(unicode->strings[1]) + 1); + unicode_render_if_too_deep(unicode); + + Py_INCREF(unicode); + return (PyObject *)unicode; + } + } + + /* lch */ + unicode = (PyUnicodeConcatenationObject *)PyObject_MALLOC(sizeof(PyUnicodeConcatenationObject)); + if (unicode == NULL) + { PyErr_NoMemory(); goto onError; } /* allocation failed: take the onError cleanup path instead of returning with u and v still referenced */ + PyObject_INIT(unicode, &PyUnicode_Type); + + unicode->str = NULL; + unicode->length = u->length + v->length; + unicode->hash = -1; + unicode->defenc = NULL; + unicode->strings[0] = u; + unicode->strings[1] = v; +#ifdef Py_DEBUG + memset(unicode->strings + 2, 0xee, sizeof(unicode->strings) - (sizeof(unicode->strings[0]) * 2)); +#endif /* Py_DEBUG */ + unicode->stringsIndex = 2; + unicode->flags = PYUNICODE_FLAG_IS_CONCATENATION; + unicode->rightRecursionDepth = PyUnicode_GET_RIGHT_RECURSION_DEPTH(v) + 1; + + Py_INCREF(u); + Py_INCREF(v); + + unicode_render_if_too_deep(unicode); + return (PyObject *)unicode; +#else + { + PyUnicodeObject *w = _PyUnicode_New(u->length + v->length); if (w == NULL) goto onError; Py_UNICODE_COPY(w->str, u->str, u->length); @@ -5563,6 +5892,8 @@ PyObject *PyUnicode_Concat(PyObject *lef Py_DECREF(u); Py_DECREF(v); return (PyObject *)w; + } 
+#endif onError: Py_XDECREF(u); @@ -5597,8 +5928,8 @@ unicode_count(PyUnicodeObject *self, PyO FIX_START_END(self); result = PyInt_FromSsize_t( - stringlib_count(self->str + start, end - start, - substring->str, substring->length) + stringlib_count(PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self) + start, end - start, + PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(substring), substring->length) ); Py_DECREF(substring); @@ -5699,7 +6030,7 @@ unicode_expandtabs(PyUnicodeObject *self /* First pass: determine size of output string */ i = j = 0; - e = self->str + self->length; + e = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self) + self->length; for (p = self->str; p < e; p++) if (*p == '\t') { if (tabsize > 0) @@ -5719,9 +6050,9 @@ unicode_expandtabs(PyUnicodeObject *self return NULL; j = 0; - q = u->str; - - for (p = self->str; p < e; p++) + q = PyUnicode_AS_UNICODE_DIRECT(u); + + for (p = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); p < e; p++) if (*p == '\t') { if (tabsize > 0) { i = tabsize - (j % tabsize); @@ -5765,8 +6096,8 @@ unicode_find(PyUnicodeObject *self, PyOb return NULL; result = stringlib_find_slice( - PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), - PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), + PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self), PyUnicode_GET_SIZE(self), + PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(substring), PyUnicode_GET_SIZE(substring), start, end ); @@ -5778,12 +6109,14 @@ static PyObject * static PyObject * unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) { + Py_UNICODE *s; if (index < 0 || index >= self->length) { PyErr_SetString(PyExc_IndexError, "string index out of range"); return NULL; } - return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); + s = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); + return (PyObject*) PyUnicode_FromUnicode(s + index, 1); } static long @@ -5802,7 +6135,7 @@ unicode_hash(PyUnicodeObject *self) if (self->hash != -1) return self->hash; len = 
PyUnicode_GET_SIZE(self); - p = PyUnicode_AS_UNICODE(self); + p = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); x = *p << 7; while (--len >= 0) x = (1000003*x) ^ *p++; @@ -5834,8 +6167,8 @@ unicode_index(PyUnicodeObject *self, PyO return NULL; result = stringlib_find_slice( - PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), - PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), + PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self), PyUnicode_GET_SIZE(self), + PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(substring), PyUnicode_GET_SIZE(substring), start, end ); @@ -5858,7 +6191,7 @@ static PyObject* static PyObject* unicode_islower(PyUnicodeObject *self) { - register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); + register const Py_UNICODE *p = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); register const Py_UNICODE *e; int cased; @@ -5892,7 +6225,7 @@ static PyObject* static PyObject* unicode_isupper(PyUnicodeObject *self) { - register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); + register const Py_UNICODE *p = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); register const Py_UNICODE *e; int cased; @@ -5928,7 +6261,7 @@ static PyObject* static PyObject* unicode_istitle(PyUnicodeObject *self) { - register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); + register const Py_UNICODE *p = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); register const Py_UNICODE *e; int cased, previous_is_cased; @@ -5974,7 +6307,7 @@ static PyObject* static PyObject* unicode_isspace(PyUnicodeObject *self) { - register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); + register const Py_UNICODE *p = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); register const Py_UNICODE *e; /* Shortcut for single character strings */ @@ -6003,7 +6336,7 @@ static PyObject* static PyObject* unicode_isalpha(PyUnicodeObject *self) { - register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); + register const Py_UNICODE *p = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); register 
const Py_UNICODE *e; /* Shortcut for single character strings */ @@ -6032,7 +6365,7 @@ static PyObject* static PyObject* unicode_isalnum(PyUnicodeObject *self) { - register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); + register const Py_UNICODE *p = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); register const Py_UNICODE *e; /* Shortcut for single character strings */ @@ -6061,7 +6394,7 @@ static PyObject* static PyObject* unicode_isdecimal(PyUnicodeObject *self) { - register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); + register const Py_UNICODE *p = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); register const Py_UNICODE *e; /* Shortcut for single character strings */ @@ -6090,7 +6423,7 @@ static PyObject* static PyObject* unicode_isdigit(PyUnicodeObject *self) { - register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); + register const Py_UNICODE *p = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); register const Py_UNICODE *e; /* Shortcut for single character strings */ @@ -6119,7 +6452,7 @@ static PyObject* static PyObject* unicode_isnumeric(PyUnicodeObject *self) { - register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); + register const Py_UNICODE *p = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); register const Py_UNICODE *e; /* Shortcut for single character strings */ @@ -6204,9 +6537,9 @@ PyObject * PyObject * _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) { - Py_UNICODE *s = PyUnicode_AS_UNICODE(self); + Py_UNICODE *s = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); Py_ssize_t len = PyUnicode_GET_SIZE(self); - Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); + Py_UNICODE *sep = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(sepobj); Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); Py_ssize_t i, j; @@ -6232,14 +6565,14 @@ _PyUnicode_XStrip(PyUnicodeObject *self, return (PyObject*)self; } else - return PyUnicode_FromUnicode(s+i, j-i); + return unicode_slice(self, i, j); } static PyObject * do_strip(PyUnicodeObject 
*self, int striptype) { - Py_UNICODE *s = PyUnicode_AS_UNICODE(self); + Py_UNICODE *s = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self); Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; i = 0; @@ -6262,7 +6595,7 @@ do_strip(PyUnicodeObject *self, int stri return (PyObject*)self; } else - return PyUnicode_FromUnicode(s+i, j-i); + return unicode_slice(self, i, j); } @@ -6354,7 +6687,7 @@ unicode_repeat(PyUnicodeObject *str, Py_ unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) { PyUnicodeObject *u; - Py_UNICODE *p; + Py_UNICODE *p, *s; Py_ssize_t nchars; size_t nbytes; @@ -6387,19 +6720,19 @@ unicode_repeat(PyUnicodeObject *str, Py_ return NULL; p = u->str; - + s = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(str); if (str->length == 1 && len > 0) { - Py_UNICODE_FILL(p, str->str[0], len); + Py_UNICODE_FILL(p, s[0], len); } else { Py_ssize_t done = 0; /* number of characters copied this far */ if (done < nchars) { - Py_UNICODE_COPY(p, str->str, str->length); + Py_UNICODE_COPY(p, s, str->length); done = str->length; - } - while (done < nchars) { - int n = (done <= nchars-done) ? done : nchars-done; - Py_UNICODE_COPY(p+done, p, n); - done += n; + while (done < nchars) { + Py_ssize_t n = (done <= nchars-done) ? 
done : nchars-done; + Py_UNICODE_COPY(p+done, p, n); + done += n; + } } } @@ -6506,8 +6839,8 @@ unicode_rfind(PyUnicodeObject *self, PyO return NULL; result = stringlib_rfind_slice( - PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), - PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), + PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self), PyUnicode_GET_SIZE(self), + PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(substring), PyUnicode_GET_SIZE(substring), start, end ); @@ -6537,8 +6870,8 @@ unicode_rindex(PyUnicodeObject *self, Py return NULL; result = stringlib_rfind_slice( - PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), - PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), + PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self), PyUnicode_GET_SIZE(self), + PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(substring), PyUnicode_GET_SIZE(substring), start, end ); @@ -6574,27 +6907,6 @@ unicode_rjust(PyUnicodeObject *self, PyO return (PyObject*) pad(self, width - self->length, 0, fillchar); } -static PyObject* -unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end) -{ - /* standard clamping */ - if (start < 0) - start = 0; - if (end < 0) - end = 0; - if (end > self->length) - end = self->length; - if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) { - /* full slice, return original string */ - Py_INCREF(self); - return (PyObject*) self; - } - if (start > end) - start = end; - /* copy slice */ - return (PyObject*) PyUnicode_FromUnicode(self->str + start, - end - start); -} PyObject *PyUnicode_Split(PyObject *s, PyObject *sep, @@ -6662,8 +6974,8 @@ PyUnicode_Partition(PyObject *str_in, Py } out = stringlib_partition( - str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), - sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) + str_obj, PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(str_obj), PyUnicode_GET_SIZE(str_obj), + sep_obj, PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(sep_obj), 
PyUnicode_GET_SIZE(sep_obj) ); Py_DECREF(sep_obj); @@ -6690,8 +7002,8 @@ PyUnicode_RPartition(PyObject *str_in, P } out = stringlib_rpartition( - str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), - sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) + str_obj, PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(str_obj), PyUnicode_GET_SIZE(str_obj), + sep_obj, PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(sep_obj), PyUnicode_GET_SIZE(sep_obj) ); Py_DECREF(sep_obj); @@ -6824,7 +7136,7 @@ static PyObject* static PyObject* unicode_translate(PyUnicodeObject *self, PyObject *table) { - return PyUnicode_TranslateCharmap(self->str, + return PyUnicode_TranslateCharmap(PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self), self->length, table, "ignore"); @@ -6864,7 +7176,7 @@ unicode_zfill(PyUnicodeObject *self, PyO } else return PyUnicode_FromUnicode( - PyUnicode_AS_UNICODE(self), + PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(self), PyUnicode_GET_SIZE(self) ); } @@ -6989,8 +7301,26 @@ static PyObject * static PyObject * unicode_getnewargs(PyUnicodeObject *v) { - return Py_BuildValue("(u#)", v->str, v->length); -} + return Py_BuildValue("(u#)", PyUnicode_AS_UNICODE(v), v->length); +} + + +PyDoc_STRVAR(simplify__doc__, +"S.simplify() -> S\n\ +\n\ +Simplifies the internal representation of S,\n\ +forcing it to \"render\" it if it is a concatenation.\n\ +Returns a new reference to S.\n\ +Only necessary for memory/reference tuning."); + +static PyObject* +unicode_simplify(PyObject *self) +{ + if (PyUnicode_AsUnicode(self) == NULL) return NULL; /* NULL means the buffer could not be produced; propagate instead of returning S with an error pending */ + Py_INCREF(self); + return (PyObject *)self; +} + static PyMethodDef unicode_methods[] = { @@ -7048,6 +7378,7 @@ static PyMethodDef unicode_methods[] = { #endif {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, + {"simplify", (PyCFunction) unicode_simplify, METH_NOARGS, simplify__doc__}, {NULL, NULL} }; @@ -7102,7 +7433,9 @@ unicode_subscript(PyUnicodeObject* self, if (slicelength <= 0) { return PyUnicode_FromUnicode(NULL, 0); - } else { 
+ } else if (step == 1) { + return unicode_slice(self, start, stop); + } else { source_buf = PyUnicode_AS_UNICODE((PyObject*)self); result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength* sizeof(Py_UNICODE)); @@ -7140,7 +7473,7 @@ unicode_buffer_getreadbuf(PyUnicodeObjec "accessing non-existent unicode segment"); return -1; } - *ptr = (void *) self->str; + *ptr = (void *) PyUnicode_AS_UNICODE(self); return PyUnicode_GET_DATA_SIZE(self); } @@ -7395,7 +7728,7 @@ formatchar(Py_UNICODE *buf, if (PyUnicode_Check(v)) { if (PyUnicode_GET_SIZE(v) != 1) goto onError; - buf[0] = PyUnicode_AS_UNICODE(v)[0]; + buf[0] = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(v)[0]; } else if (PyString_Check(v)) { @@ -7463,7 +7796,7 @@ PyObject *PyUnicode_Format(PyObject *for uformat = PyUnicode_FromObject(format); if (uformat == NULL) return NULL; - fmt = PyUnicode_AS_UNICODE(uformat); + fmt = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(uformat); fmtcnt = PyUnicode_GET_SIZE(uformat); reslen = rescnt = fmtcnt + 100; @@ -7701,7 +8034,7 @@ PyObject *PyUnicode_Format(PyObject *for goto onError; } } - pbuf = PyUnicode_AS_UNICODE(temp); + pbuf = PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(temp); len = PyUnicode_GET_SIZE(temp); if (prec >= 0 && len > prec) len = prec; @@ -7932,6 +8265,7 @@ unicode_subtype_new(PyTypeObject *type, Py_UNICODE_COPY(pnew->str, tmp->str, n+1); pnew->length = n; pnew->hash = tmp->hash; + pnew->flags = 0; Py_DECREF(tmp); return (PyObject *)pnew; } @@ -8097,7 +8431,7 @@ unicodeiter_next(unicodeiterobject *it) if (it->it_index < PyUnicode_GET_SIZE(seq)) { item = PyUnicode_FromUnicode( - PyUnicode_AS_UNICODE(seq)+it->it_index, 1); + PYUNICODE_AS_UNTERMINATED_UNICODE_STRING(seq)+it->it_index, 1); if (item != NULL) ++it->it_index; return item;