diff -r 96c7338bf39a Include/unicodeobject.h --- a/Include/unicodeobject.h Wed Oct 06 23:21:18 2010 +0200 +++ b/Include/unicodeobject.h Thu Oct 07 01:21:22 2010 +0200 @@ -359,8 +359,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE; #define Py_UNICODE_MATCH(string, offset, substring) \ ((*((string)->str + (offset)) == *((substring)->str)) && \ - ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \ - !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE))) + ((*((string)->str + (offset) + Py_SIZE(substring)-1) == *((substring)->str + Py_SIZE(substring)-1))) && \ + !memcmp((string)->str + (offset), (substring)->str, Py_SIZE(substring)*sizeof(Py_UNICODE))) #ifdef __cplusplus extern "C" { @@ -369,18 +369,18 @@ extern "C" { /* --- Unicode Type ------------------------------------------------------- */ typedef struct { - PyObject_HEAD - Py_ssize_t length; /* Length of raw Unicode data in buffer */ - Py_UNICODE *str; /* Raw Unicode buffer */ - long hash; /* Hash value; -1 if not set */ - int state; /* != 0 if interned. In this case the two + PyObject_VAR_HEAD + long hash; /* Hash value; -1 if not set */ + PyObject *defenc; /* (Default) Encoded version as Python + string, or NULL; this is used for + implementing the buffer protocol */ + unsigned char state; /* != 0 if interned. In this case the two * references from the dictionary to this object * are *not* counted in ob_refcnt. */ - PyObject *defenc; /* (Default) Encoded version as Python - string, or NULL; this is used for - implementing the buffer protocol */ + Py_UNICODE str[1]; /* Raw Unicode buffer */ } PyUnicodeObject; + PyAPI_DATA(PyTypeObject) PyUnicode_Type; PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; @@ -394,9 +394,9 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_T /* Fast access macros */ #define PyUnicode_GET_SIZE(op) \ - (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length)) + (assert(PyUnicode_Check(op)), Py_SIZE(op)) #define PyUnicode_GET_DATA_SIZE(op) \ - (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))) + (assert(PyUnicode_Check(op)), Py_SIZE(op) * sizeof(Py_UNICODE)) #define PyUnicode_AS_UNICODE(op) \ (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->str)) #define PyUnicode_AS_DATA(op) \ diff -r 96c7338bf39a Lib/test/test_io.py --- a/Lib/test/test_io.py Wed Oct 06 23:21:18 2010 +0200 +++ b/Lib/test/test_io.py Thu Oct 07 01:21:22 2010 +0200 @@ -2479,9 +2479,9 @@ class MiscIOTest(unittest.TestCase): self.assertRaises(TypeError, self.BlockingIOError, 1, "", None) b = self.BlockingIOError(1, "") self.assertEqual(b.characters_written, 0) - class C(str): + class C: pass - c = C("") + c = C() b = self.BlockingIOError(1, c) c.b = b b.c = c diff -r 96c7338bf39a Lib/test/test_sys.py --- a/Lib/test/test_sys.py Wed Oct 06 23:21:18 2010 +0200 +++ b/Lib/test/test_sys.py Thu Oct 07 01:21:22 2010 +0200 @@ -893,10 +893,11 @@ class SizeofTest(unittest.TestCase): # unicode usize = len('\0'.encode('unicode-internal')) samples = ['', '1'*100] + ucode = {2: 'H', 4: 'I'}[usize] # we need to test for both sizes, because we don't know if the string # has been cached for s in samples: - basicsize = size(h + 'PPliP') + usize * (len(s) + 1) + basicsize = struct.calcsize(vh + 'lPb' + '%d%s' % (len(s) + 1, ucode)) check(s, basicsize) # weakref import weakref diff -r 96c7338bf39a Objects/stringlib/eq.h --- a/Objects/stringlib/eq.h Wed Oct 06 23:21:18 2010 +0200 +++ b/Objects/stringlib/eq.h Thu Oct 07 01:21:22 2010 +0200 @@ -9,13 +9,13 @@ unicode_eq(PyObject *aa, PyObject *bb) register PyUnicodeObject *a = (PyUnicodeObject *)aa; register PyUnicodeObject *b = (PyUnicodeObject *)bb; - if (a->length != b->length) + if (PyUnicode_GET_SIZE(a) != PyUnicode_GET_SIZE(b)) return 0; - if (a->length == 0) + if (PyUnicode_GET_SIZE(a) == 0) return 1; if (a->str[0] != b->str[0]) return 0; - if (a->length == 1) + if (PyUnicode_GET_SIZE(a) == 1) return 1; - return memcmp(a->str, b->str, a->length * sizeof(Py_UNICODE)) == 0; + return memcmp(a->str, b->str, PyUnicode_GET_DATA_SIZE(a)) == 0; } diff -r 96c7338bf39a Objects/unicodeobject.c --- a/Objects/unicodeobject.c Wed Oct 06 23:21:18 2010 +0200 +++ b/Objects/unicodeobject.c Thu Oct 07 01:21:22 2010 +0200 @@ -46,32 +46,38 @@ OF OR IN CONNECTION WITH THE USE OR PERF #include "unicodeobject.h" #include "ucnhash.h" +#include + #ifdef MS_WINDOWS #include #endif -/* Limit for the Unicode object free list */ - -#define PyUnicode_MAXFREELIST 1024 - -/* Limit for the Unicode object free list stay alive optimization. +/* PyUnicodeObject_SIZE gives the basic physical size of an unicode string; + any memory allocation for a string of length n should request + (PyUnicodeObject_SIZE + n * sizeof(Py_UNICODE)) bytes. + + Using PyUnicodeObject_SIZE instead of sizeof(PyUnicodeObject) saves + 3 bytes per string allocation on a typical system. +*/ +#define PyUnicodeObject_SIZE (offsetof(PyUnicodeObject, str) + sizeof(Py_UNICODE)) + + +/* Number of free lists, one per unicode object size. The implementation will keep allocated Unicode memory intact for - all objects on the free list having a size less than this - limit. This reduces malloc() overhead for small Unicode objects. - - At worst this will result in PyUnicode_MAXFREELIST * - (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + - malloc()-overhead) bytes of unused garbage. + objects having a size less than this limit, within a certain number + of objects for each size (as defined by the CAN_SAVE macro below). Setting the limit to 0 effectively turns the feature off. - - Note: This is an experimental feature ! If you get core dumps when - using Unicode objects, turn this feature off. - */ -#define KEEPALIVE_SIZE_LIMIT 9 +#define MAX_SAVED_SIZE 100 + +/* We keep lots of small objects in the free lists, but less larger ones. */ + +#define CAN_SAVE(obj_length, list_size) \ + ((obj_length < 20 && list_size < 50) \ + || (list_size < 3)) /* Endianness switches; defaults to little endian */ @@ -103,9 +109,8 @@ extern "C" { */ static PyObject *interned; -/* Free list for Unicode objects */ -static PyUnicodeObject *free_list; -static int numfree; +/* Free lists for Unicode objects */ +static PyUnicodeObject *unicode_freelist[MAX_SAVED_SIZE]; /* The empty Unicode object is shared to improve performance. */ static PyUnicodeObject *unicode_empty; @@ -257,61 +262,74 @@ Py_LOCAL_INLINE(int) unicode_member(Py_U /* --- Unicode Object ----------------------------------------------------- */ static -int unicode_resize(register PyUnicodeObject *unicode, - Py_ssize_t length) -{ - void *oldstr; - - /* Shortcut if there's nothing much to do. */ - if (unicode->length == length) +PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); + +static +PyUnicodeObject *unicode_resize(register PyUnicodeObject *unicode, + Py_ssize_t length) +{ + PyUnicodeObject *v; + + /* Optimization for empty strings; yes, this sometimes happens. */ + if (length == 0 && unicode_empty != NULL) { + Py_DECREF(unicode); + Py_INCREF(unicode_empty); + return unicode_empty; + } + + /* Resizing unicode_empty and single character objects is not + possible since these are being shared. We simply return a fresh + copy with the same Unicode content. */ + if (PyUnicode_GET_SIZE(unicode) != length && + (unicode == unicode_empty || PyUnicode_GET_SIZE(unicode) == 1)) { + v = _PyUnicode_New(length); + if (v == NULL) + return NULL; + Py_UNICODE_COPY(v->str, unicode->str, + length < PyUnicode_GET_SIZE(unicode) ? length : PyUnicode_GET_SIZE(unicode)); + Py_DECREF(unicode); + return v; + } + + /* PyObject_REALLOC will almost always return a new memory block, so try + to find an existing one instead */ + if (length < MAX_SAVED_SIZE && (v = unicode_freelist[length])) { + unicode_freelist[length] = (PyUnicodeObject *) v->defenc; + v->defenc = NULL; + v->state = 0; + Py_UNICODE_COPY(v->str, unicode->str, + length < PyUnicode_GET_SIZE(unicode) ? length : PyUnicode_GET_SIZE(unicode)); + Py_DECREF(unicode); goto reset; - - /* Resizing shared object (unicode_empty or single character - objects) in-place is not allowed. Use PyUnicode_Resize() - instead ! */ - - if (unicode == unicode_empty || - (unicode->length == 1 && - unicode->str[0] < 256U && - unicode_latin1[unicode->str[0]] == unicode)) { - PyErr_SetString(PyExc_SystemError, - "can't resize shared str objects"); - return -1; - } - - /* We allocate one more byte to make sure the string is Ux0000 terminated. - The overallocation is also used by fastsearch, which assumes that it's - safe to look at str[length] (without making any assumptions about what - it contains). */ - - oldstr = unicode->str; - unicode->str = PyObject_REALLOC(unicode->str, - sizeof(Py_UNICODE) * (length + 1)); - if (!unicode->str) { - unicode->str = (Py_UNICODE *)oldstr; + } + + /* Adapted from similar code in tupleobject */ + _Py_DEC_REFTOTAL; + _Py_ForgetReference(unicode); + v = (PyUnicodeObject *) PyObject_REALLOC((char *) unicode, + PyUnicodeObject_SIZE + length * sizeof(Py_UNICODE)); + if (v == NULL) { + PyObject_DEL(unicode); PyErr_NoMemory(); - return -1; - } - unicode->str[length] = 0; - unicode->length = length; - + return NULL; + } reset: + Py_SIZE(v) = length; + v->str[length] = 0; /* Reset the object caches */ - if (unicode->defenc) { - Py_CLEAR(unicode->defenc); - } - unicode->hash = -1; - - return 0; -} - -/* We allocate one more byte to make sure the string is - Ux0000 terminated; some code (e.g. new_identifier) - relies on that. + Py_CLEAR(v->defenc); + v->hash = -1; + _Py_NewReference(v); + return v; +} + +/* We allocate one more byte to make sure the string is Ux0000 terminated. + The overallocation is also used by fastsearch, which assumes that it's + safe to look at str[length] (without making any assumptions about what + it contains). XXX This allocator could further be enhanced by assuring that the free list never reduces its size below 1. - */ static @@ -331,38 +349,22 @@ PyUnicodeObject *_PyUnicode_New(Py_ssize } /* Unicode freelist & memory allocation */ - if (free_list) { - unicode = free_list; - free_list = *(PyUnicodeObject **)unicode; - numfree--; - if (unicode->str) { - /* Keep-Alive optimization: we only upsize the buffer, - never downsize it. */ - if ((unicode->length < length) && - unicode_resize(unicode, length) < 0) { - PyObject_DEL(unicode->str); - unicode->str = NULL; - } - } - else { - size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); - unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); - } - PyObject_INIT(unicode, &PyUnicode_Type); + if (length < MAX_SAVED_SIZE + && (unicode = unicode_freelist[length])) { + _Py_NewReference(unicode); + unicode_freelist[length] = (PyUnicodeObject *) unicode->defenc; } else { - size_t new_size; - unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); - if (unicode == NULL) + /* Inline PyObject_NewVar */ + unicode = (PyUnicodeObject *) PyObject_MALLOC( + PyUnicodeObject_SIZE + length * sizeof(Py_UNICODE)); + if (!unicode) { + PyErr_NoMemory(); return NULL; - new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); - unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); - } - - if (!unicode->str) { - PyErr_NoMemory(); - goto onError; - } + } + PyObject_INIT_VAR(unicode, &PyUnicode_Type, length); + } + /* Initialize the first element to guard against cases where * the caller fails before initializing str -- unicode_resize() * reads str[0], and the Keep-Alive optimization can keep memory @@ -372,23 +374,18 @@ PyUnicodeObject *_PyUnicode_New(Py_ssize */ unicode->str[0] = 0; unicode->str[length] = 0; - unicode->length = length; + Py_SIZE(unicode) = length; unicode->hash = -1; unicode->state = 0; unicode->defenc = NULL; return unicode; - - onError: - /* XXX UNREF/NEWREF interface should be more symmetrical */ - _Py_DEC_REFTOTAL; - _Py_ForgetReference((PyObject *)unicode); - PyObject_Del(unicode); - return NULL; } static void unicode_dealloc(register PyUnicodeObject *unicode) { + Py_ssize_t length = PyUnicode_GET_SIZE(unicode); + switch (PyUnicode_CHECK_INTERNED(unicode)) { case SSTATE_NOT_INTERNED: break; @@ -408,27 +405,20 @@ void unicode_dealloc(register PyUnicodeO Py_FatalError("Inconsistent interned string state."); } - if (PyUnicode_CheckExact(unicode) && - numfree < PyUnicode_MAXFREELIST) { - /* Keep-Alive optimization */ - if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { - PyObject_DEL(unicode->str); - unicode->str = NULL; - unicode->length = 0; - } - if (unicode->defenc) { - Py_CLEAR(unicode->defenc); - } - /* Add to free list */ - *(PyUnicodeObject **)unicode = free_list; - free_list = unicode; - numfree++; - } - else { - PyObject_DEL(unicode->str); - Py_XDECREF(unicode->defenc); - Py_TYPE(unicode)->tp_free((PyObject *)unicode); - } + Py_CLEAR(unicode->defenc); + + if (PyUnicode_CheckExact(unicode) && length < MAX_SAVED_SIZE) { + PyUnicodeObject *v = unicode_freelist[length]; + if (!v || CAN_SAVE(length, PyUnicode_GET_SIZE(v))) { + /* Keep track of number of items stacked on the freelist */ + Py_SIZE(unicode) = v ? PyUnicode_GET_SIZE(v) + 1 : 1; + unicode->defenc = (PyObject *) v; + unicode_freelist[length] = unicode; + return; + } + } + + Py_TYPE(unicode)->tp_free((PyObject *)unicode); } static @@ -447,29 +437,16 @@ int _PyUnicode_Resize(PyUnicodeObject ** return -1; } - /* Resizing unicode_empty and single character objects is not - possible since these are being shared. We simply return a fresh - copy with the same Unicode content. */ - if (v->length != length && - (v == unicode_empty || v->length == 1)) { - PyUnicodeObject *w = _PyUnicode_New(length); - if (w == NULL) - return -1; - Py_UNICODE_COPY(w->str, v->str, - length < v->length ? length : v->length); - Py_DECREF(*unicode); - *unicode = w; - return 0; - } - - /* Note that we don't have to modify *unicode for unshared Unicode - objects, since we can modify them in-place. */ - return unicode_resize(v, length); + v = unicode_resize(v, length); + if (v == NULL) + return -1; + *unicode = v; + return 0; } int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) { - return _PyUnicode_Resize((PyUnicodeObject **)unicode, length); + return _PyUnicode_Resize((PyUnicodeObject **) unicode, length); } PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, @@ -794,7 +771,6 @@ PyUnicode_FromFormatV(const char *format width = (width*10) + *f++ - '0'; while (*++f && *f != '%' && !ISALPHA((unsigned)*f)) ; - /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since * they don't affect the amount of space we reserve. */ @@ -952,7 +928,6 @@ PyUnicode_FromFormatV(const char *format string = PyUnicode_FromUnicode(NULL, n); if (!string) goto fail; - s = PyUnicode_AS_UNICODE(string); callresult = callresults; @@ -6232,9 +6207,9 @@ Py_ssize_t PyUnicode_Count(PyObject *str return -1; } - ADJUST_INDICES(start, end, str_obj->length); + ADJUST_INDICES(start, end, PyUnicode_GET_SIZE(str_obj)); result = stringlib_count( - str_obj->str + start, end - start, sub_obj->str, sub_obj->length, + str_obj->str + start, end - start, sub_obj->str, PyUnicode_GET_SIZE(sub_obj), PY_SSIZE_T_MAX ); @@ -6287,11 +6262,11 @@ int tailmatch(PyUnicodeObject *self, Py_ssize_t end, int direction) { - if (substring->length == 0) + if (PyUnicode_GET_SIZE(substring) == 0) return 1; - ADJUST_INDICES(start, end, self->length); - end -= substring->length; + ADJUST_INDICES(start, end, PyUnicode_GET_SIZE(self)); + end -= PyUnicode_GET_SIZE(substring); if (end < start) return 0; @@ -6341,11 +6316,11 @@ PyObject *fixup(PyUnicodeObject *self, PyUnicodeObject *u; - u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); + u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, PyUnicode_GET_SIZE(self)); if (u == NULL) return NULL; - Py_UNICODE_COPY(u->str, self->str, self->length); + Py_UNICODE_COPY(u->str, self->str, PyUnicode_GET_SIZE(self)); if (!fixfct(u) && PyUnicode_CheckExact(self)) { /* fixfct should return TRUE if it modified the buffer. If @@ -6361,7 +6336,7 @@ PyObject *fixup(PyUnicodeObject *self, static int fixupper(PyUnicodeObject *self) { - Py_ssize_t len = self->length; + Py_ssize_t len = PyUnicode_GET_SIZE(self); Py_UNICODE *s = self->str; int status = 0; @@ -6382,7 +6357,7 @@ int fixupper(PyUnicodeObject *self) static int fixlower(PyUnicodeObject *self) { - Py_ssize_t len = self->length; + Py_ssize_t len = PyUnicode_GET_SIZE(self); Py_UNICODE *s = self->str; int status = 0; @@ -6403,7 +6378,7 @@ int fixlower(PyUnicodeObject *self) static int fixswapcase(PyUnicodeObject *self) { - Py_ssize_t len = self->length; + Py_ssize_t len = PyUnicode_GET_SIZE(self); Py_UNICODE *s = self->str; int status = 0; @@ -6424,7 +6399,7 @@ int fixswapcase(PyUnicodeObject *self) static int fixcapitalize(PyUnicodeObject *self) { - Py_ssize_t len = self->length; + Py_ssize_t len = PyUnicode_GET_SIZE(self); Py_UNICODE *s = self->str; int status = 0; @@ -6603,6 +6578,7 @@ PyUnicodeObject *pad(PyUnicodeObject *se Py_UNICODE fill) { PyUnicodeObject *u; + Py_ssize_t length = PyUnicode_GET_SIZE(self); if (left < 0) left = 0; @@ -6614,18 +6590,18 @@ PyUnicodeObject *pad(PyUnicodeObject *se return self; } - if (left > PY_SSIZE_T_MAX - self->length || - right > PY_SSIZE_T_MAX - (left + self->length)) { + if (left > PY_SSIZE_T_MAX - length || + right > PY_SSIZE_T_MAX - (left + length)) { PyErr_SetString(PyExc_OverflowError, "padded string is too long"); return NULL; } - u = _PyUnicode_New(left + self->length + right); + u = _PyUnicode_New(left + length + right); if (u) { if (left) Py_UNICODE_FILL(u->str, fill, left); - Py_UNICODE_COPY(u->str + left, self->str, self->length); + Py_UNICODE_COPY(u->str + left, self->str, length); if (right) - Py_UNICODE_FILL(u->str + left + self->length, fill, right); + Py_UNICODE_FILL(u->str + left + length, fill, right); } return u; @@ -6657,12 +6633,12 @@ PyObject *split(PyUnicodeObject *self, if (substring == NULL) return stringlib_split_whitespace( - (PyObject*) self, self->str, self->length, maxcount + (PyObject*) self, self->str, PyUnicode_GET_SIZE(self), maxcount ); return stringlib_split( - (PyObject*) self, self->str, self->length, - substring->str, substring->length, + (PyObject*) self, self->str, PyUnicode_GET_SIZE(self), + substring->str, PyUnicode_GET_SIZE(substring), maxcount ); } @@ -6677,12 +6653,12 @@ PyObject *rsplit(PyUnicodeObject *self, if (substring == NULL) return stringlib_rsplit_whitespace( - (PyObject*) self, self->str, self->length, maxcount + (PyObject*) self, self->str, PyUnicode_GET_SIZE(self), maxcount ); return stringlib_rsplit( - (PyObject*) self, self->str, self->length, - substring->str, substring->length, + (PyObject*) self, self->str, PyUnicode_GET_SIZE(self), + substring->str, PyUnicode_GET_SIZE(substring), maxcount ); } @@ -6697,26 +6673,26 @@ PyObject *replace(PyUnicodeObject *self, if (maxcount < 0) maxcount = PY_SSIZE_T_MAX; - else if (maxcount == 0 || self->length == 0) + else if (maxcount == 0 || PyUnicode_GET_SIZE(self) == 0) goto nothing; - if (str1->length == str2->length) { + if (PyUnicode_GET_SIZE(str1) == PyUnicode_GET_SIZE(str2)) { Py_ssize_t i; /* same length */ - if (str1->length == 0) + if (PyUnicode_GET_SIZE(str1) == 0) goto nothing; - if (str1->length == 1) { + if (PyUnicode_GET_SIZE(str1) == 1) { /* replace characters */ Py_UNICODE u1, u2; - if (!findchar(self->str, self->length, str1->str[0])) + if (!findchar(self->str, PyUnicode_GET_SIZE(self), str1->str[0])) goto nothing; - u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); + u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, PyUnicode_GET_SIZE(self)); if (!u) return NULL; - Py_UNICODE_COPY(u->str, self->str, self->length); + Py_UNICODE_COPY(u->str, self->str, PyUnicode_GET_SIZE(self)); u1 = str1->str[0]; u2 = str2->str[0]; - for (i = 0; i < u->length; i++) + for (i = 0; i < PyUnicode_GET_SIZE(u); i++) if (u->str[i] == u1) { if (--maxcount < 0) break; @@ -6724,27 +6700,28 @@ PyObject *replace(PyUnicodeObject *self, } } else { i = stringlib_find( - self->str, self->length, str1->str, str1->length, 0 + self->str, PyUnicode_GET_SIZE(self), + str1->str, PyUnicode_GET_SIZE(str1), 0 ); if (i < 0) goto nothing; - u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); + u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, PyUnicode_GET_SIZE(self)); if (!u) return NULL; - Py_UNICODE_COPY(u->str, self->str, self->length); + Py_UNICODE_COPY(u->str, self->str, PyUnicode_GET_SIZE(self)); /* change everything in-place, starting with this one */ - Py_UNICODE_COPY(u->str+i, str2->str, str2->length); - i += str1->length; + Py_UNICODE_COPY(u->str+i, str2->str, PyUnicode_GET_SIZE(str2)); + i += PyUnicode_GET_SIZE(str1); while ( --maxcount > 0) { - i = stringlib_find(self->str+i, self->length-i, - str1->str, str1->length, + i = stringlib_find(self->str+i, PyUnicode_GET_SIZE(self)-i, + str1->str, PyUnicode_GET_SIZE(str1), i); if (i == -1) break; - Py_UNICODE_COPY(u->str+i, str2->str, str2->length); - i += str1->length; + Py_UNICODE_COPY(u->str+i, str2->str, PyUnicode_GET_SIZE(str2)); + i += PyUnicode_GET_SIZE(str1); } } } else { @@ -6754,22 +6731,23 @@ PyObject *replace(PyUnicodeObject *self, Py_UNICODE *p; /* replace strings */ - n = stringlib_count(self->str, self->length, str1->str, str1->length, + n = stringlib_count(self->str, PyUnicode_GET_SIZE(self), + str1->str, PyUnicode_GET_SIZE(str1), maxcount); if (n == 0) goto nothing; - /* new_size = self->length + n * (str2->length - str1->length)); */ - delta = (str2->length - str1->length); + /* new_size = PyUnicode_GET_SIZE(self) + n * (PyUnicode_GET_SIZE(str2) - PyUnicode_GET_SIZE(str1))); */ + delta = (PyUnicode_GET_SIZE(str2) - PyUnicode_GET_SIZE(str1)); if (delta == 0) { - new_size = self->length; + new_size = PyUnicode_GET_SIZE(self); } else { - product = n * (str2->length - str1->length); - if ((product / (str2->length - str1->length)) != n) { + product = n * (PyUnicode_GET_SIZE(str2) - PyUnicode_GET_SIZE(str1)); + if ((product / (PyUnicode_GET_SIZE(str2) - PyUnicode_GET_SIZE(str1))) != n) { PyErr_SetString(PyExc_OverflowError, "replace string is too long"); return NULL; } - new_size = self->length + product; + new_size = PyUnicode_GET_SIZE(self) + product; if (new_size < 0) { PyErr_SetString(PyExc_OverflowError, "replace string is too long"); @@ -6781,12 +6759,12 @@ PyObject *replace(PyUnicodeObject *self, return NULL; i = 0; p = u->str; - e = self->length - str1->length; - if (str1->length > 0) { + e = PyUnicode_GET_SIZE(self) - PyUnicode_GET_SIZE(str1); + if (PyUnicode_GET_SIZE(str1) > 0) { while (n-- > 0) { /* look for next match */ - j = stringlib_find(self->str+i, self->length-i, - str1->str, str1->length, + j = stringlib_find(self->str+i, PyUnicode_GET_SIZE(self)-i, + str1->str, PyUnicode_GET_SIZE(str1), i); if (j == -1) break; @@ -6796,25 +6774,25 @@ PyObject *replace(PyUnicodeObject *self, p += j - i; } /* copy substitution string */ - if (str2->length > 0) { - Py_UNICODE_COPY(p, str2->str, str2->length); - p += str2->length; + if (PyUnicode_GET_SIZE(str2) > 0) { + Py_UNICODE_COPY(p, str2->str, PyUnicode_GET_SIZE(str2)); + p += PyUnicode_GET_SIZE(str2); } - i = j + str1->length; - } - if (i < self->length) + i = j + PyUnicode_GET_SIZE(str1); + } + if (i < PyUnicode_GET_SIZE(self)) /* copy tail [i:] */ - Py_UNICODE_COPY(p, self->str+i, self->length-i); + Py_UNICODE_COPY(p, self->str+i, PyUnicode_GET_SIZE(self)-i); } else { /* interleave */ while (n > 0) { - Py_UNICODE_COPY(p, str2->str, str2->length); - p += str2->length; + Py_UNICODE_COPY(p, str2->str, PyUnicode_GET_SIZE(str2)); + p += PyUnicode_GET_SIZE(str2); if (--n <= 0) break; *p++ = self->str[i++]; } - Py_UNICODE_COPY(p, self->str+i, self->length-i); + Py_UNICODE_COPY(p, self->str+i, PyUnicode_GET_SIZE(self)-i); } } return (PyObject *) u; @@ -6825,7 +6803,7 @@ PyObject *replace(PyUnicodeObject *self, Py_INCREF(self); return (PyObject *) self; } - return PyUnicode_FromUnicode(self->str, self->length); + return PyUnicode_FromUnicode(self->str, PyUnicode_GET_SIZE(self)); } /* --- Unicode Object Methods --------------------------------------------- */ @@ -6935,12 +6913,12 @@ unicode_center(PyUnicodeObject *self, Py if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) return NULL; - if (self->length >= width && PyUnicode_CheckExact(self)) { + if (PyUnicode_GET_SIZE(self) >= width && PyUnicode_CheckExact(self)) { Py_INCREF(self); return (PyObject*) self; } - marg = width - self->length; + marg = width - PyUnicode_GET_SIZE(self); left = marg / 2 + (marg & width & 1); return (PyObject*) pad(self, left, marg - left, fillchar); @@ -6972,8 +6950,8 @@ unicode_compare(PyUnicodeObject *str1, P Py_UNICODE *s1 = str1->str; Py_UNICODE *s2 = str2->str; - len1 = str1->length; - len2 = str2->length; + len1 = PyUnicode_GET_SIZE(str1); + len2 = PyUnicode_GET_SIZE(str2); while (len1 > 0 && len2 > 0) { Py_UNICODE c1, c2; @@ -7006,8 +6984,8 @@ unicode_compare(PyUnicodeObject *str1, P Py_UNICODE *s1 = str1->str; Py_UNICODE *s2 = str2->str; - len1 = str1->length; - len2 = str2->length; + len1 = PyUnicode_GET_SIZE(str1); + len2 = PyUnicode_GET_SIZE(str2); while (len1 > 0 && len2 > 0) { Py_UNICODE c1, c2; @@ -7071,8 +7049,7 @@ PyObject *PyUnicode_RichCompare(PyObject if (PyUnicode_Check(left) && PyUnicode_Check(right)) { PyObject *v; - if (((PyUnicodeObject *) left)->length != - ((PyUnicodeObject *) right)->length) { + if (PyUnicode_GET_SIZE(left) != PyUnicode_GET_SIZE(right)) { if (op == Py_EQ) { Py_INCREF(Py_False); return Py_False; @@ -7175,11 +7152,11 @@ PyObject *PyUnicode_Concat(PyObject *lef } /* Concat the two Unicode strings */ - w = _PyUnicode_New(u->length + v->length); + w = _PyUnicode_New(PyUnicode_GET_SIZE(u) + PyUnicode_GET_SIZE(v)); if (w == NULL) goto onError; - Py_UNICODE_COPY(w->str, u->str, u->length); - Py_UNICODE_COPY(w->str + u->length, v->str, v->length); + Py_UNICODE_COPY(w->str, u->str, PyUnicode_GET_SIZE(u)); + Py_UNICODE_COPY(w->str + PyUnicode_GET_SIZE(u), v->str, PyUnicode_GET_SIZE(v)); Py_DECREF(u); Py_DECREF(v); @@ -7238,10 +7215,10 @@ unicode_count(PyUnicodeObject *self, PyO if (substring == NULL) return NULL; - ADJUST_INDICES(start, end, self->length); + ADJUST_INDICES(start, end, PyUnicode_GET_SIZE(self)); result = PyLong_FromSsize_t( stringlib_count(self->str + start, end - start, - substring->str, substring->length, + substring->str, PyUnicode_GET_SIZE(substring), PY_SSIZE_T_MAX) ); @@ -7311,7 +7288,7 @@ unicode_expandtabs(PyUnicodeObject *self /* First pass: determine size of output string */ i = 0; /* chars up to and including most recent \n or \r */ j = 0; /* chars since most recent \n or \r (use in tab calculations) */ - e = self->str + self->length; /* end of input */ + e = self->str + PyUnicode_GET_SIZE(self); /* end of input */ for (p = self->str; p < e; p++) if (*p == '\t') { if (tabsize > 0) { @@ -7343,7 +7320,7 @@ unicode_expandtabs(PyUnicodeObject *self j = 0; /* same as in first pass */ q = u->str; /* next output char */ - qe = u->str + u->length; /* end of output */ + qe = u->str + PyUnicode_GET_SIZE(u); /* end of output */ for (p = self->str; p < e; p++) if (*p == '\t') { @@ -7409,7 +7386,7 @@ unicode_find(PyUnicodeObject *self, PyOb static PyObject * unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) { - if (index < 0 || index >= self->length) { + if (index < 0 || index >= PyUnicode_GET_SIZE(self)) { PyErr_SetString(PyExc_IndexError, "string index out of range"); return NULL; } @@ -7844,7 +7821,7 @@ unicode_join(PyObject *self, PyObject *d static Py_ssize_t unicode_length(PyUnicodeObject *self) { - return self->length; + return PyUnicode_GET_SIZE(self); } PyDoc_STRVAR(ljust__doc__, @@ -7862,12 +7839,12 @@ unicode_ljust(PyUnicodeObject *self, PyO if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) return NULL; - if (self->length >= width && PyUnicode_CheckExact(self)) { + if (PyUnicode_GET_SIZE(self) >= width && PyUnicode_CheckExact(self)) { Py_INCREF(self); return (PyObject*) self; } - return (PyObject*) pad(self, 0, width - self->length, fillchar); + return (PyObject*) pad(self, 0, width - PyUnicode_GET_SIZE(self), fillchar); } PyDoc_STRVAR(lower__doc__, @@ -8050,8 +8027,8 @@ unicode_repeat(PyUnicodeObject *str, Py_ /* ensure # of chars needed doesn't overflow int and # of bytes * needed doesn't overflow size_t */ - nchars = len * str->length; - if (nchars / len != str->length) { + nchars = len * PyUnicode_GET_SIZE(str); + if (len && nchars / len != PyUnicode_GET_SIZE(str)) { PyErr_SetString(PyExc_OverflowError, "repeated string is too long"); return NULL; @@ -8068,11 +8045,14 @@ unicode_repeat(PyUnicodeObject *str, Py_ p = u->str; - if (str->length == 1) { + if (PyUnicode_GET_SIZE(str) == 1 && len > 0) { Py_UNICODE_FILL(p, str->str[0], len); } else { - Py_ssize_t done = str->length; /* number of characters copied this far */ - Py_UNICODE_COPY(p, str->str, str->length); + Py_ssize_t done = 0; /* number of characters copied this far */ + if (done < nchars) { + Py_UNICODE_COPY(p, str->str, PyUnicode_GET_SIZE(str)); + done = PyUnicode_GET_SIZE(str); + } while (done < nchars) { Py_ssize_t n = (done <= nchars-done) ? done : nchars-done; Py_UNICODE_COPY(p+done, p, n); @@ -8376,12 +8356,12 @@ unicode_rjust(PyUnicodeObject *self, PyO if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) return NULL; - if (self->length >= width && PyUnicode_CheckExact(self)) { + if (PyUnicode_GET_SIZE(self) >= width && PyUnicode_CheckExact(self)) { Py_INCREF(self); return (PyObject*) self; } - return (PyObject*) pad(self, width - self->length, 0, fillchar); + return (PyObject*) pad(self, width - PyUnicode_GET_SIZE(self), 0, fillchar); } PyObject *PyUnicode_Split(PyObject *s, @@ -8722,7 +8702,7 @@ are deleted."); static PyObject* unicode_translate(PyUnicodeObject *self, PyObject *table) { - return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore"); + return PyUnicode_TranslateCharmap(self->str, PyUnicode_GET_SIZE(self), table, "ignore"); } PyDoc_STRVAR(upper__doc__, @@ -8752,7 +8732,7 @@ unicode_zfill(PyUnicodeObject *self, PyO if (!PyArg_ParseTuple(args, "n:zfill", &width)) return NULL; - if (self->length >= width) { + if (PyUnicode_GET_SIZE(self) >= width) { if (PyUnicode_CheckExact(self)) { Py_INCREF(self); return (PyObject*) self; @@ -8764,7 +8744,7 @@ unicode_zfill(PyUnicodeObject *self, PyO ); } - fill = width - self->length; + fill = width - PyUnicode_GET_SIZE(self); u = pad(self, fill, 0, '0'); @@ -8780,14 +8760,6 @@ unicode_zfill(PyUnicodeObject *self, PyO return (PyObject*) u; } -#if 0 -static PyObject* -unicode_freelistsize(PyUnicodeObject *self) -{ - return PyLong_FromLong(numfree); -} -#endif - PyDoc_STRVAR(startswith__doc__, "S.startswith(prefix[, start[, end]]) -> bool\n\ \n\ @@ -8907,8 +8879,8 @@ PyDoc_STRVAR(p_format__doc__, static PyObject * unicode__sizeof__(PyUnicodeObject *v) { - return PyLong_FromSsize_t(sizeof(PyUnicodeObject) + - sizeof(Py_UNICODE) * (v->length + 1)); + return PyLong_FromSsize_t(PyUnicodeObject_SIZE + + sizeof(Py_UNICODE) * PyUnicode_GET_SIZE(v)); } PyDoc_STRVAR(sizeof__doc__, @@ -8917,7 +8889,7 @@ PyDoc_STRVAR(sizeof__doc__, static PyObject * unicode_getnewargs(PyUnicodeObject *v) { - return Py_BuildValue("(u#)", v->str, v->length); + return Py_BuildValue("(u#)", v->str, PyUnicode_GET_SIZE(v)); } @@ -9037,7 +9009,7 @@ unicode_subscript(PyUnicodeObject* self, if (slicelength <= 0) { return PyUnicode_FromUnicode(NULL, 0); - } else if (start == 0 && step == 1 && slicelength == self->length && + } else if (start == 0 && step == 1 && slicelength == PyUnicode_GET_SIZE(self) && PyUnicode_CheckExact(self)) { Py_INCREF(self); return (PyObject *)self; @@ -9671,22 +9643,13 @@ unicode_subtype_new(PyTypeObject *type, if (tmp == NULL) return NULL; assert(PyUnicode_Check(tmp)); - pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); - if (pnew == NULL) { + pnew = (PyUnicodeObject *) type->tp_alloc(type, n = PyUnicode_GET_SIZE(tmp)); + if (pnew != NULL) { + Py_UNICODE_COPY(pnew->str, tmp->str, n+1); + Py_SIZE(pnew) = n; + pnew->hash = tmp->hash; Py_DECREF(tmp); - return NULL; - } - pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1)); - if (pnew->str == NULL) { - _Py_ForgetReference((PyObject *)pnew); - PyObject_Del(pnew); - Py_DECREF(tmp); - return PyErr_NoMemory(); - } - Py_UNICODE_COPY(pnew->str, tmp->str, n+1); - pnew->length = n; - pnew->hash = tmp->hash; - Py_DECREF(tmp); + } return (PyObject *)pnew; } @@ -9701,9 +9664,9 @@ static PyObject *unicode_iter(PyObject * PyTypeObject PyUnicode_Type = { PyVarObject_HEAD_INIT(&PyType_Type, 0) - "str", /* tp_name */ - sizeof(PyUnicodeObject), /* tp_size */ - 0, /* tp_itemsize */ + "str", /* tp_name */ + sizeof(PyUnicodeObject), /* tp_size */ + sizeof(Py_UNICODE), /* tp_itemsize */ /* Slots */ (destructor)unicode_dealloc, /* tp_dealloc */ 0, /* tp_print */ @@ -9762,8 +9725,6 @@ void _PyUnicode_Init(void) }; /* Init the implementation */ - free_list = NULL; - numfree = 0; unicode_empty = _PyUnicode_New(0); if (!unicode_empty) return; @@ -9786,21 +9747,20 @@ void _PyUnicode_Init(void) int PyUnicode_ClearFreeList(void) { - int freelist_size = numfree; - PyUnicodeObject *u; - - for (u = free_list; u != NULL;) { - PyUnicodeObject *v = u; - u = *(PyUnicodeObject **)u; - if (v->str) - PyObject_DEL(v->str); - Py_XDECREF(v->defenc); - PyObject_Del(v); - numfree--; - } - free_list = NULL; - assert(numfree == 0); - return freelist_size; + int i, freed_objects = 0; + for (i = 0; i < MAX_SAVED_SIZE; i++) { + PyUnicodeObject *u, *v; + u = unicode_freelist[i]; + while (u != NULL) { + v = (PyUnicodeObject *) u->defenc; + Py_SIZE(u) = i; + PyObject_DEL(u); + u = v; + freed_objects++; + } + unicode_freelist[i] = NULL; + } + return freed_objects; } void @@ -9919,11 +9879,11 @@ void _Py_ReleaseInternedUnicodeStrings(v break; case SSTATE_INTERNED_IMMORTAL: Py_REFCNT(s) += 1; - immortal_size += s->length; + immortal_size += PyUnicode_GET_SIZE(s); break; case SSTATE_INTERNED_MORTAL: Py_REFCNT(s) += 2; - mortal_size += s->length; + mortal_size += PyUnicode_GET_SIZE(s); break; default: Py_FatalError("Inconsistent interned string state.");