Index: Python/ceval.c =================================================================== --- Python/ceval.c (revision 52047) +++ Python/ceval.c (working copy) @@ -4279,7 +4279,7 @@ } } - if (v->ob_refcnt == 1 && !PyString_CHECK_INTERNED(v)) { + if (v->ob_refcnt == 1 && !PyString_CHECK_INTERNED(v) && !PyString_CHECK_CONCATENATED(v)) { /* Now we own the last reference to 'v', so we can resize it * in-place. */ Index: Include/stringobject.h =================================================================== --- Include/stringobject.h (revision 52047) +++ Include/stringobject.h (working copy) @@ -36,7 +36,8 @@ PyObject_VAR_HEAD long ob_shash; int ob_sstate; - char ob_sval[1]; + char *ob_sval; + char ob_svalstorage[1]; /* Invariants: * ob_sval contains space for 'ob_size+1' elements. @@ -48,10 +49,27 @@ */ } PyStringObject; +#define PYSTRING_CONCATENATIONS (8) +#define PYSTRING_RIGHTRECURSIONDEPTH (16384) + +typedef struct { + PyObject_VAR_HEAD + long ob_shash; + int ob_sstate; + char *ob_sval; + char ob_svalstorage[1]; + unsigned short ob_sstringsindex; + unsigned short ob_srightrecursiondepth; + PyStringObject *ob_sstrings[PYSTRING_CONCATENATIONS]; +} PyStringConcatenationObject; + + #define SSTATE_NOT_INTERNED 0 #define SSTATE_INTERNED_MORTAL 1 #define SSTATE_INTERNED_IMMORTAL 2 +#define SSTATE_CONCATENATION 4 + PyAPI_DATA(PyTypeObject) PyBaseString_Type; PyAPI_DATA(PyTypeObject) PyString_Type; @@ -83,12 +101,20 @@ PyAPI_FUNC(PyObject *) PyString_InternFromString(const char *); PyAPI_FUNC(void) _Py_ReleaseInternedStrings(void); -/* Use only if you know it's a string */ -#define PyString_CHECK_INTERNED(op) (((PyStringObject *)(op))->ob_sstate) +/* Use these only if you know it's a string */ +#define __PyString_STATE(op) (((PyStringObject *)(op))->ob_sstate) +#define PyString_CHECK_INTERNED(op) (__PyString_STATE(op) & ( SSTATE_INTERNED_MORTAL | SSTATE_INTERNED_IMMORTAL)) +#define PyString_CHECK_CONCATENATED(op) (__PyString_STATE(op) & SSTATE_CONCATENATION) +#define PyString_SET_INTERNED(op, val) (__PyString_STATE(op) = (__PyString_STATE(op) & ~(SSTATE_INTERNED_MORTAL | SSTATE_INTERNED_IMMORTAL)) | (val & (SSTATE_INTERNED_MORTAL | SSTATE_INTERNED_IMMORTAL))) +#define PyString_SET_CONCATENATED(op, val) (__PyString_STATE(op) = (__PyString_STATE(op) & ~SSTATE_CONCATENATION) | (val & SSTATE_CONCATENATION)) + + /* Macro, trading safety for speed */ -#define PyString_AS_STRING(op) (((PyStringObject *)(op))->ob_sval) +#define PyString_AS_STRING_DIRECT(op) (((PyStringObject *)(op))->ob_sval) +#define PyString_AS_STRING(op) ( PyString_AS_STRING_DIRECT(op) ? PyString_AS_STRING_DIRECT(op) : PyString_AsString((struct _object *)op) ) #define PyString_GET_SIZE(op) (((PyStringObject *)(op))->ob_size) +#define PyString_GET_RIGHT_RECURSION_DEPTH(op) (PyString_CHECK_CONCATENATED(op) ? (((PyStringConcatenationObject *)(op))->ob_srightrecursiondepth) : 0) /* _PyString_Join(sep, x) is like sep.join(x). sep must be PyStringObject*, x must be an iterable object. */ Index: Objects/codeobject.c =================================================================== --- Objects/codeobject.c (revision 52047) +++ Objects/codeobject.c (working copy) @@ -25,6 +25,11 @@ return 1; } +int +PyCode_AllNameChars(unsigned char *s) { + return all_name_chars(s); +} + static void intern_strings(PyObject *tuple) { @@ -71,7 +76,7 @@ /* Intern selected string constants */ for (i = PyTuple_Size(consts); --i >= 0; ) { PyObject *v = PyTuple_GetItem(consts, i); - if (!PyString_Check(v)) + if (!PyString_Check(v) || (PyString_AS_STRING_DIRECT(v) == NULL)) continue; if (!all_name_chars((unsigned char *)PyString_AS_STRING(v))) continue; Index: Objects/stringobject.c =================================================================== --- Objects/stringobject.c (revision 52047) +++ Objects/stringobject.c (working copy) @@ -6,6 +6,8 @@ #include + + #ifdef COUNT_ALLOCS int null_strings, one_strings; #endif @@ -78,6 +80,7 @@ PyObject_INIT_VAR(op, &PyString_Type, size); op->ob_shash = -1; op->ob_sstate = SSTATE_NOT_INTERNED; + op->ob_sval = op->ob_svalstorage; if (str != NULL) Py_MEMCPY(op->ob_sval, str, size); op->ob_sval[size] = '\0'; @@ -133,6 +136,7 @@ PyObject_INIT_VAR(op, &PyString_Type, size); op->ob_shash = -1; op->ob_sstate = SSTATE_NOT_INTERNED; + op->ob_sval = op->ob_svalstorage; Py_MEMCPY(op->ob_sval, str, size+1); /* share short strings */ if (size == 0) { @@ -512,6 +516,30 @@ return NULL; } +/* + * *Carefully* deallocate the recursive tree of concatenation objects, + * being careful to *iterate* (*not* recurse) down the left-hand side. + */ +static void recursive_dealloc(PyStringConcatenationObject *concat) +{ + for (;;) { + PyStringConcatenationObject *next; + + if (concat == NULL) + return; + + if ((concat->ob_refcnt == 1) && PyString_CHECK_CONCATENATED(concat) && (concat->ob_sstringsindex)) { + next = (PyStringConcatenationObject *)*concat->ob_sstrings; + *concat->ob_sstrings = NULL; + } + else + next = NULL; + + Py_DECREF(concat); + concat = next; + } +} + static void string_dealloc(PyObject *op) { @@ -533,6 +561,27 @@ default: Py_FatalError("Inconsistent interned string state."); } + + /* lch */ + if (PyString_CHECK_CONCATENATED(op)) { + PyStringConcatenationObject *concat = (PyStringConcatenationObject *)op; + register PyStringObject **i; + if (concat->ob_sstringsindex) { + for (i = concat->ob_sstrings + concat->ob_sstringsindex - 1; i > concat->ob_sstrings; i--) { + if (*i) { + Py_DECREF(*i); + } + } + + if (*i) { + recursive_dealloc((PyStringConcatenationObject *)*i); + } + } + + if (concat->ob_sval != NULL) { + PyObject_Free(concat->ob_sval); + } + } op->ob_type->tp_free(op); } @@ -720,12 +769,94 @@ return ((PyStringObject *)op) -> ob_size; } + +static void recursiveConcatenate(char *buffer, Py_ssize_t length, PyStringConcatenationObject *s) { + register PyStringObject **i; + + for (;;) { + /* + * optimized for the general case of 'a'+'b'+'c'+'d'+'e': + * in this case, we will never actually recurse, we will iterate + */ + + if (s->ob_sval != NULL) { + memcpy(buffer, s->ob_sval, s->ob_size); + return; + } + + for (i = s->ob_sstrings + s->ob_sstringsindex - 1; i >= s->ob_sstrings + 1; i--) { + PyStringObject *child = *i; + char *childDestination; + length -= child->ob_size; + childDestination = buffer + length; + if (!PyString_CHECK_CONCATENATED(child)) + memcpy(childDestination, child->ob_sval, child->ob_size); + else + recursiveConcatenate(childDestination, child->ob_size, (PyStringConcatenationObject *)child); + } + + s = (PyStringConcatenationObject *)*s->ob_sstrings; + } +} + + /*const*/ char * PyString_AsString(register PyObject *op) { + register PyStringConcatenationObject *s = (PyStringConcatenationObject *)op; + if (!PyString_Check(op)) return string_getbuffer(op); - return ((PyStringObject *)op) -> ob_sval; + + /* lch */ + if ((s->ob_sval == NULL) && PyString_CHECK_CONCATENATED(s)) { + register PyStringObject **i; + /* if the string is small, we'll just overwrite the concatenation parts of the structure with the string */ + char smallStackBuffer[sizeof(PyStringConcatenationObject) - sizeof(PyStringObject) + sizeof((*i)->ob_svalstorage)]; + int smallEnough = (s->ob_size + 1) < sizeof(smallStackBuffer); + register char *string; + + if (smallEnough) + string = smallStackBuffer; + else + string = (char *)PyObject_Malloc(s->ob_size + 1); + + recursiveConcatenate(string, s->ob_size, s); + + string[s->ob_size] = 0; + + for (i = s->ob_sstrings + s->ob_sstringsindex - 1; i >= s->ob_sstrings; i--) { + Py_DECREF(*i); +#ifdef Py_DEBUG + *i = NULL; +#endif /* Py_DEBUG */ + } + + if (smallEnough) { + s->ob_sval = s->ob_svalstorage; + memcpy(s->ob_sval, smallStackBuffer, s->ob_size + 1); + /* s is no longer a concatenation object! */ + PyString_SET_CONCATENATED(s, 0); + } + else { + s->ob_sval = string; + s->ob_sstringsindex = 0; + s->ob_srightrecursiondepth = 0; + } + +#if 0 + /* this doesn't work; InternInPlace can realloc the object, and we have no way of returning the changed object to the caller */ + if ((s->ob_refcnt == 1) && PyCode_AllNameChars((unsigned char *)string)) { + /* why this? can't & on a register var */ + /* NEEDSWORK: this is never called! */ + PyObject *o = (PyObject *)s; + PyString_InternInPlace((PyObject **)&o); + s = (PyStringConcatenationObject *)o; + } +#endif + } + + return s->ob_sval; } int @@ -803,6 +934,7 @@ Py_DECREF(op); return ret; } + PyString_AS_STRING(op); // force string concatenation objects to render if (flags & Py_PRINT_RAW) { #ifdef __VMS if (op->ob_size) fwrite(op->ob_sval, (int) op->ob_size, 1, fp); @@ -856,8 +988,10 @@ register Py_ssize_t i; register char c; register char *p; + register char *pStart; int quote; + PyString_AS_STRING(op); // force concatenated strings to render /* figure out which quote to use; single is preferred */ quote = '\''; if (smartquotes && @@ -865,12 +999,12 @@ !memchr(op->ob_sval, '"', op->ob_size)) quote = '"'; - p = PyString_AS_STRING(v); + p = pStart = PyString_AS_STRING_DIRECT(v); *p++ = quote; for (i = 0; i < op->ob_size; i++) { /* There's at least enough room for a hex escape and a closing quote. */ - assert(newsize - (p - PyString_AS_STRING(v)) >= 5); + assert(newsize - (p - pStart) >= 5); c = op->ob_sval[i]; if (c == quote || c == '\\') *p++ = '\\', *p++ = c; @@ -890,11 +1024,10 @@ else *p++ = c; } - assert(newsize - (p - PyString_AS_STRING(v)) >= 1); + assert(newsize - (p - pStart) >= 1); *p++ = quote; *p = '\0'; - _PyString_Resize( - &v, (p - PyString_AS_STRING(v))); + _PyString_Resize(&v, (p - pStart)); return v; } } @@ -916,7 +1049,7 @@ else { /* Subtype -- return genuine string with the same value. */ PyStringObject *t = (PyStringObject *) s; - return PyString_FromStringAndSize(t->ob_sval, t->ob_size); + return PyString_FromStringAndSize(PyString_AS_STRING(t), t->ob_size); } } @@ -926,11 +1059,19 @@ return a->ob_size; } + +static void +string_render_if_too_deep(register PyStringConcatenationObject *op) +{ + if (PyString_GET_RIGHT_RECURSION_DEPTH(op) >= PYSTRING_RIGHTRECURSIONDEPTH) + PyString_AsString((PyObject *)op); +} + static PyObject * string_concat(register PyStringObject *a, register PyObject *bb) { register Py_ssize_t size; - register PyStringObject *op; + register PyStringConcatenationObject *op; if (!PyString_Check(bb)) { #ifdef Py_USING_UNICODE if (PyUnicode_Check(bb)) @@ -958,18 +1099,79 @@ "strings are too large to concat"); return NULL; } - + + + /* lch */ + /* if left side is already a concatenation object, and hasn't been rendered yet, and only has one reference, and has room, just append to it */ + if (PyString_CHECK_CONCATENATED(a) && (a->ob_sval == NULL) && (a->ob_refcnt == 1)) { + op = (PyStringConcatenationObject *)a; + if (op->ob_sstringsindex < PYSTRING_CONCATENATIONS) { + Py_INCREF(b); + op->ob_sstrings[op->ob_sstringsindex++] = b; + op->ob_size += b->ob_size; + + op->ob_srightrecursiondepth = max(op->ob_srightrecursiondepth, PyString_GET_RIGHT_RECURSION_DEPTH(b) + 1); + string_render_if_too_deep(op); + + Py_INCREF(op); + return (PyObject *)op; + } + } + /* if right side is already a concatenation object, and hasn't been rendered yet, and only has one reference, and has room, just prepend to it */ + if (PyString_CHECK_CONCATENATED(b) && (b->ob_sval == NULL) && (b->ob_refcnt == 1)) { + op = (PyStringConcatenationObject *)b; + if (op->ob_sstringsindex < PYSTRING_CONCATENATIONS) { + memmove(op->ob_sstrings + 1, op->ob_sstrings, op->ob_sstringsindex * sizeof(PyStringObject *)); + Py_INCREF(a); + op->ob_sstrings[0] = a; + op->ob_sstringsindex++; + op->ob_size += a->ob_size; + + op->ob_srightrecursiondepth = max(op->ob_srightrecursiondepth, PyString_GET_RIGHT_RECURSION_DEPTH(op->ob_sstrings[1]) + 1); + string_render_if_too_deep(op); + + Py_INCREF(op); + return (PyObject *)op; + } + } + + /* lch */ +#if 1 + op = (PyStringConcatenationObject *)PyObject_MALLOC(sizeof(PyStringConcatenationObject)); + if (op == NULL) + return PyErr_NoMemory(); + PyObject_INIT_VAR(op, &PyString_Type, size); + op->ob_shash = -1; + op->ob_sstate = SSTATE_NOT_INTERNED | SSTATE_CONCATENATION | 0xff000000; +#ifdef Py_DEBUG + memset(op->ob_sstrings, 0, sizeof(op->ob_sstrings)); +#endif /* Py_DEBUG */ + op->ob_sstringsindex = 2; + op->ob_sval = NULL; + op->ob_sstrings[0] = a; + op->ob_sstrings[1] = b; + Py_INCREF(a); + Py_INCREF(b); + + op->ob_srightrecursiondepth = PyString_GET_RIGHT_RECURSION_DEPTH(b) + 1; + string_render_if_too_deep(op); + + return (PyObject *) op; + +#else /* Inline PyObject_NewVar */ - op = (PyStringObject *)PyObject_MALLOC(sizeof(PyStringObject) + size); + op = (PyStringConcatenationObject *)PyObject_MALLOC(sizeof(PyStringObject) + size); if (op == NULL) return PyErr_NoMemory(); PyObject_INIT_VAR(op, &PyString_Type, size); op->ob_shash = -1; op->ob_sstate = SSTATE_NOT_INTERNED; + op->ob_sval = op->ob_svalstorage; Py_MEMCPY(op->ob_sval, a->ob_sval, a->ob_size); Py_MEMCPY(op->ob_sval + a->ob_size, b->ob_sval, b->ob_size); op->ob_sval[size] = '\0'; return (PyObject *) op; +#endif // 0 #undef b } @@ -1009,7 +1211,9 @@ PyObject_INIT_VAR(op, &PyString_Type, size); op->ob_shash = -1; op->ob_sstate = SSTATE_NOT_INTERNED; + op->ob_sval = op->ob_svalstorage; op->ob_sval[size] = '\0'; + PyString_AS_STRING(a); // force concatenated strings to render if (a->ob_size == 1 && n > 0) { memset(op->ob_sval, a->ob_sval[0] , n); return (PyObject *) op; @@ -1047,7 +1251,7 @@ } if (j < i) j = i; - return PyString_FromStringAndSize(a->ob_sval + i, j-i); + return PyString_FromStringAndSize(PyString_AS_STRING(a) + i, j-i); } static int @@ -1077,7 +1281,7 @@ PyErr_SetString(PyExc_IndexError, "string index out of range"); return NULL; } - pchar = a->ob_sval[i]; + pchar = PyString_AS_STRING(a)[i]; v = (PyObject *)characters[pchar & UCHAR_MAX]; if (v == NULL) v = PyString_FromStringAndSize(&pchar, 1); @@ -1097,6 +1301,8 @@ Py_ssize_t len_a, len_b; Py_ssize_t min_len; PyObject *result; + char *aString; + char *bString; /* Make sure both arguments are strings. */ if (!(PyString_Check(a) && PyString_Check(b))) { @@ -1113,13 +1319,14 @@ goto out; } } + aString = PyString_AS_STRING(a); + bString = PyString_AS_STRING(b); if (op == Py_EQ) { /* Supporting Py_NE here as well does not save much time, since Py_NE is rarely used. */ if (a->ob_size == b->ob_size - && (a->ob_sval[0] == b->ob_sval[0] - && memcmp(a->ob_sval, b->ob_sval, - a->ob_size) == 0)) { + && (aString[0] == bString[0] + && memcmp(aString, bString, a->ob_size) == 0)) { result = Py_True; } else { result = Py_False; @@ -1129,9 +1336,9 @@ len_a = a->ob_size; len_b = b->ob_size; min_len = (len_a < len_b) ? len_a : len_b; if (min_len > 0) { - c = Py_CHARMASK(*a->ob_sval) - Py_CHARMASK(*b->ob_sval); + c = Py_CHARMASK(*aString) - Py_CHARMASK(*bString); if (c==0) - c = memcmp(a->ob_sval, b->ob_sval, min_len); + c = memcmp(aString, bString, min_len); }else c = 0; if (c == 0) @@ -1159,7 +1366,7 @@ PyStringObject *a = (PyStringObject*) o1; PyStringObject *b = (PyStringObject*) o2; return a->ob_size == b->ob_size - && *a->ob_sval == *b->ob_sval + && *PyString_AS_STRING(a) == *PyString_AS_STRING(b) && memcmp(a->ob_sval, b->ob_sval, a->ob_size) == 0; } @@ -1173,7 +1380,7 @@ if (a->ob_shash != -1) return a->ob_shash; len = a->ob_size; - p = (unsigned char *) a->ob_sval; + p = (unsigned char *)PyString_AS_STRING(a); x = *p << 7; while (--len >= 0) x = (1000003*x) ^ *p++; @@ -1242,7 +1449,7 @@ "accessing non-existent string segment"); return -1; } - *ptr = (void *)self->ob_sval; + *ptr = (void *)PyString_AS_STRING(self); return self->ob_size; } @@ -1270,7 +1477,7 @@ "accessing non-existent string segment"); return -1; } - *ptr = self->ob_sval; + *ptr = PyString_AS_STRING(self); return self->ob_size; } @@ -1810,6 +2017,7 @@ p += seplen; } } + *p = 0; Py_DECREF(seq); return res; @@ -3805,7 +4013,7 @@ static PyObject * string_getnewargs(PyStringObject *v) { - return Py_BuildValue("(s#)", v->ob_sval, v->ob_size); + return Py_BuildValue("(s#)", PyString_AS_STRING(v), v->ob_size); } @@ -3883,7 +4091,8 @@ static PyObject * str_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { - PyObject *tmp, *pnew; + PyObject *tmp; + PyStringObject *pnew; Py_ssize_t n; assert(PyType_IsSubtype(type, &PyString_Type)); @@ -3892,15 +4101,16 @@ return NULL; assert(PyString_CheckExact(tmp)); n = PyString_GET_SIZE(tmp); - pnew = type->tp_alloc(type, n); + pnew = (PyStringObject *)type->tp_alloc(type, n); if (pnew != NULL) { - Py_MEMCPY(PyString_AS_STRING(pnew), PyString_AS_STRING(tmp), n+1); + pnew->ob_sval = pnew->ob_svalstorage; + Py_MEMCPY(pnew->ob_sval, PyString_AS_STRING(tmp), n+1); ((PyStringObject *)pnew)->ob_shash = ((PyStringObject *)tmp)->ob_shash; ((PyStringObject *)pnew)->ob_sstate = SSTATE_NOT_INTERNED; } Py_DECREF(tmp); - return pnew; + return (PyObject *)pnew; } static PyObject * @@ -4069,7 +4279,7 @@ { register PyObject *v; register PyStringObject *sv; - v = *pv; + v = (PyObject *)*pv; if (!PyString_Check(v) || v->ob_refcnt != 1 || newsize < 0 || PyString_CHECK_INTERNED(v)) { *pv = 0; @@ -4077,6 +4287,27 @@ PyErr_BadInternalCall(); return -1; } + + /* lch */ + if (PyString_CHECK_CONCATENATED(v)) { + char *newString; + sv = (PyStringObject *) *pv; + sv->ob_size = newsize; + + if (sv->ob_sval == NULL) + return 0; + + newString = PyObject_Realloc(sv->ob_sval, newsize + 1); + if (newString == NULL) { + PyObject_Del(*pv); + PyErr_NoMemory(); + return -1; + } + newString[newsize] = '\0'; + sv->ob_sval = newString; + return 0; + } + /* XXX UNREF/NEWREF interface should be more symmetrical */ _Py_DEC_REFTOTAL; _Py_ForgetReference(v); @@ -4090,6 +4321,7 @@ _Py_NewReference(*pv); sv = (PyStringObject *) *pv; sv->ob_size = newsize; + sv->ob_sval = sv->ob_svalstorage; sv->ob_sval[newsize] = '\0'; sv->ob_shash = -1; /* invalidate cached hash value */ return 0; @@ -4911,7 +5143,7 @@ /* The two references in interned are not counted by refcnt. The string deallocator will take care of this */ s->ob_refcnt -= 2; - PyString_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL; + PyString_SET_INTERNED(s, SSTATE_INTERNED_MORTAL); } void @@ -4919,7 +5151,7 @@ { PyString_InternInPlace(p); if (PyString_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { - PyString_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL; + PyString_SET_INTERNED(*p, SSTATE_INTERNED_IMMORTAL); Py_INCREF(*p); } } @@ -4983,7 +5215,7 @@ default: Py_FatalError("Inconsistent interned string state."); } - s->ob_sstate = SSTATE_NOT_INTERNED; + PyString_SET_INTERNED(s->ob_sstate, SSTATE_NOT_INTERNED); } Py_DECREF(keys); PyDict_Clear(interned);