Build case-mapped strings with _PyUnicodeWriter.

The do_*() helpers used by case_operation() no longer fill a temporary
Py_UCS4 array sized for three output characters per input character.
They append each mapped character through the new case_operation_write()
helper and return 0 on success or -1 on error.  case_operation() prepares
the writer, runs the helper, and copies the result into a narrower string
when the mapping lowered the maximum character.

diff -r fd658692db3a Objects/unicodeobject.c
--- a/Objects/unicodeobject.c Wed Oct 15 13:40:53 2014 -0400
+++ b/Objects/unicodeobject.c Wed Oct 15 22:30:09 2014 +0200
@@ -9412,6 +9412,10 @@ PyUnicode_Tailmatch(PyObject *str,
     return result;
 }
 
+Py_LOCAL_INLINE(int)
+case_operation_write(_PyUnicodeWriter *writer, Py_UCS4 *mapped, Py_ssize_t len,
+                     Py_UCS4 *maxchar);
+
 /* Apply fixfct filter to the Unicode object self and return a
    reference to the modified object */
 
@@ -9534,37 +9538,36 @@ lower_ucs4(int kind, void *data, Py_ssiz
     return _PyUnicode_ToLowerFull(c, mapped);
 }
 
-static Py_ssize_t
-do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
-{
-    Py_ssize_t i, k = 0;
-    int n_res, j;
-    Py_UCS4 c, mapped[3];
-
-    c = PyUnicode_READ(kind, data, 0);
-    n_res = _PyUnicode_ToUpperFull(c, mapped);
-    for (j = 0; j < n_res; j++) {
-        *maxchar = Py_MAX(*maxchar, mapped[j]);
-        res[k++] = mapped[j];
-    }
+static int
+do_capitalize(int kind, void *data, Py_ssize_t length,
+              _PyUnicodeWriter *writer, Py_UCS4 *maxchar)
+{
+    Py_ssize_t i;
+    int n_res;
+    Py_UCS4 ch, mapped[3];
+
+    ch = PyUnicode_READ(kind, data, 0);
+    n_res = _PyUnicode_ToUpperFull(ch, mapped);
+    if (case_operation_write(writer, mapped, n_res, maxchar) < 0)
+        return -1;
     for (i = 1; i < length; i++) {
-        c = PyUnicode_READ(kind, data, i);
-        n_res = lower_ucs4(kind, data, length, i, c, mapped);
-        for (j = 0; j < n_res; j++) {
-            *maxchar = Py_MAX(*maxchar, mapped[j]);
-            res[k++] = mapped[j];
-        }
-    }
-    return k;
-}
-
-static Py_ssize_t
-do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
-    Py_ssize_t i, k = 0;
+        ch = PyUnicode_READ(kind, data, i);
+        n_res = lower_ucs4(kind, data, length, i, ch, mapped);
+        if (case_operation_write(writer, mapped, n_res, maxchar) < 0)
+            return -1;
+    }
+    return 0;
+}
+
+static int
+do_swapcase(int kind, void *data, Py_ssize_t length,
+            _PyUnicodeWriter *writer, Py_UCS4 *maxchar)
+{
+    Py_ssize_t i;
 
     for (i = 0; i < length; i++) {
         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
-        int n_res, j;
+        int n_res;
         if (Py_UNICODE_ISUPPER(c)) {
             n_res = lower_ucs4(kind, data, length, i, c, mapped);
         }
@@ -9575,137 +9578,168 @@ do_swapcase(int kind, void *data, Py_ssi
             n_res = 1;
             mapped[0] = c;
         }
-        for (j = 0; j < n_res; j++) {
-            *maxchar = Py_MAX(*maxchar, mapped[j]);
-            res[k++] = mapped[j];
-        }
-    }
-    return k;
-}
-
-static Py_ssize_t
-do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
-                  Py_UCS4 *maxchar, int lower)
-{
-    Py_ssize_t i, k = 0;
+        if (case_operation_write(writer, mapped, n_res, maxchar) < 0)
+            return -1;
+    }
+    return 0;
+}
+
+static int
+do_upper_or_lower(int kind, void *data, Py_ssize_t length,
+                  _PyUnicodeWriter *writer, Py_UCS4 *maxchar, int lower)
+{
+    Py_ssize_t i;
 
     for (i = 0; i < length; i++) {
         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
-        int n_res, j;
+        int n_res;
         if (lower)
             n_res = lower_ucs4(kind, data, length, i, c, mapped);
         else
             n_res = _PyUnicode_ToUpperFull(c, mapped);
-        for (j = 0; j < n_res; j++) {
-            *maxchar = Py_MAX(*maxchar, mapped[j]);
-            res[k++] = mapped[j];
-        }
-    }
-    return k;
-}
-
-static Py_ssize_t
-do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
-{
-    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
-}
-
-static Py_ssize_t
-do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
-{
-    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
-}
-
-static Py_ssize_t
-do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
-{
-    Py_ssize_t i, k = 0;
+        if (case_operation_write(writer, mapped, n_res, maxchar) < 0)
+            return -1;
+    }
+    return 0;
+}
+
+static int
+do_upper(int kind, void *data, Py_ssize_t length,
+         _PyUnicodeWriter *writer, Py_UCS4 *maxchar)
+{
+    return do_upper_or_lower(kind, data, length, writer, maxchar, 0);
+}
+
+static int
+do_lower(int kind, void *data, Py_ssize_t length,
+         _PyUnicodeWriter *writer, Py_UCS4 *maxchar)
+{
+    return do_upper_or_lower(kind, data, length, writer, maxchar, 1);
+}
+
+static int
+do_casefold(int kind, void *data, Py_ssize_t length,
+            _PyUnicodeWriter *writer, Py_UCS4 *maxchar)
+{
+    Py_ssize_t i;
 
     for (i = 0; i < length; i++) {
         Py_UCS4 c = PyUnicode_READ(kind, data, i);
         Py_UCS4 mapped[3];
-        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
-        for (j = 0; j < n_res; j++) {
-            *maxchar = Py_MAX(*maxchar, mapped[j]);
-            res[k++] = mapped[j];
-        }
-    }
-    return k;
-}
-
-static Py_ssize_t
-do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
-{
-    Py_ssize_t i, k = 0;
+        int n_res = _PyUnicode_ToFoldedFull(c, mapped);
+        if (case_operation_write(writer, mapped, n_res, maxchar) < 0)
+            return -1;
+    }
+    return 0;
+}
+
+static int
+do_title(int kind, void *data, Py_ssize_t length,
+         _PyUnicodeWriter *writer, Py_UCS4 *maxchar)
+{
+    Py_ssize_t i;
     int previous_is_cased;
 
     previous_is_cased = 0;
     for (i = 0; i < length; i++) {
-        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
+        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
         Py_UCS4 mapped[3];
-        int n_res, j;
+        int n_res;
 
         if (previous_is_cased)
-            n_res = lower_ucs4(kind, data, length, i, c, mapped);
+            n_res = lower_ucs4(kind, data, length, i, ch, mapped);
         else
-            n_res = _PyUnicode_ToTitleFull(c, mapped);
-
-        for (j = 0; j < n_res; j++) {
-            *maxchar = Py_MAX(*maxchar, mapped[j]);
-            res[k++] = mapped[j];
-        }
-
-        previous_is_cased = _PyUnicode_IsCased(c);
-    }
-    return k;
+            n_res = _PyUnicode_ToTitleFull(ch, mapped);
+
+        if (case_operation_write(writer, mapped, n_res, maxchar) < 0)
+            return -1;
+
+        previous_is_cased = _PyUnicode_IsCased(ch);
+    }
+    return 0;
+}
+
+/* Append the `len` code points in `mapped` to `writer`, raising *maxchar to
+   the largest code point written.  Overallocation is switched on as soon as
+   one input character expands to several output characters.
+   Return 0 on success, -1 if the writer fails. */
+Py_LOCAL_INLINE(int)
+case_operation_write(_PyUnicodeWriter *writer, Py_UCS4 *mapped, Py_ssize_t len,
+                     Py_UCS4 *maxchar)
+{
+    Py_ssize_t i;
+    if (len > 1)
+        writer->overallocate = 1;
+    for (i = 0; i < len; i++) {
+        *maxchar = Py_MAX(*maxchar, mapped[i]);
+        if (_PyUnicodeWriter_WriteCharInline(writer, mapped[i]) < 0)
+            return -1;
+    }
+    return 0;
 }
 
+/* Apply the case mapping `perform` to `self` and return a new string.
+   The output is built with a _PyUnicodeWriter prepared for the length and
+   maximum character of `self`; `perform` reports the largest code point it
+   wrote so the result can be copied into a narrower string when the mapping
+   lowered the maximum character.  Return NULL on failure. */
 static PyObject *
 case_operation(PyObject *self,
-               Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
-{
-    PyObject *res = NULL;
-    Py_ssize_t length, newlength = 0;
-    int kind, outkind;
-    void *data, *outdata;
-    Py_UCS4 maxchar = 0, *tmp, *tmpend;
+               int (*perform)(int, void *, Py_ssize_t, _PyUnicodeWriter *, Py_UCS4 *))
+{
+    Py_ssize_t length;
+    int kind;
+    void *data;
+    Py_UCS4 maxchar, maxchar2;
+    _PyUnicodeWriter writer;
+    PyObject *res;
 
     assert(PyUnicode_IS_READY(self));
-
     kind = PyUnicode_KIND(self);
     data = PyUnicode_DATA(self);
+    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
     length = PyUnicode_GET_LENGTH(self);
-    if (length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
-        PyErr_SetString(PyExc_OverflowError, "string is too long");
-        return NULL;
-    }
-    tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
-    if (tmp == NULL)
-        return PyErr_NoMemory();
-    newlength = perform(kind, data, length, tmp, &maxchar);
-    res = PyUnicode_New(newlength, maxchar);
-    if (res == NULL)
-        goto leave;
-    tmpend = tmp + newlength;
-    outdata = PyUnicode_DATA(res);
-    outkind = PyUnicode_KIND(res);
-    switch (outkind) {
-    case PyUnicode_1BYTE_KIND:
-        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
-        break;
-    case PyUnicode_2BYTE_KIND:
-        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
-        break;
-    case PyUnicode_4BYTE_KIND:
-        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
-        break;
-    default:
-        assert(0);
-        break;
-    }
-  leave:
-    PyMem_FREE(tmp);
-    return res;
+
+    _PyUnicodeWriter_Init(&writer);
+    writer.min_length = length;
+    if (_PyUnicodeWriter_Prepare(&writer, length, maxchar) < 0)
+        return NULL;
+
+    /* The do_*() helpers only raise *maxchar, so start at the ASCII limit. */
+    maxchar2 = 127;
+    if (perform(kind, data, length, &writer, &maxchar2) < 0) {
+        _PyUnicodeWriter_Dealloc(&writer);
+        return NULL;
+    }
+
+    /* Round maxchar2 up to the top of its range before comparing. */
+    if (maxchar2 >= 65536)
+        maxchar2 = 0x10ffff;
+    else if (maxchar2 >= 256)
+        maxchar2 = 0xffff;
+    else if (maxchar2 >= 128)
+        maxchar2 = 0xff;
+    else
+        maxchar2 = 127;
+
+    if (maxchar2 < maxchar) {
+        /* Every output character fits a narrower range than the input's:
+           copy the result into a string created for that range. */
+        assert(!writer.readonly);
+        length = writer.pos;
+
+        res = PyUnicode_New(length, maxchar2);
+        if (res == NULL) {
+            _PyUnicodeWriter_Dealloc(&writer);
+            return NULL;
+        }
+        _PyUnicode_FastCopyCharacters(res, 0, writer.buffer, 0, length);
+
+        _PyUnicodeWriter_Dealloc(&writer);
+        return res;
+    }
+
+    return _PyUnicodeWriter_Finish(&writer);
 }
 
 PyObject *