diff -r 0ee785c9d1b4 Include/unicodeobject.h --- a/Include/unicodeobject.h Sun Apr 14 02:35:33 2013 +0200 +++ b/Include/unicodeobject.h Sun Apr 14 03:34:24 2013 +0200 @@ -898,22 +898,28 @@ typedef struct { Py_UCS4 maxchar; Py_ssize_t size; Py_ssize_t pos; - /* minimum length of the buffer when overallocation is enabled, - see _PyUnicodeWriter_Init() */ + + /* minimum number of allocated characters (default: 0) */ Py_ssize_t min_length; + + /* minimum character (default: 127, ASCII) */ + Py_UCS4 min_char; + + /* If non-zero, overallocate the buffer by 25% (default: 0). */ unsigned char overallocate; + /* If readonly is 1, buffer is a shared string (cannot be modified) and size is set to 0. */ unsigned char readonly; } _PyUnicodeWriter ; /* Initialize a Unicode writer. - - If min_length is greater than zero, _PyUnicodeWriter_Prepare() - overallocates the buffer and min_length is the minimum length in characters - of the buffer. */ + * + * By default, the minimum buffer size is 0 characters and overallocation is + * disabled. Set min_length, min_char and overallocate attributes to control + * the allocation of the buffer. */ PyAPI_FUNC(void) -_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length); +_PyUnicodeWriter_Init(_PyUnicodeWriter *writer); /* Prepare the buffer to write 'length' characters with the specified maximum character. 
diff -r 0ee785c9d1b4 Modules/cjkcodecs/multibytecodec.c --- a/Modules/cjkcodecs/multibytecodec.c Sun Apr 14 02:35:33 2013 +0200 +++ b/Modules/cjkcodecs/multibytecodec.c Sun Apr 14 03:34:24 2013 +0200 @@ -633,7 +633,8 @@ MultibyteCodec_Decode(MultibyteCodecObje return make_tuple(PyUnicode_New(0, 0), 0); } - _PyUnicodeWriter_Init(&buf.writer, datalen); + _PyUnicodeWriter_Init(&buf.writer); + buf.writer.min_length = datalen; buf.excobj = NULL; buf.inbuf = buf.inbuf_top = (unsigned char *)data; buf.inbuf_end = buf.inbuf_top + datalen; @@ -841,7 +842,7 @@ decoder_prepare_buffer(MultibyteDecodeBu { buf->inbuf = buf->inbuf_top = (const unsigned char *)data; buf->inbuf_end = buf->inbuf_top + size; - _PyUnicodeWriter_Init(&buf->writer, size); + buf->writer.min_length += size; return 0; } @@ -1039,7 +1040,7 @@ mbidecoder_decode(MultibyteIncrementalDe data = pdata.buf; size = pdata.len; - _PyUnicodeWriter_Init(&buf.writer, 1); + _PyUnicodeWriter_Init(&buf.writer); buf.excobj = NULL; origpending = self->pendingsize; @@ -1243,7 +1244,7 @@ mbstreamreader_iread(MultibyteStreamRead if (sizehint == 0) return PyUnicode_New(0, 0); - _PyUnicodeWriter_Init(&buf.writer, 1); + _PyUnicodeWriter_Init(&buf.writer); buf.excobj = NULL; cres = NULL; diff -r 0ee785c9d1b4 Objects/complexobject.c --- a/Objects/complexobject.c Sun Apr 14 02:35:33 2013 +0200 +++ b/Objects/complexobject.c Sun Apr 14 03:34:24 2013 +0200 @@ -705,7 +705,7 @@ complex__format__(PyObject* self, PyObje if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) return NULL; - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); ret = _PyComplex_FormatAdvancedWriter( &writer, self, diff -r 0ee785c9d1b4 Objects/floatobject.c --- a/Objects/floatobject.c Sun Apr 14 02:35:33 2013 +0200 +++ b/Objects/floatobject.c Sun Apr 14 03:34:24 2013 +0200 @@ -1711,7 +1711,7 @@ float__format__(PyObject *self, PyObject if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) return NULL; - _PyUnicodeWriter_Init(&writer, 0); 
+ _PyUnicodeWriter_Init(&writer); ret = _PyFloat_FormatAdvancedWriter( &writer, self, diff -r 0ee785c9d1b4 Objects/longobject.c --- a/Objects/longobject.c Sun Apr 14 02:35:33 2013 +0200 +++ b/Objects/longobject.c Sun Apr 14 03:34:24 2013 +0200 @@ -4379,7 +4379,7 @@ long__format__(PyObject *self, PyObject if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) return NULL; - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); ret = _PyLong_FormatAdvancedWriter( &writer, self, diff -r 0ee785c9d1b4 Objects/stringlib/unicode_format.h --- a/Objects/stringlib/unicode_format.h Sun Apr 14 02:35:33 2013 +0200 +++ b/Objects/stringlib/unicode_format.h Sun Apr 14 03:34:24 2013 +0200 @@ -906,7 +906,6 @@ build_string(SubString *input, PyObject int recursion_depth, AutoNumber *auto_number) { _PyUnicodeWriter writer; - Py_ssize_t minlen; /* check the recursion level */ if (recursion_depth <= 0) { @@ -915,8 +914,9 @@ build_string(SubString *input, PyObject return NULL; } - minlen = PyUnicode_GET_LENGTH(input->str) + 100; - _PyUnicodeWriter_Init(&writer, minlen); + _PyUnicodeWriter_Init(&writer); + writer.overallocate = 1; + writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100; if (!do_markup(input, args, kwargs, &writer, recursion_depth, auto_number)) { diff -r 0ee785c9d1b4 Objects/unicodeobject.c --- a/Objects/unicodeobject.c Sun Apr 14 02:35:33 2013 +0200 +++ b/Objects/unicodeobject.c Sun Apr 14 03:34:24 2013 +0200 @@ -2665,7 +2665,9 @@ PyUnicode_FromFormatV(const char *format const char *f; _PyUnicodeWriter writer; - _PyUnicodeWriter_Init(&writer, strlen(format) + 100); + _PyUnicodeWriter_Init(&writer); + writer.min_length = strlen(format) + 100; + writer.overallocate = 1; /* va_list may be an array (of 1 item) on some platforms (ex: AMD64). Copy it to be able to pass a reference to a subfunction. 
*/ @@ -4117,7 +4119,10 @@ unicode_decode_call_errorhandler_writer( goto onError; } - writer->overallocate = 1; + if (PyUnicode_READY(repunicode) < 0) + goto onError; + if (PyUnicode_GET_LENGTH(repunicode) > 1) + writer->overallocate = 1; if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1) return @@ -4256,9 +4261,8 @@ PyUnicode_DecodeUTF7Stateful(const char } /* Start off assuming it's all ASCII. Widen later as necessary. */ - _PyUnicodeWriter_Init(&writer, 0); - if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) - goto onError; + _PyUnicodeWriter_Init(&writer); + writer.min_length = size; shiftOutStart = 0; e = s + size; @@ -4655,7 +4659,7 @@ PyUnicode_DecodeUTF8Stateful(const char return get_latin1_char((unsigned char)s[0]); } - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) goto onError; @@ -4910,7 +4914,7 @@ PyUnicode_DecodeUTF32Stateful(const char le = bo <= 0; #endif - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1) goto onError; @@ -5149,7 +5153,7 @@ PyUnicode_DecodeUTF16Stateful(const char /* Note: size will always be longer than the resulting Unicode character count */ - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1) goto onError; @@ -5420,9 +5424,9 @@ PyUnicode_DecodeUnicodeEscape(const char and we determined it's exact size (common case) or it contains \x, \u, ... escape sequences. then we create a legacy wchar string and resize it at the end of this function. 
*/ - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); if (len > 0) { - if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1) + if (_PyUnicodeWriter_Prepare(&writer, len, 127) < 0) goto onError; assert(writer.kind == PyUnicode_1BYTE_KIND); } @@ -5432,7 +5436,7 @@ PyUnicode_DecodeUnicodeEscape(const char length after conversion to the true value. (but if the error callback returns a long replacement string we'll have to allocate more space) */ - if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) + if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) goto onError; } @@ -5787,9 +5791,8 @@ PyUnicode_DecodeRawUnicodeEscape(const c Unicode string, so we start with size here and then reduce the length after conversion to the true value. (But decoding error handler might have to resize the string) */ - _PyUnicodeWriter_Init(&writer, 1); - if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) - goto onError; + _PyUnicodeWriter_Init(&writer); + writer.min_length = size; end = s + size; while (s < end) { @@ -5982,12 +5985,14 @@ PyObject * if (size == 0) _Py_RETURN_UNICODE_EMPTY(); - /* XXX overflow detection missing */ - _PyUnicodeWriter_Init(&writer, 0); - if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1) + _PyUnicodeWriter_Init(&writer); + if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) { + PyErr_NoMemory(); goto onError; + } + writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE; + end = s + size; - while (s < end) { Py_UNICODE uch; Py_UCS4 ch; @@ -6429,9 +6434,9 @@ PyUnicode_DecodeASCII(const char *s, if (size == 1 && (unsigned char)s[0] < 128) return get_latin1_char((unsigned char)s[0]); - _PyUnicodeWriter_Init(&writer, 0); - if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) - goto onError; + _PyUnicodeWriter_Init(&writer); + if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) + return NULL; e = s + size; data = writer.data; @@ -7280,7 +7285,7 @@ PyUnicode_DecodeCharmap(const 
char *s, if (size == 0) _Py_RETURN_UNICODE_EMPTY(); - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) goto onError; @@ -7312,7 +7317,7 @@ PyUnicode_DecodeCharmap(const char *s, ch = *s; x = mapdata_ucs1[ch]; if (x > maxchar) { - if (_PyUnicodeWriter_PrepareInternal(&writer, 1, 0xff) == -1) + if (_PyUnicodeWriter_Prepare(&writer, 1, 0xff) == -1) goto onError; maxchar = writer.maxchar; outdata = (Py_UCS1 *)writer.data; @@ -12833,21 +12838,27 @@ unicode_endswith(PyObject *self, Py_LOCAL_INLINE(void) _PyUnicodeWriter_Update(_PyUnicodeWriter *writer) { - writer->size = PyUnicode_GET_LENGTH(writer->buffer); + if (!writer->readonly) + writer->size = PyUnicode_GET_LENGTH(writer->buffer); + else { + /* Copy-on-write mode: set buffer size to 0 so + * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on + * next write. */ + writer->size = 0; + } writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); writer->data = PyUnicode_DATA(writer->buffer); writer->kind = PyUnicode_KIND(writer->buffer); } void -_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length) +_PyUnicodeWriter_Init(_PyUnicodeWriter *writer) { memset(writer, 0, sizeof(*writer)); #ifdef Py_DEBUG writer->kind = 5; /* invalid kind */ #endif - writer->min_length = Py_MAX(min_length, 100); - writer->overallocate = (min_length > 0); + writer->min_char = 127; } int @@ -12865,29 +12876,28 @@ int } newlen = writer->pos + length; + maxchar = MAX_MAXCHAR(maxchar, writer->min_char); + if (writer->buffer == NULL) { - if (writer->overallocate) { + assert(!writer->readonly); + if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) { /* overallocate 25% to limit the number of resize */ - if (newlen <= (PY_SSIZE_T_MAX - newlen / 4)) - newlen += newlen / 4; - if (newlen < writer->min_length) - newlen = writer->min_length; - } + newlen += newlen / 4; + } + if (newlen < writer->min_length) + newlen = 
writer->min_length; + writer->buffer = PyUnicode_New(newlen, maxchar); if (writer->buffer == NULL) return -1; - _PyUnicodeWriter_Update(writer); - return 0; - } - - if (newlen > writer->size) { - if (writer->overallocate) { + } + else if (newlen > writer->size) { + if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) { /* overallocate 25% to limit the number of resize */ - if (newlen <= (PY_SSIZE_T_MAX - newlen / 4)) - newlen += newlen / 4; - if (newlen < writer->min_length) - newlen = writer->min_length; - } + newlen += newlen / 4; + } + if (newlen < writer->min_length) + newlen = writer->min_length; if (maxchar > writer->maxchar || writer->readonly) { /* resize + widen */ @@ -12905,7 +12915,6 @@ int return -1; } writer->buffer = newbuffer; - _PyUnicodeWriter_Update(writer); } else if (maxchar > writer->maxchar) { assert(!writer->readonly); @@ -12916,8 +12925,8 @@ int writer->buffer, 0, writer->pos); Py_DECREF(writer->buffer); writer->buffer = newbuffer; - _PyUnicodeWriter_Update(writer); - } + } + _PyUnicodeWriter_Update(writer); return 0; } @@ -12951,11 +12960,10 @@ int maxchar = PyUnicode_MAX_CHAR_VALUE(str); if (maxchar > writer->maxchar || len > writer->size - writer->pos) { if (writer->buffer == NULL && !writer->overallocate) { + writer->readonly = 1; Py_INCREF(str); writer->buffer = str; _PyUnicodeWriter_Update(writer); - writer->readonly = 1; - writer->size = 0; writer->pos += len; return 0; } @@ -13072,7 +13080,7 @@ unicode__format__(PyObject* self, PyObje if (PyUnicode_READY(self) == -1) return NULL; - _PyUnicodeWriter_Init(&writer, 0); + _PyUnicodeWriter_Init(&writer); ret = _PyUnicode_FormatAdvancedWriter(&writer, self, format_spec, 0, PyUnicode_GET_LENGTH(format_spec)); @@ -14156,7 +14164,9 @@ PyUnicode_Format(PyObject *format, PyObj ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); ctx.fmtpos = 0; - _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100); + _PyUnicodeWriter_Init(&ctx.writer); + ctx.writer.min_length = ctx.fmtcnt + 100; + 
ctx.writer.overallocate = 1; if (PyTuple_Check(args)) { ctx.arglen = PyTuple_Size(args);