From python-dev-bounces+dw+python-dev=hmmz.org@python.org Thu Jul 17 00:00:17 2014 Return-Path: X-Original-To: dw+python-dev@hmmz.org Delivered-To: dw+python-dev@hmmz.org Received: from mail.python.org (mail.python.org [82.94.164.166]) by mail.botanicus.net (Postfix) with ESMTPS id BAC997E269 for ; Thu, 17 Jul 2014 00:00:17 +0000 (UTC) Received: from albatross.python.org (localhost [127.0.0.1]) by mail.python.org (Postfix) with ESMTP id 3hDFw13ZYmz7LnR for ; Thu, 17 Jul 2014 02:00:17 +0200 (CEST) X-Original-To: python-dev@python.org Delivered-To: python-dev@mail.python.org Received: from albatross.python.org (localhost [127.0.0.1]) by mail.python.org (Postfix) with ESMTP id 3hDDlZ3zGKz7Ljp for ; Thu, 17 Jul 2014 01:07:54 +0200 (CEST) X-Spam-Status: OK 0.000 X-Spam-Evidence: '*H*': 1.00; '*S*': 0.00; '+++': 0.03; 'subject:: [': 0.04; 'static': 0.04; 'subsequent': 0.05; 'subject:Python': 0.06; 'diff': 0.07; 'duplicate': 0.07; 'initialize': 0.07; 'modify': 0.07; 'string': 0.09; '#include': 0.09; '+/*': 0.09; 'indicates': 0.09; 'required,': 0.09; 'url:github': 0.09; 'wrapper': 0.09; 'python': 0.11; 'def': 0.12; 'mostly': 0.14; '(char': 0.16; '(without': 0.16; '+};': 0.16; '--git': 0.16; '-11,6': 0.16; '-1;': 0.16; '-2,6': 0.16; './python': 0.16; '16)': 0.16; 'buffer,': 0.16; 'buffer.': 0.16; 'called,': 0.16; 'cstringio': 0.16; 'enum': 0.16; 'exported': 0.16; 'flags;': 0.16; 'hint': 0.16; 'initializes': 0.16; 'null);': 0.16; 'pypi?': 0.16; 'redundant': 0.16; 'removed,': 0.16; 'res': 0.16; 'size)': 0.16; 'size;': 0.16; 'subject:Dev': 0.16; 'truncate': 0.16; 'typedef': 0.16; 'workaround?': 0.16; 'zero.': 0.16; 'fix': 0.17; 'wrote:': 0.18; 'variable': 0.18; 'module': 0.19; 'basically': 0.19; 'implementing': 0.19; 'thu,': 0.19; 'later': 0.20; 'subject:] ': 0.20; 'seems': 0.21; '8bit%:5': 0.22; 'import': 0.22; 'cc:addr:python.org': 0.22; 'creating': 0.23; 'affects': 0.24; 'bytes': 0.24; 'char': 0.24; 'necessary.': 0.24; 'looks': 0.24; '(or': 0.24; '---': 0.24; 'cc:2**0': 0.24; 'cc:no real name:2**0': 0.24; 'source': 0.25; 'header:In-Reply-To:1': 0.27; 'skip:p 30': 0.29; '0);': 0.29; 'patch': 0.29; 'wonder': 0.29; 'important.': 0.30; 'skip:( 20': 0.30; "i'm": 0.30; 'code': 0.31; '(possibly': 0.31; '3.x': 0.31; 'flags': 0.31; 'null)': 0.31; 'null;': 0.31; 'object.': 0.31; 'provided,': 0.31; 'routine': 0.31; 'struct': 0.31; 'compatible': 0.32; "we're": 0.32; 'another': 0.32; 'quite': 0.32; 'maybe': 0.34; 'problem': 0.35; 'objects': 0.35; 'plans': 0.35; 'prepare': 0.35; 'but': 0.35; 'add': 0.35; 'there': 0.35; 'adjust': 0.36; 'method': 0.36; 'possible': 0.36; 'operating': 0.37; 'skip:& 10': 0.38; 'initially': 0.38; 'previous': 0.38; 'does': 0.39; 'itself': 0.39; 'skip:p 20': 0.39; 'release': 0.40; 'easy': 0.60; 'preparation': 0.60; 'from:no real name:2**0': 0.61; 'free': 0.61; 'length': 0.61; 'reserved': 0.61; 'new': 0.61; 'skip:* 10': 0.61; 'first': 0.61; 'content-disposition:inline': 0.62; 'making': 0.63; 'our': 0.64; 'total': 0.65; 'skip:+ 10': 0.65; 'to:addr:gmail.com': 0.65; 'between': 0.67; 'believe': 0.68; 'allocation': 0.74; 'jul': 0.74; '100': 0.79; 'potentially': 0.81; '2.9': 0.84; 'const': 0.84; 'describes': 0.84; 'fail.': 0.84; 'reading,': 0.84; 'writing,': 0.84; 'serious': 0.97 Received: from localhost (HELO mail.python.org) (127.0.0.1) by albatross.python.org with SMTP; 17 Jul 2014 01:07:54 +0200 Received: from mail.botanicus.net (unknown [91.121.174.40]) by mail.python.org (Postfix) with ESMTP for ; Thu, 17 Jul 2014 01:07:54 +0200 (CEST) Received: by mail.botanicus.net (Postfix, from userid 1000) id 2DCB47E2FD; Wed, 16 Jul 2014 23:07:54 +0000 (UTC) Date: Wed, 16 Jul 2014 23:07:54 +0000 From: dw+python-dev@hmmz.org To: Mikhail Korobov Message-ID: <20140716230754.GA22619@k2> References: MIME-Version: 1.0 Content-Disposition: inline In-Reply-To: X-Mailman-Approved-At: Thu, 17 Jul 2014 01:59:08 +0200 Cc: python-dev@python.org Subject: Re: [Python-Dev] cStringIO vs io.BytesIO X-BeenThere: python-dev@python.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: Python core developers List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: quoted-printable Errors-To: python-dev-bounces+dw+python-dev=hmmz.org@python.org Sender: "Python-Dev" Status: RO X-Status: A Content-Length: 7372 Lines: 266 On Thu, Jul 17, 2014 at 03:44:23AM +0600, Mikhail Korobov wrote: > So making code 3.x compatible by ditching cStringIO can cause a serious > performance/memory=A0 regressions. One can change the code to build the d= ata > using BytesIO (without creating bytes objects in the first place), but th= at is > not always possible or convenient. > = > I believe this problem affects tornado (https://github.com/tornadoweb/tor= nado/ > Do you know if there a workaround? Maybe there is some stdlib part that I= 'm > missing, or a module on PyPI? It is not that hard to write an own wrapper= that > won't do copies (or to port [c]StringIO to 3.x), but I wonder if there is= an > existing solution or plans to fix it in Python itself - this BytesIO use = case > looks quite important. Regarding a fix, the problem seems mostly that the StringI/StringO specializations were removed, and the new implementation is basically just a StringO. At a small cost to memory, it is easy to add a Py_buffer source and flags variable to the bytesio struct, with the buffers initially setup for reading, and if a mutation method is called, check for a copy-on-write flag, duplicate the source object into private memory, then continue operating as it does now. Attached is a (rough) patch implementing this, feel free to try it with hg tip. [23:03:44 k2!124 cpython] cat i.py import io buf =3D b'x' * (1048576 * 16) def x(): io.BytesIO(buf) [23:03:51 k2!125 cpython] ./python -m timeit -s 'import i' 'i.x()' 100 loops, best of 3: 2.9 msec per loop [23:03:57 k2!126 cpython] ./python-cow -m timeit -s 'import i' 'i.x()' 1000000 loops, best of 3: 0.364 usec per loop David diff --git a/Modules/_io/bytesio.c b/Modules/_io/bytesio.c --- a/Modules/_io/bytesio.c +++ b/Modules/_io/bytesio.c @@ -2,6 +2,12 @@ #include "structmember.h" /* for offsetof() */ #include "_iomodule.h" = +enum io_flags { + /* initvalue describes a borrowed buffer we cannot modify and must lat= er + * release */ + IO_SHARED =3D 1 +}; + typedef struct { PyObject_HEAD char *buf; @@ -11,6 +17,10 @@ PyObject *dict; PyObject *weakreflist; Py_ssize_t exports; + Py_buffer initvalue; + /* If IO_SHARED, indicates PyBuffer_release(initvalue) required, and t= hat + * we don't own buf. */ + enum io_flags flags; } bytesio; = typedef struct { @@ -33,6 +43,47 @@ return NULL; \ } = +/* Unshare our buffer in preparation for writing, in the case that an + * initvalue object was provided, and we're currently borrowing its buffer. + * size indicates the total reserved buffer size allocated as part of + * unsharing, to avoid a potentially redundant allocation in the subsequent + * mutation. + */ +static int +unshare(bytesio *self, size_t size) +{ + Py_ssize_t new_size =3D size; + Py_ssize_t copy_size =3D size; + char *new_buf; + + /* Do nothing if buffer wasn't shared */ + if (! (self->flags & IO_SHARED)) { + return 0; + } + + /* If hint provided, adjust our new buffer size and truncate the amoun= t of + * source buffer we copy as necessary. */ + if (size > copy_size) { + copy_size =3D size; + } + + /* Allocate or fail. */ + new_buf =3D (char *)PyMem_Malloc(new_size); + if (new_buf =3D=3D NULL) { + PyErr_NoMemory(); + return -1; + } + + /* Copy the (possibly now truncated) source string to the new buffer, = and + * forget any reference used to keep the source buffer alive. */ + memcpy(new_buf, self->buf, copy_size); + PyBuffer_Release(&self->initvalue); + self->flags &=3D ~IO_SHARED; + self->buf =3D new_buf; + self->buf_size =3D new_size; + self->string_size =3D (Py_ssize_t) copy_size; + return 0; +} = /* Internal routine to get a line from the buffer of a BytesIO object. Returns the length between the current position to the @@ -125,11 +176,18 @@ static Py_ssize_t write_bytes(bytesio *self, const char *bytes, Py_ssize_t len) { + size_t desired; + assert(self->buf !=3D NULL); assert(self->pos >=3D 0); assert(len >=3D 0); = - if ((size_t)self->pos + len > self->buf_size) { + desired =3D (size_t)self->pos + len; + if (unshare(self, desired)) { + return -1; + } + + if (desired > self->buf_size) { if (resize_buffer(self, (size_t)self->pos + len) < 0) return -1; } @@ -502,6 +560,10 @@ return NULL; } = + if (unshare(self, size)) { + return NULL; + } + if (size < self->string_size) { self->string_size =3D size; if (resize_buffer(self, size) < 0) @@ -655,10 +717,13 @@ static PyObject * bytesio_close(bytesio *self) { - if (self->buf !=3D NULL) { + if (self->flags & IO_SHARED) { + PyBuffer_Release(&self->initvalue); + self->flags &=3D ~IO_SHARED; + } else if (self->buf !=3D NULL) { PyMem_Free(self->buf); - self->buf =3D NULL; } + self->buf =3D NULL; Py_RETURN_NONE; } = @@ -788,10 +853,17 @@ "deallocated BytesIO object has exported buffers"); PyErr_Print(); } - if (self->buf !=3D NULL) { + + if (self->flags & IO_SHARED) { + /* We borrowed buf from another object */ + PyBuffer_Release(&self->initvalue); + self->flags &=3D ~IO_SHARED; + } else if (self->buf !=3D NULL) { + /* We owned buf */ PyMem_Free(self->buf); - self->buf =3D NULL; } + self->buf =3D NULL; + Py_CLEAR(self->dict); if (self->weakreflist !=3D NULL) PyObject_ClearWeakRefs((PyObject *) self); @@ -811,12 +883,6 @@ /* tp_alloc initializes all the fields to zero. So we don't have to initialize them here. */ = - self->buf =3D (char *)PyMem_Malloc(0); - if (self->buf =3D=3D NULL) { - Py_DECREF(self); - return PyErr_NoMemory(); - } - return (PyObject *)self; } = @@ -834,13 +900,32 @@ self->string_size =3D 0; self->pos =3D 0; = + /* Release any previous initvalue. */ + if (self->flags & IO_SHARED) { + PyBuffer_Release(&self->initvalue); + self->buf =3D NULL; + self->flags &=3D ~IO_SHARED; + } + if (initvalue && initvalue !=3D Py_None) { - PyObject *res; - res =3D bytesio_write(self, initvalue); - if (res =3D=3D NULL) + Py_buffer *buf =3D &self->initvalue; + if (PyObject_GetBuffer(initvalue, buf, PyBUF_CONTIG_RO) < 0) { return -1; - Py_DECREF(res); - self->pos =3D 0; + } + self->buf =3D self->initvalue.buf; + self->buf_size =3D (size_t)self->initvalue.len; + self->string_size =3D self->initvalue.len; + self->flags |=3D IO_SHARED; + } + + /* If no initvalue provided, prepare a private buffer now. */ + if (self->buf =3D=3D NULL) { + self->buf =3D (char *)PyMem_Malloc(0); + if (self->buf =3D=3D NULL) { + Py_DECREF(self); + PyErr_NoMemory(); + return -1; + } } = return 0; _______________________________________________ Python-Dev mailing list Python-Dev@python.org https://mail.python.org/mailman/listinfo/python-dev Unsubscribe: https://mail.python.org/mailman/options/python-dev/dw%2Bpython= -dev%40hmmz.org