Message 387387 - Python tracker

➜

This issue tracker has been migrated to GitHub, and is currently read-only.
For more information, see the GitHub FAQs in the Python Developer's Guide.

Author	eryksun
Recipients	Ramin Farajpour Cami, eryksun, methane, paul.moore, steve.dower, tim.golden, zach.ware
Date	2021-02-20.04:40:26
SpamBayes Score	-1.0
Marked as misclassified	Yes
Message-id	<1613796026.67.0.274129551452.issue43260@roundup.psfhosted.org>
In-reply-to

Content
> stdout.write("small text") > stdout.write("very large text") # Calls writeflush, but can not allocate buffer. Without the optimization, in most cases this will likely fail in _io_TextIOWrapper_write_impl() at the line `b = (*self->encodefunc)((PyObject *) self, text)`. In some cases, it could be that the latter succeeds, but its size combined with the existing pending_bytes_count leads to a memory error in _textiowrapper_writeflush(). > * If input text is large (>1M?) I'd change write() to only optimize ASCII writes so long as the new total size of pending writes would not exceed the text wrapper's chunk size. Then rearrange the logic to pre-flush the text wrapper if the pending bytes plus the write would exceed the chunk size. Thus the total size of a list of pending writes (aggregating small writes as a chunk), or that of a single ASCII str() object, would be limited to the chunk size, in which case PyBytes_FromStringAndSize in _textiowrapper_writeflush() shouldn't fail in any normal circumstances. For example: if (self->encodefunc != NULL) { [NEW CONDITION] if (PyUnicode_IS_ASCII(text) && (PyUnicode_GET_LENGTH(text) + (self->pending_bytes ? 
self->pending_bytes_count : 0)) <= self->chunk_size && is_asciicompat_encoding(self->encodefunc)) { b = text; Py_INCREF(b); } else { b = (*self->encodefunc)((PyObject *) self, text); } self->encoding_start_of_stream = 0; } else { b = PyObject_CallMethodOneArg(self->encoder, _PyIO_str_encode, text); } Py_DECREF(text); if (b == NULL) return NULL; if (b != text && !PyBytes_Check(b)) { PyErr_Format(PyExc_TypeError, "encoder should return a bytes object, not '%.200s'", Py_TYPE(b)->tp_name); Py_DECREF(b); return NULL; } Py_ssize_t bytes_len; if (b == text) { bytes_len = PyUnicode_GET_LENGTH(b); } else { bytes_len = PyBytes_GET_SIZE(b); } if (self->pending_bytes == NULL) { self->pending_bytes_count = 0; self->pending_bytes = b; } [NEW PRE-FLUSH] else if ((self->pending_bytes_count + bytes_len) > self->chunk_size) { if (_textiowrapper_writeflush(self) < 0) { Py_DECREF(b); return NULL; } self->pending_bytes = b; } else if (!PyList_CheckExact(self->pending_bytes)) { PyObject *list = PyList_New(2); if (list == NULL) { Py_DECREF(b); return NULL; } PyList_SET_ITEM(list, 0, self->pending_bytes); PyList_SET_ITEM(list, 1, b); self->pending_bytes = list; } else { if (PyList_Append(self->pending_bytes, b) < 0) { Py_DECREF(b); return NULL; } Py_DECREF(b); } self->pending_bytes_count += bytes_len; if (self->pending_bytes_count > self->chunk_size || needflush || text_needflush) { if (_textiowrapper_writeflush(self) < 0) return NULL; }

> stdout.write("small text")
> stdout.write("very large text")  # Calls writeflush, but can not allocate buffer.

Without the optimization, in most cases this will likely fail in _io_TextIOWrapper_write_impl() at the line `b = (*self->encodefunc)((PyObject *) self, text)`. In some cases, it could be that the latter succeeds, but its size combined with the existing pending_bytes_count leads to a memory error in _textiowrapper_writeflush().

> * If input text is large (>1M?)

I'd change write() to only optimize ASCII writes so long as the new total size of pending writes would not exceed the text wrapper's chunk size. Then rearrange the logic to pre-flush the text wrapper if the pending bytes plus the write would exceed the chunk size. Thus the total size of a list of pending writes (aggregating small writes as a chunk), or that of a single ASCII str() object, would be limited to the chunk size, in which case PyBytes_FromStringAndSize in _textiowrapper_writeflush() shouldn't fail in any normal circumstances. For example:

    if (self->encodefunc != NULL) {

[NEW CONDITION]

        if (PyUnicode_IS_ASCII(text) &&
              (PyUnicode_GET_LENGTH(text) +
                (self->pending_bytes ? self->pending_bytes_count : 0)) <=
                  self->chunk_size &&
              is_asciicompat_encoding(self->encodefunc)) {
            b = text;
            Py_INCREF(b);
        }
        else {
            b = (*self->encodefunc)((PyObject *) self, text);
        }
        self->encoding_start_of_stream = 0;
    }
    else {
        b = PyObject_CallMethodOneArg(self->encoder, _PyIO_str_encode, text);
    }

    Py_DECREF(text);
    if (b == NULL)
        return NULL;
    if (b != text && !PyBytes_Check(b)) {
        PyErr_Format(PyExc_TypeError,
                     "encoder should return a bytes object, not '%.200s'",
                     Py_TYPE(b)->tp_name);
        Py_DECREF(b);
        return NULL;
    }

    Py_ssize_t bytes_len;
    if (b == text) {
        bytes_len = PyUnicode_GET_LENGTH(b);
    }
    else {
        bytes_len = PyBytes_GET_SIZE(b);
    }

    if (self->pending_bytes == NULL) {
        self->pending_bytes_count = 0;
        self->pending_bytes = b;
    }

[NEW PRE-FLUSH]

    else if ((self->pending_bytes_count + bytes_len) > self->chunk_size) {
        if (_textiowrapper_writeflush(self) < 0) {
            Py_DECREF(b);
            return NULL;
        }
        self->pending_bytes = b;
    }
    else if (!PyList_CheckExact(self->pending_bytes)) {
        PyObject *list = PyList_New(2);
        if (list == NULL) {
            Py_DECREF(b);
            return NULL;
        }
        PyList_SET_ITEM(list, 0, self->pending_bytes);
        PyList_SET_ITEM(list, 1, b);
        self->pending_bytes = list;
    }
    else {
        if (PyList_Append(self->pending_bytes, b) < 0) {
            Py_DECREF(b);
            return NULL;
        }
        Py_DECREF(b);
    }

    self->pending_bytes_count += bytes_len;
    if (self->pending_bytes_count > self->chunk_size || needflush ||
        text_needflush) {
        if (_textiowrapper_writeflush(self) < 0)
            return NULL;
    }

History
Date	User	Action	Args
2021-02-20 04:40:26	eryksun	set	recipients: + eryksun, paul.moore, tim.golden, methane, zach.ware, steve.dower, Ramin Farajpour Cami
2021-02-20 04:40:26	eryksun	set	messageid: <1613796026.67.0.274129551452.issue43260@roundup.psfhosted.org>
2021-02-20 04:40:26	eryksun	link	issue43260 messages
2021-02-20 04:40:26	eryksun	create