Index: setup.py
===================================================================
--- setup.py (revision 64056)
+++ setup.py (working copy)
@@ -422,6 +422,7 @@
         exts.append( Extension("_functools", ["_functoolsmodule.c"]) )
         # Memory-based IO accelerator modules
         exts.append( Extension("_bytesio", ["_bytesio.c"]) )
+        exts.append( Extension("_stringio", ["_stringio.c"]) )
         # atexit
         exts.append( Extension("atexit", ["atexitmodule.c"]) )
         # _json speedups
Index: Lib/io.py
===================================================================
--- Lib/io.py (revision 64056)
+++ Lib/io.py (working copy)
@@ -1346,7 +1346,7 @@
         if not isinstance(errors, str):
             raise ValueError("invalid errors: %r" % errors)
 
-        self.buffer = buffer
+        self._buffer = buffer
         self._line_buffering = line_buffering
         self._encoding = encoding
         self._errors = errors
@@ -1383,6 +1383,10 @@
     def line_buffering(self):
         return self._line_buffering
 
+    @property
+    def buffer(self):
+        return self._buffer
+
     def seekable(self):
         return self._seekable
 
@@ -1769,7 +1773,7 @@
     def newlines(self):
         return self._decoder.newlines if self._decoder else None
 
-class StringIO(TextIOWrapper):
+class _StringIO(TextIOWrapper):
     """An in-memory stream for text. The initial_value argument sets the
     value of object. The other arguments are like those of TextIOWrapper's
     constructor.
@@ -1792,3 +1796,304 @@
     def getvalue(self):
         self.flush()
         return self.buffer.getvalue().decode(self._encoding, self._errors)
+
+try:
+    import _stringio
+
+    # This subclass is a reimplementation of the TextIOWrapper
+    # interface without any of its text decoding facilities. All the
+    # stored data is manipulated with the efficient
+    # _stringio._StringIO extension type. Also, the newline decoding
+    # mechanism of IncrementalNewlineDecoder is reimplemented here for
+    # efficiency. Doing otherwise, would require us to implement a
+    # fake decoder which would add an additional and unnecessary layer
+    # on top of the _StringIO methods.
+
+    class StringIO(_stringio._StringIO, TextIOBase):
+        """Text I/O implementation using an in-memory buffer."""
+
+        _CHUNK_SIZE = 4096
+
+        def __init__(self, initial_value="", encoding="utf-8",
+                     errors="strict", newline="\n", line_buffering=False):
+            if newline not in (None, "", "\n", "\r", "\r\n"):
+                raise ValueError("illegal newline value: %r" % (newline,))
+
+            self._readuniversal = not newline
+            self._readtranslate = newline is None
+            self._readnl = newline
+            self._writetranslate = newline != ""
+            self._writenl = newline or os.linesep
+            self._pending = ""
+            self._seennl = 0
+
+            # The encoding, errors and line_buffering arguments are
+            # bogus since, unlike TextIOWrapper, StringIO does not
+            # have an underlying buffer.
+            self._encoding = encoding
+            self._errors = errors
+            self._line_buffering = line_buffering
+
+            # Reset the buffer first, in case __init__ is called
+            # multiple times.
+            self.truncate(0)
+            if initial_value is None:
+                initial_value = ""
+            self.write(initial_value)
+            self.seek(0)
+
+        @property
+        def encoding(self):
+            return self._encoding
+
+        @property
+        def errors(self):
+            return self._errors
+
+        @property
+        def line_buffering(self):
+            return self._line_buffering
+
+        @property
+        def buffer(self):
+            raise UnsupportedOperation("%s.buffer attribute unsupported" %
+                                       self.__class__.__name__)
+
+        def _decode_newlines(self, input, final=False):
+            """Prepend pending text to input and decode its newlines.
+
+            The newline kinds seen are recorded in self._seennl.  When
+            translation is enabled, CRLF and lone CR become LF.  Unless
+            final is true, a trailing CR is held back in self._pending.
+            """
+            # decode input (with the eventual \r from a previous pass)
+            if self._pending:
+                input = self._pending + input
+
+            # retain last \r even when not translating data:
+            # then readline() is sure to get \r\n in one pass
+            if input.endswith("\r") and not final:
+                input = input[:-1]
+                self._pending = "\r"
+            else:
+                self._pending = ""
+
+            # Record which newlines are read
+            crlf = input.count('\r\n')
+            cr = input.count('\r') - crlf
+            lf = input.count('\n') - crlf
+            self._seennl |= (lf and self._LF) | (cr and self._CR) \
+                         | (crlf and self._CRLF)
+
+            # Bind output up front and chain the replacements so that
+            # CRLF and lone CR are both translated.
+            output = input
+            if self._readtranslate:
+                if crlf:
+                    output = output.replace("\r\n", "\n")
+                if cr:
+                    output = output.replace("\r", "\n")
+            return output
+
+        def writable(self):
+            return True
+
+        def readable(self):
+            return True
+
+        def seekable(self):
+            return True
+
+        _read = _stringio._StringIO.read
+        _write = _stringio._StringIO.write
+        _tell = _stringio._StringIO.tell
+        _seek = _stringio._StringIO.seek
+        _truncate = _stringio._StringIO.truncate
+        _getvalue = _stringio._StringIO.getvalue
+
+        def getvalue(self) -> str:
+            """Retrieve the entire contents of the object."""
+            if self.closed:
+                raise ValueError("read on closed file")
+            return self._getvalue()
+
+        def write(self, s: str) -> int:
+            """Write string s to file.
+
+            Returns the number of characters written.
+            """
+            if self.closed:
+                raise ValueError("write to closed file")
+            if not isinstance(s, str):
+                raise TypeError("can't write %s to text stream" %
+                                s.__class__.__name__)
+            length = len(s)
+            haslf = (self._writetranslate or self._line_buffering) and "\n" in s
+            if haslf and self._writetranslate and self._writenl != "\n":
+                s = s.replace("\n", self._writenl)
+            # NOTE(review): pending read-ahead is dropped here without
+            # seeking back, so a write after a partial read lands at the
+            # underlying position rather than the one tell() reported.
+            self._pending = ""
+            self._write(s)
+            return length
+
+        def read(self, n: int = None) -> str:
+            """Read at most n characters, returned as a string.
+
+            If the argument is negative or omitted, read until EOF
+            is reached. Return an empty string at EOF.
+            """
+            if self.closed:
+                raise ValueError("read to closed file")
+            if n is None:
+                n = -1
+            # _decode_newlines() already prepends (and clears) self._pending,
+            # so it must not be prepended a second time here.
+            res = self._decode_newlines(self._read(n), True)
+            if n < 0:
+                return res
+            # Keep anything beyond the requested n characters for the next
+            # call; tell() subtracts its length from the raw position.
+            self._pending = res[n:]
+            return res[:n]
+
+        def tell(self) -> int:
+            """Tell the current file position."""
+            if self.closed:
+                raise ValueError("tell from closed file")
+            if self._pending:
+                return self._tell() - len(self._pending)
+            else:
+                return self._tell()
+
+        def seek(self, pos: int = None, whence: int = 0) -> int:
+            """Change stream position.
+
+            Seek to character offset pos relative to position indicated by whence:
+                0  Start of stream (the default).  pos should be >= 0;
+                1  Current position - pos must be 0;
+                2  End of stream - pos must be 0.
+            Returns the new absolute position.
+            """
+            if self.closed:
+                raise ValueError("seek from closed file")
+            self._pending = ""
+            return self._seek(pos, whence)
+
+        def truncate(self, pos: int = None) -> int:
+            """Truncate size to pos.
+
+            The pos argument defaults to the current file position, as
+            returned by tell().  Imply an absolute seek to pos.
+            Returns the new absolute position.
+            """
+            if self.closed:
+                raise ValueError("truncate from closed file")
+            self._pending = ""
+            return self._truncate(pos)
+
+        def readline(self, limit: int = None) -> str:
+            """Read and return one line, including its line terminator.
+
+            If limit is non-negative, at most limit characters are
+            returned and the rest of the line stays pending.  At EOF
+            the remaining text (possibly empty) is returned.
+            """
+            if self.closed:
+                raise ValueError("read from closed file")
+            if limit is None:
+                limit = -1
+            if limit >= 0:
+                # XXX: Hack to support limit argument, for backwards
+                # XXX  compatibility
+                line = self.readline()
+                if len(line) <= limit:
+                    return line
+                line, self._pending = line[:limit], line[limit:] + self._pending
+                return line
+
+            line = self._pending
+            self._pending = ""
+
+            start = 0
+            pos = endpos = None
+            while True:
+                if self._readtranslate:
+                    # Newlines are already translated, only search for \n
+                    pos = line.find('\n', start)
+                    if pos >= 0:
+                        endpos = pos + 1
+                        break
+                    else:
+                        start = len(line)
+
+                elif self._readuniversal:
+                    # Universal newline search. Find any of \r, \r\n, \n
+                    # The decoder ensures that \r\n are not split in two pieces
+
+                    # In C we'd look for these in parallel of course.
+                    nlpos = line.find("\n", start)
+                    crpos = line.find("\r", start)
+                    if crpos == -1:
+                        if nlpos == -1:
+                            # Nothing found
+                            start = len(line)
+                        else:
+                            # Found \n
+                            endpos = nlpos + 1
+                            break
+                    elif nlpos == -1:
+                        # Found lone \r
+                        endpos = crpos + 1
+                        break
+                    elif nlpos < crpos:
+                        # Found \n
+                        endpos = nlpos + 1
+                        break
+                    elif nlpos == crpos + 1:
+                        # Found \r\n
+                        endpos = crpos + 2
+                        break
+                    else:
+                        # Found \r
+                        endpos = crpos + 1
+                        break
+                else:
+                    # non-universal
+                    pos = line.find(self._readnl)
+                    if pos >= 0:
+                        endpos = pos + len(self._readnl)
+                        break
+
+                # No line ending seen yet - get more data
+                more_line = self.read(self._CHUNK_SIZE)
+                if more_line:
+                    line += more_line
+                else:
+                    # end of file
+                    return line
+
+            self._pending = line[endpos:]
+            return line[:endpos]
+
+        _LF = 1
+        _CR = 2
+        _CRLF = 4
+
+        @property
+        def newlines(self):
+            """Newline kinds read so far: None, a string, or a tuple."""
+            return (None,
+                    "\n",
+                    "\r",
+                    ("\r", "\n"),
+                    "\r\n",
+                    ("\n", "\r\n"),
+                    ("\r", "\r\n"),
+                    ("\r", "\n", "\r\n")
+                   )[self._seennl]
+
+
+except ImportError:
+    StringIO = _StringIO
Index: Lib/test/test_memoryio.py
===================================================================
--- Lib/test/test_memoryio.py (revision 64056)
+++ Lib/test/test_memoryio.py (working copy)
@@ -404,10 +404,14 @@
     class CBytesIOTest(PyBytesIOTest):
         ioclass = io.BytesIO
 
+    class CStringIOTest(PyStringIOTest):
+        ioclass = io.StringIO
+
+
 def test_main():
     tests = [PyBytesIOTest, PyStringIOTest]
     if has_c_implementation:
-        tests.extend([CBytesIOTest])
+        tests.extend([CBytesIOTest, CStringIOTest])
     support.run_unittest(*tests)
 
 if __name__ == '__main__':
Index: Modules/_stringio.c
===================================================================
--- Modules/_stringio.c (revision 0)
+++ Modules/_stringio.c (revision 0)
@@ -0,0 +1,384 @@
+#include "Python.h"
+
+/* This module is a stripped down version of _bytesio.c with a Py_UNICODE
+   buffer. Most of the functionality is provided by subclassing _StringIO. */
+
+
+typedef struct {
+    PyObject_HEAD
+    Py_UNICODE *buf;         /* character buffer (buf_size slots) */
+    Py_ssize_t pos;          /* current read/write position */
+    Py_ssize_t string_size;  /* number of characters in the stream */
+    size_t buf_size;         /* allocated size of buf, in characters */
+} StringIOObject;
+
+
+/* Internal routine for changing the size, in terms of characters, of the
+   buffer of StringIO objects.  The caller should ensure that the 'size'
+   argument is non-negative.  Returns 0 on success, -1 otherwise. */
+static int
+resize_buffer(StringIOObject *self, size_t size)
+{
+    /* Here, unsigned types are used to avoid dealing with signed integer
+       overflow, which is undefined in C. */
+    size_t alloc = self->buf_size;
+    Py_UNICODE *new_buf = NULL;
+
+    assert(self->buf != NULL);
+
+    /* For simplicity, stay in the range of the signed type. Anyway, Python
+       doesn't allow strings to be longer than this. */
+    if (size > PY_SSIZE_T_MAX)
+        goto overflow;
+
+    if (size < alloc / 2) {
+        /* Major downsize; resize down to exact size. */
+        alloc = size + 1;
+    }
+    else if (size < alloc) {
+        /* Within allocated size; quick exit */
+        return 0;
+    }
+    else if (size <= alloc * 1.125) {
+        /* Moderate upsize; overallocate similar to list_resize() */
+        alloc = size + (size >> 3) + (size < 9 ? 3 : 6);
+    }
+    else {
+        /* Major upsize; resize up to exact size */
+        alloc = size + 1;
+    }
+
+    if (alloc > ((size_t)-1) / sizeof(Py_UNICODE))
+        goto overflow;
+    new_buf = (Py_UNICODE *)PyMem_Realloc(self->buf,
+                                          alloc * sizeof(Py_UNICODE));
+    if (new_buf == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
+    self->buf_size = alloc;
+    self->buf = new_buf;
+
+    return 0;
+
+  overflow:
+    PyErr_SetString(PyExc_OverflowError,
+                    "new buffer size too large");
+    return -1;
+}
+
+/* Internal routine for writing a string of characters to the buffer of a
+   StringIO object. Returns the number of characters written, or -1 on error. */
+static Py_ssize_t
+write_str(StringIOObject *self, const Py_UNICODE *str, Py_ssize_t len)
+{
+    assert(self->buf != NULL);
+    assert(self->pos >= 0);
+    assert(len >= 0);
+
+    /* This overflow check is not strictly necessary. However, it saves us
+       from dealing with funky things like comparing an unsigned and a signed
+       integer. */
+    if (self->pos > PY_SSIZE_T_MAX - len) {
+        PyErr_SetString(PyExc_OverflowError,
+                        "new position too large");
+        return -1;
+    }
+    if (self->pos + len > self->string_size) {
+        if (resize_buffer(self, self->pos + len) < 0)
+            return -1;
+    }
+
+    if (self->pos > self->string_size) {
+        /* In case of overseek, pad with null bytes the buffer region between
+           the end of stream and the current position.
+
+          0   lo      string_size                           hi
+          |   |<---used--->|<----------available----------->|
+          |   |            <--to pad-->|<---to write--->    |
+          0   buf                   position
+
+        */
+        memset(self->buf + self->string_size, '\0',
+               (self->pos - self->string_size) * sizeof(Py_UNICODE));
+    }
+
+    /* Copy the data to the internal buffer, overwriting some of the
+       existing data if self->pos < self->string_size. */
+    memcpy(self->buf + self->pos, str, len * sizeof(Py_UNICODE));
+    self->pos += len;
+
+    /* Set the new length of the internal string if it has changed */
+    if (self->string_size < self->pos) {
+        self->string_size = self->pos;
+    }
+
+    return len;
+}
+
+/* Return the whole buffer contents (string_size characters) as a str. */
+static PyObject *
+stringio_getvalue(StringIOObject *self)
+{
+    return PyUnicode_FromUnicode(self->buf, self->string_size);
+}
+
+/* Return the current position in the buffer. */
+static PyObject *
+stringio_tell(StringIOObject *self)
+{
+    return PyLong_FromSsize_t(self->pos);
+}
+
+/* read([size]): return up to size characters starting at the current
+   position and advance past them.  None or a negative size reads to
+   the end of the stream. */
+static PyObject *
+stringio_read(StringIOObject *self, PyObject *args)
+{
+    Py_ssize_t size, n;
+    Py_UNICODE *output;
+    PyObject *arg = Py_None;
+
+    if (!PyArg_ParseTuple(args, "|O:read", &arg))
+        return NULL;
+
+    if (PyLong_Check(arg)) {
+        size = PyLong_AsSsize_t(arg);
+        if (size == -1 && PyErr_Occurred())
+            return NULL;
+    }
+    else if (arg == Py_None) {
+        /* Read until EOF is reached, by default. */
+        size = -1;
+    }
+    else {
+        PyErr_Format(PyExc_TypeError, "integer argument expected, got '%s'",
+                     Py_TYPE(arg)->tp_name);
+        return NULL;
+    }
+
+    /* adjust invalid sizes */
+    n = self->string_size - self->pos;
+    if (size < 0 || size > n) {
+        size = n;
+        if (size < 0)
+            size = 0;
+    }
+
+    assert(self->buf != NULL);
+    output = self->buf + self->pos;
+    self->pos += size;
+
+    return PyUnicode_FromUnicode(output, size);
+}
+
+/* truncate([size]): shrink the stream to size characters (default: the
+   current position) and move the position to size.  The stream is never
+   extended here; a larger size only moves the position. */
+static PyObject *
+stringio_truncate(StringIOObject *self, PyObject *args)
+{
+    Py_ssize_t size;
+    PyObject *arg = Py_None;
+
+    if (!PyArg_ParseTuple(args, "|O:truncate", &arg))
+        return NULL;
+
+    if (PyLong_Check(arg)) {
+        size = PyLong_AsSsize_t(arg);
+        if (size == -1 && PyErr_Occurred())
+            return NULL;
+    }
+    else if (arg == Py_None) {
+        /* Truncate to current position if no argument is passed. */
+        size = self->pos;
+    }
+    else {
+        PyErr_Format(PyExc_TypeError, "integer argument expected, got '%s'",
+                     Py_TYPE(arg)->tp_name);
+        return NULL;
+    }
+
+    if (size < 0) {
+        PyErr_Format(PyExc_ValueError,
+                     "Negative size value %zd", size);
+        return NULL;
+    }
+
+    if (size < self->string_size) {
+        self->string_size = size;
+        if (resize_buffer(self, size) < 0)
+            return NULL;
+    }
+    self->pos = size;
+
+    return PyLong_FromSsize_t(size);
+}
+
+/* seek(pos[, whence]): set the position.  Only absolute seeks (whence 0)
+   may be nonzero; whence 1 and 2 require pos == 0. */
+static PyObject *
+stringio_seek(StringIOObject *self, PyObject *args)
+{
+    Py_ssize_t pos;
+    int mode = 0;
+
+    if (!PyArg_ParseTuple(args, "n|i:seek", &pos, &mode))
+        return NULL;
+
+    if (mode != 0 && mode != 1 && mode != 2) {
+        PyErr_Format(PyExc_ValueError,
+                     "Invalid whence (%i, should be 0, 1 or 2)", mode);
+        return NULL;
+    }
+    else if (pos < 0 && mode == 0) {
+        PyErr_Format(PyExc_ValueError,
+                     "Negative seek position %zd", pos);
+        return NULL;
+    }
+    else if (mode != 0 && pos != 0) {
+        PyErr_SetString(PyExc_IOError,
+                        "Can't do nonzero cur-relative seeks");
+        return NULL;
+    }
+
+    /* mode 0: offset relative to beginning of the string.
+       mode 1: no change to current position.
+       mode 2: change position to end of file. */
+    if (mode == 1) {
+        pos = self->pos;
+    }
+    else if (mode == 2) {
+        pos = self->string_size;
+    }
+
+    self->pos = pos;
+
+    return PyLong_FromSsize_t(self->pos);
+}
+
+/* write(str): write str at the current position and return the number of
+   characters written. */
+static PyObject *
+stringio_write(StringIOObject *self, PyObject *obj)
+{
+    const Py_UNICODE *str;
+    Py_ssize_t size;
+    Py_ssize_t n = 0;
+
+    if (PyUnicode_Check(obj)) {
+        str = PyUnicode_AsUnicode(obj);
+        size = PyUnicode_GetSize(obj);
+    }
+    else {
+        PyErr_Format(PyExc_TypeError, "string argument expected, got '%s'",
+                     Py_TYPE(obj)->tp_name);
+        return NULL;
+    }
+
+    if (size != 0) {
+        n = write_str(self, str, size);
+        if (n < 0)
+            return NULL;
+    }
+
+    return PyLong_FromSsize_t(n);
+}
+
+/* Release the character buffer, then the object itself. */
+static void
+stringio_dealloc(StringIOObject *self)
+{
+    PyMem_Free(self->buf);
+    Py_TYPE(self)->tp_free(self);
+}
+
+/* tp_new: allocate the object with an empty, zero-sized buffer. */
+static PyObject *
+stringio_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    StringIOObject *self;
+
+    assert(type != NULL && type->tp_alloc != NULL);
+    self = (StringIOObject *)type->tp_alloc(type, 0);
+    if (self == NULL)
+        return NULL;
+
+    self->string_size = 0;
+    self->pos = 0;
+    self->buf_size = 0;
+    self->buf = (Py_UNICODE *)PyMem_Malloc(0);
+    if (self->buf == NULL) {
+        Py_DECREF(self);
+        return PyErr_NoMemory();
+    }
+
+    return (PyObject *)self;
+}
+
+static struct PyMethodDef stringio_methods[] = {
+    {"getvalue", (PyCFunction)stringio_getvalue, METH_NOARGS, NULL},
+    {"read", (PyCFunction)stringio_read, METH_VARARGS, NULL},
+    {"tell", (PyCFunction)stringio_tell, METH_NOARGS, NULL},
+    {"truncate", (PyCFunction)stringio_truncate, METH_VARARGS, NULL},
+    {"seek", (PyCFunction)stringio_seek, METH_VARARGS, NULL},
+    {"write", (PyCFunction)stringio_write, METH_O, NULL},
+    {NULL, NULL} /* sentinel */
+};
+
+static PyTypeObject StringIO_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    "_stringio._StringIO", /*tp_name*/
+    sizeof(StringIOObject), /*tp_basicsize*/
+    0, /*tp_itemsize*/
+    (destructor)stringio_dealloc, /*tp_dealloc*/
+    0, /*tp_print*/
+    0, /*tp_getattr*/
+    0, /*tp_setattr*/
+    0, /*tp_compare*/
+    0, /*tp_repr*/
+    0, /*tp_as_number*/
+    0, /*tp_as_sequence*/
+    0, /*tp_as_mapping*/
+    0, /*tp_hash*/
+    0, /*tp_call*/
+    0, /*tp_str*/
+    0, /*tp_getattro*/
+    0, /*tp_setattro*/
+    0, /*tp_as_buffer*/
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
+    0, /*tp_doc*/
+    0, /*tp_traverse*/
+    0, /*tp_clear*/
+    0, /*tp_richcompare*/
+    0, /*tp_weaklistoffset*/
+    0, /*tp_iter*/
+    0, /*tp_iternext*/
+    stringio_methods, /*tp_methods*/
+    0, /*tp_members*/
+    0, /*tp_getset*/
+    0, /*tp_base*/
+    0, /*tp_dict*/
+    0, /*tp_descr_get*/
+    0, /*tp_descr_set*/
+    0, /*tp_dictoffset*/
+    0, /*tp_init*/
+    0, /*tp_alloc*/
+    stringio_new, /*tp_new*/
+};
+
+/* Module initialisation: register the _StringIO type. */
+PyMODINIT_FUNC
+init_stringio(void)
+{
+    PyObject *m;
+
+    if (PyType_Ready(&StringIO_Type) < 0)
+        return;
+    m = Py_InitModule("_stringio", NULL);
+    if (m == NULL)
+        return;
+    Py_INCREF(&StringIO_Type);
+    PyModule_AddObject(m, "_StringIO", (PyObject *)&StringIO_Type);
+}

Property changes on: Modules/_stringio.c
___________________________________________________________________
Name: svn:eol-style
   + native