Index: Lib/_pyio.py =================================================================== --- Lib/_pyio.py (revision 73236) +++ Lib/_pyio.py (working copy) @@ -6,6 +6,7 @@ import abc import codecs import warnings +import sys # Import _thread instead of threading to reduce startup cost try: from _thread import allocate_lock as Lock @@ -765,6 +766,11 @@ self._buffer = buf self._pos = 0 + def __getstate__(self): + if self.closed: + raise ValueError("__getstate__ on closed file") + return self.__dict__.copy() + def getvalue(self): """Return the bytes value (contents) of the buffer """ @@ -1917,21 +1923,27 @@ # C version, even under Windows. if newline is None: self._writetranslate = False - if initial_value: + if initial_value is not None: if not isinstance(initial_value, str): - initial_value = str(initial_value) + raise TypeError("initial_value must be str or None, not {0}" + .format(type(initial_value).__name__)) self.write(initial_value) self.seek(0) - def getvalue(self): - self.flush() - return self.buffer.getvalue().decode(self._encoding, self._errors) + def __getstate__(self): + if self.closed: + raise ValueError("__getstate__ on closed file") + return self.__dict__.copy() def __repr__(self): # TextIOWrapper tells the encoding in its repr. In StringIO, # that's a implementation detail. 
return object.__repr__(self) + def getvalue(self): + self.flush() + return self.buffer.getvalue().decode(self._encoding, self._errors) + @property def encoding(self): return None Index: Lib/test/test_memoryio.py =================================================================== --- Lib/test/test_memoryio.py (revision 73236) +++ Lib/test/test_memoryio.py (working copy) @@ -9,6 +9,7 @@ import io import _pyio as pyio import sys +import pickle class MemorySeekTestMixin: @@ -338,7 +339,43 @@ self.assertEqual(test1(), buf) self.assertEqual(test2(), buf) + def test_pickling(self): + buf = self.buftype("1234567890") + memio = self.ioclass(buf) + memio.foo = 42 + memio.seek(2) + class PickleTestMemIO(self.ioclass): + def __init__(me, initvalue, foo): + self.ioclass.__init__(me, initvalue) + me.foo = foo + # __getnewargs__ is undefined on purpose. This checks that PEP 307 + # is used to provide pickling support. + + # Pickle expects the class to be on the module level. Here we use a + # little hack to allow the PickleTestMemIO class to derive from + # self.ioclass without having to define all combinations explicitly on + # the module-level. + import __main__ + PickleTestMemIO.__module__ = '__main__' + __main__.PickleTestMemIO = PickleTestMemIO + submemio = PickleTestMemIO(buf, 80) + submemio.seek(2) + + # We only support pickle protocol 2 and onward since we use extended + # __reduce__ API of PEP 307 to provide pickling support. 
+ for proto in range(2, pickle.HIGHEST_PROTOCOL + 1): + for obj in (memio, submemio): + obj2 = pickle.loads(pickle.dumps(obj, protocol=proto)) + self.assertEqual(obj.getvalue(), obj2.getvalue()) + self.assertEqual(obj.__class__, obj2.__class__) + self.assertEqual(obj.foo, obj2.foo) + self.assertEqual(obj.tell(), obj2.tell()) + obj2.close() + self.assertRaises(ValueError, pickle.dumps, obj2, proto) + del __main__.PickleTestMemIO + + class PyBytesIOTest(MemoryTestMixin, MemorySeekTestMixin, unittest.TestCase): UnsupportedOperation = pyio.UnsupportedOperation @@ -417,14 +454,27 @@ self.assertEqual(memio.getvalue(), buf) -class PyStringIOTest(MemoryTestMixin, MemorySeekTestMixin, unittest.TestCase): - buftype = str - ioclass = pyio.StringIO - UnsupportedOperation = pyio.UnsupportedOperation - EOF = "" +class TextIOTestMixin: - # TextIO-specific behaviour. + def test_relative_seek(self): + memio = self.ioclass() + self.assertRaises(IOError, memio.seek, -1, 1) + self.assertRaises(IOError, memio.seek, 3, 1) + self.assertRaises(IOError, memio.seek, -3, 1) + self.assertRaises(IOError, memio.seek, -1, 2) + self.assertRaises(IOError, memio.seek, 1, 1) + self.assertRaises(IOError, memio.seek, 1, 2) + + def test_textio_properties(self): + memio = self.ioclass() + + # These are just dummy values but we nevertheless check them for fear + # of unexpected breakage. 
+ self.assertTrue(memio.encoding is None) + self.assertEqual(memio.errors, "strict") + self.assertEqual(memio.line_buffering, False) + def test_newlines_property(self): memio = self.ioclass(newline=None) # The C StringIO decodes newlines in write() calls, but the Python @@ -444,25 +494,6 @@ force_decode() self.assertEqual(memio.newlines, ("\r", "\n", "\r\n")) - def test_relative_seek(self): - memio = self.ioclass() - - self.assertRaises(IOError, memio.seek, -1, 1) - self.assertRaises(IOError, memio.seek, 3, 1) - self.assertRaises(IOError, memio.seek, -3, 1) - self.assertRaises(IOError, memio.seek, -1, 2) - self.assertRaises(IOError, memio.seek, 1, 1) - self.assertRaises(IOError, memio.seek, 1, 2) - - def test_textio_properties(self): - memio = self.ioclass() - - # These are just dummy values but we nevertheless check them for fear - # of unexpected breakage. - self.assertTrue(memio.encoding is None) - self.assertEqual(memio.errors, "strict") - self.assertEqual(memio.line_buffering, False) - def test_newline_none(self): # newline=None memio = self.ioclass("a\nb\r\nc\rd", newline=None) @@ -505,7 +536,6 @@ def test_newline_cr(self): # newline="\r" memio = self.ioclass("a\nb\r\nc\rd", newline="\r") - memio.seek(0) self.assertEqual(memio.read(), "a\rb\r\rc\rd") memio.seek(0) self.assertEqual(list(memio), ["a\r", "b\r", "\r", "c\r", "d"]) @@ -513,7 +543,6 @@ def test_newline_crlf(self): # newline="\r\n" memio = self.ioclass("a\nb\r\nc\rd", newline="\r\n") - memio.seek(0) self.assertEqual(memio.read(), "a\r\nb\r\r\nc\rd") memio.seek(0) self.assertEqual(list(memio), ["a\r\n", "b\r\r\n", "c\rd"]) @@ -524,10 +553,59 @@ self.assertEqual(memio.read(5), "a\nb\n") +class PyStringIOTest(MemoryTestMixin, MemorySeekTestMixin, + TextIOTestMixin, unittest.TestCase): + buftype = str + ioclass = pyio.StringIO + UnsupportedOperation = pyio.UnsupportedOperation + EOF = "" + + +class PyStringIOPickleTest(TextIOTestMixin, unittest.TestCase): + """Test if pickle restores properly the 
internal state of StringIO. + """ + buftype = str + UnsupportedOperation = pyio.UnsupportedOperation + EOF = "" + + class ioclass(pyio.StringIO): + def __new__(cls, *args, **kwargs): + return pickle.loads(pickle.dumps(pyio.StringIO(*args, **kwargs))) + def __init__(self, *args, **kwargs): + pass + + class CBytesIOTest(PyBytesIOTest): ioclass = io.BytesIO UnsupportedOperation = io.UnsupportedOperation + def test_getstate(self): + memio = self.ioclass() + state = memio.__getstate__() + self.assertEqual(len(state), 3) + bytearray(state[0]) # Check if state[0] supports the buffer interface. + self.assert_(isinstance(state[1], int)) + self.assert_(isinstance(state[2], dict) or state[2] is None) + memio.close() + self.assertRaises(ValueError, memio.__getstate__) + + def test_setstate(self): + # This checks whether __setstate__ does proper input validation. + memio = self.ioclass() + memio.__setstate__((b"no error", 0, None)) + memio.__setstate__((bytearray(b"no error"), 0, None)) + memio.__setstate__((b"no error", 0, {'spam': 3})) + self.assertRaises(ValueError, memio.__setstate__, (b"", -1, None)) + self.assertRaises(TypeError, memio.__setstate__, ("unicode", 0, None)) + self.assertRaises(TypeError, memio.__setstate__, (b"", 0.0, None)) + self.assertRaises(TypeError, memio.__setstate__, (b"", 0, 0)) + self.assertRaises(TypeError, memio.__setstate__, (b"len-test", 0)) + self.assertRaises(TypeError, memio.__setstate__) + self.assertRaises(TypeError, memio.__setstate__, 0) + memio.close() + self.assertRaises(ValueError, memio.__setstate__, (b"closed", 0, None)) + + class CStringIOTest(PyStringIOTest): ioclass = io.StringIO UnsupportedOperation = io.UnsupportedOperation @@ -546,9 +624,48 @@ self.assertEqual(memio.tell(), len(buf) * 2) self.assertEqual(memio.getvalue(), buf + buf) + def test_getstate(self): + memio = self.ioclass() + state = memio.__getstate__() + self.assertEqual(len(state), 4) + self.assert_(isinstance(state[0], str)) + self.assert_(isinstance(state[1], 
str)) + self.assert_(isinstance(state[2], int)) + self.assert_(isinstance(state[3], dict) or state[3] is None) + memio.close() + self.assertRaises(ValueError, memio.__getstate__) + def test_setstate(self): + # This checks whether __setstate__ does proper input validation. + memio = self.ioclass() + memio.__setstate__(("no error", "\n", 0, None)) + memio.__setstate__(("no error", "", 0, {'spam': 3})) + self.assertRaises(ValueError, memio.__setstate__, ("", "f", 0, None)) + self.assertRaises(ValueError, memio.__setstate__, ("", "", -1, None)) + self.assertRaises(TypeError, memio.__setstate__, (b"", "", 0, None)) + self.assertRaises(TypeError, memio.__setstate__, ("", b"", 0, None)) + self.assertRaises(TypeError, memio.__setstate__, ("", "", 0.0, None)) + self.assertRaises(TypeError, memio.__setstate__, ("", "", 0, 0)) + self.assertRaises(TypeError, memio.__setstate__, ("len-test", 0)) + self.assertRaises(TypeError, memio.__setstate__) + self.assertRaises(TypeError, memio.__setstate__, 0) + memio.close() + self.assertRaises(ValueError, memio.__setstate__, ("closed", "", 0, None)) + + +class CStringIOPickleTest(PyStringIOPickleTest): + UnsupportedOperation = io.UnsupportedOperation + + class ioclass(io.StringIO): + def __new__(cls, *args, **kwargs): + return pickle.loads(pickle.dumps(io.StringIO(*args, **kwargs))) + def __init__(self, *args, **kwargs): + pass + + def test_main(): - tests = [PyBytesIOTest, PyStringIOTest, CBytesIOTest, CStringIOTest] + tests = [PyBytesIOTest, PyStringIOTest, CBytesIOTest, CStringIOTest, + PyStringIOPickleTest, CStringIOPickleTest] support.run_unittest(*tests) if __name__ == '__main__': Index: Modules/_io/bytesio.c =================================================================== --- Modules/_io/bytesio.c (revision 73236) +++ Modules/_io/bytesio.c (working copy) @@ -606,6 +595,120 @@ Py_RETURN_NONE; } +/* Pickling support. 
+ + Note that only pickle protocol 2 and onward are supported since we use + extended __reduce__ API of PEP 307 to make BytesIO instances picklable. + + Providing support for protocol < 2 would require the __reduce_ex__ method + which is notably long-winded when defined properly. + + For BytesIO, the implementation would be similar to the one coded for + object.__reduce_ex__, but slightly less general. To be more specific, we + could call bytesio_getstate directly and avoid checking for the presence of + a fallback __reduce__ method. However, we would still need a __newobj__ + function to use the efficient instance representation of PEP 307. + */ + +static PyObject * +bytesio_getstate(BytesIOObject *self) +{ + PyObject *initvalue = bytesio_getvalue(self); + PyObject *dict; + PyObject *state; + + if (initvalue == NULL) + return NULL; + if (self->dict == NULL) { + Py_INCREF(Py_None); + dict = Py_None; + } + else { + dict = PyDict_Copy(self->dict); + if (dict == NULL) + return NULL; + } + + state = Py_BuildValue("(OnN)", initvalue, self->pos, dict); + Py_DECREF(initvalue); + return state; +} + +static PyObject * +bytesio_setstate(BytesIOObject *self, PyObject *state) +{ + PyObject *result; + PyObject *position_obj; + PyObject *dict; + Py_ssize_t pos; + + assert(state != NULL); + + /* We allow the state tuple to be longer than 3, because we may need + someday to extend the object's state without breaking + backward-compatibility. */ + if (!PyTuple_Check(state) || Py_SIZE(state) < 3) { + PyErr_Format(PyExc_TypeError, + "%.200s.__setstate__ argument should be 3-tuple, got %.200s", + Py_TYPE(self)->tp_name, Py_TYPE(state)->tp_name); + return NULL; + } + /* Reset the object to its default state. This is only needed to handle + the case of repeated calls to __setstate__. */ + self->string_size = 0; + self->pos = 0; + + /* Set the value of the internal buffer. If state[0] does not support the + buffer protocol, bytesio_write will raise the appropriate TypeError. 
*/ + result = bytesio_write(self, PyTuple_GET_ITEM(state, 0)); + if (result == NULL) + return NULL; + Py_DECREF(result); + + /* Set carefully the position value. Alternatively, we could use the seek + method instead of modifying self->pos directly to better protect the + object's internal state against erroneous (or malicious) inputs. */ + position_obj = PyTuple_GET_ITEM(state, 1); + if (!PyLong_Check(position_obj)) { + PyErr_Format(PyExc_TypeError, + "second item of state must be an integer, not %.200s", + Py_TYPE(position_obj)->tp_name); + return NULL; + } + pos = PyLong_AsSsize_t(position_obj); + if (pos == -1 && PyErr_Occurred()) + return NULL; + if (pos < 0) { + PyErr_SetString(PyExc_ValueError, + "position value cannot be negative"); + return NULL; + } + self->pos = pos; + + /* Set the dictionary of the instance variables. */ + dict = PyTuple_GET_ITEM(state, 2); + if (dict != Py_None) { + if (!PyDict_Check(dict)) { + PyErr_Format(PyExc_TypeError, + "third item of state should be a dict, got a %.200s", + Py_TYPE(dict)->tp_name); + return NULL; + } + if (self->dict) { + /* Alternatively, we could replace the internal dictionary + completely. However, it seems more practical to just update it. */ + if (PyDict_Update(self->dict, dict) < 0) + return NULL; + } + else { + Py_INCREF(dict); + self->dict = dict; + } + } + + Py_RETURN_NONE; +} + static void bytesio_dealloc(BytesIOObject *self) { @@ -627,9 +730,9 @@ if (self == NULL) return NULL; - self->string_size = 0; - self->pos = 0; - self->buf_size = 0; + /* tp_alloc initializes all the fields to zero. So we don't have to + initialize them here. 
*/ + self->buf = (char *)PyMem_Malloc(0); if (self->buf == NULL) { Py_DECREF(self); @@ -705,6 +816,8 @@ {"getvalue", (PyCFunction)bytesio_getvalue, METH_VARARGS, getval_doc}, {"seek", (PyCFunction)bytesio_seek, METH_VARARGS, seek_doc}, {"truncate", (PyCFunction)bytesio_truncate, METH_VARARGS, truncate_doc}, + {"__getstate__", (PyCFunction)bytesio_getstate, METH_NOARGS, NULL}, + {"__setstate__", (PyCFunction)bytesio_setstate, METH_O, NULL}, {NULL, NULL} /* sentinel */ }; Index: Modules/_io/stringio.c =================================================================== --- Modules/_io/stringio.c (revision 73236) +++ Modules/_io/stringio.c (working copy) @@ -495,6 +495,7 @@ stringio_traverse(StringIOObject *self, visitproc visit, void *arg) { Py_VISIT(self->dict); + Py_VISIT(self->weakreflist); return 0; } @@ -502,6 +503,8 @@ stringio_clear(StringIOObject *self) { Py_CLEAR(self->dict); + if (self->weakreflist != NULL) + PyObject_ClearWeakRefs((PyObject *) self); return 0; } @@ -512,10 +515,9 @@ Py_CLEAR(self->readnl); Py_CLEAR(self->writenl); Py_CLEAR(self->decoder); + stringio_clear(self); if (self->buf) PyMem_Free(self->buf); - if (self->weakreflist != NULL) - PyObject_ClearWeakRefs((PyObject *) self); Py_TYPE(self)->tp_free(self); } @@ -529,9 +531,9 @@ if (self == NULL) return NULL; - self->string_size = 0; - self->pos = 0; - self->buf_size = 0; + /* tp_alloc initializes all the fields to zero. So we don't have to + initialize them here. */ + self->buf = (Py_UNICODE *)PyMem_Malloc(0); if (self->buf == NULL) { Py_DECREF(self); @@ -546,22 +548,42 @@ { char *kwlist[] = {"initial_value", "newline", NULL}; PyObject *value = NULL; + PyObject *newline_obj = NULL; char *newline = "\n"; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oz:__init__", kwlist, - &value, &newline)) + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OO:__init__", kwlist, + &value, &newline_obj)) return -1; + /* Parse the newline argument. 
This used to be done with the 'z' + specifier, however this allowed any object with the buffer interface to + be converted. Thus we have to parse it manually since we only want to + allow unicode objects or None. */ + if (newline_obj == Py_None) { + newline = NULL; + } + else if (newline_obj) { + if (!PyUnicode_Check(newline_obj)) { + PyErr_Format(PyExc_TypeError, + "newline must be str or None, not %.200s", + Py_TYPE(newline_obj)->tp_name); + return -1; + } + newline = _PyUnicode_AsString(newline_obj); + if (newline == NULL) + return -1; + } + if (newline && newline[0] != '\0' && !(newline[0] == '\n' && newline[1] == '\0') && !(newline[0] == '\r' && newline[1] == '\0') && !(newline[0] == '\r' && newline[1] == '\n' && newline[2] == '\0')) { PyErr_Format(PyExc_ValueError, - "illegal newline value: %s", newline); + "illegal newline value: %R", newline_obj); return -1; } if (value && value != Py_None && !PyUnicode_Check(value)) { - PyErr_Format(PyExc_ValueError, + PyErr_Format(PyExc_TypeError, "initial_value must be str or None, not %.200s", Py_TYPE(value)->tp_name); return -1; @@ -573,6 +595,9 @@ Py_CLEAR(self->writenl); Py_CLEAR(self->decoder); + assert((newline != NULL && newline_obj != Py_None) || + (newline == NULL && newline_obj == Py_None)); + if (newline) { self->readnl = PyUnicode_FromString(newline); if (self->readnl == NULL) @@ -645,7 +670,136 @@ Py_RETURN_TRUE; } +/* Pickling support. + + The implementation of __getstate__ is similar to the one for BytesIO, + except that we also save the newline parameter. For __setstate__ and unlike + BytesIO, we call __init__ to restore the object's state. Doing so allows us + to avoid decoding the complex newline state while keeping the object + representation compact. + + See comment in bytesio.c regarding why only pickle protocol 2 and onward are + supported. 
+*/ + static PyObject * +stringio_getstate(StringIOObject *self) +{ + PyObject *initvalue = stringio_getvalue(self); + PyObject *dict; + PyObject *state; + + if (initvalue == NULL) + return NULL; + if (self->dict == NULL) { + Py_INCREF(Py_None); + dict = Py_None; + } + else { + dict = PyDict_Copy(self->dict); + if (dict == NULL) + return NULL; + } + + state = Py_BuildValue("(OOnN)", initvalue, + self->readnl ? self->readnl : Py_None, + self->pos, dict); + Py_DECREF(initvalue); + return state; +} + +static PyObject * +stringio_setstate(StringIOObject *self, PyObject *state) +{ + PyObject *initarg; + PyObject *position_obj; + PyObject *dict; + Py_ssize_t pos; + + assert(state != NULL); + CHECK_CLOSED(self); + + /* We allow the state tuple to be longer than 4, because we may need + someday to extend the object's state without breaking + backward-compatibility. */ + if (!PyTuple_Check(state) || Py_SIZE(state) < 4) { + PyErr_Format(PyExc_TypeError, + "%.200s.__setstate__ argument should be 4-tuple, got %.200s", + Py_TYPE(self)->tp_name, Py_TYPE(state)->tp_name); + return NULL; + } + + /* Initialize the object's state. */ + initarg = PyTuple_GetSlice(state, 0, 2); + if (initarg == NULL) + return NULL; + if (stringio_init(self, initarg, NULL) < 0) { + Py_DECREF(initarg); + return NULL; + } + Py_DECREF(initarg); + + /* Restore the buffer state. Even if __init__ did initialize the buffer, + we have to initialize it again since __init__ may translate the + newlines in the initial_value string. We clearly do not want that + because the string value in the state tuple has already been translated + once by __init__. So we do not take any chance and replace the object's + buffer completely. 
*/ + { + Py_UNICODE *buf = PyUnicode_AS_UNICODE(PyTuple_GET_ITEM(state, 0)); + Py_ssize_t bufsize = PyUnicode_GET_SIZE(PyTuple_GET_ITEM(state, 0)); + if (resize_buffer(self, bufsize) < 0) + return NULL; + memcpy(self->buf, buf, bufsize * sizeof(Py_UNICODE)); + self->string_size = bufsize; + } + + /* Set carefully the position value. Alternatively, we could use the seek + method instead of modifying self->pos directly to better protect the + object's internal state against erroneous (or malicious) inputs. */ + position_obj = PyTuple_GET_ITEM(state, 2); + if (!PyLong_Check(position_obj)) { + PyErr_Format(PyExc_TypeError, + "third item of state must be an integer, got %.200s", + Py_TYPE(position_obj)->tp_name); + return NULL; + } + pos = PyLong_AsSsize_t(position_obj); + if (pos == -1 && PyErr_Occurred()) + return NULL; + if (pos < 0) { + PyErr_SetString(PyExc_ValueError, + "position value cannot be negative"); + return NULL; + } + self->pos = pos; + + /* Set the dictionary of the instance variables. */ + dict = PyTuple_GET_ITEM(state, 3); + if (dict != Py_None) { + if (!PyDict_Check(dict)) { + PyErr_Format(PyExc_TypeError, + "fourth item of state should be a dict, got a %.200s", + Py_TYPE(dict)->tp_name); + return NULL; + } + if (self->dict) { + /* Alternatively, we could replace the internal dictionary + completely. However, it seems more practical to just update it. 
*/ + if (PyDict_Update(self->dict, dict) < 0) + return NULL; + } + else { + Py_INCREF(dict); + self->dict = dict; + } + } + + Py_RETURN_NONE; +} + + +static PyObject * stringio_buffer(StringIOObject *self, void *context) { PyErr_SetString(IO_STATE->unsupported_operation, @@ -703,10 +857,13 @@ {"truncate", (PyCFunction)stringio_truncate, METH_VARARGS, stringio_truncate_doc}, {"seek", (PyCFunction)stringio_seek, METH_VARARGS, stringio_seek_doc}, {"write", (PyCFunction)stringio_write, METH_O, stringio_write_doc}, - + {"seekable", (PyCFunction)stringio_seekable, METH_NOARGS}, {"readable", (PyCFunction)stringio_readable, METH_NOARGS}, {"writable", (PyCFunction)stringio_writable, METH_NOARGS}, + + {"__getstate__", (PyCFunction)stringio_getstate, METH_NOARGS}, + {"__setstate__", (PyCFunction)stringio_setstate, METH_O}, {NULL, NULL} /* sentinel */ };