Unpickler read-ahead (prefetch) for seekable input streams.

When the input stream has a seekable() method that returns true, the
Unpickler keeps a reference to its seek() method.  _Unpickler_Read() then
asks the stream for at least PREFETCH bytes at a time and serves later
requests from its buffer.  Bytes that were read ahead but not consumed are
handed back to the stream with a relative seek() before the next read() or
readline() call and once the opcode loop has finished.  Streams without a
true seekable() are read without prefetching.

Review changes folded into this revision of the patch:
 * _Unpickler_Read() and _Unpickler_Readline() propagate a failure of
   _Unpickler_UndoReadahead() instead of ignoring its return value.
 * The set-but-never-read local 'readahead' in _Unpickler_Readline() is
   removed.
 * _Unpickler_UndoReadahead() gets a header comment describing its contract.

Index: Lib/test/pickletester.py
===================================================================
--- Lib/test/pickletester.py	(révision 84996)
+++ Lib/test/pickletester.py	(copie de travail)
@@ -30,6 +30,18 @@
             n += 1
     return n
 
+
+class UnseekableIO(io.BytesIO):
+    def seekable(self):
+        return False
+
+    def seek(self, *args):
+        raise io.UnsupportedOperation
+
+    def tell(self):
+        raise io.UnsupportedOperation
+
+
 # We can't very well test the extension registry without putting known stuff
 # in it, but we have to be careful to restore its original state.  Code
 # should do this:
@@ -1072,9 +1084,10 @@
         # Test the correctness of internal buffering routines when handling
         # large data.
         for proto in protocols:
-            data = (1, b'x' * (256 * 1024))
+            data = (1, b'x\n' * (256 * 1024))
             dumped = self.dumps(data, proto)
             loaded = self.loads(dumped)
+            self.assertEqual(len(loaded), len(data))
             self.assertEqual(loaded, data)
 
 
@@ -1373,7 +1386,32 @@
         f.seek(0)
         self.assertEqual(unpickler.load(), data2)
 
+    def _check_multiple_unpicklings(self, ioclass):
+        for proto in protocols:
+            data1 = [(x, str(x)) for x in range(2000)] + [b"abcde"]
+            f = ioclass()
+            pickler = self.pickler_class(f, protocol=proto)
+            pickler.dump(data1)
+            pickled = f.getvalue()
+            N = 5
+            f = ioclass(pickled * N)
+            unpickler = self.unpickler_class(f)
+            for i in range(N):
+                if f.seekable():
+                    pos = f.tell()
+                self.assertEqual(unpickler.load(), data1)
+                if f.seekable():
+                    self.assertEqual(f.tell(), pos + len(pickled))
+            self.assertRaises(EOFError, unpickler.load)
+
+    def test_multiple_unpicklings_seekable(self):
+        self._check_multiple_unpicklings(io.BytesIO)
+
+    def test_multiple_unpicklings_unseekable(self):
+        self._check_multiple_unpicklings(UnseekableIO)
+
 
+
 if __name__ == "__main__":
     # Print some stuff that can be used to rewrite DATA{0,1,2}
     from pickletools import dis
Index: Modules/_pickle.c
===================================================================
--- Modules/_pickle.c	(révision 84996)
+++ Modules/_pickle.c	(copie de travail)
@@ -101,6 +101,9 @@
     /* Maximum size of the write buffer of Pickler when pickling to a
        stream.  This is ignored for in-memory pickling. */
     MAX_WRITE_BUF_SIZE = 64 * 1024,
+
+    /* Prefetch size when unpickling (disabled on unseekable streams) */
+    PREFETCH = 4096,
 };
 
 /* Exception classes for pickle. These should override the ones defined in
@@ -357,6 +360,7 @@
     Py_ssize_t next_read_idx;
     PyObject *read;             /* read() method of the input stream. */
     PyObject *readline;         /* readline() method of the input stream. */
+    PyObject *seek;             /* seek() method of the input stream, or NULL */
 
     char *encoding;             /* Name of the encoding to be used for
                                    decoding strings pickled using Python
@@ -862,6 +866,26 @@
     return self->input_len;
 }
 
+/* Seek the input stream back over the bytes that were read ahead into the
+   input buffer but not consumed yet, then mark the buffer as fully consumed.
+   Callers check self->seek before calling this.
+   Return 0 on success, -1 (with an exception set) if the seek() call fails. */
+static int
+_Unpickler_UndoReadahead(UnpicklerObject *self)
+{
+    Py_ssize_t readahead = self->input_len - self->next_read_idx;
+    if (readahead > 0) {
+        PyObject *r;
+        r = PyObject_CallFunction(self->seek, "ni", -readahead, 1);
+        if (r == NULL)
+            return -1;
+        Py_DECREF(r);
+        /* Invalidate buffer */
+        self->next_read_idx = self->input_len;
+    }
+    return 0;
+}
+
 static const Py_ssize_t READ_WHOLE_LINE = -1;
 
 /* If reading from a file, we need to only pull the bytes we need, since there
@@ -885,7 +909,6 @@
     Py_ssize_t read_size;
 
     assert(self->read != NULL);
-    assert(self->next_read_idx == 0);
 
     if (n == READ_WHOLE_LINE)
         data = PyObject_Call(self->readline, empty_tuple, NULL);
@@ -900,7 +923,6 @@
         return -1;
 
     read_size = _Unpickler_SetStringInput(self, data);
-    self->input_len = 0;
     Py_DECREF(data);
     return read_size;
 }
@@ -921,30 +943,42 @@
 static Py_ssize_t
 _Unpickler_Read(UnpicklerObject *self, char **s, Py_ssize_t n)
 {
+    Py_ssize_t num_read, readahead, to_read;
+
     if (n == 0) {
         *s = NULL;
         return 0;
     }
 
-    /* This condition will always be true if self->read. */
-    if (self->next_read_idx + n > self->input_len) {
-        if (self->read) {
-            Py_ssize_t num_read;
-            assert(self->next_read_idx == self->input_len);
-            num_read = _Unpickler_ReadFromFile(self, n);
-            if (n < 0)
-                return -1;
-            if (num_read == n) {
-                *s = self->input_buffer;
-                return num_read;
-            }
-        }
+    readahead = self->input_len - self->next_read_idx;
+    if (n <= readahead) {
+        *s = self->input_buffer + self->next_read_idx;
+        self->next_read_idx += n;
+        return n;
+    }
+    if (!self->read) {
         PyErr_Format(PyExc_EOFError, "Ran out of input");
         return -1;
     }
-    assert(self->read == NULL);
-    *s = self->input_buffer + self->next_read_idx;
-    self->next_read_idx += n;
+    to_read = n;
+    if (self->seek) {
+        if (_Unpickler_UndoReadahead(self) < 0)
+            return -1;
+        if (to_read < PREFETCH)
+            to_read = PREFETCH;
+    }
+    else {
+        assert(readahead == 0);
+    }
+    num_read = _Unpickler_ReadFromFile(self, to_read);
+    if (num_read < 0)
+        return -1;
+    if (num_read < n) {
+        PyErr_Format(PyExc_EOFError, "Ran out of input");
+        return -1;
+    }
+    *s = self->input_buffer;
+    self->next_read_idx = n;
     return n;
 }
 
@@ -972,9 +1006,7 @@
 {
     Py_ssize_t i, num_read;
 
-    /* This loop will never be entered if self->read is not NULL. */
     for (i = self->next_read_idx; i < self->input_len; i++) {
-        assert(self->read == NULL);
         if (self->input_buffer[i] == '\n') {
             char *line_start = self->input_buffer + self->next_read_idx;
             num_read = i - self->next_read_idx + 1;
@@ -983,11 +1015,18 @@
         }
     }
     if (self->read) {
-        assert(self->next_read_idx == self->input_len);
+        if (self->seek) {
+            if (_Unpickler_UndoReadahead(self) < 0)
+                return -1;
+        }
+        else {
+            assert(self->next_read_idx == self->input_len);
+        }
         num_read = _Unpickler_ReadFromFile(self, READ_WHOLE_LINE);
         if (num_read < 0)
             return -1;
         *result = self->input_buffer;
+        self->next_read_idx = num_read;
         return num_read;
     }
 
@@ -1108,6 +1147,7 @@
     self->next_read_idx = 0;
     self->read = NULL;
     self->readline = NULL;
+    self->seek = NULL;
     self->encoding = NULL;
     self->errors = NULL;
     self->marks = NULL;
@@ -1124,6 +1164,28 @@
 static int
 _Unpickler_SetInputStream(UnpicklerObject *self, PyObject *file)
 {
+    PyObject *r;
+    int seekable = 0;
+
+    if (PyObject_HasAttrString(file, "seekable")) {
+        r = PyObject_CallMethod(file, "seekable", "", NULL);
+        if (r == NULL)
+            return -1;
+        seekable = PyObject_IsTrue(r);
+        Py_DECREF(r);
+        if (seekable == -1)
+            return -1;
+    }
+    if (seekable) {
+        self->seek = PyObject_GetAttrString(file, "seek");
+        if (self->seek == NULL) {
+            if (PyErr_ExceptionMatches(PyExc_AttributeError))
+                PyErr_SetString(PyExc_TypeError,
+                                "seekable() returned True but file doesn't "
+                                "have a 'seek' attribute");
+            return -1;
+        }
+    }
     self->read = PyObject_GetAttrString(file, "read");
     self->readline = PyObject_GetAttrString(file, "readline");
     if (self->readline == NULL || self->read == NULL) {
@@ -1132,6 +1194,7 @@
                             "file must have 'read' and 'readline' attributes");
         Py_CLEAR(self->read);
         Py_CLEAR(self->readline);
+        Py_CLEAR(self->seek);
         return -1;
     }
     return 0;
@@ -5207,6 +5270,9 @@
         break;                  /* and we are done! */
     }
 
+    if (self->seek && _Unpickler_UndoReadahead(self) < 0)
+        return NULL;
+
     /* XXX: It is not clear what this is actually for. */
     if ((err = PyErr_Occurred())) {
         if (err == PyExc_EOFError) {
@@ -5356,6 +5422,7 @@
     PyObject_GC_UnTrack((PyObject *)self);
     Py_XDECREF(self->readline);
     Py_XDECREF(self->read);
+    Py_XDECREF(self->seek);
     Py_XDECREF(self->stack);
     Py_XDECREF(self->pers_func);
     Py_XDECREF(self->arg);
@@ -5378,6 +5445,7 @@
 {
     Py_VISIT(self->readline);
     Py_VISIT(self->read);
+    Py_VISIT(self->seek);
     Py_VISIT(self->stack);
     Py_VISIT(self->pers_func);
     Py_VISIT(self->arg);
@@ -5389,6 +5457,7 @@
 {
     Py_CLEAR(self->readline);
     Py_CLEAR(self->read);
+    Py_CLEAR(self->seek);
     Py_CLEAR(self->stack);
     Py_CLEAR(self->pers_func);
     Py_CLEAR(self->arg);