diff -r ee7b713fec71 Doc/library/marshal.rst --- a/Doc/library/marshal.rst Tue Nov 13 09:31:51 2012 +0100 +++ b/Doc/library/marshal.rst Tue Nov 20 16:23:26 2012 +0000 @@ -40,10 +40,11 @@ point numbers, complex numbers, strings, bytes, bytearrays, tuples, lists, sets, frozensets, dictionaries, and code objects, where it should be understood that tuples, lists, sets, frozensets and dictionaries are only supported as long as -the values contained therein are themselves supported; and recursive lists, sets -and dictionaries should not be written (they will cause infinite loops). The +the values contained therein are themselves supported. The singletons :const:`None`, :const:`Ellipsis` and :exc:`StopIteration` can also be marshalled and unmarshalled. +For format *version* lower than 3, recursive lists, sets and dictionaries cannot +be written (see below). There are functions that read/write files as well as functions operating on strings. @@ -103,7 +104,9 @@ Indicates the format that the module uses. Version 0 is the historical format, version 1 shares interned strings and version 2 uses a binary format - for floating point numbers. The current version is 2. + for floating point numbers. + Version 3 adds support for object instancing and recursion. + The current version is 3. .. 
rubric:: Footnotes diff -r ee7b713fec71 Include/marshal.h --- a/Include/marshal.h Tue Nov 13 09:31:51 2012 +0100 +++ b/Include/marshal.h Tue Nov 20 16:23:26 2012 +0000 @@ -7,7 +7,7 @@ extern "C" { #endif -#define Py_MARSHAL_VERSION 2 +#define Py_MARSHAL_VERSION 3 PyAPI_FUNC(void) PyMarshal_WriteLongToFile(long, FILE *, int); PyAPI_FUNC(void) PyMarshal_WriteObjectToFile(PyObject *, FILE *, int); diff -r ee7b713fec71 Lib/test/test_marshal.py --- a/Lib/test/test_marshal.py Tue Nov 13 09:31:51 2012 +0100 +++ b/Lib/test/test_marshal.py Tue Nov 20 16:23:26 2012 +0000 @@ -200,8 +200,12 @@ except Exception: pass + def test_loads_2x_code(self): + s = b'c' + (b'X' * 4*4) + b'{' * 2**20 + self.assertRaises(ValueError, marshal.loads, s) + def test_loads_recursion(self): - s = b'c' + (b'X' * 4*4) + b'{' * 2**20 + s = b'c' + (b'X' * 4*5) + b'{' * 2**20 self.assertRaises(ValueError, marshal.loads, s) def test_recursion_limit(self): @@ -279,6 +283,122 @@ unicode_string = 'T' self.assertRaises(TypeError, marshal.loads, unicode_string) +def CollectObjectIDs(ids, obj): + """Collect object ids seen in a structure""" + if id(obj) in ids: + return + ids.add(id(obj)) + if isinstance(obj, (list, tuple, set, frozenset)): + for e in obj: + CollectObjectIDs(ids, e) + elif isinstance(obj, dict): + for k, v in obj.items(): + CollectObjectIDs(ids, k) + CollectObjectIDs(ids, v) + return len(ids) + +class InstancingTestCase(unittest.TestCase, HelperMixin): + intobj = 123321 + floatobj = 1.2345 + strobj = "abcde"*3 + dictobj = {"hello":floatobj, "goodbye":floatobj, floatobj:"hello"} + + def helper3(self, rsample, recursive=False, simple=False): + #we have two instances + sample = (rsample, rsample) + + n0 = CollectObjectIDs(set(), sample) + + s3 = marshal.dumps(sample, 3) + n3 = CollectObjectIDs(set(), marshal.loads(s3)) + + #same number of instances generated + self.assertEqual(n3, n0) + + if not recursive: + #can compare with version 2 + s2 = marshal.dumps(sample, 2) + n2 = 
CollectObjectIDs(set(), marshal.loads(s2)) + #old format generated more instances + self.assertGreater(n2, n0) + + #if complex objects are in there, old format is larger + if not simple: + self.assertGreater(len(s2), len(s3)) + else: + self.assertGreaterEqual(len(s2), len(s3)) + + def testInt(self): + self.helper(self.intobj) + self.helper3(self.intobj, simple=True) + + def testFloat(self): + self.helper(self.floatobj) + self.helper3(self.floatobj) + + def testStr(self): + self.helper(self.strobj) + self.helper3(self.strobj) + + def testDict(self): + self.helper(self.dictobj) + self.helper3(self.dictobj) + + def testModule(self): + with open(__file__, "rb") as f: + code = f.read() + if __file__.endswith(".py"): + code = compile(code, __file__, "exec") + self.helper(code) + self.helper3(code) + + def testRecursion(self): + d = dict(self.dictobj) + d["self"] = d + self.helper3(d, recursive=True) + l = [self.dictobj] + l.append(l) + self.helper3(l, recursive=True) + +class CompatibilityTestCase(unittest.TestCase): + def _test(self, version): + with open(__file__, "rb") as f: + code = f.read() + if __file__.endswith(".py"): + code = compile(code, __file__, "exec") + data = marshal.dumps(code, version) + marshal.loads(data) + + def test0To3(self): + self._test(0) + + def test1To3(self): + self._test(1) + + def test2To3(self): + self._test(2) + + def test3To3(self): + self._test(3) + +class InterningTestCase(unittest.TestCase, HelperMixin): + strobj = "this is an interned string" + strobj = sys.intern(strobj) + + def testIntern(self): + s = marshal.loads(marshal.dumps(self.strobj)) + self.assertEqual(s, self.strobj) + self.assertEqual(id(s), id(self.strobj)) + s2 = sys.intern(s) + self.assertEqual(id(s2), id(s)) + + def testNoIntern(self): + s = marshal.loads(marshal.dumps(self.strobj, 2)) + self.assertEqual(s, self.strobj) + self.assertNotEqual(id(s), id(self.strobj)) + s2 = sys.intern(s) + self.assertNotEqual(id(s2), id(s)) + def test_main(): 
support.run_unittest(IntTestCase, @@ -288,7 +408,10 @@ ContainerTestCase, ExceptionTestCase, BufferTestCase, - BugsTestCase) + BugsTestCase, + InstancingTestCase, + InterningTestCase, + CompatibilityTestCase) if __name__ == "__main__": test_main() diff -r ee7b713fec71 Python/marshal.c --- a/Python/marshal.c Tue Nov 13 09:31:51 2012 +0100 +++ b/Python/marshal.c Tue Nov 20 16:23:26 2012 +0000 @@ -1,8 +1,10 @@ /* Write Python objects to files and read them back. - This is intended for writing and reading compiled Python code only; - a true persistent storage facility would be much harder, since - it would have to take circular links and sharing into account. */ + This is primarily intended for writing and reading compiled Python code, + even though dicts, lists, sets and frozensets, not commonly seen in + code objects, are supported. + Version 3 of this protocol properly supports circular links + and sharing. */ #define PY_SSIZE_T_CLEAN @@ -41,6 +43,8 @@ #define TYPE_BINARY_COMPLEX 'y' #define TYPE_LONG 'l' #define TYPE_STRING 's' +#define TYPE_INTERNED 't' +#define TYPE_REF 'r' #define TYPE_TUPLE '(' #define TYPE_LIST '[' #define TYPE_DICT '{' @@ -49,6 +53,7 @@ #define TYPE_UNKNOWN '?' 
#define TYPE_SET '<' #define TYPE_FROZENSET '>' +#define FLAG_REF '\x80' /* with a type, add obj to index */ #define WFERR_OK 0 #define WFERR_UNMARSHALLABLE 1 @@ -65,6 +70,7 @@ PyObject *current_filename; char *ptr; char *end; + PyObject *refs; /* dict on marshal, list on unmarshal */ int version; } WFILE; @@ -136,13 +142,17 @@ #endif #define PyLong_MARSHAL_RATIO (PyLong_SHIFT / PyLong_MARSHAL_SHIFT) +#define W_TYPE(t, p) do { \ + w_byte((t) | flag, (p)); \ +} while(0) + static void -w_PyLong(const PyLongObject *ob, WFILE *p) +w_PyLong(const PyLongObject *ob, char flag, WFILE *p) { Py_ssize_t i, j, n, l; digit d; - w_byte(TYPE_LONG, p); + W_TYPE(TYPE_LONG, p); if (Py_SIZE(ob) == 0) { w_long((long)0, p); return; @@ -174,10 +184,64 @@ } while (d != 0); } +static int +w_ref(PyObject *v, char *flag, WFILE *p) +{ + PyObject *id; + PyObject *idx; + + if (p->version < 3 || p->refs == NULL) + return 0; /* not writing object references */ + + /* if it has only one reference, it definitely isn't shared */ + if (Py_REFCNT(v) == 1) + return 0; + + id = PyLong_FromVoidPtr((void*)v); + if (id == NULL) + goto err; + idx = PyDict_GetItem(p->refs, id); + if (idx != NULL) { + /* write the reference index to the stream */ + long w = PyLong_AsLong(idx); + Py_DECREF(id); + if (w == -1 && PyErr_Occurred()) { + goto err; + } + /* we don't store "long" indices in the dict */ + assert(0 <= w && w <= 0x7fffffff); + w_byte(TYPE_REF, p); + w_long(w, p); + return 1; + } else { + int ok; + Py_ssize_t s = PyDict_Size(p->refs); + /* we don't support long indices */ + if (s >= 0x7fffffff) { + PyErr_SetString(PyExc_ValueError, "too many objects"); + goto err; + } + idx = PyLong_FromSsize_t(s); + ok = idx && PyDict_SetItem(p->refs, id, idx) == 0; + Py_DECREF(id); + Py_XDECREF(idx); + if (!ok) + goto err; + *flag |= FLAG_REF; + return 0; + } +err: + p->error = WFERR_UNMARSHALLABLE; + return 1; +} + +static void +w_complex_object(PyObject *v, char flag, WFILE *p); + static void w_object(PyObject *v, 
WFILE *p) { - Py_ssize_t i, n; + char flag = '\0'; p->depth++; @@ -202,24 +266,35 @@ else if (v == Py_True) { w_byte(TYPE_TRUE, p); } - else if (PyLong_CheckExact(v)) { + else if (!w_ref(v, &flag, p)) + w_complex_object(v, flag, p); + + p->depth--; +} + +static void +w_complex_object(PyObject *v, char flag, WFILE *p) +{ + Py_ssize_t i, n; + + if (PyLong_CheckExact(v)) { long x = PyLong_AsLong(v); if ((x == -1) && PyErr_Occurred()) { PyLongObject *ob = (PyLongObject *)v; PyErr_Clear(); - w_PyLong(ob, p); + w_PyLong(ob, flag, p); } else { #if SIZEOF_LONG > 4 long y = Py_ARITHMETIC_RIGHT_SHIFT(long, x, 31); if (y && y != -1) { /* Too large for TYPE_INT */ - w_PyLong((PyLongObject*)v, p); + w_PyLong((PyLongObject*)v, flag, p); } else #endif { - w_byte(TYPE_INT, p); + W_TYPE(TYPE_INT, p); w_long(x, p); } } @@ -232,7 +307,7 @@ p->error = WFERR_UNMARSHALLABLE; return; } - w_byte(TYPE_BINARY_FLOAT, p); + W_TYPE(TYPE_BINARY_FLOAT, p); w_string((char*)buf, 8, p); } else { @@ -243,7 +318,7 @@ return; } n = strlen(buf); - w_byte(TYPE_FLOAT, p); + W_TYPE(TYPE_FLOAT, p); w_byte((int)n, p); w_string(buf, (int)n, p); PyMem_Free(buf); @@ -257,7 +332,7 @@ p->error = WFERR_UNMARSHALLABLE; return; } - w_byte(TYPE_BINARY_COMPLEX, p); + W_TYPE(TYPE_BINARY_COMPLEX, p); w_string((char*)buf, 8, p); if (_PyFloat_Pack8(PyComplex_ImagAsDouble(v), buf, 1) < 0) { @@ -268,7 +343,7 @@ } else { char *buf; - w_byte(TYPE_COMPLEX, p); + W_TYPE(TYPE_COMPLEX, p); buf = PyOS_double_to_string(PyComplex_RealAsDouble(v), 'g', 17, 0, NULL); if (!buf) { @@ -292,7 +367,7 @@ } } else if (PyBytes_CheckExact(v)) { - w_byte(TYPE_STRING, p); + W_TYPE(TYPE_STRING, p); n = PyBytes_GET_SIZE(v); if (n > INT_MAX) { /* huge strings are not supported */ @@ -311,7 +386,10 @@ p->error = WFERR_UNMARSHALLABLE; return; } - w_byte(TYPE_UNICODE, p); + if (p->version >= 3 && PyUnicode_CHECK_INTERNED(v)) + W_TYPE(TYPE_INTERNED, p); + else + W_TYPE(TYPE_UNICODE, p); n = PyBytes_GET_SIZE(utf8); if (n > INT_MAX) { p->depth--; @@ 
-323,7 +401,7 @@ Py_DECREF(utf8); } else if (PyTuple_CheckExact(v)) { - w_byte(TYPE_TUPLE, p); + W_TYPE(TYPE_TUPLE, p); n = PyTuple_Size(v); w_long((long)n, p); for (i = 0; i < n; i++) { @@ -331,7 +409,7 @@ } } else if (PyList_CheckExact(v)) { - w_byte(TYPE_LIST, p); + W_TYPE(TYPE_LIST, p); n = PyList_GET_SIZE(v); w_long((long)n, p); for (i = 0; i < n; i++) { @@ -341,7 +419,7 @@ else if (PyDict_CheckExact(v)) { Py_ssize_t pos; PyObject *key, *value; - w_byte(TYPE_DICT, p); + W_TYPE(TYPE_DICT, p); /* This one is NULL object terminated! */ pos = 0; while (PyDict_Next(v, &pos, &key, &value)) { @@ -354,9 +432,9 @@ PyObject *value, *it; if (PyObject_TypeCheck(v, &PySet_Type)) - w_byte(TYPE_SET, p); + W_TYPE(TYPE_SET, p); else - w_byte(TYPE_FROZENSET, p); + W_TYPE(TYPE_FROZENSET, p); n = PyObject_Size(v); if (n == -1) { p->depth--; @@ -383,7 +461,7 @@ } else if (PyCode_Check(v)) { PyCodeObject *co = (PyCodeObject *)v; - w_byte(TYPE_CODE, p); + W_TYPE(TYPE_CODE, p); w_long(co->co_argcount, p); w_long(co->co_kwonlyargcount, p); w_long(co->co_nlocals, p); @@ -410,7 +488,7 @@ p->error = WFERR_UNMARSHALLABLE; return; } - w_byte(TYPE_STRING, p); + W_TYPE(TYPE_STRING, p); n = view.len; s = view.buf; if (n > INT_MAX) { @@ -423,10 +501,9 @@ PyBuffer_Release(&view); } else { - w_byte(TYPE_UNKNOWN, p); + W_TYPE(TYPE_UNKNOWN, p); p->error = WFERR_UNMARSHALLABLE; } - p->depth--; } /* version currently has no effect for writing longs. 
*/ @@ -437,6 +514,7 @@ wf.fp = fp; wf.error = WFERR_OK; wf.depth = 0; + wf.refs = NULL; wf.version = version; w_long(x, &wf); } @@ -448,8 +526,14 @@ wf.fp = fp; wf.error = WFERR_OK; wf.depth = 0; + if (version >= 3) { + if ((wf.refs = PyDict_New()) == NULL) + return; /* caller must check PyErr_Occurred() */ + } else + wf.refs = NULL; wf.version = version; w_object(x, &wf); + Py_XDECREF(wf.refs); } typedef WFILE RFILE; /* Same struct with different invariants */ @@ -485,7 +569,7 @@ data->ob_type->tp_name); } else { - read = PyBytes_GET_SIZE(data); + read = (int)PyBytes_GET_SIZE(data); if (read > 0) { ptr = PyBytes_AS_STRING(data); memcpy(s, ptr, read); @@ -655,6 +739,59 @@ return NULL; } +/* allocate the reflist index */ +static PyObject * +r_ref_reserve(PyObject *o, Py_ssize_t *idx, int flag, RFILE *p) +{ + if (flag) { /* currently only FLAG_REF is defined */ + *idx = PyList_Size(p->refs); + if (*idx < 0) + goto err; + if (*idx >= 0x7ffffffe) { + PyErr_SetString(PyExc_ValueError, "bad marshal data (index list too large)"); + goto err; + } + if (PyList_Append(p->refs, Py_None) < 0) + goto err; + } else + *idx = 0; + return o; +err: + Py_XDECREF(o); /* release the new object */ + *idx = -1; + return NULL; +} + +/* insert actual object to the reflist */ +static PyObject * +r_ref_insert(PyObject *o, Py_ssize_t idx, int flag, RFILE *p) +{ + if (o && (flag & FLAG_REF)) { + if (PyList_SetItem(p->refs, idx, o) < 0) { + Py_DECREF(o); /* release the new object */ + return NULL; + } else { + Py_INCREF(o); /* a reference for the list */ + } + } + return o; +} + +/* combination of both above, used when an object can be + * created whenever it is seen in the file, as opposed to + * after having loaded its sub-objects. 
+ */ +static PyObject * +r_ref(PyObject *o, int flag, RFILE *p) +{ + if (o && (flag & FLAG_REF)) { + if (PyList_Append(p->refs, o) < 0) { + Py_DECREF(o); /* release the new object */ + return NULL; + } + } + return o; +} static PyObject * r_object(RFILE *p) @@ -662,8 +799,10 @@ /* NULL is a valid return value, it does not necessarily means that an exception is set. */ PyObject *v, *v2; + Py_ssize_t idx; long i, n; int type = r_byte(p); + int flag; PyObject *retval; p->depth++; @@ -674,6 +813,13 @@ return NULL; } + flag = type & FLAG_REF; + type = type & ~FLAG_REF; + +#define R_REF(O) do{\ + O = r_ref(O, flag, p);\ +} while (0) + switch (type) { case EOF: @@ -714,14 +860,17 @@ case TYPE_INT: n = r_long(p); retval = PyErr_Occurred() ? NULL : PyLong_FromLong(n); + R_REF(retval); break; case TYPE_INT64: retval = r_long64(p); + R_REF(retval); break; case TYPE_LONG: retval = r_PyLong(p); + R_REF(retval); break; case TYPE_FLOAT: @@ -740,6 +889,7 @@ if (dx == -1.0 && PyErr_Occurred()) break; retval = PyFloat_FromDouble(dx); + R_REF(retval); break; } @@ -759,6 +909,7 @@ break; } retval = PyFloat_FromDouble(x); + R_REF(retval); break; } @@ -788,6 +939,7 @@ if (c.imag == -1.0 && PyErr_Occurred()) break; retval = PyComplex_FromCComplex(c); + R_REF(retval); break; } @@ -818,6 +970,7 @@ break; } retval = PyComplex_FromCComplex(c); + R_REF(retval); break; } @@ -845,9 +998,11 @@ break; } retval = v; + R_REF(retval); break; case TYPE_UNICODE: + case TYPE_INTERNED: { char *buffer; @@ -875,7 +1030,10 @@ } v = PyUnicode_DecodeUTF8(buffer, n, "surrogatepass"); PyMem_DEL(buffer); + if (type == TYPE_INTERNED) + PyUnicode_InternInPlace(&v); retval = v; + R_REF(retval); break; } @@ -891,6 +1049,7 @@ break; } v = PyTuple_New((int)n); + R_REF(v); if (v == NULL) { retval = NULL; break; @@ -922,6 +1081,7 @@ break; } v = PyList_New((int)n); + R_REF(v); if (v == NULL) { retval = NULL; break; @@ -943,6 +1103,7 @@ case TYPE_DICT: v = PyDict_New(); + R_REF(v); if (v == NULL) { retval = NULL; break; 
@@ -978,6 +1139,13 @@ break; } v = (type == TYPE_SET) ? PySet_New(NULL) : PyFrozenSet_New(NULL); + /* must use delayed registration of frozensets because they must + * be init with a refcount of 1 + */ + if (type == TYPE_SET) + R_REF(v); + else + v = r_ref_reserve(v, &idx, flag, p); if (v == NULL) { retval = NULL; break; @@ -1000,6 +1168,8 @@ } Py_DECREF(v2); } + if (type != TYPE_SET) + v = r_ref_insert(v, idx, flag, p); retval = v; break; @@ -1020,6 +1190,12 @@ PyObject *name = NULL; int firstlineno; PyObject *lnotab = NULL; + + r_ref_reserve(NULL, &idx, flag, p); + if (idx < 0) { + retval = NULL; + break; + } v = NULL; @@ -1086,6 +1262,7 @@ code, consts, names, varnames, freevars, cellvars, filename, name, firstlineno, lnotab); + v = r_ref_insert(v, idx, flag, p); code_error: Py_XDECREF(code); @@ -1101,6 +1278,23 @@ retval = v; break; + case TYPE_REF: + n = r_long(p); + if (n < 0 || n >= PyList_GET_SIZE(p->refs)) { + PyErr_SetString(PyExc_ValueError, "bad marshal data (invalid reference)"); + retval = NULL; + break; + } + v = PyList_GET_ITEM(p->refs, n); + if (v == Py_None) { + PyErr_SetString(PyExc_ValueError, "bad marshal data (invalid reference)"); + retval = NULL; + break; + } + Py_INCREF(v); + retval = v; + break; + default: /* Bogus data got written, which isn't ideal. This will let you keep working and recover. 
*/ @@ -1210,7 +1404,11 @@ rf.current_filename = NULL; rf.depth = 0; rf.ptr = rf.end = NULL; + rf.refs = PyList_New(0); + if (rf.refs == NULL) + return NULL; result = r_object(&rf); + Py_DECREF(rf.refs); return result; } @@ -1225,7 +1423,11 @@ rf.ptr = str; rf.end = str + len; rf.depth = 0; + rf.refs = PyList_New(0); + if (rf.refs == NULL) + return NULL; result = r_object(&rf); + Py_DECREF(rf.refs); return result; } @@ -1244,7 +1446,13 @@ wf.error = WFERR_OK; wf.depth = 0; wf.version = version; + if (version >= 3) { + if ((wf.refs = PyDict_New()) == NULL) + return NULL; + } else + wf.refs = NULL; w_object(x, &wf); + Py_XDECREF(wf.refs); if (wf.str != NULL) { char *base = PyBytes_AS_STRING((PyBytesObject *)wf.str); if (wf.ptr - base > PY_SSIZE_T_MAX) { @@ -1316,6 +1524,8 @@ * Make a call to the read method, but read zero bytes. * This is to ensure that the object passed in at least * has a read method which returns bytes. + * This can be removed if we guarantee good error handling + * for r_string() */ data = _PyObject_CallMethodId(f, &PyId_read, "i", 0); if (data == NULL) @@ -1331,7 +1541,11 @@ rf.fp = NULL; rf.readable = f; rf.current_filename = NULL; - result = read_object(&rf); + if ((rf.refs = PyList_New(0)) != NULL) { + result = read_object(&rf); + Py_DECREF(rf.refs); + } else + result = NULL; } Py_DECREF(data); return result; @@ -1388,8 +1602,11 @@ rf.ptr = s; rf.end = s + n; rf.depth = 0; + if ((rf.refs = PyList_New(0)) == NULL) + return NULL; result = read_object(&rf); PyBuffer_Release(&p); + Py_DECREF(rf.refs); return result; } @@ -1429,6 +1646,7 @@ version -- indicates the format that the module uses. Version 0 is the\n\ historical format, version 1 shares interned strings and version 2\n\ uses a binary format for floating point numbers.\n\ + Version 3 shares common object references (New in version 3.4).\n\ \n\ Functions:\n\ \n\