diff -r 0c508d87f80b Lib/pickle.py --- a/Lib/pickle.py Fri Dec 06 17:25:51 2013 +0200 +++ b/Lib/pickle.py Fri Dec 06 21:46:07 2013 +0100 @@ -991,7 +991,8 @@ map the old Python 2.x names to the new names used in Python 3.x. The *encoding* and *errors* tell pickle how to decode 8-bit string instances pickled by Python 2.x; these default to 'ASCII' and - 'strict', respectively. + 'strict', respectively. *encoding* can be 'bytes' to read 8-bit string + instances as bytes objects. """ self._file_readline = file.readline self._file_read = file.read @@ -1139,6 +1140,12 @@ self.append(unpack('>d', self.read(8))[0]) dispatch[BINFLOAT[0]] = load_binfloat + def decode_string(self, value): + if self.encoding == "bytes": + return value + else: + return value.decode(self.encoding, self.errors) + def load_string(self): data = self.readline()[:-1] # Strip outermost quotes @@ -1146,8 +1153,7 @@ data = data[1:-1] else: raise UnpicklingError("the STRING opcode argument must be quoted") - self.append(codecs.escape_decode(data)[0] - .decode(self.encoding, self.errors)) + self.append(self.decode_string(codecs.escape_decode(data)[0])) dispatch[STRING[0]] = load_string def load_binstring(self): @@ -1156,8 +1162,7 @@ if len < 0: raise UnpicklingError("BINSTRING pickle has negative byte count") data = self.read(len) - value = str(data, self.encoding, self.errors) - self.append(value) + self.append(self.decode_string(data)) dispatch[BINSTRING[0]] = load_binstring def load_binbytes(self): @@ -1191,8 +1196,7 @@ def load_short_binstring(self): len = self.read(1)[0] data = self.read(len) - value = str(data, self.encoding, self.errors) - self.append(value) + self.append(self.decode_string(data)) dispatch[SHORT_BINSTRING[0]] = load_short_binstring def load_short_binbytes(self): diff -r 0c508d87f80b Lib/test/pickletester.py --- a/Lib/test/pickletester.py Fri Dec 06 17:25:51 2013 +0200 +++ b/Lib/test/pickletester.py Fri Dec 06 21:46:07 2013 +0100 @@ -1602,6 +1602,59 @@ unpickled = 
self.loads(self.dumps(method, proto)) self.assertEqual(method(*args), unpickled(*args)) +class AbstractBytestrTests(unittest.TestCase): + def unpickleEqual(self, data, unpickled): + loaded = self.loads(data, encoding="bytes") + self.assertEqual(loaded, unpickled) + + def test_load_str_protocol_0(self): + """ Test str from protocol=0 + python 2: pickle.dumps('bytestring \x00\xa0', protocol=0) """ + self.unpickleEqual( + b"S'bytestring \\x00\\xa0'\np0\n.", + b'bytestring \x00\xa0') + + def test_load_str_protocol_1(self): + """ Test str from protocol=1 + python 2: pickle.dumps('bytestring \x00\xa0', protocol=1) """ + self.unpickleEqual( + b'U\rbytestring \x00\xa0q\x00.', + b'bytestring \x00\xa0') + + def test_load_str_protocol_2(self): + """ Test str from protocol=2 + python 2: pickle.dumps('bytestring \x00\xa0', protocol=2) """ + self.unpickleEqual( + b'\x80\x02U\rbytestring \x00\xa0q\x00.', + b'bytestring \x00\xa0') + + def test_load_unicode_protocol_0(self): + """ Test unicode with protocol=0 + python 2: pickle.dumps(u"\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440", protocol=0) """ + self.unpickleEqual( + b'V\\u041a\\u043e\\u043c\\u043f\\u044c\\u044e\\u0442\\u0435\\u0440\np0\n.', + '\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440') + + def test_load_unicode_protocol_1(self): + """ Test unicode with protocol=1 + python 2: pickle.dumps(u"\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440", protocol=1) """ + self.unpickleEqual( + b'X\x12\x00\x00\x00\xd0\x9a\xd0\xbe\xd0\xbc\xd0\xbf\xd1\x8c\xd1\x8e\xd1\x82\xd0\xb5\xd1\x80q\x00.', + '\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440') + + def test_load_unicode_protocol_2(self): + """ Test unicode with protocol=2 + python 2: pickle.dumps(u"\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440", protocol=2) """ + self.unpickleEqual( + b'\x80\x02X\x12\x00\x00\x00\xd0\x9a\xd0\xbe\xd0\xbc\xd0\xbf\xd1\x8c\xd1\x8e\xd1\x82\xd0\xb5\xd1\x80q\x00.', + 
'\u041a\u043e\u043c\u043f\u044c\u044e\u0442\u0435\u0440') + + def test_load_long_str_protocol_1(self): + """ Test long str with protocol=1 + python 2: pickle.dumps('x'*300, protocol=1) """ + self.unpickleEqual( + b'T,\x01\x00\x00xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxq\x00.', + b'x'*300) class BigmemPickleTests(unittest.TestCase): diff -r 0c508d87f80b Lib/test/test_pickle.py --- a/Lib/test/test_pickle.py Fri Dec 06 17:25:51 2013 +0200 +++ b/Lib/test/test_pickle.py Fri Dec 06 21:46:07 2013 +0100 @@ -10,6 +10,7 @@ from test.pickletester import AbstractPicklerUnpicklerObjectTests from test.pickletester import AbstractDispatchTableTests from test.pickletester import BigmemPickleTests +from test.pickletester import AbstractBytestrTests try: import _pickle @@ -21,15 +22,13 @@ class PickleTests(AbstractPickleModuleTests): pass - -class PyPicklerTests(AbstractPickleTests): - +class PyPicklerBase: pickler = pickle._Pickler unpickler = pickle._Unpickler - def dumps(self, arg, proto=None): + def dumps(self, arg, proto=None, **kwds): f = io.BytesIO() - p = self.pickler(f, proto) + p = self.pickler(f, proto, **kwds) p.dump(arg) f.seek(0) return bytes(f.read()) @@ -39,6 +38,11 @@ u = self.unpickler(f, **kwds) return u.load() +class PyPicklerTests(PyPicklerBase, AbstractPickleTests): + pass + +class PyPicklerBytestrTests(PyPicklerBase, AbstractBytestrTests): + pass class InMemoryPickleTests(AbstractPickleTests, BigmemPickleTests): @@ -99,6 +103,10 @@ pickler = _pickle.Pickler unpickler = _pickle.Unpickler + class CPicklerBytestrTests(PyPicklerBytestrTests): + pickler = _pickle.Pickler + unpickler = _pickle.Unpickler + class CPersPicklerTests(PyPersPicklerTests): pickler = _pickle.Pickler unpickler = _pickle.Unpickler @@ 
-137,14 +145,15 @@ def test_main(): tests = [PickleTests, PyPicklerTests, PyPersPicklerTests, - PyDispatchTableTests, PyChainDispatchTableTests] + PyDispatchTableTests, PyChainDispatchTableTests, + PyPicklerBytestrTests] if has_c_implementation: tests.extend([CPicklerTests, CPersPicklerTests, CDumpPickle_LoadPickle, DumpPickle_CLoadPickle, PyPicklerUnpicklerObjectTests, CPicklerUnpicklerObjectTests, CDispatchTableTests, CChainDispatchTableTests, - InMemoryPickleTests]) + CPicklerBytestrTests, InMemoryPickleTests]) support.run_unittest(*tests) support.run_doctest(pickle) diff -r 0c508d87f80b Modules/_pickle.c --- a/Modules/_pickle.c Fri Dec 06 17:25:51 2013 +0200 +++ b/Modules/_pickle.c Fri Dec 06 21:46:07 2013 +0100 @@ -4827,6 +4827,18 @@ return 0; } +/* Returns a new reference */ +static PyObject * +decode_string(UnpicklerObject *self, PyObject *value) +{ + if (strcmp(self->encoding, "bytes") == 0) { + Py_INCREF(value); + return value; + } else { + return PyUnicode_FromEncodedObject(value, self->encoding, self->errors); + } +} + static int load_string(UnpicklerObject *self) { @@ -4857,7 +4869,8 @@ bytes = PyBytes_DecodeEscape(p, len, NULL, 0, NULL); if (bytes == NULL) return -1; - str = PyUnicode_FromEncodedObject(bytes, self->encoding, self->errors); + + str = decode_string(self, bytes); Py_DECREF(bytes); if (str == NULL) return -1; @@ -4898,7 +4911,7 @@ static int load_counted_binstring(UnpicklerObject *self, int nbytes) { - PyObject *str; + PyObject *bytes, *str; Py_ssize_t size; char *s; @@ -4916,8 +4929,13 @@ if (_Unpickler_Read(self, &s, size) < 0) return -1; - /* Convert Python 2.x strings to unicode. */ - str = PyUnicode_Decode(s, size, self->encoding, self->errors); + + bytes = PyBytes_FromStringAndSize(s, size); + if (bytes == NULL) + return -1; + + str = decode_string(self, bytes); + Py_DECREF(bytes); if (str == NULL) return -1; @@ -6530,7 +6548,8 @@ map the old Python 2.x names to the new names used in Python 3.x. 
The *encoding* and *errors* tell pickle how to decode 8-bit string instances pickled by Python 2.x; these default to 'ASCII' and -'strict', respectively. +'strict', respectively. *encoding* can be 'bytes' to read 8-bit string +instances as bytes objects. [clinic]*/ @@ -6539,7 +6558,7 @@ "This takes a binary file for reading a pickle data stream.\n" "\n" "The protocol version of the pickle is detected automatically, so no\n" -"proto argument is needed.\n" +"proto argument is needed.\n" "\n" "The file-like object must have two methods, a read() method\n" "that takes an integer argument, and a readline() method that\n"