diff -r 0c508d87f80b Lib/pickle.py --- a/Lib/pickle.py Fri Dec 06 17:25:51 2013 +0200 +++ b/Lib/pickle.py Sat Dec 07 00:20:28 2013 +0100 @@ -991,7 +991,8 @@ map the old Python 2.x names to the new names used in Python 3.x. The *encoding* and *errors* tell pickle how to decode 8-bit string instances pickled by Python 2.x; these default to 'ASCII' and - 'strict', respectively. + 'strict', respectively. *encoding* can be 'bytes' to read 8-bit string + instances as bytes objects. """ self._file_readline = file.readline self._file_read = file.read @@ -1139,6 +1140,12 @@ self.append(unpack('>d', self.read(8))[0]) dispatch[BINFLOAT[0]] = load_binfloat + def decode_string(self, value): + if self.encoding == "bytes": + return value + else: + return value.decode(self.encoding, self.errors) + def load_string(self): data = self.readline()[:-1] # Strip outermost quotes @@ -1146,8 +1153,7 @@ data = data[1:-1] else: raise UnpicklingError("the STRING opcode argument must be quoted") - self.append(codecs.escape_decode(data)[0] - .decode(self.encoding, self.errors)) + self.append(self.decode_string(codecs.escape_decode(data)[0])) dispatch[STRING[0]] = load_string def load_binstring(self): @@ -1156,8 +1162,7 @@ if len < 0: raise UnpicklingError("BINSTRING pickle has negative byte count") data = self.read(len) - value = str(data, self.encoding, self.errors) - self.append(value) + self.append(self.decode_string(data)) dispatch[BINSTRING[0]] = load_binstring def load_binbytes(self): @@ -1191,8 +1196,7 @@ def load_short_binstring(self): len = self.read(1)[0] data = self.read(len) - value = str(data, self.encoding, self.errors) - self.append(value) + self.append(self.decode_string(data)) dispatch[SHORT_BINSTRING[0]] = load_short_binstring def load_short_binbytes(self): diff -r 0c508d87f80b Lib/test/pickletester.py --- a/Lib/test/pickletester.py Fri Dec 06 17:25:51 2013 +0200 +++ b/Lib/test/pickletester.py Sat Dec 07 00:20:28 2013 +0100 @@ -1602,6 +1602,53 @@ unpickled = 
self.loads(self.dumps(method, proto)) self.assertEqual(method(*args), unpickled(*args)) +class AbstractBytestrTests(unittest.TestCase): + def test_load_python2_str_as_bytes(self): + # created with: pickle.dumps('a\x00\xa0', protocol=0) + self.assertEqual( + self.loads(b"S'a\\x00\\xa0'\np0\n.", encoding="bytes"), + b'a\x00\xa0' + ) + + # python 2: pickle.dumps('a\x00\xa0', protocol=1) + self.assertEqual( + self.loads(b'U\x03a\x00\xa0q\x00.', encoding="bytes"), + b'a\x00\xa0' + ) + + # python 2: pickle.dumps('a\x00\xa0', protocol=2) + self.assertEqual( + self.loads(b'\x80\x02U\x03a\x00\xa0q\x00.', encoding="bytes"), + b'a\x00\xa0' + ) + + def test_load_python2_unicode_as_str(self): + # python 2: pickle.dumps(u"π", protocol=0) + self.assertEqual( + self.loads(b'V\\u03c0\np0\n.', encoding='bytes'), + 'π' + ) + + # python 2: pickle.dumps(u"π", protocol=1) + self.assertEqual( + self.loads(b'X\x02\x00\x00\x00\xcf\x80q\x00.', encoding="bytes"), + 'π' + ) + + # python 2: pickle.dumps(u"π", protocol=2) + self.assertEqual( + self.loads(b'\x80\x02X\x02\x00\x00\x00\xcf\x80q\x00.', + encoding="bytes"), + 'π' + ) + + def test_load_long_python2_str_as_bytes(self): + self.assertEqual( + self.loads(pickle.BINSTRING + struct.pack("encoding is "bytes", return the same bytes object + with its reference counter incremented; + - otherwise, return a string object by decoding `value` using + self->encoding and self->errors. 
+ + Returns a new reference */ +static PyObject * +_Unpickler_DecodeString(UnpicklerObject *self, PyObject *value) +{ + if (strcmp(self->encoding, "bytes") == 0) { + Py_INCREF(value); + return value; + } else { + return PyUnicode_FromEncodedObject(value, self->encoding, self->errors); + } +} + static int load_string(UnpicklerObject *self) { @@ -4857,7 +4875,8 @@ bytes = PyBytes_DecodeEscape(p, len, NULL, 0, NULL); if (bytes == NULL) return -1; - str = PyUnicode_FromEncodedObject(bytes, self->encoding, self->errors); + + str = _Unpickler_DecodeString(self, bytes); Py_DECREF(bytes); if (str == NULL) return -1; @@ -4898,7 +4917,7 @@ static int load_counted_binstring(UnpicklerObject *self, int nbytes) { - PyObject *str; + PyObject *bytes, *str; Py_ssize_t size; char *s; @@ -4916,8 +4935,13 @@ if (_Unpickler_Read(self, &s, size) < 0) return -1; - /* Convert Python 2.x strings to unicode. */ - str = PyUnicode_Decode(s, size, self->encoding, self->errors); + + bytes = PyBytes_FromStringAndSize(s, size); + if (bytes == NULL) + return -1; + + str = _Unpickler_DecodeString(self, bytes); + Py_DECREF(bytes); if (str == NULL) return -1; @@ -6530,8 +6554,8 @@ map the old Python 2.x names to the new names used in Python 3.x. The *encoding* and *errors* tell pickle how to decode 8-bit string instances pickled by Python 2.x; these default to 'ASCII' and -'strict', respectively. - +'strict', respectively. *encoding* can be 'bytes' to read 8-bit string +instances as bytes objects. [clinic]*/ PyDoc_STRVAR(_pickle_Unpickler___init____doc__, @@ -6539,7 +6563,7 @@ "This takes a binary file for reading a pickle data stream.\n" "\n" "The protocol version of the pickle is detected automatically, so no\n" -"proto argument is needed.\n" +"proto argument is needed.\n" "\n" "The file-like object must have two methods, a read() method\n" "that takes an integer argument, and a readline() method that\n" @@ -7250,7 +7274,8 @@ by Python 2.x. 
If fix_imports is True, pickle will try to map the old Python 2.x names to the new names used in Python 3.x. The encoding and errors tell pickle how to decode 8-bit string instances pickled by Python -2.x; these default to 'ASCII' and 'strict', respectively. +2.x; these default to 'ASCII' and 'strict', respectively. *encoding* can be +'bytes' to read 8-bit string instances as bytes objects. [clinic]*/ PyDoc_STRVAR(_pickle_load__doc__, @@ -7350,7 +7375,8 @@ by Python 2.x. If fix_imports is True, pickle will try to map the old Python 2.x names to the new names used in Python 3.x. The encoding and errors tell pickle how to decode 8-bit string instances pickled by Python -2.x; these default to 'ASCII' and 'strict', respectively. +2.x; these default to 'ASCII' and 'strict', respectively. *encoding* can be +'bytes' to read 8-bit string instances as bytes objects. [clinic]*/ PyDoc_STRVAR(_pickle_loads__doc__,