diff --git a/Lib/pickle.py b/Lib/pickle.py --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -991,7 +991,8 @@ map the old Python 2.x names to the new names used in Python 3.x. The *encoding* and *errors* tell pickle how to decode 8-bit string instances pickled by Python 2.x; these default to 'ASCII' and - 'strict', respectively. + 'strict', respectively. *encoding* can be 'bytes' to read 8-bit string + instances as bytes objects. """ self._file_readline = file.readline self._file_read = file.read @@ -1139,6 +1140,12 @@ self.append(unpack('>d', self.read(8))[0]) dispatch[BINFLOAT[0]] = load_binfloat + def _decode_string(self, value): + if self.encoding == "bytes": + return value + else: + return value.decode(self.encoding, self.errors) + def load_string(self): data = self.readline()[:-1] # Strip outermost quotes @@ -1146,8 +1153,7 @@ data = data[1:-1] else: raise UnpicklingError("the STRING opcode argument must be quoted") - self.append(codecs.escape_decode(data)[0] - .decode(self.encoding, self.errors)) + self.append(self._decode_string(codecs.escape_decode(data)[0])) dispatch[STRING[0]] = load_string def load_binstring(self): @@ -1156,8 +1162,7 @@ if len < 0: raise UnpicklingError("BINSTRING pickle has negative byte count") data = self.read(len) - value = str(data, self.encoding, self.errors) - self.append(value) + self.append(self._decode_string(data)) dispatch[BINSTRING[0]] = load_binstring def load_binbytes(self): @@ -1191,8 +1196,7 @@ def load_short_binstring(self): len = self.read(1)[0] data = self.read(len) - value = str(data, self.encoding, self.errors) - self.append(value) + self.append(self._decode_string(data)) dispatch[SHORT_BINSTRING[0]] = load_short_binstring def load_short_binbytes(self): diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -1305,6 +1305,34 @@ dumped = self.dumps(set([3]), 2) self.assertEqual(dumped, DATA6) + def test_load_python2_str_as_bytes(self): + # Created with: pickle.dumps('a\x00\xa0', protocol=0) + self.assertEqual(self.loads(b"S'a\\x00\\xa0'\n.", + encoding="bytes"), b'a\x00\xa0') + # From Python 2: pickle.dumps('a\x00\xa0', protocol=1) + self.assertEqual(self.loads(b'U\x03a\x00\xa0.', + encoding="bytes"), b'a\x00\xa0') + # From Python 2: pickle.dumps('a\x00\xa0', protocol=2) + self.assertEqual(self.loads(b'\x80\x02U\x03a\x00\xa0.', + encoding="bytes"), b'a\x00\xa0') + + def test_load_python2_unicode_as_str(self): + # From Python 2: pickle.dumps(u"π", protocol=0) + self.assertEqual(self.loads(b'V\\u03c0\n.', + encoding='bytes'), 'π') + # From Python 2: pickle.dumps(u"π", protocol=1) + self.assertEqual(self.loads(b'X\x02\x00\x00\x00\xcf\x80.', + encoding="bytes"), 'π') + # From Python 2: pickle.dumps(u"π", protocol=2) + self.assertEqual(self.loads(b'\x80\x02X\x02\x00\x00\x00\xcf\x80.', + encoding="bytes"), 'π') + + def test_load_long_python2_str_as_bytes(self): + self.assertEqual(self.loads(pickle.BINSTRING + + struct.pack("encoding, self->errors); - Py_DECREF(bytes); - if (str == NULL) - return -1; - - PDATA_PUSH(self->stack, str, -1); - return 0; -} - -static int -load_counted_binbytes(UnpicklerObject *self, int nbytes) -{ - PyObject *bytes; - Py_ssize_t size; - char *s; - - if (_Unpickler_Read(self, &s, nbytes) < 0) - return -1; - - size = calc_binsize(s, nbytes); - if (size < 0) { - PyErr_Format(PyExc_OverflowError, - "BINBYTES exceeds system's maximum size of %zd bytes", - PY_SSIZE_T_MAX); - return -1; - } - - if (_Unpickler_Read(self, &s, size) < 0) - return -1; - - bytes = PyBytes_FromStringAndSize(s, size); - if (bytes == NULL) - return -1; - - PDATA_PUSH(self->stack, bytes, -1); + + /* Leave the Python 2.x strings as bytes if the *encoding* given to the + Unpickler was 'bytes'. Otherwise, convert them to unicode. */ + if (strcmp(self->encoding, "bytes") == 0) { + obj = bytes; + } + else { + obj = PyUnicode_FromEncodedObject(bytes, self->encoding, self->errors); + Py_DECREF(bytes); + if (obj == NULL) { + return -1; + } + } + + PDATA_PUSH(self->stack, obj, -1); return 0; } static int load_counted_binstring(UnpicklerObject *self, int nbytes) { - PyObject *str; + PyObject *obj; Py_ssize_t size; char *s; @@ -4916,12 +4896,49 @@ if (_Unpickler_Read(self, &s, size) < 0) return -1; - /* Convert Python 2.x strings to unicode. */ - str = PyUnicode_Decode(s, size, self->encoding, self->errors); - if (str == NULL) - return -1; - - PDATA_PUSH(self->stack, str, -1); + + /* Convert Python 2.x strings to bytes if the *encoding* given to the + Unpickler was 'bytes'. Otherwise, convert them to unicode. */ + if (strcmp(self->encoding, "bytes") == 0) { + obj = PyBytes_FromStringAndSize(s, size); + } + else { + obj = PyUnicode_Decode(s, size, self->encoding, self->errors); + } + if (obj == NULL) { + return -1; + } + + PDATA_PUSH(self->stack, obj, -1); + return 0; +} + +static int +load_counted_binbytes(UnpicklerObject *self, int nbytes) +{ + PyObject *bytes; + Py_ssize_t size; + char *s; + + if (_Unpickler_Read(self, &s, nbytes) < 0) + return -1; + + size = calc_binsize(s, nbytes); + if (size < 0) { + PyErr_Format(PyExc_OverflowError, + "BINBYTES exceeds system's maximum size of %zd bytes", + PY_SSIZE_T_MAX); + return -1; + } + + if (_Unpickler_Read(self, &s, size) < 0) + return -1; + + bytes = PyBytes_FromStringAndSize(s, size); + if (bytes == NULL) + return -1; + + PDATA_PUSH(self->stack, bytes, -1); return 0; } @@ -6530,8 +6547,8 @@ map the old Python 2.x names to the new names used in Python 3.x. The *encoding* and *errors* tell pickle how to decode 8-bit string instances pickled by Python 2.x; these default to 'ASCII' and -'strict', respectively. - +'strict', respectively. *encoding* can be 'bytes' to read 8-bit string +instances as byte objects. [clinic]*/ PyDoc_STRVAR(_pickle_Unpickler___init____doc__, @@ -6554,7 +6571,8 @@ "map the old Python 2.x names to the new names used in Python 3.x. The\n" "*encoding* and *errors* tell pickle how to decode 8-bit string\n" "instances pickled by Python 2.x; these default to \'ASCII\' and\n" -"\'strict\', respectively."); +"\'strict\', respectively. *encoding* can be \'bytes\' to read 8-bit string\n" +"instances as byte objects."); #define _PICKLE_UNPICKLER___INIT___METHODDEF \ {"__init__", (PyCFunction)_pickle_Unpickler___init__, METH_VARARGS|METH_KEYWORDS, _pickle_Unpickler___init____doc__}, @@ -6584,7 +6602,7 @@ static PyObject * _pickle_Unpickler___init___impl(UnpicklerObject *self, PyObject *file, int fix_imports, const char *encoding, const char *errors) -/*[clinic checksum: bed0d8bbe1c647960ccc6f997b33bf33935fa56f]*/ +/*[clinic checksum: 517a6b86003168f4e504d2162dc6a93cdaf747e1]*/ { _Py_IDENTIFIER(persistent_load); @@ -7250,7 +7268,8 @@ by Python 2.x. If fix_imports is True, pickle will try to map the old Python 2.x names to the new names used in Python 3.x. The encoding and errors tell pickle how to decode 8-bit string instances pickled by Python -2.x; these default to 'ASCII' and 'strict', respectively. +2.x; these default to 'ASCII' and 'strict', respectively. *encoding* can be +'bytes' to read 8-bit string instances as bytes objects. [clinic]*/ PyDoc_STRVAR(_pickle_load__doc__, @@ -7274,7 +7293,8 @@ "by Python 2.x. If fix_imports is True, pickle will try to map the old\n" "Python 2.x names to the new names used in Python 3.x. The encoding and\n" "errors tell pickle how to decode 8-bit string instances pickled by Python\n" -"2.x; these default to \'ASCII\' and \'strict\', respectively."); +"2.x; these default to \'ASCII\' and \'strict\', respectively. *encoding* can be\n" +"\'bytes\' to read 8-bit string instances as bytes objects."); #define _PICKLE_LOAD_METHODDEF \ {"load", (PyCFunction)_pickle_load, METH_VARARGS|METH_KEYWORDS, _pickle_load__doc__}, @@ -7304,7 +7324,7 @@ static PyObject * _pickle_load_impl(PyModuleDef *module, PyObject *file, int fix_imports, const char *encoding, const char *errors) -/*[clinic checksum: e10796f6765b22ce48dca6940f11b3933853ca35]*/ +/*[clinic checksum: a7228bf391773a0e6fb5e7cddddd183702e247b9]*/ { PyObject *result; UnpicklerObject *unpickler = _Unpickler_New(); @@ -7350,7 +7370,8 @@ by Python 2.x. If fix_imports is True, pickle will try to map the old Python 2.x names to the new names used in Python 3.x. The encoding and errors tell pickle how to decode 8-bit string instances pickled by Python -2.x; these default to 'ASCII' and 'strict', respectively. +2.x; these default to 'ASCII' and 'strict', respectively. *encoding* can be +'bytes' to read 8-bit string instances as bytes objects. [clinic]*/ PyDoc_STRVAR(_pickle_loads__doc__, @@ -7366,7 +7387,8 @@ "by Python 2.x. If fix_imports is True, pickle will try to map the old\n" "Python 2.x names to the new names used in Python 3.x. The encoding and\n" "errors tell pickle how to decode 8-bit string instances pickled by Python\n" -"2.x; these default to \'ASCII\' and \'strict\', respectively."); +"2.x; these default to \'ASCII\' and \'strict\', respectively. *encoding* can be\n" +"\'bytes\' to read 8-bit string instances as bytes objects."); #define _PICKLE_LOADS_METHODDEF \ {"loads", (PyCFunction)_pickle_loads, METH_VARARGS|METH_KEYWORDS, _pickle_loads__doc__}, @@ -7396,7 +7418,7 @@ static PyObject * _pickle_loads_impl(PyModuleDef *module, PyObject *data, int fix_imports, const char *encoding, const char *errors) -/*[clinic checksum: 29ee725efcbf51a3533c19cb8261a8e267b7080a]*/ +/*[clinic checksum: d2263947b6dd5b1d45fb2ac45d4408230953ea0c]*/ { PyObject *result; UnpicklerObject *unpickler = _Unpickler_New();