diff -r 5c09e1c57200 Doc/library/zipfile.rst --- a/Doc/library/zipfile.rst Wed Mar 20 20:16:47 2013 +0100 +++ b/Doc/library/zipfile.rst Wed Mar 20 17:40:19 2013 -0700 @@ -429,6 +429,27 @@ Name of the file in the archive. + Officially, the Zip specification allows for filenames in one of two + encodings: ``utf-8`` or ``cp437``. If all software that created zip + files knew the encodings of the filenames they were putting into a zip + file, they'd be able to use one of these two encodings to make filenames + look the same to both the user who created the zip file and the user who + unzipped it. Unfortunately, tools to create zip files often encounter + filenames that they don't know the encoding of. When this happens, all + known tools (including this module) encode the filenames using + ``cp437`` to preserve the filename's actual byte values. When Python + then loads the filename from the zipfile, it will use ``cp437`` to + decode the filenames into a :class:`str`. If the original encoding was + not ``cp437``, the filenames may be garbled. If you know what encoding + they really were, you can make them readable by re-encoding to ``cp437`` + and then decoding using the proper encoding. For instance:: + + >>> zi.filename + 'CafΘ_Espa±a.txt' + >>> zi.filename.encode('cp437').decode('latin-1') + 'Café_España.txt' + + .. 
attribute:: ZipInfo.date_time diff -r 5c09e1c57200 Lib/test/test_zipfile.py --- a/Lib/test/test_zipfile.py Wed Mar 20 20:16:47 2013 +0100 +++ b/Lib/test/test_zipfile.py Wed Mar 20 17:40:19 2013 -0700 @@ -1040,6 +1040,19 @@ self.assertEqual(zf.filelist[0].filename, "foo.txt") self.assertEqual(zf.filelist[1].filename, "\xf6.txt") + def test_undecodable_filenames(self): + undecodable = "\xf6.txt".encode("cp437").decode("utf-8", errors="surrogateescape") + with zipfile.ZipFile(TESTFN, "w") as zf: + zf.writestr("foo.txt", "Test for unicode filename") + zf.writestr("\xf6.txt", "Test for unicode filename") + zf.writestr(undecodable, "Test for undecodable filename") + self.assertIsInstance(zf.infolist()[2].filename, str) + + with zipfile.ZipFile(TESTFN, "r") as zf: + self.assertEqual(zf.filelist[0].filename, "foo.txt") + self.assertEqual(zf.filelist[1].filename, "\xf6.txt") + self.assertEqual(zf.filelist[2].filename, "\xf6.txt") + def test_create_non_existent_file_for_append(self): if os.path.exists(TESTFN): os.unlink(TESTFN) diff -r 5c09e1c57200 Lib/zipfile.py --- a/Lib/zipfile.py Wed Mar 20 20:16:47 2013 +0100 +++ b/Lib/zipfile.py Wed Mar 20 17:40:19 2013 -0700 @@ -402,10 +402,22 @@ return header + filename + extra def _encodeFilenameFlags(self): + # Filenames in zip files can consist of any set of bytes. (The + # *characters* the bytes represent are supposed to be from Code Page + # 437 but implementations I checked exploit the fact that Code Page 437 + # can contain any byte sequence to simply encode any byte sequence + # without attempting to determine if it's Code Page 437.) + + # Since older python-stdlib separates ascii from utf-8 we don't set the + # utf-8 flag for ascii-only filenames below. However, since utf-8 is a + # superset of ascii, it would be legal to do so. 
try: return self.filename.encode('ascii'), self.flag_bits except UnicodeEncodeError: - return self.filename.encode('utf-8'), self.flag_bits | 0x800 + try: + return self.filename.encode('utf-8'), self.flag_bits | 0x800 + except UnicodeEncodeError: + return self.filename.encode('utf-8', errors='surrogateescape'), self.flag_bits def _decodeExtra(self): # Try to decode the extra field. @@ -997,6 +1009,10 @@ print(centdir) filename = fp.read(centdir[_CD_FILENAME_LENGTH]) flags = centdir[5] + # Caveat on filenames: the spec says that encodings are either + # utf-8 or cp437 but in practice zipfiles use the cp437 fallback to + # store unknown encodings. See the zipfile.ZipInfo documentation + # if you're confused by what this means. if flags & 0x800: # UTF-8 file names extension filename = filename.decode('utf-8') @@ -1153,6 +1169,10 @@ # strong encryption raise NotImplementedError("strong encryption (flag bit 6)") + # Caveat on filenames: the spec says that encodings are either + # utf-8 or cp437 but in practice zipfiles use the cp437 fallback to + # store unknown encodings. See the zipfile.ZipInfo documentation + # if you're confused by what this means. if zinfo.flag_bits & 0x800: # UTF-8 filename fname_str = fname.decode("utf-8") diff -r 5c09e1c57200 Misc/NEWS --- a/Misc/NEWS Wed Mar 20 20:16:47 2013 +0100 +++ b/Misc/NEWS Wed Mar 20 17:40:19 2013 -0700 @@ -974,6 +974,9 @@ - ctypes.call_commethod was removed, since its only usage was in the defunct samples directory. +- Issue #16310: Fixed zipfile to handle filenames that are not decodable in the + present locale. + Extension Modules -----------------