# HG changeset patch # Parent 1b1abe815db04f0cef4cedbe1e43c2c609ff1ace Add capability to read zipfile directories in nonstandard encodings. diff -r 1b1abe815db0 Doc/library/zipfile.rst --- a/Doc/library/zipfile.rst Wed Sep 07 10:33:28 2016 -0700 +++ b/Doc/library/zipfile.rst Mon Sep 12 04:16:18 2016 +0900 @@ -130,7 +130,7 @@ --------------- -.. class:: ZipFile(file, mode='r', compression=ZIP_STORED, allowZip64=True) +.. class:: ZipFile(file, mode='r', compression=ZIP_STORED, allowZip64=True, memberNameEncoding=None) Open a ZIP file, where *file* can be either a path to a file (a string) or a file-like object. The *mode* parameter should be ``'r'`` to read an existing @@ -154,6 +154,8 @@ ``True`` (the default) zipfile will create ZIP files that use the ZIP64 extensions when the zipfile is larger than 2 GiB. If it is false :mod:`zipfile` will raise an exception when the ZIP file would require ZIP64 extensions. + When in mode ``'r'``, *memberNameEncoding* may be set to the name + of a codec, which will be used to decode member names whose UTF-8 flag (bit 11) is unset. If the file is created with mode ``'w'``, ``'x'`` or ``'a'`` and then :meth:`closed <close>` without adding any files to the archive, the appropriate @@ -166,6 +168,15 @@ with ZipFile('spam.zip', 'w') as myzip: myzip.write('eggs.txt') + .. note:: + + The memberNameEncoding is a global setting for the ZipFile instance. It is not + currently possible to set this on a per-member basis. This attribute is a + workaround for legacy implementations which produce archives with names in the + current locale encoding or code page (mostly on Windows). Non-default values of + this argument are not supported for write modes. Such usage is nonconforming to + the ZIP standard. + .. versionadded:: 3.2 Added the ability to use :class:`ZipFile` as a context manager. @@ -179,6 +190,10 @@ Added support for writing to unseekable streams. Added support for the ``'x'`` mode. + .. 
versionchanged:: 3.6 + Added support for specifying member name encoding for reading + the zipfile's directory and file headers. + .. method:: ZipFile.close() @@ -333,10 +348,13 @@ .. note:: - There is no official file name encoding for ZIP files. If you have unicode file - names, you must convert them to byte strings in your desired encoding before - passing them to :meth:`write`. WinZip interprets all file names as encoded in - CP437, also known as DOS Latin. + The ZIP file standard historically did not specify a member name encoding, but + strongly recommended CP437 (the original IBM PC encoding) for interoperability. + Since Version 6.3.0 (published 2006/09/29), if flag bit 11 is set, the encoding + of member names and comments must be Unicode version 4.1.0 or later in the UTF-8 + format. UTF-8 will automatically be used to write the member names if they + contain any non-ASCII characters. It is not possible to write member names in + any encoding other than ASCII or UTF-8. .. note:: @@ -348,6 +366,9 @@ If ``arcname`` (or ``filename``, if ``arcname`` is not given) contains a null byte, the name of the file in the archive will be truncated at the null byte. + .. versionchanged:: 3.6 + Document use of Unicode names. + .. method:: ZipFile.writestr(zinfo_or_arcname, data[, compress_type]) Write the string *data* to the archive; *zinfo_or_arcname* is either the file diff -r 1b1abe815db0 Lib/test/test_zipfile.py --- a/Lib/test/test_zipfile.py Wed Sep 07 10:33:28 2016 -0700 +++ b/Lib/test/test_zipfile.py Mon Sep 12 04:16:18 2016 +0900 @@ -25,6 +25,13 @@ ('ziptest2dir/_ziptest2', 'qawsedrftg'), ('ziptest2dir/ziptest3dir/_ziptest3', 'azsxdcfvgb'), ('ziptest2dir/ziptest3dir/ziptest4dir/_ziptest3', '6y7u8i9o0p')] +# See comment on EncodedMemberNameTests.setUp(). 
+ENCODED_NAME_MEMBERS = [('ascii', chr(19968), "n1", # Han '一' + "This is pure ASCII.\n".encode('ascii')), + ('utf-8', chr(20108), "n2", # Han '二' + "これは現代的日本語です。\n".encode('utf-8')), + ('shift_jis', chr(19977), "n3", # Han '三' + "これは古い日本語です。\n".encode('shift_jis'))] def getrandbytes(size): return getrandbits(8 * size).to_bytes(size, 'little') @@ -1998,5 +2005,139 @@ self.assertEqual(zi.compress_type, zipfile.ZIP_STORED) self.assertEqual(zi.file_size, 0) +class EncodedMemberNameTests(unittest.TestCase): + def setUp(self): + os.mkdir(TESTFN2) + # Create .zip of 3 members with Han names encoded in Shift JIS. + # Each name is 1 Han character encoding to 2 bytes in Shift JIS. + # The ASCII names are arbitrary as long as they are length 2 and + # not otherwise contained in the zip file. + # Data elements are encoded bytes (ascii, utf-8, shift_jis). + ms = ENCODED_NAME_MEMBERS + with zipfile.ZipFile(TESTFN, mode="w") as tf: + for i in range(len(ms)): + tf.writestr(ms[i][2], ms[i][3], zipfile.ZIP_STORED) + # Hack in the Shift JIS names with flag bit 11 (UTF-8) unset. 
+ with open(TESTFN, "rb") as tf: + text = bytearray(tf.read()) + for i in range(len(ms)): + text = text.replace(ms[i][2].encode('ascii'), + ms[i][1].encode('shift_jis')) + with open(TESTFN, "wb") as tf: + tf.write(text) + + def test_ZipInfo(self): + # Read the ZIP archive + ms = ENCODED_NAME_MEMBERS + with zipfile.ZipFile(TESTFN, "r", memberNameEncoding='shift_jis') as zipfp: + for i in range(len(ms)): + self.assertEqual(zipfp.read(ms[i][1]), ms[i][3]) + + # Print the ZIP directory + fp = io.StringIO() + zipfp.printdir(file=fp) + directory = fp.getvalue() + lines = directory.splitlines() + self.assertEqual(len(lines), len(ms) + 1) + + self.assertIn('File Name', lines[0]) + self.assertIn('Modified', lines[0]) + self.assertIn('Size', lines[0]) + + lines = lines[1:] + for i in range(len(ms)): + fn, date, time_, size = lines[i].split() + self.assertEqual(fn, ms[i][1]) + self.assertTrue(time.strptime(date, '%Y-%m-%d')) + self.assertTrue(time.strptime(time_, '%H:%M:%S')) + self.assertEqual(size, str(len(ms[i][3]))) + + # Check the namelist + names = zipfp.namelist() + self.assertEqual(len(names), len(ms)) + for i in range(len(ms)): + self.assertIn(ms[i][1], names) + + # Check infolist + infos = zipfp.infolist() + names = [i.filename for i in infos] + self.assertEqual(len(names), len(ms)) + for i in range(len(ms)): + self.assertIn(ms[i][1], names) + # can't rely on order, defer size check to next stanza + + # check getinfo + for member in ms: + info = zipfp.getinfo(member[1]) + self.assertEqual(info.filename, member[1]) + self.assertEqual(info.file_size, len(member[3])) + + # Check that testzip doesn't raise an exception + zipfp.testzip() + + def test_content_encoding(self): + # Test that name encoding doesn't leak over to content handling. + # Currently redundant vs test_ZipInfo because content is pre-encoded. 
+ with zipfile.ZipFile(TESTFN, "r", memberNameEncoding='shift_jis') as zipfp: + for member in ENCODED_NAME_MEMBERS: + data = zipfp.read(member[1]) + if member[0] == 'ascii': + self.assertTrue(data.decode('ascii')) + self.assertTrue(data.decode('utf-8')) + self.assertTrue(data.decode('shift_jis')) + elif member[0] == 'utf-8': + self.assertRaises(UnicodeDecodeError, data.decode, 'ascii') + self.assertTrue(data.decode('utf-8')) + self.assertRaises(UnicodeDecodeError, data.decode, 'shift_jis') + elif member[0] == 'shift_jis': + self.assertRaises(UnicodeDecodeError, data.decode, 'ascii') + self.assertRaises(UnicodeDecodeError, data.decode, 'utf-8') + self.assertTrue(data.decode('shift_jis')) + + def test_ZipFile_membernameencoding(self): + with zipfile.ZipFile(TESTFN, "r", memberNameEncoding="shift_jis") as f: + self.assertTrue(f) + for mode in ("w", "a", "x"): + self.assertRaisesRegex( + RuntimeError, + "Setting member name encoding not allowed on write", + zipfile.ZipFile, + "nonesuch.zip", + mode, + memberNameEncoding="shift_jis") + + def test_main_membernameencoding(self): + from contextlib import redirect_stdout 
+ codec_arg = "--membernameencoding=shift_jis" + # -c and -t reject the option: main() prints the usage text and + # calls sys.exit(1), so expect SystemExit rather than RuntimeError. + for operation in ("-c", "-t"): + with redirect_stdout(io.StringIO()) as out: + with self.assertRaises(SystemExit): + zipfile.main([codec_arg, operation, + "nonesuch.zip", "nonesuch.txt"]) + self.assertIn("Non-conforming encodings not supported", + out.getvalue()) + # -l must list every member under its decoded name. + with redirect_stdout(io.StringIO()) as out: + zipfile.main([codec_arg, "-l", TESTFN]) + for member in ENCODED_NAME_MEMBERS: + self.assertIn(member[1], out.getvalue()) + # -e must extract every member under its decoded name. + try: + for member in ENCODED_NAME_MEMBERS: + os.fsencode(member[1]) + except UnicodeEncodeError: + self.skipTest("file system cannot encode the member names") + zipfile.main([codec_arg, "-e", TESTFN, TESTFN2]) + for member in ENCODED_NAME_MEMBERS: + with open(os.path.join(TESTFN2, member[1]), "rb") as fp: + self.assertEqual(fp.read(), member[3]) + def tearDown(self): + rmtree(TESTFN2) + if os.path.exists(TESTFN): + unlink(TESTFN) + if __name__ == "__main__": unittest.main() diff -r 1b1abe815db0 Lib/zipfile.py --- a/Lib/zipfile.py Wed Sep 07 10:33:28 2016 -0700 +++ b/Lib/zipfile.py Mon Sep 12 04:16:18 2016 +0900 @@ -1051,7 +1051,8 @@ fp = None # Set here since __del__ checks it _windows_illegal_name_trans_table = None - def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True): + def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True, + memberNameEncoding=None): """Open the ZIP file with mode read 'r', write 'w', exclusive create 'x', or append 'a'.""" if mode not in ('r', 'w', 'x', 'a'): @@ -1068,6 +1069,11 @@ self.mode = mode self.pwd = None self._comment = b'' + self.memberNameEncoding = memberNameEncoding + + # Check that we don't try to write with nonconforming codecs + if self.memberNameEncoding is not None and mode != 'r': + raise RuntimeError("Setting member name encoding not allowed on write") # Check if we were passed a file-like object if isinstance(file, str): @@ -1151,6 +1157,8 @@ elif self.filename is not None: result.append(' filename=%r' % self.filename) result.append(' mode=%r' % self.mode) + if self.memberNameEncoding is not None: + result.append(' memberNameEncoding=%r' % self.memberNameEncoding) else: result.append(' [closed]') result.append('>') @@ -1200,6 +1208,9 @@ if flags & 0x800: # UTF-8 file names extension filename = filename.decode('utf-8') + elif self.memberNameEncoding is not None: + # Python read-only 
encoded names extension + filename = filename.decode(self.memberNameEncoding) else: # Historical ZIP filename encoding filename = filename.decode('cp437') @@ -1379,6 +1390,9 @@ if zinfo.flag_bits & 0x800: # UTF-8 filename fname_str = fname.decode("utf-8") + elif self.memberNameEncoding is not None: + # Python read-only encoded names extension + fname_str = fname.decode(self.memberNameEncoding) else: fname_str = fname.decode("cp437") @@ -1958,10 +1972,24 @@ zipfile.py -t zipfile.zip # Test if a zipfile is valid zipfile.py -e zipfile.zip target # Extract zipfile into target dir zipfile.py -c zipfile.zip src ... # Create zipfile from sources + -l and -e may be preceded by --membernameencoding=codec to + specify the encoding of member names in the zipfile directory. """) + CODEC_USAGE = "Non-conforming encodings not supported with -c and -t.\n" + if args is None: args = sys.argv[1:] + # intentionally awkward and inflexible parsing + codec = None + if args and args[0].startswith('--membernameencoding='): + codec = args[0].partition('=')[2] + args = args[1:] + if args and args[0] in ('-c', '-t'): + print(CODEC_USAGE) + print(USAGE) + sys.exit(1) + if not args or args[0] not in ('-l', '-c', '-e', '-t'): print(USAGE) sys.exit(1) @@ -1970,7 +1998,7 @@ if len(args) != 2: print(USAGE) sys.exit(1) - with ZipFile(args[1], 'r') as zf: + with ZipFile(args[1], 'r', memberNameEncoding=codec) as zf: zf.printdir() elif args[0] == '-t': @@ -1988,7 +2016,7 @@ print(USAGE) sys.exit(1) - with ZipFile(args[1], 'r') as zf: + with ZipFile(args[1], 'r', memberNameEncoding=codec) as zf: zf.extractall(args[2]) elif args[0] == '-c':