# HG changeset patch # Parent 4a027e55dae3b88cafb56b38b9efd83437ec04de Add capability to read zipfile directories in nonstandard encodings. diff -r 4a027e55dae3 Doc/library/zipfile.rst --- a/Doc/library/zipfile.rst Sun Sep 11 13:34:42 2016 -0700 +++ b/Doc/library/zipfile.rst Mon Sep 12 08:48:32 2016 +0900 @@ -130,7 +130,8 @@ --------------- -.. class:: ZipFile(file, mode='r', compression=ZIP_STORED, allowZip64=True) +.. class:: ZipFile(file, mode='r', compression=ZIP_STORED, allowZip64=True, \ + *, memberNameEncoding=None) Open a ZIP file, where *file* can be either a path to a file (a string) or a file-like object. The *mode* parameter should be ``'r'`` to read an existing @@ -154,6 +155,8 @@ ``True`` (the default) zipfile will create ZIP files that use the ZIP64 extensions when the zipfile is larger than 2 GiB. If it is false :mod:`zipfile` will raise an exception when the ZIP file would require ZIP64 extensions. + When in mode ``'r'``, *memberNameEncoding* may be set to the name of a + codec, which will be used to decode the names of members and ZIP comments. If the file is created with mode ``'w'``, ``'x'`` or ``'a'`` and then :meth:`closed <close>` without adding any files to the archive, the appropriate @@ -166,6 +169,15 @@ with ZipFile('spam.zip', 'w') as myzip: myzip.write('eggs.txt') + .. note:: + + The memberNameEncoding is an instance-wide setting for the ZipFile. It + is not currently possible to set this on a per-member basis. This + attribute is a workaround for legacy implementations which produce + archives with names in the current locale encoding or code page (mostly + on Windows). The encoding of member names may be specified to be UTF-8 + in the archive header, and that takes precedence over memberNameEncoding. + .. versionadded:: 3.2 Added the ability to use :class:`ZipFile` as a context manager. @@ -183,6 +195,10 @@ Previously, a plain :exc:`RuntimeError` was raised for unrecognized compression values. + .. 
versionchanged:: 3.6 + Added support for specifying member name encoding for reading + the zipfile's directory and file headers. + .. method:: ZipFile.close() @@ -352,10 +368,12 @@ .. note:: - There is no official file name encoding for ZIP files. If you have unicode file - names, you must convert them to byte strings in your desired encoding before - passing them to :meth:`write`. WinZip interprets all file names as encoded in - CP437, also known as DOS Latin. + The ZIP file standard historically did not specify a member name + encoding, but strongly recommended CP437 (the original IBM PC encoding) + for interoperability. In this module, UTF-8 will automatically be used + to write the member names if they contain any non-ASCII characters. It + is not possible to write member names in any encoding other than ASCII + or UTF-8. .. note:: @@ -372,7 +390,6 @@ a closed ZipFile will raise a :exc:`ValueError`. Previously, a :exc:`RuntimeError` was raised. - .. method:: ZipFile.writestr(zinfo_or_arcname, data[, compress_type]) Write the string *data* to the archive; *zinfo_or_arcname* is either the file diff -r 4a027e55dae3 Lib/test/test_zipfile.py --- a/Lib/test/test_zipfile.py Sun Sep 11 13:34:42 2016 -0700 +++ b/Lib/test/test_zipfile.py Mon Sep 12 08:48:32 2016 +0900 @@ -25,6 +25,15 @@ ('ziptest2dir/_ziptest2', 'qawsedrftg'), ('ziptest2dir/ziptest3dir/_ziptest3', 'azsxdcfvgb'), ('ziptest2dir/ziptest3dir/ziptest4dir/_ziptest3', '6y7u8i9o0p')] +# See comment on EncodedMemberNameTests.setUp(). +ENCODED_NAME_MEMBERS = [(chr(19968), "n1", # Han 'one' + "This is pure ASCII.\n".encode('ascii')), + (chr(20108), "n2", # Han 'two' + # This is modern Japanese. (UTF-8) + "\u3053\u308c\u306f\u73fe\u4ee3\u7684\u65e5\u672c\u8a9e\u3067\u3059\u3002\n".encode('utf-8')), + (chr(19977), "n3", # Han 'three' + # This is obsolete Japanese. 
(Shift JIS) + "\u3053\u308c\u306f\u53e4\u3044\u65e5\u672c\u8a9e\u3067\u3059\u3002\n".encode('shift_jis'))] def getrandbytes(size): return getrandbits(8 * size).to_bytes(size, 'little') @@ -1998,5 +2007,103 @@ self.assertEqual(zi.compress_type, zipfile.ZIP_STORED) self.assertEqual(zi.file_size, 0) +class EncodedMemberNameTests(unittest.TestCase): + def setUp(self): + os.mkdir(TESTFN2) + # Create .zip of 3 members with Han names encoded in Shift JIS. + # Each name is 1 Han character encoding to 2 bytes in Shift JIS. + # The ASCII names are arbitrary as long as they are length 2 and + # not otherwise contained in the zip file. + # Data elements are encoded bytes (ascii, utf-8, shift_jis). + with zipfile.ZipFile(TESTFN, mode="w") as tf: + for name, temp, content in ENCODED_NAME_MEMBERS: + tf.writestr(temp, content, zipfile.ZIP_STORED) + # Hack in the Shift JIS names with flag bit 11 (UTF-8) unset. + with open(TESTFN, "rb") as tf: + text = bytearray(tf.read()) + for name, temp, content in ENCODED_NAME_MEMBERS: + text = text.replace(temp.encode('ascii'), + name.encode('shift_jis')) + with open(TESTFN, "wb") as tf: + tf.write(text) + + def test_ZipInfo(self): + # Read the ZIP archive + with zipfile.ZipFile(TESTFN, "r", memberNameEncoding='shift_jis') as zipfp: + for name, temp, content in ENCODED_NAME_MEMBERS: + self.assertEqual(zipfp.read(name), content) + + # Print the ZIP directory + fp = io.StringIO() + zipfp.printdir(file=fp) + directory = fp.getvalue() + lines = directory.splitlines()[1:] + self.assertEqual(len(lines), len(ENCODED_NAME_MEMBERS)) + for i in range(len(ENCODED_NAME_MEMBERS)): + fn, date, time_, size = lines[i].split() + self.assertEqual(fn, ENCODED_NAME_MEMBERS[i][0]) + + # Check the namelist + names = zipfp.namelist() + self.assertEqual(len(names), len(ENCODED_NAME_MEMBERS)) + for name, _, _ in ENCODED_NAME_MEMBERS: + self.assertIn(name, names) + + # Check infolist + infos = zipfp.infolist() + names = [i.filename for i in infos] + 
self.assertEqual(len(names), len(ENCODED_NAME_MEMBERS)) + for name, _, _ in ENCODED_NAME_MEMBERS: + self.assertIn(name, names) + + # check getinfo + for name, _, content in ENCODED_NAME_MEMBERS: + info = zipfp.getinfo(name) + self.assertEqual(info.filename, name) + self.assertEqual(info.file_size, len(content)) + + # Check that testzip doesn't raise an exception + zipfp.testzip() + + def test_ZipFile_membernameencoding(self): + with zipfile.ZipFile(TESTFN, "r", memberNameEncoding="shift_jis") as f: + self.assertTrue(f) + for mode in ("w", "a", "x"): + self.assertRaisesRegex( + ValueError, + "memberNameEncoding must be None for write modes", + zipfile.ZipFile, + "nonesuch.zip", + mode, + memberNameEncoding="shift_jis") + + def test_main_membernameencoding(self): + # XXX I don't know how to do this right yet. :-P + # for operation in ("-c", "-t"): + # self.assertRaisesRegex(RuntimeError, + # "^Non-conforming encodings not supported", + # zipfile.main, + # [ "--membernameencoding=shift_jis", + # operation, + # "nonesuch.zip", + # "nonesuch.txt" + # ]) + self.assertTrue(zipfile.main, + [ "--membernameencoding=shift_jis", + "-l", + TESTFN + ]) + self.assertTrue(zipfile.main, + [ "--membernameencoding=shift_jis", + "-e", + TESTFN, + TESTFN2 + ]) + + def tearDown(self): + rmtree(TESTFN2) + if os.path.exists(TESTFN): + unlink(TESTFN) + if __name__ == "__main__": unittest.main() diff -r 4a027e55dae3 Lib/zipfile.py --- a/Lib/zipfile.py Sun Sep 11 13:34:42 2016 -0700 +++ b/Lib/zipfile.py Mon Sep 12 08:48:32 2016 +0900 @@ -1051,7 +1051,8 @@ fp = None # Set here since __del__ checks it _windows_illegal_name_trans_table = None - def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True): + def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True, + *, memberNameEncoding=None): """Open the ZIP file with mode read 'r', write 'w', exclusive create 'x', or append 'a'.""" if mode not in ('r', 'w', 'x', 'a'): @@ -1068,6 +1069,11 @@ self.mode = mode 
self.pwd = None self._comment = b'' + self.memberNameEncoding = memberNameEncoding + + # Check that we don't try to write with nonconforming codecs + if self.memberNameEncoding is not None and mode != 'r': + raise ValueError("memberNameEncoding must be None for write modes") # Check if we were passed a file-like object if isinstance(file, str): @@ -1151,6 +1157,8 @@ elif self.filename is not None: result.append(' filename=%r' % self.filename) result.append(' mode=%r' % self.mode) + if self.memberNameEncoding is not None: + result.append(' memberNameEncoding=%r' % self.memberNameEncoding) else: result.append(' [closed]') result.append('>') @@ -1200,6 +1208,9 @@ if flags & 0x800: # UTF-8 file names extension filename = filename.decode('utf-8') + elif self.memberNameEncoding is not None: + # Python read-only encoded names extension + filename = filename.decode(self.memberNameEncoding) else: # Historical ZIP filename encoding filename = filename.decode('cp437') @@ -1379,6 +1390,9 @@ if zinfo.flag_bits & 0x800: # UTF-8 filename fname_str = fname.decode("utf-8") + elif self.memberNameEncoding is not None: + # Python read-only encoded names extension + fname_str = fname.decode(self.memberNameEncoding) else: fname_str = fname.decode("cp437") @@ -1958,24 +1972,38 @@ zipfile.py -t zipfile.zip # Test if a zipfile is valid zipfile.py -e zipfile.zip target # Extract zipfile into target dir zipfile.py -c zipfile.zip src ... # Create zipfile from sources + -l and -e may be preceded by --membernameencoding=codec to + specify the encoding of member names in the zipfile directory. 
""") + CODEC_USAGE="Non-conforming encodings not supported with -c and -t.\n" + if args is None: args = sys.argv[1:] + # intentionally awkward and inflexible parsing + codec = None + if args and args[0].startswith('--membernameencoding='): + codec = args[0][21:] + args = args[1:] + if args and args[0] in ('-c', '-t'): + print(CODEC_USAGE, file=sys.stderr) + print(USAGE, file=sys.stderr) + sys.exit(1) + if not args or args[0] not in ('-l', '-c', '-e', '-t'): - print(USAGE) + print(USAGE, file=sys.stderr) sys.exit(1) if args[0] == '-l': if len(args) != 2: - print(USAGE) + print(USAGE, file=sys.stderr) sys.exit(1) - with ZipFile(args[1], 'r') as zf: + with ZipFile(args[1], 'r', memberNameEncoding=codec) as zf: zf.printdir() elif args[0] == '-t': if len(args) != 2: - print(USAGE) + print(USAGE, file=sys.stderr) sys.exit(1) with ZipFile(args[1], 'r') as zf: badfile = zf.testzip() @@ -1985,15 +2013,15 @@ elif args[0] == '-e': if len(args) != 3: - print(USAGE) + print(USAGE, file=sys.stderr) sys.exit(1) - with ZipFile(args[1], 'r') as zf: + with ZipFile(args[1], 'r', memberNameEncoding=codec) as zf: zf.extractall(args[2]) elif args[0] == '-c': if len(args) < 3: - print(USAGE) + print(USAGE, file=sys.stderr) sys.exit(1) def addToZip(zf, path, zippath):