Index: Misc/NEWS =================================================================== --- Misc/NEWS (revision 59929) +++ Misc/NEWS (working copy) @@ -351,6 +351,10 @@ Library ------- +- Issue #1622: zipfile module parsing corrections; several fields are now + correctly interpreted as unsigned, fixed archive comment parsing. Also allow + writing ZIP files with an archive comment. + - Issue #1780: The Decimal constructor now accepts arbitrary leading and trailing whitespace when constructing from a string. Context.create_decimal no longer accepts trailing newlines. Index: Doc/library/zipfile.rst =================================================================== --- Doc/library/zipfile.rst (revision 59929) +++ Doc/library/zipfile.rst (working copy) @@ -277,7 +277,7 @@ member of the given :class:`ZipInfo` instance. By default, the :class:`ZipInfo` constructor sets this member to :const:`ZIP_STORED`. -The following data attribute is also available: +The following data attributes are also available: .. attribute:: ZipFile.debug @@ -286,7 +286,13 @@ output) to ``3`` (the most output). Debugging information is written to ``sys.stdout``. +.. attribute:: ZipFile.comment + The comment text associated with the ZIP file. If assigning a comment to a + :class:`ZipFile` instance created with mode 'a' or 'w', this should be a + string no longer than 65535 bytes. Comments longer than this will be + truncated in the written archive when :meth:`ZipFile.close` is called. + .. _pyzipfile-objects: PyZipFile Objects Index: Lib/zipfile.py =================================================================== --- Lib/zipfile.py (revision 59929) +++ Lib/zipfile.py (working copy) @@ -25,31 +25,49 @@ error = BadZipfile # The exception raised by this module ZIP64_LIMIT= (1 << 31) - 1 +ZIP_FILECOUNT_LIMIT = 1 << 16 +ZIP_MAX_COMMENT = (1 << 16) - 1 -# constants for Zip file compression methods +# constants for ZIP file compression methods supported by this module ZIP_STORED = 0 ZIP_DEFLATED = 8 -# Other ZIP compression methods not supported -# Here are some struct module formats for reading headers -structEndArchive = "<4s4H2lH" # 9 items, end of archive, 22 bytes -stringEndArchive = "PK\005\006" # magic number for end of archive record -structCentralDir = "<4s4B4HlLL5HLl"# 19 items, central directory, 46 bytes -stringCentralDir = "PK\001\002" # magic number for central directory -structFileHeader = "<4s2B4HlLL2H" # 12 items, file header record, 30 bytes -stringFileHeader = "PK\003\004" # magic number for file header -structEndArchive64Locator = "<4slql" # 4 items, locate Zip64 header, 20 bytes -stringEndArchive64Locator = "PK\x06\x07" # magic token for locator header -structEndArchive64 = "<4sqhhllqqqq" # 10 items, end of archive (Zip64), 56 bytes -stringEndArchive64 = "PK\x06\x06" # magic token for Zip64 header +# Below are some formats and associated data for reading/writing headers using +# the struct module. The names and structures of headers/records are those used +# in the PKWARE description of the ZIP file format: +# http://www.pkware.com/documents/casestudies/APPNOTE.TXT +# (URL valid as of January 2008) +# The "end of central directory" structure, magic number, size, and indices +# (section V.I in the format document) +structEndCentDir = "<4s4H2LH" +magicEndCentDir = "PK\005\006" +sizeEndCentDir = struct.calcsize(structEndCentDir) -# indexes of entries in the central directory structure +_ECD_SIGNATURE = 0 +_ECD_DISK_NUMBER = 1 +_ECD_DISK_START = 2 +_ECD_ENTRIES_THIS_DISK = 3 +_ECD_ENTRIES_TOTAL = 4 +_ECD_SIZE = 5 +_ECD_OFFSET = 6 +_ECD_COMMENT_SIZE = 7 +# These last two indices are not part of the structure as defined in the +# spec, but they are used internally by this module as a convenience +_ECD_COMMENT = 8 +_ECD_LOCATION = 9 + +# The "central directory" structure, magic number, size, and indices +# of entries in the structure (section V.F in the format document) +structCentralDir = "<4s4B4Hl2L5H2L" +magicCentralDir = "PK\001\002" +sizeCentralDir = struct.calcsize(structCentralDir) + _CD_SIGNATURE = 0 _CD_CREATE_VERSION = 1 _CD_CREATE_SYSTEM = 2 _CD_EXTRACT_VERSION = 3 -_CD_EXTRACT_SYSTEM = 4 # is this meaningful? +_CD_EXTRACT_SYSTEM = 4 _CD_FLAG_BITS = 5 _CD_COMPRESS_TYPE = 6 _CD_TIME = 7 @@ -65,10 +83,15 @@ _CD_EXTERNAL_FILE_ATTRIBUTES = 17 _CD_LOCAL_HEADER_OFFSET = 18 -# indexes of entries in the local file header structure +# The "local file header" structure, magic number, size, and indices +# (section V.A in the format document) +structFileHeader = "<4s2B4Hl2L2H" +magicFileHeader = "PK\003\004" +sizeFileHeader = struct.calcsize(structFileHeader) + _FH_SIGNATURE = 0 _FH_EXTRACT_VERSION = 1 -_FH_EXTRACT_SYSTEM = 2 # is this meaningful? +_FH_EXTRACT_SYSTEM = 2 _FH_GENERAL_PURPOSE_FLAG_BITS = 3 _FH_COMPRESSION_METHOD = 4 _FH_LAST_MOD_TIME = 5 @@ -79,6 +102,28 @@ _FH_FILENAME_LENGTH = 10 _FH_EXTRA_FIELD_LENGTH = 11 +# The "Zip64 end of central directory locator" structure, magic number, and size +structEndCentDir64Locator = "<4sLQL" +magicEndCentDir64Locator = "PK\x06\x07" +sizeEndCentDir64Locator = struct.calcsize(structEndCentDir64Locator) + +# The "Zip64 end of central directory" record, magic number, size, and indices +# (section V.G in the format document) +structEndCentDir64 = "<4sQ2H2L4Q" +magicEndCentDir64 = "PK\x06\x06" +sizeEndCentDir64 = struct.calcsize(structEndCentDir64) + +_CD64_SIGNATURE = 0 +_CD64_DIRECTORY_RECSIZE = 1 +_CD64_CREATE_VERSION = 2 +_CD64_EXTRACT_VERSION = 3 +_CD64_DISK_NUMBER = 4 +_CD64_DISK_NUMBER_START = 5 +_CD64_NUMBER_ENTRIES_THIS_DISK = 6 +_CD64_NUMBER_ENTRIES_TOTAL = 7 +_CD64_DIRECTORY_SIZE = 8 +_CD64_OFFSET_START_CENTDIR = 9 + def is_zipfile(filename): """Quickly see if file is a ZIP file by checking the magic number.""" try: @@ -95,75 +140,93 @@ """ Read the ZIP64 end-of-archive records and use that to update endrec """ - locatorSize = struct.calcsize(structEndArchive64Locator) - fpin.seek(offset - locatorSize, 2) - data = fpin.read(locatorSize) - sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data) - if sig != stringEndArchive64Locator: + fpin.seek(offset - sizeEndCentDir64Locator, 2) + data = fpin.read(sizeEndCentDir64Locator) + sig, diskno, reloff, disks = struct.unpack(structEndCentDir64Locator, data) + if sig != magicEndCentDir64Locator: return endrec if diskno != 0 or disks != 1: raise BadZipfile("zipfiles that span multiple disks are not supported") # Assume no 'zip64 extensible data' - endArchiveSize = struct.calcsize(structEndArchive64) - fpin.seek(offset - locatorSize - endArchiveSize, 2) - data = fpin.read(endArchiveSize) + fpin.seek(offset - sizeEndCentDir64Locator - sizeEndCentDir64, 2) + data = fpin.read(sizeEndCentDir64) sig, sz, create_version, read_version, disk_num, disk_dir, \ dircount, dircount2, dirsize, diroffset = \ - struct.unpack(structEndArchive64, data) - if sig != stringEndArchive64: + struct.unpack(structEndCentDir64, data) + if sig != magicEndCentDir64: return endrec # Update the original endrec using data from the ZIP64 record - endrec[1] = disk_num - endrec[2] = disk_dir - endrec[3] = dircount - endrec[4] = dircount2 - endrec[5] = dirsize - endrec[6] = diroffset + endrec[_ECD_DISK_NUMBER] = disk_num + endrec[_ECD_DISK_START] = disk_dir + endrec[_ECD_ENTRIES_THIS_DISK] = dircount + endrec[_ECD_ENTRIES_TOTAL] = dircount2 + endrec[_ECD_SIZE] = dirsize + endrec[_ECD_OFFSET] = diroffset return endrec - def _EndRecData(fpin): """Return data from the "End of Central Directory" record, or None. The data is a list of the nine items in the ZIP "End of central dir" record followed by a tenth item, the file seek offset of this record.""" - fpin.seek(-22, 2) # Assume no archive comment. - filesize = fpin.tell() + 22 # Get file size + + # Determine file size + fpin.seek(0, 2) + filesize = fpin.tell() + + # Check to see if this is ZIP file with no archive comment (the + # "end of central directory" structure should be the last item in the + # file if this is the case). + fpin.seek(-sizeEndCentDir, 2) data = fpin.read() - if data[0:4] == stringEndArchive and data[-2:] == "\000\000": - endrec = struct.unpack(structEndArchive, data) + if data[0:4] == magicEndCentDir and data[-2:] == "\000\000": + # the signature is correct and there's no comment, unpack structure + endrec = struct.unpack(structEndCentDir, data) endrec = list(endrec) - endrec.append("") # Append the archive comment - endrec.append(filesize - 22) # Append the record start offset - if endrec[-4] == -1 or endrec[-4] == 0xffffffff: - return _EndRecData64(fpin, -22, endrec) + + # Append a blank comment and record start offset + endrec.append("") + endrec.append(filesize - sizeEndCentDir) + if endrec[_ECD_OFFSET] == 0xffffffff: + # the value for the "offset of the start of the central directory" + # indicates that there is a "Zip64 end of central directory" + # structure present, so go look for it + return _EndRecData64(fpin, -sizeEndCentDir, endrec) + return endrec - # Search the last END_BLOCK bytes of the file for the record signature. - # The comment is appended to the ZIP file and has a 16 bit length. - # So the comment may be up to 64K long. We limit the search for the - # signature to a few Kbytes at the end of the file for efficiency. - # also, the signature must not appear in the comment. - END_BLOCK = min(filesize, 1024 * 4) - fpin.seek(filesize - END_BLOCK, 0) + + # Either this is not a ZIP file, or it is a ZIP file with an archive + # comment. Search the end of the file for the "end of central directory" + # record signature. The comment is the last item in the ZIP file and may be + # up to 64K long. It is assumed that the "end of central directory" magic + # number does not appear in the comment. + maxCommentStart = max(filesize - (1 << 16) - sizeEndCentDir, 0) + fpin.seek(maxCommentStart, 0) data = fpin.read() - start = data.rfind(stringEndArchive) - if start >= 0: # Correct signature string was found - endrec = struct.unpack(structEndArchive, data[start:start+22]) - endrec = list(endrec) - comment = data[start+22:] - if endrec[7] == len(comment): # Comment length checks out + start = data.rfind(magicEndCentDir) + if start >= 0: + # found the magic number; attempt to unpack and interpret + recData = data[start:start+sizeEndCentDir] + endrec = list(struct.unpack(structEndCentDir, recData)) + comment = data[start+sizeEndCentDir:] + # check that comment length is correct + if endrec[_ECD_COMMENT_SIZE] == len(comment): # Append the archive comment and start offset endrec.append(comment) - endrec.append(filesize - END_BLOCK + start) - if endrec[-4] == -1 or endrec[-4] == 0xffffffff: - return _EndRecData64(fpin, - END_BLOCK + start, endrec) + endrec.append(maxCommentStart + start) + if endrec[_ECD_OFFSET] == 0xffffffff: + # There is apparently a "Zip64 end of central directory" + # structure present, so go look for it + return _EndRecData64(fpin, start - filesize, endrec) return endrec - return # Error, return None + # Unable to find a valid end of central directory structure + return + class ZipInfo (object): """Class with attributes describing each file in the ZIP archive.""" @@ -247,12 +310,12 @@ fmt = '= 24: - counts = unpack(' 1: print endrec - size_cd = endrec[5] # bytes in central directory - offset_cd = endrec[6] # offset of central directory - self.comment = endrec[8] # archive comment - # endrec[9] is the offset of the "End of Central Dir" record - if endrec[9] > ZIP64_LIMIT: - x = endrec[9] - size_cd - 56 - 20 - else: - x = endrec[9] - size_cd + size_cd = endrec[_ECD_SIZE] # bytes in central directory + offset_cd = endrec[_ECD_OFFSET] # offset of central directory + self.comment = endrec[_ECD_COMMENT] # archive comment + # "concat" is zero, unless zip was concatenated to another file - concat = x - offset_cd + concat = endrec[_ECD_LOCATION] - size_cd - offset_cd + if endrec[_ECD_LOCATION] > ZIP64_LIMIT: + # If the offset of the "End of Central Dir" record requires Zip64 + # extension structures, account for them + concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator) + if self.debug > 2: - print "given, inferred, offset", offset_cd, x, concat + inferred = concat + offset_cd + print "given, inferred, offset", offset_cd, inferred, concat # self.start_dir: Position of start of central directory self.start_dir = offset_cd + concat fp.seek(self.start_dir, 0) @@ -662,9 +728,8 @@ fp = cStringIO.StringIO(data) total = 0 while total < size_cd: - centdir = fp.read(46) - total = total + 46 - if centdir[0:4] != stringCentralDir: + centdir = fp.read(sizeCentralDir) + if centdir[0:4] != magicCentralDir: raise BadZipfile, "Bad magic number for central directory" centdir = struct.unpack(structCentralDir, centdir) if self.debug > 2: @@ -674,9 +739,6 @@ x = ZipInfo(filename) x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH]) x.comment = fp.read(centdir[_CD_COMMENT_LENGTH]) - total = (total + centdir[_CD_FILENAME_LENGTH] - + centdir[_CD_EXTRA_FIELD_LENGTH] - + centdir[_CD_COMMENT_LENGTH]) x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET] (x.create_version, x.create_system, x.extract_version, x.reserved, x.flag_bits, x.compress_type, t, d, @@ -690,6 +752,12 @@ x.header_offset = x.header_offset + concat self.filelist.append(x) self.NameToInfo[x.filename] = x + + # update total bytes read from central directory + total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH] + + centdir[_CD_EXTRA_FIELD_LENGTH] + + centdir[_CD_COMMENT_LENGTH]) + if self.debug > 2: print "total", total @@ -721,7 +789,6 @@ except BadZipfile: return zinfo.filename - def getinfo(self, name): """Return the instance of ZipInfo given 'name'.""" info = self.NameToInfo.get(name) @@ -762,8 +829,8 @@ zef_file.seek(zinfo.header_offset, 0) # Skip the file header: - fheader = zef_file.read(30) - if fheader[0:4] != stringFileHeader: + fheader = zef_file.read(sizeFileHeader) + if fheader[0:4] != magicFileHeader: raise BadZipfile, "Bad magic number for file header" fheader = struct.unpack(structFileHeader, fheader) @@ -1016,15 +1083,15 @@ or zinfo.compress_size > ZIP64_LIMIT: extra.append(zinfo.file_size) extra.append(zinfo.compress_size) - file_size = 0xffffffff #-1 - compress_size = 0xffffffff #-1 + file_size = 0xffffffff + compress_size = 0xffffffff else: file_size = zinfo.file_size compress_size = zinfo.compress_size if zinfo.header_offset > ZIP64_LIMIT: extra.append(zinfo.header_offset) - header_offset = -1 # struct "l" format: 32 one bits + header_offset = 0xFFFFFFFF else: header_offset = zinfo.header_offset @@ -1042,7 +1109,7 @@ create_version = zinfo.create_version centdir = struct.pack(structCentralDir, - stringCentralDir, create_version, + magicCentralDir, create_version, zinfo.create_system, extract_version, zinfo.reserved, zinfo.flag_bits, zinfo.compress_type, dostime, dosdate, zinfo.CRC, compress_size, file_size, @@ -1056,31 +1123,38 @@ pos2 = self.fp.tell() # Write end-of-zip-archive record + centDirOffset = pos1 if pos1 > ZIP64_LIMIT: # Need to write the ZIP64 end-of-archive records zip64endrec = struct.pack( - structEndArchive64, stringEndArchive64, + structEndCentDir64, magicEndCentDir64, 44, 45, 45, 0, 0, count, count, pos2 - pos1, pos1) self.fp.write(zip64endrec) zip64locrec = struct.pack( - structEndArchive64Locator, - stringEndArchive64Locator, 0, pos2, 1) + structEndCentDir64Locator, + magicEndCentDir64Locator, 0, pos2, 1) self.fp.write(zip64locrec) + centDirOffset = 0xFFFFFFFF - # XXX Why is `pos3` computed next? It's never referenced. - pos3 = self.fp.tell() - endrec = struct.pack(structEndArchive, stringEndArchive, - 0, 0, count, count, pos2 - pos1, -1, 0) - self.fp.write(endrec) + # check for valid comment length + if len(self.comment) >= ZIP_MAX_COMMENT: + if self.debug > 0: + msg = 'Archive comment is too long; truncating to %d bytes' \ + % ZIP_MAX_COMMENT + self.comment = self.comment[:ZIP_MAX_COMMENT] - else: - endrec = struct.pack(structEndArchive, stringEndArchive, - 0, 0, count, count, pos2 - pos1, pos1, 0) - self.fp.write(endrec) + endrec = struct.pack(structEndCentDir, magicEndCentDir, + 0, 0, count % ZIP_FILECOUNT_LIMIT, + count % ZIP_FILECOUNT_LIMIT, pos2 - pos1, + centDirOffset, len(self.comment)) + self.fp.write(endrec) + self.fp.write(self.comment) self.fp.flush() + if not self._filePassed: self.fp.close() + self.fp = None Index: Lib/test/test_zipfile64.py =================================================================== --- Lib/test/test_zipfile64.py (revision 59929) +++ Lib/test/test_zipfile64.py (working copy) @@ -2,6 +2,7 @@ # The test_support.requires call is the only reason for keeping this separate # from test_zipfile from test import test_support + # XXX(nnorwitz): disable this test by looking for extra largfile resource # which doesn't exist. This test takes over 30 minutes to run in general # and requires more disk space than most of the buildbots. @@ -94,8 +95,31 @@ if os.path.exists(fname): os.remove(fname) + +class OtherTests(unittest.TestCase): + def testMoreThan64kFiles(self): + # This test checks that more than 64k files can be added to an archive, + # and that the resulting archive can be read properly by ZipFile + zipf = zipfile.ZipFile(TESTFN, mode="w") + zipf.debug = 100 + numfiles = (1 << 16) * 3/2 + for i in xrange(numfiles): + zipf.writestr("foo%08d" % i, "%d" % (i**3 % 57)) + self.assertEqual(len(zipf.namelist()), numfiles) + zipf.close() + + zipf2 = zipfile.ZipFile(TESTFN, mode="r") + self.assertEqual(len(zipf2.namelist()), numfiles) + for i in xrange(numfiles): + self.assertEqual(zipf2.read("foo%08d" % i), "%d" % (i**3 % 57)) + zipf.close() + + def tearDown(self): + test_support.unlink(TESTFN) + test_support.unlink(TESTFN2) + def test_main(): - run_unittest(TestsWithSourceFile) + run_unittest(TestsWithSourceFile, OtherTests) if __name__ == "__main__": test_main() Index: Lib/test/test_zipfile.py =================================================================== --- Lib/test/test_zipfile.py (revision 59929) +++ Lib/test/test_zipfile.py (working copy) @@ -681,6 +681,54 @@ zipf.writestr("foo.txt\x00qqq", "O, for a Muse of Fire!") self.assertEqual(zipf.namelist(), ['foo.txt']) + def test_StructSizes(self): + # check that ZIP internal structure sizes are calculated correctly + self.assertEqual(zipfile.sizeEndCentDir, 22) + self.assertEqual(zipfile.sizeCentralDir, 46) + self.assertEqual(zipfile.sizeEndCentDir64, 56) + self.assertEqual(zipfile.sizeEndCentDir64Locator, 20) + + def testComments(self): + # This test checks that comments on the archive are handled properly + + # check default comment is empty + zipf = zipfile.ZipFile(TESTFN, mode="w") + self.assertEqual(zipf.comment, '') + zipf.writestr("foo.txt", "O, for a Muse of Fire!") + zipf.close() + zipfr = zipfile.ZipFile(TESTFN, mode="r") + self.assertEqual(zipfr.comment, '') + zipfr.close() + + # check a simple short comment + comment = 'Bravely taking to his feet, he beat a very brave retreat.' + zipf = zipfile.ZipFile(TESTFN, mode="w") + zipf.comment = comment + zipf.writestr("foo.txt", "O, for a Muse of Fire!") + zipf.close() + zipfr = zipfile.ZipFile(TESTFN, mode="r") + self.assertEqual(zipfr.comment, comment) + zipfr.close() + + # check a comment of max length + comment2 = ''.join(['%d' % (i**3 % 10) for i in xrange((1 << 16)-1)]) + zipf = zipfile.ZipFile(TESTFN, mode="w") + zipf.comment = comment2 + zipf.writestr("foo.txt", "O, for a Muse of Fire!") + zipf.close() + zipfr = zipfile.ZipFile(TESTFN, mode="r") + self.assertEqual(zipfr.comment, comment2) + zipfr.close() + + # check a comment that is too long is truncated + zipf = zipfile.ZipFile(TESTFN, mode="w") + zipf.comment = comment2 + 'oops' + zipf.writestr("foo.txt", "O, for a Muse of Fire!") + zipf.close() + zipfr = zipfile.ZipFile(TESTFN, mode="r") + self.assertEqual(zipfr.comment, comment2) + zipfr.close() + def tearDown(self): support.unlink(TESTFN) support.unlink(TESTFN2)