Index: Lib/test/test_zipfile64.py =================================================================== --- Lib/test/test_zipfile64.py (revision 0) +++ Lib/test/test_zipfile64.py (revision 0) @@ -0,0 +1,67 @@ +# Tests of the full ZIP64 functionality of zipfile +# The test_support.requires call is the only reason for keeping this separate +# from test_zipfile +from test import test_support +test_support.requires( + 'largefile', + 'test requires loads of disk-space bytes and a long time to run' + ) + +# We can test part of the module without zlib. +try: + import zlib +except ImportError: + zlib = None + +import zipfile, os, unittest + +from StringIO import StringIO +from tempfile import TemporaryFile + +from test.test_support import TESTFN, run_unittest + +TESTFN2 = TESTFN + "2" + +class TestsWithSourceFile(unittest.TestCase): + def setUp(self): + line_gen = ("Test of zipfile line %d." % i for i in range(0, 1000000)) + self.data = '\n'.join(line_gen) + + # Make a source file with some lines + fp = open(TESTFN, "wb") + fp.write(self.data) + fp.close() + + def zipTest(self, f, compression): + # Create the ZIP archive + filecount = ((1 << 32) / len(self.data)) * 2 + zipfp = zipfile.ZipFile(f, "w", compression, allowZip64=True) + + for num in range(filecount): + zipfp.writestr("testfn%d"%(num,), self.data) + zipfp.close() + + # Read the ZIP archive + zipfp = zipfile.ZipFile(f, "r", compression) + for num in range(filecount): + self.assertEqual(zipfp.read("testfn%d"%(num,)), self.data) + zipfp.close() + + def testStored(self): + for f in (TESTFN2, TemporaryFile()): + self.zipTest(f, zipfile.ZIP_STORED) + + if zlib: + def testDeflated(self): + for f in (TESTFN2, TemporaryFile()): + self.zipTest(f, zipfile.ZIP_DEFLATED) + + def tearDown(self): + os.remove(TESTFN) + os.remove(TESTFN2) + +def test_main(): + run_unittest(TestsWithSourceFile) + +if __name__ == "__main__": + test_main() Index: Lib/test/test_zipfile.py =================================================================== --- Lib/test/test_zipfile.py (revision 46259) +++ Lib/test/test_zipfile.py (working copy) @@ -28,12 +28,14 @@ zipfp = zipfile.ZipFile(f, "w", compression) zipfp.write(TESTFN, "another"+os.extsep+"name") zipfp.write(TESTFN, TESTFN) + zipfp.writestr("strfile", self.data) zipfp.close() # Read the ZIP archive zipfp = zipfile.ZipFile(f, "r", compression) self.assertEqual(zipfp.read(TESTFN), self.data) self.assertEqual(zipfp.read("another"+os.extsep+"name"), self.data) + self.assertEqual(zipfp.read("strfile"), self.data) zipfp.close() def testStored(self): @@ -59,6 +61,79 @@ os.remove(TESTFN) os.remove(TESTFN2) +class TestZip64InSmallFiles(unittest.TestCase): + # These tests test the ZIP64 functionality without using large files, + # see test_zipfile64 for proper tests. + + def setUp(self): + self._limit = zipfile.ZIP64_LIMIT + zipfile.ZIP64_LIMIT = 5 + + line_gen = ("Test of zipfile line %d." % i for i in range(0, 1000)) + self.data = '\n'.join(line_gen) + + # Make a source file with some lines + fp = open(TESTFN, "wb") + fp.write(self.data) + fp.close() + + def largeFileExceptionTest(self, f, compression): + zipfp = zipfile.ZipFile(f, "w", compression) + self.assertRaises(zipfile.LargeZipFile, + zipfp.write, TESTFN, "another"+os.extsep+"name") + zipfp.close() + + def largeFileExceptionTest2(self, f, compression): + zipfp = zipfile.ZipFile(f, "w", compression) + self.assertRaises(zipfile.LargeZipFile, + zipfp.writestr, "another"+os.extsep+"name", self.data) + zipfp.close() + + def testLargeFileException(self): + for f in (TESTFN2, TemporaryFile(), StringIO()): + self.largeFileExceptionTest(f, zipfile.ZIP_STORED) + self.largeFileExceptionTest2(f, zipfile.ZIP_STORED) + + def zipTest(self, f, compression): + # Create the ZIP archive + zipfp = zipfile.ZipFile(f, "w", compression, allowZip64=True) + zipfp.write(TESTFN, "another"+os.extsep+"name") + zipfp.write(TESTFN, TESTFN) + zipfp.writestr("strfile", self.data) + zipfp.close() + + # Read the ZIP archive + zipfp = zipfile.ZipFile(f, "r", compression) + self.assertEqual(zipfp.read(TESTFN), self.data) + self.assertEqual(zipfp.read("another"+os.extsep+"name"), self.data) + self.assertEqual(zipfp.read("strfile"), self.data) + zipfp.close() + + def testStored(self): + for f in (TESTFN2, TemporaryFile(), StringIO()): + self.zipTest(f, zipfile.ZIP_STORED) + + + if zlib: + def testDeflated(self): + for f in (TESTFN2, TemporaryFile(), StringIO()): + self.zipTest(f, zipfile.ZIP_DEFLATED) + + def testAbsoluteArcnames(self): + zipfp = zipfile.ZipFile(TESTFN2, "w", zipfile.ZIP_STORED, allowZip64=True) + zipfp.write(TESTFN, "/absolute") + zipfp.close() + + zipfp = zipfile.ZipFile(TESTFN2, "r", zipfile.ZIP_STORED) + self.assertEqual(zipfp.namelist(), ["absolute"]) + zipfp.close() + + + def tearDown(self): + zipfile.ZIP64_LIMIT = self._limit + os.remove(TESTFN) + os.remove(TESTFN2) + class OtherTests(unittest.TestCase): def testCloseErroneousFile(self): # This test checks that the ZipFile constructor closes the file object @@ -103,7 +178,8 @@ self.assertRaises(RuntimeError, zipf.testzip) def test_main(): - run_unittest(TestsWithSourceFile, OtherTests) + run_unittest(TestsWithSourceFile, TestZip64InSmallFiles, OtherTests) + #run_unittest(TestZip64InSmallFiles) if __name__ == "__main__": test_main() Index: Lib/zipfile.py =================================================================== --- Lib/zipfile.py (revision 46259) +++ Lib/zipfile.py (working copy) @@ -1,7 +1,8 @@ -"Read and write ZIP files." - +""" +Read and write ZIP files. +""" import struct, os, time, sys -import binascii +import binascii, cStringIO try: import zlib # We may need its compression method @@ -9,12 +10,21 @@ zlib = None __all__ = ["BadZipfile", "error", "ZIP_STORED", "ZIP_DEFLATED", "is_zipfile", - "ZipInfo", "ZipFile", "PyZipFile"] + "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile" ] class BadZipfile(Exception): pass + +class LargeZipFile(Exception): + """ + Raised when writing a zipfile, the zipfile requires ZIP64 extensions + and those extensions are disabled. + """ + error = BadZipfile # The exception raised by this module +ZIP64_LIMIT= 1 << 30 + # constants for Zip file compression methods ZIP_STORED = 0 ZIP_DEFLATED = 8 @@ -27,7 +37,12 @@ stringCentralDir = "PK\001\002" # magic number for central directory structFileHeader = "<4s2B4HlLL2H" # 12 items, file header record, 30 bytes stringFileHeader = "PK\003\004" # magic number for file header +structEndArchive64Locator = "<4slql" # 4 items, locate Zip64 header, 20 bytes +stringEndArchive64Locator = "PK\x06\x07" # magic token for locator header +structEndArchive64 = "<4sqhhllqqqq" # 10 items, end of archive (Zip64), 56 bytes +stringEndArchive64 = "PK\x06\x06" # magic token for Zip64 header + # indexes of entries in the central directory structure _CD_SIGNATURE = 0 _CD_CREATE_VERSION = 1 @@ -75,6 +90,40 @@ pass return False +def _EndRecData64(fpin, offset, endrec): + """ + Read the ZIP64 end-of-archive records and use that to update endrec + """ + locatorSize = struct.calcsize(structEndArchive64Locator) + fpin.seek(offset - locatorSize, 2) + data = fpin.read(locatorSize) + sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data) + if sig != stringEndArchive64Locator: + return endrec + + assert diskno == 0 + assert disks == 1 + + # Assume no 'zip64 extensible data' + endArchiveSize = struct.calcsize(structEndArchive64) + fpin.seek(offset - locatorSize - endArchiveSize, 2) + data = fpin.read(endArchiveSize) + sig, sz, create_version, read_version, disk_num, disk_dir, \ + dircount, dircount2, dirsize, diroffset = \ + struct.unpack(structEndArchive64, data) + if sig != stringEndArchive64: + return endrec + + # Update the original endrec using data from the ZIP64 record + endrec[1] = disk_num + endrec[2] = disk_dir + endrec[3] = dircount + endrec[4] = dircount2 + endrec[5] = dirsize + endrec[6] = diroffset + return endrec + + def _EndRecData(fpin): """Return data from the "End of Central Directory" record, or None. @@ -88,6 +137,8 @@ endrec = list(endrec) endrec.append("") # Append the archive comment endrec.append(filesize - 22) # Append the record start offset + if endrec[-4] == -1: + return _EndRecData64(fpin, -22, endrec) return endrec # Search the last END_BLOCK bytes of the file for the record signature. # The comment is appended to the ZIP file and has a 16 bit length. @@ -106,25 +157,50 @@ # Append the archive comment and start offset endrec.append(comment) endrec.append(filesize - END_BLOCK + start) + if endrec[-4] == -1: + return _EndRecData64(fpin, - END_BLOCK + start, endrec) return endrec return # Error, return None -class ZipInfo: +class ZipInfo (object): """Class with attributes describing each file in the ZIP archive.""" + __slots__ = ( + 'orig_filename', + 'filename', + 'date_time', + 'compress_type', + 'comment', + 'extra', + 'create_system', + 'create_version', + 'extract_version', + 'reserved', + 'flag_bits', + 'volume', + 'internal_attr', + 'external_attr', + 'header_offset', + 'CRC', + 'compress_size', + 'file_size', + ) + def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)): self.orig_filename = filename # Original file name in archive -# Terminate the file name at the first null byte. Null bytes in file -# names are used as tricks by viruses in archives. + + # Terminate the file name at the first null byte. Null bytes in file + # names are used as tricks by viruses in archives. null_byte = filename.find(chr(0)) if null_byte >= 0: filename = filename[0:null_byte] -# This is used to ensure paths in generated ZIP files always use -# forward slashes as the directory separator, as required by the -# ZIP format specification. - if os.sep != "/": + # This is used to ensure paths in generated ZIP files always use + # forward slashes as the directory separator, as required by the + # ZIP format specification. + if os.sep != "/" and os.sep in filename: filename = filename.replace(os.sep, "/") + self.filename = filename # Normalized file name self.date_time = date_time # year, month, day, hour, min, sec # Standard values: @@ -145,7 +221,6 @@ self.external_attr = 0 # External file attributes # Other attributes are set by class ZipFile: # header_offset Byte offset to the file header - # file_offset Byte offset to the start of the file data # CRC CRC-32 of the uncompressed file # compress_size Size of the compressed file # file_size Size of the uncompressed file @@ -162,14 +237,65 @@ CRC = self.CRC compress_size = self.compress_size file_size = self.file_size + + extra = self.extra + + if file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT: + # File is larger than what fits into a 4 byte integer, + # fall back to the ZIP64 extension + fmt = '= 24: + counts = unpack(' ZIP64_LIMIT: + x = endrec[9] - size_cd - 56 - 20 + else: + x = endrec[9] - size_cd # "concat" is zero, unless zip was concatenated to another file concat = x - offset_cd if self.debug > 2: @@ -258,6 +388,8 @@ # self.start_dir: Position of start of central directory self.start_dir = offset_cd + concat fp.seek(self.start_dir, 0) + data = fp.read(size_cd) + fp = cStringIO.StringIO(data) total = 0 while total < size_cd: centdir = fp.read(46) @@ -275,8 +407,7 @@ total = (total + centdir[_CD_FILENAME_LENGTH] + centdir[_CD_EXTRA_FIELD_LENGTH] + centdir[_CD_COMMENT_LENGTH]) - x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET] + concat - # file_offset must be computed below... + x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET] (x.create_version, x.create_system, x.extract_version, x.reserved, x.flag_bits, x.compress_type, t, d, x.CRC, x.compress_size, x.file_size) = centdir[1:12] @@ -284,29 +415,15 @@ # Convert date/time code to (year, month, day, hour, min, sec) x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F, t>>11, (t>>5)&0x3F, (t&0x1F) * 2 ) + + x._decodeExtra() + x.header_offset = x.header_offset + concat self.filelist.append(x) self.NameToInfo[x.filename] = x if self.debug > 2: print "total", total - for data in self.filelist: - fp.seek(data.header_offset, 0) - fheader = fp.read(30) - if fheader[0:4] != stringFileHeader: - raise BadZipfile, "Bad magic number for file header" - fheader = struct.unpack(structFileHeader, fheader) - # file_offset is computed here, since the extra field for - # the central directory and for the local file header - # refer to different fields, and they can have different - # lengths - data.file_offset = (data.header_offset + 30 - + fheader[_FH_FILENAME_LENGTH] - + fheader[_FH_EXTRA_FIELD_LENGTH]) - fname = fp.read(fheader[_FH_FILENAME_LENGTH]) - if fname != data.orig_filename: - raise RuntimeError, \ - 'File name in directory "%s" and header "%s" differ.' % ( - data.orig_filename, fname) + def namelist(self): """Return a list of file names in the archive.""" l = [] @@ -324,7 +441,7 @@ print "%-46s %19s %12s" % ("File Name", "Modified ", "Size") for zinfo in self.filelist: date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time - print "%-46s %s %12d" % (zinfo.filename, date, zinfo.file_size) + print "%-46s %s %12d %12d" % (zinfo.filename, date, zinfo.file_size) def testzip(self): """Read all the files and check the CRC.""" @@ -334,6 +451,7 @@ except BadZipfile: return zinfo.filename + def getinfo(self, name): """Return the instance of ZipInfo given 'name'.""" return self.NameToInfo[name] @@ -347,7 +465,24 @@ "Attempt to read ZIP archive that was already closed" zinfo = self.getinfo(name) filepos = self.fp.tell() - self.fp.seek(zinfo.file_offset, 0) + + self.fp.seek(zinfo.header_offset, 0) + + # Skip the file header: + fheader = self.fp.read(30) + if fheader[0:4] != stringFileHeader: + raise BadZipfile, "Bad magic number for file header" + + fheader = struct.unpack(structFileHeader, fheader) + fname = self.fp.read(fheader[_FH_FILENAME_LENGTH]) + if fheader[_FH_EXTRA_FIELD_LENGTH]: + self.fp.read(fheader[_FH_EXTRA_FIELD_LENGTH]) + + if fname != zinfo.orig_filename: + raise BadZipfile, \ + 'File name in directory "%s" and header "%s" differ.' % ( + zinfo.orig_filename, fname) + bytes = self.fp.read(zinfo.compress_size) self.fp.seek(filepos, 0) if zinfo.compress_type == ZIP_STORED: @@ -388,6 +523,12 @@ if zinfo.compress_type not in (ZIP_STORED, ZIP_DEFLATED): raise RuntimeError, \ "That compression method is not supported" + if zinfo.file_size > ZIP64_LIMIT: + if not self._allowZip64: + raise LargeZipFile("Filesize would require ZIP64 extensions") + if zinfo.header_offset > ZIP64_LIMIT: + if not self._allowZip64: + raise LargeZipFile("Zipfile size would require ZIP64 extensions") def write(self, filename, arcname=None, compress_type=None): """Put the bytes from filename into the archive under the name @@ -407,16 +548,18 @@ zinfo.compress_type = self.compression else: zinfo.compress_type = compress_type + + zinfo.file_size = st.st_size + zinfo.flag_bits = 0x00 + zinfo.header_offset = self.fp.tell() # Start of header bytes + self._writecheck(zinfo) fp = open(filename, "rb") - zinfo.flag_bits = 0x00 - zinfo.header_offset = self.fp.tell() # Start of header bytes # Must overwrite CRC and sizes with correct data later zinfo.CRC = CRC = 0 zinfo.compress_size = compress_size = 0 zinfo.file_size = file_size = 0 self.fp.write(zinfo.FileHeader()) - zinfo.file_offset = self.fp.tell() # Start of file bytes if zinfo.compress_type == ZIP_DEFLATED: cmpr = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, -15) @@ -461,8 +604,9 @@ zinfo.compress_type = self.compression else: zinfo = zinfo_or_arcname + zinfo.file_size = len(bytes) # Uncompressed size + zinfo.header_offset = self.fp.tell() # Start of header bytes self._writecheck(zinfo) - zinfo.file_size = len(bytes) # Uncompressed size zinfo.CRC = binascii.crc32(bytes) # CRC-32 checksum if zinfo.compress_type == ZIP_DEFLATED: co = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION, @@ -473,8 +617,8 @@ zinfo.compress_size = zinfo.file_size zinfo.header_offset = self.fp.tell() # Start of header bytes self.fp.write(zinfo.FileHeader()) - zinfo.file_offset = self.fp.tell() # Start of file bytes self.fp.write(bytes) + self.fp.flush() if zinfo.flag_bits & 0x08: # Write CRC and file sizes after the file data self.fp.write(struct.pack(" ZIP64_LIMIT \ + or zinfo.compress_size > ZIP64_LIMIT: + extra.append(zinfo.file_size) + extra.append(zinfo.compress_size) + file_size = -1 + compress_size = -1 + else: + file_size = zinfo.file_size + compress_size = zinfo.compress_size + + if zinfo.header_offset > ZIP64_LIMIT: + extra.append(zinfo.header_offset) + header_offset = -1 + else: + header_offset = zinfo.header_offset + + extra_data = zinfo.extra + if extra: + # Append a ZIP64 field to the extra's + extra_data = struct.pack( + ' ZIP64_LIMIT: + # Need to write the ZIP64 end-of-archive records + zip64endrec = struct.pack( + structEndArchive64, stringEndArchive64, + 44, 45, 45, 0, 0, count, count, pos2 - pos1, pos1) + self.fp.write(zip64endrec) + + zip64locrec = struct.pack( + structEndArchive64Locator, + stringEndArchive64Locator, 0, pos2, 1) + self.fp.write(zip64locrec) + + pos3 = self.fp.tell() + endrec = struct.pack(structEndArchive, stringEndArchive, + 0, 0, count, count, pos2 - pos1, -1, 0) + self.fp.write(endrec) + + else: + endrec = struct.pack(structEndArchive, stringEndArchive, + 0, 0, count, count, pos2 - pos1, pos1, 0) + self.fp.write(endrec) self.fp.flush() if not self._filePassed: self.fp.close() @@ -619,3 +813,71 @@ if basename: archivename = "%s/%s" % (basename, archivename) return (fname, archivename) + + +def main(args = None): + import textwrap + USAGE=textwrap.dedent("""\ + Usage: + zipfile.py -l zipfile.zip # Show listing of a zipfile + zipfile.py -e zipfile.zip target # Extract zipfile into target dir + zipfile.py -c zipfile.zip src ... # Create zipfile from sources + """) + if args is None: + args = sys.argv[1:] + + if not args or args[0] not in ('-l', '-c', '-e'): + print USAGE + sys.exit(1) + + if args[0] == '-l': + if len(args) != 2: + print USAGE + sys.exit(1) + zf = ZipFile(args[1], 'r') + zf.printdir() + zf.close() + + elif args[0] == '-e': + if len(args) != 3: + print USAGE + sys.exit(1) + + zf = ZipFile(args[1], 'r') + out = args[2] + for path in zf.namelist(): + if path.startswith('./'): + tgt = os.path.join(out, path[2:]) + else: + tgt = os.path.join(out, path) + + tgtdir = os.path.dirname(tgt) + if not os.path.exists(tgtdir): + os.makedirs(tgtdir) + fp = open(tgt, 'wb') + fp.write(zf.read(path)) + fp.close() + zf.close() + + elif args[0] == '-c': + if len(args) < 3: + print USAGE + sys.exit(1) + + def addToZip(zf, path, zippath): + if os.path.isfile(path): + zf.write(path, zippath, ZIP_DEFLATED) + elif os.path.isdir(path): + for nm in os.listdir(path): + addToZip(zf, + os.path.join(path, nm), os.path.join(zippath, nm)) + # else: ignore + + zf = ZipFile(args[1], 'w', allowZip64=True) + for src in args[2:]: + addToZip(zf, src, os.path.basename(src)) + + zf.close() + +if __name__ == "__main__": + main() Index: Doc/lib/libzipfile.tex =================================================================== --- Doc/lib/libzipfile.tex (revision 46259) +++ Doc/lib/libzipfile.tex (working copy) @@ -17,7 +17,9 @@ Note}. This module does not currently handle ZIP files which have appended -comments, or multi-disk ZIP files. +comments, or multi-disk ZIP files. It does optionally handle ZIP files +that use the ZIP64 extensions (that is ZIP files that are more than +4 GByte in size). The available attributes of this module are: @@ -25,6 +27,11 @@ The error raised for bad ZIP files. \end{excdesc} +\begin{excdesc}{LargeZipFile} + The error raised when a ZIP file would require ZIP64 functionality but that + has not been enabled. +\end{excdesc} + \begin{classdesc*}{ZipFile} The class for reading and writing ZIP files. See ``\citetitle{ZipFile Objects}'' (section \ref{zipfile-objects}) for @@ -77,7 +84,7 @@ \subsection{ZipFile Objects \label{zipfile-objects}} -\begin{classdesc}{ZipFile}{file\optional{, mode\optional{, compression}}} +\begin{classdesc}{ZipFile}{file\optional{, mode\optional{, compression\optional{, allowZip64}}}} Open a ZIP file, where \var{file} can be either a path to a file (a string) or a file-like object. The \var{mode} parameter should be \code{'r'} to read an existing file, \code{'w'} to @@ -100,6 +107,12 @@ is specified but the \refmodule{zlib} module is not available, \exception{RuntimeError} is also raised. The default is \constant{ZIP_STORED}. + If \var{allowZip64} is \code{True} zipfile will create zipfiles that use + the ZIP64 extensions when the zipfile is larger than 2GBytes. If it is + false (the default) zipfile will raise an exception when the zipfile would + require ZIP64 extensions. ZIP64 extensions are disabled by default because + the default zip and unzip commands on Unix (the InfoZIP utilities) don't + support these extensions. \end{classdesc} \begin{methoddesc}{close}{} @@ -132,8 +145,8 @@ \end{methoddesc} \begin{methoddesc}{testzip}{} - Read all the files in the archive and check their CRC's. Return the - name of the first bad file, or else return \code{None}. + Read all the files in the archive and check their CRC's and file + headers. Return the name of the first bad file, or else return \code{None}. \end{methoddesc} \begin{methoddesc}{write}{filename\optional{, arcname\optional{, @@ -284,10 +297,6 @@ Byte offset to the file header. \end{memberdesc} -\begin{memberdesc}[ZipInfo]{file_offset} - Byte offset to the start of the file data. -\end{memberdesc} - \begin{memberdesc}[ZipInfo]{CRC} CRC-32 of the uncompressed file. \end{memberdesc}