diff -r b0668185dc15 -r 23bc512c19b7 Lib/tarfile.py --- a/Lib/tarfile.py Tue Jun 25 23:13:47 2013 +0200 +++ b/Lib/tarfile.py Thu Jun 27 14:07:53 2013 +0200 @@ -33,7 +33,7 @@ __author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)" __date__ = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $" __cvsid__ = "$Id: tarfile.py 88586 2011-02-25 15:42:01Z marc-andre.lemburg $" -__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend." +__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend, Eduardo Robles." #--------- # Imports @@ -93,6 +93,8 @@ GNUTYPE_LONGNAME = b"L" # GNU tar longname GNUTYPE_LONGLINK = b"K" # GNU tar longlink GNUTYPE_SPARSE = b"S" # GNU tar sparse file +GNUTYPE_MULTIVOL = b"M" # GNU tar continuation of a file that began on + # another volume XHDTYPE = b"x" # POSIX.1-2001 extended header XGLTYPE = b"g" # POSIX.1-2001 global header @@ -111,7 +113,7 @@ SYMTYPE, DIRTYPE, FIFOTYPE, CONTTYPE, CHRTYPE, BLKTYPE, GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, - GNUTYPE_SPARSE) + GNUTYPE_SPARSE, GNUTYPE_MULTIVOL) # File types that will be treated as a regular file. REGULAR_TYPES = (REGTYPE, AREGTYPE, @@ -119,7 +121,7 @@ # File types that are part of the GNU tar format. GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, - GNUTYPE_SPARSE) + GNUTYPE_SPARSE, GNUTYPE_MULTIVOL) # Fields from a pax header that override a TarInfo attribute. PAX_FIELDS = ("path", "linkpath", "size", "mtime", @@ -189,6 +191,14 @@ s = s[:p] return s.decode(encoding, errors) +def sbtn(s, length, encoding, errors): + """Convert a string or a bunch of bytes to a null-terminated bytes object + of specific size. + """ + if isinstance(s, str): + s = s.encode(encoding, errors) + return s[:length] + (length - len(s)) * NUL + def nti(s): """Convert a number field to a python number. 
""" @@ -263,15 +273,15 @@ blocks, remainder = divmod(length, BUFSIZE) for b in range(blocks): buf = src.read(BUFSIZE) + dst.write(buf) if len(buf) < BUFSIZE: raise OSError("end of file reached") - dst.write(buf) if remainder != 0: buf = src.read(remainder) + dst.write(buf) if len(buf) < remainder: raise OSError("end of file reached") - dst.write(buf) return def filemode(mode): @@ -745,7 +755,7 @@ __slots__ = ("name", "mode", "uid", "gid", "size", "mtime", "chksum", "type", "linkname", "uname", "gname", - "devmajor", "devminor", + "devmajor", "devminor", "volume_offset", "offset", "offset_data", "pax_headers", "sparse", "tarfile", "_sparse_structs", "_link_target") @@ -772,6 +782,8 @@ self.sparse = None # sparse member information self.pax_headers = {} # pax header information + self.volume_offset = 0 # the file's data corresponds with the data + # starting at this position # In pax headers the "name" and "linkname" field are called # "path" and "linkpath". @@ -794,19 +806,21 @@ """Return the TarInfo's attributes as a dictionary. 
""" info = { - "name": self.name, - "mode": self.mode & 0o7777, - "uid": self.uid, - "gid": self.gid, - "size": self.size, - "mtime": self.mtime, - "chksum": self.chksum, - "type": self.type, - "linkname": self.linkname, - "uname": self.uname, - "gname": self.gname, - "devmajor": self.devmajor, - "devminor": self.devminor + "name": self.name, + "mode": self.mode & 0o7777, + "uid": self.uid, + "gid": self.gid, + "size": self.size, + "mtime": self.mtime, + "chksum": self.chksum, + "type": self.type, + "linkname": self.linkname, + "uname": self.uname, + "gname": self.gname, + "devmajor": self.devmajor, + "devminor": self.devminor, + "offset_data": self.offset_data, + "volume_offset": self.volume_offset } if info["type"] == DIRTYPE and not info["name"].endswith("/"): @@ -846,6 +860,16 @@ """ info["magic"] = GNU_MAGIC + if self.ismultivol(): + prefix = [ + itn(info.get("atime", 0), 12, GNU_FORMAT), + itn(info.get("ctime", 0), 12, GNU_FORMAT), + itn(self.volume_offset, 12, GNU_FORMAT), + itn(0, 119, GNU_FORMAT), # stuff unused in this tar implementation, set to zero + ] + info['prefix'] = b"".join(prefix) + info['size'] = info['size'] - self.volume_offset + buf = b"" if len(info["linkname"]) > LENGTH_LINK: buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors) @@ -862,6 +886,8 @@ """ info["magic"] = POSIX_MAGIC pax_headers = self.pax_headers.copy() + if self.ismultivol(): + info['size'] = info['size'] - self.volume_offset # Test string fields for values that exceed the field length or cannot # be represented in ASCII encoding. @@ -945,7 +971,7 @@ stn(info.get("gname", ""), 32, encoding, errors), itn(info.get("devmajor", 0), 8, format), itn(info.get("devminor", 0), 8, format), - stn(info.get("prefix", ""), 155, encoding, errors) + sbtn(info.get("prefix", ""), 155, encoding, errors) ] buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts)) @@ -1092,6 +1118,8 @@ # Reconstruct a ustar longname. 
if prefix and obj.type not in GNU_TYPES: obj.name = prefix + "/" + obj.name + else: + obj.offset_data = nti(buf[369:381]) return obj @classmethod @@ -1134,7 +1162,7 @@ """ self.offset_data = tarfile.fileobj.tell() offset = self.offset_data - if self.isreg() or self.type not in SUPPORTED_TYPES: + if self.isreg() or self.ismultivol() or self.type not in SUPPORTED_TYPES: # Skip the following data blocks. offset += self._block(self.size) tarfile.offset = offset @@ -1295,6 +1323,18 @@ offset += next._block(next.size) tarfile.offset = offset + if next is not None: + if "GNU.volume.filename" in pax_headers: + if pax_headers["GNU.volume.filename"] == next.name: + if "GNU.volume.size" in pax_headers: + next.size = int(pax_headers["GNU.volume.size"]) + if "GNU.volume.offset" in pax_headers: + next.volume_offset = int(pax_headers["GNU.volume.offset"]) + + for key in pax_headers.keys(): + if key.startswith("GNU.volume"): + del tarfile.pax_headers[key] + return next def _proc_gnusparse_00(self, next, pax_headers, buf): @@ -1390,6 +1430,9 @@ return self.sparse is not None def isdev(self): return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE) + def ismultivol(self): + return self.type == GNUTYPE_MULTIVOL or self.volume_offset > 0 or\ + "GNU.volume.offset" in self.pax_headers # class TarInfo class TarFile(object): @@ -1404,6 +1447,15 @@ ignore_zeros = False # If true, skips empty or invalid blocks and # continues processing. + max_volume_size = None # If different from None, establishes maximum + # size of tar volumes + + new_volume_handler = None # function handler to be executed when + # a new volume is needed + + volume_number = 0 # current volume number, used for multi volume + # support + errorlevel = 1 # If 0, fatal errors only appear in debug # messages (if debug >= 0). If > 0, errors # are passed to the caller as exceptions. 
@@ -1420,7 +1472,8 @@ def __init__(self, name=None, mode="r", fileobj=None, format=None, tarinfo=None, dereference=None, ignore_zeros=None, encoding=None, - errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None): + errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None, + max_volume_size=None, new_volume_handler=None): """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to read from an existing archive, 'a' to append data to an existing file or 'w' to create a new file overwriting an existing one. `mode' @@ -1448,6 +1501,7 @@ self._mode = fileobj.mode self._extfileobj = True self.name = os.path.abspath(name) if name else None + self.base_name = self.name = os.path.abspath(name) if name else None self.fileobj = fileobj # Init attributes. @@ -1474,6 +1528,13 @@ self.errorlevel = errorlevel # Init datastructures. + if max_volume_size and max_volume_size < 3*BLOCKSIZE: + raise ValueError("max_volume_size needs to be at least %d" % (3*BLOCKSIZE)) + if max_volume_size and not callable(new_volume_handler): + raise ValueError("new_volume_handler needs to be set and be callable for multivolume support") + + self.max_volume_size = max_volume_size + self.new_volume_handler = new_volume_handler self.closed = False self.members = [] # list of members as TarInfo objects self._loaded = False # flag if all members have been read @@ -1937,6 +1998,17 @@ else: self.addfile(tarinfo) + def _size_left(self): + """Calculates size left in a volume with a maximum volume size. + Assumes self.max_volume_size is set. + """ + size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset + # limit size left to a discrete number of blocks, because we won't + # write only half a block when writing the end of a volume + # and filling with zeros + blocks, remainder = divmod(size_left, BLOCKSIZE) + return blocks*BLOCKSIZE + def addfile(self, tarinfo, fileobj=None): """Add the TarInfo object `tarinfo' to the archive. 
If `fileobj' is given, tarinfo.size bytes are read from it and added to the archive. @@ -1952,17 +2024,148 @@ self.fileobj.write(buf) self.offset += len(buf) - # If there's data to follow, append it. - if fileobj is not None: - copyfileobj(fileobj, self.fileobj, tarinfo.size) - blocks, remainder = divmod(tarinfo.size, BLOCKSIZE) + + # If there's no data to follow, finish + if not fileobj: + self.members.append(tarinfo) + return + + # handle multivolume support + if self.max_volume_size: + size_left = self._size_left() + # we only split volumes in the middle of a file, that means we have + # to write at least one block + if size_left < BLOCKSIZE: + size_left = BLOCKSIZE + max_size_to_write = min(size_left, tarinfo.size - tarinfo.volume_offset) + else: + size_left = max_size_to_write = tarinfo.size + + # iterate, one iteration per volume (usually only one volume) + while tarinfo.volume_offset < tarinfo.size: + copyfileobj(fileobj, self.fileobj, max_size_to_write) + blocks, remainder = divmod(max_size_to_write, BLOCKSIZE) + + # only fill with zeros the remainder in a block if it's not + # going to be a file split in multiple volumes. + # if file is going to be split in multiple volumes, having a + # remainder means that there's no more space left for a block, so + # we already need to create a new volume. 
if remainder > 0: self.fileobj.write(NUL * (BLOCKSIZE - remainder)) blocks += 1 + # we already assured previously that if we are doing multivolume, + # there's not going to be a remainder + if self.max_volume_size and max_size_to_write == size_left: + assert remainder == 0 + self.offset += blocks * BLOCKSIZE + size_left -= blocks * BLOCKSIZE + tarinfo.volume_offset += blocks * BLOCKSIZE + + # check if creating a new volume is needed + if tarinfo.volume_offset < tarinfo.size and\ + self.max_volume_size and size_left < 3*BLOCKSIZE: + + tarinfo.type = GNUTYPE_MULTIVOL + + if not self.new_volume_handler or\ + not callable(self.new_volume_handler): + raise Exception("We need to create a new volume and you " + "didn't supply a new_volume_handler") + + # the new volume handler should do everything needed to + # start working in a new volume. usually, the handler calls + # to self.open_volume + self.volume_number += 1 + + # set to be used by open_volume, because in the case of a PAX + # tar it needs to write information about the volume and offset + # in the global header + self.volume_tarinfo = tarinfo + self.new_volume_handler(self, self.base_name, self.volume_number) + + self.volume_tarinfo = None + + # write new volume header + buf = tarinfo.tobuf(self.format, self.encoding, self.errors) + self.offset += len(buf) + self.fileobj.write(buf) + size_left = self._size_left() + max_size_to_write = min(size_left, tarinfo.size - tarinfo.volume_offset) self.members.append(tarinfo) + def open_volume(self, name="", fileobj=None): + ''' + Called by the user to change this tar file to point to a new volume. + ''' + # open the file using either fileobj or name + if not fileobj: + if self.mode == "a" and not os.path.exists(name): + # Create nonexistent files in append mode. 
+ self.mode = "w" + self._mode = "wb" + fileobj = bltn_open(name, self._mode) + self._extfileobj = False + else: + if name is None and hasattr(fileobj, "name"): + name = fileobj.name + if hasattr(fileobj, "mode"): + self._mode = fileobj.mode + self._extfileobj = True + self.name = os.path.abspath(name) if name else None + self.fileobj = fileobj + + # init data structures + self.closed = False + self.members = [] # list of members as TarInfo objects + self._loaded = False # flag if all members have been read + self.offset = self.fileobj.tell() + # current position in the archive file + self.inodes = {} # dictionary caching the inodes of + # archive members already added + + try: + if self.mode == "r": + self.firstmember = None + self.firstmember = self.next() + + if self.mode == "a": + # Move to the end of the archive, + # before the first empty block. + while True: + self.fileobj.seek(self.offset) + try: + tarinfo = self.tarinfo.fromtarfile(self) + self.members.append(tarinfo) + except EOFHeaderError: + self.fileobj.seek(self.offset) + break + except HeaderError as e: + raise ReadError(str(e)) + + if self.mode in "aw": + self._loaded = True + + if self.format == PAX_FORMAT: + volume_info = { + "GNU.volume.filename": str(self.volume_tarinfo.name), + "GNU.volume.size": str(self.volume_tarinfo.size - self.volume_tarinfo.volume_offset), + "GNU.volume.offset": str(self.volume_tarinfo.volume_offset), + } + + self.pax_headers.update(volume_info) + + buf = self.tarinfo.create_pax_global_header(volume_info.copy()) + self.fileobj.write(buf) + self.offset += len(buf) + except: + if not self._extfileobj: + self.fileobj.close() + self.closed = True + raise + def extractall(self, path=".", members=None): """Extract all members from the archive to the current working directory and set owner, modification time and permissions on @@ -1976,6 +2179,11 @@ members = self for tarinfo in members: + # exclude members marked as multivol, because they should have been + # already processed 
+ if self.volume_number > 0 and tarinfo.ismultivol(): + continue + if tarinfo.isdir(): # Extract directories with a safe mode. directories.append(tarinfo) @@ -2049,7 +2257,8 @@ else: tarinfo = member - if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES: + if tarinfo.isreg() or tarinfo.ismultivol() or\ + tarinfo.type not in SUPPORTED_TYPES: # Members with unknown types are treated as regular files. return self.fileobject(self, tarinfo) @@ -2130,15 +2339,43 @@ """ source = self.fileobj source.seek(tarinfo.offset_data) - with bltn_open(targetpath, "wb") as target: - if tarinfo.sparse is not None: + target = bltn_open(targetpath, "wb") + + if tarinfo.sparse is not None: + try: for offset, size in tarinfo.sparse: target.seek(offset) copyfileobj(source, target, size) - else: + target.seek(tarinfo.size) + target.truncate() + finally: + target.close() + return + + iterate = True + while iterate: + iterate = False + try: copyfileobj(source, target, tarinfo.size) - target.seek(tarinfo.size) - target.truncate() + except IOError: + source.close() + # only if we are extracting a multivolume this can be treated + if not self.new_volume_handler: + target.close() + raise Exception("We need to read a new volume and you" + " didn't supply a new_volume_handler") + + # the new volume handler should do everything needed to + # start working in a new volume. 
usually, the handler calls + # to self.open_volume + self.volume_number += 1 + self.new_volume_handler(self, self.base_name, self.volume_number) + tarinfo = self.firstmember + source = self.fileobj + iterate = True + + target.close() + def makeunknown(self, tarinfo, targetpath): """Make a file from a TarInfo object with an unknown type diff -r b0668185dc15 -r 23bc512c19b7 Lib/test/test_tarfile.py --- a/Lib/test/test_tarfile.py Tue Jun 25 23:13:47 2013 +0200 +++ b/Lib/test/test_tarfile.py Thu Jun 27 14:07:53 2013 +0200 @@ -2,6 +2,7 @@ import os import io import shutil +import string from hashlib import md5 import unittest @@ -32,6 +33,7 @@ bz2name = os.path.join(TEMPDIR, "testtar.tar.bz2") xzname = os.path.join(TEMPDIR, "testtar.tar.xz") tmpname = os.path.join(TEMPDIR, "tmp.tar") +TEMPSUBDIR = os.path.join(TEMPDIR, "subdir") md5_regtype = "65f477c818ad9e15f7feab0c6d37742f" md5_sparse = "a54fbc4ca4f4399a90e1b27164012fc6" @@ -242,7 +244,7 @@ self.assertRaises(tarfile.ReadError, tarfile.open, tmpname) def test_ignore_zeros(self): - # Test TarFile's ignore_zeros option. + # Test tarfile.TarFile's ignore_zeros option. for char in (b'\0', b'a'): # Test if EOFHeaderError ('\0') and InvalidHeaderError ('a') # are ignored correctly. @@ -414,7 +416,7 @@ shutil.rmtree(DIR) def test_init_close_fobj(self): - # Issue #7341: Close the internal file object in the TarFile + # Issue #7341: Close the internal file object in the tarfile.TarFile # constructor in case of an error. For the test we rely on # the fact that opening an empty file raises a ReadError. empty = os.path.join(TEMPDIR, "empty") @@ -1733,7 +1735,7 @@ def test_closed(self): # The __enter__() method is supposed to raise OSError - # if the TarFile object is already closed. + # if the tarfile.TarFile object is already closed. tar = tarfile.open(tarname) tar.close() with self.assertRaises(OSError): @@ -1763,7 +1765,7 @@ def test_eof(self): # __exit__() must write end-of-archive blocks, i.e. 
call - # TarFile.close() if there was no error. + # tarfile.TarFile.close() if there was no error. with tarfile.open(tmpname, "w"): pass self.assertNotEqual(os.path.getsize(tmpname), 0, @@ -1846,6 +1848,445 @@ def test_partial_input_bz2(self): self._test_partial_input("r:bz2") +def new_volume_handler(tarobj, base_name, volume_number): + ''' + Handles the new volumes + ''' + volume_path = "%s.%d" % (base_name, volume_number) + tarobj.open_volume(volume_path) + +class MultivolGnuFormatTest(unittest.TestCase): + """ + Test multivolume support in tarfile. Tar Format is specified at class level. + """ + + # used as the --format argument to tar command on tar file creation + tar_command_format = "gnu" + + # used as Tarfile.open format option argument for tar file creation + tarfile_format = tarfile.GNU_FORMAT + + # overhead size used to calculate the exact maximum size of a tar file with + # no extra volume that stores only one file. In case of GNU format this is + # the size of three blocks: + # * 1 block used to store the header information of the stored file + # * 2 blocks used to mark the end of the tar file + tarfile_overhead = 3*tarfile.BLOCKSIZE + + # overhead size used to calculate the exact maximum size of a tar volume, + # corresponding with a multivolume tar file storing a single file. In the + # case of GNU format this is the same as tarfile_overhead. + tarvol_overhead = 3*tarfile.BLOCKSIZE + + def tearDown(self): + ''' + Remove temporal files created by unit tests + ''' + os.chdir(self.prevcwd) + if os.path.exists(TEMPSUBDIR): + shutil.rmtree(TEMPSUBDIR) + + def setUp(self): + ''' + Create empty temp dir and set as current directory + ''' + support.unlink(TEMPSUBDIR) + os.makedirs(TEMPSUBDIR) + + self.prevcwd = os.getcwd() + os.chdir(TEMPSUBDIR) + + def create_file(self, path, length): + ''' + Creates a file with some gibberish inside, returning the md5sum of that + file. File path and length are specified as function arguments. 
+ ''' + f = open(path, 'w') + s = string.ascii_lowercase + string.digits + "\n" + if len(s) < length: + s += s*int(length/len(s)) + data = s[:length] + f.write(data) + f.close() + return self.md5sum(path) + + def md5sum(self, filename): + ''' + Returns the md5sum of a file specified by its filename/path + ''' + md5sum = md5() + with open(filename,'rb') as f: + for chunk in iter(lambda: f.read(128*md5sum.block_size), b''): + md5sum.update(chunk) + return md5sum.hexdigest() + + @unittest.skipIf(shutil.which("tar") is None, "required command line 'tar' is missing") + def test_no_volume(self): + """ + Create a tar file with only one file inside and no extra volumes + """ + + # create the content of the file to compress and hash it + hash = self.create_file("big", 50000) + + # create the tar file with volumes + tarobj = tarfile.TarFile.open("sample.tar", mode="w", format=self.tarfile_format) + tarobj.add("big") + tarobj.close() + + # check that the tar volumes were correctly created + assert os.path.exists("sample.tar") + assert not os.path.exists("sample.tar.1") + + os.unlink("big") + assert not os.path.exists("big") + + # extract and check + os.system("tar xfM sample.tar") + assert os.path.exists("big") + assert hash == self.md5sum("big") + + @unittest.skipIf(shutil.which("tar") is None, "required command line 'tar' is missing") + def test_volume_creation1(self): + """ + Create a tar file with two volumes, only one file inside + """ + + # create the content of the file to compress and hash it + hash = self.create_file("big", 50000) + + # create the tar file with volumes + tarobj = tarfile.TarFile.open("sample.tar", + mode="w", + format=self.tarfile_format, + max_volume_size=30000, + new_volume_handler=new_volume_handler) + tarobj.add("big") + tarobj.close() + + # check that the tar volumes were correctly created + assert os.path.exists("sample.tar") + assert os.path.exists("sample.tar.1") + assert not os.path.exists("sample.tar.2") + + os.unlink("big") + assert not 
os.path.exists("big") + + # extract with normal tar and check output + os.system("tar xfM sample.tar --file=sample.tar.1") + assert os.path.exists("big") + assert hash == self.md5sum("big") + + @unittest.skipIf(shutil.which("tar") is None, "required command line 'tar' is missing") + def test_volume_creation2(self): + """ + Create a tar file with 2 extra volumes, only one file inside + """ + + # create the content of the file to compress and hash it + hash = self.create_file("big", 50000) + + # create the tar file with volumes + tarobj = tarfile.TarFile.open("sample.tar", + mode="w", + format=self.tarfile_format, + max_volume_size=20000, + new_volume_handler=new_volume_handler) + tarobj.add("big") + tarobj.close() + + # check that the tar volumes were correctly created + assert os.path.exists("sample.tar") + assert os.path.exists("sample.tar.1") + assert os.path.exists("sample.tar.2") + assert not os.path.exists("sample.tar.3") + + os.unlink("big") + assert not os.path.exists("big") + + # extract with normal tar and check output + os.system("tar xfM sample.tar --file=sample.tar.1 --file=sample.tar.2") + assert os.path.exists("big") + assert hash == self.md5sum("big") + + @unittest.skipIf(shutil.which("tar") is None, "required command line 'tar' is missing") + def test_multivol_multifiles(self): + ''' + Create a tar file with two volumes and three files inside + ''' + + # create sample data + hash = dict() + hash["big"] = self.create_file("big", 50000) + hash["small"] = self.create_file("small", 100) + hash["small2"] = self.create_file("small2", 354) + + # create the tar file with volumes + tarobj = tarfile.TarFile.open("sample.tar", + mode="w", + format=self.tarfile_format, + max_volume_size=20000, + new_volume_handler=new_volume_handler) + tarobj.add("big") + tarobj.add("small") + tarobj.add("small2") + tarobj.close() + + # check that the tar volumes were correctly created + assert os.path.exists("sample.tar") + assert os.path.exists("sample.tar.1") + assert 
os.path.exists("sample.tar.2") + assert not os.path.exists("sample.tar.3") + + os.unlink("big") + os.unlink("small") + os.unlink("small2") + + # extract with normal tar and check output + os.system("tar xfM sample.tar --file=sample.tar.1 --file=sample.tar.2") + for key, value in hash.items(): + assert os.path.exists(key) + assert value == self.md5sum(key) + + def test_volume_extract1(self): + ''' + Create a tar file with multiple volumes and one file and extract it + ''' + # create the content of the file to compress and hash it + hash = self.create_file("big", 5*1024*1024) + + # create the tar file with volumes + tarobj = tarfile.TarFile.open("sample.tar", + mode="w", + format=self.tarfile_format, + max_volume_size=3*1024*1024, + new_volume_handler=new_volume_handler) + tarobj.add("big") + tarobj.close() + + # check that the tar volumes were correctly created + assert os.path.exists("sample.tar") + assert os.path.exists("sample.tar.1") + assert not os.path.exists("sample.tar.2") + + os.unlink("big") + assert not os.path.exists("big") + + # extract and check output + tarobj = tarfile.TarFile.open("sample.tar", + mode="r", + new_volume_handler=new_volume_handler) + tarobj.extractall() + tarobj.close() + assert os.path.exists("big") + assert hash == self.md5sum("big") + + @unittest.skipIf(shutil.which("tar") is None, "required command line 'tar' is missing") + def test_volume_extract2(self): + ''' + Create a multivolume tar file with gnu tar command, extract it with + tarfile library + ''' + # create the content of the file to compress and hash it + hash = self.create_file("big", 5*1024*1024) + + # create the tar file with volumes + os.system("tar cM --format=%s -L 3M big --file=sample.tar "\ + "--file=sample.tar.1" % self.tar_command_format) + + # check that the tar volumes were correctly created + assert os.path.exists("sample.tar") + assert os.path.exists("sample.tar.1") + assert not os.path.exists("sample.tar.2") + + os.unlink("big") + assert not 
os.path.exists("big") + + # extract and check output + tarobj = tarfile.TarFile.open("sample.tar", + mode="r", + new_volume_handler=new_volume_handler) + tarobj.extractall() + tarobj.close() + assert os.path.exists("big") + assert hash == self.md5sum("big") + + def test_multivol_multifile_extract(self): + ''' + create a multivolume tar file with multiple files and extracts it + ''' + + # create sample data + hash = dict() + hash["big"] = self.create_file("big", 50000) + hash["small"] = self.create_file("small", 100) + hash["small2"] = self.create_file("small2", 354) + + # create the tar file with volumes + tarobj = tarfile.TarFile.open("sample.tar", + mode="w", + format=self.tarfile_format, + max_volume_size=20000, + new_volume_handler=new_volume_handler) + tarobj.add("big") + tarobj.add("small") + tarobj.add("small2") + tarobj.close() + + # check that the tar volumes were correctly created + assert os.path.exists("sample.tar") + assert os.path.exists("sample.tar.1") + assert os.path.exists("sample.tar.2") + assert not os.path.exists("sample.tar.3") + + os.unlink("big") + os.unlink("small") + os.unlink("small2") + + # extract and check output + tarobj = tarfile.TarFile.open("sample.tar", + mode="r", + new_volume_handler=new_volume_handler) + tarobj.extractall() + tarobj.close() + + for key, value in hash.items(): + assert os.path.exists(key) + assert value == self.md5sum(key) + + def test_multiple_files_extract(self): + ''' + creates a simple tar file with no volumes and with multiple files + inside and extracts it + ''' + + # create sample data + hash = dict() + hash["big"] = self.create_file("big", 50000) + hash["small"] = self.create_file("small", 100) + hash["small2"] = self.create_file("small2", 354) + + # create the tar file with volumes + tarobj = tarfile.TarFile.open("sample.tar", + format=self.tarfile_format, + mode="w") + tarobj.add("big") + tarobj.add("small") + tarobj.add("small2") + tarobj.close() + + # check that the tar volumes were correctly created 
+ assert os.path.exists("sample.tar") + assert not os.path.exists("sample.tar.1") + + os.unlink("big") + os.unlink("small") + os.unlink("small2") + + # extract and check output + tarobj = tarfile.TarFile.open("sample.tar", + mode="r", + new_volume_handler=new_volume_handler) + tarobj.extractall() + tarobj.close() + + for key, value in hash.items(): + assert os.path.exists(key) + assert value == self.md5sum(key) + + def test_corner_case_split_size1(self): + ''' + Creates a tar file with a single file inside that contains the maximum + size allowed in one volume. + ''' + hash = self.create_file("big", 5*1024*1024) + + # create the tar file with volumes + tarobj = tarfile.TarFile.open("sample.tar", + mode="w", + format=self.tarfile_format, + # see tarfile_overhead description for details + max_volume_size=5*1024*1024 + self.tarfile_overhead, + new_volume_handler=new_volume_handler) + tarobj.add("big") + tarobj.close() + + # check that the tar volumes were correctly created + assert os.path.exists("sample.tar") + assert not os.path.exists("sample.tar.1") + + os.unlink("big") + assert not os.path.exists("big") + + # extract and check output + tarobj = tarfile.TarFile.open("sample.tar", + mode="r", + new_volume_handler=new_volume_handler) + tarobj.extractall() + tarobj.close() + assert os.path.exists("big") + assert hash == self.md5sum("big") + + + def test_corner_case_split_size2(self): + ''' + Creates a tar file with a single file inside that contains the maximum + size allowed in one volume. 
+ ''' + hash = self.create_file("big", 4*1024*1024) + + # create the tar file with volumes + tarobj = tarfile.TarFile.open("sample.tar", + mode="w", + format=self.tarfile_format, + # see tarvol_overhead description for details + max_volume_size=2*1024*1024 + self.tarvol_overhead, + new_volume_handler=new_volume_handler) + tarobj.add("big") + tarobj.close() + + # check that the tar volumes were correctly created + assert os.path.exists("sample.tar") + assert os.path.exists("sample.tar.1") + assert not os.path.exists("sample.tar.2") + + os.unlink("big") + assert not os.path.exists("big") + + # extract and check output + tarobj = tarfile.TarFile.open("sample.tar", + mode="r", + new_volume_handler=new_volume_handler) + tarobj.extractall() + tarobj.close() + assert os.path.exists("big") + assert hash == self.md5sum("big") + +class MultivolPaxFormatTest(MultivolGnuFormatTest): + """ + Test multivolume support in tarfile with PAX format + """ + + tar_command_format = "pax" + + tarfile_format = tarfile.PAX_FORMAT + + # overhead size used to calculate the exact maximum size of a tar file with + # no extra volume that stores only one file. In case of PAX format this is + # the size of five blocks: + # * 1 block used to store the header information of the stored file + # * 1 block used to store the header information of the pax header + # * 1 block used to store the pax header + # * 2 blocks used to mark the end of the tar file + tarfile_overhead = 5*tarfile.BLOCKSIZE + + + # overhead size used to calculate the exact maximum size of a tar volume, + # corresponding with a multivolume tar file storing a single file. In the + # case of Pax format, it's the same as tarfile_overhead plus a block for + # the global header + tarvol_overhead = 6*tarfile.BLOCKSIZE + def setUpModule(): support.unlink(TEMPDIR)