#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Time-stamp: <15-Aug-2013 11:58:31 PDT by rich@noir.com>

# This code is in the public domain.

# Written by K Richard Pixley, in 2013 in order to
# demonstrate pathological behavior of the tarfile library.

"""
A simple demonstration script.

Run it and it will tar up your /usr/lib or /usr/include into memory
using an outcall and then unpack in several different ways.

I think correct behavior should be that using tarfile alone should be
comparable, (or faster), than using gzip then tarfile.  Worst case,
tarfile should be mimicking what I'm doing here and thus should be the
same speed.

What I'm actually seeing is that tarfile behavior is much worse,
(~2x), for unsorted names and more like ~60x for sorted names.

Works with python 2.[67], 3.[0123].  (Have not tried any others).

I see the degradation on all of the linux boxes I've tried:

* ubuntu-13.04, (raring), 64-bit
* rhel-5.4 64-bit
* rhel-5.7 64-bit
* suse-11 64-bit

I see some degradation on MacOsX-10.8.4 but it's in the acceptable
range, more like 2x than 60x.  That is still suspicious, but not as
problematic.

Similar degradation on debian-7.1 armv6.

I see no degradation on ubuntu-13.10 (saucy) on an armv7.
"""

from __future__ import print_function, unicode_literals

__docformat__ = 'restructuredtext en'

import platform

v3 = int(platform.python_version_tuple()[0]) == 3

import io
import contextlib
import datetime
import gzip
import os
import subprocess
import tarfile


@contextlib.contextmanager
def opengzip(fileobj):
    """Context manager yielding a ``gzip.GzipFile`` that reads *fileobj*.

    The GzipFile is closed when the ``with`` block exits, including when
    the block raises.  *fileobj* itself is left for the caller to manage.

    :param fileobj: binary file-like object holding gzip-compressed data.
    """
    gz = gzip.GzipFile(None, 'r', 9, fileobj=fileobj)
    try:
        yield gz
    finally:
        # Close even if the body raised, so the reader is never leaked.
        gz.close()


@contextlib.contextmanager
def opentar(fileobj):
    """Context manager yielding a ``tarfile.TarFile`` that reads *fileobj*.

    The archive is opened with mode ``'r'`` and closed when the ``with``
    block exits, including when the block raises.

    :param fileobj: binary file-like object holding a tar archive.
    """
    t = tarfile.open(fileobj=fileobj, mode='r')
    try:
        yield t
    finally:
        # Close even if the body raised, so the archive is never leaked.
        t.close()


def extract(archive, gzipping=True, sorting=False):
    """Read every regular member of *archive* by name and report the timing.

    Members are deliberately looked up one name at a time with
    ``getmember``; that access pattern is what this script measures, so it
    must not be replaced by direct iteration over the members.

    :param archive: bytes of a gzip-compressed tar archive.
    :param gzipping: when true, decompress with ``gzip`` first and hand the
        result to ``tarfile``; when false, hand *archive* to ``tarfile``
        unchanged.
    :param sorting: when true, visit the member names in sorted order
        instead of the order ``getnames`` returns them.
    :returns: total number of bytes read from regular-file members.
    :raises KeyError: propagated from ``getmember`` if a name cannot be
        found in the archive.
    """
    print('Extract gzip: {0}, name sorting: {1}... '.format(gzipping, sorting))

    start_time = datetime.datetime.now()

    if gzipping:
        with opengzip(io.BytesIO(archive)) as g:
            tarinput = g.read()
    else:
        tarinput = archive

    bytes_read = 0
    with opentar(io.BytesIO(tarinput)) as t:
        names = t.getnames()
        if sorting:
            names = sorted(names)

        for name in names:
            # getmember raises KeyError for an unknown name, so no separate
            # "not found" check is needed here.
            minfo = t.getmember(name)

            # 2.6 and 3.1 can't read links but it's not very important.
            if minfo.isreg():
                fileobj = t.extractfile(minfo)
                if fileobj is not None:
                    bytes_read += len(fileobj.read())

    elapsed = datetime.datetime.now() - start_time
    print('member extraction {0} gzip by {1} names read {2} bytes in {3}'.format(
        'with' if gzipping else 'without',
        'sorted' if sorting else 'unsorted',
        bytes_read,
        elapsed))
    return bytes_read


def main(target='/usr/include'):
    """Archive *target* with the external ``tar`` and time the extractions.

    The archive is built in memory by an outcall to ``tar``, listed once
    by another outcall as a baseline, and then passed to :func:`extract`
    for each combination of ``gzipping`` and ``sorting``.

    :param target: directory to archive ('/usr/lib' is another candidate).
    """
    # create archive
    print('Tar\'ing up {0} into memory via outcall...'.format(target))
    start_time = datetime.datetime.now()

    with open(os.devnull, 'w') as output:
        # An argv list keeps a target containing whitespace intact.  The
        # exit status is not checked and stderr is discarded, as before.
        p = subprocess.Popen(['tar', 'cfz', '-', target],
                             stdout=subprocess.PIPE, stderr=output)
        archive, _ = p.communicate()  # communicate() also waits for exit

    archive_time = datetime.datetime.now() - start_time

    # this will be highly file system dependent
    print('archive of size {0} collected in {1}'.format(len(archive), archive_time))

    # extract via outcall
    print('Extracting via outcall...')
    start_time = datetime.datetime.now()

    with open(os.devnull, 'w') as output:
        p = subprocess.Popen(['tar', 'tvvfz', '-'],
                             stdin=subprocess.PIPE, stdout=output,
                             stderr=subprocess.STDOUT)
        p.communicate(archive)

    outcall_extraction_time = datetime.datetime.now() - start_time
    print('Outcall extraction in {0}.'.format(outcall_extraction_time))

    for gzipping in [True, False]:
        for sorting in [False, True]:
            extract(archive, gzipping=gzipping, sorting=sorting)


if __name__ == '__main__':
    main()