#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Time-stamp: <15-Aug-2013 11:58:31 PDT by rich@noir.com>

# This code is in the public domain.

# Written by K Richard Pixley, <rich@noir.com> in 2013 in order to
# demonstrate pathological behavior of the tarfile library.

"""
A simple demonstration script.  Run it and it will tar up your /usr/lib or
/usr/include into memory using an outcall and then unpack in several different
ways.

I think the correct behavior is that using tarfile alone should be comparable
to (or faster than) using gzip and then tarfile.  Worst case, tarfile should
be mimicking what I'm doing here and thus should be the same speed.

What I'm actually seeing is that tarfile alone is much slower: about 2x for
unsorted names and more like 60x for sorted names.

Works with python 2.[67], 3.[0123].  (Have not tried any others).

I see the degradation on all of the linux boxes I've tried:
* ubuntu-13.04, (raring), 64-bit
* rhel-5.4 64-bit
* rhel-5.7 64-bit
* suse-11 64-bit

I see some degradation on MacOsX-10.8.4 but it's in the acceptable range, more
like 2x than 60x.  That is still suspicious, but not as problematic.  Similar
degradation on debian-7.1 armv6.

I see no degradation on ubuntu-13.10 (saucy) on an armv7.
"""

from __future__ import print_function, unicode_literals

__docformat__ = 'restructuredtext en'

import platform

v3 = int(platform.python_version_tuple()[0]) == 3

import io
import contextlib
import datetime
import gzip
import os
import subprocess
import tarfile

@contextlib.contextmanager
def opengzip(fileobj):
    """
    Context manager yielding a read-mode ``gzip.GzipFile`` over *fileobj*.

    :param fileobj: a binary file-like object holding gzip-compressed data.
    :returns: yields the opened ``GzipFile``.

    The ``GzipFile`` is closed when the ``with`` block exits, including when
    the block exits through an exception.
    """
    gz = gzip.GzipFile(None, 'r', 9, fileobj=fileobj)
    try:
        yield gz
    finally:
        # Without try/finally an exception raised inside the ``with`` body
        # would be re-raised at the yield and skip the close.
        gz.close()

@contextlib.contextmanager
def opentar(fileobj):
    """
    Context manager yielding a ``tarfile.TarFile`` opened for reading.

    :param fileobj: a binary file-like object holding the archive; it is
        opened with ``mode='r'``.
    :returns: yields the opened ``TarFile``.

    The ``TarFile`` is closed when the ``with`` block exits, including when
    the block exits through an exception.
    """
    t = tarfile.open(fileobj=fileobj, mode='r')
    try:
        yield t
    finally:
        # Without try/finally an exception raised inside the ``with`` body
        # would be re-raised at the yield and skip the close.
        t.close()

def extract(archive, gzipping=True, sorting=False):
    """
    Read every regular file out of an in-memory archive and report the time.

    :param archive: the archive contents as a byte string.
    :param gzipping: when true, *archive* is first decompressed in full via
        ``opengzip`` and the result is handed to ``opentar``; when false,
        *archive* is handed to ``opentar`` unchanged.
    :param sorting: when true, members are visited in sorted name order
        rather than in the order ``getnames()`` returns them.
    :returns: ``None``; a header line and a summary line (byte count and
        elapsed time) are printed.

    The elapsed time covers the optional gunzip step as well as the member
    reads.
    """
    print('Extract gzip: {0}, name sorting: {1}... '.format(gzipping, sorting))

    start_time = datetime.datetime.now()

    if gzipping:
        with opengzip(io.BytesIO(archive)) as g:
            tarinput = g.read()
    else:
        tarinput = archive

    bytes_read = 0
    with opentar(io.BytesIO(tarinput)) as t:
        names = t.getnames()
        if sorting:
            names = sorted(names)

        for name in names:
            # Members are looked up by name, in the chosen order, because
            # that is the access pattern being timed.  getmember() raises
            # KeyError for an unknown name rather than returning a false
            # value, so no "not found" check is needed here.
            minfo = t.getmember(name)

            # 2.6 and 3.1 can't read links but it's not very important.
            if not minfo.isreg():
                continue

            fileobj = t.extractfile(minfo)
            if fileobj:
                bytes_read += len(fileobj.read())

    elapsed = datetime.datetime.now() - start_time
    print('member extraction {0} gzip by {1} names read {2} bytes in {3}'.format(
            'with' if gzipping else 'without',
            'sorted' if sorting else 'unsorted',
            bytes_read,
            elapsed))


if __name__ == '__main__':
    # Tree to archive; swap in the commented-out line for the alternative.
    #target = '/usr/lib'
    target = '/usr/include'

    # Phase 1: have the external tar write a gzip'd archive to stdout and
    # capture it in memory.  tar's stderr is discarded.
    print('Tar\'ing up {0} into memory via outcall...'.format(target))

    start_time = datetime.datetime.now()
    with open('/dev/null', 'w') as devnull:
        tar_create = subprocess.Popen(
            'tar cfz - {0}'.format(target).split(),
            stdout=subprocess.PIPE,
            stderr=devnull)
        archive, _ = tar_create.communicate()
        tar_create.wait()
    archive_time = datetime.datetime.now() - start_time

    # this will be highly file system dependent
    print('archive of size {0} collected in {1}'.format(len(archive), archive_time))

    # Phase 2: feed the archive back to the external tar ('tvvfz') on stdin
    # as a baseline, discarding everything it prints.
    print('Extracting via outcall...')

    start_time = datetime.datetime.now()
    with open('/dev/null', 'w') as devnull:
        tar_list = subprocess.Popen(
            'tar tvvfz -'.split(),
            stdin=subprocess.PIPE,
            stdout=devnull,
            stderr=subprocess.STDOUT)
        tar_list.communicate(archive)
        tar_list.wait()
    outcall_extraction_time = datetime.datetime.now() - start_time
    print('Outcall extraction in {0}.'.format(outcall_extraction_time))

    # Phase 3: time the tarfile library for each combination of
    # pre-gunzipping and name sorting.
    for gzipping, sorting in [(True, False), (True, True),
                              (False, False), (False, True)]:
        extract(archive, gzipping=gzipping, sorting=sorting)