#!/usr/bin/python
# convert.py
# Convert a Wikipedia XML dump into a bunch of lzmaed archives and also make
# a search index (name only).

"""
Alexandria - Offline Wikipedia Viewer
Copyright (C) 2008 Alex Roper

alexr@ugcs.caltech.edu

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 or 3 of the License
(at your option). Other licensing may be available, possibly for free.
Please contact the author to discuss.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""

copyright = \
"""Alexandria version 1, Copyright (C) 2008 Alex Roper
Alexandria comes with ABSOLUTELY NO WARRANTY. This is free software, and
you are welcome to copy, modify, and redistribute it under certain
conditions; see the file COPYING for details."""

import sys, xml.sax as sax, xml.sax.handler, tempfile, cPickle, bz2, os, shelve

# Default value, can be overridden on the command line.
MIN_CHUNK_SIZE = 8388608 * 2

def writechunk(pages, path):
  # if os.fork():
  f = open(path, 'wb')
  # Encode up front so the offset table counts bytes as written to the file,
  # not unicode characters. A chunk is a pickled list of page start offsets
  # followed by the concatenated UTF-8 page texts.
  data = [p.encode("UTF8") for p in pages]
  pagelen = map(len, data)
  pagestart = [sum(pagelen[:i]) for i in range(len(pagelen))]
  cPickle.dump(pagestart, f, protocol = -1)
  for d in data:
    f.write(d)
  f.close()
  # exit()

class Parser(xml.sax.handler.ContentHandler):
  def __init__(s, outdir, chunksize = MIN_CHUNK_SIZE):
    s.title = ""; s.text = ""; s.pen = None; s.chunks = s.csize = 0
    s.pages = []; s.chunksize = chunksize; s.outdir = outdir
    s.index = shelve.open("%s/index.shelve" % outdir, protocol = 2)
    s.redirect = shelve.open("%s/redirect.shelve" % outdir, protocol = 2)

  def startElement(s, name, attrs):
    if name == "title":
      s.pen = "title"
    elif name == "text":
      s.pen = "text"

  def endElement(s, name):
    if name in ("text", "title"):
      s.pen = None
    elif name == "page":
      s.pagehandler(s.title, s.text)
      s.text = ""; s.title = ""

  def characters(s, c):
    # SAX may deliver an element's contents in several calls, so accumulate
    # into whichever field (title or text) is currently open.
    if s.pen:
      setattr(s, s.pen, getattr(s, s.pen) + c)

  def pagehandler(s, title, text):
    if text[:9] == "#REDIRECT":
      # Strip the '#REDIRECT [[' prefix and the closing ']]'.
      s.redirect[title.encode("UTF8")] = text[12:-2]
    else:
      s.index[title.encode("UTF8")] = (s.chunks, len(s.pages))
      s.pages += [text]
      s.csize += len(text)
      if s.csize > s.chunksize:
        writechunk(s.pages, "%s/chunk%i.dat" % (s.outdir, s.chunks))
        s.pages = []; s.csize = 0; s.chunks += 1
        sys.stdout.write("."); sys.stdout.flush()

  def endDocument(s):
    # Flush whatever is left over into a final, possibly short, chunk.
    if len(s.pages) > 0:
      writechunk(s.pages, "%s/chunk%i.dat" % (s.outdir, s.chunks))
      s.chunks += 1
    print "\nWrote %i chunks." % s.chunks
    s.redirect.close(); s.index.close()

  def __del__(s):
    s.redirect.close(); s.index.close()

if __name__ == "__main__":
  # Sacrifice a goat to Stallman and the FSF :-)
  print copyright
  if len(sys.argv) in (2, 3):
    if len(sys.argv) == 3:
      MIN_CHUNK_SIZE = int(sys.argv[2])
    # Process the pages
    sax.parse(sys.stdin, Parser(sys.argv[1], MIN_CHUNK_SIZE))
  else:
    print "Usage: cat wikipedia.dump | ./convert.py outdir [min_chunksize_in_bytes]"
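
# A minimal sketch (an illustration, not part of the original tool) of how a
# viewer could look a page back up from the files written above. The name
# readpage is hypothetical; everything it relies on is the format produced by
# writechunk() and pagehandler(): a shelve index mapping title to
# (chunk number, page number), and chunk files holding a pickled list of byte
# offsets followed by the concatenated UTF-8 page texts.
def readpage(outdir, title):
  index = shelve.open("%s/index.shelve" % outdir)
  chunk, n = index[title.encode("UTF8")]
  index.close()
  f = open("%s/chunk%i.dat" % (outdir, chunk), 'rb')
  pagestart = cPickle.load(f)
  base = f.tell()               # page data starts right after the pickle
  f.seek(base + pagestart[n])
  if n + 1 < len(pagestart):
    data = f.read(pagestart[n + 1] - pagestart[n])
  else:
    data = f.read()             # the last page in a chunk runs to end of file
  f.close()
  return data.decode("UTF8")

# Example (assuming a dump was already converted into outdir):
#   print readpage("outdir", u"Anarchism")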