#!/usr/bin/python
# convert.py
# Convert a Wikipedia XML dump into a bunch of lzmaed archives and also make
# a search index (name only).

"""
Alexandria - Offline Wikipedia Viewer
Copyright (C) 2008 Alex Roper

alexr@ugcs.caltech.edu

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 or 3 of the License
(at your option). Other licensing may be available, possibly for free.
Please contact the author to discuss.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""

copyright = \
"""Alexandria version 1, Copyright (C) 2008 Alex Roper
Alexandria comes with ABSOLUTELY NO WARRANTY. This is free software, and
you are welcome to copy, modify, and redistribute it under certain
conditions; see the file COPYING for details."""

import sys, xml.sax as sax, xml.sax.handler, tempfile, cPickle, bz2, os, shelve

# Default value, can be overridden on the command line.
MIN_CHUNK_SIZE = 8388608 * 2

def writechunk(pages, path):
  # if os.fork():
  f = open(path, 'wb')
  # Encode up front so the offset table counts bytes as written to the file,
  # not unicode characters. A chunk is a pickled list of page start offsets
  # followed by the concatenated UTF-8 page texts.
  data = [p.encode("UTF8") for p in pages]
  pagelen = map(len, data)
  pagestart = [sum(pagelen[:i]) for i in range(len(pagelen))]
  cPickle.dump(pagestart, f, protocol = -1)
  for d in data:
    f.write(d)
  f.close()
  # exit()

class Parser(xml.sax.handler.ContentHandler):
  def __init__(s, outdir, chunksize = MIN_CHUNK_SIZE):
    s.title = ""; s.text = ""; s.pen = None; s.chunks = s.csize = 0
    s.pages = []; s.chunksize = chunksize; s.outdir = outdir
    s.index = shelve.open("%s/index.shelve" % outdir, protocol = 2)
    s.redirect = shelve.open("%s/redirect.shelve" % outdir, protocol = 2)

  def startElement(s, name, attrs):
    if name == "title":
      s.pen = "title"
    elif name == "text":
      s.pen = "text"

  def endElement(s, name):
    if name in ("text", "title"):
      s.pen = None
    elif name == "page":
      s.pagehandler(s.title, s.text)
      s.text = ""; s.title = ""

  def characters(s, c):
    # SAX may deliver an element's contents in several calls, so accumulate
    # into whichever field (title or text) is currently open.
    if s.pen:
      setattr(s, s.pen, getattr(s, s.pen) + c)

  def pagehandler(s, title, text):
    if text[:9] == "#REDIRECT":
      # Strip the '#REDIRECT [[' prefix and the closing ']]'.
      s.redirect[title.encode("UTF8")] = text[12:-2]
    else:
      s.index[title.encode("UTF8")] = (s.chunks, len(s.pages))
      s.pages += [text]
      s.csize += len(text)
      if s.csize > s.chunksize:
        writechunk(s.pages, "%s/chunk%i.dat" % (s.outdir, s.chunks))
        s.pages = []; s.csize = 0; s.chunks += 1
        sys.stdout.write("."); sys.stdout.flush()

  def endDocument(s):
    # Flush whatever is left over into a final, possibly short, chunk.
    if len(s.pages) > 0:
      writechunk(s.pages, "%s/chunk%i.dat" % (s.outdir, s.chunks))
      s.chunks += 1
    print "\nWrote %i chunks." % s.chunks
    s.redirect.close(); s.index.close()

  def __del__(s):
    s.redirect.close(); s.index.close()

if __name__ == "__main__":
  # Sacrifice a goat to Stallman and the FSF :-)
  print copyright
  if len(sys.argv) in (2, 3):
    if len(sys.argv) == 3:
      MIN_CHUNK_SIZE = int(sys.argv[2])
    # Process the pages
    sax.parse(sys.stdin, Parser(sys.argv[1], MIN_CHUNK_SIZE))
  else:
    print "Usage: cat wikipedia.dump | ./convert.py outdir [min_chunksize_in_bytes]"
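
# A minimal sketch (an illustration, not part of the original tool) of how a
# viewer could look a page back up from the files written above. The name
# readpage is hypothetical; everything it relies on is the format produced by
# writechunk() and pagehandler(): a shelve index mapping title to
# (chunk number, page number), and chunk files holding a pickled list of byte
# offsets followed by the concatenated UTF-8 page texts.
def readpage(outdir, title):
  index = shelve.open("%s/index.shelve" % outdir)
  chunk, n = index[title.encode("UTF8")]
  index.close()
  f = open("%s/chunk%i.dat" % (outdir, chunk), 'rb')
  pagestart = cPickle.load(f)
  base = f.tell()               # page data starts right after the pickle
  f.seek(base + pagestart[n])
  if n + 1 < len(pagestart):
    data = f.read(pagestart[n + 1] - pagestart[n])
  else:
    data = f.read()             # the last page in a chunk runs to end of file
  f.close()
  return data.decode("UTF8")

# Example (assuming a dump was already converted into outdir):
#   print readpage("outdir", u"Anarchism")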