# coding=UTF-8
"""Convert a MediaWiki XML history dump into batched SQL INSERT statements.

For every ``<revision>`` found inside a ``<page>`` one value tuple
``('<contributor id>','<is MediaWiki: page>','<timestamp digits>')`` is
appended to ``<input file>.sql``.  The output starts with a verbatim copy of
a "create" SQL file, and tuples are grouped into multi-row INSERT statements
of at most ``max_inserts_num`` rows each.
"""
from xml.parsers import expat
import sys


class Parser:
    """Streaming (expat based) dump-to-SQL converter.

    Typical use::

        p = Parser()
        p.feedFile('dump.xml', 'create.sql', 5000, "INSERT INTO t VALUES\\n")
        p.close()

    A Parser instance wraps a single expat parser, so it can process only
    one input file.
    """

    # Characters stripped from ISO timestamps such as 2005-01-02T03:04:05Z.
    _TIMESTAMP_SEPARATORS = "-T:Z"

    def __init__(self):
        self._parser = expat.ParserCreate()
        # Expat may deliver one text node in several pieces (around entity or
        # character references, or where a read buffer ends).  The data
        # handler below *assigns* values, so let expat merge the pieces into
        # one callback instead of keeping only the last fragment.
        self._parser.buffer_text = True
        self._parser.StartElementHandler = self.start
        self._parser.EndElementHandler = self.end
        self._parser.CharacterDataHandler = self.data
        self._f = None          # output file, set by feedFile()
        self._finished = False  # True once ParseFile() has run

    def feedFile(self, inputFileName, createSQLFileName, max_inserts_num,
                 insert_header_sql):
        """Parse ``inputFileName`` and write ``inputFileName + '.sql'``.

        inputFileName      -- path of the XML dump to read.
        createSQLFileName  -- path of a file copied verbatim to the start of
                              the output.
        max_inserts_num    -- maximum number of rows per INSERT statement;
                              values below 1 give one row per statement.
        insert_header_sql  -- text written at the start of each INSERT
                              statement, in front of the first value tuple.

        Raises IOError/OSError if a file cannot be opened and
        expat.ExpatError if the input is not well-formed XML.  All files
        opened here are closed before returning, even on error.
        """
        # init basic flag(s)
        self.inPage = 0
        self.page_counter = 0
        self.inTag = {}
        self.max_inserts_count = 0
        # init stuff from params
        self.max_inserts = max_inserts_num
        self.insert_header = insert_header_sql

        with open(inputFileName + '.sql', 'wb') as sql_out:
            self._f = sql_out
            # add create sql
            with open(createSQLFileName, 'rb') as create_sql:
                sql_out.write(create_sql.read())
            # run parser (binary mode: expat does its own decoding)
            try:
                with open(inputFileName, 'rb') as xml_in:
                    self._parser.ParseFile(xml_in)
            finally:
                # ParseFile() finalises the expat parser whether it succeeds
                # or fails, so close() must not feed it again.
                self._finished = True
            # Terminate the last, possibly partial, INSERT statement.
            if self.max_inserts_count:
                self._write(";\n")

    def close(self):
        """Release the expat parser; safe to call more than once.

        If no file was fed, expat is first told that the data has ended,
        which raises expat.ExpatError for the resulting empty document.
        After feedFile() the parser is already final and is only dropped,
        because feeding a finished expat parser raises "parsing finished".
        """
        parser = getattr(self, '_parser', None)
        if parser is None:
            return
        try:
            if not self._finished:
                parser.Parse("", True)  # end of data
        finally:
            self._finished = True
            del self._parser  # get rid of circular references

    # Any tag start found. Note! page must not contain other page tag!
    def start(self, tag, attrs):
        """Expat start-tag handler: track open tags inside a page."""
        if tag == 'page':
            self.inPage = 1
            self.page_counter += 1
            self.rev_row_clear()
        elif self.inPage:
            self.inTag[tag] = self.inTag.get(tag, 0) + 1
            if tag == 'revision':
                self.rev_row_partial_clear()

    # Any tag end found
    def end(self, tag):
        """Expat end-tag handler: emit a row when a revision closes."""
        if tag == 'page':
            self.inPage = 0
        elif self.inPage:
            self.inTag[tag] -= 1
            if tag == 'revision':
                self.rev_row_insert()

    # Any data found
    def data(self, data):
        """Expat text handler: store the fields needed for the current row."""
        if not self.inPage:
            return
        is_open = self.inTag.get
        if is_open('contributor', 0):
            # Only the contributor's <id> is kept; other contributor children
            # are ignored, and a missing id is written as '0' later on.
            if is_open('id', 0):
                self.rev_row['user'] = data
        elif is_open('title', 0):
            print('@page: ' + data)
            if data.startswith("MediaWiki:"):
                self.rev_row['is_mw_ns'] = 1
        elif is_open('revision', 0):
            if is_open('id', 0):
                self.rev_row['id'] = data
            elif is_open('timestamp', 0):
                # 2005-01-02T03:04:05Z -> 20050102030405.  Filtering the
                # characters behaves the same on Python 2 and 3, unlike
                # the two-argument str.translate().
                self.rev_row['log_timestamp'] = ''.join(
                    ch for ch in data
                    if ch not in self._TIMESTAMP_SEPARATORS)

    # clear rev_row and init its data if needed
    def rev_row_clear(self):
        """Start a fresh row for a new page."""
        self.rev_row = {'is_mw_ns': 0}

    def rev_row_partial_clear(self):
        """Start a fresh row for a new revision, keeping the page-level flag."""
        self.rev_row = {'is_mw_ns': self.rev_row['is_mw_ns']}

    # insert rev_row (output SQL)
    def rev_row_insert(self):
        """Append the current row to the output as one SQL value tuple.

        Raises KeyError if the revision had no <timestamp>.
        """
        # Close a full batch.  The extra truth test keeps a non-positive
        # max_inserts from emitting a stray ';' before the very first row.
        if self.max_inserts_count and \
                self.max_inserts_count >= self.max_inserts:
            self.max_inserts_count = 0
            self._write(";\n")
        if self.max_inserts_count == 0:
            self._write(self.insert_header)
        else:
            self._write(",")
        # NOTE(review): values are interpolated without SQL escaping, so a
        # quote character in the dump's contributor id or timestamp would
        # break the statement -- confirm inputs are trusted or add escaping
        # suited to the target database.
        self._write("('%s','%s','%s')\n" % (
            self.rev_row.get('user', '0'),
            self.rev_row['is_mw_ns'],
            self.rev_row['log_timestamp'],
        ))
        self.max_inserts_count += 1

    def _write(self, text):
        """Write text to the binary output file, encoding it as UTF-8."""
        if not isinstance(text, bytes):
            text = text.encode('utf-8')
        self._f.write(text)


#
# Run
#
def main():
    """Convert the bundled test dump.

    For a full run point the first argument of feedFile() at a complete
    history dump, e.g. 'plwiki-latest-stub-meta-history.xml'.
    """
    p = Parser()
    try:
        p.feedFile('revision-test.xml', 'revision_create.sql', 5000,
                   "INSERT INTO revision (rev_user, rev_is_mw_ns, rev_timestamp) VALUES\n")
    finally:
        p.close()


if __name__ == "__main__":
    main()