"""Round-trip an HTML file through HTMLParser, re-emitting every token."""

try:  # Python 3 location of the parser
    from html.parser import HTMLParser
except ImportError:  # Python 2 location of the parser
    from HTMLParser import HTMLParser


class VerbatimParser(HTMLParser):
    """HTML parser that writes each token it sees back out as markup.

    Every handler re-serialises its token and passes it to ``emit``, so
    feeding a document through the parser reproduces that document on
    ``out``.  Subclasses can override individual handlers to rewrite only
    the tokens they care about.
    """

    def __init__(self, out):
        """Create a parser that writes to *out*.

        *out* is any object with a ``write(text)`` method.
        """
        # HTMLParser is an old-style class on Python 2, so super() cannot be
        # used; call the base initialiser directly.
        try:
            # Newer parsers decode character references inside handle_data
            # unless told otherwise, which would bypass handle_entityref /
            # handle_charref and lose the original spelling.
            HTMLParser.__init__(self, convert_charrefs=False)
        except TypeError:
            # Older parsers do not know the keyword and never convert.
            HTMLParser.__init__(self)
        self.out = out

    def emit(self, text):
        """Write *text* to the output object."""
        self.out.write(text)

    def handle_starttag(self, tag, attrs):
        """Emit the start tag exactly as it appeared in the input."""
        self.emit(self.get_starttag_text())

    def handle_endtag(self, tag):
        """Emit the closing tag for *tag* (using the name the parser reports)."""
        self.emit('</' + tag + '>')

    def handle_startendtag(self, tag, attrs):
        """Emit a self-closing tag exactly as it appeared in the input."""
        self.emit(self.get_starttag_text())

    def handle_data(self, data):
        """Emit text content unchanged."""
        self.emit(data)

    def handle_entityref(self, name):
        """Emit a named character reference such as ``&amp;``."""
        self.emit('&')
        self.emit(name)
        self.emit(';')

    def handle_charref(self, name):
        """Emit a numeric character reference such as ``&#169;``."""
        self.emit('&#')
        self.emit(name)
        self.emit(';')

    def handle_comment(self, data):
        """Emit a comment, restoring its ``<!--`` / ``-->`` delimiters."""
        self.emit('<!--' + data + '-->')

    def handle_decl(self, decl):
        """Emit a declaration (e.g. a doctype), restoring ``<!`` and ``>``."""
        self.emit('<!' + decl + '>')

    def handle_pi(self, data):
        """Emit a processing instruction, restoring ``<?`` and ``>``."""
        self.emit('<?' + data + '>')

    def unknown_decl(self, data):
        """Emit an unrecognised marked section, restoring ``<![`` and ``]>``.

        NOTE(review): sections that close with ``]]>`` (e.g. CDATA) may not
        round-trip exactly; confirm against the parser version in use.
        """
        self.emit('<![' + data + ']>')


def doit(infile, outfile):
    """Parse *infile* and write the re-serialised markup to *outfile*.

    The input is read completely before the output is opened, so a missing
    input does not truncate an existing output file and *infile* may be the
    same path as *outfile*.
    """
    with open(infile) as f:
        markup = f.read()
    with open(outfile, 'w') as fout:
        parser = VerbatimParser(fout)
        parser.feed(markup)
        parser.close()


if __name__ == '__main__':
    doit('test1.html', 'test1b.html')