"""Demonstrate an end-tag handling fix for HTMLParser CDATA elements.

The module holds two independent pieces:

* ``parse_endtag`` -- a replacement method body meant to be pasted into the
  standard ``HTMLParser`` module.
* ``MyHandler`` -- a small parser subclass that checks that start and end
  tags are properly nested, followed by a short driver that feeds it a
  sample document.
"""

try:
    import HTMLParser
except ImportError:
    # The module was renamed to ``html.parser`` in Python 3.
    from html import parser as HTMLParser


# This goes in line 318 of HTMLParser
def parse_endtag(self, i):
    """Parse the end tag that starts at ``rawdata[i]``.

    This is a patch snippet: it is written to live inside the HTMLParser
    module as a method of the parser class.  The names ``endendtag``,
    ``endtagfind`` and ``interesting_cdata`` are expected to be globals of
    that module; they are not defined in this file.

    While the parser is in CDATA mode, only the end tag that closes the
    element the parser is currently inside is reported through
    ``handle_endtag``; any other end tag is passed on verbatim through
    ``handle_data``.

    Returns -1 when no closing ``>`` is found after ``i``; otherwise returns
    the end position of the ``endendtag`` match.
    """
    rawdata = self.rawdata
    assert rawdata[i:i + 2] == "</", "unexpected call to parse_endtag"
    match = endendtag.search(rawdata, i + 1)  # >
    if not match:
        return -1
    j = match.end()
    match = endtagfind.match(rawdata, i)  # </ + tag + >
    if not match:
        self.error("bad end tag: %r" % (rawdata[i:j],))
    # Lower-case once so that the comparison with self.lasttag does not
    # depend on how the document spelled the tag name.
    tag = match.group(1).lower()
    # START BUGFIX
    if self.interesting == interesting_cdata:
        # We are inside one of the CDATA_CONTENT_ELEMENTS.
        if tag == self.lasttag and tag in self.CDATA_CONTENT_ELEMENTS:
            # This is the end tag of the CDATA element we are in.
            self.handle_endtag(tag)
            self.clear_cdata_mode()  # back to normal mode
        else:
            # Still inside the CDATA element: treat the tag as plain data.
            self.handle_data(match.group())
    else:
        # Not inside a CDATA element: standard end-tag handling.
        self.handle_endtag(tag)
    return j


class MyHandler(HTMLParser.HTMLParser):
    """Parser that verifies that start and end tags are properly nested.

    ``tags`` is the stack of currently open tag names, innermost last.
    """

    def reset(self):
        """Reset the parser and start over with an empty tag stack."""
        HTMLParser.HTMLParser.reset(self)
        # Created per instance so that separate parsers never share state.
        self.tags = []

    def handle_starttag(self, tag, attr):
        """Push the newly opened tag onto the stack."""
        self.tags.append(tag)

    def handle_endtag(self, tag):
        """Pop the innermost open tag, which must be ``tag``.

        Raises ValueError when ``tag`` does not match the innermost open
        tag, or when no tag is open at all.
        """
        if not self.tags:
            raise ValueError(
                "Not well-formed, endtag '" + tag + "' has no open starttag")
        if tag != self.tags[-1]:
            # This should never happen in a well-formed document.
            raise ValueError(
                "Not well-formed, endtag '" + tag +
                "' doesn't match starttag '" + self.tags[-1] + "'")
        self.tags.pop()


# NOTE(review): this sample contains no markup at all, so it exercises none
# of the tag handling above -- presumably the original tags were lost; confirm
# against the original document.
s = """
This page is completely well formed blah blah """
m = MyHandler()
m.feed(s)