# -*- coding: utf-8 -*-
"""
Created on Mon Dec 28 13:23:09 2020

Possible bug in HTMLParser()

At the end of this script are two simple identical tests, producing string
results that are compared with a reference. The tests parse a single HTML
element, <br>.

The two results should be the result of using different parser variables, and
produce the same answer. The first test gave a correct result; the second test
gave an error; it seemed not to have disposed of the first result despite a
reset in a new parser. The same effect happened with more complex HTML.

Resolution: the fault was in this script, not in HTMLParser. MyHTMLParser kept
its event list ``ls`` as a class attribute, so every instance appended to the
same list. The list is now created per instance in reset().

@author: aeh
Anthony Hodson
aeh@xdotd.com or aeh@xdotd.co.uk
Spring Lanes House, Holly Spring Lane, Bracknell, Berks RG12 2JL
Tel: 01344 483347
Mob: 0771 360 7086
Web: www.xdotd.com
     www.fosmw.com
"""
from html.parser import HTMLParser

# Parser used by plist_html_string(); the caller binds a fresh MyHTMLParser
# to this name before each parse.
parser = None

# One level of indentation in the text produced by plist_analyse().
indent = " " * 4

# Accumulates one line per failed test() call.
summary = ""


class MyHTMLParser(HTMLParser):
    """HTMLParser that records every parse event, in order, in ``self.ls``.

    Each event is a tuple whose first member is a marker string ("tag:",
    "etag:", "setag:", "txt:", "cmt:", "pi:" or "decl:") followed by the
    payload passed to the corresponding handler. For tags the payload is the
    tag name and a list of (name, value) attribute tuples.
    """

    def reset(self):
        """Reset the parser and start a new, empty event list.

        HTMLParser calls reset() implicitly at instantiation time, so every
        instance gets its own list. A class-level list would be shared by all
        instances and make a second parser appear to remember the first
        parser's results.
        """
        super().reset()
        self.ls = []

    # HTML Parser Methods
    def handle_starttag(self, startTag, attrs):
        """Record an opening tag with a copy of its attribute list."""
        self.ls.append(("tag:", startTag, list(attrs)))

    def handle_endtag(self, endTag):
        """Record a closing tag."""
        self.ls.append(("etag:", endTag))

    def handle_startendtag(self, startendTag, attrs):
        """Record a self-closing tag with a copy of its attribute list."""
        self.ls.append(("setag:", startendTag, list(attrs)))

    def handle_data(self, data):
        """Record text data."""
        self.ls.append(("txt:", data))

    def handle_comment(self, data):
        """Record a comment."""
        self.ls.append(("cmt:", data))

    def handle_pi(self, processing_instruction):
        """Record a processing instruction."""
        self.ls.append(("pi:", processing_instruction))

    def handle_decl(self, declaration):
        """Record a declaration."""
        self.ls.append(("decl:", declaration))


def close_parse_structure():
    """Close the module-level parser if it has a close() method.

    If it has none (for example when it is None), the module-level parser is
    set to None.
    """
    global parser
    try:
        parser.close()
    except AttributeError:
        parser = None


# Makes line breaks and spaces visible in marked fragments.
_MARKUP_TABLE = str.maketrans({"\n": "~", " ": "_"})


def make_marked_fragment(test_string, index, scope):
    """
    Create a segment of the test string that is marked up.

    The letter corresponding to the index is placed between vertical bars.
    To left and to right of it are the adjacent substrings of the test string
    of length scope, truncated if the bottom or top of test_string does not
    have enough letters.
    \\n is replaced by ~ and spaces are replaced by _.

    If index lies beyond the end of test_string, nothing is placed between
    the bars.
    """
    start = max(index - scope, 0)
    # Offset of the marked letter inside the extracted section.
    marked = index - start
    section = test_string[start:index + scope + 1].translate(_MARKUP_TABLE)
    return (section[:marked] + "|" + section[marked:marked + 1] + "|"
            + section[marked + 1:])


def first_difference(str1, str2):
    """
    Describe the first position at which str1 and str2 differ.

    str1 is the result and str2 the reference. If the lengths differ, the
    shorter string is padded with one space and the longer one is cut to the
    same length, so that a missing tail shows up as a mismatch; the returned
    text then starts with a note on which string is shorter.

    Returns the mismatch position, a marked fragment of each string around
    it, and the part of str1 before the mismatch; or a "No mismatch" message
    if no difference is found.
    """
    len1, len2 = len(str1), len(str2)
    if len1 < len2:
        str1 += " "
        str2 = str2[:len1 + 1]
        differences = "First string is shorter. "
    elif len1 > len2:
        str2 += " "
        str1 = str1[:len2 + 1]
        differences = "First string is longer. "
    else:
        differences = ""
    for position, (a, b) in enumerate(zip(str1, str2)):
        if a != b:
            return (differences + "First mismatch at " + str(position) + "\n"
                    "RESULT = " + make_marked_fragment(str1, position, 10)
                    + "\n"
                    + "REFNCE = " + make_marked_fragment(str2, position, 10)
                    + "\n\n" + str1[:position])
    return differences + "No mismatch otherwise"


def test(description, result, reference):
    """
    Print a test result and compare it with its reference.

    Prints the description and the result. On a match prints OK; otherwise
    prints the reference and the first difference, and adds a line for the
    failure to the module-level summary.
    """
    global summary
    print("\"" + description + "\"")
    print("RESULT\n" + result)
    if result == reference:
        print("OK\n")
    else:
        print("REFERENCE\n" + reference)
        print(first_difference(result, reference) + "\n")
        summary += "\"" + description + "\" failed\n"


# This is a reporting helper, not a pytest test, despite its name; the flag
# keeps pytest from collecting it when this module is star-imported.
test.__test__ = False


def _append_plist_lines(structure, limit, path, lines):
    """Append the text lines describing structure, found at path, to lines."""
    prefix = indent * len(path) + str(path)
    if isinstance(structure, (list, tuple)):
        lines.append(prefix + ": " + type(structure).__name__
                     + "[" + str(len(structure)) + "]" + "\n")
        for i, member in enumerate(structure):
            # limit is a count of members; 0 (or less) means no limit.
            if limit > 0 and i >= limit:
                break
            path.append(i)
            _append_plist_lines(member, limit, path, lines)
            path.pop()
    elif isinstance(structure, str):
        lines.append(prefix + "=\"" + structure + "\"" + "\n")
    else:
        lines.append(prefix + ": " + str(type(structure)) + " unknown" + "\n")


def recursive_plist_analyse(structure, limit):
    """
    Convert a HTMLParser structure into text that represents it textually.

    structure is the HTMLParser structure
    limit is the number of members of each list or tuple to be analysed;
    0 means all of them.

    This method recurses inside lists and tuples. In the representation, the
    depth of recursion is indicated by four-space indentation, and each line
    carries the list of indices that leads to the member it describes.

    Indirectly tested in tools_test.py
    """
    lines = []
    _append_plist_lines(structure, limit, [], lines)
    return "".join(lines)


def plist_analyse(structure, _limit=10):
    """
    Convert a HTMLParser structure into a text that represents it textually.

    structure is the HTMLParser structure
    _limit is the number of members of each list or tuple to be analysed;
    0 means all of them.

    This method uses recursive_plist_analyse(structure, limit), which
    recurses inside lists and tuples. In the representation, the depth of
    recursion is indicated by four-space indentation.

    Tested in tools_test.py
    """
    return recursive_plist_analyse(structure, _limit)


def plist_html_string(html_string, limit=0):
    """
    Return a structured string representing HTML from a supplied string.

    html_string is the HTML text to be parsed.
    limit is the maximum number of items to be returned for each list or
    tuple; 0 means all of them.

    The module-level parser is used and is closed by this call. Events
    accumulate on a parser instance, so bind a fresh MyHTMLParser to parser
    before each call.

    Tested in tools_test.py
    """
    print(html_string)
    parser.feed(html_string)
    # Close before analysing so that any input the parser was still
    # buffering is turned into events first.
    parser.close()
    print("plist_html_string*( returns list with " + str(len(parser.ls))
          + " elements)")
    return plist_analyse(parser.ls, limit)


# =============================================================================

# Expected representation of the single element <br>.
_BR_REFERENCE = (
    "[]: list[1]\n"
    "    [0]: tuple[3]\n"
    "        [0, 0]=\"tag:\"\n"
    "        [0, 1]=\"br\"\n"
    "        [0, 2]: list[0]\n"
)

parser = MyHTMLParser()
test("Simple test of Html parses with one element",
     plist_html_string("<br>"),
     _BR_REFERENCE)
parser.close()

parser = MyHTMLParser()
test("Simple test of Html parses with one element",
     plist_html_string("<br>"),
     _BR_REFERENCE)
parser.close()