# -*- coding: utf-8 -*-
"""
Created on Mon Dec 28 13:23:09 2020
Possible bug in HTMLParser()
At the end of this script are two simple identical tests (on lines 215, 229),
producing string results that are compared with a reference. The tests parse
a single HTML element
The two results should be the result of using different parser variables, and
produce the same answer.
The first test gives a correct result; the second test gives an error; it
seems not to have disposed of the first result despite a reset in a new parser.
The same effect happens with more complex HTML. ]
@author: aeh
Anthony Hodson
aeh@xdotd.com or aeh@xdotd.co.uk
Spring Lanes House, Holly Spring Lane,
Bracknell, Berks RG12 2JL
Tel: 01344 483347
Mob: 0771 360 7086
Web: www.xdotd.com www.fosmw.com
"""
from html.parser import HTMLParser
parser = []
class MyHTMLParser(HTMLParser):
ls= list()
#HTML Parser Methods
def handle_starttag(self, startTag, attrs):
attr_list = list()
for attr in attrs:
attr_list.append(attr)
self.ls.append(("tag:",startTag, (attr_list)))
def handle_endtag(self, endTag):
self.ls.append(("etag:",endTag))
def handle_startendtag(self,startendTag, attrs):
attr_list = list()
for attr in attrs:
attr_list.append(attr)
self.ls.append(("setag:", startendTag, (attr_list)))
def handle_data(self,data):
self.ls.append(("txt:",data))
def handle_comment(self,data):
self.ls.append(("cmt:",data))
def handle_pi(self, processing_instruction):
self.ls.append(("pi:", processing_instruction))
def handle_decl(self, declaration):
self.ls.append(("decl:", declaration))
def close_parse_structure():
global parser
try:
parser.close()
except AttributeError:
parser = None
def make_marked_fragment(test_string, index, scope):
""" Creates a segment of tjhe test string that is marked up
The letter corresponding to the index is placed between vertical bars
To left and to right of the test string are adjacent substrings of
the test string of length scope, truncated if the bottom or top of
test_string does not have enough letters
\n is replaced by ~ and spaces replaced by _
"""
if (index-scope)< 0:
lower_scope = scope + (index-scope)
lower_index = 0
else:
lower_scope = scope
test_index = index
lower_index = (index-scope)
whole_section = test_string[lower_index:index+scope+1]
ref_string = ""
for i in range(len(whole_section)):
if whole_section[i] == '\n':
ref_string+= '~'
continue
elif whole_section[i] == ' ':
ref_string+= '_'
continue
ref_string+= whole_section[i]
return (ref_string[0:lower_scope]+"|"+ref_string[lower_scope]+
"|" + ref_string[lower_scope+1:2*scope+1])
def first_difference(str1, str2):
position = -1
len1 = len(str1)
len2 = len(str2)
if len1 < len2:
str1 += " "
str2 = str2[0:(len1+1)]
differences = "First string is shorter. "
elif len1 > len2:
str2 += " "
str1 = str1[0:(len2+1)]
differences = "First string is longer. "
else:
differences = ""
for a, b in zip(str1, str2):
position += 1
if a != b:
if str(a) == "\n":
a = "\\n"
elif str(b) == "\n":
b = "\\n"
return (differences + "First mismatch at " + str(position) + "\n"
"RESULT = " + make_marked_fragment(str1,position,10) + "\n" +
"REFNCE = " + make_marked_fragment(str2,position,10) + "\n\n"+
str1[0:position])
return differences + "No mismatch otherwise"
summary = ""
def test(description,result,reference):
global summary
print("\"" + description + "\"")
print("RESULT\n"+result)
if result == reference:
print("OK\n")
else:
print("REFERENCE\n"+reference)
print(first_difference(result, reference) + "\n")
summary += "\"" + description + "\" failed\n"
parser = None
indent = " "
def recursive_plist_analyse(structure, limit):
""" Convert a HTMLParser structure into text that represents it textually.
structure is the HTMLParser structure
limit is the number of members of the list to be analysed.
This method is recursive, and recurses inside lists and tuples. In the
representatation, the depth if recursion is indicated by indentation.
It can only be used by plist_analyse*()
Indirectly tested in tools_test.py
"""
global stack
global out
level = len(stack)
p = structure
if isinstance(p, (list,tuple)):
out += indent[0:4*level] + str(stack) + ": "+ str(type(p)).replace("", "[" + str(len(p)) + "]") + "\n"
for i in range(len(p)):
if limit > 0 and i > limit:
break
stack.append(i)
recursive_plist_analyse(p[i], limit)
stack.pop()
elif isinstance(p, str):
out += indent[0:4*level] + str(stack) + "=\"" + p + "\"" + "\n"
else:
out += indent[0:4*level] + str(stack) +": "+str(type(p))+ " unknown" + "\n"
return out
def plist_analyse(structure, _limit=10):
""" Convert a HTMLParser structure into a text that represents it textually.
structure is the HTMLParser structure
limit is the number of members of the list to be analysed.
This method uses recursively recursive_list_analyse(structure, limit).
This recurses inside lists and tuples.
In the reoresentatation, the depth if recursion is indicated by four-space
indentation.
Tested in tools_test.py
"""
global stack
global out
out = ""
global stack
stack = []
limit = _limit
out = recursive_plist_analyse(structure, limit)
return out
def plist_html_string(html_string, limit=0):
print(html_string)
""" Return a structured string representing HTML from a supplied string
html_string is the name of the string.
limit is the maximum number of items to be returned.
Tested in tools_test.py
"""
parser.feed(html_string)
print("plist_html_string*( returns list with " + str(len(parser.ls)) + " elements)")
return_string = plist_analyse(parser.ls, limit)
parser.close()
return return_string
#=============================================================================
parser = MyHTMLParser()
test("Simple test of Html parses with one element",
plist_html_string("
"),
(
"[]: list[1]\n"
" [0]: tuple[3]\n"
" [0, 0]=\"tag:\"\n"
" [0, 1]=\"br\"\n"
" [0, 2]: list[0]\n"
))
parser.close()
parser = MyHTMLParser()
test("Simple test of Html parses with one element",
plist_html_string("
"),
(
"[]: list[1]\n"
" [0]: tuple[3]\n"
" [0, 0]=\"tag:\"\n"
" [0, 1]=\"br\"\n"
" [0, 2]: list[0]\n"
))
parser.close()