from sgmllib import SGMLParser class URLLister(SGMLParser): def __init__(self): SGMLParser.__init__(self) self.reset() def reset(self): SGMLParser.reset(self) self.vul_value = [] self.is_table = None self.is_tbody = None self.is_desc = None self.is_date = None self.is_tr = None self.is_a = None self.is_cve = None self.is_vul_href = None self.is_vul_name = None self.is_small = None self.is_td = None self.cve_name = "" self.vul_name = "" self.href_name = "" def start_table(self, attrs): table_desc = [ v for k, v in attrs if k == "class" and v == "tablesorter"] if table_desc: self.is_table = True def end_table(self): self.is_table = False def start_td(self, attrs): if self.is_table: self.is_td= True def end_td(self): self.is_td = False def start_a(self, attrs): if self.is_td: cve_href = [v for k, v in attrs if k == "target" and v == "_blank"] if cve_href: self.is_a = True self.is_cve = True #for SGMLParser maybe have a bug,a have two has problem vul_href = [v for k, v in attrs if k == "style"] print vul_href if vul_href: vul_href = "".join([v for k, v in attrs if k == "href"]) if vul_href.find("cve") == -1: self.href_name = vul_href else: self.href_name = "" def end_a(self): self.is_a = False self.is_cve = False def handle_data(self,data): if data != None: data = data.rstrip() data = data.lstrip() if __name__ == "__main__": the_page =''' test

title

ok ok
CPAI-2012-809 CVE-2011-2089
SCADA ICONICS WebHMI ActiveX Stack Overflow (2011-2089)
''' lister = URLLister() lister.feed(the_page)