#$Id$
# -*- coding: utf-8 -*-
# Small harness that records HTMLParser events as text lines and checks that
# a "strict" and a "non-strict" parser report the same events for a document.

from html.parser import HTMLParser  #p3


class HT(HTMLParser):
    """HTMLParser that logs start tags, end tags and text into ``me.out``.

    Each event is stored as one string: ``"start <tag>"``, ``"end <tag>"``
    or ``"data <text>"``.  Tag attributes are ignored.
    """

    def __init__(me, *a, **k):
        """Create the parser; all arguments are forwarded to HTMLParser.

        The ``strict`` keyword was deprecated in Python 3.3 and removed in
        3.5.  When the underlying HTMLParser rejects it, it is dropped and
        construction is retried, so ``HT(strict=...)`` keeps working (the
        parser is then always the tolerant one).

        Raises:
            TypeError: if HTMLParser rejects the arguments for any reason
                other than the removed ``strict`` keyword.
        """
        me.out = []
        try:
            super().__init__(*a, **k)
        except TypeError:
            if 'strict' not in k:
                raise
            # Retry without the removed keyword; any other bad argument
            # raises again from this second call.
            remaining = {name: value for name, value in k.items()
                         if name != 'strict'}
            super().__init__(*a, **remaining)

    def handle_starttag(me, tag, attrs):
        """Record an opening tag as ``"start <tag>"`` (attrs are unused)."""
        me.out.append("start {}".format(tag))

    def handle_endtag(me, tag):
        """Record a closing tag as ``"end <tag>"``."""
        me.out.append("end {}".format(tag))

    def handle_data(me, data):
        """Record text as ``"data <text>"``, skipping whitespace-only runs."""
        text = data.strip()
        if text:
            me.out.append("data {}".format(text))


if __name__ == '__main__':
    import sys

    if sys.argv[1:]:
        # A URL was given: download it and decode using the detected charset.
        url = sys.argv[1]
        from urllib.request import urlopen  #p3
        # The response is a context manager; close it once the body is read.
        with urlopen(url) as u:
            d = u.read()
        import chardet  #chardet.feedparser.org, python3-chardet, python-chardet
        # detect() also reports a 'confidence' value; only the name is used.
        # The encoding may be None when nothing could be detected.
        enc = chardet.detect(d)['encoding'] or 'utf-8'
        d = d.decode(enc)
    else:
        # No argument: use the built-in sample document.
        d = '''
- software-and-i - library
'''

    # Parse the same document with both parser flavours.
    hs = HT(strict=1)
    hs.feed(d)
    hs.close()

    hn = HT(strict=0)
    hn.feed(d)
    hn.close()

    # Show both event logs before failing so the difference can be inspected.
    if hs.out != hn.out:
        print('strict: ===============\n' + '\n'.join(hs.out))
        print('nonstrict: ============\n' + '\n'.join(hn.out))
    assert hs.out == hn.out

# vim:ts=4:sw=4:expandtab