Index: Lib/htmllib.py
===================================================================
--- Lib/htmllib.py (Revision 52618)
+++ Lib/htmllib.py (Arbeitskopie)
@@ -1,7 +1,7 @@
-"""HTML 2.0 parser.
+"""HTML 4.01 parser.
-See the HTML 2.0 specification:
-http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
+See the HTML 4.01 specification:
+http://www.w3.org/TR/html401
"""
import sgmllib
@@ -19,8 +19,7 @@
"""This is the basic HTML parser class.
It supports all entity names required by the XHTML 1.0 Recommendation.
- It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
- elements.
+ It also defines handlers for all HTML 4.01 elements.
"""
@@ -129,7 +128,7 @@
"""
self.handle_data(alt)
- # --------- Top level elememts
+ # --------- Top level elements
def start_html(self, attrs): pass
def end_html(self): pass
@@ -142,29 +141,38 @@
# ------ Head elements
- def start_title(self, attrs):
- self.save_bgn()
-
- def end_title(self):
- self.title = self.save_end()
-
def do_base(self, attrs):
for a, v in attrs:
if a == 'href':
self.base = v
- def do_isindex(self, attrs):
- self.isindex = 1
-
def do_link(self, attrs):
pass
def do_meta(self, attrs):
pass
- def do_nextid(self, attrs): # Deprecated
- pass
+ def start_noscript(self, attrs):
+ pass
+ def end_noscript(self):
+ pass
+ def start_script(self, attrs):
+ pass
+ def end_script(self):
+ pass
+
+ def start_style(self, attrs):
+ pass
+ def end_style(self):
+ pass
+
+ def start_title(self, attrs):
+ self.save_bgn()
+
+ def end_title(self):
+ self.title = self.save_end()
+
# ------ Body elements
# --- Headings
@@ -219,6 +227,32 @@
# --- Block Structuring Elements
+ def start_address(self, attrs):
+ self.formatter.end_paragraph(0)
+ self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
+
+ def end_address(self):
+ self.formatter.end_paragraph(0)
+ self.formatter.pop_font()
+
+ def start_bdo(self, attrs):
+ pass
+ def end_bdo(self):
+ pass
+
+ def start_div(self, attrs):
+ pass
+    def end_div(self):
+        pass
+
+ def start_blockquote(self, attrs):
+ self.formatter.end_paragraph(1)
+ self.formatter.push_margin('blockquote')
+
+ def end_blockquote(self):
+ self.formatter.end_paragraph(1)
+ self.formatter.pop_margin()
+
def do_p(self, attrs):
self.formatter.end_paragraph(1)
@@ -232,36 +266,28 @@
self.formatter.pop_font()
self.nofill = max(0, self.nofill - 1)
- def start_xmp(self, attrs):
- self.start_pre(attrs)
- self.setliteral('xmp') # Tell SGML parser
+ def start_q(self, attrs):
+ pass
+ def end_q(self):
+ pass
- def end_xmp(self):
- self.end_pre()
+ def start_span(self, attrs):
+ pass
+ def end_span(self):
+ pass
- def start_listing(self, attrs):
- self.start_pre(attrs)
- self.setliteral('listing') # Tell SGML parser
+ # --- Marking text
- def end_listing(self):
- self.end_pre()
-
- def start_address(self, attrs):
- self.formatter.end_paragraph(0)
- self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
-
- def end_address(self):
- self.formatter.end_paragraph(0)
- self.formatter.pop_font()
-
- def start_blockquote(self, attrs):
- self.formatter.end_paragraph(1)
- self.formatter.push_margin('blockquote')
-
- def end_blockquote(self):
- self.formatter.end_paragraph(1)
- self.formatter.pop_margin()
-
+ def start_del(self, attrs):
+ pass
+ def end_del(self):
+ pass
+
+ def start_ins(self, attrs):
+ pass
+ def end_ins(self):
+ pass
+
# --- List Elements
def start_ul(self, attrs):
@@ -298,18 +324,6 @@
self.formatter.end_paragraph(not self.list_stack)
self.formatter.pop_margin()
- def start_menu(self, attrs):
- self.start_ul(attrs)
-
- def end_menu(self):
- self.end_ul()
-
- def start_dir(self, attrs):
- self.start_ul(attrs)
-
- def end_dir(self):
- self.end_ul()
-
def start_dl(self, attrs):
self.formatter.end_paragraph(1)
self.list_stack.append(['dl', '', 0])
@@ -318,14 +332,20 @@
self.ddpop(1)
if self.list_stack: del self.list_stack[-1]
- def do_dt(self, attrs):
+ def start_dt(self, attrs):
self.ddpop()
- def do_dd(self, attrs):
+    def end_dt(self):
+        pass
+
+ def start_dd(self, attrs):
self.ddpop()
self.formatter.push_margin('dd')
self.list_stack.append(['dd', '', 0])
+    def end_dd(self):
+        self.ddpop()
+
def ddpop(self, bl=0):
self.formatter.end_paragraph(bl)
if self.list_stack:
@@ -337,12 +357,21 @@
# Idiomatic Elements
+ def start_abbr(self, attrs): pass
+ def end_abbr(self): pass
+
+ def start_acronym(self, attrs): pass
+ def end_acronym(self): pass
+
def start_cite(self, attrs): self.start_i(attrs)
def end_cite(self): self.end_i()
def start_code(self, attrs): self.start_tt(attrs)
def end_code(self): self.end_tt()
+ def start_dfn(self, attrs): pass
+ def end_dfn(self): pass
+
def start_em(self, attrs): self.start_i(attrs)
def end_em(self): self.end_i()
@@ -360,21 +389,45 @@
# Typographic Elements
+ def start_b(self, attrs):
+ self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
+ def end_b(self):
+ self.formatter.pop_font()
+
+ def start_big(self, attrs):
+ pass
+ def end_big(self):
+ pass
+
def start_i(self, attrs):
self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
def end_i(self):
self.formatter.pop_font()
- def start_b(self, attrs):
- self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
- def end_b(self):
- self.formatter.pop_font()
+ def start_small(self, attrs):
+ pass
+ def end_small(self):
+ pass
+ def start_sub(self, attrs):
+ pass
+ def end_sub(self):
+ pass
+
+ def start_sup(self, attrs):
+ pass
+ def end_sup(self):
+ pass
+
def start_tt(self, attrs):
self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
def end_tt(self):
self.formatter.pop_font()
+
+
+ # Linking
+
def start_a(self, attrs):
href = ''
name = ''
@@ -402,8 +455,16 @@
def do_hr(self, attrs):
self.formatter.add_hor_rule()
- # --- Image
+ # --- Images and objects
+ def do_area(self, attrs):
+ pass
+
+ def start_iframe(self, attrs):
+ pass
+ def end_iframe(self):
+ pass
+
def do_img(self, attrs):
align = ''
alt = '(image)'
@@ -428,12 +489,208 @@
except ValueError: pass
self.handle_image(src, alt, ismap, align, width, height)
- # --- Really Old Unofficial Deprecated Stuff
+ def start_map(self, attrs):
+ pass
+ def end_map(self):
+ pass
+ def start_object(self, attrs):
+ pass
+ def end_object(self):
+ pass
+
+ def do_param(self, attrs):
+ pass
+
+ # --- Forms
+
+ def start_button(self, attrs):
+ pass
+ def end_button(self):
+ pass
+
+ def start_fieldset(self, attrs):
+ pass
+ def end_fieldset(self):
+ pass
+
+ def start_form(self, attrs):
+ pass
+ def end_form(self):
+ pass
+
+ def do_input(self, attrs):
+ pass
+
+ def start_label(self, attrs):
+ pass
+ def end_label(self):
+ pass
+
+ def start_legend(self, attrs):
+ pass
+ def end_legend(self):
+ pass
+
+ def start_optgroup(self, attrs):
+ pass
+ def end_optgroup(self):
+ pass
+
+ def start_option(self, attrs):
+ pass
+ def end_option(self):
+ pass
+
+ def start_select(self, attrs):
+ pass
+ def end_select(self):
+ pass
+
+ def start_textarea(self, attrs):
+ pass
+ def end_textarea(self):
+ pass
+
+
+ # --- Table elements
+
+ def start_caption(self, attrs):
+ pass
+ def end_caption(self):
+ pass
+
+ def do_col(self, attrs):
+ pass
+
+ def start_colgroup(self, attrs):
+ pass
+ def end_colgroup(self):
+ pass
+
+ def start_table(self, attrs):
+ pass
+ def end_table(self):
+ pass
+
+ def start_tbody(self, attrs):
+ pass
+ def end_tbody(self):
+ pass
+
+ def start_td(self, attrs):
+ pass
+ def end_td(self):
+ pass
+
+ def start_tfoot(self, attrs):
+ pass
+ def end_tfoot(self):
+ pass
+
+ def start_th(self, attrs):
+ pass
+ def end_th(self):
+ pass
+
+ def start_thead(self, attrs):
+ pass
+ def end_thead(self):
+ pass
+
+ def start_tr(self, attrs):
+ pass
+ def end_tr(self):
+ pass
+
+ # --- Frames
+
+ def do_frame(self, attrs):
+ pass
+
+ def start_frameset(self, attrs):
+ pass
+ def end_frameset(self):
+ pass
+
+ def start_noframes(self, attrs):
+ pass
+ def end_noframes(self):
+ pass
+
+
+ # --- Constructs deprecated in HTML 4.01
+
+ def start_applet(self, attrs):
+ pass
+ def end_applet(self):
+ pass
+
+ def do_basefont(self, attrs):
+ pass
+
+ def start_center(self, attrs):
+ pass
+ def end_center(self):
+ pass
+
+ def start_dir(self, attrs):
+ self.start_ul(attrs)
+ def end_dir(self):
+ self.end_ul()
+
+ def start_font(self, attrs):
+ pass
+ def end_font(self):
+ pass
+
+ def do_isindex(self, attrs):
+ self.isindex = 1
+
+ def start_menu(self, attrs):
+ self.start_ul(attrs)
+ def end_menu(self):
+ self.end_ul()
+
+ def start_s(self, attrs):
+ pass
+ def end_s(self):
+ pass
+
+ def start_strike(self, attrs):
+ pass
+ def end_strike(self):
+ pass
+
+ def start_u(self, attrs):
+ pass
+ def end_u(self):
+ pass
+
+
+ # --- Unofficial deprecated elements (pre-HTML 2.0, mostly)
+
+ def start_listing(self, attrs):
+ self.start_pre(attrs)
+ self.setliteral('listing') # Tell SGML parser
+
+ def end_listing(self):
+ self.end_pre()
+
+ def do_nextid(self, attrs): # Deprecated
+ pass
+
def do_plaintext(self, attrs):
self.start_pre(attrs)
self.setnomoretags() # Tell SGML parser
+ def start_xmp(self, attrs):
+ self.start_pre(attrs)
+ self.setliteral('xmp') # Tell SGML parser
+
+ def end_xmp(self):
+ self.end_pre()
+
# --- Unhandled tags
def unknown_starttag(self, tag, attrs):