Update Lib/htmllib.py from an HTML 2.0 parser to an HTML 4.01 parser.

Adds (mostly no-op) handlers for the remaining HTML 4.01 elements, groups
deprecated and pre-HTML 2.0 elements into their own sections, and turns the
dt/dd handlers into start_/end_ pairs.  All end-tag handlers take no argument
besides self, because sgmllib invokes them without an attribute list.

Index: Lib/htmllib.py
===================================================================
--- Lib/htmllib.py	(Revision 52618)
+++ Lib/htmllib.py	(Arbeitskopie)
@@ -1,7 +1,7 @@
-"""HTML 2.0 parser.
+"""HTML 4.01 parser.
 
-See the HTML 2.0 specification:
-http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
+See the HTML 4.01 specification:
+http://www.w3.org/TR/html401
 """
 
 import sgmllib
@@ -19,8 +19,7 @@
     """This is the basic HTML parser class.
 
     It supports all entity names required by the XHTML 1.0 Recommendation.
-    It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
-    elements.
+    It also defines handlers for all HTML 4.01 elements.
 
     """
 
@@ -129,7 +128,7 @@
         """
         self.handle_data(alt)
 
-    # --------- Top level elememts
+    # --------- Top level elements
 
     def start_html(self, attrs): pass
     def end_html(self): pass
@@ -142,29 +141,38 @@
 
     # ------ Head elements
 
-    def start_title(self, attrs):
-        self.save_bgn()
-
-    def end_title(self):
-        self.title = self.save_end()
-
     def do_base(self, attrs):
         for a, v in attrs:
             if a == 'href':
                 self.base = v
 
-    def do_isindex(self, attrs):
-        self.isindex = 1
-
     def do_link(self, attrs):
         pass
 
     def do_meta(self, attrs):
         pass
 
-    def do_nextid(self, attrs): # Deprecated
-        pass
+    def start_noscript(self, attrs):
+        pass
+    def end_noscript(self):
+        pass
 
+    def start_script(self, attrs):
+        pass
+    def end_script(self):
+        pass
+
+    def start_style(self, attrs):
+        pass
+    def end_style(self):
+        pass
+
+    def start_title(self, attrs):
+        self.save_bgn()
+
+    def end_title(self):
+        self.title = self.save_end()
+
     # ------ Body elements
 
     # --- Headings
@@ -219,6 +227,32 @@
 
     # --- Block Structuring Elements
 
+    def start_address(self, attrs):
+        self.formatter.end_paragraph(0)
+        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
+
+    def end_address(self):
+        self.formatter.end_paragraph(0)
+        self.formatter.pop_font()
+
+    def start_bdo(self, attrs):
+        pass
+    def end_bdo(self):
+        pass
+
+    def start_div(self, attrs):
+        pass
+    def end_div(self):
+        pass
+
+    def start_blockquote(self, attrs):
+        self.formatter.end_paragraph(1)
+        self.formatter.push_margin('blockquote')
+
+    def end_blockquote(self):
+        self.formatter.end_paragraph(1)
+        self.formatter.pop_margin()
+
     def do_p(self, attrs):
         self.formatter.end_paragraph(1)
 
@@ -232,36 +266,28 @@
         self.formatter.pop_font()
         self.nofill = max(0, self.nofill - 1)
 
-    def start_xmp(self, attrs):
-        self.start_pre(attrs)
-        self.setliteral('xmp') # Tell SGML parser
+    def start_q(self, attrs):
+        pass
+    def end_q(self):
+        pass
 
-    def end_xmp(self):
-        self.end_pre()
+    def start_span(self, attrs):
+        pass
+    def end_span(self):
+        pass
 
-    def start_listing(self, attrs):
-        self.start_pre(attrs)
-        self.setliteral('listing') # Tell SGML parser
+    # --- Marking text
 
-    def end_listing(self):
-        self.end_pre()
-
-    def start_address(self, attrs):
-        self.formatter.end_paragraph(0)
-        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
-
-    def end_address(self):
-        self.formatter.end_paragraph(0)
-        self.formatter.pop_font()
-
-    def start_blockquote(self, attrs):
-        self.formatter.end_paragraph(1)
-        self.formatter.push_margin('blockquote')
-
-    def end_blockquote(self):
-        self.formatter.end_paragraph(1)
-        self.formatter.pop_margin()
-
+    def start_del(self, attrs):
+        pass
+    def end_del(self):
+        pass
+
+    def start_ins(self, attrs):
+        pass
+    def end_ins(self):
+        pass
+
     # --- List Elements
 
     def start_ul(self, attrs):
@@ -298,18 +324,6 @@
         self.formatter.end_paragraph(not self.list_stack)
         self.formatter.pop_margin()
 
-    def start_menu(self, attrs):
-        self.start_ul(attrs)
-
-    def end_menu(self):
-        self.end_ul()
-
-    def start_dir(self, attrs):
-        self.start_ul(attrs)
-
-    def end_dir(self):
-        self.end_ul()
-
     def start_dl(self, attrs):
         self.formatter.end_paragraph(1)
         self.list_stack.append(['dl', '', 0])
@@ -318,14 +332,20 @@
         self.ddpop(1)
         if self.list_stack: del self.list_stack[-1]
 
-    def do_dt(self, attrs):
+    def start_dt(self, attrs):
         self.ddpop()
 
-    def do_dd(self, attrs):
+    def end_dt(self):
+        pass
+
+    def start_dd(self, attrs):
         self.ddpop()
         self.formatter.push_margin('dd')
         self.list_stack.append(['dd', '', 0])
 
+    def end_dd(self):
+        self.ddpop()
+
     def ddpop(self, bl=0):
         self.formatter.end_paragraph(bl)
         if self.list_stack:
@@ -337,12 +357,21 @@
 
     # Idiomatic Elements
 
+    def start_abbr(self, attrs): pass
+    def end_abbr(self): pass
+
+    def start_acronym(self, attrs): pass
+    def end_acronym(self): pass
+
     def start_cite(self, attrs): self.start_i(attrs)
     def end_cite(self): self.end_i()
 
     def start_code(self, attrs): self.start_tt(attrs)
     def end_code(self): self.end_tt()
 
+    def start_dfn(self, attrs): pass
+    def end_dfn(self): pass
+
     def start_em(self, attrs): self.start_i(attrs)
     def end_em(self): self.end_i()
 
@@ -360,21 +389,45 @@
 
     # Typographic Elements
 
+    def start_b(self, attrs):
+        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
+    def end_b(self):
+        self.formatter.pop_font()
+
+    def start_big(self, attrs):
+        pass
+    def end_big(self):
+        pass
+
     def start_i(self, attrs):
         self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
     def end_i(self):
         self.formatter.pop_font()
 
-    def start_b(self, attrs):
-        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
-    def end_b(self):
-        self.formatter.pop_font()
+    def start_small(self, attrs):
+        pass
+    def end_small(self):
+        pass
 
+    def start_sub(self, attrs):
+        pass
+    def end_sub(self):
+        pass
+
+    def start_sup(self, attrs):
+        pass
+    def end_sup(self):
+        pass
+
     def start_tt(self, attrs):
         self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
     def end_tt(self):
         self.formatter.pop_font()
 
+
+
+    # Linking
+
     def start_a(self, attrs):
         href = ''
         name = ''
@@ -402,8 +455,16 @@
     def do_hr(self, attrs):
         self.formatter.add_hor_rule()
 
-    # --- Image
+    # --- Images and objects
 
+    def do_area(self, attrs):
+        pass
+
+    def start_iframe(self, attrs):
+        pass
+    def end_iframe(self):
+        pass
+
     def do_img(self, attrs):
         align = ''
         alt = '(image)'
@@ -428,12 +489,208 @@
                 except ValueError: pass
         self.handle_image(src, alt, ismap, align, width, height)
 
-    # --- Really Old Unofficial Deprecated Stuff
+    def start_map(self, attrs):
+        pass
+    def end_map(self):
+        pass
 
+    def start_object(self, attrs):
+        pass
+    def end_object(self):
+        pass
+
+    def do_param(self, attrs):
+        pass
+
+    # --- Forms
+
+    def start_button(self, attrs):
+        pass
+    def end_button(self):
+        pass
+
+    def start_fieldset(self, attrs):
+        pass
+    def end_fieldset(self):
+        pass
+
+    def start_form(self, attrs):
+        pass
+    def end_form(self):
+        pass
+
+    def do_input(self, attrs):
+        pass
+
+    def start_label(self, attrs):
+        pass
+    def end_label(self):
+        pass
+
+    def start_legend(self, attrs):
+        pass
+    def end_legend(self):
+        pass
+
+    def start_optgroup(self, attrs):
+        pass
+    def end_optgroup(self):
+        pass
+
+    def start_option(self, attrs):
+        pass
+    def end_option(self):
+        pass
+
+    def start_select(self, attrs):
+        pass
+    def end_select(self):
+        pass
+
+    def start_textarea(self, attrs):
+        pass
+    def end_textarea(self):
+        pass
+
+
+    # --- Table elements
+
+    def start_caption(self, attrs):
+        pass
+    def end_caption(self):
+        pass
+
+    def do_col(self, attrs):
+        pass
+
+    def start_colgroup(self, attrs):
+        pass
+    def end_colgroup(self):
+        pass
+
+    def start_table(self, attrs):
+        pass
+    def end_table(self):
+        pass
+
+    def start_tbody(self, attrs):
+        pass
+    def end_tbody(self):
+        pass
+
+    def start_td(self, attrs):
+        pass
+    def end_td(self):
+        pass
+
+    def start_tfoot(self, attrs):
+        pass
+    def end_tfoot(self):
+        pass
+
+    def start_th(self, attrs):
+        pass
+    def end_th(self):
+        pass
+
+    def start_thead(self, attrs):
+        pass
+    def end_thead(self):
+        pass
+
+    def start_tr(self, attrs):
+        pass
+    def end_tr(self):
+        pass
+
+    # --- Frames
+
+    def do_frame(self, attrs):
+        pass
+
+    def start_frameset(self, attrs):
+        pass
+    def end_frameset(self):
+        pass
+
+    def start_noframes(self, attrs):
+        pass
+    def end_noframes(self):
+        pass
+
+
+    # --- Constructs deprecated in HTML 4.01
+
+    def start_applet(self, attrs):
+        pass
+    def end_applet(self):
+        pass
+
+    def do_basefont(self, attrs):
+        pass
+
+    def start_center(self, attrs):
+        pass
+    def end_center(self):
+        pass
+
+    def start_dir(self, attrs):
+        self.start_ul(attrs)
+    def end_dir(self):
+        self.end_ul()
+
+    def start_font(self, attrs):
+        pass
+    def end_font(self):
+        pass
+
+    def do_isindex(self, attrs):
+        self.isindex = 1
+
+    def start_menu(self, attrs):
+        self.start_ul(attrs)
+    def end_menu(self):
+        self.end_ul()
+
+    def start_s(self, attrs):
+        pass
+    def end_s(self):
+        pass
+
+    def start_strike(self, attrs):
+        pass
+    def end_strike(self):
+        pass
+
+    def start_u(self, attrs):
+        pass
+    def end_u(self):
+        pass
+
+
+    # --- Unofficial deprecated elements (pre-HTML 2.0, mostly)
+
+    def start_listing(self, attrs):
+        self.start_pre(attrs)
+        self.setliteral('listing') # Tell SGML parser
+
+    def end_listing(self):
+        self.end_pre()
+
+    def do_nextid(self, attrs): # Deprecated
+        pass
+
     def do_plaintext(self, attrs):
         self.start_pre(attrs)
         self.setnomoretags() # Tell SGML parser
 
+    def start_xmp(self, attrs):
+        self.start_pre(attrs)
+        self.setliteral('xmp') # Tell SGML parser
+
+    def end_xmp(self):
+        self.end_pre()
+
     # --- Unhandled tags
 
     def unknown_starttag(self, tag, attrs):