Author gnarfk
Recipients
Date 2006-03-17.11:57:36
SpamBayes Score
Marked as misclassified
Message-id
In-reply-to
Content
I'd like to illustrate the problem and suggest a fix by showing a
simple Python file (named htmllib2.py, so you can uncomment the
line in the doctest to see that my fix works). It's more of a
hack than a proper fix, though:
#!/usr/bin/env python2.4

"""
Use this instead of htmllib to have entitydefs
substituted in attribute values, too.

Example:
>>> import htmllib
# >>> import htmllib2 as htmllib
>>> import formatter
>>> import StringIO
>>> s = StringIO.StringIO()
>>> p = htmllib.HTMLParser(formatter.AbstractFormatter(formatter.DumbWriter(s)))
>>> p.feed('<img alt="&lt;&gt;&amp;">')
>>> s.getvalue()
'<>&'
"""

# Public API: HTMLParser is the only name exported by a star-import of
# this module; entitysub and entitytable are not listed here.
__all__ = ("HTMLParser",)

import htmllib
from htmlentitydefs import name2codepoint as entitytable

# Map entity names to single characters.  Only code points below 256 are
# kept (presumably because chr() on Python 2 only accepts 0..255); names
# outside that range are left undecoded by entitysub().
entitytable = dict((name, chr(code))
                   for name, code in entitytable.items() if code < 256)


def entitysub(s):
    """Return *s* with named entity references replaced by their characters.

    A reference has the form ``&name;``.  Names found in ``entitytable``
    are replaced by the mapped character; unknown names are copied through
    unchanged, including the ``&`` and the ``;``.

    An ``&`` that never reaches a terminating ``;`` is literal text and is
    kept as-is, so ``"AT&T rocks"`` comes back unchanged instead of being
    truncated.  A second ``&`` seen while a candidate is pending flushes
    the first candidate literally and starts a new one, so ``"a & b &lt;"``
    yields ``"a & b <"``.

    Numeric character references such as ``&#60;`` are not decoded.
    """
    pieces = []
    pending = None  # text of the candidate reference, starting at its '&'
    for c in s:
        if pending is None:
            if c == '&':
                pending = c
            else:
                pieces.append(c)
        elif c == ';':
            # Candidate is complete: substitute it, or copy it verbatim.
            pieces.append(entitytable.get(pending[1:], pending + c))
            pending = None
        elif c == '&':
            # The earlier '&' was a bare ampersand; keep it and restart.
            pieces.append(pending)
            pending = c
        else:
            pending += c
    if pending is not None:
        # Input ended inside a candidate: emit it rather than dropping it.
        pieces.append(pending)
    return "".join(pieces)

class HTMLParser(htmllib.HTMLParser):
    """htmllib.HTMLParser subclass that runs attribute values through entitysub."""

    def handle_starttag(self, tag, method, attrs):
        """Call *method* with the attribute list after entity substitution.

        *attrs* is iterated as (name, value) pairs; each value is passed
        through entitysub() and the names are kept as they are.  *tag* is
        not used here.
        """
        repaired = []
        for name, value in attrs:
            repaired.append((name, entitysub(value)))
        method(repaired)

if __name__ == "__main__":
    # Run as a script: execute the doctest examples found in this module.
    from doctest import testmod
    testmod()
History
Date User Action Args
2007-08-23 14:38:34 admin link issue1452246 messages
2007-08-23 14:38:34 admin create