Message 19144 - Python tracker

➜

This issue tracker has been migrated to GitHub, and is currently read-only.
For more information, see the GitHub FAQs in the Python's Developer Guide.

Author	d98dzone
Recipients
Date	2003-11-25.16:47:35
SpamBayes Score
Marked as misclassified
Message-id
In-reply-to

Content
During the process of making my masters thesis I discovered the need for a working getpos() in sgmllib.py. As it is now you can successfully call it since it is inherited from markupbase.py but you will always get the answer (1,0) since it is never updated. To fix this one needs to change the goahead function. This is my own implementation of this change, in part influenced by the "sister" goahead-function in HTLMParser.py: ************************************ def goahead(self, end): rawdata = self.rawdata i = 0 k = 0 n = len(rawdata) tmp=0 while i < n: if self.nomoretags: self.handle_data(rawdata[i:n]) i = n break match = interesting.search(rawdata, i) if match: j = match.start() else: j = n if i < j: self.handle_data(rawdata[i:j]) tmp = self.updatepos(i, j) i = j if i == n: break startswith = rawdata.startswith if rawdata[i] == '<': if starttagopen.match(rawdata, i): if self.literal: self.handle_data(rawdata[i]) tmp = self.updatepos(i, i+1) i = i+1 continue k = self.parse_starttag(i) if k < 0: break tmp = self.updatepos(i, k) i = k continue if rawdata.startswith("</", i): k = self.parse_endtag(i) if k < 0: break tmp = self.updatepos(i, k) i = k self.literal = 0 continue if self.literal: if n > (i + 1): self.handle_data("<") i = i+1 tmp = self.updatepos(i, k) else: # incomplete break continue if rawdata.startswith("<!--", i): # Strictly speaking, a comment is --.-- # within a declaration tag <!...>. # This should be removed, # and comments handled only in parse_declaration. k = self.parse_comment(i) if k < 0: break tmp = self.updatepos(i, k) i = k continue if rawdata.startswith("<?", i): k = self.parse_pi(i) if k < 0: break tmp = self.updatepos(i, k) i = i+k continue if rawdata.startswith("<!", i): # This is some sort of declaration; in "HTML as # deployed," this should only be the document type # declaration ("<!DOCTYPE html...>"). 
k = self.parse_declaration(i) if k < 0: break tmp = self.updatepos(i, k) i = k continue tmp = self.updatepos(i, k) elif rawdata[i] == '&': if self.literal: self.handle_data(rawdata[i]) #tmp = self.updatepos(i,i+1)#added i = i+1 continue match = charref.match(rawdata, i) if match: name = match.group()[2:-1] self.handle_charref(name) k = match.end() if not startswith(';', k-1): k = k - 1 tmp = self.updatepos(i, k) i = k continue match = entityref.match(rawdata, i) if match: name = match.group(1) self.handle_entityref(name) k = match.end() if not startswith(';', k-1): k = k - 1 tmp = self.updatepos(i, k) i = k continue else: self.error('neither < nor & ??') # We get here only if incomplete matches but # nothing else match = incomplete.match(rawdata, i) if not match: self.handle_data(rawdata[i]) i = i+1 continue j = match.end(0) if j == n: break # Really incomplete self.handle_data(rawdata[i:j]) i = j # end while if end and i < n: self.handle_data(rawdata[i:n]) tmp = self.updatepos(i, n) i = n self.rawdata = rawdata[i:] # XXX if end: check for empty stack # Extensions for the DOCTYPE scanner: _decl_otherchars = '=' *************************** The major diffrence is the updatepos functions. It seems to work fine, or at least it has worked fine for me so far.

While working on my master's thesis I discovered the
need for a working getpos() in sgmllib.py. As it stands,
you can call it successfully, since it is inherited from
markupbase.py, but it always returns (1,0) because the
position is never updated.

To fix this, one needs to change the goahead function.
This is my own implementation of that change, in part
influenced by the "sister" goahead function in
HTMLParser.py:


************************************
def goahead(self, end):
        """Parse as much of the buffered input as possible, tracking position.

        Scans ``self.rawdata`` from its start and dispatches text, tags,
        comments, processing instructions, declarations and character/entity
        references to the corresponding handler or ``parse_*`` method.  After
        every span that is consumed, ``self.updatepos(start, stop)`` is called
        so that the parser's line/offset bookkeeping follows the cursor.  The
        update is made *after* the handler runs, so a handler that asks for
        the position sees the start of the construct it is handling.

        ``end`` is true when no further input will arrive; any unparsed tail
        is then flushed through ``handle_data``.  Whatever is still unparsed
        when the method returns is stored back into ``self.rawdata``.
        """
        rawdata = self.rawdata
        # Bound once; rawdata is not rebound inside the loop.
        startswith = rawdata.startswith
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything that remains is plain data.
                self.handle_data(rawdata[i:n])
                self.updatepos(i, n)
                i = n
                break
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
                self.updatepos(i, j)
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        self.updatepos(i, i + 1)
                        i = i + 1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    self.updatepos(i, k)
                    i = k
                    continue
                if startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    self.updatepos(i, k)
                    i = k
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        # Account for the single '<' before moving past it.
                        self.updatepos(i, i + 1)
                        i = i + 1
                    else:
                        # incomplete
                        break
                    continue
                if startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    self.updatepos(i, k)
                    i = k
                    continue
                if startswith("<?", i):
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    # The cursor has always been advanced by ``i + k`` here,
                    # i.e. k is treated as a length relative to i rather than
                    # an absolute index, so updatepos() needs the absolute end.
                    # NOTE(review): confirm against parse_pi()'s return value.
                    k = i + k
                    self.updatepos(i, k)
                    i = k
                    continue
                if startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    self.updatepos(i, k)
                    i = k
                    continue
                # Nothing was consumed: fall through to the incomplete check
                # below without touching the position.
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    self.updatepos(i, i + 1)
                    i = i + 1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # Give back the terminator unless it was a ';'.
                    if not startswith(';', k - 1):
                        k = k - 1
                    self.updatepos(i, k)
                    i = k
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # Give back the terminator unless it was a ';'.
                    if not startswith(';', k - 1):
                        k = k - 1
                    self.updatepos(i, k)
                    i = k
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                self.updatepos(i, i + 1)
                i = i + 1
                continue
            j = match.end(0)
            if j == n:
                break  # Really incomplete
            self.handle_data(rawdata[i:j])
            self.updatepos(i, j)
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            self.updatepos(i, n)
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner:
    _decl_otherchars = '='

****************************

The major difference is the added updatepos() calls. It
seems to work fine, or at least it has worked fine for
me so far.

History
Date	User	Action	Args
2007-08-23 14:18:28	admin	link	issue849097 messages
2007-08-23 14:18:28	admin	create