Add transparent gzip Content-Encoding support to httplib.

  * GzipedHTTPIO buffers the raw (still compressed) response body so that
    the gzip reader can seek() backwards over data already downloaded.
  * GzipFile2 overrides gzip.GzipFile._read() so that the end of the body
    is detected without seeking to the real end of the file.
  * HTTPResponse.read() returns decompressed data unless raw=True is given.
  * Requests advertise "Accept-Encoding: identity,gzip;q=0.9".

Corrections made in this revision of the patch:

  * GzipFile2._read() restored the file position with
    self.fileobj.seek(self.fileobj, pos), i.e. it passed the file object
    as the offset.  It now calls self.fileobj.seek(pos) and probes for
    EOF at the current position instead of at the end of the buffer.
  * GzipedHTTPIO.read(size=None) no longer evaluates size-len(ret).
  * HTTPResponse.read() maps amt=None to -1 before calling GzipFile.read().
  * The whitespace-only hunk around "return s" was dropped.

NOTE(review): the indentation of context lines and the position of blank
context lines were reconstructed from the hunk line counts.  Confirm them
against httplib.py rev 1.95, or apply with "patch -l" -- TODO confirm.

--- httplib.py-rev1.95	2005-07-22 14:06:38.000000000 +0200
+++ httplib.py	2005-07-23 20:39:50.000000000 +0200
@@ -69,6 +69,9 @@
 import errno
 import mimetools
 import socket
+import StringIO as SlowStringIO # Can't inherit from cStringIO.StringIO ?
+import gzip
+import zlib
 from urlparse import urlsplit
 
 try:
@@ -153,6 +156,68 @@
 INSUFFICIENT_STORAGE = 507
 NOT_EXTENDED = 510
 
+# File-like buffer over the raw (still compressed) response body.  It keeps
+# every byte already downloaded so that GzipFile2 can seek() backwards.
+class GzipedHTTPIO(SlowStringIO.StringIO):
+    def __init__(self, resp):
+        self.resp = resp        # HTTPResponse the raw bytes are pulled from
+        self.eof = False        # True once resp returned less than requested
+        SlowStringIO.StringIO.__init__(self)
+
+    def read(self, size=None):
+        """Return up to size raw bytes, downloading more when the buffer
+        runs dry.  size=None (or a negative size) reads to the end."""
+        ret = SlowStringIO.StringIO.read(self, size)
+
+        if self.eof:
+            return ret
+
+        if size is None or size < 0:
+            # No limit: drain the response instead of computing size-len(ret).
+            newdata = self.resp.read(raw=True)
+            self.write(newdata)
+            self.eof = True
+            return ret+newdata
+
+        if len(ret) != size:
+            newdata = self.resp.read(size-len(ret), raw=True)
+            self.write(newdata)
+
+            if len(ret)+len(newdata) != size:
+                self.eof = True
+
+            return ret+newdata
+        else:
+            return ret
+
+    def seek(self, pos, ref=0):
+        # The end of the document is unknown until it is fully downloaded.
+        if ref == 2:
+            raise IOError("Can't seek() relative to end")
+        return SlowStringIO.StringIO.seek(self, pos, ref)
+
+# Same as gzip.GzipFile, but don't need to go to the real EOF
+class GzipFile2(gzip.GzipFile):
+    def _read(self, size=1024):
+        if self.fileobj is None:
+            raise EOFError, "Reached EOF"
+
+        if self._new_member:
+            # Probe one byte at the current position: GzipedHTTPIO.read()
+            # downloads it if needed, so "" means the body is exhausted.
+            pos = self.fileobj.tell()
+            if self.fileobj.read(1) == "":
+                raise EOFError, "Reached EOF"
+            self.fileobj.seek(pos)
+
+            self._init_read()
+            self._read_gzip_header()
+            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
+            self._new_member = False
+
+        return gzip.GzipFile._read(self, size)
+
+
 class HTTPMessage(mimetools.Message):
 
     def addheader(self, key, value):
@@ -285,6 +350,7 @@
         self.chunk_left = _UNKNOWN      # bytes left to read in current chunk
         self.length = _UNKNOWN          # number of bytes left in response
         self.will_close = _UNKNOWN      # conn will close at end of response
+        self.gziped = _UNKNOWN          # 1 if content is gzip'ed
 
     def _read_status(self):
         # Initialize with Simple-Response defaults
@@ -374,6 +440,13 @@
             self.chunk_left = None
         else:
             self.chunked = 0
+
+        cnt_enc = self.msg.getheader('content-encoding')
+        if cnt_enc and cnt_enc.lower() == "gzip":
+            self.gziped = 1
+            self.gzfp = GzipFile2(fileobj=GzipedHTTPIO(self)) # read uncompressed content from this file descriptor
+        else:
+            self.gziped = 0
 
         # will the connection close at the end of the response?
         self.will_close = self._check_close()
@@ -449,9 +522,15 @@
 
     # XXX It would be nice to have readline and __iter__ for this, too.
 
-    def read(self, amt=None):
+    def read(self, amt=None, raw=False):
+        # raw: don't decompress
         if self.fp is None:
             return ''
+
+        if self.gziped and not raw: # read uncompressed
+            if amt is None:
+                amt = -1    # GzipFile.read() takes -1, not None, for "all"
+            return self.gzfp.read(amt)
 
         if self.chunked:
             return self._read_chunked(amt)
@@ -756,10 +835,8 @@
             #       libraries are updated to recognize other forms, then this
             #       code should be changed (removed or updated).
 
-            # we only want a Content-Encoding of "identity" since we don't
-            # support encodings such as x-gzip or x-deflate.
             if not skip_accept_encoding:
-                self.putheader('Accept-Encoding', 'identity')
+                self.putheader('Accept-Encoding', 'identity,gzip;q=0.9')
 
             # we can accept "chunked" Transfer-Encodings, but no others
             # NOTE: no TE header implies *only* "chunked"
@@ -1363,5 +1440,15 @@
                 for header in headers.headers: print header.strip()
             print
 
+def test_gzip():
+    h = HTTPConnection("www.linuxfr.org")
+    h.set_debuglevel(1)
+    h.putrequest("GET", "/pub/")
+    h.endheaders()
+    resp = h.getresponse()
+    print resp.read()
+    h.close()
+
 if __name__ == '__main__':
     test()
+    test_gzip()