Python 网页下载类
处理了 ETag 和 Last-Modified,以及 gzip。
避免了重复获取,节省带宽。
#!/usr/bin/env python
import openanything
# Client identification string handed to openanything.openAnything as its
# fourth positional argument on every fetch (presumably sent as the HTTP
# User-Agent header -- confirm against openanything).
USER_HTTP_AGENT = 'python http downloader'
class http_downloader():
    """Download a URL through ``openanything`` and remember its cache validators.

    All state is kept in the ``result`` dict:

    * ``url`` -- address to fetch; replaced by ``f.url`` after a fetch when
      the response object exposes one.
    * ``etag`` / ``lastmodified`` -- values of the ``ETag`` and
      ``Last-Modified`` response headers from the previous fetch; they are
      passed back to ``openanything.openAnything`` on the next fetch.
    * ``data`` -- response body, gunzipped when the response declares
      ``content-encoding: gzip`` (set by ``fetch``).
    * ``status`` -- ``f.status`` when the response has one, otherwise 200
      (set by ``fetch``).
    """

    def __init__(self, url):
        """Start with *url* and no stored validators."""
        self.result = {}
        self.result['url'] = url
        self.result['etag'] = None
        self.result['lastmodified'] = None

    @staticmethod
    def _gunzip(data):
        """Return the decompressed form of the gzip-compressed byte string *data*."""
        # Imported here so the class does not depend on module-level imports
        # that the surrounding file does not provide.
        import gzip
        import io

        # io.BytesIO wraps the raw bytes on both Python 2 (str) and Python 3.
        return gzip.GzipFile(fileobj=io.BytesIO(data)).read()

    def fetch(self):
        """Fetch ``result['url']`` and update ``result`` from the response.

        The stored ``etag`` and ``lastmodified`` values are passed to
        ``openanything.openAnything`` together with ``USER_HTTP_AGENT``.
        The response object is closed even when reading or decompressing
        raises.
        """
        f = openanything.openAnything(
            self.result['url'],
            self.result['etag'],
            self.result['lastmodified'],
            USER_HTTP_AGENT,
        )
        try:
            self.result['data'] = f.read()
            if hasattr(f, 'headers'):
                # Save ETag, if the server sent one.
                self.result['etag'] = f.headers.get('ETag')
                # Save Last-Modified header, if the server sent one.
                self.result['lastmodified'] = f.headers.get('Last-Modified')
                if f.headers.get('content-encoding') == 'gzip':
                    # Data came back gzip-compressed; decompress it.
                    self.result['data'] = self._gunzip(self.result['data'])
            if hasattr(f, 'url'):
                self.result['url'] = f.url
            # When the read succeeds the response may carry no 'status'
            # attribute, so fall back to 200.
            self.result['status'] = getattr(f, 'status', 200)
        finally:
            f.close()

    def set_url(self, url):
        """Point the downloader at *url*.

        The stored validators belong to the previous URL, so they are
        cleared whenever the URL actually changes; setting the same URL
        again keeps them.
        """
        if url != self.result['url']:
            self.result['etag'] = None
            self.result['lastmodified'] = None
            self.result['url'] = url

    def debug_print(self):
        """Print url, status, etag and lastmodified, one per line.

        ``status`` is printed as ``None`` when ``fetch`` has not run yet.
        """
        # Single-argument print(...) behaves the same as the print statement
        # on Python 2 and is valid on Python 3.
        print(self.result['url'])
        print(self.result.get('status'))
        print(self.result['etag'])
        print(self.result['lastmodified'])