之前看到geventhttpclient這個項目,https://github.com/gwik/geventhttpclient,官方文檔說非常快,由於響應使用了C的解析,所以我一直想把這玩意用到項目中,
這兩天一直在糾結這玩意,說實在一句話,比較難用,封裝的不給力,最大缺陷如下:
1.不支持重定向,重定向需要自己來寫,很費事
2.新建的httpclient對象只能發送同域名的請求
這相當的蛋疼,我花了一點時間封裝了一下,解決了上面的兩個問題,還增加了自動編解碼功能,代碼如下:
#!/usr/bin/env python
#-*-encoding:UTF-8-*-
import re
from geventhttpclient.url import URL
from geventhttpclient.client import HTTPClient,HTTPClientPool
from urlparse import urljoin
#from core.common import urljoin
# Headers added to every outgoing request unless the caller already supplies
# a header with the same (title-cased) name.
HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0',
}

# HTTP verb used when the caller does not name one.  The identifier's spelling
# is kept as-is because the rest of the module refers to it by this name.
DEFAULF_METHOD = "GET"

# Default redirect budget for a single request.
MAX_REDIRECT_TIME = 10

# Fallback charset for decoding pages, and the charset decoded pages are
# re-encoded to.
DEFAULT_PAGE_ENCODING = "utf8"
class DifferDomainException(Exception):
    """Signal that a redirect points outside the client's base URL.

    A single client can only talk to one host; the original author notes that
    geventhttpclient itself would fail with
    ``ValueError("Invalid host in URL")`` in that situation.  The offending
    absolute URL is available as ``uri`` so the caller can retry it with a
    different client.
    """

    def __init__(self, uri):
        # Let the base class store the argument.  Assigning a string directly
        # to ``self.args`` would turn it into a tuple of single characters.
        super(DifferDomainException, self).__init__(uri)
        self.uri = uri
class MaxRedirectException(Exception):
    """Signal that the redirect budget ran out.

    ``response`` is the last response received (itself a redirect), so the
    caller can still inspect or return it.
    """

    def __init__(self, response):
        # Let the base class store the argument.  Assigning the response
        # directly to ``self.args`` makes Python call ``tuple(response)``,
        # which fails for a non-iterable object and iterates an iterable one.
        super(MaxRedirectException, self).__init__(response)
        self.response = response
class HTTP(HTTPClient):
    """HTTPClient that adds default headers and follows same-host redirects."""

    def request(self, request_uri, method=DEFAULF_METHOD, body=b"", headers=None,
                follow_redirect=True, redirects=MAX_REDIRECT_TIME):
        """Send a request and, if asked, follow redirects on the same base URL.

        Note that the argument order differs from the parent class call made
        below: here the URI comes first and the method second.

        Args:
            request_uri: URL (or path) to request.
            method: HTTP verb; the default verb is promoted to POST when a
                non-empty body is given.
            body: Request payload.
            headers: Optional mapping of extra headers.  It is copied, never
                modified; entries from HEADERS are added for names the caller
                did not supply.
            follow_redirect: Follow 301/302/303/307 responses when true.
            redirects: Remaining number of redirects that may be followed.

        Returns:
            The response of the last request made.

        Raises:
            DifferDomainException: the redirect target does not start with
                this client's base URL.
            MaxRedirectException: a redirect was received but the budget is
                exhausted.
        """
        if body and method == DEFAULF_METHOD:
            method = "POST"

        # Work on a private copy: updating the argument in place would leak
        # the default headers into the caller's dict (and, with a mutable
        # default argument, into every later call).
        headers = dict(headers) if headers else {}
        supplied = set(name.title() for name in headers)
        for name, value in HEADERS.items():
            if name not in supplied:
                headers[name] = value

        response = super(HTTP, self).request(method, request_uri, body, headers)

        if not (follow_redirect
                and response.status_code in (301, 302, 303, 307)
                and response.method in ("GET", "POST")):
            return response

        if not redirects:
            raise MaxRedirectException(response)

        location = (response.get('location')
                    or response.get('content-location')
                    or response.get('uri'))
        if not location:
            # A redirect status without a usable target: hand it back as-is.
            return response

        # Resolve relative targets against the URL that was just requested.
        location = urljoin(request_uri, location)
        if not location.startswith(self._base_url_string):
            raise DifferDomainException(location)

        # NOTE(review): the redirect response is not read or released before
        # the next request is issued -- confirm this does not hold on to a
        # pooled connection.
        return self.request(location, method, body, headers,
                            follow_redirect, redirects - 1)
class HTTPPool(HTTPClientPool):
    """Client pool that builds HTTP instances, one per (host, port) pair."""

    def get_client(self, url):
        """Return the pooled client for ``url``, creating it on first use.

        ``url`` may be a URL object or anything the URL constructor accepts.
        """
        parsed = url if isinstance(url, URL) else URL(url)
        key = (parsed.host, parsed.port)
        if key not in self.clients:
            self.clients[key] = HTTP.from_url(parsed, **self.client_args)
        return self.clients[key]
# Module-wide client pool used by request(); clients are cached per host/port.
_POLL = HTTPPool(
    connection_timeout=100,
    network_timeout=100,
)
# First charset declared by a <meta> tag.  Matches both the
# http-equiv="content-type" form and the short <meta charset=...> form.
META_CHARSET_REGEX = re.compile(
    br'(?si)<meta\s[^>]*?charset\s*=\s*["\']?(?P<result>[\w.:-]+)')

# charset parameter of a Content-Type header value (case-insensitive,
# optional quotes, stops at whitespace or the next parameter).
_HTTP_CHARSET_REGEX = re.compile(r'(?i)charset\s*=\s*["\']?([^\s"\';,]+)')

# Pages labelled gb2312 frequently contain characters that only exist in the
# GBK/GB18030 supersets, so decode them with the widest member of the family.
_CHARSET_ALIASES = {
    'gb2312': 'gb18030',
    'gbk': 'gb18030',
}


def decodePage(content, content_type):
    """Decode a raw page body and re-encode it as DEFAULT_PAGE_ENCODING.

    Charsets are tried in priority order: the one named in the Content-Type
    header, the one declared by a <meta> tag in the page, then
    DEFAULT_PAGE_ENCODING.  A candidate that is unknown to Python or that
    cannot decode the body is skipped; if none works, the body is decoded as
    DEFAULT_PAGE_ENCODING with undecodable bytes replaced.

    Args:
        content: Raw response body (byte string).
        content_type: Value of the Content-Type header, or None.

    Returns:
        The page as a byte string encoded in DEFAULT_PAGE_ENCODING.
    """
    candidates = []

    if content_type:
        found = _HTTP_CHARSET_REGEX.search(content_type)
        if found:
            candidates.append(found.group(1))

    found = META_CHARSET_REGEX.search(content)
    if found:
        # The pattern only admits ASCII characters, so this cannot fail.
        candidates.append(str(found.group('result').decode('ascii')))

    candidates.append(DEFAULT_PAGE_ENCODING)

    text = None
    for charset in candidates:
        codec = _CHARSET_ALIASES.get(charset.lower(), charset)
        try:
            text = content.decode(codec)
        except (LookupError, UnicodeDecodeError):
            # Unknown codec name or a wrong declaration: try the next one.
            continue
        break

    if text is None:
        # Best effort: never fail on a mislabelled page.
        text = content.decode(DEFAULT_PAGE_ENCODING, 'replace')

    return text.encode(DEFAULT_PAGE_ENCODING)
def request(request_uri, method=DEFAULF_METHOD, body=b"", headers=None,
            follow_redirect=True, auto_read=True, redirects=MAX_REDIRECT_TIME):
    """Fetch a URL through the shared client pool, following redirects across hosts.

    Args:
        request_uri: Absolute URL to request.
        method: HTTP verb.
        body: Request payload.
        headers: Optional mapping of extra headers; it is copied, not modified.
        follow_redirect: Follow redirects when true.
        auto_read: When true, read the body, decode it with decodePage() and
            store the result on the response as ``content``.
        redirects: Redirect budget.  It is handed to the client for same-host
            redirects, and every cross-host hop consumes one unit, so two
            hosts redirecting to each other cannot recurse forever.

    Returns:
        The response object; the last redirect response if the budget ran
        out; or None if the request failed (the error is printed).
    """
    headers = dict(headers) if headers else {}
    client = _POLL.get_client(request_uri)

    try:
        response = client.request(request_uri, method, body, headers,
                                  follow_redirect, redirects)
    except DifferDomainException as e:
        print("DifferDomainException:" + e.uri)
        # Delegate completely to the retry, forwarding auto_read, so the body
        # is read exactly once (or not at all when the caller asked for that).
        return request(e.uri, method, body, headers, follow_redirect,
                       auto_read, redirects - 1)
    except MaxRedirectException as e:
        print("max redirect")
        # Hand back the last response received, which is itself a redirect.
        response = e.response
    except Exception as e:
        # Deliberate best effort: report the failure and return None.
        print(str(e))
        return None

    # Compare against None explicitly so the check does not depend on how the
    # response object defines truthiness.
    if auto_read and response is not None:
        with response:
            response.content = decodePage(response.read(),
                                          response.get('content-type'))
    return response
def test():
    """Live smoke test: fetch two real sites and print the first 10 bytes of each."""
    targets = (
        # Author's note: header charset utf8, meta charset utf8.
        "http://www.baidu.com/",
        # Author's note: header charset gbk, meta charset gb2312.
        "http://www.163.com/",
    )
    for url in targets:
        page = request(url, follow_redirect=False)
        print(page.content[:10])
if __name__ == "__main__":
    # Run the live smoke test only when executed as a script, so importing
    # this module does not fire network requests.
    test()
在測試網頁編碼時遇到了一些問題,看下面:
由於響應頭部先到達,所以我們一般認爲返回內容的編碼應優先以頭部聲明爲準,頭部沒有聲明時再看頁面內的編碼。
我們看網易的編碼,頭部爲gbk,網頁爲gb2312,但用gb2312解碼竟然有問題,??? 我很不解,各位大大們爲啥呢?
但用頭部gbk解碼是正常的,這也證明了頭部編碼優先。按理說網頁編碼是告訴瀏覽器以gb2312顯示,但明顯有問題,瀏覽器怎麼做到的?
我們再看新浪的,這更讓我鬱悶了,誰來拯救我啊?