讀取網頁的時候,如果訪問速度過快,會返回一個 timeout 錯誤(10054),因此要在這裏用 try 捕獲異常,並在出錯時啓用下一個代理(代理可以在百度上搜索「http 代理」找到,必須帶有端口號)。同時模擬瀏覽器發送請求,可以防止一部分返回錯誤。
# Web page reading function.
def FormatHTML(url, proxies=None):
    """Download ``url`` through an HTTP proxy and return it parsed.

    The request is sent with browser-like ``User-Agent`` and ``Referer``
    headers.  Failed requests are retried on the same proxy with a delay
    that grows by two seconds per failure; if reading or parsing the
    response fails, the next proxy in the list is installed and the
    download is started again with a timeout that is five seconds longer.

    Relies on ``urllib2``, ``time``, ``sys`` and ``BeautifulSoup`` being
    available at module level.  The opener is installed globally through
    ``urllib2.install_opener``.

    Args:
        url: Address of the page to fetch.
        proxies: Optional sequence of ``'host:port'`` HTTP proxies.  Only
            the first three entries are ever tried.  Defaults to the
            built-in list below.

    Returns:
        The ``BeautifulSoup`` object built from the response body, or
        ``None`` when the page could not be fetched: four attempts ending
        in HTTP 404, ten failed attempts overall, three proxies failing,
        or no proxy being available.
    """
    max_proxy_failures = 3
    max_attempts = 10
    max_not_found = 4

    if proxies is None:
        proxies = ['211.142.236.132:80', '118.186.9.21:80', '118.186.9.22:80', '211.142.236.132:80']

    i_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5", "Referer": 'http://www.baidu.com'}

    # The attempt counter and the back-off delay are shared by all proxies,
    # so the limits above bound the whole call rather than each proxy.
    count = 0
    sleep_download_time = 0
    time_out = 10

    for proxy in list(proxies)[:max_proxy_failures]:
        try:
            print(u'========開啓代理=========')
            opener = urllib2.build_opener(urllib2.ProxyHandler({'http': proxy}), urllib2.HTTPHandler(debuglevel=1))
            urllib2.install_opener(opener)

            # Start from a fresh response for every proxy so that a failed
            # read is never retried on an already consumed connection.
            f = None
            while f is None:
                try:
                    print(u'=========模擬瀏覽器=========')
                    req = urllib2.Request(url, headers=i_headers)
                    time.sleep(sleep_download_time)
                    print(u'==========讀取網頁===========')
                    f = urllib2.urlopen(req, timeout=time_out)
                except urllib2.HTTPError as e:
                    if e.code == 404:
                        # A missing page is retried immediately, but only
                        # a few times.
                        print('e.code:' + str(e.code))
                        count += 1
                        print('count=' + str(count))
                        if count >= max_not_found:
                            print('count==' + str(count))
                            return None
                    else:
                        sleep_download_time = sleep_download_time + 2
                        time.sleep(sleep_download_time)
                        count += 1
                        print('urllib2.HTTPError:' + str(e.code))
                        s = sys.exc_info()
                        print(s)
                        print("Error '%s' happened on line %d" % (s[1], s[2].tb_lineno))
                        if count >= max_attempts:
                            return None
                except Exception:
                    # Exception rather than a bare except, so that
                    # KeyboardInterrupt and SystemExit still stop the loop.
                    print(2)
                    sleep_download_time = sleep_download_time + 2
                    time.sleep(sleep_download_time)
                    count += 1
                    print(url)
                    print(u"連接超時!")
                    s = sys.exc_info()
                    print("Error '%s' happened on line %d" % (s[1], s[2].tb_lineno))
                    if count >= max_attempts:
                        return None

            try:
                reader = BeautifulSoup(f.read())
                print(u'==========讀取完畢===========')
            finally:
                # Release the connection even when reading or parsing fails.
                f.close()
            return reader
        except Exception:
            # An empty tuple in the except clause would match nothing and
            # leave this fallback unreachable.  Report the failure, then
            # let the loop move on to the next proxy with a longer timeout.
            s = sys.exc_info()
            print("Error '%s' happened on line %d" % (s[1], s[2].tb_lineno))
            time_out += 5

    # Every permitted proxy failed, or none was supplied.
    return None