模擬登錄-知乎

偶爾看到爬蟲,就瞭解了下
cookielib:
該模塊用於操作cookie
cookielib.CookieJar()
用於處理cookie,不過在urllib2.HTTPCookieProcessor中對其進行了封裝
所以

cookieJar=cookielib.CookieJar()
opener=urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar)).open(url)
也可以寫爲:
opener=urllib2.build_opener(urllib2.HTTPCookieProcessor()).open(url)
因爲在urllib2.HTTPCookieProcessor的__init__()中當cookieJar參數爲空時,會自動實例化一個cookieJar對象來操作cookie。

cookieJar.add_cookie_header(request)
將cookieJar中與該請求匹配的cookie添加到urllib2.Request的Cookie頭部中

cookieJar是一個可迭代的對象,迭代得到的對象爲cookie的一個鍵值對
for ck in cookieJar:
        print ck.name,'=',ck.value
輸出結果:
q_c1=47d7833820654cd9aaecd0176195c9be
ga=GA1.2.1918555708.1437623636
utma=51854390.1918555708.1437623636.1441865370.1441872012.4

但是cookielib.CookieJar()處理cookie並不可靠,往往會缺少很多字段
對比:
response = opener.open(req)
print response.info().get('Set-Cookie')
print '========================='
for ck in cookieJar: 
    print ck.name,'=',ck.value

q_c1=0c1aa3813d3e4669a2aa6325990a072c|1441967500000|1441967500000; Domain=zhihu.com; expires=Mon, 10 Sep 2018 10:31:40 GMT; Path=/, cap_id="aWQ=|1441967500|4b198f2db1f14b7f16aec0856e84aedc99640017"; Domain=zhihu.com; expires=Sun, 11 Oct 2015 10:31:40 GMT; Path=/, n_c=1; Domain=zhihu.com; Path=/
=========================
cap_id = "aWQ=|1441967500|4b198f2db1f14b7f16aec0856e84aedc99640017"
n_c = 1
q_c1 = 0c1aa3813d3e4669a2aa6325990a072c|1441967500000|1441967500000

在模擬登陸知乎獲取驗證碼時候會導致驗證碼一直錯誤,後來改爲根據set-cookie更新header,然後用新生成的header請求。

cookielib.FileCookieJar()
cookielib.FileCookieJar()繼承了cookielib.CookieJar(),可以將cookie保存到文件

cookielib.MozillaCookieJar()
cookielib.MozillaCookieJar()繼承了cookielib.FileCookieJar(),可以使用瀏覽器格式的cookie文件

def save_cookies(url, postdata = None, header = None, filename = None):
    '''
    @summary: request ``url`` and save the cookies the server sets
    @url: address to request
    @postdata: url-encoded data to POST; passed straight to urllib2.Request
    @header: dict of request headers; None means no extra headers
    @filename: cookie file to write (Mozilla cookies.txt format); when None
               nothing is written to disk
    @return: the response body as returned by ``response.read()``
    '''
    # urllib2.Request iterates over the headers mapping, so None must be
    # replaced by an empty dict or the constructor fails.
    req = urllib2.Request(url, postdata, header or {})

    ckjar = cookielib.MozillaCookieJar(filename)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(ckjar))

    response = opener.open(req)
    try:
        html = response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()

    # save() raises ValueError when the jar has no filename, so only persist
    # the cookies when a target file was actually given.
    if filename is not None:
        ckjar.save(ignore_discard=True, ignore_expires=True)
    return html

urllib2:

urllib2通過data參數來確定是get請求還是post請求
get請求:
1.
import urllib2
# GET request: urlopen is given only a URL and no data.
page = urllib2.urlopen('http://www.baidu.com/')
print(page.read())
2.
import urllib2
# GET request through an explicit Request object (no data attached).
request = urllib2.Request('http://www.baidu.com/')
body = urllib2.urlopen(request).read()
print(body)

post請求:
1.
import urllib2
# urlencode lives in the urllib module, which must be imported as well.
import urllib
postdata = {'k':'v'}
# The data submitted by POST has to be url-encoded first.
postdata = urllib.urlencode(postdata)
# Passing data makes urlopen send a POST instead of a GET.
response= urllib2.urlopen('http://www.baidu.com/',data = postdata)
content = response.read()
print(content)
2.
import urllib
import urllib2
postdata = {'k':'v'}
# The data submitted by POST has to be url-encoded first.
postdata = urllib.urlencode(postdata)
# A Request that carries data is sent as a POST.
req = urllib2.Request('http://www.baidu.com/',data = postdata)
response= urllib2.urlopen(req)
content = response.read()
print(content)

帶有header的請求:
postdata = {'_xsrf':'','account':'','password': 'xxx','remember_me': 'true'}
postdata = urllib.urlencode(postdata)
# Headers must be a dict of name -> value; separating everything with commas
# builds a set, which urllib2.Request cannot use.
headers = {'Host': 'www.zhihu.com',
           'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'}
# The keyword argument is ``headers``; ``header`` raises TypeError.
req=urllib2.Request('http://www.zhihu.com/login/email', data = postdata, headers = headers)
response= urllib2.urlopen(req)
content = response.read()
print(content)

模擬登錄:
主要驗證是否是瀏覽器訪問,和cookie是否正確,這兩個信息都保存在header中,模擬登錄主要流程如下:
1.構造postdata,然後通過post請求登錄
2.將返回header的set-cookie的cookie保存到header的Cookie字段中
3.然後再用保存的新header訪問(同樣:登錄後直接用瀏覽器將header保存下來,然後通過該header請求一樣可以)
header的"User-Agent"字段保存着瀏覽器信息,urllib2.build_opener可以自動處理cookie。如下(自己的代碼不小心刪了,流程差不多):
import urllib2
import urllib
import cookielib

auth_url = 'http://www.nowamagic.net/'
home_url = 'http://www.nowamagic.net/'
# Login user name and password.
data = {
    "username": "nowamagic",
    "password": "pass",
}
# url-encode the form fields so they can be sent as the POST body.
post_data = urllib.urlencode(data)
# Request headers; every entry is a "name": "value" pair.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Host": "www.nowamagic.net",
    "Referer": "http://www.nowamagic.net",
}
# Create a CookieJar to hold the cookies.
cookieJar = cookielib.CookieJar()
# Build an opener that stores received cookies in cookieJar and sends them
# back on later requests made through the same opener.
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
# Log in; the cookies set by the response end up in cookieJar.
req = urllib2.Request(auth_url, post_data, headers)
# Header fields can also be added after the Request has been created.
req.add_header('Accept-Encoding', "gzip, deflate")
result = opener.open(req)
# Visit the home page; the opener attaches the stored cookies automatically.
# Note: cookies collected this way may be incomplete and the login can fail.
# If that happens, read the raw header with result.info().get('Set-Cookie'),
# update the Cookie field of the request headers by hand, and request again.
result = opener.open(home_url)
# Show the result.
print(result.read())

驗證碼登錄:
很多時候登錄時需要驗證碼,驗證碼主要難度在驗證碼識別上,但是很多時候會產生驗證碼錯誤的警告。驗證碼登錄的主要步驟爲:
1.請求驗證碼
2.保存驗證碼和請求驗證碼時返回的'Set-Cookie'到'Cookie'中(服務器會根據cookie數據判斷驗證碼是否正確)
3.識別驗證碼(可以手動保存查看)
4.將驗證碼和賬號密碼一同post提交
5.保存請求驗證碼返回的'Set-Cookie'到'Cookie'中
6.有時候提交登錄數據後會發生跳轉(其實就是根據查看登錄流程一步步更新Cookie的過程,然後用最終的Cookie訪問)
模擬登錄知乎
import cookielib, urllib2, urllib,re,gzip,time
from StringIO import StringIO
import socket
socket.setdefaulttimeout(300)

# Browser-like request headers sent with the Zhihu requests below.
headers = {'Host':'www.zhihu.com',
               'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0',
               'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
               'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
               'Accept-Encoding':'gzip, deflate',
               # Referer is the referring page's URL; the previous value was
               # a copy of the Accept-Encoding line.
               'Referer':'http://www.zhihu.com/',
               'connection':'keep-alive',
}

def analy_data(response):
    '''
    Read the whole body of ``response`` and return it, decompressing it
    first when the ``Content-Encoding`` header is ``gzip``.

    @response: object providing ``info()``, ``read()`` and ``close()``
               (e.g. what ``urllib2.urlopen`` returns)
    @return: the body, decompressed if it was gzip-encoded
    The response is always closed, even when reading it fails.
    '''
    # io.BytesIO wraps the raw body bytes as a file object for GzipFile.
    import io

    try:
        encoding = response.info().get('Content-Encoding')
        data = response.read()
    finally:
        response.close()

    if encoding == 'gzip':
        data = gzip.GzipFile(fileobj=io.BytesIO(data)).read()
    return data

def get_xsrf(data):
    '''
    Extract the value of the first ``name="_xsrf" value="..."`` field in
    ``data``, print it and return it.

    @data: page source to search
    @return: the token text between the quotes of ``value="..."``
    @raise IndexError: when the page contains no such field
    '''
    # [^"]* stops at the closing quote. A greedy .* would run on to the last
    # quote of the line and swallow any attributes that follow the token.
    match = re.search(r'name="_xsrf" value="([^"]*)"', data)
    if match is None:
        raise IndexError('no _xsrf field found in page')
    xsrf = match.group(1)
    print(xsrf)
    return xsrf

def save_gif(response):
    '''
    Save the captcha image carried by ``response`` to ``captcha.gif`` in the
    current directory, then ask the user to type what the image shows.

    @response: object providing ``read()`` and ``close()``; it is always
               closed, even when reading fails
    @return: the text typed by the user, with surrounding whitespace removed
    '''
    try:
        gif = response.read()
    finally:
        response.close()

    with open("captcha.gif", 'wb') as f:
        f.write(gif)

    # The user is expected to open captcha.gif and type the characters in it.
    captcha = raw_input("查看驗證碼:").strip()
    print(captcha)
    return captcha


def get_postdata(xsrf, captcha = None, account = 'user', password = 'password'):
    '''
    Build, print and return the url-encoded body of the login POST.

    @xsrf: value for the ``_xsrf`` field
    @captcha: captcha text; when None the ``captcha`` field is left out
              instead of being sent as the literal string "None"
    @account: value for the ``account`` field
    @password: value for the ``password`` field
    @return: the url-encoded form string
    '''
    # urlencode lives in urllib on Python 2 and in urllib.parse on Python 3.
    try:
        from urllib import urlencode
    except ImportError:
        from urllib.parse import urlencode

    # A list of pairs keeps the field order stable.
    fields = [
        ('_xsrf', xsrf),
        ('account', account),
        ('password', password),
    ]
    if captcha is not None:
        fields.append(('captcha', captcha))
    fields.append(('remember_me', 'true'))

    postdata = urlencode(fields)
    print(postdata)
    return postdata


def save_cookie(set_cookie,headers):
    '''
    Merge the cookies from a ``Set-Cookie`` header value into the ``Cookie``
    entry of ``headers`` (handling cookies by hand).

    @set_cookie: ``Set-Cookie`` header text; several cookies may be joined
                 with ", ". None or '' adds nothing
    @headers: dict of request headers; updated in place
    @return: the same ``headers`` dict

    Only the leading ``name=value`` pair of each cookie is kept. Attributes
    such as Domain, expires and Path are dropped. A cookie that already exists
    keeps its position and takes the new value. When there is no cookie at
    all, no ``Cookie`` entry is written.
    '''
    merged = {}
    order = []  # names in first-seen order, so the output is deterministic

    def _remember(pair):
        # Split on the first '=' only: values may contain '=' themselves.
        name, sep, value = pair.strip().partition('=')
        name = name.strip()
        if not sep or not name:
            return
        if name not in merged:
            order.append(name)
        merged[name] = value.strip()

    for pair in (headers.get('Cookie') or '').split(';'):
        _remember(pair)

    # Cookies are joined with ", ", but an expires date contains a comma too
    # ("Mon, 10 Sep 2018 ..."). Only split where the comma is directly
    # followed by a "name=" token.
    for cookie in re.split(r',\s*(?=[^;,=\s]+=)', set_cookie or ''):
        _remember(cookie.split(';', 1)[0])

    if order:
        headers['Cookie'] = '; '.join(
            '%s=%s' % (name, merged[name]) for name in order)
    return headers

if __name__ == "__main__":
    # Flow 1: let cookielib keep track of the cookies.
    # The string below is a no-op statement kept as written; it notes that
    # cookielib was found unreliable here and drops many cookie fields.
    '''發現cookielib處理cookie並不可靠,會缺少很多字段'''
    # Every request below goes through this opener, which is built around
    # cookieJar.
    cookieJar=cookielib.CookieJar()
    opener=urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
    # Fetch the home page and pull the _xsrf token out of its source.
    req=urllib2.Request('http://www.zhihu.com/')
    response = opener.open(req)
    data = analy_data(response)
    xsrf = get_xsrf(data)
    
    # Request the captcha image.
    req = urllib2.Request('http://www.zhihu.com/captcha.gif')
    
    
    
    response = opener.open(req)
    # Print all response headers, then the raw Set-Cookie value, so they can
    # be compared with the jar contents printed further down.
    print response.info()
    print response.info().get('Set-Cookie')
    # Writes captcha.gif and asks the user to type the captcha.
    captcha = save_gif(response)
    # Print what cookieJar holds at this point.
    for ck in cookieJar:
        print ck.name,'=',ck.value
    print "----------------------------"
     
    # Submit the login form through the same opener and print the reply.
    postdata = get_postdata(xsrf,captcha)
    req=urllib2.Request('http://www.zhihu.com/login/email',data = postdata)
    response = opener.open(req)
    data = analy_data(response)
    print data
    
    
if __name__ == "__main__":
    # Flow 2: handle the cookies by hand instead of relying on cookielib.
    # (The original triple-quoted note here was never closed, which made the
    # rest of the file a syntax error, so it is now a comment.)

    # Fetch the home page and pull the _xsrf token out of its source.
    req=urllib2.Request('http://www.zhihu.com/')
    response = urllib2.urlopen(req)
    data = analy_data(response)
    xsrf = get_xsrf(data)

    # Request the captcha image with the browser-like headers.
    req = urllib2.Request('http://www.zhihu.com/captcha.gif', headers = headers)

    response = urllib2.urlopen(req)
    print(response.info())
    # Copy the cookies set by the captcha response into the request headers,
    # so the login request below sends them back.
    set_cookie = response.info().get('Set-Cookie')
    if set_cookie:
        headers = save_cookie(set_cookie,headers)
    # Writes captcha.gif and asks the user to type the captcha.
    captcha = save_gif(response)
    # No CookieJar exists in this flow; show the hand-built Cookie header.
    print(headers.get('Cookie'))
    print("----------------------------")

    # Submit the login form with the updated headers and print the reply.
    postdata = get_postdata(xsrf,captcha)
    req=urllib2.Request('http://www.zhihu.com/login/email',data = postdata, headers = headers)
    response = urllib2.urlopen(req)
    data = analy_data(response)
    print(data)


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章