爬蟲 登錄csdn並獲取個人博客文章列表

# coding:utf-8
import requests
from lxml import etree

is_next=True  # module-level flag: whether the blog list has a further page; rewritten by judge_next_page


class csdncrawl(object):
    """Log in to CSDN through a shared HTTP session and list a user's blog posts.

    The session object passed to each method is expected to behave like a
    ``requests`` session (``get`` / ``post`` returning objects with ``.text``).
    ``print`` is always called with one parenthesised argument so the output
    is the same under Python 2.7 and Python 3.
    """

    # Hidden <input> fields of the login form whose values must be echoed
    # back in the login POST.
    _HIDDEN_LOGIN_FIELDS = ('lt', 'execution', '_eventId')

    def get_params(self, username, password, post_url, post_session, post_headers):
        """Build the form data required by the login POST.

        Fetches the login page at ``post_url`` and copies the values of the
        hidden ``lt``, ``execution`` and ``_eventId`` inputs next to the
        credentials.

        Returns:
            dict with the keys ``username``, ``password``, ``lt``,
            ``execution`` and ``_eventId``.

        Raises:
            IndexError: if one of the hidden inputs is not on the page.
        """
        index_page = post_session.get(post_url, headers=post_headers)
        html = etree.HTML(index_page.text)
        postdata = {
            'username': username,
            'password': password,
        }
        for field in self._HIDDEN_LOGIN_FIELDS:
            values = html.xpath(".//input[@name='%s']//@value" % field)
            if not values:
                # Same exception type as indexing the empty result, but with
                # a message that names the missing field.
                raise IndexError('login page has no hidden input named %r' % field)
            postdata[field] = values[0]
        return postdata

    def csdn_login(self, username, password, index_url, session, headers):
        """Submit the login form to ``index_url`` using ``session``.

        Returns:
            The response object of the POST (callers may ignore it; the
            login state lives in the session's cookies).
        """
        postdata = self.get_params(username, password, index_url, session, headers)
        return session.post(index_url, data=postdata, headers=headers)

    def startcrawl(self, session):
        """Run the whole crawl: log in, dump the personal page, list the blog.

        Returns:
            dict mapping article URL to article title, as produced by
            ``crawl_blog_list``.
        """
        username = 'zkwniky'
        password = '+++++++'
        index_url = 'https://passport.csdn.net/account/login'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20100101 Firefox/33.0',
        }
        self.csdn_login(username, password, index_url, session, headers)
        # Printing the personal centre page lets the operator confirm that
        # the login actually succeeded.
        self.crawl_person_csdn(session, headers)
        return self.crawl_blog_list({}, username, session, headers, 1)

    def crawl_person_csdn(self, session, headers):
        """Fetch the logged-in user's personal centre page and print its body."""
        person = session.get('http://my.csdn.net/my/mycsdn', headers=headers)
        print(person.text)

    def crawl_blog_list(self, dict_blog, username, session, headers, start_page_number):
        """Collect article links and titles from the user's blog list pages.

        Starting at ``start_page_number``, each list page is fetched, printed,
        and its articles are added to ``dict_blog`` (which is mutated in
        place and also returned). Paging stops when the page has no pager or
        when the current page is not smaller than the page number found in
        the pager's last link.

        Returns:
            ``dict_blog``, mapping ``'http://blog.csdn.net' + href`` to title.
        """
        page_number = start_page_number
        # Loop on local state so the method always fetches at least one page,
        # regardless of what an earlier crawl left in the global flag.
        while True:
            blog_url = 'http://blog.csdn.net/' + username + '/article/list/' + str(page_number)
            blog_page = session.get(blog_url, headers=headers)
            print(blog_page.text)

            html = etree.HTML(blog_page.text)
            hrefs = html.xpath(".//span[@class='link_title']//a//@href")
            titles = html.xpath(".//span[@class='link_title']//a/text()")
            # zip() stops at the shorter list, so a mismatch between the
            # number of links and text nodes cannot raise IndexError.
            for href, title in zip(hrefs, titles):
                dict_blog['http://blog.csdn.net' + href] = title

            current_page = html.xpath(".//div[@class='pagelist']//strong/text()")
            pager_links = html.xpath(".//div[@class='pagelist']//a//@href")
            if not current_page or not pager_links:
                # No pager on the page: nothing further to fetch.
                break
            last_page = self._page_number_from_href(pager_links[-1])
            if not self.judge_next_page(current_page[0], last_page):
                break
            page_number += 1

        print(len(dict_blog))
        return dict_blog

    @staticmethod
    def _page_number_from_href(href):
        """Return the trailing path segment of ``href`` as an int, or None."""
        tail = href.rstrip('/').rsplit('/', 1)[-1]
        try:
            return int(tail)
        except ValueError:
            return None

    def judge_next_page(self, current_page, next_page):
        """Report whether ``current_page`` is numerically below ``next_page``.

        Both arguments may be ints or numeric strings; they are compared as
        integers so that, for example, page 2 is below page 10. Values that
        cannot be read as integers are treated as "no next page".

        The result is also stored in the module-level ``is_next`` flag.

        Returns:
            True if there is a further page, otherwise False.
        """
        global is_next
        try:
            is_next = int(current_page) < int(next_page)
        except (TypeError, ValueError):
            is_next = False
        return is_next


if __name__ == '__main__':
    session = requests.session()
    crawler = csdncrawl()
    crawler.startcrawl(session)

解釋如下:
1)
整體過程比較簡單,登錄時post數據如下:
_eventId=submit 
execution=e1s1 
lt=LT-597060-IAanNajzYkoNV67gnQpFNT9m7goQ7U  
password=++++++
username=zkwniky

(其中前三個的值需要在登錄頁面中的隱藏標籤中獲取)
2)
判斷是否有下一頁時,比較當前頁碼與分頁欄最後一個鏈接中的頁碼:當前頁碼較小則表示還有下一頁
3)
python 2.7 執行成功
4)登錄到個人中心時 返回json數據地址如下
全部文章
http://my.csdn.net/my/mycsdn/get_read_list?lastId=-&size=10&direction=down&type=
熱門博客列表
http://my.csdn.net/my/mycsdn/get_hot_blog_list?pageno=1&pagesize=5&username=zkwniky
熱門資源列表
http://my.csdn.net/my/mycsdn/get_hot_download_list
熱門搜索 :java,python,spring,mysql,php
http://so.csdn.net/so/search/hotQuery.do?&callback=jQuery19009254916422648101_1501121642771&size=5&_=1501121642773
精彩回答
http://my.csdn.net/my/mycsdn/get_ask_list





 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章