# coding:utf-8
import requests
from lxml import etree

# Module-level flag holding the result of the most recent next-page check.
# It is still updated for backward compatibility, but crawl_blog_list no
# longer relies on its value at entry (a stale False used to make a second
# crawl fetch nothing).
is_next = True


class csdncrawl():
    """Log in to CSDN through a shared session and collect a blog list."""

    def _first_input_value(self, html, name):
        """Return the ``value`` attribute of the first ``<input name=...>``.

        Raises:
            IndexError: if the parsed page contains no such input.
        """
        values = html.xpath(".//input[@name='%s']//@value" % name)
        if not values:
            raise IndexError("login page has no input named %r" % name)
        return values[0]

    def _page_from_href(self, href):
        """Return the trailing path segment of *href* as an int, or None.

        The whole segment is parsed (not just its last character) so page
        numbers with two or more digits are read correctly.
        """
        tail = href.rstrip('/').rsplit('/', 1)[-1]
        try:
            return int(tail)
        except ValueError:
            return None

    def get_params(self, username, password, post_url, post_session, post_headers):
        """Build the POST payload required by the login form.

        Fetches *post_url* with *post_session* and reads the ``lt``,
        ``execution`` and ``_eventId`` input values from the returned page.

        Returns:
            dict with keys ``username``, ``password``, ``lt``, ``execution``
            and ``_eventId``.

        Raises:
            IndexError: if one of the three inputs is missing from the page.
        """
        index_page = post_session.get(post_url, headers=post_headers)
        html = etree.HTML(index_page.text)
        return {
            'username': username,
            'password': password,
            'lt': self._first_input_value(html, 'lt'),
            'execution': self._first_input_value(html, 'execution'),
            '_eventId': self._first_input_value(html, '_eventId'),
        }

    def csdn_login(self, username, password, index_url, session, headers):
        """POST the login form to *index_url* using *session*.

        Returns:
            The response object of the POST, so callers can inspect it
            (previously the response was discarded and None was returned).
        """
        postdata = self.get_params(username, password, index_url, session, headers)
        return session.post(index_url, data=postdata, headers=headers)

    def startcrawl(self, session, username='zkwniky', password='+++++++'):
        """Log in, fetch the personal-centre page, then crawl the blog list.

        Args:
            session: object providing ``get``/``post`` (e.g. requests session).
            username: account name; defaults to the previously hard-coded one.
            password: account password; defaults to the previous placeholder.

        Returns:
            dict mapping article URL to article title.
        """
        start_page_number = 1
        dict_blog = {}
        index_url = 'https://passport.csdn.net/account/login'
        agent = 'Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20100101 Firefox/33.0'
        headers = {
            'User-Agent': agent
        }
        self.csdn_login(username, password, index_url, session, headers)
        self.crawl_person_csdn(session, headers)
        return self.crawl_blog_list(dict_blog, username, session, headers,
                                    start_page_number)

    def crawl_person_csdn(self, session, headers):
        """Fetch the personal-centre page and print its body.

        The printed text lets the operator check by eye whether the login
        worked; no automatic check is performed.
        """
        person_url = 'http://my.csdn.net/my/mycsdn'
        person = session.get(person_url, headers=headers)
        print(person.text)

    def crawl_blog_list(self, dict_blog, username, session, headers, start_page_number):
        """Walk the paginated article list of *username*.

        Every article link found is stored in *dict_blog* (mutated in place)
        as ``'http://blog.csdn.net' + href -> title``. Paging stops when the
        current page number is no longer smaller than the page number in the
        last pager link, or when the page has no pager at all.

        Returns:
            The same *dict_blog* object, after printing its size.
        """
        global is_next
        page_number = start_page_number
        has_next = True
        while has_next:
            blog_url = ('http://blog.csdn.net/' + username +
                        '/article/list/' + str(page_number))
            blog_page = session.get(blog_url, headers=headers)
            print(blog_page.text)
            html = etree.HTML(blog_page.text)
            hrefs = html.xpath(".//span[@class='link_title']//a//@href")
            titles = html.xpath(".//span[@class='link_title']//a/text()")
            current = html.xpath(".//div[@class='pagelist']//strong/text()")
            pager_links = html.xpath(".//div[@class='pagelist']//a//@href")

            for href, title in zip(hrefs, titles):
                dict_blog['http://blog.csdn.net' + href] = title

            if current and pager_links:
                has_next = self.judge_next_page(
                    current[0], self._page_from_href(pager_links[-1]))
            else:
                # No pager on the page: nothing further to fetch.
                is_next = has_next = False

            if has_next:
                page_number += 1
        print(len(dict_blog))
        return dict_blog

    def judge_next_page(self, current_page, next_page):
        """Return True when *current_page* is numerically below *next_page*.

        Both values may be ints or numeric strings; they are compared as
        integers so that e.g. ``'9' < '10'`` holds. Values that cannot be
        converted (including None) yield False. The result is also stored in
        the module-level ``is_next`` flag.
        """
        global is_next
        try:
            is_next = int(current_page) < int(next_page)
        except (TypeError, ValueError):
            is_next = False
        return is_next


if __name__ == '__main__':
    session = requests.session()
    # Use a distinct name so the class itself is not shadowed by its instance.
    crawler = csdncrawl()
    crawler.startcrawl(session)
解釋如下:
1)
整體過程比較簡單,登錄時post數據如下:
_eventId=submit
execution=e1s1
lt=LT-597060-IAanNajzYkoNV67gnQpFNT9m7goQ7U
password=++++++
username=zkwniky
(其中前三個的值需要在登錄頁面中的隱藏標籤中獲取)
2)
判斷是否有下一頁時,將當前頁碼與分頁欄最後一個鏈接中的頁碼比較:當前頁碼較小時,表示還有下一頁
3)
python 2.7 執行成功
4)登錄到個人中心後,頁面請求的返回 JSON 數據的地址如下:
全部文章
http://my.csdn.net/my/mycsdn/get_read_list?lastId=-&size=10&direction=down&type=
熱門博客列表
http://my.csdn.net/my/mycsdn/get_hot_blog_list?pageno=1&pagesize=5&username=zkwniky
熱門資源列表
http://my.csdn.net/my/mycsdn/get_hot_download_list
熱門搜索 :java,python,spring,mysql,php
http://so.csdn.net/so/search/hotQuery.do?&callback=jQuery19009254916422648101_1501121642771&size=5&_=1501121642773
精彩回答
http://my.csdn.net/my/mycsdn/get_ask_list