# csdn博文爬蟲

#詳細代碼
'''
program: csdn博文爬蟲
function: 實現對我的csdn主頁所有博文的日期、主題、訪問量、評論個數信息爬取
version: python 3.5.1
time: 2016/05/29
author: yr
'''

import gzip
import os
import random
import re
import time
import urllib.request

#定義保存文件函數
def saveFile(data, i, folder=r"E:\projects\Spider\05_csdn\papers"):
    """Save the scraped post list for one page to <folder>/paper_<i+1>.txt.

    data   -- iterable of per-post summary strings
    i      -- 0-based page index; the file name and page header use i + 1
    folder -- destination directory (default keeps the original hard-coded path)
    """
    # Raw string avoids the original "\05_csdn" octal escape, which put a
    # chr(5) control character into the path instead of "05".
    path = os.path.join(folder, "paper_" + str(i + 1) + ".txt")
    # 'with' guarantees the handle is closed even if a write raises
    # (the original leaked the file object on error).
    with open(path, 'wb') as f:
        # Page header; the whole file is written as GBK-encoded bytes.
        f.write(('當前頁:' + str(i + 1) + '\n').encode('gbk'))
        for d in data:
            f.write((str(d) + '\n').encode('gbk'))

#解壓縮數據
def ungzip(data):
    """Return *data* gunzipped; if it is not gzip data, return it unchanged.

    The server may answer our "Accept-Encoding: gzip" with either a compressed
    or a plain body, so best-effort decompression is intentional here.
    """
    try:
        data = gzip.decompress(data)
    except OSError:
        # Narrowed from a bare except: gzip.decompress raises OSError
        # when the payload is not gzip-compressed.
        print("未經壓縮,無需解壓…")
    return data

#CSDN爬蟲類
#CSDN爬蟲類
class CSDNSpider:
    """Spider for one CSDN blog: finds how many article-list pages exist and
    scrapes each post's date, category, title, link, view and comment counts."""

    def __init__(self, pageIdx=1, url="http://blog.csdn.net/fly_yr/article/list/1"):
        """Point the spider at list page *pageIdx*.

        Fixed from the original ``init`` — without the dunder name the
        constructor never ran and instances had no attributes.
        """
        # Current (1-based) list page.
        self.pageIdx = pageIdx
        # Swap the trailing page number of the template URL for pageIdx.
        self.url = url[0:url.rfind('/') + 1] + str(pageIdx)
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36",
            # "*/*" restored — the blog paste's markdown ate the asterisks.
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Host": "blog.csdn.net"
        }

    def _fetch(self):
        """Download self.url and return the body as a utf-8 str.

        Shared by getPages/readData, which previously duplicated this code.
        """
        req = urllib.request.Request(url=self.url, headers=self.headers)
        res = urllib.request.urlopen(req)
        # The response may be gzip-compressed (we advertise gzip support),
        # so decompress before decoding.
        return ungzip(res.read()).decode('utf-8')

    #求總頁數
    def getPages(self):
        """Return the total number of list pages, parsed from the pager widget.

        Returns the captured text (a numeric string); callers int() it.
        """
        data = self._fetch()
        pages = r'<div.*?pagelist">.*?<span>.*?共(.*?)頁</span>'
        pattern = re.compile(pages, re.DOTALL)
        # Raises IndexError if the pager is missing — same as the original.
        return re.findall(pattern, data)[0]

    #設置要抓取的博文頁面
    def setPage(self, idx):
        """Retarget self.url at list page *idx*, keeping the base URL."""
        self.url = self.url[0:self.url.rfind('/') + 1] + str(idx)

    #讀取博文信息
    def readData(self):
        """Scrape the current list page; return a list of formatted summaries.

        Each entry: date, category, title, absolute link, view/comment counts.
        """
        # One <dl class="list_c clearfix"> per post; groups capture, in order:
        # year, month, day, category, relative link, title, views, comments.
        # ('item_re' — the original shadowed the builtin 'str' here.)
        item_re = r'<dl.*?list_c clearfix">.*?date_t"><span>(.*?)</span><em>(.*?)</em>.*?date_b">(.*?)</div>.*?' + \
                  r'<a.*?set_old">(.*?)</a>.*?<h3.*?list_c_t"><a href="(.*?)">(.*?)</a></h3>.*?' + \
                  r'<div.*?fa fa-eye"></i><span>\((.*?)\)</span>.*?fa-comment-o"></i><span>\((.*?)\)</span></div>'
        data = self._fetch()
        pattern = re.compile(item_re, re.DOTALL)
        items = re.findall(pattern, data)
        ret = []
        for item in items:
            ret.append(item[0] + '年' + item[1] + '月' + item[2] + '日' + '\t' + item[3]
                       + '\n標題:' + item[5]
                       + '\n鏈接:http://blog.csdn.net' + item[4]
                       + '\n' + '閱讀:' + item[6] + '\t評論:' + item[7] + '\n')
        return ret

#定義爬蟲對象
# Script entry: crawl every list page of the blog and save each page's posts.
cs = CSDNSpider()
# Total number of list pages, parsed from the pager on the first page.
pagesNum = int(cs.getPages())
print("博文總頁數: ", pagesNum)

for idx in range(pagesNum):
    # Pages are 1-based: the original passed idx, so it fetched page 0 and
    # never fetched the last page — off-by-one fixed here.
    cs.setPage(idx + 1)
    print("當前頁:", idx + 1)
    # All posts on the current page, as a list of summary strings.
    papers = cs.readData()
    saveFile(papers, idx)

# (blog-page residue from the copy-paste, kept as comments so the file parses)
# 發表評論
# 所有評論
# 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
# 相關文章