I recently came across a post whose author had crawled his own blog and built a tag cloud from it. Below I use my own approach to crawl that blogger's blog (my own blog isn't worth crawling, so I recommend following him instead; his blog link appears below!).
First, the imports: I use the requests package to fetch the HTML, and pyquery combined with regular expressions to extract the data.
# -*- coding: utf-8 -*-
import requests
import time
from pyquery import PyQuery as pq
import re
import sys

# Python 2 only: reset the interpreter's default encoding to UTF-8
# so that mixing str and unicode does not raise UnicodeDecodeError
reload(sys)
sys.setdefaultencoding('utf-8')
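As a quick illustration of how the two cooperate (a minimal sketch on made-up HTML, not the blog's real markup): the regular expression cuts out a coarse fragment, and pyquery extracts the clean text from it.

# Made-up HTML fragment, for illustration only
snippet = '<div><span class="link_title"><a href="/p/1">Hello <b>World</b></a></span></div>'
fragment = re.findall('<span class="link_title">.*?</span>', snippet, re.S)[0]
print pq(fragment).text()   # -> Hello World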
The method that fetches a page's HTML:
def get_html(url):
    # Use your browser's developer tools to look up your own User-Agent
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'}
    try:
        w = requests.get(url, headers=headers, timeout=10)
        html = w.text
    except:  # retry a few times to cope with anti-crawler throttling
        time.sleep(5)
        html = None
        for i in range(3):
            try:
                w = requests.get(url, headers=headers, timeout=10)
                html = w.text
                break
            except Exception as err:
                print err
                html = None
                continue
    return html
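Before wiring get_html into the crawler, a quick sanity check (a sketch; the URL is the blog crawled in main below):

# Fetch one page and confirm something came back
html = get_html("http://blog.csdn.net/forezp")
if html is not None:
    print len(html)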
The method that extracts the next-page link:
def get_next_page(html):
    doc = pq(html)
    pagelist = doc(".pagelist").html()
    if pagelist is None:  # no pagination bar on this page
        return None
    link = re.findall("<a.*?</a>", pagelist)
    for i in link:
        if "下一頁" in i:  # the link whose text means "next page"
            next_page = re.findall('href="(.*?)"', i)[0]
            return next_page
    return None
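To see what get_next_page matches, here is a sketch on a hand-written pagination fragment; the .pagelist class and the link text are what the code above assumes, and the real CSDN markup may differ:

# Hand-written pagination fragment, for illustration only
fake = (u'<div><div class="pagelist">'
        u'<a href="/forezp/article/list/1">上一頁</a>'
        u'<a href="/forezp/article/list/3">下一頁</a>'
        u'</div></div>')
print get_next_page(fake)   # -> /forezp/article/list/3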
The method that extracts the article titles:
def get_article_title(html):
    # re.S lets "." match newlines, so the pattern can span lines
    link = re.findall('<span class="link_title">.*?</span>', html, re.S)
    title = []
    for i in link:
        doc = pq(i)
        each = doc.text()
        if "[置頂]" in each:  # strip CSDN's "pinned post" marker
            each = each.replace("[置頂]", '')
        each = each.strip()
        title.append(each)
    return title
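A similar sketch for get_article_title on a fabricated listing fragment ([置頂] is CSDN's pinned-post marker):

# Fabricated listing fragment with one pinned and one normal post
fake = (u'<span class="link_title"><a href="/a/1">[置頂] First post</a></span>'
        u'<span class="link_title"><a href="/a/2">Second post</a></span>')
print get_article_title(fake)   # -> [u'First post', u'Second post']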
Write the crawled titles to a file (this crawler only collects titles).
def write(text):
    with open("text", "a+") as f:
        f.write(text + "\n")
The main method:
def main():
    url = "http://blog.csdn.net/forezp"
    page = url
    while page is not None:
        page_html = get_html(page)
        if page_html is None:  # give up if even the retries failed
            break
        title = get_article_title(page_html)
        for i in title:
            write(i)
        next_page = get_next_page(page_html)
        if next_page is not None:
            page = "http://blog.csdn.net" + next_page
        else:
            page = None

if __name__ == '__main__':
    main()
Finally, just run the main method and the crawl is done.
Next comes word segmentation and word-frequency counting.
Segment with jieba, removing stop words and duplicates:
# -*- coding: utf-8 -*-
import jieba

def get_all(lines, sw):
    # Collect every distinct word that is not a stop word
    words = []
    for i in lines:
        ws = jieba.cut(i)
        for w in ws:
            if w not in sw and w not in words:
                words.append(w)
    return words
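To get a feel for what jieba.cut yields (a minimal sketch; the sample sentence and the tiny stop-word list are made up, and the exact segmentation depends on jieba's dictionary):

# Segment one made-up sentence and drop a tiny stop-word list
sample = [u'這是一個簡單的分詞例子']
stopwords = [u'這是', u'一個', u'的']
for w in get_all(sample, stopwords):
    print w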
Read a file:
def get_file(path, charset='utf-8'):
    # "with" closes the file automatically, so no explicit close is needed
    with open(path) as f:
        lines = [line.strip().decode(charset) for line in f]
    return lines
Count the word frequencies:
def get_count(lines, words):
    # Flatten all lines into one token list, then count each word
    ws = []
    for i in lines:
        a = jieba.cut(i)
        for j in a:
            ws.append(j)
    v = []
    for w in words:
        v.append(ws.count(w))
    return v
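Note that ws.count(w) rescans the whole token list once per distinct word. That is fine for one blog's titles, but for a bigger corpus collections.Counter from the standard library does the same tally in a single pass; a sketch of that alternative:

# One-pass alternative to get_count using the standard library
from collections import Counter
import jieba

def get_count_fast(lines, words):
    freq = Counter()
    for line in lines:
        freq.update(jieba.cut(line))  # count every token as it appears
    return [freq[w] for w in words]   # words never seen count as 0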
Finally, print the results to the console:
lines = get_file("text")
sw = get_file("stopword.txt")
words = get_all(lines, sw)
count = get_count(lines, words)
for i in range(len(words)):
    print words[i] + ' ' + str(count[i])
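To go from these frequencies to the tag cloud that inspired this post, the third-party wordcloud package can render them directly. A hedged sketch, assuming wordcloud is installed (pip install wordcloud) and that font_path points to a font file on your machine that covers Chinese characters:

# Render the frequencies as a tag cloud image (wordcloud is a third-party package)
from wordcloud import WordCloud
freq = dict(zip(words, count))
wc = WordCloud(font_path='msyh.ttf', width=800, height=600, background_color='white')
wc.generate_from_frequencies(freq)
wc.to_file('tagcloud.png')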