# pip install bs4
from bs4 import BeautifulSoup # python 爬蟲利器
"""
Beautiful Soup 是一個可以從HTML或XML文件中提取數據的Python庫.
它能夠通過你喜歡的轉換器實現慣用的文檔導航,查找,
修改文檔的方式.Beautiful Soup會幫你節省數小時甚至數天的工作時間.
"""
import requests
# Fetch the blog page and collect the text of every <p> tag into one string.
blog_url = 'http://blog.51cto.com/13118411/2154806'
# timeout keeps the script from hanging forever on a dead host.
data = requests.get(blog_url, timeout=10)
print(data)
# print(data.text)
# Observed output: <Response [200]>

# data.text is the raw HTML of the blog post; 'html.parser' is the
# stdlib parser backend bundled with Python (no extra install needed).
contents = BeautifulSoup(data.text, 'html.parser')
# print(contents)  # prints the parsed, normalized document

all_p = contents.find_all('p')  # every <p> tag in the page
# ''.join is linear; repeated `all_text += ...` in a loop is quadratic.
all_text = ''.join(p.text for p in all_p)
print(all_text)
# Observed output: 掃一掃體驗手機閱讀0分享收藏Ctrl+Enter 發佈發佈取消0
# pip install jieba -- segments Chinese text into individual words
import jieba

text = jieba.cut(all_text)  # lazy generator of segmented tokens
"""
Signature: jieba.cut(sentence, cut_all=False, HMM=True)
Docstring:
The main function that segments an entire sentence that contains
Chinese characters into seperated words.
"""
# Print each token as it streams out and keep a materialized copy,
# since the generator can only be consumed once.
text_list = []
for t in text:
    print(t)
    text_list.append(t)
# Observed output (jieba builds/loads its dictionary cache on first use):
# Building prefix dict from the default dictionary ...
# Dumping model to file cache C:\Users\coop\AppData\Local\Temp\jieba.cache
# Loading model cost 1.107 seconds.
# Prefix dict has been built succesfully.
# 掃一掃
# 體驗
# 手機
# 閱讀
# 0
# 分享
# 收藏
# Ctrl
# +
# Enter
# 發佈
# 發佈
# 取消
# 0
import collections  # stdlib; Counter tallies hashable items

count = collections.Counter(text_list)  # word -> occurrence count
# most_common(30): up to the 30 highest-count words, descending by count.
for key, val in count.most_common(30):
    print(key, val)
# Observed output:
# 0 2
# 發佈 2
# 掃一掃 1
# 體驗 1
# 手機 1
# 閱讀 1
# 分享 1
# 收藏 1
# Ctrl 1
# + 1
# Enter 1
#   1       (a non-breaking-space token, '\xa0')
# 取消 1
# 做接口 可以給被人這個py文件,也可以是個鏈接
import collections
def get_most_common(text_list, max_num=30):
    """Return the top ``max_num`` words and their counts in an API envelope.

    Parameters
    ----------
    text_list : iterable of str
        The words (or any hashable items) to tally.
    max_num : int, optional
        How many of the most frequent items to return (default 30).

    Returns
    -------
    dict
        ``{'status': 0, 'statusText': 'ok', 'data': [(item, count), ...]}``
        on success; on failure ``status`` is 1 and ``statusText`` holds the
        error message.
    """
    ret = {'status': 0, "statusText": 'ok', 'data': {}}  # common API envelope
    try:
        new_list = list(text_list)
        count = collections.Counter(new_list)
        ret['data'] = count.most_common(max_num)
    except Exception as e:
        ret['status'] = 1
        # str(e), not e itself: the exception object is not JSON-serializable
        # and would make statusText's type inconsistent with the 'ok' string.
        ret['statusText'] = str(e)
    return ret
# Demo call. In the original notebook the returned dict was auto-displayed;
# in a plain script the value is discarded unless printed explicitly.
get_most_common(text_list)
# Observed output:
# {'status': 0,
#  'statusText': 'ok',
#  'data': [('0', 2),
#   ('發佈', 2),
#   ('掃一掃', 1),
#   ('體驗', 1),
#   ('手機', 1),
#   ('閱讀', 1),
#   ('分享', 1),
#   ('收藏', 1),
#   ('Ctrl', 1),
#   ('+', 1),
#   ('Enter', 1),
#   ('\xa0', 1),
#   ('取消', 1)]}