小白逆襲大神之綜合大作業
這裏是三歲,轉眼間已經到了大作業了,成功與否最後一搏,最後給小白一點建議吧!加油!等到全部結束後回過頭再看一遍效果特別好,會發現忽然間恍然大悟!
綜合大作業
一:先爬取評論
二:數據處理,分詞,清洗
三:詞頻統計
四:詞雲生成
其他具體的在文章裏面已經很齊全了
不需要多說什麼了
小白專屬嘛,怕大家沒有爬取過評論、沒有做過詞雲,在這裏把前段時間做的B站up主評論爬取和中文詞雲製作模板分享出來,給大家作爲參考。
'''
B站爬取
https://api.bilibili.com/x/v1/dm/list.so?oid=837806779 彈幕api
https://api.bilibili.com/x/v2/reply?type=1&oid=837806779&&pn=1 評論api
彈幕只能夠用oid獲取,目前抓包尚未找到oid集中出現的地址
bug:部分網站沒有那麼嚴格按照['data']['replies']['content']['message']的順序來
'''
#導入庫
import requests
from bs4 import BeautifulSoup
import re
import json
def Gethtml(url, timeout=10):  # fetch a web page
    """Download ``url`` and return the response body as text.

    The request is sent with a desktop-browser User-Agent header, and the
    body is decoded with the encoding detected from its content
    (``apparent_encoding``).

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the crawl
            indefinitely.

    Returns:
        The decoded response body.
    """
    kv = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36"}
    r = requests.get(url, headers=kv, timeout=timeout)
    r.encoding = r.apparent_encoding
    return r.text
# Get the uploader's uid
def u_id(uid_html):
    """Extract the uploader's numeric id from a search-result page.

    The first ``<a class="title">`` element is located and the digits that
    follow ``space.bilibili.com/`` inside it are returned, e.g.
    ``390461123`` for ``href="//space.bilibili.com/390461123?from=search"``.

    Args:
        uid_html: HTML text of the search page.

    Returns:
        The uid as a string of digits.

    Raises:
        IndexError: If the page contains no uploader link. The type matches
            what the earlier ``uid[0]`` lookup raised, but now carries a
            message.
    """
    html = BeautifulSoup(uid_html, 'lxml')
    link = html.find(name='a', attrs={"class": "title"})
    # Match only the digits of the id. A greedy "(.+)f.*" pattern would run
    # up to the last "f" anywhere in the tag, so an uploader name containing
    # "f" would corrupt the result.
    found = re.search(r'space\.bilibili\.com/(\d+)', str(link))
    if found is None:
        raise IndexError('no space.bilibili.com uploader link found in the search page')
    uid = found.group(1)
    print('以獲取up主的id爲', uid)
    return uid
def a_id(aid_html):  # aids of the videos
    """Parse one page of an uploader's video list and return the video aids.

    Args:
        aid_html: JSON text of the response. The videos are read from
            ``data -> list -> vlist``; every entry must carry an ``aid`` key.

    Returns:
        The ``aid`` values in response order. An empty list is returned when
        the response has no video list (``data``, ``list`` or ``vlist`` is
        missing or null), so one bad page does not abort the whole crawl.
    """
    videos = json.loads(aid_html)
    # Error payloads may omit these levels or set them to null.
    data = videos.get('data') or {}
    vlist = (data.get('list') or {}).get('vlist') or []
    aid_list = [video['aid'] for video in vlist]
    print('已獲得視頻id長度爲:', len(aid_list))
    return aid_list
def comment_save(name, comment_html):  # save the comments of one reply page
    """Append the comment texts of one reply-API response to ``{name}.txt``.

    Every comment is written on its own line. Nothing is written or printed
    when the response carries no replies (``data`` or ``replies`` missing,
    null or empty).

    Args:
        name: Output file name without the ``.txt`` extension.
        comment_html: Response object whose ``text`` attribute holds the JSON
            body; messages are read from
            ``data -> replies[] -> content -> message``.
    """
    videos = json.loads(comment_html.text)
    # Error payloads may omit "data" or set it to null.
    replies = (videos.get('data') or {}).get('replies')
    if not replies:
        return
    messages = [reply['content']['message'] for reply in replies]
    print(messages)
    # Open the file once and write every message; writing only the loop
    # variable would drop comments or run them together with no separator.
    with open(f'{name}.txt', 'a+', encoding='utf-8') as f:
        f.writelines(message + '\n' for message in messages)
    print('提取完畢!')
def main(name):  # main function
    """Crawl the comments under an uploader's videos into ``{name}.txt``.

    Steps:
      1. Search for ``name`` and extract the uploader's uid from the result.
      2. Walk up to 9 pages of the uploader's video list (30 videos per
         request), stopping at the first page that yields no videos.
      3. For every video, request comment pages 1-14 and pass each response
         to ``comment_save``.

    Args:
        name: Uploader name used as the search keyword and as the base name
            of the output file.
    """
    # Get the uploader's id
    url_uid = f"https://search.bilibili.com/all?keyword={name}"
    uid = u_id(Gethtml(url_uid))

    # Fetch up to 9 pages of videos (there may be fewer)
    for video_page in range(1, 10):
        aid_url = f"https://api.bilibili.com/x/space/arc/search?mid={uid}&ps=30&tid=0&pn={video_page}&keyword=&order=pubdate&jsonp=jsonp"
        aid_list = a_id(Gethtml(aid_url))
        if not aid_list:
            # No videos on this page, so later pages are not requested.
            break

        # Fetch and save the comments. The page indices have distinct names
        # so the comment loop no longer shadows the video-page index.
        for aid in aid_list:
            for comment_page in range(1, 15):
                comment_uil = f"https://api.bilibili.com/x/v2/reply?type=1&oid={aid}&&pn={comment_page}"
                # The timeout keeps one stalled request from hanging the crawl.
                comment_html = requests.get(comment_uil, timeout=10)
                comment_save(name, comment_html)
# Run the crawler for one uploader name (used as the search keyword).
main('敬漢卿')  # call the main function
# Presumably other uploader names that can be passed to main() instead:
#賢寶寶baby
#老師好我叫何同學
#大祥哥來了
#女胖胖
#記錄生活的蛋黃派
詞雲生成器(自己修改)
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import jieba
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Directory of this script; the mask image is looked up next to it.
d = path.dirname(__file__)

# Read the whole text. Use encoding 'gbk' or 'utf-8' depending on the file.
# Location of the comment file. A raw string keeps the Windows backslashes
# literal instead of relying on unrecognised escapes such as "\p", and the
# with-block closes the file after reading.
with open(r"D:\python3.7.4\爬蟲\女胖胖.txt", 'r', encoding='utf-8') as f:
    t = f.read()

# Segment the Chinese text with jieba and join the tokens with spaces so
# WordCloud can split them into words.
ls = jieba.lcut(t)
text = ' '.join(ls)

# The mask image should be a PNG with a white background, otherwise the
# result does not look good.
alice_coloring = np.array(Image.open(path.join(d, "手繪美女.png")))

# Stop words: the built-in set is English, so Chinese stop words have to be
# added by hand (lists can be found online).
stopwords = set(STOPWORDS)
stopwords.add(" ")

# The mask parameter sets the shape of the word cloud.
wc = WordCloud(background_color="white", font_path="msyh.ttc", max_words=2000, mask=alice_coloring,
               stopwords=stopwords, max_font_size=40, random_state=42)
# generate word cloud
wc.generate(text)

# create coloring from image
image_colors = ImageColorGenerator(alice_coloring)

# show
# With only the mask set, the word cloud takes the shape of the image.
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.figure()
# recolor wordcloud and show
# we could also give color_func=image_colors directly in the constructor;
# this way the font colours follow the colour layout of the given image.
plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
plt.figure()
# Finally show the mask image itself for comparison.
plt.imshow(alice_coloring, cmap=plt.cm.gray, interpolation="bilinear")
plt.axis("off")
plt.show()
李子柒最後的詞雲圖,因爲數據處理太麻煩,沒有好好處理,大家多多包涵!!!
大家加油!時間不多,還得多多努力!!!奧利給!