20.2 程序設計的思路
·抓取網頁數據
https://movie.douban.com/cinema/nowplaying/ganzhou/
·清理數據
·用詞雲進行展示
20.3 關鍵技術
· 使用WordCloud
# --- Basic word cloud: render test.txt inside the alice.png mask ---
import wordcloud
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS

# BUG FIX: `from scipy.misc import imread` was removed in SciPy 1.2;
# PIL + numpy produces the same ndarray the old call returned.
# (The original also contained an IPython `open??` magic, dropped here
# because it raises NameError outside a notebook.)

# Explicit encoding for Chinese text; `with` closes the handle.
with open('test.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Mask image: words are laid out inside the non-white region.
bg_pic = np.array(Image.open('alice.png'))

wc = WordCloud(
    background_color='white',
    mask=bg_pic,
    font_path='simhei.ttf',   # CJK-capable font, required for Chinese glyphs
    max_words=100,
    max_font_size=150,
    random_state=30,          # fixed seed -> reproducible layout
    scale=1.5,
)
wc.generate_from_text(text)

image_colors = ImageColorGenerator(bg_pic)  # built but unused here; kept for parity
plt.imshow(wc)
plt.axis('off')
plt.show()
print('display success!')
wc.to_file('test2.jpg')
# --- Word cloud with custom stopwords and a mask loaded from C:\ ---
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS

# (Two IPython `??` magics from the notebook export were removed; they
# raise NameError when the file is run as a plain script.)

with open('test.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# BUG FIX: path.join('C:', 'alice.png') produces the drive-relative path
# 'C:alice.png', which resolves against C:'s current directory. Joining
# against the drive root gives the intended absolute path.
alice_coloring = np.array(Image.open(path.join('C:\\', 'alice.png')))

# Extend wordcloud's built-in English stopword set with common Chinese particles.
stopwords = set(STOPWORDS)
stopwords.add('的')
stopwords.add('了')

wc = WordCloud(background_color='white',
               max_words=2000,
               mask=alice_coloring,
               stopwords=stopwords,
               max_font_size=40,
               random_state=42,
               font_path='simhei.ttf')
wc.generate(text)

image_colors = ImageColorGenerator(alice_coloring)  # unused; kept for parity
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
wc.to_file('test3.jpg')
plt.show()
# --- Keyword-weighted cloud: jieba TextRank weights drive word sizes ---
import jieba.analyse
from PIL import Image, ImageSequence
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

# BUG FIX: the original `for i in f: lyric += f.read()` consumed the first
# line through the iterator and then read only the remainder, silently
# dropping line 1. Read the whole file once; `with` closes the handle.
with open('./test.txt', 'r', encoding='utf-8') as f:
    lyric = f.read()

# Top-50 TextRank keywords -> {word: weight}
result = jieba.analyse.textrank(lyric, topK=50, withWeight=True)
keywords = dict(result)
print(keywords)

image = Image.open('./alice.png')
graph = np.array(image)
wc = WordCloud(font_path='simhei.ttf',
               background_color='White',
               max_words=50,
               mask=graph)
wc.generate_from_frequencies(keywords)

# Recolor the cloud from the mask image's palette. The original drew a
# plain imshow first and immediately overdrew it — the redundant call is gone.
image_color = ImageColorGenerator(graph)
plt.imshow(wc.recolor(color_func=image_color))
plt.axis('off')
plt.show()
wc.to_file('dream.png')
20.4 源代碼
1、抓取網頁數據
# ---------------------------------------------------------------------------
# Step 1: scrape Douban's movie page and pull one movie's short comments.
# ---------------------------------------------------------------------------
# Alternative fetch kept for reference; disabled because urllib sends no
# User-Agent header and Douban rejects such requests.
'''
from urllib import request
resp = request.urlopen('https://movie.douban.com/cinema/nowplaying/ganzhou/')
html_data = resp.read().decode('utf-8')
'''
import requests
url = 'https://movie.douban.com/nowplaying/ganzhou/'
# Desktop-browser User-Agent so the request is not rejected as a bot.
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
r = requests.get(url,headers=headers)
html_data = r.text
print(html_data)
from bs4 import BeautifulSoup as bs
soup = bs(html_data, 'html.parser')
# NOTE(review): this selects the div with id 'upcoming' (coming-soon movies),
# not the now-playing section — confirm that is intentional.
nowplaying_movie = soup.find_all('div', id = 'upcoming')
print('爬取的div的長度是', len(nowplaying_movie), end = '\n--------------------------\n')
print(nowplaying_movie, end = '\n--------------------------\n')
# Each <li class="list-item"> is one movie entry.
nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_ = 'list-item')
print('爬取的li的長度是', len(nowplaying_movie_list), end = '\n--------------------------\n')
print(nowplaying_movie_list)
print(nowplaying_movie_list[5])
# Build [{'id': douban_subject_id, 'name': movie_title}, ...]
nowplaying_list = []
for item in nowplaying_movie_list:
    print(item,end = '\n--------------\n')
    nowplaying_dict = {}
    nowplaying_dict['id'] = item['data-subject']
    for tag_img_item in item.find_all('img'):
        print(tag_img_item,end = '\n++++++++++++++++++++++\n')
        # The <img alt="..."> attribute carries the title; if an item has
        # several <img> tags, the last one's alt wins.
        nowplaying_dict['name'] = tag_img_item['alt']
    nowplaying_list.append(nowplaying_dict)
nowplaying_list
nowplaying_dict
# Fetch the short-comments page for the 9th movie in the list.
requrl = 'https://movie.douban.com/subject/' + nowplaying_list[8]['id'] + '/comments?status=F'
resp = requests.get(requrl , headers = headers)
html_data = resp.text
print(html_data, end = '\n------------------------------------------------------------------------------------\n')
soup = bs(html_data, 'html.parser')
comment_div_lists = soup.find_all('div', class_ = 'comment')
print('有%d人評論' % len(comment_div_lists))
print(comment_div_lists)
# Extract the plain text of each comment: first <p> with empty class, first
# <span> inside it. NOTE: .string may be None for comments with rich markup.
eachCommentList = []
for item in comment_div_lists:
    comment = item.find_all('p', class_ = '')
    c = comment[0].find_all('span')[0].string
    eachCommentList.append(c)
print(eachCommentList)
2、數據清洗
# ---------------------------------------------------------------------------
# Step 2: clean the scraped comments and extract weighted keywords.
# ---------------------------------------------------------------------------
# Join all comments in one pass (str() also covers None entries) instead of
# the original quadratic `comments = comments + ...` loop.
comments = ''.join(str(item).strip() for item in eachCommentList)
print(comments)

import re
# Keep only CJK unified ideographs; drops punctuation, digits and Latin text.
pattern = re.compile(r'[\u4e00-\u9fa5]+')
filterdata = re.findall(pattern, comments)
cleaned_comments = ''.join(filterdata)
print(cleaned_comments)

import jieba.analyse
# Top-50 TextRank keywords -> {word: weight}
result = jieba.analyse.textrank(cleaned_comments, topK=50, withWeight=True)
keywords = dict(result)
print('刪除停用詞前', keywords)

# Merge wordcloud's built-in English stopwords with the Chinese list on disk.
stopwords = set(STOPWORDS)
# BUG FIX: the original `word[:-1]` chopped the last character of a final
# line lacking a trailing newline; rstrip('\n') is safe either way, and
# `with` closes the file handle the original leaked.
with open('./StopWords.txt', encoding='utf8') as f:
    for word in f:
        stopwords.add(word.rstrip('\n'))
print(stopwords)
print('------------------------------------')
keywords = {x: keywords[x] for x in keywords if x not in stopwords}
print('刪除停用詞後', keywords)
3、用詞雲進行顯示
# ---------------------------------------------------------------------------
# Step 3: render the keyword weights as a word cloud.
# ---------------------------------------------------------------------------
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import WordCloud

wordcloud = WordCloud(font_path='simhei.ttf',    # CJK font for Chinese words
                      background_color='white',
                      max_font_size=80,
                      stopwords=stopwords)       # NOTE: stopwords are not applied
                                                 # by frequency-based generation
word_frequence = keywords
# fit_words() is only a thin deprecated alias; call the real API directly.
myword = wordcloud.generate_from_frequencies(word_frequence)
plt.imshow(myword)
plt.axis('off')
plt.show()
4、完整程序代碼(取前二十頁評論)
import warnings
warnings.filterwarnings('ignore')

# stdlib
import re
from urllib import request

# third-party
import jieba
import jieba.analyse
import matplotlib
import matplotlib.pyplot as plt
import numpy
import requests
from bs4 import BeautifulSoup as bs
from wordcloud import WordCloud, STOPWORDS

matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
def getNowPlayingMovie_list():
    """Scrape Douban and return [{'id': subject_id, 'name': title}, ...].

    Returns an empty list when the expected markup is absent (request
    blocked or page layout changed) instead of raising IndexError.
    """
    url = 'https://movie.douban.com/nowplaying/ganzhou/'
    # Desktop-browser User-Agent so Douban does not reject the request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
    r = requests.get(url, headers=headers)
    soup = bs(r.text, 'html.parser')
    # NOTE(review): id='upcoming' is the coming-soon block, not the
    # now-playing list — confirm this is the intended section.
    nowplaying_movie = soup.find_all('div', id='upcoming')
    if not nowplaying_movie:
        return []
    nowplaying_list = []
    for item in nowplaying_movie[0].find_all('li', class_='list-item'):
        nowplaying_dict = {}
        nowplaying_dict['id'] = item['data-subject']
        for tag_img_item in item.find_all('img'):
            # <img alt="..."> carries the title; the last <img> wins.
            nowplaying_dict['name'] = tag_img_item['alt']
        nowplaying_list.append(nowplaying_dict)
    return nowplaying_list
def getCommentsById(movieId, pageNum):
    """Fetch one 20-comment page of short comments for `movieId`.

    Returns a list of comment strings (entries may be None when a comment
    has no plain-text span), or False when pageNum is not positive.
    """
    if pageNum > 0:
        start = (pageNum - 1) * 20   # Douban paginates in steps of 20
    else:
        return False
    # BUG FIX: `headers` was an undefined global in the complete program
    # (NameError on every call); define it locally.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
    requrl = 'https://movie.douban.com/subject/' + movieId + '/comments?start=' + str(start) + '&limit=20&sort=new_score&status=F'
    print(requrl)
    resp = requests.get(requrl, headers=headers)
    soup = bs(resp.text, 'html.parser')
    eachCommentList = []
    for item in soup.find_all('div', class_='comment'):
        # First <p> with empty class, first <span> inside it holds the text.
        comment = item.find_all('p', class_='')
        c = comment[0].find_all('span')[0].string
        eachCommentList.append(c)
    return eachCommentList
def main():
    """End-to-end run: scrape 10 comment pages, clean them, show the cloud."""
    commentList = []
    NowPlayingMovie_list = getNowPlayingMovie_list()
    # Pages 1..10 (20 comments each) of the second movie in the list.
    for i in range(10):
        commentList_temp = getCommentsById(NowPlayingMovie_list[1]['id'], i + 1)
        commentList.append(commentList_temp)
    # str() on each page-list is harmless: the regex below keeps only the
    # Chinese characters, discarding list brackets/quotes/None.
    comments = ''.join(str(page).strip() for page in commentList)
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    cleaned_comments = ''.join(re.findall(pattern, comments))
    print(cleaned_comments)
    # Top-50 TextRank keywords -> {word: weight}
    result = jieba.analyse.textrank(cleaned_comments, topK=50, withWeight=True)
    keywords = dict(result)
    print('刪除停用詞前', keywords)
    stopwords = set(STOPWORDS)
    # BUG FIX: `word[:-1]` truncated the final stopword when the file lacks
    # a trailing newline; rstrip('\n') handles both cases and `with` closes
    # the handle the original leaked.
    with open('./StopWords.txt', encoding='utf8') as f:
        for word in f:
            stopwords.add(word.rstrip('\n'))
    print(stopwords)
    keywords = {x: keywords[x] for x in keywords if x not in stopwords}
    print('刪除停用詞後', keywords)
    wordcloud = WordCloud(font_path='simhei.ttf',
                          background_color='white',
                          max_font_size=80,
                          stopwords=stopwords)
    # generate_from_frequencies is the non-deprecated form of fit_words.
    myword = wordcloud.generate_from_frequencies(keywords)
    plt.imshow(myword)
    plt.axis('off')
    plt.show()

# Guard the entry point so importing this module does not trigger scraping.
if __name__ == '__main__':
    main()