20 Word Clouds in Practice: Scraping Douban Movie Reviews to Generate a Word Cloud

#!/usr/bin/env python
# coding: utf-8

20.2 Program Design Approach

# In[ ]:


·Scrape the page data
    https://movie.douban.com/cinema/nowplaying/ganzhou/
·Clean the data
·Display the result as a word cloud (a minimal pipeline sketch follows)
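
A minimal sketch of this three-stage pipeline (illustrative function names, not the final program; assumes requests, jieba and wordcloud are installed):

# In[ ]:


# Pipeline sketch only: scrape_comments / clean_text / render_cloud are
# placeholder names for the three stages developed below.
import re
import requests
import jieba.analyse
from wordcloud import WordCloud

def scrape_comments(url, headers):
    # fetch raw HTML for a Douban page
    return requests.get(url, headers=headers).text

def clean_text(html):
    # keep only runs of Chinese characters
    return ''.join(re.findall(r'[\u4e00-\u9fa5]+', html))

def render_cloud(text, out='cloud.png'):
    # rank keywords with TextRank and render them as a word cloud
    weights = dict(jieba.analyse.textrank(text, topK=50, withWeight=True))
    WordCloud(font_path='simhei.ttf').generate_from_frequencies(weights).to_file(out)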

20.3 Key Techniques

· Using WordCloud

# In[2]:


import wordcloud


# In[4]:


# WordCloud example
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
import matplotlib.pyplot as plt
from imageio import imread  # imageio replaces scipy.misc.imread, which was removed from SciPy


# In[11]:


get_ipython().run_line_magic('pinfo2', 'open')  # the notebook form of open?? (source introspection)


# In[25]:


text = open('test.txt', 'r', encoding='utf-8').read()  # sample text to visualise
bg_pic = imread('alice.png')                            # mask image for the cloud shape


# In[36]:


wc = WordCloud(
    background_color = 'white',   # canvas colour
    mask = bg_pic,                # draw words only inside the non-white mask area
    font_path = 'simhei.ttf',     # a CJK font; required for Chinese text
    max_words = 100,              # cap on the number of words rendered
    max_font_size = 150,          # largest font size
    random_state = 30,            # fixed seed for a reproducible layout
    scale = 1.5                   # upscale factor for the output image
)
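
Note: wordcloud treats pure-white (#FFFFFF) areas of the mask as masked out, so words are drawn only inside the non-white shape.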


# In[37]:


wc.generate_from_text(text)
image_colors = ImageColorGenerator(bg_pic)       # sample colours from the mask image
plt.imshow(wc.recolor(color_func=image_colors))  # recolour the words to match the mask
plt.axis('off')
plt.show()
print('display success!')
wc.to_file('test2.jpg')


# In[38]:


# Setting up stopwords
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS


# In[48]:


get_ipython().run_line_magic('pinfo2', 'path.join')  # the notebook form of path.join?? (source introspection)


# In[51]:


text = open('test.txt', 'r', encoding='utf-8').read()

alice_coloring = np.array(Image.open(path.join('C:', 'alice.png')))  # the previous example used imread(); np.array(Image.open(...)) gives the same array


# In[52]:


stopwords = set(STOPWORDS)
stopwords.add('的')
stopwords.add('了')
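
Note: wordcloud's bundled STOPWORDS set contains only English words, so common Chinese function words such as 的 and 了 have to be added by hand, as above.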


# In[67]:


wc = WordCloud(background_color='white', 
               max_words=2000, 
               mask=alice_coloring, 
               stopwords=stopwords,
               max_font_size=40, 
               random_state=42,
               font_path='simhei.ttf')  # without a CJK font_path, every Chinese character renders as an empty box

wc.generate(text)

image_colors = ImageColorGenerator(alice_coloring)
plt.imshow(wc.recolor(color_func=image_colors), interpolation='bilinear')
plt.axis('off')
wc.to_file('test3.jpg')
plt.show()


# In[68]:


get_ipython().run_line_magic('pinfo2', 'plt.imshow')  # the notebook form of plt.imshow?? (source introspection)


# In[69]:


# Using word frequencies
import jieba.analyse
from PIL import Image, ImageSequence
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator


# In[70]:


# Read the string to analyse from a file
with open('./test.txt', 'r', encoding='utf-8') as f:
    lyric = f.read()

# Tokenise the Chinese text with jieba and rank keywords by TextRank weight
result = jieba.analyse.textrank(lyric, topK=50, withWeight=True)

# Turn the (word, weight) pairs into a dict
keywords = dict()
for i in result:
    keywords[i[0]] = i[1]
print(keywords)


# In[71]:


result
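
For comparison, jieba also provides a TF-IDF based extractor with the same interface; a quick sketch, reusing the lyric string from the earlier cell:

# In[ ]:


# TF-IDF ranking as an alternative to TextRank (same topK/withWeight interface)
tfidf_result = jieba.analyse.extract_tags(lyric, topK=50, withWeight=True)
print(dict(tfidf_result))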


# In[73]:


# Draw the word cloud
image = Image.open('./alice.png')
graph = np.array(image)

wc = WordCloud(font_path='simhei.ttf', 
               background_color='white',
               max_words=50,
               mask=graph)
wc.generate_from_frequencies(keywords)
image_color = ImageColorGenerator(graph)

plt.imshow(wc.recolor(color_func=image_color))  # recolour the words to match the mask image
plt.axis('off')
plt.show()
wc.to_file('dream.png')

20.4 Source Code

1. Scraping the page data

# In[76]:


'''
from urllib import request


resp = request.urlopen('https://movie.douban.com/cinema/nowplaying/ganzhou/')
html_data = resp.read().decode('utf-8')
'''
# The plain urllib approach above no longer works: requests without
# browser-like headers are blocked by Douban's anti-scraping measures.


# In[79]:


import requests
url = 'https://movie.douban.com/nowplaying/ganzhou/'
# A browser-like User-Agent gets past the anti-scraping block
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}

r = requests.get(url,headers=headers)
html_data = r.text

print(html_data)


# In[91]:


from bs4 import BeautifulSoup as bs

soup = bs(html_data, 'html.parser')


nowplaying_movie = soup.find_all('div', id='upcoming')
print('Number of divs scraped:', len(nowplaying_movie), end='\n--------------------------\n')
# one big div (the "coming soon" section)
print(nowplaying_movie, end='\n--------------------------\n')


nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')
print('Number of li items scraped:', len(nowplaying_movie_list), end='\n--------------------------\n')  # nine movies
print(nowplaying_movie_list)


# In[98]:


print(nowplaying_movie_list[5])


# In[107]:


# Extract each movie's id and name
nowplaying_list = []
for item in nowplaying_movie_list:
    print(item, end='\n--------------\n')
    nowplaying_dict = {}
    nowplaying_dict['id'] = item['data-subject']
    for tag_img_item in item.find_all('img'):  # one poster <img> per list item
        print(tag_img_item, end='\n++++++++++++++++++++++\n')
        nowplaying_dict['name'] = tag_img_item['alt']
        nowplaying_list.append(nowplaying_dict)


# In[103]:


nowplaying_list


# In[104]:


nowplaying_dict


# In[187]:


# Fetch and parse the comments page for one movie
requrl = 'https://movie.douban.com/subject/' + nowplaying_list[8]['id'] + '/comments?status=F'
resp = requests.get(requrl, headers=headers)
html_data = resp.text
print(html_data, end='\n------------------------------------------------------------------------------------\n')
soup = bs(html_data, 'html.parser')


comment_div_lists = soup.find_all('div', class_='comment')


print('%d comments found' % len(comment_div_lists))
print(comment_div_lists)


# In[188]:


# Extract the comment text
eachCommentList = []
for item in comment_div_lists:
    comment = item.find_all('p', class_='')
    c = comment[0].find_all('span')[0].string  # the comment body sits in the first <span>
    eachCommentList.append(c)

print(eachCommentList)

2. Data cleaning

# In[190]:


# Concatenate all comments into one string
comments = ''
for k in range(len(eachCommentList)):
    comments = comments + str(eachCommentList[k]).strip()
print(comments)


# In[191]:


# Strip punctuation: keep only Chinese text
import re
pattern = re.compile(r'[\u4e00-\u9fa5]+')  # matches runs of CJK unified ideographs
# pattern = re.compile(r'[\x00-\xff]+')    # alternative: matches runs of single-byte (ASCII/Latin-1) characters instead
filterdata = re.findall(pattern, comments)
cleaned_comments = ''.join(filterdata)
print(cleaned_comments)
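
A quick illustrative check of what this pattern keeps, on a made-up mixed string:

# In[ ]:


# ASCII, digits and punctuation are dropped; only the CJK runs survive
sample = '太好看了!!! 10/10 would watch again~ 電影很棒'
print(''.join(re.findall(pattern, sample)))  # -> 太好看了電影很棒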


# In[192]:


# Keyword extraction with jieba

import jieba.analyse

result = jieba.analyse.textrank(cleaned_comments, topK=50, withWeight=True)
keywords = dict()
for i in result:
    keywords[i[0]] = i[1]
print('Before removing stopwords:', keywords)


# In[201]:


# Remove stopwords

stopwords = set(STOPWORDS)
with open('./StopWords.txt', encoding='utf8') as f:
    for word in f:                     # each line is one stopword, trailing newline included
        stopwords.add(word.rstrip('\n'))

# stopwords now holds the full stopword set
print(stopwords)
print('------------------------------------')
keywords = {x: keywords[x] for x in keywords if x not in stopwords}
print('After removing stopwords:', keywords)



3. Displaying the result as a word cloud

# In[203]:


import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import  WordCloud


# In[206]:


wordcloud = WordCloud(font_path='simhei.ttf', 
                      background_color='white',
                      max_font_size=80,
                      stopwords=stopwords)


word_frequence = keywords
myword = wordcloud.fit_words(word_frequence)  # fit_words is an alias for generate_from_frequencies


plt.imshow(myword)
plt.axis('off')
plt.show()
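
Note: WordCloud applies the stopwords argument only when it tokenises raw text itself (generate); with explicit frequencies, as in fit_words here, it is ignored, which is why keywords was filtered manually in the previous step.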

4. Complete program (fetches the first ten pages of comments)

# In[211]:


# Imports
import warnings
warnings.filterwarnings('ignore')
import jieba
import jieba.analyse
import numpy
import re
import requests
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup as bs
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import WordCloud, STOPWORDS

# Shared request headers: a browser-like User-Agent avoids the anti-scraping block
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}


# In[223]:


# Scrape the now-playing movie list
def getNowPlayingMovie_list():
    url = 'https://movie.douban.com/nowplaying/ganzhou/'

    r = requests.get(url, headers=headers)
    html_data = r.text
    soup = bs(html_data, 'html.parser')
    nowplaying_movie = soup.find_all('div', id='upcoming')
    nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')
    
    nowplaying_list = []
    for item in nowplaying_movie_list:
        nowplaying_dict = {}
        nowplaying_dict['id'] = item['data-subject']
        for tag_img_item in item.find_all('img'):
            nowplaying_dict['name'] = tag_img_item['alt']
            nowplaying_list.append(nowplaying_dict)
        
    return nowplaying_list


# In[227]:


# Fetch one page of comments for a given movie id
def getCommentsById(movieId, pageNum):
    eachCommentList = []
    if pageNum > 0:
        start = (pageNum - 1) * 20   # Douban paginates comments 20 per page
    else:
        return False
    # e.g. https://movie.douban.com/subject/30216731/comments?start=20&limit=20&sort=new_score&status=F
    requrl = 'https://movie.douban.com/subject/' + movieId + \
             '/comments?start=' + str(start) + '&limit=20&sort=new_score&status=F'
    print(requrl)
    resp = requests.get(requrl, headers=headers)
    html_data = resp.text
    soup = bs(html_data, 'html.parser')

    comment_div_lists = soup.find_all('div', class_='comment')

    for item in comment_div_lists:
        comment = item.find_all('p', class_='')
        c = comment[0].find_all('span')[0].string
        eachCommentList.append(c)

    return eachCommentList
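
A quick smoke test (hypothetical; the subject id comes from the example URL in the comment above):

# In[ ]:


# Fetch page 2 of comments for subject 30216731 and peek at the first few
page2 = getCommentsById('30216731', 2)
print(len(page2), page2[:3])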


# In[228]:


def main():
    commentList = []
    NowPlayingMovie_list = getNowPlayingMovie_list()
    for i in range(10):  # first ten pages of comments for the second movie in the list
        num = i + 1
        commentList_temp = getCommentsById(NowPlayingMovie_list[1]['id'], num)
        commentList.append(commentList_temp)
        
    comments = ''
    for k in range(len(commentList)):
        comments = comments + str(commentList[k]).strip()
    
    # Keep only Chinese text (brackets and quotes from str(list) are dropped too)
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)
    print(cleaned_comments)
    
    # Rank keywords by TextRank weight
    result = jieba.analyse.textrank(cleaned_comments, topK=50, withWeight=True)
    keywords = dict()
    for i in result:
        keywords[i[0]] = i[1]
    print('Before removing stopwords:', keywords)

    # Build the stopword set and filter the keywords
    stopwords = set(STOPWORDS)
    with open('./StopWords.txt', encoding='utf8') as f:
        for word in f:
            stopwords.add(word.rstrip('\n'))
    print(stopwords)
        
    keywords = {x: keywords[x] for x in keywords if x not in stopwords}
    print('After removing stopwords:', keywords)
    
    # Render the word cloud from the filtered frequencies
    wordcloud = WordCloud(font_path='simhei.ttf', 
                          background_color='white',
                          max_font_size=80,
                          stopwords=stopwords)

    word_frequence = keywords
    myword = wordcloud.fit_words(word_frequence)

    plt.imshow(myword)
    plt.axis('off')
    plt.show()
        
        
    
    


# In[229]:


main()