參考思路

step1：愛奇藝《青春有你2》評論數據爬取(參考鏈接：https://www.iqiyi.com/v_19ryfkiv8w.html#curid=15068699100_9f9bab7e0d1e30c494622af777f4ba39)

爬取任意一期正片視頻下評論
評論條數不少於1000條

step2：詞頻統計並可視化展示

數據預處理：清洗評論中的特殊字符（如：@#￥%、emoji表情符），清洗後的結果存儲爲txt文檔
中文分詞：添加新增詞（如：青你、奧利給、衝鴨），去除停用詞（如：哦、因此、不然、也好、但是）
統計top10高頻詞
可視化展示高頻詞

step3：繪製詞雲

根據詞頻生成詞雲
可選項-添加背景圖片，根據背景圖片輪廓生成詞雲

step4：結合PaddleHub，對評論進行內容審覈

實現環境：linux環境下的 AI Studio 平臺(百度飛槳）

網頁分析

隨便打開一期視頻分析，點擊“更多評論”這裏，沒有可以獲取的相關鏈接。

因此通過Network 對行爲進行檢測，選擇JS,在點擊更多評論同時，捕獲到get_comments.actionXXXX, 在右邊Response 可以觀察到內容，評論的內容就包括在裏面

實踐

1. 配置與準備

中文分詞需要jieba
詞雲繪製需要wordcloud
可視化展示中需要的中文字體
從網上公開資源中找一個中文停用詞表
根據分詞結果自己製作新增詞表
準備一張詞雲背景圖（附加項，不做要求，可用hub摳圖實現）
paddlehub配置

 # On the PaddlePaddle AI Studio online platform (notebook cell):
 !pip install jieba
 !pip install wordcloud
 # On Windows: open a command prompt, activate the target environment, then run:
 pip install jieba
 pip install wordcloud

# Default font directory on Linux
# !ls /usr/share/fonts/
# List the Chinese fonts available on the system in .ttf format
!fc-list :lang=zh | grep ".ttf"


# !wget https://mydueros.cdn.bcebos.com/font/simhei.ttf # Download a Chinese font (if the download fails, upload the font file manually instead)
# Copy the font file into matplotlib's bundled font directory
!cp simhei.ttf /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/
# /usr/share/fonts is not writable on AI Studio, so this does not work: !cp simhei.ttf /usr/share/fonts

# Create a local font directory named .fonts
!mkdir .fonts
# Copy the font file into that directory
!cp simhei.ttf .fonts/
# Remove matplotlib's cache directory (presumably so the font list is rebuilt
# and the newly copied font is picked up -- confirm on your platform)
!rm -rf .cache/matplotlib/

# Install the PaddleHub model that the content-review step loads
!hub install porn_detection_lstm==1.1.0
!pip install --upgrade paddlehub # upgrade paddlehub

2.導入庫

from __future__ import print_function
import requests
import json
import re #正則匹配
import time #時間處理模塊
import jieba #中文分詞
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
from PIL import Image
from wordcloud import WordCloud  #繪製詞雲模塊
import paddlehub as hub

3.模塊功能

3.1 請求接口

def getMovieinfo(url, timeout=10):
    '''
    Request the iQiyi comment endpoint and return the raw response body.

    :param url: full URL of the comment request
    :param timeout: seconds to wait for the server before giving up; without
                    a timeout a stalled connection would block the crawl
                    forever
    :return: the response text when the status code is 200; None for any
             other status code or when the request itself fails
    '''
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        "Accept": "application/json",
        "Referer": "http://m.iqiyi.com/v_19rqriflzg.html",
        "Origin": "http://m.iqiyi.com",
        "Host": "sns-comment.iqiyi.com",
        "Connection": "keep-alive",
        "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6",
        "Accept-Encoding": "gzip, deflate"
    }
    try:
        # The context manager closes the session (and its pooled connections)
        # once the response has been read.
        with requests.Session() as session:
            response = session.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Network failures are reported through the same None value that is
        # already used for non-200 responses.
        print("request failed:", exc)
        return None
    if response.status_code == 200:
        return response.text
    return None

3.2 json數據解析

def saveMovieInfoToFile(lastId,arr):
    '''
    Fetch one page of comments and collect their text.

    :param lastId: id of the last comment already fetched; appended to the
                   request URL as the ``last_id`` query parameter
    :param arr: list that receives the ``content`` string of every comment
                on the page (mutated in place)
    :return: the id (as a string) of the last comment on this page, or the
             unchanged ``lastId`` when the request failed or the page held
             no comments
    '''
    # Replace this URL to crawl a different video.
    url="https://sns-comment.iqiyi.com/v3/comment/get_comments.action?agent_type=118&agent_version=9.11.5&authcookie=null&business_type=17&content_id=15472264800&page=&page_size=20&types=time&last_id="
    url+=str(lastId)
    responseTxt = getMovieinfo(url)
    if responseTxt is None:
        # getMovieinfo signals failure with None; json.loads(None) would
        # raise TypeError, so keep the cursor where it is and let the caller
        # move on.
        return lastId
    responseJson=json.loads(responseTxt) # the response body is JSON; parsing yields a dict
    # Tolerate a payload that lacks 'data' or 'comments'.
    comments=(responseJson.get('data') or {}).get('comments') or []
    for val in comments:
        if 'content' in val:
            arr.append(val['content'])
        # Every comment advances the cursor, even one without text.
        lastId = str(val['id'])
    return lastId

3.3 數據清洗

def clear_special_char(content):
    """
    Strip every character that is not an ASCII letter, an ASCII digit or a
    CJK ideograph in the range U+4E00..U+9FA5.

    This removes punctuation, whitespace, emoji and bracketed emoticon
    markers such as ``[...]``.

    content: a single crawled comment
    return: the cleaned text
    """
    # Inside a character class only the FIRST '^' negates; any later '^' is a
    # literal caret and would be kept in the output, so the class lists the
    # three ranges without repeating '^'.
    comp = re.compile(r'[^A-Za-z0-9\u4e00-\u9fa5]')
    return comp.sub('', content)

補充：以下修正詞典是在觀察分詞結果之後添加的

# Token corrections applied after segmentation: each key is a token as it
# appears in the segmented comments, each value is the replacement.
correct_dict = {
    '安琦': '安崎',
    'a組': "A組",
    'b組': "B組",
}

# Extra stop words to append to the stop-word file (optional step: skip it
# if the stop-word list you use already contains these entries).
new_add=["真的","這","已經","來","看起來","看","而","色","色色","色色色"]

# Read what is already in the file so that re-running this cell does not
# append the same words again.
try:
    with open("stopword.txt", encoding="utf-8") as f:
        existing = {line.strip() for line in f}
except FileNotFoundError:
    existing = set()

# The file is read back as UTF-8 elsewhere, so it must be written as UTF-8
# rather than with the platform default encoding.
with open("stopword.txt", 'a', encoding="utf-8") as f:
    f.writelines(word + "\n" for word in new_add if word not in existing)

3.4 jieba 分詞

def fenci(text):
    """
    Segment text with jieba and apply the corrections in ``correct_dict``.

    text: the sentence or passage to segment
    return: list of tokens, with every token that is a key of
            ``correct_dict`` replaced by the mapped value
    """
    # Load the user dictionary only once: this function is called for every
    # comment line, and reloading re-reads the file each time.
    if not getattr(fenci, "_userdict_loaded", False):
        jieba.load_userdict('add_work.txt') # custom words added to jieba's vocabulary
        fenci._userdict_loaded = True

    return [correct_dict.get(word, word) for word in jieba.lcut(text)]

3.5 創建並去除停用詞表

def stopwordslist(file_path):
    """
    Load the stop-word list from a UTF-8 text file, one word per line.

    file_path: path of the stop-word file
    :return: list with one entry per line, surrounding whitespace (including
             the trailing newline) stripped; blank lines yield empty strings
    """
    # The context manager closes the file even if decoding fails part-way.
    with open(file_path, encoding='UTF-8') as stop_file:
        return [line.strip() for line in stop_file]

def movespword(text,stopwords,counts):
    """
    Count the tokens of one comment, skipping stop words and any token that
    is exactly one character long.

    text: list of tokens produced by segmenting one comment
    stopwords: collection of stop words, tested with ``in``
    counts: dict of token -> frequency, updated in place
    :return: None
    """
    for token in text:
        if token in stopwords:
            continue
        if len(token) == 1:
            # single-character tokens are dropped
            continue
        counts[token] = counts.get(token, 0) + 1
    return None

3.6 繪製詞頻統計表

def drawcounts(counts,top_N):
    """
    Draw a bar chart of the most frequent words.

    counts: dict of word -> frequency
    top_N: number of highest-frequency words to plot
    :return: None; the figure is displayed with ``plt.show()``
    """
    # Highest frequency first, truncated to the requested number of entries.
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)[:top_N]
    xvalue = [word for word, _ in ranked]
    yvalue = [freq for _, freq in ranked]

    # Use a font with Chinese glyphs so that the labels render.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    plt.bar(xvalue, yvalue)
    # Write each frequency just above its bar.
    for word, freq in ranked:
        plt.text(word, freq+0.05, '%d'%freq, ha='center', va='bottom', fontdict={"size":10})
    plt.title('詞頻統計結果')
    plt.xlabel('高頻詞')
    plt.ylabel('詞頻')
    plt.show()

3.7 繪製詞雲圖

def drawclod(counts):
    """
    Render a word cloud from word frequencies and save it to 'pic.png'.

    counts: dict of word -> frequency
    :return: None
    """
    # The mask image must have a white background; a transparent one does
    # not work.
    image=np.array(Image.open("aixin.png"))
    # WordCloud ignores its ``stopwords`` argument when the cloud is built
    # from frequencies, so the unwanted words are removed from the
    # frequencies here instead.
    stopwords={'還是','有點','一直','的話','就是','而且','啊啊啊'}
    frequencies={word: freq for word, freq in counts.items() if word not in stopwords}
    cloud=WordCloud(background_color='white',
                    mask=image,
                    max_words=200,
                    font_path='simhei.ttf',
                    min_font_size=10,
                    max_font_size=100,
                    relative_scaling='auto'
                    )
    cloud.fit_words(frequencies)
    cloud.to_file('pic.png')

3.8 內容審覈

調用模型對評論內容進行審覈，輸出覺得晃晃(huanghuang)的評論

def text_detection(text, file_path):
    '''
    Run the PaddleHub ``porn_detection_lstm`` model over saved comments and
    print the ones it labels 'porn' together with their probability.

    :param text: list that receives the comment lines read from the file
                 (mutated in place); any items already in it are also sent
                 to the model
    :param file_path: path of the UTF-8 text file holding one comment per
                      line
    :return: None; the flagged comments are printed
    '''
    porn_detection_lstm=hub.Module(name='porn_detection_lstm')
    # Read from the path that was passed in, and let the context manager
    # close the file.
    with open(file_path,'r',encoding='utf-8') as f:
        for line in f:
            # Skip blank lines and single-character comments.
            if len(line.strip())<=1:
                continue
            text.append(line)

    if not text:
        # Nothing to review.
        return

    input_dict={'text':text}
    results=porn_detection_lstm.detection(data=input_dict,use_gpu=True,batch_size=64)
    for item in results:
        if item['porn_detection_key'] =='porn':
            print(item['text'],':', item['porn_probs'])

4.執行

num=55 # number of pages to request; the request URL asks for 20 comments per page
lastId='0'
arr=[] # module-level list that collects the raw comment texts

# Crawl page by page; each call returns the cursor for the next page.
for _ in range(num):
    lastId=saveMovieInfoToFile(lastId,arr)
    time.sleep(0.5) # pause between requests
print(len(arr))

error_num=0
# Clean every comment and append the non-empty results to record.txt, one
# comment per line.
with open('record.txt','a',encoding='utf-8') as f:
    for item in arr:
        item=clear_special_char(item)
        if item.strip()!='':
            try:
                f.write(item+"\n")
            except UnicodeEncodeError:
                # Count comments that cannot be encoded instead of aborting
                # the whole run.
                error_num+=1
                continue
print("抓取評論總數：",len(arr))
print("特殊符號評論未處理個數",error_num)
counts={}
# Load the stop words once, outside the per-line loop; a set gives
# constant-time membership tests.
stopwords=set(stopwordslist("stopword.txt"))
with open('record.txt','r',encoding='utf-8') as f:
    for line in f:
        words=fenci(line) # segment one comment
        # drop stop words and accumulate the frequencies into counts
        movespword(words,stopwords,counts)
# counts.items() yields (word, frequency) pairs
drawcounts(counts,10)
drawclod(counts)

display(Image.open('pic.png')) # show the generated word-cloud image (notebook only)

# Run the content review over the saved comments.
file_path='record.txt'
# Module-level list that text_detection() fills with the comment lines it
# reads; it must exist under this name before the call.
test_text=[]
text_detection(test_text, file_path)

總結：

添加分詞的方法：jieba.load_userdict('add_work.txt') #添加用戶自定義的分詞表
數據清洗的規則制定與正則表達式
通過Network 監控網頁行爲

Day5《青春有你2》評論數據爬取與詞雲分析

參考思路

網頁分析

實踐

1. 配置與準備

2.導入庫

3.模塊功能

3.1 請求接口

3.2 json數據解析

3.3 數據清洗

3.4 jieba 分詞

3.5 創建並去除停用詞表

3.6 繪製詞頻統計表

3.7 繪製詞雲圖

3.8 內容審覈

4.執行

總結：

【強化學習】百度Paddle7日打卡營學習心得

Matlab GUI入門-文件顯示界面製作

Matlab GUI入門-直方分佈圖界面製作

Day5《青春有你2》評論數據爬取與詞雲分析

飛槳PaddlePaddle-AI結營心得

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結