爬蟲爬取電商網站商品評論進行詞頻分析

 

#!/usr/bin/env python
# coding: utf-8

# In[1]:


from selenium import webdriver
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import os
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from pandas import DataFrame
import re
import jieba
import jieba.analyse as anls
from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
from PIL import Image
import matplotlib.pyplot as plt

# Raw HTML of every scraped comment page is appended to this local file.
storage_file_path=r"E:\jd_comments.txt"
# Start each run from a clean file. The removal is attempted directly and only
# the "file does not exist" case is ignored, which avoids the gap between an
# existence check and the actual delete.
try:
    os.remove(storage_file_path)
except FileNotFoundError:
    pass

# Default product page used when the user just presses ENTER.
url0="https://item.jd.com/28501887051.html#comment"
# Alternative sample product page:
#url0="https://item.jd.com/34378572773.html#comment"
# Surrounding whitespace is stripped so a pasted URL with a trailing space
# still works and a blank-only answer falls back to the default.
url=input("請輸入您要讀取評論的網頁地址:ENTER 默認"+url0).strip()
if not url:
    url=url0

# Keep asking until the answer is a non-negative integer; a typo used to
# abort the whole script with a ValueError from int().
while True:
    answer=input("請輸入需要讀取多少頁評論:").strip()
    try:
        max_page_number=int(answer)
    except ValueError:
        max_page_number=-1
    if max_page_number>=0:
        break
    print("請輸入一個非負整數")
print("好的,即將爲你讀取",max_page_number,"頁評論")
    
def _append_comment_html(driver, path, timeout=10):
    """Wait for the ``#comment`` element and append its inner HTML to *path*.

    Any exception raised by the Selenium waits or lookups (for example when
    the element does not become visible within *timeout* seconds) or by the
    file write is propagated, so the caller decides how to report it.
    """
    locator=(By.ID,"comment")
    WebDriverWait(driver,timeout).until(EC.visibility_of_element_located(locator))
    # The empty needle is contained in any text, so this second wait only
    # confirms that the element's text can be read.
    WebDriverWait(driver,timeout).until(EC.text_to_be_present_in_element(locator,""))
    text=driver.find_element(*locator).get_attribute("innerHTML")
    # Explicit UTF-8: the platform default codec may be unable to encode some
    # characters found in comments, which would make the write fail.
    with open(path,"a",encoding="utf-8") as storage_file:
        storage_file.write(text)


driver=webdriver.Chrome()
try:
    driver.get(url)

    # Find the comment tab and click it to open the comment list.
    comment_tab=(By.XPATH,"//li[@data-anchor='#comment']")
    try:
        WebDriverWait(driver,10).until(EC.visibility_of_element_located(comment_tab))
        # find_element(By, value) replaces the find_element_by_* helpers,
        # which are no longer available in Selenium 4.
        driver.find_element(*comment_tab).click()
    except Exception as err:
        print(err)
        print("\n!!!!!!!!!!沒有找到評論按鈕!!!!!!!!!!\n")
    else:
        # Store the first page of comments once it is displayed.
        try:
            _append_comment_html(driver,storage_file_path)
        except Exception as err:
            print(err)
            print("\n!!!!!!!!!!找不到評論文本!!!!!!!!!!\n")

    # Walk through the following pages. The first page has already been
    # handled above, so "next" is clicked max_page_number-1 times to end up
    # with the number of pages the user asked for.
    next_button=(By.CLASS_NAME,"ui-pager-next")
    for try_time in range(max_page_number-1):
        try:
            time.sleep(3)
            WebDriverWait(driver,10).until(EC.visibility_of_element_located(next_button))
            element=driver.find_element(*next_button)
            # Click the "next page" button through JavaScript.
            driver.execute_script("arguments[0].click();", element)
        except Exception as msg:
            print(msg)
            print("\n",try_time,". !!!!!!!!!!沒有下一頁評論了!!!!!!!!!!")
            break

        # NOTE(review): the comment element is already present from the
        # previous page, so these waits may pass before the new page has
        # finished loading - confirm against the live site.
        try:
            _append_comment_html(driver,storage_file_path)
        except Exception as e:
            print(e)
finally:
    # Always close the browser, including when an unexpected error escapes.
    driver.quit()

# Read the saved HTML back from the local file, using the same encoding it
# was written with.
html_text=""
if os.path.exists(storage_file_path):
    with open(storage_file_path,"r",encoding="utf-8") as f:
        html_text=f.read()
# Parse the accumulated HTML and pull the text out of every <p> element whose
# class attribute is exactly "comment-con".
bs=BeautifulSoup(html_text,"html.parser")

print("\n* * * * * 獲取到的全部評論 * * * * *\n")
comments=bs.select("p[class='comment-con']")
comment_list=[node.get_text() for node in comments]
# Echo the collected comments in document order.
for comment_text in comment_list:
    print(comment_text)
#print(comment_list)

print("\n* * * * * 評論對應訂單和時間 * * * * *\n")

# Parallel lists: one order description and one time string per
# "order-info" element.
comment_order=[]
comment_time=[]

comments=bs.select("div[class='order-info']")
for comment in comments:
    # Everything before the first space is kept as the order description and
    # the remainder as the time. partition() yields an empty remainder when
    # there is no space, where indexing the result of split() raised
    # IndexError.
    order,_sep,ctime=comment.get_text().partition(" ")
    order=order.strip()
    ctime=ctime.strip()
    comment_order.append(order)
    comment_time.append(ctime)
    print(ctime,order)
# Number of order-info entries processed (previously counted inside the loop).
n=len(comments)
    
#print(comment_order)
#print(comment_time)


# Assemble one row per comment and export the table to Excel. The dict and
# DataFrame were previously built twice by identical statements; a single
# construction is enough.
# NOTE(review): comment_list and the order/time lists come from two different
# selectors; pandas is expected to reject the dict if their lengths differ -
# confirm the page always yields matching counts.
comment_dict= {'id':range(len(comment_list)),'time':comment_time,'order':comment_order,'comment':comment_list}
comment_df= pd.DataFrame(comment_dict)
comment_df.to_excel(r"E:\Jinduoduo_JD_COMMENTS.xlsx",index=False)
print("\n* * * * * 評論已保存到EXCEL表格 E:\\Jinduoduo_JD_COMMENTS.xlsx * * * * *\n")


comments_joined=" ".join(comment_list)

# Strip punctuation from the text. The character class is built with
# re.escape() from a plain list of characters, so no regex escapes have to be
# hand-written inside a string literal (the previous literal relied on invalid
# escape sequences such as "\," that newer Python versions warn about).
punctuation_chars=(
    # ASCII punctuation (now including the backslash) and the space.
    "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ "
    # Non-ASCII punctuation that the original pattern already removed.
    "’‘。【《¥》】…"
    # Full-width punctuation common in Chinese text that was not covered.
    ",!?:;、()“”"
)
reg="["+re.escape(punctuation_chars)+"]+"

# Every run of punctuation/spaces collapses into a single space.
comments_joined=re.sub(reg,' ',comments_joined)

# Segment the cleaned text into words and join them with spaces.
keywords=jieba.cut(comments_joined,cut_all=False)
keywords_joined=" ".join(keywords)
print("\n* * * * * 評論中所有關鍵詞彙總 * * * * *\n")
print(keywords_joined)

# Both listings below show the result of the same extract_tags() call, so it
# is computed once and reused.
# NOTE(review): the second heading suggests a different weighting was intended
# (the original code repeated the identical call) - confirm with the author.
keyword_weights=anls.extract_tags(keywords_joined, topK=36, withWeight=True)

print("\n* * * * * 關鍵詞重要性權重分佈 * * * * *\n")
for x, w in keyword_weights:
    print('%s %s' % (x, w))

print("\n* * * * * 關鍵詞文本頻率權重分佈 * * * * *\n")
for x, w in keyword_weights:
    print('%s %s' % (x, w))
    
# Visualise the keyword frequencies as a word cloud: the picture at
# E:\jinduoduo.png is used both as the mask and as the colour source.
fontpath=r"C:\Windows\Fonts\SourceHanSerifCN-Bold.otf"

mask_picture=Image.open(r"E:\jinduoduo.png")
imgmask=np.array(mask_picture)
genclr=ImageColorGenerator(imgmask)

# NOTE(review): wordcloud's documentation says width/height are ignored when a
# mask is supplied - confirm; they are passed through unchanged here.
wordcloud_options=dict(
    font_path=fontpath,
    background_color="white",
    max_words=100,
    max_font_size=500,
    random_state=50,
    collocations=False,
    mask=imgmask,
    color_func=genclr,
    width=4800,
    height=4800,
    margin=2,
)
wc=WordCloud(**wordcloud_options)
wc.generate(keywords_joined)

# Show the cloud on screen without axes, then write it to disk.
plt.imshow(wc)
plt.axis("off")
plt.show()

wc.to_file(r"E:\jinduoduo_wordcloud.png")

print("\n* * * * *  詞雲圖已經保存到 E:\\jinduoduo_wordcloud.png * * * * *\n")


# In[ ]:




 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章