Scraping product reviews from an e-commerce site for word-frequency analysis

 

#!/usr/bin/env python
# coding: utf-8

# In[1]:


from selenium import webdriver
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import os
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from pandas import DataFrame
import re
import jieba
import jieba.analyse as anls
from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
from PIL import Image
import matplotlib.pyplot as plt

# Save the scraped text to a local file (remove any copy left over from a previous run)
storage_file_path=r"E:\jd_comments.txt"
if os.path.exists(storage_file_path):
    os.remove(storage_file_path)

url0="https://item.jd.com/28501887051.html#comment"
#url0="https://item.jd.com/34378572773.html#comment"
url=input("请输入您要读取评论的网页地址:ENTER 默认"+url0)
if url=="":
    url=url0
    
max_page_number=input("请输入需要读取多少页评论:")
max_page_number=int(max_page_number)
print("好的,即将为你读取",max_page_number,"页评论")
    
driver=webdriver.Chrome()
driver.get(url)
    
# Find the comment tab and click it to open the review section
try:
    WebDriverWait(driver,10).until(EC.visibility_of_element_located((By.XPATH,"//li[@data-anchor='#comment']")))
    driver.find_element(By.XPATH,"//li[@data-anchor='#comment']").click()

    # Wait until the first page of comments has appeared, then save its HTML
    try:
        WebDriverWait(driver,10).until(EC.visibility_of_element_located((By.ID,"comment")))
        # (the empty search string below only re-confirms that the element has rendered)
        WebDriverWait(driver,10).until(EC.text_to_be_present_in_element((By.ID,"comment"),""))
        text=driver.find_element(By.ID,"comment").get_attribute("innerHTML")
        #print(text)
        with open(storage_file_path,"a+",encoding="utf-8") as storage_file:
            storage_file.write(text)
    except Exception:
        print("\n!!!!!!!!!!找不到评论文本!!!!!!!!!!\n")

except Exception:
    print("\n!!!!!!!!!!没有找到评论按钮!!!!!!!!!!\n")
        

        
# Page through the remaining comments
for try_time in range(max_page_number):
    try:
        time.sleep(3)
        WebDriverWait(driver,10).until(EC.visibility_of_element_located((By.CLASS_NAME,"ui-pager-next")))
        element=driver.find_element(By.CLASS_NAME,"ui-pager-next")
        # Click the "next page" button via JavaScript (more robust than element.click() when overlays cover the button)
        driver.execute_script("arguments[0].click();", element)

        try:
            WebDriverWait(driver,10).until(EC.visibility_of_element_located((By.ID,"comment")))
            WebDriverWait(driver,10).until(EC.text_to_be_present_in_element((By.ID,"comment"),""))
            # Grab this page's comment HTML and append it to the local file
            text=driver.find_element(By.ID,"comment").get_attribute("innerHTML")
            #print(text)
            with open(storage_file_path,"a+",encoding="utf-8") as storage_file:
                storage_file.write(text)
        except Exception as e:
            print(e)

    except Exception as msg:
        print(msg)
        print("\n",try_time,". !!!!!!!!!!没有下一页评论了!!!!!!!!!!")
        break

driver.quit()

# Read the saved HTML back from the local file
html_text=""
if os.path.exists(storage_file_path):
    with open(storage_file_path,"r",encoding="utf-8") as f:
        html_text=f.read()
bs=BeautifulSoup(html_text,"html.parser")
#print(bs.prettify())        
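# (Note: the file holds the concatenated innerHTML of every page, so it is not a
#  single well-formed HTML document; html.parser is tolerant of that and still
#  exposes all of the comment nodes to the selectors below.)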

print("\n* * * * * 获取到的全部评论 * * * * *\n")
comment_list=[]
comments=bs.select("p[class='comment-con']")
for comment in comments:
    comment_list.append(comment.get_text())
    print(comment.get_text())
#print(comment_list)
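# (Assumption about JD's markup: each review body is a <p class="comment-con"> node.
#  The exact-match attribute selector above would miss <p> tags that carry extra
#  classes; the looser CSS form "p.comment-con" would match those as well.)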

print("\n* * * * * 评论对应订单和时间 * * * * *\n")

comment_order=[]
comment_time=[]

comments=bs.select("div[class='order-info']")
for comment in comments:
    order_time=comment.get_text().split(" ",1)
    if len(order_time)<2:
        continue  # skip entries that don't contain both the order info and a timestamp
    order=order_time[0].strip()
    ctime=order_time[1].strip()
    comment_order.append(order)
    comment_time.append(ctime)
    print(ctime,order)
    
#print(comment_order)
#print(comment_time)


comment_dict= {'id':range(len(comment_list)),'time':comment_time,'order':comment_order,'comment':comment_list}
comment_df= pd.DataFrame(comment_dict)
comment_df.to_excel(r"E:\Jinduoduo_JD_COMMENTS.xlsx",index=False)
print("\n* * * * * 评论已保存到EXCEL表格 E:\\Jinduoduo_JD_COMMENTS.xlsx * * * * *\n")


comments_joined=" ".join(comment_list)

# Strip punctuation (ASCII and full-width) from the joined comment text
reg='[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \,\。\?\!\:\;\‘\【\{\|\《\¥\&\*\》\}\】\…]+'

comments_joined=re.sub(reg,' ',comments_joined)
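# Illustrative example (not part of the original run):
#   re.sub(reg, ' ', "很好,物流快!推荐")  ->  "很好 物流快 推荐"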

keywords=jieba.cut(comments_joined,cut_all=False)
keywords_joined=" ".join(keywords)
print("\n* * * * * 评论中所有关键词汇总 * * * * *\n")
print(keywords_joined)

print("\n* * * * * 关键词重要性权重分布 * * * * *\n")
for x, w in anls.extract_tags(keywords_joined, topK=36, withWeight=True):
    print('%s %s' % (x, w))
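# (extract_tags ranks words by TF-IDF against jieba's built-in IDF table. A custom
#  stop-word list can be applied first with jieba.analyse.set_stop_words(path);
#  the path would be your own file, e.g. r"E:\stopwords.txt" here is only an assumption.)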

print("\n* * * * * 关键词文本频率权重分布 * * * * *\n")
for x, w in anls.extract_tags(keywords_joined, topK=36, withWeight=True):
    print('%s %s' % (x, w))
    
# Visualise the comment word frequencies as a word cloud
fontpath=r"C:\Windows\Fonts\SourceHanSerifCN-Bold.otf"

imgmask=np.array(Image.open(r"E:\jinduoduo.png"))
genclr=ImageColorGenerator(imgmask)

wc=WordCloud(font_path=fontpath,
            background_color="white",
            max_words=100,
            max_font_size=500,
            random_state=50,
            collocations=False,
            mask=imgmask,
            color_func=genclr,
            width=4800,height=4800,margin=2)
wc.generate(keywords_joined)
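# (Optional sketch, not among the original settings: WordCloud also accepts a
#  stopwords set, e.g. WordCloud(..., stopwords={"的","了","是"}), which drops
#  filler words before the frequencies are counted.)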


plt.imshow(wc)
plt.axis("off")
plt.show()

wc.to_file(r"E:\jinduoduo_wordcloud.png")

print("\n* * * * *  词云图已经保存到 E:\\jinduoduo_wordcloud.png * * * * *\n")

