Scraping product reviews from an e-commerce site for word-frequency analysis

 

#!/usr/bin/env python
# coding: utf-8

# In[1]:


from selenium import webdriver
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import os
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from pandas import DataFrame
import re
import jieba
import jieba.analyse as anls
from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
from PIL import Image
import matplotlib.pyplot as plt

# Save the scraped text to a local file (remove any copy left over from a previous run)
storage_file_path=r"E:\jd_comments.txt"
if os.path.exists(storage_file_path):
    os.remove(storage_file_path)

url0="https://item.jd.com/28501887051.html#comment"
#url0="https://item.jd.com/34378572773.html#comment"
url=input("请输入您要读取评论的网页地址:ENTER 默认"+url0)
if url=="":
    url=url0
    
max_page_number=input("请输入需要读取多少页评论:")
max_page_number=int(max_page_number)
print("好的,即将为你读取",max_page_number,"页评论")
    
driver=webdriver.Chrome()
driver.get(url)
    
# Find the comment tab and click it to open the review section
try:
    WebDriverWait(driver,10).until(EC.visibility_of_element_located((By.XPATH,"//li[@data-anchor='#comment']")))
    driver.find_element(By.XPATH,"//li[@data-anchor='#comment']").click()

    # Wait until the first page of comments has appeared, then save its HTML
    try:
        WebDriverWait(driver,10).until(EC.visibility_of_element_located((By.ID,"comment")))
        # (the empty search string below only re-confirms that the element has rendered)
        WebDriverWait(driver,10).until(EC.text_to_be_present_in_element((By.ID,"comment"),""))
        text=driver.find_element(By.ID,"comment").get_attribute("innerHTML")
        #print(text)
        with open(storage_file_path,"a+",encoding="utf-8") as storage_file:
            storage_file.write(text)
    except Exception:
        print("\n!!!!!!!!!!找不到评论文本!!!!!!!!!!\n")

except Exception:
    print("\n!!!!!!!!!!没有找到评论按钮!!!!!!!!!!\n")
        

        
# Page through the remaining comments
for try_time in range(max_page_number):
    try:
        time.sleep(3)
        WebDriverWait(driver,10).until(EC.visibility_of_element_located((By.CLASS_NAME,"ui-pager-next")))
        element=driver.find_element(By.CLASS_NAME,"ui-pager-next")
        # Click the "next page" button via JavaScript (more robust than element.click() when overlays cover the button)
        driver.execute_script("arguments[0].click();", element)

        try:
            WebDriverWait(driver,10).until(EC.visibility_of_element_located((By.ID,"comment")))
            WebDriverWait(driver,10).until(EC.text_to_be_present_in_element((By.ID,"comment"),""))
            # Grab this page's comment HTML and append it to the local file
            text=driver.find_element(By.ID,"comment").get_attribute("innerHTML")
            #print(text)
            with open(storage_file_path,"a+",encoding="utf-8") as storage_file:
                storage_file.write(text)
        except Exception as e:
            print(e)

    except Exception as msg:
        print(msg)
        print("\n",try_time,". !!!!!!!!!!没有下一页评论了!!!!!!!!!!")
        break

driver.quit()

# Read the saved HTML back from the local file
html_text=""
if os.path.exists(storage_file_path):
    with open(storage_file_path,"r",encoding="utf-8") as f:
        html_text=f.read()
bs=BeautifulSoup(html_text,"html.parser")
#print(bs.prettify())        
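# (Note: the file holds the concatenated innerHTML of every page, so it is not a
#  single well-formed HTML document; html.parser is tolerant of that and still
#  exposes all of the comment nodes to the selectors below.)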

print("\n* * * * * 获取到的全部评论 * * * * *\n")
comment_list=[]
comments=bs.select("p[class='comment-con']")
for comment in comments:
    comment_list.append(comment.get_text())
    print(comment.get_text())
#print(comment_list)
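# (Assumption about JD's markup: each review body is a <p class="comment-con"> node.
#  The exact-match attribute selector above would miss <p> tags that carry extra
#  classes; the looser CSS form "p.comment-con" would match those as well.)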

print("\n* * * * * 评论对应订单和时间 * * * * *\n")

comment_order=[]
comment_time=[]

comments=bs.select("div[class='order-info']")
for comment in comments:
    order_time=comment.get_text().split(" ",1)
    if len(order_time)<2:
        continue  # skip entries that don't contain both the order info and a timestamp
    order=order_time[0].strip()
    ctime=order_time[1].strip()
    comment_order.append(order)
    comment_time.append(ctime)
    print(ctime,order)
    
#print(comment_order)
#print(comment_time)


comment_dict= {'id':range(len(comment_list)),'time':comment_time,'order':comment_order,'comment':comment_list}
comment_df= pd.DataFrame(comment_dict)
comment_df.to_excel(r"E:\Jinduoduo_JD_COMMENTS.xlsx",index=False)
print("\n* * * * * 评论已保存到EXCEL表格 E:\\Jinduoduo_JD_COMMENTS.xlsx * * * * *\n")


comments_joined=" ".join(comment_list)

# Strip punctuation (ASCII and full-width) from the joined comment text
reg='[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \,\。\?\!\:\;\‘\【\{\|\《\¥\&\*\》\}\】\…]+'

comments_joined=re.sub(reg,' ',comments_joined)
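# Illustrative example (not part of the original run):
#   re.sub(reg, ' ', "很好,物流快!推荐")  ->  "很好 物流快 推荐"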

keywords=jieba.cut(comments_joined,cut_all=False)
keywords_joined=" ".join(keywords)
print("\n* * * * * 评论中所有关键词汇总 * * * * *\n")
print(keywords_joined)

print("\n* * * * * 关键词重要性权重分布 * * * * *\n")
for x, w in anls.extract_tags(keywords_joined, topK=36, withWeight=True):
    print('%s %s' % (x, w))
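# (extract_tags ranks words by TF-IDF against jieba's built-in IDF table. A custom
#  stop-word list can be applied first with jieba.analyse.set_stop_words(path);
#  the path would be your own file, e.g. r"E:\stopwords.txt" here is only an assumption.)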

print("\n* * * * * 关键词文本频率权重分布 * * * * *\n")
for x, w in anls.extract_tags(keywords_joined, topK=36, withWeight=True):
    print('%s %s' % (x, w))
    
# Visualise the comment word frequencies as a word cloud
fontpath=r"C:\Windows\Fonts\SourceHanSerifCN-Bold.otf"

imgmask=np.array(Image.open(r"E:\jinduoduo.png"))
genclr=ImageColorGenerator(imgmask)

wc=WordCloud(font_path=fontpath,
            background_color="white",
            max_words=100,
            max_font_size=500,
            random_state=50,
            collocations=False,
            mask=imgmask,
            color_func=genclr,
            width=4800,height=4800,margin=2)
wc.generate(keywords_joined)
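# (Optional sketch, not among the original settings: WordCloud also accepts a
#  stopwords set, e.g. WordCloud(..., stopwords={"的","了","是"}), which drops
#  filler words before the frequencies are counted.)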


plt.imshow(wc)
plt.axis("off")
plt.show()

wc.to_file(r"E:\jinduoduo_wordcloud.png")

print("\n* * * * *  词云图已经保存到 E:\\jinduoduo_wordcloud.png * * * * *\n")

