#!/usr/bin/env python
# coding: utf-8
# In[1]:
from selenium import webdriver
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import os
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from pandas import DataFrame
import re
import jieba
import jieba.analyse as anls
from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
from PIL import Image
import matplotlib.pyplot as plt
# Scraped comment HTML is cached in this local file. A copy left over from
# a previous run is deleted first, because later code appends to the file.
storage_file_path = r"E:\jd_comments.txt"
if os.path.exists(storage_file_path):
    os.remove(storage_file_path)

# Default product page; pressing ENTER at the prompt (empty input) selects it.
url0 = "https://item.jd.com/28501887051.html#comment"
# url0 = "https://item.jd.com/34378572773.html#comment"
url = input("请输入您要读取评论的网页地址:ENTER 默认" + url0) or url0

# Number of comment pages requested by the user (int() raises ValueError
# when the answer is not an integer).
max_page_number = int(input("请输入需要读取多少页评论:"))
print("好的,即将为你读取", max_page_number, "页评论")
def _append_comment_html(driver, path):
    """Append the innerHTML of the page's ``#comment`` element to *path*.

    Waits up to 10 seconds for the element to be visible before reading it.
    The file is written as UTF-8 so that characters outside the platform's
    default code page cannot make the write fail.

    NOTE(review): after a "next page" click the container is already visible
    from the previous page, so these waits may return before the new page's
    comments have replaced the old ones -- confirm whether an extra wait
    (e.g. for staleness of an old comment node) is needed.
    """
    WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.ID, "comment")))
    # An empty expected text matches any element text, so this wait only
    # requires that the element can be located and read.
    WebDriverWait(driver, 10).until(
        EC.text_to_be_present_in_element((By.ID, "comment"), ""))
    text = driver.find_element(By.ID, "comment").get_attribute("innerHTML")
    with open(path, "a+", encoding="utf-8") as storage_file:
        storage_file.write(text)


driver = webdriver.Chrome()
try:
    driver.get(url)

    # Locate the comment tab and click it to open the comment section.
    try:
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located(
                (By.XPATH, "//li[@data-anchor='#comment']")))
        # find_element(By..., ...) is used instead of the find_element_by_*
        # helpers, which newer Selenium releases no longer provide.
        driver.find_element(By.XPATH, "//li[@data-anchor='#comment']").click()
    except Exception as exc:
        print(exc)
        print("\n!!!!!!!!!!没有找到评论按钮!!!!!!!!!!\n")
    else:
        # Once the first page of comments is shown, save it.
        try:
            _append_comment_html(driver, storage_file_path)
        except Exception as exc:
            print(exc)
            print("\n!!!!!!!!!!找不到评论文本!!!!!!!!!!\n")

    # Walk through the following pages. The first page was already saved
    # above, so only max_page_number - 1 further pages are needed to reach
    # the number of pages the user asked for.
    for try_time in range(max_page_number - 1):
        try:
            time.sleep(3)
            WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located(
                    (By.CLASS_NAME, "ui-pager-next")))
            element = driver.find_element(By.CLASS_NAME, "ui-pager-next")
            # Click the "next page" button through JavaScript.
            driver.execute_script("arguments[0].click();", element)
        except Exception as msg:
            print(msg)
            print("\n", try_time, ". !!!!!!!!!!没有下一页评论了!!!!!!!!!!")
            break
        try:
            _append_comment_html(driver, storage_file_path)
        except Exception as e:
            # A failure on one page is reported; paging continues.
            print(e)
finally:
    # Close the browser exactly once, even if something above failed.
    driver.quit()

# Read the saved HTML back from the local file (same encoding as written).
html_text = ""
if os.path.exists(storage_file_path):
    with open(storage_file_path, "r", encoding="utf-8") as f:
        html_text = f.read()
# Parse the saved HTML and collect comment text, order info and timestamps.
bs = BeautifulSoup(html_text, "html.parser")
# print(bs.prettify())

print("\n* * * * * 获取到的全部评论 * * * * *\n")
comment_list = []
comments = bs.select("p[class='comment-con']")
for comment in comments:
    comment_text = comment.get_text()
    comment_list.append(comment_text)
    print(comment_text)

print("\n* * * * * 评论对应订单和时间 * * * * *\n")
comment_order = []
comment_time = []
comments = bs.select("div[class='order-info']")
for comment in comments:
    # Split at the first space: the part before it is kept as the order
    # info, the remainder as the time. partition() yields an empty
    # remainder when the text contains no space, where indexing the result
    # of split(" ", 1) would raise IndexError.
    order, _, ctime = comment.get_text().partition(" ")
    order = order.strip()
    ctime = ctime.strip()
    comment_order.append(order)
    comment_time.append(ctime)
    print(ctime, order)

# Build the table once and export it to Excel.
# NOTE(review): the two selectors are assumed to match the same number of
# elements; the DataFrame columns must have equal length -- confirm.
comment_dict = {
    'id': range(len(comment_list)),
    'time': comment_time,
    'order': comment_order,
    'comment': comment_list,
}
comment_df = pd.DataFrame(comment_dict)
comment_df.to_excel(r"E:\Jinduoduo_JD_COMMENTS.xlsx", index=False)
print("\n* * * * * 评论已保存到EXCEL表格 E:\\Jinduoduo_JD_COMMENTS.xlsx * * * * *\n")
comments_joined = " ".join(comment_list)

# Replace every run of punctuation (and spaces) with a single space.
# The character class is built with re.escape() from a plain list of
# characters, so the pattern contains no invalid string escape sequences
# and no accidental ranges; it matches the same characters as before.
punctuation_chars = '’!"#$%&\'()*+,-./:;<=>?@[]^_`{|}~ 。‘【《¥》】…'
reg = "[" + re.escape(punctuation_chars) + "]+"
comments_joined = re.sub(reg, ' ', comments_joined)

# Segment the cleaned text into words (cut_all=False) and re-join them
# with spaces.
keywords = jieba.cut(comments_joined, cut_all=False)
keywords_joined = " ".join(keywords)
print("\n* * * * * 评论中所有关键词汇总 * * * * *\n")
print(keywords_joined)

# Both report sections below print the same extract_tags() ranking, so it
# is computed once and reused.
# NOTE(review): the two headings suggest two different weightings were
# intended (possibly anls.textrank for one of them) -- confirm.
keyword_weights = anls.extract_tags(keywords_joined, topK=36, withWeight=True)

print("\n* * * * * 关键词重要性权重分布 * * * * *\n")
for x, w in keyword_weights:
    print('%s %s' % (x, w))

print("\n* * * * * 关键词文本频率权重分布 * * * * *\n")
for x, w in keyword_weights:
    print('%s %s' % (x, w))
# Visualise the comment keywords as a word cloud.
fontpath = r"C:\Windows\Fonts\SourceHanSerifCN-Bold.otf"

# The same image array is used as the cloud's mask and as the input of the
# colour function.
imgmask = np.array(Image.open(r"E:\jinduoduo.png"))
genclr = ImageColorGenerator(imgmask)

cloud_options = {
    "font_path": fontpath,
    "background_color": "white",
    "max_words": 100,
    "max_font_size": 500,
    "random_state": 50,
    "collocations": False,
    "mask": imgmask,
    "color_func": genclr,
    "width": 4800,
    "height": 4800,
    "margin": 2,
}
wc = WordCloud(**cloud_options)
wc.generate(keywords_joined)

# Show the cloud without axes, then write it to disk.
plt.imshow(wc)
plt.axis("off")
plt.show()

wc.to_file(r"E:\jinduoduo_wordcloud.png")
print("\n* * * * * 词云图已经保存到 E:\\jinduoduo_wordcloud.png * * * * *\n")
# In[ ]: