# -*- coding: utf-8 -*-
#@Time :18-9-23 上午11:22
#@Author : LiMeng
#@Email : [email protected]
#@File : yanxigonglvu.py
#Software:PyCharm
import requests
import ppretty
import collections
from wordcloud import WordCloud
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from scipy.misc import imread
import jieba
_SITE_ROOT = 'http://www.tvzn.com/'
_CAST_PAGE_URL = 'http://www.tvzn.com/14784/yanyuanbiao.html'
_REQUEST_TIMEOUT = 10  # seconds; without a timeout requests can block forever


def get():
    """Scrape the cast page and show a word cloud of the lead actors' works.

    Steps, in order:
      1. Download the cast page at ``_CAST_PAGE_URL``.
      2. Follow every lead-actor link and collect the text entries of the
         works list found on each actor page.
      3. Collect the supporting-cast names from the cast page and write them,
         space separated, to ``./data.txt``.
      4. Count how often each collected title occurs and display the counts
         as a word cloud shaped by ``mask.jpeg`` and drawn with
         ``./simhei.ttf``.

    Returns:
        None. The results are the written file and the displayed figure.

    Raises:
        requests.RequestException: if a page cannot be downloaded in time.
        OSError: if ``./data.txt`` cannot be written or the mask image / font
            cannot be read.
    """
    cast_page = _fetch_soup(_CAST_PAGE_URL)
    show_titles = _collect_lead_show_titles(cast_page)
    cast_names = _collect_supporting_names(cast_page)
    _save_names(cast_names, './data.txt')
    _show_word_cloud(collections.Counter(show_titles))


def _fetch_soup(url):
    """Download *url* and return it parsed with the lxml parser."""
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    # Pass the raw bytes: BeautifulSoup then detects the encoding from the
    # document itself, whereas ``response.text`` relies on the HTTP header and
    # can garble Chinese text when the server sends no charset.
    return BeautifulSoup(response.content, 'lxml')


def _collect_lead_show_titles(cast_page):
    """Return the works-list entries of every lead actor linked from *cast_page*."""
    from urllib.parse import urljoin

    titles = []
    lead_list = cast_page.find('ul', attrs={'class': 'gclearfix'})
    if lead_list is None:
        return titles
    for item in lead_list.findAll('li'):
        link = item.find('a', attrs={'class': 'mh-actor'})
        if link is None or not link.get('href'):
            continue  # list entry without an actor link
        # The works are not on the cast page, so each actor page has to be
        # requested separately.
        actor_page = _fetch_soup(urljoin(_SITE_ROOT, link['href']))
        works = actor_page.find(
            'ul',
            attrs={'class': 'tn-avatar-list tn-helper-reset tn-helper-clearfix'},
        )
        if works is None:
            continue  # actor page without a works list
        # stripped_strings drops whitespace-only text nodes, which would
        # otherwise be counted as (very frequent) titles.
        titles.extend(works.stripped_strings)
    return titles


def _collect_supporting_names(cast_page):
    """Return the supporting-cast actor names listed on *cast_page*."""
    names = []
    name_block = cast_page.find('div', attrs={'class': 'mh-name-list'})
    if name_block is None:
        return names
    for item in name_block.findAll('li'):
        for paragraph in item.findAll('p'):
            actor = paragraph.find(attrs={'class': 'mh-actor'})
            if actor is not None:
                names.append(actor.getText())
    return names


def _save_names(names, path):
    """Write *names* to *path*, each one followed by a single space."""
    # Explicit UTF-8: the names are Chinese and the platform default encoding
    # may not be able to represent them.
    with open(path, 'w', encoding='utf-8') as out:
        out.write(''.join(name + ' ' for name in names))


def _show_word_cloud(frequencies):
    """Render *frequencies* (word -> count) as a masked word cloud and show it."""
    mask = imread('mask.jpeg')
    cloud = WordCloud(
        font_path='./simhei.ttf',
        mask=mask,
        background_color='white',
        width=1000,
        height=860,
        margin=2,
    ).generate_from_frequencies(frequencies)
    plt.imshow(cloud)
    plt.axis('off')
    plt.show()
# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    get()