本文介紹如何使用 pyquery 庫(類似 jQuery 的 Python 庫)爬取豆瓣的影視信息。
一:代碼
from urllib.request import quote
from pyquery import PyQuery as pq
import requests
import pandas as pd
def get_text_page(movie_name):
    """Fetch the Douban search-results page for a movie name.

    Args:
        movie_name: Search keyword. It is appended to the query string
            as-is; this function performs no URL escaping itself.

    Returns:
        The response body of the search page, as text.
    """
    response = requests.get(
        'https://www.douban.com/search?q=' + movie_name,
        headers={
            'Host': 'www.douban.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
        },
        timeout=5,
    )
    return response.text
def get_last_url(this_text):
    """Pick the first result link out of a search-results page.

    Args:
        this_text: HTML source of the search-results page.

    Returns:
        The ``href`` of the first element matching ``.title a``, or an
        empty string when the page contains no such element.
    """
    anchors = pq(this_text)('.title a').items()
    # Only the first match is of interest; later matches are ignored.
    first_anchor = next(iter(anchors), None)
    if first_anchor is None:
        return ''
    return first_anchor.attr.href
def the_last_page(this_url):
    """Fetch the HTML source of the final movie page.

    Args:
        this_url: Address of the page to download.

    Returns:
        The response body of that page, as text.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
    }
    response = requests.get(this_url, headers=browser_headers, timeout=20)
    return response.text
def _download_to_file(url, path, timeout=20):
    """Download *url* and write the raw response body to *path*."""
    response = requests.get(url, timeout=timeout)
    with open(path, 'wb') as f:
        f.write(response.content)


def the_last_text(this_text, movie_name):
    """Extract the details of a movie page and save them to disk.

    Files are written to the current working directory:

    * ``<movie_name>.jpg`` - the poster image (skipped when the page has
      no poster URL).
    * ``<movie_name>.txt`` - title, info block, rating block and related
      info, separated by blank lines, encoded as UTF-8.
    * ``<actor>.jpg`` - one image per cast entry, on a best-effort basis.

    Args:
        this_text: HTML source of the movie page.
        movie_name: Name used as the stem of the poster and text files.

    Returns:
        None.

    Raises:
        requests.RequestException: If downloading the poster fails.
        OSError: If the poster or the text file cannot be written.
    """
    doc = pq(this_text)

    title = doc('#content h1').text()

    # Poster: only download when the page actually provides an image URL,
    # otherwise requests would fail on a missing URL.
    photo_url = doc('.nbgnbg img').attr.src
    if photo_url:
        _download_to_file(photo_url, '{m}.jpg'.format(m=movie_name))

    message = doc('#info').text()           # movie information block
    grade = doc('#interest_sectl').text()   # rating block
    things = doc('.related-info').text()    # synopsis / related info

    # Explicit UTF-8 keeps the output independent of the platform's default
    # codec, and a single write cannot leave duplicated sections behind.
    with open('{0}.txt'.format(movie_name), 'w', encoding='utf-8') as f:
        f.write(title + '\n\n\n' + message + '\n\n' + grade + '\n\n' + things)

    # Cast members.
    # NOTE(review): this assumes the `.info` elements appear in the same
    # order as the `.avatar` elements under `#celebrities` - confirm against
    # the live page markup.
    names = [item.text() for item in doc('.info').items()]
    avatars = doc('#celebrities').find('.avatar').items()

    # zip pairs the i-th name with the i-th avatar, so one failed download
    # cannot shift the names of the remaining actors.
    for person, avatar in zip(names, avatars):
        style = avatar.attr('style') or ''
        # The image address sits between the parentheses of the style value.
        start = style.find('(')
        end = style.find(')', start + 1)
        if start == -1 or end == -1:
            continue
        person_download_url = style[start + 1:end]
        try:
            _download_to_file(
                person_download_url, '{name}.jpg'.format(name=person))
        except (requests.RequestException, OSError):
            # Best effort: a failed download or an unusable file name only
            # skips this actor.
            continue
def lookUrl(this_text, my_str):
    """Collect the watch links from a movie page and save them as CSV.

    Every element matching ``.bs li a`` contributes one row: its text as
    the platform name and its ``href`` as the watch address. The table is
    written, without an index column, to ``<my_str>的觀看地址.csv`` in the
    current working directory using the ``utf_8_sig`` encoding.

    Args:
        this_text: HTML source of the movie page.
        my_str: Movie name used to build the CSV file name.

    Returns:
        None.
    """
    links = list(pq(this_text)('.bs li a').items())
    table = pd.DataFrame({
        '觀看平臺': [link.text() for link in links],
        '觀看地址': [link.attr.href for link in links],
    })
    csv_path = "{movie_name}的觀看地址.csv".format(movie_name=my_str)
    table.to_csv(csv_path, index=False, encoding='utf_8_sig', sep=',')
def main():
    """Look up one movie on Douban and save its details and watch links.

    Reads a movie name from standard input, fetches the Douban search
    page for it, follows the first result link, and passes the resulting
    page to ``the_last_text`` and ``lookUrl``. When the search page yields
    no link, a message is printed and nothing further is fetched.
    """
    my_str = input('')
    # The search URL needs the percent-encoded form of the name; the raw
    # name is kept for building output file names.
    page_text = get_text_page(quote(my_str))
    last_url = get_last_url(page_text)
    if not last_url:
        # Without a result link there is no page to request.
        print('No Douban search result found for: {0}'.format(my_str))
        return
    page_text2 = the_last_page(last_url)
    the_last_text(page_text2, my_str)   # save poster, text and actor images
    lookUrl(page_text2, my_str)         # save the watch links as CSV
if __name__ == '__main__':
    # Run the scraper only when executed as a script, so importing this
    # module does not prompt for input or touch the network.
    main()
二:結果如下(部分例子)
1.輸入天氣之子
2.輸入百變小櫻魔法卡
注意:只有已經上映的電影纔會有觀看地址。