利用線程池進行視頻抓取
上圖是網頁的源碼分析:首頁上 id 爲 listvideoListUl 的 ul 標籤下面有4個 li 標籤,每個 li 標籤內都包含一個視頻播放頁的地址,因此先取出所有 li 標籤,如:li_list=tree.xpath('//ul[@id="listvideoListUl"]/li')
以上是li標籤結構:播放頁的相對地址位於 li 下 div 內 a 標籤的 href 屬性中,與站點域名拼接即可得到視頻播放頁地址:srcurl="https://www.pearvideo.com/"+li.xpath('./div/a/@href')[0]
def get_data(dic) 是交給線程池中各線程調用的下載函數:它依 dic["url"] 下載視頻,並以 dic["name"] 爲文件名保存。
import re
from multiprocessing.dummy import Pool
from urllib.parse import urljoin

import requests
from lxml import etree
# Pear Video "sports" category listing page.
url = "https://www.pearvideo.com/category_9"
# Browser-like User-Agent, passed to requests as a headers dict.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE"}

# Fetch the listing page and parse it into an lxml element tree.
# The timeout keeps an unresponsive server from hanging the script forever.
res = requests.get(url=url, headers=headers, timeout=10).text
tree = etree.HTML(res)
# Every <li> under the "listvideoListUl" list links to one video's play page.
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')

# The real video address is not in the play page's markup; it is assigned in
# inline JavaScript, e.g.
#   srcUrl="https://video.pearvideo.com/mp4/adshort/20190518/cont-1555912-13920965_adpkg-ad_hd.mp4",vdoUrl......
# so it is extracted with a regular expression instead of XPath.
ex = r'srcUrl="(.*?)",vdoUrl'

# One {"url": <video file address>, "name": <output file name>} dict per video.
urls = []
for li in li_list:
    hrefs = li.xpath('./div/a/@href')
    titles = li.xpath('./div/a/div[2]/text()')
    if not hrefs or not titles:
        # Layout differs from what the XPath expects; skip instead of
        # crashing with IndexError.
        print("跳過(未找到播放頁地址或標題)")
        continue

    # urljoin yields the same result as plain concatenation for a relative
    # href and also copes with a leading "/" or an absolute href.
    srcurl = urljoin("https://www.pearvideo.com/", hrefs[0])

    # Build the output file name from the title: drop spaces, then drop
    # characters that are not allowed in file names on common platforms.
    name = titles[0].replace(' ', '')
    name = re.sub(r'[\\/:*?"<>|\r\n\t]', '', name) + '.mp4'

    detail_page = requests.get(url=srcurl, headers=headers, timeout=10).text
    matches = re.findall(ex, detail_page)
    if not matches:
        # The play page did not contain the expected srcUrl assignment.
        print("跳過(未找到視頻地址):", srcurl)
        continue

    urls.append({"url": matches[0], "name": name})
def get_data(dic):
    """Download one video and write it to disk.

    Args:
        dic: Mapping with the keys "url" (address of the video file) and
            "name" (path of the file the video is written to).

    Returns:
        None.

    A failed request (connection error, timeout, or HTTP error status) is
    reported on stdout and the function returns without raising, so one bad
    video does not abort the other downloads. A file that was already
    partially written at that point is left in place. Errors from opening or
    writing the output file are not caught.
    """
    url = dic["url"]
    print("正在下載:", dic['name'])
    try:
        # stream=True avoids loading the whole video into memory; the body is
        # copied to disk chunk by chunk instead.
        with requests.get(url=url, headers=headers, stream=True, timeout=30) as resp:
            # Do not save an HTTP error page under an .mp4 name.
            resp.raise_for_status()
            with open(dic['name'], 'wb') as fp:
                for chunk in resp.iter_content(chunk_size=64 * 1024):
                    fp.write(chunk)
    except requests.RequestException as err:
        print("下載失敗:", dic['name'], err)
        return
    print("下載完成:", dic['name'])
# Download the collected videos concurrently on a pool of 4 worker threads.
pool = Pool(processes=4)
# map() blocks until get_data has been called for every entry in urls.
pool.map(func=get_data, iterable=urls)
# No more tasks will be submitted; the main thread then waits for all
# worker threads to finish before the script ends.
pool.close()
pool.join()