使用 Python 爬蟲爬取秒懂百科的視頻

Python 爬蟲抓取百度百科視頻的源代碼

import json
import re
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
# Request headers used when fetching the entry page that contains the secondId.
# NOTE(review): 'br' is advertised in Accept-Encoding; requests only decodes
# brotli bodies when a brotli package is installed -- confirm the environment.
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip,deflate,br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Host": "baike.baidu.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0(WindowsNT10.0;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/71.0.3578.98Safari/537.36",
}

# Request headers used when fetching the video URL: the same values as above
# minus the two page-navigation fields, plus the XMLHttpRequest marker.
header2 = {
    field: value
    for field, value in header.items()
    if field not in ("Cache-Control", "Upgrade-Insecure-Requests")
}
header2["X-Requested-With"] = "XMLHttpRequest"

def getSecondId(name):
    """Return the ``secondId`` found on the Baidu Baike page for *name*.

    The entry page ``https://baike.baidu.com/item/<name>`` is downloaded and
    scanned for ``"secondId":<digits>`` fragments. The second occurrence on
    the page is the one returned.

    Args:
        name: Entry title; it is percent-encoded before being put in the URL.

    Returns:
        The id as a string of one to ten digits.

    Raises:
        ValueError: If the page has fewer than two ``secondId`` fragments or
            the second one carries no digits.
        requests.RequestException: If the HTTP request fails or times out.
    """
    url = "https://baike.baidu.com/item/" + quote(name)
    # A timeout keeps a stalled connection from hanging the caller forever.
    r = requests.get(url=url, headers=header, timeout=30)
    r.encoding = "utf-8"
    # The search deliberately runs on BeautifulSoup's re-serialised markup,
    # not on the raw response text, so the set of matches stays the same as
    # it was before this function was cleaned up.
    page = str(BeautifulSoup(r.text, 'lxml'))
    # Raw string: "\d" in a normal string literal is an invalid escape.
    ids = re.findall(r'"secondId":(\d{0,10})', page)
    if len(ids) < 2 or not ids[1]:
        raise ValueError(
            "no usable secondId found for {!r} ({} candidate(s) on the page)".format(
                name, len(ids)
            )
        )
    second_id = ids[1]
    # Progress output, kept in the same form the script has always printed.
    print('"secondId":' + second_id)
    return second_id


def getUrl(secondId):
    """Return the MP4 URL that the play-URL API reports for *secondId*.

    Calls ``https://baike.baidu.com/api/wikisecond/playurl`` and reads the
    ``"mp4Url"`` string member out of the response body.

    Args:
        secondId: Video id as returned by ``getSecondId``; converted with
            ``str()`` so an integer id is accepted as well.

    Returns:
        The video URL with its JSON escapes (such as ``\\/``) decoded.

    Raises:
        ValueError: If the response contains no ``"mp4Url"`` string member.
        requests.RequestException: If the HTTP request fails or times out.
    """
    # NOTE(review): "t" and "_243463" are fixed values carried over unchanged;
    # presumably browser-generated timestamps -- confirm whether the API
    # actually checks them.
    url = (
        "https://baike.baidu.com/api/wikisecond/playurl?secondId="
        + str(secondId)
        + "&t=1549201158256&_243463=1549201151114"
    )
    # A timeout keeps a stalled connection from hanging the caller forever.
    r = requests.get(url=url, headers=header2, timeout=30)
    body = r.content.decode("utf-8", errors="replace")
    # Capture the complete JSON string literal (quotes included), so commas
    # or escaped characters inside the URL cannot truncate it and no trailing
    # braces leak into the result.
    match = re.search(r'"mp4Url"\s*:\s*("(?:[^"\\]|\\.)*")', body)
    if match is None:
        raise ValueError(
            "no mp4Url in play-URL response for secondId {!r}".format(secondId)
        )
    # json.loads undoes every JSON escape, not only the backslashes.
    video_url = json.loads(match.group(1))
    print(video_url)
    return video_url

def download(url, name):
    """Stream the resource at *url* into the file ``name + '.mp4'``.

    Args:
        url: Address of the video to fetch.
        name: Output path without extension; ``.mp4`` is appended.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: If the request fails or times out.
        OSError: If the output file cannot be opened or written.
    """
    print('start')
    # The context manager releases the streamed connection even when writing
    # fails part-way through.
    with requests.get(url, stream=True, timeout=30) as r:
        # Check the status before creating the file, so an HTTP error page is
        # never saved under a .mp4 name.
        r.raise_for_status()
        with open(name + '.mp4', "wb") as mp4:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    mp4.write(chunk)
    print('download over')
    
    
# Example run: download the video attached to the Baike entry named below.
# NOTE(review): these statements sit at module level with no __main__ guard,
# so importing this file also performs the download -- confirm that is intended.
name="杜鵑"    
# Resolve the entry name to its secondId, then to the video URL, and finally
# save the video as <name>.mp4.
secondId=getSecondId(name)   
url=getUrl(secondId)
download(url,name)  

我在這裏給大家一個百度雲鏈接,提取碼 x62s,裏面有可以直接在 Windows 上運行的工具。界面可能不太友好,請原諒我的美術功底。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章