# 嘗試用 Python 抓取視頻,並處理 title 中的非法字符,保存爲 mp4。
# 已經處理了分頁的問題,默認下載爲 360p 格式。
#!/usr/bin/python
from bs4 import BeautifulSoup as bs
from requests.exceptions import ConnectTimeout,ConnectionError
import requests,time,sys,re,queue
import youtube_dl
# Pieces of the search URL; a page request is built as
# base + qstring + pagestring + <page number>.
base = "https://www.youku.com/results?search_query="
qstring = "cctv+空中劇院"
pagestring = "&page="

# Local proxy given as host:port.
# Alternative left by the author: proxystr = ''
# (presumably means "no proxy" -- confirm before relying on it).
proxystr = '127.0.0.1:49705'

# One shared HTTP session whose HTTPS traffic goes through the proxy.
sess = requests.Session()
sess.proxies = dict(https=proxystr)

# FIFO queue of [video URL, title] pairs waiting to be downloaded.
video_urls = queue.Queue()
# Walk the paginated search results and queue every video link found.
#
# Reads:  sess, base, qstring, pagestring (search configuration).
# Writes: video_urls (one [url, title] entry per accepted link), counter.
# Exits the process with status 1 if the site cannot be reached.

# Seconds allowed for connecting and for each read of the response.
# Without a timeout, requests can block indefinitely and the timeout
# handler below would never fire.
REQUEST_TIMEOUT = 15

counter = 0  # number of the result page being fetched
while True:
    counter += 1
    try:
        response = sess.get(
            base + qstring + pagestring + str(counter),
            timeout=REQUEST_TIMEOUT,
        )
    except (ConnectTimeout, ConnectionError, requests.exceptions.Timeout):
        print("不能訪問youku 檢查是否已設置代理")
        sys.exit(1)  # non-zero status so callers can see the run failed

    soup = bs(response.text, 'html.parser')

    # A "No more results" message means we have paged past the last page.
    no_more_results = soup.find_all('div', attrs={'class': 'display-message'})
    if no_more_results and no_more_results[0].text.strip() == "No more results":
        break

    vids = soup.find_all('a', attrs={'class': 'yt-uix-tile-link'})
    if not vids:
        break  # no video links on this page: treat it as the end

    for v in vids:
        href = v.get('href')
        # Skip anchors without a target, and hrefs longer than 20 characters
        # (the author suspected the long ones are ads).
        if not href or len(href) > 20:
            continue
        video_urls.put(['https://www.youku.com' + href, v.get('title', '')])

    print("page:{} size:{}".format(counter, video_urls.qsize()))
    time.sleep(1)  # pause between page requests
# Download queued videos with youtube-dl, stopping after a small test batch.
#
# Reads:  video_urls (entries of [url, title]), proxystr.
# Writes: counter; video files in the current directory (named by outtmpl).
# Exits the process with status 1 on a connection failure.

# Test limit: stop after this many downloads.  The original check
# (counter > 3 after the increment) let a fourth download through.
MAX_DOWNLOADS = 3

# The options are the same for every video, so build them once.
ydl_opts = {
    'format': '[height=360]',        # 360p is sufficient
    # Title clean-up in this script is currently disabled; the file name
    # comes from youtube-dl's own %(title)s template field.
    'outtmpl': '%(title)s.%(ext)s',
    'proxy': proxystr,
}

counter = 0  # videos downloaded so far
while not video_urls.empty():
    v_url, title = video_urls.get()
    print(v_url, title)
    try:
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([v_url])  # takes a list of URLs
        print('下載完成')
    # NOTE(review): youtube-dl may report failures through its own
    # DownloadError rather than these types -- confirm and decide whether
    # a failed video should abort the run or be skipped.
    except (TimeoutError, ConnectTimeout, ConnectionError):
        print("不能訪問youku 檢查是否已設置代理")
        sys.exit(1)  # non-zero status so callers can see the run failed
    counter += 1
    if counter >= MAX_DOWNLOADS:
        break