Youku爬蟲抓取視頻

嘗試用 Python 抓取視頻,並且處理 title 中的非法字符,保存爲 mp4。

已經處理了分頁的問題,默認下載爲360p格式。

 

#!/usr/bin/python
from bs4 import BeautifulSoup as bs
from requests.exceptions import ConnectTimeout,ConnectionError
import requests,time,sys,re,queue
import youtube_dl

# Proxy endpoint as "host:port".
proxystr = '127.0.0.1:49705'
# proxystr = ''  # alternative value kept from the original; presumably disables the proxy

# Pieces of the search URL; a results page is requested as
# base + qstring + pagestring + <page number>.
base = "https://www.youku.com/results?search_query="
qstring = "cctv+空中劇院"
pagestring = "&page="

# One shared HTTP session with the proxy configured for https requests.
sess = requests.Session()
sess.proxies = dict(https=proxystr)

# Walk the search-result pages one by one and collect every video link found.
# The loop stops when the site reports "No more results" or when a page
# contains no video anchors at all.
video_urls = queue.Queue()   # FIFO of [video URL, title] pairs
counter = 0                  # number of the results page being fetched
while True:
    counter += 1
    try:
        # (connect, read) timeout: without one, requests waits indefinitely
        # on a stalled proxy and the handler below can never report it.
        response = sess.get(base + qstring + pagestring + str(counter),
                            timeout=(10, 30))
    except (ConnectionError, requests.exceptions.Timeout):
        # Timeout covers both connect and read timeouts.
        print("不能訪問youku 檢查是否已設置代理")
        sys.exit(1)          # non-zero status so callers can detect the failure
    soup = bs(response.text, 'html.parser')

    # Paged past the last page of results.
    no_more_results = soup.find_all('div', attrs={'class': 'display-message'})
    if no_more_results and no_more_results[0].text == "No more results":
        break

    vids = soup.find_all('a', attrs={'class': 'yt-uix-tile-link'})
    if not vids:
        # No video anchors on this page: treat it as the end of the results.
        break

    for v in vids:
        href = v.get('href')
        # Skip anchors without a target; hrefs longer than 20 characters
        # are skipped as well (the original author suspected they are ads).
        if not href or len(href) > 20:
            continue
        # Fall back to the anchor text when the title attribute is absent,
        # instead of crashing with KeyError.
        title = v.get('title') or v.get_text(strip=True)
        video_urls.put(['https://www.youku.com' + href, title])

    print("page:{} size:{}".format(counter, video_urls.qsize()))
    time.sleep(1)            # pause between page requests

# Download the queued videos with youtube_dl, stopping after a small number
# of attempts (this is a test run, not a full download).

# The options are identical for every video, so build them once.
ydl_opts = {
    'format': '[height=360]',          # 360p is good enough
    'outtmpl': '%(title)s.%(ext)s',    # output file name built from the video title
    'proxy': proxystr,
}

counter = 0                            # number of download attempts so far
while not video_urls.empty():
    v_url, title = video_urls.get()
    print(v_url, title)
    # Disabled manual title clean-up kept from the original for reference:
    # pattern = re.compile(r"\||CCTV戲曲| |來自")
    # file_name = re.sub(pattern, "", title).replace("/", "-")

    try:
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            # download() takes a list of URLs.
            ydl.download([v_url])
    except (TimeoutError, ConnectTimeout, ConnectionError):
        print("不能訪問youku 檢查是否已設置代理")
        sys.exit(1)                    # non-zero status so callers can detect the failure
    except youtube_dl.utils.DownloadError as err:
        # youtube_dl reports a failed download (e.g. no matching format) with
        # DownloadError; report it and continue with the next video.
        print('下載失敗: {}'.format(err))
    else:
        print('下載完成')

    counter += 1
    if counter >= 3:
        # Test run: stop after 3 videos (the original `> 3` allowed a fourth).
        break


 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章