有許多專門用於爬取 YouTube 視頻的 Python 包,本次選擇 youtube_dl,直接執行 pip install youtube_dl 即可安裝。
YouTube 有一個包含各種視頻分類的模塊: https://research.google.com/youtube8m/explore.html
現根據關鍵詞爬取其下的全部視頻:
import hashlib
import logging
import os
import re
import socket
import time
from multiprocessing import Pool
from threading import Thread

import pandas as pd
import requests
import youtube_dl

# Default timeout (seconds) for sockets that do not set their own.
socket.setdefaulttimeout(20)

keyword = "Waterfall"  # Category keyword whose videos are downloaded.

# Root folder for the log file and the per-keyword video folders.
BASE_DIR = r"E:\youtube"

# Explicit per-request timeout (seconds) passed to every requests.get call.
REQUEST_TIMEOUT = 20

# Headers sent with every request to the YouTube-8M storage endpoints.
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
    "Referer": "https://research.google.com/youtube8m/explore.html",
}

# Captures everything between the first "[" and the last "]" on a line.
_ID_LIST_RE = re.compile(r"\[(.*)\]")

# The log file lives in BASE_DIR, so the folder must exist before
# basicConfig opens the file. This stays at module level so that worker
# processes which re-import the module get the same logging setup.
os.makedirs(BASE_DIR, exist_ok=True)
log_name = os.path.join(BASE_DIR, "{}_log.txt".format(keyword))
logging.basicConfig(filename=log_name, level=logging.INFO,
                    format="%(levelname)s:%(asctime)s:%(message)s")


def get_nameCode_dict():
    """Build a mapping from category keyword to its code.

    Reads 'train-histogram-min.csv' (no header row, gbk-encoded) from the
    current working directory. Column 1 is used as the code and column 2 as
    the keyword; the "/m/" substring is removed from each code.

    :return: dict mapping keyword -> code without "/m/"
    """
    data = pd.read_csv('train-histogram-min.csv', header=None, sep=",", encoding="gbk")
    return {name: code.replace("/m/", "") for code, name in zip(data[1], data[2])}


def _parse_video_ids(ids_str):
    """Split the comma-separated id list into bare ids (no brackets/quotes)."""
    video_ids = []
    for token in ids_str.split(","):
        video_id = token.strip().strip("[]").strip().strip("\"'")
        if video_id:
            video_ids.append(video_id)
    return video_ids


def get_str(id_url):
    """Collect the YouTube watch URLs of every video listed at *id_url*.

    Fetches the id list at *id_url*, then issues one lookup request per id
    to obtain the corresponding YouTube video id. Lookups that fail (network
    error or HTTP error status) are reported on stdout and skipped.

    :param id_url: URL of the per-keyword id list
    :return: list of "https://www.youtube.com/watch?v=..." URLs; empty when
             the response contains no bracketed id list
    :raises requests.RequestException: if the id list itself cannot be fetched
    """
    print('starting download code, It takes some time,Please wait a moment......')
    response = requests.get(id_url, headers=_HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    match = _ID_LIST_RE.search(response.text)
    if match is None:
        logging.error("no video id list found at %s", id_url)
        return []

    video_ids = _parse_video_ids(match.group(1))
    print("the keyword has video numbers:", len(video_ids))

    links = []
    # Downloads everything; slice video_ids (e.g. video_ids[:500]) to fetch
    # only a subset.
    for video_id in video_ids:
        url = "https://storage.googleapis.com/data.yt8m.org/2/j/i/{0}/{1}.js".format(
            video_id[:2], video_id)
        try:
            resp = requests.get(url, headers=_HEADERS, timeout=REQUEST_TIMEOUT)
            # Without this an error page would be parsed into a bogus URL.
            resp.raise_for_status()
        except requests.RequestException as e:
            print("Error:", e)
            continue
        # The YouTube id is the last comma-separated argument, up to the
        # first ")" and wrapped in quotes.
        youtube_id = resp.text.split(",")[-1].split(")")[0].strip().strip("\"'")
        if youtube_id:
            links.append("https://www.youtube.com/watch?v={}".format(youtube_id))
    return links


def download_video(url, keyword):
    """Download one video into the folder for *keyword* under BASE_DIR.

    The output file is named after the MD5 hex digest of *url* with an
    '.mp4' suffix. Any failure is printed and written to the log file
    instead of being raised, so one bad video does not abort a batch.

    :param url: YouTube watch URL
    :param keyword: category keyword, used as the sub-folder name
    :return: None
    """
    store_path = os.path.join(BASE_DIR, keyword)
    file_name = hashlib.md5(url.encode()).hexdigest() + '.mp4'
    try:
        # exist_ok avoids a race when several workers create the folder.
        os.makedirs(store_path, exist_ok=True)
        print('Downloading:', url)
        ydl_opts = {
            # Absolute template instead of os.chdir; "%" is doubled because
            # outtmpl is a %-style template.
            'outtmpl': os.path.join(store_path.replace('%', '%%'), file_name)
        }
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        print(url, "download compete",)
        time.sleep(0.5)
    except Exception:
        # Worker boundary: swallow the error but record its traceback.
        print("Download failed..", url)
        logging.error("url error:%s", url, exc_info=True)


def main():
    """Look up the code for the module-level keyword and download its videos."""
    name_to_code = get_nameCode_dict()
    code = name_to_code.get(keyword)  # Code for the keyword, None if unknown.
    if not code:
        print("unknown keyword:", keyword)
        logging.error("unknown keyword:%s", keyword)
        return

    # URL of the list holding every video id of this category.
    total_id_url = "https://storage.googleapis.com/data.yt8m.org/2/j/v/{}.js".format(code)
    print(total_id_url)
    urls_list = get_str(total_id_url)

    pool = Pool()
    try:
        for url in urls_list:
            pool.apply_async(func=download_video, args=(url, keyword))
    finally:
        # Always let queued downloads finish and release the workers.
        pool.close()
        pool.join()


if __name__ == "__main__":
    main()