# 抖音小視頻爬取(配合mitmproxy和夜神模擬器) -- Douyin short-video scraper (used together with mitmproxy and the Nox Android emulator)

# -*- coding: utf-8 -*-
import json
import ssl
from urllib.request import Request
import urllib
import sys
import time
import hashlib
import os
from threading import Thread
import logging
from queue import Queue
# SECURITY NOTE: this replaces the default HTTPS context factory (a private
# ssl attribute) with the unverified one, so TLS certificates are not checked
# for HTTPS requests that use the default context in this process.
# NOTE(review): confirm the downloads really need this before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context

# Launch command: mitmdump -p 8081 -s douyin1.py

keyword = 'shshs'
# Raw string: '\d' is not a valid escape sequence and newer Python versions
# warn about it. The resulting value is unchanged.
BASE_PATH = r'E:\douyin'

# Directory the downloaded videos are saved into.
STORE_PATH = '{}\\{}'.format(BASE_PATH, keyword)
# makedirs also creates BASE_PATH when it is missing, and exist_ok=True avoids
# the check-then-create race of os.path.exists() + os.mkdir().
os.makedirs(STORE_PATH, exist_ok=True)
# Text file that receives one line per queue entry taken by a worker.
TXT_FILENAME = STORE_PATH + '.txt'

# Request headers sent with every video download.
header = {
    "User-Agent": "com.ss.android.ugc.aweme/251 (Linux; U; Android 4.4.2; zh_CN; MI 6 ; Build/NMF26X; Cronet/58.0.2991.0)",
}

logging.basicConfig(
    filename=r'{}\douyin.log'.format(BASE_PATH),
    level=logging.INFO,
    format="%(asctime)s-%(name)s-%(levelno)s-%(lineno)d-%(message)s",
)
logger = logging.getLogger(__name__)
# Shared work queue: response() produces entries, download() consumes them.
q = Queue(maxsize=2000)
def response(flow):
    """Queue the videos listed in a search response seen by the proxy.

    If the request URL starts with one of the known search endpoints, the
    response body is parsed as JSON and, for every entry of ``aweme_list``,
    the string ``"<video url>+<md5 of the description>"`` is put on the
    module-level queue ``q``.

    Args:
        flow: object exposing ``request.url`` and ``response.text``
            (presumably a mitmproxy HTTP flow, given the launch command --
            verify against the mitmproxy version in use).
    """
    search_prefixes = (
        'https://aweme.snssdk.com/aweme/v1/search/',
        'https://api.amemv.com/aweme/v1/search/',
        # The original literal started with a space, so it could never match.
        'https://aweme.snssdk.com/aweme/v1/general/search/',
        'https://aweme-hl.snssdk.com/aweme/v1/search/',
    )
    # Only responses to requests starting with one of the prefixes are parsed.
    if not flow.request.url.startswith(search_prefixes):
        return

    logger.info(flow.request.url)
    try:
        data = json.loads(flow.response.text)
    except (TypeError, ValueError):
        # The body could not be decoded or is not JSON: nothing to extract.
        logger.exception("could not parse response of %s", flow.request.url)
        return

    # Extract the video address and the video name of every listed entry.
    aweme_list = data.get('aweme_list') if isinstance(data, dict) else None
    for aweme in aweme_list or []:
        try:
            video_url = aweme['video']['play_addr']['url_list'][0]
            video_name = aweme['desc']  # video name
        except (KeyError, IndexError, TypeError):
            # One malformed entry must not prevent queuing the others.
            logger.warning("skipping malformed entry in %s", flow.request.url)
            continue
        real_url = video_url.replace("play", "playwm")  # video link
        # Hash of the video name, used as the de-duplication key.
        name = hashlib.md5(video_name.encode()).hexdigest()
        q.put(real_url + "+" + name)
    print("The len of queue is **********************************************:",q.qsize())

def download(q):
    """Worker loop: consume queue entries forever and download each video.

    Every entry is a ``"<url>+<name>"`` string. The entry is appended to
    TXT_FILENAME, then the video is fetched with the module ``header`` and
    saved as ``<md5(name)>.mp4`` inside STORE_PATH, unless that file already
    exists. A failed download is put back on the queue to be retried.
    ``task_done()`` is called exactly once per ``get()`` so that ``q.join()``
    accounting stays balanced.

    Args:
        q: queue providing ``get``, ``put``, ``task_done`` and ``qsize``.
    """
    while True:
        a = q.get()
        try:
            if not isinstance(a, str) or "+" not in a:
                # Malformed entries are dropped, as before, but no longer silently.
                print("Error:", "malformed queue entry %r" % (a,))
                continue
            try:
                # Record the fetched entry in the text file.
                with open(TXT_FILENAME, 'a') as f:
                    f.write(a)
                    f.write('\n')
            except OSError as e:
                # Failing to log the entry should not lose the video itself.
                print("Error:", e)
            # Split on the LAST '+': the URL may contain '+', the name is a hex digest.
            url, _, video_name = a.rpartition("+")
            print("---------------queue has %s------------"%q.qsize())
            filename = '{}\\{}.mp4'.format(STORE_PATH, hashlib.md5(video_name.encode()).hexdigest())
            if os.path.exists(filename):
                continue
            r = Request(url, headers=header)
            # Read the whole body BEFORE creating the file, so a failed transfer
            # cannot leave a partial file that later retries would skip.
            # The timeout keeps a stalled connection from blocking the worker forever.
            with urllib.request.urlopen(r, timeout=30) as data:
                payload = data.read()
            with open(filename, 'wb') as f:
                f.write(payload)
            print("download ok:", video_name)
            time.sleep(0.1)
        except Exception as e:
            # Retry later: re-queue the entry (it gets its own task_done on retry).
            q.put(a)
            print("Error:", e)
            time.sleep(0.2)
        finally:
            # Always balance the get() above, including the skip and retry paths.
            q.task_done()

# Start the pool of download workers as soon as this script is loaded.
#
# The original loop body also called q.join() and sys.exit(1) right after
# starting the FIRST thread: the queue is still empty at load time, so join()
# returned immediately and SystemExit was raised, leaving a single worker and
# aborting the script load. The workers must simply keep running in the
# background while response() feeds the queue, so nothing is joined or exited
# here.
#
# daemon=True: download() loops forever, so non-daemon workers would keep the
# interpreter from shutting down.
for i in range(15):
    t = Thread(target=download, args=(q,), daemon=True)
    print("threading start")
    t.start()
# (Residue of the web page this script was copied from; not part of the program.)
# 發表評論
# 所有評論
# 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
# 相關文章