爬蟲--今日頭條街拍圖片1

import requests
import re
import requests
import json,os
from urllib import request
# Scrape the Toutiao search results for the keyword encoded in the search URL
# and download every gallery image of each article found into ``download/``.

# Search endpoint; the ``{}`` placeholder receives the paging offset.
_SEARCH_URL = (
    'https://www.toutiao.com/search_content/?offset={}&format=json'
    '&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1'
    '&from=search_tab'
)

# Browser-like User-Agent sent with every article page request.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

# Captures the argument of the ``gallery: JSON.parse(...)`` call embedded in
# the article page source. Compiled once instead of once per article.
_GALLERY_PATTERN = re.compile(r'gallery: JSON\.parse\((.*)\),')

# Directory (relative to the working directory) that receives the images.
_DOWNLOAD_DIR = 'download'

# Seconds to wait on an HTTP request before giving up; without a timeout a
# stalled connection would block the script forever.
_REQUEST_TIMEOUT = 10


def _parse_gallery(html_str):
    """Return the gallery dict embedded in an article page, or None.

    Args:
        html_str: Source text of the article page.

    Returns:
        The decoded gallery object, or ``None`` when the page contains no
        ``gallery: JSON.parse(...)`` fragment.
    """
    match_res = _GALLERY_PATTERN.search(html_str)
    if match_res is None:
        return None
    # The captured text is a JSON string literal whose *content* is itself
    # JSON, so it has to be decoded twice: first to a str, then to a dict.
    gallery_json = json.loads(match_res.group(1))
    return json.loads(gallery_json)


def _download_article_images(article_url):
    """Fetch one article page and save each of its gallery images.

    Every image is written to ``download/<last URL path segment>.jpg``.
    When the page has no gallery fragment a notice is printed instead.

    Args:
        article_url: URL of the article page to fetch.
    """
    response = requests.get(
        article_url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT
    )

    # Created for every article, whether or not it has a gallery (as before);
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(_DOWNLOAD_DIR, exist_ok=True)

    gallery = _parse_gallery(response.text)
    if gallery is None:
        # Notice for pages without a gallery fragment.
        print('你寫錯了, 不應該來我這')
        return

    for image in gallery['sub_images']:
        image_url = image['url']
        print(image_url)
        filename = _DOWNLOAD_DIR + '/' + image_url.split('/')[-1] + '.jpg'
        request.urlretrieve(image_url, filename)


# Walk the search result pages at offsets 0, 20 and 40 (count=20 per request).
for offset in range(0, 60, 20):
    response = requests.get(
        _SEARCH_URL.format(offset), timeout=_REQUEST_TIMEOUT
    )
    # response.json() returns the decoded object; its 'data' key holds the
    # list of result entries.
    for data_item in response.json()['data']:
        # Only entries carrying an 'article_url' are followed; the rest are
        # skipped.
        if 'article_url' in data_item:
            _download_article_images(data_item['article_url'])
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章