import requests
import re
import requests
import json,os
from urllib import request
# def a (offset):
# NOTE(review): the original listing had lost all indentation, so its nesting
# was reconstructed from the data flow: the article-download stage consumes
# ``article_url``, which is only bound inside the search-result loop, so it is
# run once for every article URL found. Confirm this matches the intent.

# Search endpoint; ``{}`` is filled with the result offset (0, 20, 40).
SEARCH_URL = (
    'https://www.toutiao.com/search_content/?offset={}&format=json'
    '&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1'
    '&from=search_tab'
)

# Browser-like User-Agent sent with article page requests.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

# Captures the argument of ``gallery: JSON.parse(...)`` embedded in the page.
# Compiled once because it is reused for every article.
GALLERY_PATTERN = re.compile(r'gallery: JSON\.parse\((.*)\),')

# Directory (relative to the working directory) that receives the images.
DOWNLOAD_DIR = 'download'

# Seconds to wait on a request before giving up; without a timeout
# ``requests.get`` can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10


def extract_article_urls(search_payload):
    """Return the ``article_url`` of every search result that has one.

    Args:
        search_payload: The decoded JSON object of a search response.

    Returns:
        A list of URL strings, in response order. Empty when the ``data``
        key is missing or holds no items.
    """
    items = search_payload.get('data') or []
    return [
        item['article_url']
        for item in items
        if isinstance(item, dict) and 'article_url' in item
    ]


def extract_gallery_image_urls(html_str):
    """Return the image URLs of the gallery embedded in an article page.

    Args:
        html_str: The article page source.

    Returns:
        A list of image URL strings, or ``None`` when the page contains no
        ``gallery: JSON.parse(...)`` fragment.
    """
    match = GALLERY_PATTERN.search(html_str)
    if match is None:
        return None
    # The captured text is a quoted string literal whose content is itself
    # JSON, so the first decode yields a str and a second decode is needed.
    gallery = json.loads(match.group(1))
    if isinstance(gallery, str):
        gallery = json.loads(gallery)
    return [image['url'] for image in gallery.get('sub_images') or []]


def image_filename(image_url, directory=DOWNLOAD_DIR):
    """Build the local path for an image: last URL segment plus ``.jpg``."""
    # Strip trailing slashes so the last segment is never empty.
    name = image_url.rstrip('/').split('/')[-1]
    return '{}/{}.jpg'.format(directory, name)


def download_article_images(article_url):
    """Fetch one article page and download every image in its gallery.

    Prints each image URL before downloading it. When the page has no
    gallery fragment, prints a notice and downloads nothing.
    """
    response = requests.get(article_url, headers=HEADERS,
                            timeout=REQUEST_TIMEOUT)
    # Create the target directory; exist_ok avoids the check-then-create race.
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    image_urls = extract_gallery_image_urls(response.text)
    if image_urls is None:
        print('你寫錯了, 不應該來我這')
        return
    for image_url in image_urls:
        print(image_url)
        # Download the image.
        request.urlretrieve(image_url, image_filename(image_url))


def main():
    """Crawl three pages of search results and download their galleries."""
    for offset in range(0, 60, 20):
        response = requests.get(SEARCH_URL.format(offset),
                                timeout=REQUEST_TIMEOUT)
        # response.json() returns the decoded object (a dict) directly.
        for article_url in extract_article_urls(response.json()):
            download_article_images(article_url)


if __name__ == '__main__':
    main()
爬蟲--今日頭條街拍圖片1
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.