import json, re, requests, pymongo, os, sys
from hashlib import md5
from urllib.parse import urlencode
from requests.exceptions import RequestException
from config import *
from multiprocessing import Pool
# HTTP headers sent with every requests.get call in this script.
# NOTE(review): the cookie is a hard-coded browser session value; presumably it
# expires and must be refreshed by hand — confirm before relying on it.
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
"cookie": "__tasessionId=5wqgy5a4x1592018780676; csrftoken=dd264bf93a77880b1720c1d80d76db2a; tt_webid=6837668550247122440; s_v_web_id=kbd2vlh4_ef3hme3C_JVak_4zpi_8c8v_Ch0GwbNfYkII",
}
# Module-level MongoDB handle shared by save_to_mongo.
# NOTE(review): connect=False presumably defers the connection until first use
# so each multiprocessing worker opens its own — confirm against pymongo docs.
client = pymongo.MongoClient(MONGO_URL, connect=False)
# Database selected by the MONGO_DB setting from config.
db = client[MONGO_DB]
def get_page_index(offset, keyword, timeout=10):
    """Request the Toutiao search API and return the raw response text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the status code is 200, otherwise
        ``None`` (non-200 status, timeout, or any other ``RequestException``).
    """
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    url = "https://www.toutiao.com/api/search/content/?" + urlencode(params)
    try:
        # Without a timeout a stalled connection would block this worker forever.
        resp = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('請求索引頁出錯')
        return None
    else:
        if resp.status_code == 200:
            return resp.text
    return None
def parse_index_page(html):
    """Yield the article URLs contained in a search-index JSON response.

    Args:
        html: JSON text as returned by the index request, or ``None`` when
            that request failed.

    Yields:
        The ``article_url`` value of every entry in the top-level ``data``
        list that has a non-empty one.

    Missing input, malformed JSON, or an unexpected payload shape simply
    yields nothing instead of raising, so one bad page cannot abort a worker.
    """
    if not html:
        return
    try:
        payload = json.loads(html)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError subclass.
        return
    if not isinstance(payload, dict):
        return
    items = payload.get('data')
    if not isinstance(items, list):
        return
    for item in items:
        if isinstance(item, dict) and item.get('article_url'):
            yield item['article_url']
def get_page_detail(url, timeout=10):
    """Download an article detail page.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page text when the status code is 200, otherwise ``None``
        (non-200 status, timeout, or any other ``RequestException``).
    """
    try:
        # Without a timeout a stalled connection would block this worker forever.
        resp = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('請求詳情頁出錯')
        return None
    else:
        if resp.status_code == 200:
            return resp.text
    return None
# Raw-string patterns: the original non-raw literals relied on the invalid
# escape sequence "\(" and left the "." in "JSON.parse" unescaped.
_GALLERY_RE = re.compile(r'gallery: JSON\.parse\("(.*?)"\),', re.S)
_TITLE_RE = re.compile(r'><title>(.*?)</title><meta')


def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from a detail page.

    Every image URL found is also passed to ``download_image`` together with
    the page title.

    Args:
        html: Text of the detail page.
        url: Address the page was fetched from; echoed back in the result.

    Returns:
        ``{'title': ..., 'url': ..., 'images': [...]}`` when the page embeds a
        ``gallery: JSON.parse("...")`` payload that has a ``sub_images`` key,
        otherwise ``None`` (no gallery, undecodable payload, or no
        ``sub_images``).
    """
    gallery_match = _GALLERY_RE.search(html)
    if not gallery_match:
        return None

    title_match = _TITLE_RE.search(html)
    title = title_match.group(1) if title_match else "default_title"

    try:
        # Undo the string-literal escaping. Encoding with latin-1 +
        # backslashreplace keeps non-ASCII characters intact; the previous
        # utf-8 -> unicode-escape round trip turned them into mojibake.
        json_str = (
            gallery_match.group(1)
            .encode('latin-1', 'backslashreplace')
            .decode('unicode-escape')
        )
        data = json.loads(json_str)
    except ValueError:
        # Covers both UnicodeDecodeError and json.JSONDecodeError.
        return None

    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    images = [
        item['url']
        for item in (data.get('sub_images') or [])
        if isinstance(item, dict) and item.get('url')
    ]
    for image in images:
        download_image(image, title)
    return {
        'title': title,
        'url': url,
        'images': images,
    }
def save_to_mongo(result):
    """Insert a parsed article into the configured MongoDB collection.

    The last path segment of ``result['url']`` is used as the document
    ``_id`` so the same article is stored only once.

    Args:
        result: Dict with at least ``'url'`` and ``'title'`` keys. It is
            mutated: an ``'_id'`` key is added.

    Returns:
        ``True`` when the document was inserted, ``False`` when a document
        with the same ``_id`` already exists.
    """
    # Strip trailing slashes first: split("/")[-2] only worked for URLs that
    # end in "/", and returned the host or a parent segment otherwise.
    result['_id'] = result["url"].rstrip("/").rsplit("/", 1)[-1]
    try:
        db[MONGO_TABLE].insert_one(result)
    except pymongo.errors.DuplicateKeyError:
        return False
    print('存儲到MongoDB', result['title'])
    return True
def download_image(url, title, timeout=10):
    """Download one image and hand its bytes to ``save_image``.

    Args:
        url: Address of the image.
        title: Article title, forwarded to ``save_image``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Always ``None``; request failures are reported on stdout.
    """
    print("正在下載", url)
    try:
        # Without a timeout a stalled connection would block this worker forever.
        resp = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('請求圖片出錯')
        return None
    else:
        if resp.status_code == 200:
            save_image(resp.content, title)
    return None
def save_image(content, title):
    """Write image bytes to ``<cwd>/<sanitized title>/<md5 of content>.jpg``.

    Args:
        content: Raw image bytes.
        title: Article title used as the directory name; the characters
            ``/ \\ : * ? " < > |`` are each replaced with ``_``.
    """
    safe_title = re.sub(r'[/\\:*?"<>|]', "_", title)
    target_dir = os.path.join(os.getcwd(), safe_title)
    file_path = '{0}.{1}'.format(
        os.path.join(target_dir, md5(content).hexdigest()), 'jpg')
    # exist_ok avoids the exists()/mkdir() race between Pool workers that
    # save images for the same title at the same time.
    os.makedirs(target_dir, exist_ok=True)
    with open(file_path, 'wb') as f:
        f.write(content)
def main(offset):
    """Crawl one search-result offset: index page -> detail pages -> MongoDB.

    Args:
        offset: Offset passed to ``get_page_index`` together with ``KEYWORD``.
    """
    index_html = get_page_index(offset, KEYWORD)
    if not index_html:
        # The index request failed; there is nothing to parse for this offset.
        return
    for url in parse_index_page(index_html):
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if result:
            save_to_mongo(result)
if __name__ == '__main__':
    # One offset per result group; the multiplier 20 matches the 'count'
    # value requested in get_page_index.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # The context manager shuts the worker processes down once map() returns,
    # instead of leaving the pool open until interpreter exit.
    with Pool() as pool:
        pool.map(main, groups)
# config.py
# Settings pulled into the scraper through `from config import *`.
# Host passed to pymongo.MongoClient.
MONGO_URL = 'localhost'
# Name of the database selected on the client.
MONGO_DB = 'toutiao'
# Name of the collection that receives the scraped articles.
MONGO_TABLE = 'toutiao'
# First and last group index (inclusive); each index is multiplied by 20 to
# form a search offset.
GROUP_START = 1
GROUP_END = 20
# Search keyword sent to the index API.
KEYWORD = '街拍'
# 存儲到數據庫的效果:
# 存儲到本地圖片的效果: