from multiprocessing import Pool
import requests, asyncio, aiohttp
import os
import urllib.request
from time import sleep
class Cartoon:
    """Crawler that downloads comics from comiccdnhw.jsmlny.top.

    Each comic gets its own sub-directory (created via chdir-based
    ``mkdir``); chapter image URLs are collected synchronously, then the
    images are downloaded concurrently with aiohttp.
    """

    def __init__(self):
        # Per-instance crawl state. These were mutable CLASS attributes in
        # the original, i.e. shared across all instances — and never reset,
        # so every comic re-downloaded all previous comics' images.
        self.path = ''
        self.image_list = []   # image URLs queued for the comic being crawled
        self.key = 0           # running image counter (used for log output)
        self.curtitle = 0      # title of the comic currently being crawled
        # All downloads live under a top-level "漫画" (comics) directory;
        # mkdir() also chdirs into it.
        self.mkdir('漫画')

    def str_dict(self):
        """Convert a raw copy-pasted header blob into a dict for requests.

        (Typing a request-header dict entry by entry is tedious; this parses
        the browser's "copy request headers" text instead.)
        """
        headers = {}
        heads = '''
Host: www.canva.com
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
Referer: https://www.canva.com/_ajax/fonts/scripts/recommendations?find&locale=en&limit=57&includeHiddenFonts=false&includeNonLegacyFonts&includePremiumFonts&includePaidFonts=false&includeHanyiFonts&useExtendedStylesFontFamilies=false
Content-Type: application/x-www-form-urlencoded
Origin: https://www.canva.com
Connection: keep-alive
Cookie: __cfduid=dab37c60e818ac043e4327c7e7ac3fa8d1590496209; CDI=537a9753-9237-452a-b9d7-80f91a302029; CL=zh-CN; ajs_user_id=null; ajs_group_id=null; ajs_anonymous_id=%22f0d86891-67fa-4f1b-b6b2-62dce9ce18bc%22; _gcl_au=1.1.1437216432.1590496303; _uetsid=4da25ae7-2449-5476-8e72-c4160ccd9aef; ab.storage.sessionId.320f7332-8571-45d7-b342-c54192dae547=%7B%22g%22%3A%22dd4a173d-62b5-4c16-a568-d1ae75d8a63c%22%2C%22e%22%3A1590498103213%2C%22c%22%3A1590496303214%2C%22l%22%3A1590496303214%7D; ab.storage.deviceId.320f7332-8571-45d7-b342-c54192dae547=%7B%22g%22%3A%22a76bae95-4f5e-301d-7af6-07ca1b599682%22%2C%22c%22%3A1590496303216%2C%22l%22%3A1590496303216%7D; cf_clearance=7a24b07ec0521a856146297a742690efbe495789-1590543393-0-250; CPA=-mZkhSgMVbsfE7_i2n2WGqvJsOinHvOZyTkXW0i_ZNd3xjgMj5mqpRZiKpeGqhbClLbYnsh2pJMSW4MbCvnas6a-25GUgyzX0lnluPGM5S9szocQdDMnkAj4Uu3lx9RLjJ7gYw; CCK=MA5NCilRPuKd2cxieUZN1w
Upgrade-Insecure-Requests: 1
'''
        for head in heads.split('\n'):
            head = head.strip()
            if head:
                # maxsplit=1 so URLs in header values keep their "://"
                head_key, head_value = head.split(':', 1)
                headers[head_key] = head_value.strip()
        return headers

    def mkdir(self, name):
        """Create directory *name* under the cwd and chdir into it.

        Returns True on success, False when the directory already exists
        (in which case the cwd is left unchanged).
        """
        if name not in os.listdir('.'):
            os.mkdir(name)
            os.chdir(os.path.join(os.path.abspath('.'), name))
            return True
        return False

    def fetch_url(self, url, times=0):
        """GET *url* with the scraped browser headers, retrying up to 10 times.

        Returns the requests.Response, or None when every retry raised.
        Bug fix: the original retried a non-200 response but DISCARDED the
        retry's result (returning the failed response) and the retry did not
        count toward the limit, risking unbounded recursion.
        """
        try:
            response = requests.get(url, headers=self.str_dict())
            if response.status_code != 200:
                print('fetch ' + url + ' not success')
                if times >= 10:
                    return response
                return self.fetch_url(url, times + 1)
            return response
        except Exception as ex:
            print('fetch ' + url + ' error: ', ex)
            if times == 10:
                return None
            return self.fetch_url(url, times + 1)

    def auto_down(self, url, filename, times=0):
        """Download *url* to *filename* via urllib, retrying up to 10 times."""
        print(filename)
        try:
            urllib.request.urlretrieve(url, filename)
        except Exception as ex:
            print('download img error: ', url, ' ', ex)
            if times == 10:
                return
            self.auto_down(url, filename, times + 1)

    def left_pad_zero(self, ori, target_len):
        """Left-pad string *ori* with '0' up to *target_len* characters.

        No-op when *ori* is already at least *target_len* long.
        """
        return ori.rjust(target_len, '0')

    async def get_image(self, url, semaphore, times=0):
        """Fetch image bytes with aiohttp; up to 10 retries; None on failure.

        *semaphore* bounds the number of concurrent in-flight requests.
        """
        try:
            async with semaphore:
                async with aiohttp.ClientSession() as session:
                    response = await session.get(url)
                    content = await response.read()
                    response.close()
                    return content
        except Exception as ex:
            print(self.curtitle, ' ', times, ' ', url, ' download ERROR: ', ex)
            if times == 10:
                return None
            return await self.get_image(url, semaphore, times + 1)

    async def download_image(self, image, semaphore):
        """Download one image and write it to disk.

        *image* is a ``(url, index_str)`` tuple; the file is named with the
        index zero-padded to 5 digits so listings sort correctly.
        """
        content = await self.get_image(image[0], semaphore)
        if content is not None:
            with open(self.left_pad_zero(image[1], 5) + '.jpg', 'wb') as f:
                f.write(content)
            print(self.curtitle, ' download: ', image[1] + ' ' + image[0])

    def crawl_catalog(self):
        """Crawl the whole comic catalog.

        For each comic not already on disk: make its directory, collect the
        image URLs of every chapter, then download them concurrently.
        """
        rootpath = os.path.abspath('.')
        response = self.fetch_url(
            "https://comiccdnhw.jsmlny.top/hcomic/home?channelNo=H5_MH_0000")
        if response is None:  # fetch_url gave up — nothing to crawl
            return
        data = response.json()
        for column in data['data']['columnList']:
            # Renamed loop variable from `list` — it shadowed the builtin.
            for comic in column['comicList']:
                os.chdir(rootpath)
                title = comic['title']
                if ':' in title:
                    title = title.split(':')[-1]
                if title in os.listdir('.'):
                    print(title, ' is exists')
                    continue
                if not self.mkdir(title):
                    print(title, ' mkdir fail')
                    continue
                response = self.fetch_url(
                    "https://comiccdnhw.jsmlny.top/hcomic/qryComicInfoByComicId?channelNo=H5_MH_0000&comicId=" + str(comic['comicId']))
                if response is None:
                    continue
                chapter_info = response.json()['data']['comicBaseInfo']
                self.curtitle = chapter_info['title']
                # Bug fix: reset the queue and counter per comic; previously
                # they accumulated forever, re-downloading earlier comics.
                self.image_list = []
                self.key = 0
                for chapter in chapter_info['comicChapterList']:
                    self.crawl_chapter(str(chapter['chapterId']))
                if not self.image_list:
                    continue  # asyncio.wait raises ValueError on an empty set
                semaphore = asyncio.Semaphore(150)
                tasks = [
                    asyncio.ensure_future(
                        self.download_image((img_url, str(idx + 1)), semaphore))
                    for idx, img_url in enumerate(self.image_list)
                ]
                loop = asyncio.get_event_loop()
                loop.run_until_complete(asyncio.wait(tasks))

    def crawl_chapter(self, chapter_id):
        """Fetch one chapter's image URL list and append it to self.image_list."""
        try:
            response = self.fetch_url(
                "https://comiccdnhw.jsmlny.top/hcomic/chaptercontent?chapterId=" + chapter_id)
            for img in response.json()['data']['chapterContentList']:
                print(self.curtitle, ' fetchurl: ', str(self.key + 1) + ' ' + img['content'])
                self.image_list.append(img['content'])
                self.key += 1
        except Exception as ex:
            # Best-effort: a broken chapter is logged and skipped, the rest
            # of the comic still downloads.
            print(self.curtitle, ' ERROR: ', ex)
if __name__ == "__main__":
    print('start')
    crawl = Cartoon()
    # NOTE(review): all four workers run the same crawl_catalog over the same
    # catalog — they duplicate rather than partition the work; the on-disk
    # "already exists" check is the only de-duplication. Behavior kept as-is.
    p = Pool(4)
    for i in range(1, 5):
        print(i)
        # error_callback surfaces worker exceptions that apply_async would
        # otherwise swallow silently.
        p.apply_async(crawl.crawl_catalog,
                      error_callback=lambda ex: print('worker error: ', ex))
    p.close()
    p.join()
    print('end')