最近想要自動下載一些東方project同人圖,看了看幾家同人圖網站,最後想先拿yande.re試手,不用登陸,頁面上也沒有亂七八糟的廣告混淆視聽,而且圖片普遍質量很高,以下是代碼:
如果想使用,只需修改tag、文件保存路徑和頁數即可。
import os
import re
import socket
import time
import urllib
import urllib.error
import urllib.request
def delRepeat(a):
    """Remove duplicates from list *a* in place and return it.

    Keeps the LAST occurrence of each value (matching the original code,
    whose ``del a[a.index(x)]`` removed earlier occurrences).  The original
    also mutated the list while iterating over it — an anti-pattern that is
    both fragile and O(n^2); this version builds the result in one O(n) pass
    and writes it back through ``a[:]`` so callers holding the same list
    object still see the deduplicated contents.
    """
    seen = set()
    kept = []
    # Walk backwards so the last occurrence of each value is the one kept.
    for item in reversed(a):
        if item not in seen:
            seen.add(item)
            kept.append(item)
    kept.reverse()
    a[:] = kept  # in-place update preserves the original's side effect
    return a
def name(photo):
    """Build a local filename from the path captured after '.../image/'.

    The captured path looks like ``"<prefix>/<escaped name>.<ext>"``.  The
    original code stripped a hard-coded 33-character prefix (presumably a
    32-char hex hash plus '/' — TODO confirm against live pages), which
    silently breaks if the prefix length ever differs.  Taking everything
    after the last '/' is equivalent for the normal format and robust.
    """
    base = photo.rsplit('/', 1)[-1]
    # Decode only the escapes that occur in these names; '%20' becomes '_'
    # (not a space) so the saved filename contains no blanks.
    return base.replace("%20", "_").replace("%28", "(").replace("%29", ")")
def save_img(img_url, file_name, file_path=r'D:\圖片\從yande爬的圖'):
    """Download *img_url* and save it under *file_path* as file_name + suffix.

    Creates *file_path* (including parents) when it does not exist.  Errors
    are printed rather than raised so one failed download cannot abort the
    whole crawl.  The default path is now a raw string: the original non-raw
    literal relied on ``\\圖`` etc. being invalid escapes that Python leaves
    alone — same value, but a SyntaxWarning on modern interpreters.
    """
    try:
        if not os.path.exists(file_path):
            print('文件夾', file_path, '不存在,重新建立')
            os.makedirs(file_path)
        # Keep the extension from the URL (e.g. '.jpg', '.png').
        file_suffix = os.path.splitext(img_url)[1]
        # os.path.join picks the right separator instead of hand-formatting
        # with os.sep as the original did.
        filename = os.path.join(file_path, file_name + file_suffix)
        urllib.request.urlretrieve(img_url, filename=filename)
    except IOError as e:
        print('文件操作失敗', e)
    except Exception as e:
        print('錯誤 :', e)
# ---- crawl loop: listing pages 1-9 for the chosen tag -----------------------
# Fix 1: `count` was never initialised, so the very first `count += 1` raised
# a NameError.  Fix 2: `except A or B or C` short-circuits to `except A`,
# so only URLError was ever caught; a tuple catches all intended exceptions.
count = 0  # running total of images processed across all pages
for page in range(1, 10):  # pages 1 through 9; adjust the range as needed
    time.sleep(3)  # throttle between page fetches to be polite to the server
    url = "https://yande.re/post?page=" + str(page) + "&tags=touhou"  # change the tag here
    html = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    # Each post on the listing page carries an element id like id="p123456".
    find_index = re.findall(r'id="p\d{3,}', html)
    for each in find_index:  # visit every post found on this listing page
        try:
            time_start = time.time()
            count += 1
            words = "正在保存第" + str(page) + "頁,第" + str(count) + "張圖"
            print(words, end='')
            n = each[5:]  # strip the leading 'id="p' to get the numeric post id
            page_url = "https://yande.re/post/show/" + str(n)
            page_html = urllib.request.urlopen(page_url).read().decode("utf-8", "ignore")
            # Deduplicate the captured full-size image paths on the post page.
            photo_find = delRepeat(re.findall(r'href="https://files.yande.re/image/([\s\S]*?)"', page_html))
            if len(photo_find) == 0:
                continue  # no full-size image link on this post page
            photo_url = "https://files.yande.re/image/" + photo_find[0]
            photo_name = name(photo_find[0])
            save_img(photo_url, photo_name)
            time_end = time.time()
            print(' 用時%d秒 ' % (time_end - time_start))
        except (urllib.error.URLError, socket.gaierror, NameError, ConnectionAbortedError) as e:
            print('錯誤 :', e)
            continue