因爲要訓練DCGAN網絡,需要很大的數據集,使用爬蟲從網上爬取了上萬張圖片。
此程序參考了《爬蟲怎麼根據一個關鍵詞爬取上千張網絡圖片》一文,代碼如下:
#-*- coding:utf-8 -*-
import re
import requests
import traceback
import os
def dowmloadPic(html: str, keyword: str, startNum: int, root: str = 'D:/pics/') -> int:
    """Download every image referenced by a search result page.

    Image addresses are taken from the ``"objURL":"..."`` entries found in
    *html*. Each image is stored under *root*, named after the last
    ``/``-separated segment of its URL; a file that already exists is not
    fetched again. A failure on one image is reported and skipped so the
    remaining images are still processed.

    Args:
        html: Text of the search result page.
        keyword: Search keyword; only used in the progress message.
        startNum: Running image count carried over from earlier calls.
        root: Directory the images are written to; created on demand.

    Returns:
        ``startNum`` plus the number of images handled without an error
        (freshly downloaded or already present on disk).
    """
    headers = {'user-agent': 'Mozilla/5.0'}
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    count = startNum
    # Guarantee a trailing separator so plain concatenation below works for
    # any *root*, while the default value keeps producing the same paths.
    prefix = os.path.join(root, '')
    print('找到關鍵詞:'+keyword+'的圖片,現在開始下載圖片...')
    for pic_url in pic_urls:
        print('正在下載第'+str(count+1)+'張圖片,圖片地址:'+str(pic_url))
        path = prefix + pic_url.split('/')[-1]
        try:
            # exist_ok avoids the check-then-create race and also creates
            # missing parent directories.
            os.makedirs(root, exist_ok=True)
            if not os.path.exists(path):
                response = requests.get(pic_url, headers=headers, timeout=10)
                # Reject 4xx/5xx answers so an error page is never saved
                # to disk as if it were an image.
                response.raise_for_status()
                with open(path, 'wb') as f:
                    f.write(response.content)
        except Exception:
            # Best effort per image: report and move on. Unlike a bare
            # ``except``, this still lets Ctrl-C / SystemExit stop the crawl.
            traceback.print_exc()
            print('【錯誤】當前圖片無法下載')
            continue
        count += 1
    return count
if __name__ == '__main__':
    # Script entry point: for each keyword, fetch a fixed number of search
    # result pages and hand each page's text to dowmloadPic, threading the
    # running image count through the calls.
    kv = {'user-agent':'Mozilla/5.0'}
    lastNum = 0
    # Every keyword in this list is searched in turn and its images saved.
    words = ['敦煌壁畫','敦煌人物壁畫','莫高窟人物壁畫']
    # Number of result pages to crawl per keyword.
    pages = 2
    for word in words:
        # An entry equal to "exit" stops the crawl early.
        if word.strip() == "exit":
            break
        # The `pn` query value grows by 20 for each page requested.
        for pageId in range(0, pages * 20, 20):
            url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + word + "&pn="+str(pageId)+"&gsm=?&ct=&ic=0&lm=-1&width=0&height=0"
            try:
                # A timeout keeps one stalled connection from hanging the run.
                result = requests.get(url, headers=kv, timeout=10)
                result.raise_for_status()
            except requests.RequestException:
                # One failed search page should not abort the other pages
                # and keywords; report it and carry on.
                traceback.print_exc()
                continue
            # Pass the keyword actually being searched so the progress
            # output names the right search term.
            lastNum = dowmloadPic(result.text, word, lastNum)