Python 按關鍵字爬取 Bing(必應)圖片

# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup
import urllib.request
import requests
import time
import json
import sys
import re
import os


# URL template of the site to crawl. The placeholders are, in order:
# q (search keyword), first (result offset), count and relp (page size).
CRAWL_TARGET_URL = 'https://cn.bing.com/images/async?q=%s&first=%d&count=%d&relp=%d&lostate=r&mmasync=1'

# Number of images fetched per request (35 is how many results this page
# asks for each time it loads another page).
NUMS_PER_CRAWL = 35

# Minimum accepted image size in bytes; anything smaller is discarded.
MIN_IMAGE_SIZE = 10


def get_image(url, path, count, *, min_size=None):
    """Download one image and store it as ``<count><ext>`` inside ``path``.

    Args:
        url: Address of the image to download.
        path: Directory to save into; it is created (including parents)
            when it does not exist yet.
        count: Value used as the base name of the saved file.
        min_size: Smallest acceptable payload in bytes. When omitted, the
            module-level ``MIN_IMAGE_SIZE`` is used.

    Returns:
        0 on success, -1 when the payload is smaller than ``min_size``,
        -2 when the download failed, -3 when the file could not be written.
    """
    from urllib.parse import urlsplit

    if min_size is None:
        min_size = MIN_IMAGE_SIZE

    try:
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(url, timeout=5) as response:
            data = response.read()
    except Exception as e:
        # Best effort: any download problem just skips this image.
        print(url, e)
        return -2

    # len() is the payload size. sys.getsizeof() reports the in-memory size
    # of the bytes object, which includes interpreter overhead on top of the
    # payload, so it is not a reliable measure for this threshold.
    if len(data) < min_size:
        return -1

    # Take the extension from the URL *path* only, so dots in the host name
    # or the query string are never mistaken for a file extension.
    suffix = os.path.splitext(urlsplit(url).path)[1]
    match = re.match(r"^\.[a-zA-Z]+", suffix)
    # Fall back to ".jpg" when the URL carries no alphabetic extension
    # instead of failing on a missing regex match.
    frmt = match.group(0) if match else ".jpg"

    target = os.path.join(path, str(count) + frmt)
    try:
        os.makedirs(path, exist_ok=True)
        with open(target, 'wb') as f:
            f.write(data)
    except OSError as e:
        print(target, e)
        return -3
    return 0


def crawl_data(info, path, num):
    """Search for ``info`` and download up to ``num`` images into ``path``.

    Result pages are requested ``NUMS_PER_CRAWL`` entries at a time. Every
    ``<a class="iusc">`` tag is expected to carry a JSON ``m`` attribute
    whose ``murl`` key is the image address; that address is handed to
    ``get_image``. Only successful downloads (non-negative return value)
    count towards ``num``.

    Crawling stops when ``num`` images were saved, or when a result page
    contributes no image address that has not been tried before, so the
    loop cannot spin forever once the results are exhausted.

    Args:
        info: Search keyword.
        path: Directory passed through to ``get_image``.
        num: Number of images to download; values <= 0 do nothing.
    """
    from urllib.parse import quote

    if num <= 0:
        return

    first = 0
    count = 0
    # Percent-encode the keyword so characters such as '&', '#' or spaces
    # cannot break or truncate the query string.
    keyword = quote(info, safe='')
    # Addresses already attempted; avoids downloading duplicates and lets us
    # detect pages that bring nothing new.
    seen = set()

    # One session for all page requests; closed on every exit path.
    with requests.Session() as s:
        while count < num:
            u = CRAWL_TARGET_URL % (keyword, first, NUMS_PER_CRAWL, NUMS_PER_CRAWL)
            # 3.05 s connect timeout, 10 s read timeout.
            req = s.get(url=u, timeout=(3.05, 10))
            bf = BeautifulSoup(req.text, "html.parser")

            fresh = 0
            for tag in bf.find_all("a", class_="iusc"):
                if count == num:
                    return
                try:
                    murl = json.loads(tag.get('m'))["murl"]
                except (TypeError, ValueError, KeyError):
                    # Missing or malformed metadata: skip just this tag.
                    continue
                if murl in seen:
                    continue
                seen.add(murl)
                fresh += 1

                if get_image(murl, path, count) < 0:
                    continue
                print("第%d張圖片下載完成,總進度%d%%"%(count+1, (count+1)*100/num))
                sys.stdout.flush()
                count += 1
                time.sleep(0.01)

            if fresh == 0:
                # Nothing new on this page: the results are exhausted.
                break
            first += NUMS_PER_CRAWL
            time.sleep(0.1)


if __name__ == '__main__':
    # Script entry point: for every keyword, make sure a folder named after
    # it exists in the current directory, crawl images into that folder, and
    # finally report the total elapsed time.
    started_at = time.time()
    picture_num = 1000

    for keyword in ['行李', '衣服']:
        folder = './' + keyword
        if not os.path.exists(folder):
            os.makedirs(folder)
        crawl_data(keyword, folder + '/', picture_num)

    elapsed = time.time() - started_at
    print("所有圖片下載完畢,總用時%.2fs" % elapsed)

此代碼爲網上所找,已不記得出處網址。原作者若看見,請告知。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章