python爬蟲爬取百度圖片

爬蟲爬取百度圖片

因公司業務需要,而且公司人手不足,我這個測試工程師需要臨時客串一下其他職位,所以,由我來爬取百度圖片。

說明

1、最近稍微有點兒忙,沒顧得上整理。而且代碼量比較少,所以註釋比較少。
2、如果需要直接使用我的代碼,請將相應路徑文件名稱更改。具體使用方法我會在下面代碼中詳細介紹。
3、運行環境:Python 2.7(代碼中使用的 compiler.ast 模塊在 Python 3 中已被移除)。

實現思路及功能

1.讀取excel中第一列的關鍵詞,保存在列表中,等待遍歷
2.為每個關鍵詞開啓一個線程
3.將關鍵詞傳入img中,開始獲取圖片
4.將圖片保存在指定目錄

上代碼

#__author__ = 'chubbysuperman'
#_*_coding=utf-8 _*_
import requests
from fake_useragent import UserAgent
import xlrd
from compiler.ast import flatten
import os
import time
import threading
# Substitution tables used to turn an encoded ``objURL`` back into a plain URL.
# Multi-character tokens are expanded first, then every remaining character is
# mapped one-to-one.
_LONG_TOKENS = {'_z2C$q': ":", '_z&e3B': ".", 'AzdH3F': "/"}
_CHAR_MAP = {'w': "a", 'k': "b", 'v': "c", '1': "d", 'j': "e", 'u': "f",
             '2': "g", 'i': "h", 't': "i", '3': "j", 'h': "k", 's': "l",
             '4': "m", 'g': "n", "5": "o", 'r': "p", 'q': "q", "6": "r",
             'f': "s", 'p': "t", "7": "u", 'e': "v", 'o': "w", "8": "1",
             'd': "2", 'n': "3", "9": "4", 'c': "5", 'm': "6", "0": "7",
             'b': "8", 'l': "9", 'a': "0"}


def _decodeObjUrl(imgUrl):
    """Translate one encoded ``objURL`` string into the real image URL."""
    for token in _LONG_TOKENS:
        imgUrl = imgUrl.replace(token, _LONG_TOKENS[token])
    # Characters without a mapping (':', '.', '/', ...) pass through unchanged.
    return ''.join([_CHAR_MAP.get(ch, ch) for ch in imgUrl])


def imgUrls(keyWord, userAgent, pn):
    """Fetch one page of Baidu image-search results and decode the image URLs.

    Args:
        keyWord: search term, sent as both ``queryWord`` and ``word``.
        userAgent: value for the User-Agent request header.
        pn: result offset passed to Baidu as the ``pn`` query parameter.

    Returns:
        A dict with two keys. On success ``{'result': [url, ...],
        'status': True}``. On any failure (network error, non-200 response,
        unparsable or unexpected JSON) ``{'result': 'wuyunlunbi',
        'status': False}``; callers recognise that sentinel string.
        The function never returns ``None`` and never raises for request or
        parse errors.
    """
    url = 'https://image.baidu.com/search/index'
    params = {
        'tn': 'resultjson_com', 'ipn': 'rj', 'ct': '201326592', 'is': '',
        'fp': 'result', 'queryWord': keyWord, 'cl': '2', 'lm': '-1',
        'ie': 'utf-8', 'oe': 'utf-8', 'adpicid': '', 'st': '-1', 'z': '',
        'ic': '0', 'word': keyWord, 's': '', 'se': '', 'tab': '',
        'width': '', 'height': '', 'face': '0', 'istype': '2', 'qc': '',
        'nc': '1', 'fr': '', 'pn': pn, 'rn': 200, 'gsm': '1e',
        # NOTE(review): looks like a captured timestamp / cache-buster key
        # copied from a browser request -- confirm it is still needed.
        '1491808945838': '',
    }

    try:
        rep = requests.get(url, headers={'user-Agent': userAgent},
                           params=params, timeout=(4, 7))
    except requests.RequestException:
        # Timeouts and connection errors are reported like any other failed
        # page instead of killing the calling thread.
        return {'result': 'wuyunlunbi', 'status': False}

    if int(rep.status_code) != 200:
        return {'result': 'wuyunlunbi', 'status': False}

    try:
        # Pause between successful requests before handling the payload.
        time.sleep(1)
        entries = rep.json()['data']
        # The last element of ``data`` is deliberately skipped.
        # NOTE(review): presumably Baidu appends an empty trailing entry -- confirm.
        decoded = [_decodeObjUrl(entries[sec]['objURL'])
                   for sec in range(len(entries) - 1)]
    except (ValueError, KeyError, TypeError, AttributeError):
        # Body was not JSON, or did not have the expected data/objURL shape.
        return {'result': 'wuyunlunbi', 'status': False}

    return {'result': decoded, 'status': True}
def img(keyWord, userAgent):
    """Collect image URLs for ``keyWord`` and save each image to disk.

    Requests 100 result pages from ``imgUrls`` (offsets 0, 20, ..., 1980),
    then downloads every URL into ``D:/yyyyy5/<keyWord>/`` using the current
    timestamp as the file name. A failed page or a failed download is
    reported on stdout and skipped; it does not abort the remaining work.

    Args:
        keyWord: search term; also used as the name of the target directory.
        userAgent: User-Agent header value forwarded to ``imgUrls``.
    """
    add1 = []
    for i in range(100):
        try:
            page = imgUrls(keyWord, userAgent, pn=i * 20)
        except requests.RequestException:
            page = None
        # ``imgUrls`` may yield nothing usable for a page; record the same
        # 'wuyunlunbi' marker it uses for failures so the loop below reports it.
        result = page['result'] if page else 'wuyunlunbi'
        if isinstance(result, (list, tuple)):
            add1.extend(result)
        else:
            add1.append(result)

    x = keyWord
    print(len(add1))
    print(add1)

    # Create the storage directory; an already existing one is fine (re-runs,
    # or the same keyword listed twice).
    saveDir = r'D:\yyyyy5\%s' % x
    try:
        os.makedirs(saveDir)
    except OSError:
        if not os.path.isdir(saveDir):
            raise

    for index, link in enumerate(add1):
        print(index)
        link = link.replace(" ", "")
        time.sleep(0.15)
        if 'wuyunlunbi' in link:
            # Marker for a result page that could not be fetched or parsed.
            print('error_%s' % link)
        elif "yuan_" in link:
            # NOTE(review): URLs containing "yuan_" are skipped on purpose by
            # the original author; the reason is not visible here -- confirm.
            print("error001_%s" % link)
        else:
            try:
                resp = requests.get('%s' % link, timeout=(3, 4))
                # Do not store HTTP error pages as .jpg files.
                resp.raise_for_status()
                stamp = time.time()
                time.sleep(0.15)
                target = os.path.join(saveDir, '%s.jpg' % stamp)
                # Write the image into the target directory.
                with open(target, 'wb') as f:
                    f.write(resp.content)
            except Exception as e:
                # Best effort per image: report the failure and carry on.
                print('error002_%s_%r' % (link, e))
if __name__ == '__main__':
    # Keywords are read from this Excel workbook: put them in the first
    # column of the first sheet.
    workbook = xlrd.open_workbook(r'C:\Users\Administrator\Desktop\Ashicai (2).xlsx')
    keywords = workbook.sheet_by_index(0).col_values(0)
    ua = UserAgent()
    for keyword in keywords:
        # Blank cells come back as empty strings; starting a download thread
        # for them would target the root save directory.
        if keyword == '':
            continue
        startedAt = time.time()
        # One worker thread per keyword, each with its own random User-Agent.
        threading.Thread(target=img, args=(keyword, ua.random)).start()
        # Stagger thread start-up slightly.
        time.sleep(0.05)
        print(startedAt)


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章