爬蟲爬取百度圖片
因公司業務需要,而且公司人手不足,我這個測試工程師需要臨時客串一下其他職位,所以,由我來爬取百度圖片。
說明
1、最近稍微有點兒忙,沒顧得上整理。而且代碼量比較少,所以註釋比較少。
2、如果需要直接使用我的代碼,請將相應路徑文件名稱更改。具體使用方法我會在下面代碼中詳細介紹。
3、運行環境:Python 2.7(代碼中使用了僅 Python 2 提供的 compiler.ast 模塊)。
實現思路及功能
1.讀取excel中第一列的關鍵詞,保存在列表中,等待遍歷
2.為每個關鍵詞開啓一個線程
3.將關鍵詞傳入img中,開始獲取圖片
4.將圖片保存在指定目錄
上代碼
#__author__ = 'chubbysuperman'
#_*_coding=utf-8 _*_
import requests
from fake_useragent import UserAgent
import xlrd
from compiler.ast import flatten
import os
import time
import threading
# Multi-character tokens used by Baidu's obfuscated "objURL" values.
_OBJURL_TOKENS = (('_z2C$q', ':'), ('_z&e3B', '.'), ('AzdH3F', '/'))

# Single-character substitution table applied after the tokens are expanded.
_OBJURL_CHARS = {
    'w': 'a', 'k': 'b', 'v': 'c', '1': 'd', 'j': 'e', 'u': 'f', '2': 'g',
    'i': 'h', 't': 'i', '3': 'j', 'h': 'k', 's': 'l', '4': 'm', 'g': 'n',
    '5': 'o', 'r': 'p', 'q': 'q', '6': 'r', 'f': 's', 'p': 't', '7': 'u',
    'e': 'v', 'o': 'w', '8': '1', 'd': '2', 'n': '3', '9': '4', 'c': '5',
    'm': '6', '0': '7', 'b': '8', 'l': '9', 'a': '0',
}


def _decodeObjUrl(imgUrl):
    """Turn one obfuscated ``objURL`` string into a plain URL."""
    # Tokens must be expanded first: they contain characters ('2', 'q', ...)
    # that the per-character table would otherwise rewrite.
    for token, plain in _OBJURL_TOKENS:
        imgUrl = imgUrl.replace(token, plain)
    return ''.join(_OBJURL_CHARS.get(ch, ch) for ch in imgUrl)


def imgUrls(keyWord, userAgent, pn):
    """Fetch one page of Baidu image-search results and decode the image URLs.

    Args:
        keyWord: search keyword, sent as both ``queryWord`` and ``word``.
        userAgent: value for the User-Agent request header.
        pn: result offset passed to the search endpoint.

    Returns:
        Always a dict. On success ``{'result': [url, ...], 'status': True}``.
        On any failure (network error, non-200 response, unparsable or
        unexpected JSON) ``{'result': 'wuyunlunbi', 'status': False}``; the
        string is the failure marker that callers look for.
    """
    failure = {'result': 'wuyunlunbi', 'status': False}
    url = 'https://image.baidu.com/search/index'
    params = {'tn': 'resultjson_com', 'ipn': 'rj', 'ct': '201326592', 'is': '', 'fp': 'result', 'queryWord': keyWord, 'cl': '2', 'lm': '-1', 'ie': 'utf-8', 'oe': 'utf-8', 'adpicid': '', 'st': '-1', 'z': '', 'ic': '0', 'word': keyWord, 's': '', 'se': '', 'tab': '', 'width': '', 'height': '', 'face': '0', 'istype': '2', 'qc': '', 'nc': '1', 'fr': '', 'pn': pn, 'rn': 200, 'gsm': '1e', '1491808945838': '' }
    try:
        rep = requests.get(url, headers={'user-Agent': userAgent},
                           params=params, timeout=(4, 7))
    except requests.RequestException:
        # Timeouts and connection errors are reported like any other failed
        # page instead of killing the calling thread.
        return failure
    if int(rep.status_code) != 200:
        # Previously this path fell through and returned None.
        return failure
    # Pause between successful requests to throttle the crawl.
    time.sleep(1)
    try:
        entries = rep.json()['data']
        # Entries without an objURL are skipped. The original code dropped the
        # last entry unconditionally, presumably because the list ends with an
        # empty placeholder - TODO confirm against a live response.
        decoded = [_decodeObjUrl(entry['objURL'])
                   for entry in entries if entry.get('objURL')]
    except (ValueError, KeyError, TypeError, AttributeError):
        # Body was not JSON, or did not have the expected shape.
        return failure
    return {'result': decoded, 'status': True}
def img(keyWord, userAgent, saveDir=r'D:\yyyyy5'):
    """Download the images found for one keyword into its own directory.

    Collects the decoded URLs of 100 result pages via ``imgUrls``, creates
    ``<saveDir>/<keyWord>`` if needed, then downloads every URL into that
    directory as ``<timestamp>.jpg``. Failures are printed and skipped so
    that one bad page or image does not abort the whole keyword.

    Args:
        keyWord: search keyword; also used as the sub-directory name.
        userAgent: User-Agent header value forwarded to ``imgUrls``.
        saveDir: root output directory (defaults to the original fixed path).
    """
    links = []
    for page in range(100):
        try:
            found = imgUrls(keyWord, userAgent, pn=page * 20)
        except requests.RequestException as e:
            print('error_%s' % e)
            continue
        # Tolerate both a missing return value and an explicit failure result.
        if not found or not found.get('status'):
            print('error_%s' % 'wuyunlunbi')
            continue
        links.extend(found['result'])
    # NOTE(review): imgUrls asks for 200 results per request while the offset
    # advances by 20, so pages may overlap and yield duplicates - confirm.
    print(len(links))
    print(links)

    # Create the storage directory. An existing directory (re-run, duplicate
    # keyword, or another thread creating the shared root) is not an error.
    targetDir = os.path.join(saveDir, '%s' % keyWord)
    try:
        os.makedirs(targetDir)
    except OSError:
        if not os.path.isdir(targetDir):
            raise

    for index, link in enumerate(links):
        print(index)
        link = link.replace(" ", "")
        time.sleep(0.15)
        if "yuan_" in link:
            print("error001_%s" % link)
            continue
        try:
            reply = requests.get('%s' % link, timeout=(3, 4))
        except requests.RequestException as e:
            print('error_%s (%s)' % (link, e))
            continue
        if reply.status_code != 200:
            # Do not store error pages as if they were images.
            print('error_%s (HTTP %s)' % (link, reply.status_code))
            continue
        stamp = time.time()
        time.sleep(0.15)
        # Write the image into the target directory.
        target = os.path.join(targetDir, '%s.jpg' % stamp)
        try:
            with open(target, 'wb') as f:
                f.write(reply.content)
        except (IOError, OSError) as e:
            print('error_%s (%s)' % (target, e))
if __name__ == '__main__':
    # Workbook holding the keywords: put them in the first column of the
    # first sheet.
    workbook = xlrd.open_workbook(r'C:\Users\Administrator\Desktop\Ashicai (2).xlsx')
    keywords = workbook.sheet_by_index(0).col_values(0)
    ua = UserAgent()
    # Start time of the most recently launched thread; stays None when the
    # column holds no keywords (the original raised NameError in that case).
    lastStart = None
    for keyword in keywords:
        if keyword == '':
            # Blank cell: nothing to search for.
            continue
        lastStart = time.time()
        # One worker thread per keyword, each with its own random User-Agent.
        threading.Thread(target=img, args=(keyword, ua.random)).start()
        time.sleep(0.05)
    print(lastStart)