# 查看了一些别人写的代码,照着大体的模板,写了一个自己的版本,亲测可用。
# 输入:一个文本,关键词按行分隔。
# 特点:一类别一文件夹,可使用自定义多线程下载,可自定义下载图片数目上限。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import requests
import os
from lxml import etree
import json
import threading
import ctypes
import inspect
import io
import sys
# Re-wrap stdout with an explicit encoding so Chinese log text prints
# without UnicodeEncodeError on GBK-family Windows consoles.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
# Default HTTP headers sent with every image request.
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
    # Bug fix: 'keep - alive' (with spaces) is not a valid token for the
    # Connection header; the correct value is 'keep-alive'.
    'Connection': 'keep-alive',
    # NOTE(review): content-type is a request-body header and is
    # meaningless on the image GETs below; kept for compatibility.
    'content-type': 'application/json'
}
class myThread(threading.Thread):
    """Worker thread that downloads one slice of the collected image URLs.

    Each worker receives parallel sub-lists of URLs, display names and
    numeric ids, and saves every image to ``<path>/<id>.jpg``.
    """

    def __init__(self, threadID, urls_list, names_list, id_list, path):
        """
        Args:
            threadID: label used only for progress logging.
            urls_list: image URLs this thread should fetch.
            names_list: display names, parallel to urls_list (kept for
                reference; not used by run()).
            id_list: numeric ids used as output file names, parallel to
                urls_list.
            path: existing directory the .jpg files are written into.
        """
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.urls_list = urls_list
        self.names_list = names_list
        self.id_list = id_list
        self.path = path

    def _async_raise(self, tid, exctype):
        """Asynchronously raise *exctype* inside the thread with id *tid*."""
        tid = ctypes.c_long(tid)
        if not inspect.isclass(exctype):
            exctype = type(exctype)
        res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
            tid, ctypes.py_object(exctype))
        if res == 0:
            raise ValueError("invalid thread id")
        elif res != 1:
            # More than one thread was affected: revert with exc=NULL,
            # otherwise the interpreter is left in an inconsistent state.
            ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
            raise SystemError("PyThreadState_SetAsyncExc failed")

    def stop_thread(self):
        """Force this thread to exit by injecting SystemExit into it."""
        self._async_raise(self.ident, SystemExit)

    def run(self):
        """Download every assigned URL; failures are logged and skipped."""
        print("线程%s开始" % self.threadID)
        for i in range(len(self.urls_list)):
            try:
                ir = requests.get(
                    self.urls_list[i], headers=headers, timeout=10)
                # Bug fix: without this, HTTP error pages (404/403 HTML)
                # were silently saved as .jpg files.
                ir.raise_for_status()
                # Bug fix: 'with' closes the handle even when the write
                # fails; the original leaked the file object.
                with open(self.path + '/%d.jpg' % (self.id_list[i]),
                          'wb') as img_file:
                    img_file.write(ir.content)
                print("download picture id: %d success" % self.id_list[i])
            except Exception as ex2:
                print('download error!!' + str(ex2))
                continue
        print("线程%s退出" % self.threadID)
class Crawler():
    """Crawl Google image search for one keyword and download the results.

    Scrolls the result page until the module-level ``download_count`` image
    URLs are collected (or no more results load), writes an ``<id> <name>``
    index file, then downloads the images with ``self.thread_count`` worker
    threads (``myThread``).
    """

    def __init__(self, query, path, thread_count,
                 base_url_part1='https://www.google.com/search?q=',
                 base_url_part2='&source=lnms&tbm=isch'):
        """
        Args:
            query: search keyword.
            path: directory where images and the index file are stored.
            thread_count: number of download threads for this query.
            base_url_part1, base_url_part2: fixed pieces of the image
                search URL (overridable for testing).
        """
        # Bug fix: the original built the URL from the *global*
        # ``search_query`` instead of the ``query`` argument, so a
        # Crawler constructed with an explicit query could get the
        # wrong URL.
        self.url = base_url_part1 + query + base_url_part2
        self.search_query = query
        self.path = path
        self.thread_count = thread_count

    def start_brower(self):
        """Launch Chrome, maximize the window and open the search URL."""
        chrome_options = Options()
        chrome_options.add_argument("--disable-infobars")
        # chrome_options.add_argument('--headless')
        # Path to the ChromeDriver executable.
        executable_path = "C:/Anaconda3/chromedriver.exe"
        driver = webdriver.Chrome(
            chrome_options=chrome_options, executable_path=executable_path)
        # Maximize the window: only images inside the viewport are loaded
        # on each scrape pass.
        driver.maximize_window()
        driver.get(self.url)
        return driver

    def downloadImg(self, driver):
        """Collect image URLs by scrolling, then download them in threads."""
        end = False
        urls_list = []
        names_list = []
        while True:
            # Re-parse the whole page each pass; the lists are rebuilt
            # from scratch so they reflect everything loaded so far.
            html = etree.HTML(driver.page_source)
            pictures = html.xpath('//*[@id="rg_s"]/div')
            urls_list = []
            names_list = []
            for picture in pictures:
                url = picture.xpath('./div/text()')
                if url != []:
                    # The metadata div holds a JSON blob; key "ou" is the
                    # original (full-size) image URL.
                    raw_data_dict = json.loads(str(url[0]))
                    urls_list.append(raw_data_dict["ou"])
                    name = picture.xpath(
                        './a[2]/div[@class="mVDMnf nJGrxf"]/text()')
                    names_list.append(str(name[0]))
                # ``download_count`` is a module-level setting.
                if len(names_list) >= download_count:
                    urls_list = urls_list[:download_count]
                    names_list = names_list[:download_count]
                    break
            # Bug fix: the original only stopped when the "show more"
            # button vanished, so it kept scrolling forever even after
            # download_count images had been collected.
            if len(names_list) >= download_count or end is True:
                break
            # Scroll down in large jumps so more thumbnails load.
            for step in range(5):
                js = "document.documentElement.scrollTop=%d" % (step * 50000)
                driver.execute_script(js)
                time.sleep(1)
            try:
                driver.find_element_by_xpath(
                    "./*//input[@value='显示更多结果']").click()
            except Exception:
                # No "show more results" button left: the page is
                # exhausted; do one final collection pass, then stop.
                end = True
                continue
            time.sleep(1)
        length = len(names_list)
        id_list = list(range(length))
        # Bug fix: 'with' closes the index file even on error; the
        # original leaked the handle.
        with open(self.path + '/' + self.search_query + '.txt',
                  'w+', encoding='utf-8') as index_file:
            for i in id_list:
                index_file.write(str(i) + ' ' + names_list[i] + '\n')
        time.sleep(10)
        # Fan the downloads out over self.thread_count workers.
        # Bug fix: the original read the *global* thread_count, ignoring
        # the value passed to the constructor, and added a stray +1 that
        # skewed the per-thread slice boundaries.
        thread_list = []
        next_start = 0
        for i in range(self.thread_count):
            start_id = next_start
            # Thread i ends at the rounded (i+1)/thread_count fraction of
            # the list, giving contiguous, near-even slices.
            end_id = int(float(length) / self.thread_count * (i + 1))
            next_start = end_id
            thread_list.append(
                myThread(i, urls_list[start_id:end_id],
                         names_list[start_id:end_id],
                         id_list[start_id:end_id], self.path))
            thread_list[i].start()
        for worker in thread_list:
            worker.join()

    def run(self):
        """Crawl, download, then shut the browser down."""
        driver = self.start_brower()
        try:
            self.downloadImg(driver)
        finally:
            # Bug fix: quit() (unlike close()) also terminates the
            # chromedriver process, so it is not leaked on error.
            driver.quit()
        print("{} download has finished.".format(self.search_query))
if __name__ == '__main__':
    start = time.time()
    # base_url_part1 / base_url_part2 are the fixed pieces of the Google
    # image-search URL; they are module globals read elsewhere.
    base_url_part1 = 'https://www.google.com/search?q='
    base_url_part2 = '&source=lnms&tbm=isch'
    # Per-keyword cap on the number of images to download.
    download_count = 2000
    # One search keyword per line.  Bug fix: open read-only ('r', not
    # 'r+') and use 'with' so the handle is closed; the original leaked it.
    with open('search_imgs.txt', 'r') as file_read:
        search_list = file_read.readlines()
    totalPath = 'F:/张晋豪资料包/人工智能/视频分析资料/正式工作/爬虫/google_picture/picture/downloads2/'
    # Crawl each keyword in turn, one output folder per keyword.
    for search_query in search_list:
        search_query = search_query.strip()
        if not search_query:
            # Skip blank lines in the keyword file instead of crawling
            # an empty query into the root folder.
            continue
        thread_count = 200  # download threads per keyword
        path = os.path.join(totalPath, search_query)
        try:
            # makedirs creates intermediate directories as needed and is
            # a no-op when the target already exists.
            os.makedirs(path, exist_ok=True)
            time.sleep(1)
        except Exception as e:
            print(e)
        craw = Crawler(
            query=search_query, path=path, thread_count=thread_count)
        craw.run()
    end = time.time()
    print('all have been downloaded.')
    print('total cost time %d' % (end - start))