一个爬取谷歌图片的python程序

查看了一些别人写的代码,照着大体的模板,写了一个自己的版本,亲测可用。

输入:一个文本文件,关键词按行分隔(每行一个关键词)。

特点:一类别一文件夹,可使用自定义多线程下载,可自定义下载图片数目上限。

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import requests
import os
from lxml import etree
import json
import threading
import ctypes
import inspect

import io
import sys
# Re-wrap stdout with the GB18030 codec so printing Chinese log messages
# does not raise UnicodeEncodeError on a Chinese-codepage Windows console.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

# Default HTTP headers for the image downloads.
# Bug fix: the Connection value was the malformed string 'keep - alive';
# the correct HTTP token is 'keep-alive'.
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
    'Connection': 'keep-alive',
    'content-type': 'application/json'
}


class myThread(threading.Thread):
    """Worker thread that downloads a slice of image URLs to disk.

    Each worker receives parallel slices of URLs, display names and numeric
    ids, and saves every successfully fetched image as ``<path>/<id>.jpg``.
    A failed download is logged and skipped; it never aborts the thread.
    """

    def __init__(self, threadID, urls_list, names_list, id_list, path):
        threading.Thread.__init__(self)
        self.threadID = threadID      # worker index, used only for logging
        self.urls_list = urls_list    # image URLs this worker should fetch
        self.names_list = names_list  # display names parallel to urls_list
        self.id_list = id_list        # numeric file ids parallel to urls_list
        self.path = path              # destination directory

    def _async_raise(self, tid, exctype):
        """Asynchronously raise ``exctype`` in the thread with id ``tid``.

        Raises:
            ValueError: if ``tid`` does not name a live thread.
            SystemError: if the interpreter reports more than one affected
                thread (the call is then reverted with exc=NULL).
        """
        tid = ctypes.c_long(tid)
        if not inspect.isclass(exctype):
            exctype = type(exctype)
        res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
            tid, ctypes.py_object(exctype))
        if res == 0:
            raise ValueError("invalid thread id")
        elif res != 1:
            # More than one thread affected: revert by calling again with
            # exc=NULL, then report the inconsistency.
            ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
            raise SystemError("PyThreadState_SetAsyncExc failed")

    def stop_thread(self):
        """Forcibly stop this thread by injecting SystemExit into it."""
        self._async_raise(self.ident, SystemExit)

    def run(self):
        print("线程%s开始" % self.threadID)
        for i in range(len(self.urls_list)):
            # Download one image; on any error, log and move on.
            try:
                ir = requests.get(
                    self.urls_list[i], headers=headers, timeout=10)
                # Bug fix: use a context manager so the file handle is always
                # closed (the original leaked it via open(...).write(...)).
                with open(self.path + '/%d.jpg' % (self.id_list[i]),
                          'wb') as img_file:
                    img_file.write(ir.content)
                print("download picture id: %d sucess" % self.id_list[i])
            except Exception as ex2:
                print('download error!!' + str(ex2))
                continue
        print("线程%s退出" % self.threadID)


class Crawler():
    """Crawl Google Images for one query and download results in parallel.

    The URL parts and the per-query download cap are now keyword parameters
    with defaults equal to the values the script defines at module level, so
    existing calls (``Crawler(query=..., path=..., thread_count=...)``)
    behave exactly as before.
    """

    def __init__(self, query, path, thread_count,
                 base_url_part1='https://www.google.com/search?q=',
                 base_url_part2='&source=lnms&tbm=isch',
                 download_count=2000):
        self.search_query = query
        # Bug fix: the original built the URL from the *global* search_query
        # variable; use the query argument instead.
        self.url = base_url_part1 + query + base_url_part2
        self.path = path                      # output directory for this query
        self.thread_count = thread_count      # number of download workers
        self.download_count = download_count  # max images for this query

    # Launch the Chrome browser driver (method name misspelling kept so
    # existing callers keep working).
    def start_brower(self):
        """Start Chrome, maximize the window and open the search page."""
        chrome_options = Options()
        chrome_options.add_argument("--disable-infobars")
        # chrome_options.add_argument('--headless')
        # Path to the chromedriver binary.
        executable_path = "C:/Anaconda3/chromedriver.exe"
        driver = webdriver.Chrome(
            chrome_options=chrome_options, executable_path=executable_path)
        # Maximize the window: each scroll pass only loads images that are
        # inside the viewport.
        driver.maximize_window()
        driver.get(self.url)
        return driver

    def downloadImg(self, driver):
        """Scroll the results page, collect image URLs, then download.

        Scrolls until either ``self.download_count`` thumbnails have been
        parsed or the "show more results" button disappears, writes an
        ``id name`` index file, then downloads with ``self.thread_count``
        worker threads.
        """
        end = False
        while True:
            html = etree.HTML(driver.page_source)
            pictures = html.xpath('//*[@id="rg_s"]/div')
            urls_list = []
            names_list = []
            for picture in pictures:
                url = picture.xpath('./div/text()')
                if url != []:
                    # Each thumbnail carries a JSON payload; key "ou" holds
                    # the original (full-size) image URL.
                    raw_data_dict = json.loads(str(url[0]))
                    urls_list.append(raw_data_dict["ou"])
                    name = picture.xpath(
                        './a[2]/div[@class="mVDMnf nJGrxf"]/text()')
                    names_list.append(str(name[0]))
            # Stop once enough thumbnails have been collected.
            # Bug fix: honour the stored per-instance cap instead of the
            # global download_count.
            if len(names_list) >= self.download_count:
                urls_list = urls_list[:self.download_count]
                names_list = names_list[:self.download_count]
                break
            if end is True:
                break
            # Scroll down in large steps so more thumbnails get lazy-loaded.
            for i in range(5):
                js = "document.documentElement.scrollTop=%d" % (i * 50000)
                driver.execute_script(js)
                time.sleep(1)
            try:
                driver.find_element_by_xpath(
                    "./*//input[@value='显示更多结果']").click()
            except Exception:
                # No "show more results" button left: do one final parse
                # pass, then stop.
                end = True
                continue
            time.sleep(1)
        # Write the "id name" index file; the with-statement guarantees the
        # handle is closed even if a write fails.
        with open(self.path + '/' + self.search_query + '.txt', 'w+',
                  encoding='utf-8') as file_write:
            for i, pic_name in enumerate(names_list):
                file_write.write(str(i) + ' ' + pic_name + '\n')
        length = len(names_list)
        id_list = list(range(length))
        time.sleep(10)
        # Partition the work into self.thread_count contiguous slices.
        # Bug fix: use the thread_count stored in __init__ rather than the
        # module-level global of the same name.
        thread_list = []
        next_start = 0
        for i in range(self.thread_count):
            start_id = next_start
            end_id = int(float(length) / self.thread_count * (i + 1)) + 1
            next_start = end_id
            thread_list.append(
                myThread(i, urls_list[start_id:end_id],
                         names_list[start_id:end_id],
                         id_list[start_id:end_id], self.path))
            thread_list[i].start()
        for worker in thread_list:
            worker.join()

    def run(self):
        """Open the browser, crawl one query, then shut the driver down."""
        driver = self.start_brower()
        self.downloadImg(driver)
        # Bug fix: quit() (not close()) terminates the chromedriver process;
        # close() only closes the window and leaks the driver.
        driver.quit()
        print("{} download has finished.".format(self.search_query))


if __name__ == '__main__':
    start = time.time()
    # Fixed parts of the Google Images search URL.
    base_url_part1 = 'https://www.google.com/search?q='
    base_url_part2 = '&source=lnms&tbm=isch'
    # Per-keyword cap on the number of downloaded images.
    download_count = 2000
    # One keyword per line. Bug fix: 'r' is sufficient (the original opened
    # with 'r+') and the with-statement closes the leaked handle.
    with open('search_imgs.txt', 'r') as file_read:
        search_list = file_read.readlines()
    totalPath = 'F:/张晋豪资料包/人工智能/视频分析资料/正式工作/爬虫/google_picture/picture/downloads2/'
    # Crawl each keyword in turn, one output folder per keyword.
    for search_query in search_list:
        search_query = search_query.strip()
        thread_count = 200  # download threads per keyword
        path = os.path.join(totalPath, search_query)
        try:
            if not os.path.exists(path):
                # makedirs also creates missing parent directories.
                os.makedirs(path, exist_ok=True)
                time.sleep(1)
        except Exception as e:
            print(e)
        craw = Crawler(
            query=search_query, path=path, thread_count=thread_count)
        craw.run()
    end = time.time()
    print('all have been downloaded.')
    print('total cost time %d' % (end - start))

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章