一個爬取谷歌圖片的python程序

查看了一些別人寫的代碼,照着大體的模板,寫了一個自己的版本,親測可用。

輸入:一個文本,關鍵詞斷行分隔。

特點:一類別一文件夾,可使用自定義多線程下載,可自定義下載圖片數目上限。

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import requests
import os
from lxml import etree
import json
import threading
import ctypes
import inspect

import io
import sys
# Re-wrap stdout with the GB18030 codec so the Chinese log messages below
# print correctly on consoles whose default encoding cannot represent them
# (presumably a Windows cmd console — confirm on the target machine).
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

# Default HTTP headers for the image downloads; the User-Agent mimics a
# desktop Chrome browser so hosts serve the full-size image.
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
    # BUG FIX: 'keep - alive' (with spaces) is not a valid token for the
    # Connection header; the correct value is 'keep-alive'.
    'Connection': 'keep-alive',
    'content-type': 'application/json'
}


class myThread(threading.Thread):
    """Worker thread that downloads one slice of the image URL list.

    Each image is written to ``<path>/<id>.jpg`` where the id comes from
    ``id_list`` (parallel to ``urls_list``); a failed download is logged
    and skipped so one bad URL does not kill the worker.
    """

    def __init__(self, threadID, urls_list, names_list, id_list, path):
        threading.Thread.__init__(self)
        self.threadID = threadID      # worker index, used only for logging
        self.urls_list = urls_list    # image URLs this worker handles
        self.names_list = names_list  # display names, parallel to urls_list
        self.id_list = id_list        # numeric ids -> output file names
        self.path = path              # destination directory

    def _async_raise(self, tid, exctype):
        """Asynchronously raise *exctype* in the thread with id *tid*."""
        tid = ctypes.c_long(tid)
        if not inspect.isclass(exctype):
            exctype = type(exctype)
        res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
            tid, ctypes.py_object(exctype))
        if res == 0:
            raise ValueError("invalid thread id")
        elif res != 1:
            # More than one thread state was modified: revert with exc=NULL
            # as the CPython C-API documentation requires.
            ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
            raise SystemError("PyThreadState_SetAsyncExc failed")

    def stop_thread(self):
        """Force this thread to exit by injecting SystemExit into it."""
        self._async_raise(self.ident, SystemExit)

    def run(self):
        print("線程%s開始" % self.threadID)
        for i in range(len(self.urls_list)):
            try:
                ir = requests.get(
                    self.urls_list[i], headers=headers, timeout=10)
                # BUG FIX: use a context manager so the file handle is
                # closed even if write() raises (original leaked handles).
                with open(self.path + '/%d.jpg' % (self.id_list[i]),
                          'wb') as img_file:
                    img_file.write(ir.content)
                # BUG FIX: corrected "sucess" typo in the log message.
                print("download picture id: %d success" % self.id_list[i])
            except Exception as ex2:
                # Best-effort: log and move on to the next URL.
                print('download error!!' + str(ex2))
                continue
        print("線程%s退出" % self.threadID)


class Crawler():
    """Scrapes Google Images for one query with a Chrome browser and
    downloads up to the module-level ``download_count`` results using
    ``thread_count`` worker threads."""

    def __init__(self, query, path, thread_count):
        # BUG FIX: the original built the URL from the *global*
        # ``search_query`` instead of the ``query`` parameter, silently
        # ignoring the constructor argument.
        self.url = base_url_part1 + query + base_url_part2
        self.search_query = query
        self.path = path
        self.thread_count = thread_count

    # Launch the Chrome browser driver.
    def start_brower(self):
        """Start Chrome, maximize it, and open the search-result page."""
        chrome_options = Options()
        chrome_options.add_argument("--disable-infobars")
        # chrome_options.add_argument('--headless')
        # Path to the ChromeDriver binary — adjust per machine.
        executable_path = "C:/Anaconda3/chromedriver.exe"
        driver = webdriver.Chrome(
            chrome_options=chrome_options, executable_path=executable_path)
        # Maximize the window: only images inside the viewport are
        # lazy-loaded on each scrape pass.
        driver.maximize_window()
        driver.get(self.url)
        return driver

    def downloadImg(self, driver):
        """Scroll the results page until enough image URLs are collected,
        write an "id name" index file, then download in worker threads."""
        end = False
        while True:
            html_page = driver.page_source
            html = etree.HTML(html_page)
            pictures = html.xpath('//*[@id="rg_s"]/div')
            urls_list = []
            names_list = []
            for picture in pictures:
                url = picture.xpath('./div/text()')
                if url != []:
                    # The div text is a JSON blob; key "ou" holds the
                    # original (full-size) image URL.
                    raw_data = str(url[0])
                    raw_data_dict = json.loads(raw_data)
                    urls_list.append(raw_data_dict["ou"])
                    name = picture.xpath(
                        './a[2]/div[@class="mVDMnf nJGrxf"]/text()')
                    names_list.append(str(name[0]))
            # Stop once enough images have been collected.
            if len(names_list) >= download_count:
                urls_list = urls_list[:download_count]
                names_list = names_list[:download_count]
                break
            if end is True:
                break
            # Scroll down in big steps to trigger lazy loading.
            for i in range(5):
                pos = i * 50000
                js = "document.documentElement.scrollTop=%d" % pos
                driver.execute_script(js)
                time.sleep(1)
            try:
                driver.find_element_by_xpath(
                    "./*//input[@value='顯示更多結果']").click()
            # BUG FIX: narrowed the bare ``except:`` (which also swallowed
            # KeyboardInterrupt/SystemExit) to ``except Exception``.
            except Exception:
                # No "show more results" button left: final pass, then stop.
                end = True
                continue
            time.sleep(1)
        # Write "id name" lines; the id doubles as the jpg file name.
        # BUG FIX: context manager guarantees the file is closed even if a
        # write fails (original only closed on the happy path).
        with open(
                self.path + '/' + self.search_query + '.txt',
                'w+',
                encoding='utf-8') as file_write:
            length = len(names_list)
            id_list = [i for i in range(length)]
            for i in id_list:
                file_write.write(str(i) + ' ' + names_list[i] + '\n')
        time.sleep(10)
        # Partition the URL list into contiguous slices, one per worker.
        # BUG FIX: the original read the *global* ``thread_count`` here,
        # ignoring the value passed to the constructor.
        thread_list = []
        next_start = 0
        for i in range(self.thread_count):
            start_id = next_start
            # +1 guards against float rounding dropping the last element.
            end_id = int(float(length) / self.thread_count * (i + 1))
            end_id += 1
            next_start = end_id
            thread_list.append(
                myThread(i, urls_list[start_id:end_id],
                         names_list[start_id:end_id], id_list[start_id:end_id],
                         self.path))
            thread_list[i].start()
        for i in range(self.thread_count):
            thread_list[i].join()

    def run(self):
        """Full pipeline: open browser, scrape and download, close browser."""
        driver = self.start_brower()
        self.downloadImg(driver)
        driver.close()
        print("{} download has finished.".format(self.search_query))


if __name__ == '__main__':
    start = time.time()
    # Fixed parts of the Google Images search URL; no need to change.
    base_url_part1 = 'https://www.google.com/search?q='
    base_url_part2 = '&source=lnms&tbm=isch'
    # Maximum number of images to download per keyword.
    download_count = 2000
    # Keyword file: one search query per line.
    # BUG FIX: open read-only ('r', not 'r+' — the file is never written)
    # and close it via a context manager (original leaked the handle).
    with open('search_imgs.txt', 'r') as file_read:
        search_list = file_read.readlines()
    totalPath = 'F:/張晉豪資料包/人工智能/視頻分析資料/正式工作/爬蟲/google_picture/picture/downloads2/'
    # One download pass per keyword; each keyword gets its own folder.
    for search_query in search_list:
        search_query = search_query.strip()
        if not search_query:
            # BUG FIX: skip blank lines, which would otherwise make
            # os.mkdir/Crawler run on an empty keyword.
            continue
        thread_count = 200  # download threads per keyword
        path = os.path.join(totalPath, search_query)
        try:
            if not os.path.exists(path):
                os.mkdir(path)
                time.sleep(1)
        except Exception as e:
            print(e)
        craw = Crawler(
            query=search_query, path=path, thread_count=thread_count)
        craw.run()
    end = time.time()
    print('all have been downloaded.')
    print('total cost time %d' % (end - start))

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章