使用requests爬取廣西人才網職位信息(多線程)入庫mongodb

requests_thread_gxrc_com.py
#!/usr/bin/env python3
# coding=utf-8
# Version:python3.6.1
# File:gxrc_com.py
# Author:LGSP_Harold
import pymongo
import requests
from multiprocessing import Queue
from lxml import etree
import threading

from handle_mongo import MongoClient


# 處理頁碼類
class CrawlPage(threading.Thread):
    # 重寫父類(子類CrawPage需要添加新的參數時,需要重寫父類),thread_name, page_queue, data_queue是子類使用,不需要傳給父類
    def __init__(self, thread_name, page_queue, data_queue, *args, **kwargs):
        # super自動找到父類(threading.Thread),幫助調用父類方法__init__()
        super(CrawlPage, self).__init__(*args, **kwargs)
        # 線程的名稱
        self.thread_name = thread_name
        # 頁碼的隊列
        self.page_queue = page_queue
        # 數據的隊列
        self.data_queue = data_queue
        # 默認請求頭
        self.headers = {
            'Accept': 'text / html, application / xhtml + xml, application / xml; q = 0.9, image / webp, image / apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'DNT': '1',
            'Host': 's.gxrc.com',
            'Pragma': 'no-cache',
            'Referer': 'https://www.gxrc.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        }

    def run(self) -> None:
        print('當前啓動處理頁碼的任務線程爲:%s' % self.thread_name)
        while not page_flag:
            # Queue隊列去put或者get的時候,需要設置block
            # block默認爲True,需設置成False
            # 當前隊列沒有數據或隊列滿了,將會拋出異常(empty,full)
            try:
                # 通過get方法,將隊列裏面的頁碼get出來。
                # block默認爲True,當隊列爲空時,會一直等待,或設置timeout等待時間
                # 將block設置爲False,當隊列爲空,會報異常,通過捕獲來處理
                page = self.page_queue.get(block=False)
                page_url = 'https://s.gxrc.com/sJob?schType=1&page=' + str(page)
                print('當前構造的url爲%s' % page_url)
                # 配置動態代理
                # proxy = {
                #     'http': 'http://xxxxxx:[email protected]:9999',
                #     'https': 'http://xxxxxx:[email protected]:9999'
                # }

                # 通過requests方法請求構造的url
                # res = requests.get(url=page_url, headers=self.headers, proxies=proxy)
                res = requests.get(url=page_url, headers=self.headers)
                # 設置網頁編碼
                res.encoding = 'utf-8'
                # 將請求回來的數據放到數據隊列裏面去
                self.data_queue.put(res.text)
            except Exception as e:
                pass


# 處理網頁文本數據類
# Parser worker thread: pulls raw HTML off data_queue, extracts job postings
# with XPath, and writes each batch to MongoDB under a shared lock.
class CrawlHtml(threading.Thread):
    def __init__(self, thread_name, data_queue, lock, db, collections, *args, **kwargs):
        """
        :param thread_name: human-readable name used in console output
        :param data_queue:  queue of raw HTML text produced by CrawlPage
        :param lock:        threading.Lock serialising MongoDB writes
        :param db:          MongoDB database name
        :param collections: MongoDB collection name
        Extra positional/keyword args are forwarded to threading.Thread.
        """
        super(CrawlHtml, self).__init__(*args, **kwargs)
        self.thread_name = thread_name
        self.data_queue = data_queue
        self.lock = lock
        self.db = db
        self.collections = collections

    def parse(self, text):
        """Parse one listing page and return a list of job dicts."""
        html = etree.HTML(text)
        # Each <ul class="posDetailUL clearfix"> is one job posting row.
        items = html.xpath('//div[@class="rlOne"]/ul[@class="posDetailUL clearfix"]')
        data_list = []
        for item in items:
            data = {}
            data['job_name'] = item.xpath('.//a[@class="posName"]/text()')[0]
            data['company_name'] = item.xpath('.//a[@class="entName"]/text()')[0]
            # Address and salary are optional fields on the site; fall back
            # to placeholder values when the node is missing.
            try:
                data['company_address'] = item.xpath('.//li[@class="w4"]/text()')[0]
            except IndexError:
                data['company_address'] = '未知'
            try:
                data['money'] = item.xpath('.//li[@class="w3"]/text()')[0]
            except IndexError:
                data['money'] = '面議'
            data['date'] = item.xpath('.//li[@class="w5"]/text()')[0]
            data_list.append(data)
        return data_list

    def run(self) -> None:
        print('當前啓動處理數據任務線程爲:%s' % self.thread_name)
        # Loop until main() flips the global data_flag after the data queue
        # has drained.
        while not data_flag:
            try:
                # Non-blocking get: raises queue.Empty when the queue is
                # (momentarily) empty, so we can re-check data_flag.
                text = self.data_queue.get(block=False)
            except Exception:
                continue
            try:
                result = self.parse(text)
                # Serialise writes; a new MongoClient is opened per batch
                # (matches handle_mongo's current one-shot design).
                with self.lock:
                    insert_data = MongoClient(self.db, self.collections)
                    insert_data.insert_db(result)
            except Exception as e:
                # FIX: the original silently swallowed parse/insert errors
                # together with empty-queue conditions. Log real failures.
                print('%s parse/insert failed: %s' % (self.thread_name, e))


# Global stop flags for the worker threads. main() flips each one to True
# once the corresponding queue has drained, which ends the workers' loops.
page_flag = False
data_flag = False


def main(pages=503):
    """Crawl the first `pages` listing pages with three fetch threads and
    three parse threads, storing parsed postings in MongoDB.

    :param pages: number of listing pages to enqueue (default 503, matching
                  the original hard-coded range).
    """
    # Local import: only used for the polling sleeps below.
    import time

    # Queues holding page numbers to fetch and fetched HTML text.
    page_queue = Queue()
    data_queue = Queue()

    # Lock serialising MongoDB writes across parser threads.
    lock = threading.Lock()

    # Enqueue every page number.
    for page in range(1, pages + 1):
        page_queue.put(page)

    # qsize() reports the current queue length.
    print('當前頁碼隊列的總量爲%s' % page_queue.qsize())

    # Start three page-fetching threads.
    crawl_page_list = ['頁碼線程1', '頁碼線程2', '頁碼線程3']
    page_thread_list = []
    for thread_name_page in crawl_page_list:
        thread_page = CrawlPage(thread_name_page, page_queue, data_queue)
        thread_page.start()
        page_thread_list.append(thread_page)

    # Start three parser threads.
    parse_list = ['文本線程1', '文本線程2', '文本線程3']
    parse_thread_list = []

    db = 'db_gxrc'
    collections = 'collections_gxrc'
    for thread_name_parse in parse_list:
        thread_parse = CrawlHtml(thread_name_parse, data_queue, lock, db, collections)
        thread_parse.start()
        parse_thread_list.append(thread_parse)

    # Shutdown sequence, stage 1: wait for the page queue to drain, then
    # signal the page threads to exit and join them.
    global page_flag
    while not page_queue.empty():
        # FIX: the original busy-waited with `pass`, pinning a CPU core.
        time.sleep(0.1)
    page_flag = True

    for thread_page_join in page_thread_list:
        thread_page_join.join()
        print(thread_page_join.thread_name, '處理結束')

    # Stage 2: all pages are fetched, so once the data queue drains no more
    # HTML will arrive; signal the parser threads and join them.
    global data_flag
    while not data_queue.empty():
        time.sleep(0.1)
    data_flag = True

    for thread_data_join in parse_thread_list:
        thread_data_join.join()
        print(thread_data_join.thread_name, '處理結束')


if __name__ == '__main__':
    # Script entry point.
    main()

 handle_mongo.py

#!/usr/bin/env python3
# coding=utf-8
# Version:python3.6.1
# File:handle_mongo.py
# Author:LGSP_Harold
import pymongo


class MongoClient:
    """Thin wrapper that opens a MongoDB connection and exposes a bulk insert.

    A new connection is opened per instance; callers that insert frequently
    should reuse one instance rather than constructing one per batch.

    NOTE(review): credentials are hard-coded in the connection URI — move
    them to configuration or environment variables.
    """

    def __init__(self, db, collections, *args, **kwargs):
        """
        :param db:          database name
        :param collections: collection name
        """
        super(MongoClient, self).__init__(*args, **kwargs)
        client = pymongo.MongoClient('mongodb://admin:[email protected]:27017')
        self.db = client[db]
        self.collections = self.db[collections]

    def insert_db(self, item):
        """Insert a list of documents.

        FIX: pymongo's insert_many raises InvalidOperation when given an
        empty list (the parser can legitimately produce one), so skip the
        call for empty/None input.
        """
        if item:
            self.collections.insert_many(item)

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章