Crawling job listings from the Guangxi Talent Network (gxrc.com) with requests, multithreaded, and storing them in MongoDB

requests_thread_gxrc_com.py
#!/usr/bin/env python3
# coding=utf-8
# Version:python3.6.1
# File:gxrc_com.py
# Author:LGSP_Harold
import requests
# queue.Queue is the thread-safe FIFO queue meant for threading workloads
from queue import Queue
from lxml import etree
import threading

from handle_mongo import MongoClient


# Thread class that fetches listing pages
class CrawlPage(threading.Thread):
    # Override the parent's __init__ because the subclass CrawlPage adds new
    # parameters; thread_name, page_queue and data_queue are used by the
    # subclass and are not passed on to the parent
    def __init__(self, thread_name, page_queue, data_queue, *args, **kwargs):
        # super() finds the parent class (threading.Thread) and calls its __init__()
        super(CrawlPage, self).__init__(*args, **kwargs)
        # Name of this thread
        self.thread_name = thread_name
        # Queue of page numbers
        self.page_queue = page_queue
        # Queue of fetched HTML
        self.data_queue = data_queue
        # Default request headers
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'DNT': '1',
            'Host': 's.gxrc.com',
            'Pragma': 'no-cache',
            'Referer': 'https://www.gxrc.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        }

    def run(self) -> None:
        print('Starting page-handling thread: %s' % self.thread_name)
        while not page_flag:
            # put/get on a Queue take a block argument
            # block defaults to True; here it is set to False,
            # so an empty (or full) queue raises an exception (Empty/Full)
            try:
                # get() pulls a page number off the queue.
                # With the default block=True the call would wait for an item
                # (optionally up to a timeout); with block=False an empty
                # queue raises immediately, and the exception is caught below
                page = self.page_queue.get(block=False)
                page_url = 'https://s.gxrc.com/sJob?schType=1&page=' + str(page)
                print('Constructed URL: %s' % page_url)
                # Optional proxy configuration
                # proxy = {
                #     'http': 'http://xxxxxx:[email protected]:9999',
                #     'https': 'http://xxxxxx:[email protected]:9999'
                # }

                # Request the constructed URL with requests
                # res = requests.get(url=page_url, headers=self.headers, proxies=proxy)
                res = requests.get(url=page_url, headers=self.headers)
                # Set the response encoding
                res.encoding = 'utf-8'
                # Push the fetched HTML onto the data queue
                self.data_queue.put(res.text)
            except Exception:
                # Queue was empty (or the request failed); loop and retry
                pass


# Thread class that parses the fetched HTML
class CrawlHtml(threading.Thread):
    # HTML fetched by the page threads is read from data_queue
    def __init__(self, thread_name, data_queue, lock, db, collections, *args, **kwargs):
        super(CrawlHtml, self).__init__(*args, **kwargs)
        self.thread_name = thread_name
        self.data_queue = data_queue
        self.lock = lock
        self.db = db
        self.collections = collections

    # Parse one page of HTML
    def parse(self, text):
        # Build the DOM tree from the raw HTML
        html = etree.HTML(text)
        items = html.xpath('//div[@class="rlOne"]/ul[@class="posDetailUL clearfix"]')
        data_list = []
        for item in items:
            data = {}
            data['job_name'] = item.xpath('.//a[@class="posName"]/text()')[0]
            data['company_name'] = item.xpath('.//a[@class="entName"]/text()')[0]
            try:
                data['company_address'] = item.xpath('.//li[@class="w4"]/text()')[0]
            except IndexError:
                # Address field missing from this listing
                data['company_address'] = '未知'
            try:
                data['money'] = item.xpath('.//li[@class="w3"]/text()')[0]
            except IndexError:
                # No salary listed
                data['money'] = '面议'
            data['date'] = item.xpath('.//li[@class="w5"]/text()')[0]
            data_list.append(data)
        return data_list

    def run(self) -> None:
        print('Starting data-handling thread: %s' % self.thread_name)
        while not data_flag:
            try:
                # Pull raw HTML off the data queue
                text = self.data_queue.get(block=False)
                # Parse it into a list of job dicts
                result = self.parse(text)
                # print(result)
                # Serialize the database writes with a lock
                with self.lock:
                    insert_data = MongoClient(self.db, self.collections)
                    insert_data.insert_db(result)
            except Exception:
                # Queue was empty (or parsing/insert failed); loop and retry
                pass


# Two global flags that signal the worker threads to exit
page_flag = False
data_flag = False


def main():
    # Two queues: one for page numbers, one for fetched HTML
    page_queue = Queue()
    data_queue = Queue()

    # A lock to serialize the database writes
    lock = threading.Lock()

    # Fill the page queue with page numbers
    for page in range(1, 504):
        # put() pushes a page number onto page_queue
        page_queue.put(page)

    # qsize() returns the current length of the queue
    print('Total pages queued: %s' % page_queue.qsize())

    # Start three page-fetching threads, identified by name
    crawl_page_list = ['PageThread-1', 'PageThread-2', 'PageThread-3']
    page_thread_list = []
    for thread_name_page in crawl_page_list:
        thread_page = CrawlPage(thread_name_page, page_queue, data_queue)
        # Start the thread
        thread_page.start()
        page_thread_list.append(thread_page)

    # Three threads to parse the HTML
    parse_list = ['ParseThread-1', 'ParseThread-2', 'ParseThread-3']
    parse_thread_list = []

    db = 'db_gxrc'
    collections = 'collections_gxrc'
    for thread_name_parse in parse_list:
        thread_parse = CrawlHtml(thread_name_parse, data_queue, lock, db, collections)
        thread_parse.start()
        parse_thread_list.append(thread_parse)

    # Thread shutdown logic
    # Page threads
    global page_flag
    # The while condition fails once page_queue is empty
    while not page_queue.empty():
        pass
    page_flag = True

    # Wait for the page threads to finish
    for thread_page_join in page_thread_list:
        thread_page_join.join()
        print(thread_page_join.thread_name, 'finished')

    # Data threads
    global data_flag
    while not data_queue.empty():
        pass
    data_flag = True

    for thread_data_join in parse_thread_list:
        thread_data_join.join()
        print(thread_data_join.thread_name, 'finished')


if __name__ == '__main__':
    # Entry point
    main()
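
The shutdown scheme above (spinning on empty() plus global flags) works, but the bare while loops burn CPU while waiting. A common alternative is to push one sentinel value per worker onto the queue and let each thread exit cleanly when it sees one. A minimal sketch of that pattern follows; SENTINEL, NUM_THREADS, page_worker and run_page_threads are illustrative names, not part of the original script, and the fetch step is stubbed out:

import queue
import threading

SENTINEL = None   # illustrative marker that tells a worker to exit
NUM_THREADS = 3

def page_worker(page_queue, data_queue):
    while True:
        page = page_queue.get()
        if page is SENTINEL:
            break  # exit marker seen; stop without flags or busy-waiting
        # ... fetch the page here, as CrawlPage.run() does ...
        data_queue.put('html for page %d' % page)

def run_page_threads():
    page_queue = queue.Queue()
    data_queue = queue.Queue()
    for page in range(1, 504):
        page_queue.put(page)
    # One sentinel per thread, so every worker sees exactly one exit marker
    for _ in range(NUM_THREADS):
        page_queue.put(SENTINEL)
    threads = [threading.Thread(target=page_worker, args=(page_queue, data_queue))
               for _ in range(NUM_THREADS)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('pages fetched:', data_queue.qsize())

if __name__ == '__main__':
    run_page_threads()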

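The XPath expressions in parse() can also be exercised on a small standalone fragment. The markup below is made up to mirror the class names the script targets (rlOne, posDetailUL, posName, entName, w3/w4/w5); it is not copied from gxrc.com:

from lxml import etree

# Made-up fragment shaped like the listing markup parse() expects
fragment = '''
<div class="rlOne">
  <ul class="posDetailUL clearfix">
    <li><a class="posName">Python工程师</a></li>
    <li><a class="entName">某某科技有限公司</a></li>
    <li class="w3">8000-12000</li>
    <li class="w4">南宁</li>
    <li class="w5">2019-01-01</li>
  </ul>
</div>
'''

html = etree.HTML(fragment)
for item in html.xpath('//div[@class="rlOne"]/ul[@class="posDetailUL clearfix"]'):
    print(item.xpath('.//a[@class="posName"]/text()')[0])   # job title
    print(item.xpath('.//li[@class="w3"]/text()')[0])       # salary
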
handle_mongo.py

#!/usr/bin/env python3
# coding=utf-8
# Version:python3.6.1
# File:handle_mongo.py
# Author:LGSP_Harold
import pymongo


class MongoClient:
    def __init__(self, db, collections, *args, **kwargs):
        super(MongoClient, self).__init__(*args, **kwargs)
        # Connect to MongoDB and select the database and collection
        client = pymongo.MongoClient('mongodb://admin:[email protected]:27017')
        self.db = client[db]
        self.collections = self.db[collections]

    def insert_db(self, item):
        # item is a list of dicts, so insert_many is used
        self.collections.insert_many(item)
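
A quick way to confirm the inserts landed is to query the collection directly. A minimal check, assuming the same connection string as above and the db/collection names used in main() (db_gxrc, collections_gxrc):

import pymongo

client = pymongo.MongoClient('mongodb://admin:[email protected]:27017')
collection = client['db_gxrc']['collections_gxrc']
# count_documents() and find_one() are standard pymongo collection methods
print('documents stored:', collection.count_documents({}))
print('sample document:', collection.find_one())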

 
