使用selenium爬取51Job職位信息入庫mongoDB

selenium_51job_com.py

#!/usr/bin/env python3
# coding=utf-8
# Version:python3.6.1
# File:51job_com.py
# Author:LGSP_Harold
import pymongo
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
import time


class HandleWebdriver:
    """Scrape 51job search results with headless Firefox and store them in MongoDB.

    Typical use is ``HandleWebdriver().handle_job()``: the search page is
    opened, a keyword is read from stdin, and every result page is parsed
    and inserted into the ``collections_51job`` collection of the
    ``db_51job_com`` database.
    """

    def __init__(self):
        """Start a headless Firefox session and keep it on ``self.browser``."""
        # Enable headless mode.
        options = Options()
        options.add_argument('--headless')

        # ``options=`` replaces the deprecated ``firefox_options=`` keyword,
        # which newer Selenium releases no longer accept.
        self.browser = webdriver.Firefox(options=options)
        # self.browser.maximize_window()

    def handle_job(self):
        """Run one interactive crawl: search for a keyword and walk all result pages.

        The keyword is read from stdin. Each result page is passed to
        ``handle_parse``. The browser is always shut down when this method
        exits, whether it finishes normally or an exception propagates.

        Raises:
            selenium.common.exceptions.TimeoutException: if the search box or
                the job list does not appear within 5 seconds.
        """
        # Imported here so the method carries its own dependency.
        from selenium.common.exceptions import WebDriverException

        try:
            # Open the target search page.
            self.browser.get(
                'https://search.51job.com/list/000000,000000,0000,00,9,99,+,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=')

            wait = WebDriverWait(self.browser, 5, 0.5)

            # Explicit wait for the search box. until() raises on timeout
            # instead of returning a falsy value, so no ``if`` is needed.
            wait.until(EC.presence_of_element_located((By.ID, 'keywordInput')))
            # Read the job keyword from the user.
            input_keyword = input('請輸入要查找的崗位：')
            # Look the box up again after the (possibly long) prompt, then
            # type the keyword and submit the search.
            self.browser.find_element(By.ID, 'keywordInput').send_keys(input_keyword)
            self.browser.find_element(By.ID, 'search_btn').click()

            wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'j_joblist')))
            while True:
                # Give the result list time to render before reading the page.
                time.sleep(2)
                self.handle_parse(self.browser.page_source)

                # find_elements returns an empty list when there is no
                # "next" link, so the last page ends the loop without
                # relying on an exception.
                next_links = self.browser.find_elements(By.XPATH, '//li[@class="next"]/a')
                if not next_links:
                    break
                try:
                    next_links[0].click()
                except WebDriverException:
                    # Best effort: a link that cannot be clicked ends
                    # pagination, as in the original flow.
                    break
        finally:
            # Always release the Firefox process, even on errors.
            self.browser.quit()

    @staticmethod
    def _first_text(nodes, default):
        """Return the first entry of an xpath result list, or *default* if it is empty."""
        return nodes[0] if nodes else default

    @staticmethod
    def _join_tags(tags):
        """Join tag strings with ' | '; return '暫無' when there are no tags."""
        return ' | '.join(tags) if tags else '暫無'

    def _parse_item(self, item):
        """Build the record dict for one job entry element."""
        return {
            'job_name': self._first_text(
                item.xpath('.//a/p[@class="t"]/span[@class="jname at"]/text()'), ''),
            'time': self._first_text(
                item.xpath('.//a/p[@class="t"]/span[@class="time"]/text()'), ''),
            # Entries without a salary span are stored as '面議'.
            'money': self._first_text(
                item.xpath('.//a/p[@class="info"]/span[@class="sal"]/text()'), '面議'),
            'address': self._first_text(
                item.xpath('.//a/p[@class="info"]/span[@class="d at"]/text()'), ''),
            'tags': self._join_tags(
                item.xpath('.//a/p[@class="tags"]/span/i/text()')),
        }

    def handle_parse(self, page_source):
        """Extract job records from a result page and hand them to ``handle_mongodb``.

        Args:
            page_source: HTML text of a 51job search result page.

        A missing job name, date or address is stored as an empty string
        instead of raising ``IndexError``.
        """
        html_obj = etree.HTML(page_source)
        if html_obj is None:
            # Nothing parseable (e.g. empty source): nothing to store.
            return
        items = html_obj.xpath('//div[@class="j_joblist"]/div[@class="e"]')
        data_list = [self._parse_item(item) for item in items]
        # print(data_list)
        self.handle_mongodb(data_list)

    def handle_mongodb(self, data_list):
        """Insert the given records into MongoDB.

        Args:
            data_list: list of record dicts; an empty list is a no-op,
                because ``insert_many`` rejects an empty batch.
        """
        if not data_list:
            return
        # NOTE(review): the credential part of this URI looks like it was
        # replaced by an e-mail-obfuscation placeholder ("[email protected]");
        # restore the real user:password@host before running.
        client = pymongo.MongoClient('mongodb://admin:[email protected]:27017')
        try:
            db = client['db_51job_com']
            collections = db['collections_51job']
            collections.insert_many(data_list)
        finally:
            # One client is opened per call, so close it to avoid leaking connections.
            client.close()

def main():
    """Create the crawler and run one interactive scrape."""
    # Named ``crawler`` rather than ``selenium`` so the instance is not
    # confused with the selenium package.
    crawler = HandleWebdriver()
    crawler.handle_job()


# Run the scrape only when executed as a script, not as an import side effect.
if __name__ == '__main__':
    main()

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

使用selenium爬取51Job職位信息入庫mongoDB

2024年DataOps趨勢預測：AI不會取代數據工程師

雲原生週刊：K8s 中的服務和網絡｜ 2024.4.29

通過Http鏈接地址爬取有贊微信商城商品信息及下載至EXCEL

多人同時導出 Excel 幹崩服務器！新來的阿里大佬給出的解決方案太優雅了！

[轉帖]cpupower

今天，昨天，近七天，近30天，近90天，js封裝

華爲雲雲原生FinOps解決方案，釋放雲原生最大價值

jinja2 for循環中if語句不起作用

異步提交tinymce富文本

本地Flask項目添加SSL

Flask接入第三方（Facebook）登錄

設置overflow:hidden出現偏移現象

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結

使用selenium爬取51Job職位信息 入庫mongoDB

使用selenium爬取51Job職位信息入庫mongoDB