【Python行業分析4】BOSS直聘招聘信息獲取之爬蟲程序數據處理

今天我們要正式使用程序來把爬取到的頁面篩選出有效數據並保存到文件中,我會從最基礎的一步一步去完善程序,幫助大家來理解爬蟲程序,其中還是有許多問題我沒能解決,也希望有大佬可以留言幫助一下
由於cookies調試比較麻煩,我是先保存了一個靜態頁面來調試數據提取的,調通之後纔加上實際的爬取過程。

數據提取

import csv

import requests
from bs4 import BeautifulSoup as bs

from tp.boss.get_cookies import get_cookie_from_chrome
# Base URL of the BOSS Zhipin site; relative page paths from the crawl are appended to it.
HOST = "https://www.zhipin.com/"


def test5(query_url, job_list):
    """
    Crawl the job-listing pages starting at HOST + query_url and append one
    parsed row per job posting to job_list, following the "next page" link
    until there is none.

    :param query_url: path (relative to HOST) of the first listing page
    :param job_list: list that parsed rows are appended to (mutated in place)
    :return: None
    """
    # Desktop browser User-Agent so the site serves the normal HTML page
    user_agent = r'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'
    # BUG FIX: the header key was misspelled 'User-Agnet', so the UA string
    # above was never actually sent with the requests.
    headers = {'User-Agent': user_agent, 'Connection': 'keep-alive'}
    cookie_dict = get_cookie_from_chrome('.zhipin.com')
    # Convert the plain dict into a CookieJar that requests can use
    cookies = requests.utils.cookiejar_from_dict(cookie_dict, cookiejar=None, overwrite=True)
    session = requests.Session()
    session.cookies = cookies
    session.headers = headers
    # Paginate with a loop instead of the original per-page recursion: the
    # session/cookies are set up once and reused for every request, and deep
    # result sets can no longer hit the recursion limit.
    while query_url:
        response = session.get(HOST + query_url)
        soup = bs(response.content.decode("utf-8"), "html.parser")
        # One "job-primary" element per job posting on the page
        for item in soup.find_all(class_="job-primary"):
            job_title = item.find("div", attrs={"class": "job-title"})
            job_name = job_title.a.attrs["title"]
            job_href = job_title.a.attrs["href"]
            data_jid = job_title.a['data-jid']
            data_lid = job_title.a["data-lid"]
            job_area = job_title.find(class_="job-area").text

            job_limit = item.find(class_="job-limit")
            salary = job_limit.span.text
            # <p>experience<em class="vline"/>degree</p>:
            # contents[0] is the experience text, contents[2] the degree text
            exp = job_limit.p.contents[0]
            degree = job_limit.p.contents[2]

            company = item.find(class_="info-company")
            company_name = company.h3.a.text
            company_type = company.p.a.text

            # Same <p> layout as above: funding stage and company size
            # separated by <em class="vline"> dividers
            stage = company.p.contents[2]
            scale = company.p.contents[4]
            info_desc = item.find(class_="info-desc").text
            tags = [t.text for t in item.find_all(class_="tag-item")]

            job_list.append([job_area, company_type, company_name, data_jid, data_lid, job_name, stage, scale, job_href,
                             salary, exp, degree, info_desc, "、".join(tags)])

        # Follow pagination: keep going only while a "next" link with an
        # href exists (use .get so a link without href ends the loop instead
        # of raising KeyError).
        query_url = None
        page = soup.find(class_="page")
        if page:
            next_page = page.find(class_="next")
            if next_page:
                query_url = next_page.attrs.get("href")

HTML元素獲取

獲取 標籤爲div class是 job-title的標籤

job_title = item.find("div", attrs={"class": "job-title"})

獲取 class是 job-area 的標籤

job_area = job_title.find(class_="job-area")

獲取 class是 job-primary 的標籤列表

job_primary_list = content.find_all(class_="job-primary")

獲取 a 標籤中是 title 屬性的數據

job_name = job_title.a.attrs["title"]

contents獲取直接子節點,返回的是一個列表

# 示例html  <p>不限<em class="vline"></em>本科</p>
# 結果   ["不限", "<em class="vline"></em>", "本科"]
exp = job_limit.p.contents[0]

保存csv文件

def test5_main(query_url, file_name):
    """
    Run the crawl starting at query_url and save the collected rows to a CSV
    file named file_name (header row first, UTF-8 encoded).

    :param query_url: path (relative to HOST) of the first listing page
    :param file_name: output CSV file path
    :return: None
    """
    job_list = []
    test5(query_url, job_list)
    if job_list:
        print("爬到%d條數據" % len(job_list))
        # newline='' stops the csv module writing blank lines on Windows.
        # NOTE(review): plain 'utf8' (no BOM) may display garbled text when
        # the file is opened directly in Excel; 'utf-8-sig' would fix that,
        # but the current encoding is kept to preserve the output format.
        with open(file_name, "w", newline='', encoding='utf8') as f:
            birth_header = ["城市", "公司行業", "公司", "jid", "lid", "職位名稱", "融資階段", "公司規模", "詳情頁", "薪資",
                            "工作經驗", "學歷要求", "福利", "技能要求", ]
            writer = csv.writer(f)
            # writerow for the single header line (was writerows([header]))
            writer.writerow(birth_header)
            writer.writerows(job_list)
        # the with-block closes the file; the explicit f.close() was redundant
    else:
        print("沒有爬取到數據")


if __name__ == "__main__":
    # Shanghai listings (city=101020100) for position 100109, with the
    # free-text query and industry filters left empty.
    search_path = "/job_detail/?query=&city=101020100&industry=&position=100109"
    test5_main(search_path, "boss_20200611.csv")

結果文件
在這裏插入圖片描述

今天就到這吧,開始寫公衆號了,老鐵們求關注

微信搜一搜關注公衆號,領取更多學習資料

每週一本經典書籍免費送!在這裏插入圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章