Python爬蟲獲取拉勾網招聘信息

之前寫過一份爬取拉勾網搜索“數據分析”相關職位的文章《拉勾網職位信息爬蟲練習》。最近入職了一家以設計爲主的公司,所以想做一份關於“設計”的數據分析報告,但發現直接跑原來的代碼已經爬不到數據了,於是稍微修改了一下。本篇主要記錄爬蟲代碼。

首先要明確,拉勾網的招聘信息是通過 POST 請求返回的,所以必須填寫 from_data 信息。我們這裏填的是 from_data = {‘first’:‘true’, ‘pn’:‘1’, ‘kd’:‘設計’},其中 pn 代表當前頁碼,kd 就是我們搜索的職位關鍵詞。第二,記得要用 Session 獲取動態 cookies,否則爬下來的數據空空如也,還容易被封 IP、封號。準備工作就是在瀏覽器開發者工具裏找到存數據的 json 請求。

#導入使用的庫
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import time
from datetime import datetime

# Fetch the job-requirement text from a job's detail page.
def getjobneeds(positionId):
    """Return the job-requirement text scraped from a Lagou detail page.

    Parameters
    ----------
    positionId : the numeric job id, interpolated into the detail-page URL.

    Returns
    -------
    str : all paragraphs matched by the '.job_bt div' selector, stripped
    and joined with single spaces (empty string if nothing matches).
    """
    url = 'https://www.lagou.com/jobs/{}.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_%E8%AE%BE%E8%AE%A1/p-city_0?px=default',
        'Upgrade-Insecure-Requests': '1',
    }

    # Use the session as a context manager so its connection pool is closed
    # even if a request raises (the original leaked the session).  The first
    # GET primes the session with Lagou's anti-crawler cookies; the session
    # then sends them automatically on the second request, so there is no
    # need to pass `cookies=` explicitly.
    with requests.Session() as s:
        s.get(url.format(positionId), headers=headers, timeout=3)  # prime cookies
        response = s.get(url.format(positionId), headers=headers, timeout=3)
    time.sleep(5)  # throttle to avoid an IP ban

    soup = BeautifulSoup(response.text, 'html.parser')
    # Join every paragraph of the requirement section into one string.
    need = ' '.join(p.text.strip() for p in soup.select('.job_bt div'))
    return need


# Extract the interesting fields of one job posting.
def getjobdetails(jd):
    """Flatten one job dict from the Lagou list API into a result row.

    Parameters
    ----------
    jd : dict decoded from the positionAjax.json response; must contain
    every key listed in `fields` plus 'positionId'.

    Returns
    -------
    dict : the copied fields plus a 'need' key holding the requirement
    text fetched from the job's detail page.
    """
    # Fields copied verbatim from the API payload, in output-column order.
    # NOTE: 'skillLables' / 'positionLables' are misspelled in the API itself.
    fields = (
        'businessZones', 'companyFullName', 'companyLabelList',
        'financeStage', 'skillLables', 'companySize', 'latitude',
        'longitude', 'city', 'district', 'salary', 'secondType',
        'workYear', 'education', 'firstType', 'thirdType',
        'positionName', 'positionLables', 'positionAdvantage',
    )
    results = {key: jd[key] for key in fields}
    # The detail page (and hence one extra HTTP round-trip) per job.
    results['need'] = getjobneeds(jd['positionId'])
    time.sleep(2)  # pause to control the request rate
    print(jd, 'get')
    return results

# Crawl the search-result pages and collect every job on them.
def parseListLinks(url_start, url_parse):
    """Crawl 30 pages of the Lagou search API and return all job rows.

    Parameters
    ----------
    url_start : the HTML search page, fetched first only to obtain the
    anti-crawler session cookies.
    url_parse : the positionAjax.json endpoint that actually returns data.

    Returns
    -------
    list[dict] : one flattened row per job, as built by getjobdetails.
    """
    jobs = []
    from_data = {'first': 'true',
                 'pn': '1',
                 'kd': '設計'}

    headers = {
        'Host': 'www.lagou.com',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.lagou.com/jobs/list_%E8%AE%BE%E8%AE%A1/p-city_0?px=default',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
        'X-Anit-Forge-Code': '0',
        'X-Anit-Forge-Token': 'None',
        'X-Requested-With': 'XMLHttpRequest',
    }

    res = []
    for n in range(30):
        from_data['pn'] = n + 1
        # Fresh session per page: GET the search page so the session picks
        # up the dynamic cookies, then POST the JSON API with them.  The
        # context manager closes the session (the original leaked it).
        with requests.Session() as s:
            s.get(url_start, headers=headers, timeout=3)
            response = s.post(url_parse, data=from_data, headers=headers,
                              cookies=s.cookies, timeout=3)
        time.sleep(5)  # throttle between pages
        res.append(response)

    for response in res:
        page_results = json.loads(response.text)['content']['positionResult']['result']
        # Iterate over what the page actually returned instead of assuming a
        # fixed 15 items per page — the last page may be short, and the
        # original `range(15)` raised IndexError there.
        for jd in page_results:
            jobs.append(getjobdetails(jd))
    time.sleep(30)
    return jobs

def main():
    """Entry point: crawl the '設計' listings and save them to Excel."""
    url_start = "https://www.lagou.com/jobs/list_設計?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput="
    url_parse = "https://www.lagou.com/jobs/positionAjax.json?city=&needAddtionalResult=false"
    all_jobs = parseListLinks(url_start, url_parse)
    # Name the output file with a timestamp so repeated runs don't collide.
    stamp = datetime.now().strftime('%m%d_%H%M%S')
    out_name = 'lagou_sj' + stamp + '.xlsx'
    pd.DataFrame(all_jobs).to_excel(out_name)
    print('文件已保存')
    
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

拉勾網每頁有15條數據,默認顯示30頁,一共450條數據。我這裏直接把頁數寫死了,大家可以根據需要修改爬取頁數,也可以選擇不獲取“崗位要求”或其他不需要的信息。最終結果保存爲 Excel 文件。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章