爬蟲實現51job誰看過我的簡歷多條記錄功能

默認情況下51job只能看到最近一條記錄,查看更多記錄需要付費。

本文目的:用爬蟲進行定時循環抓取記錄,並追加寫入到文本。

import requests
from bs4 import BeautifulSoup

class www_51job_com(object):
    """Scrape 51job's "who viewed my resume" page and append new records to a file.

    51job only shows the most recent viewer for free; running this scraper on a
    schedule accumulates a history of records in ``www_51job_com.txt``.
    """

    def __init__(self):
        # Personal-center page listing companies that viewed the resume.
        self.url = "https://i.51job.com/userset/resume_browsed.php?lang=c"

        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
        }

        # Raw Cookie header copied from a logged-in browser session
        # (placeholder text; must be replaced by the user).
        self.cookies = "替換你的cookie"

    def get_url(self, url):
        """Fetch *url* with the stored cookie string and return the page HTML.

        The raw Cookie header is parsed into a dict for requests. The body is
        decoded as GBK, the encoding 51job serves this page in.
        """
        cookies_dict = {}
        for pair in self.cookies.split("; "):
            # partition() rather than split("=") so cookie values that
            # themselves contain '=' (e.g. base64 tokens) are kept intact;
            # the original split("=")[1] truncated such values.
            name, _, value = pair.partition("=")
            cookies_dict[name] = value

        response = requests.get(url=url, headers=self.headers, cookies=cookies_dict)
        return response.content.decode('gbk')

    def soup(self, data):
        """Extract the most recent viewer record from the page HTML.

        Returns "<company> <view time>". Raises IndexError if the page layout
        changed and the CSS selectors no longer match.
        """
        soup_data = BeautifulSoup(data, 'lxml')
        company = soup_data.select("body > div.content > div.exrt > div.lmsg > div.e > div.txt > div.li.l1 > p > a")[0].get_text()
        care_time = soup_data.select("body > div.content > div.exrt > div.lmsg > div.e > div.txt > div.li.l3 > div.f12 > span")[0].get_text()
        return company + ' ' + care_time

    def save_file(self, company_caretime):
        """Append the record to www_51job_com.txt unless it is already the last line.

        Skipping a record identical to the last stored line avoids duplicates
        when the scraper polls faster than new views arrive.
        """
        with open('www_51job_com.txt', 'a+', encoding='utf-8') as f:
            f.seek(0)
            lines = f.readlines()
            # Explicit empty-file check replaces the original try/except
            # IndexError; behavior is identical.
            if not lines or lines[-1] != company_caretime + '\n':
                f.write(company_caretime + '\n')

    def run(self):
        """One polling cycle: fetch the page, parse the newest record, persist it."""
        response = self.get_url(self.url)
        result = self.soup(response)
        self.save_file(result)

if __name__ == '__main__':
    # Script entry point: run a single scrape-and-append cycle.
    crawler = www_51job_com()
    crawler.run()

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章