Scraping Job Postings and Analyzing the Data

1. Results Up Front

A crawler scrapes postings for the "數據分析師" (data analyst) role from 51job (the code below targets search.51job.com), and the results are analyzed and visualized with Flask and ECharts.

2. The Crawler

Main function (main): reads the search keyword, walks the result pages, and hands each page to the parser and the database writer.

import re
import sqlite3
import urllib.error
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup


def main():
    kw = input("Enter the job keyword to search for: ").strip()
    keyword = urllib.parse.quote(urllib.parse.quote(kw))   # double URL-encode; the site decodes the parameter twice
    # ka = input("Enter the region to search: ").strip()
    # karea = getArea(ka)

    for i in range(1, 165):   # the nationwide result list had 164 pages at crawl time
        print('Crawling page {}'.format(i))
        # "000000" is the nationwide area code; the page number sits before ".html"
        baseurl = "https://search.51job.com/list/000000,000000,0000,00,9,99," + keyword + ",2," + str(i) + ".html"
        html = askURL(baseurl)
        bs = BeautifulSoup(html, "html.parser")

        datalist = getData(bs)
        dbpath = "./51job.db"
        saveDB(datalist, dbpath)
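
The keyword is quoted twice because the site appears to URL-decode the parameter twice; a single pass would leave the search garbled. A quick illustration of what the two passes produce:

import urllib.parse

kw = '數據分析師'
once = urllib.parse.quote(kw)
twice = urllib.parse.quote(once)
print(once)    # %E6%95%B8%E6%93%9A%E5%88%86%E6%9E%90%E5%B8%AB
print(twice)   # %25E6%2595%25B8%25E6%2593%259A%25E5%2588%2586%25E6%259E%2590%25E5%25B8%25AB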

Page fetching (askURL): requests a page with urllib and returns the HTML; callers parse it with BeautifulSoup.

def askURL(url):
    # a browser User-Agent keeps the request from being rejected outright
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    }

    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        # 51job serves GBK-encoded pages; ignore any undecodable bytes
        html = response.read().decode('gbk', 'ignore')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)

    return html

Data extraction (getData): pulls each posting's job link, job title, location, company name, company link, and salary from a list page; for 51job-hosted postings it calls getCOM for the company details and getREC for the job details, merges everything into one record, and returns the list to main(). The regex patterns it relies on are sketched below.
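
The module-level patterns findLink, findTitle, findArea, findCom, findComLink, and findSalary are used below but never shown in the post. A plausible reconstruction, assuming the t1-t5 column markup 51job's list pages used at the time (the exact attributes are an assumption; verify against the live HTML):

import re

# hypothetical patterns inferred from the field names and the old t1-t5 row layout
findLink = re.compile(r'<p class="t1.*?href="(.*?)"', re.S)       # job detail URL
findTitle = re.compile(r'<p class="t1.*?title="(.*?)"', re.S)     # job title
findCom = re.compile(r'<span class="t2"><a.*?title="(.*?)"')      # company name
findComLink = re.compile(r'<span class="t2"><a.*?href="(.*?)"')   # company page URL
findArea = re.compile(r'<span class="t3">(.*?)</span>')           # location
findSalary = re.compile(r'<span class="t4">(.*?)</span>')         # salary text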

def getData(bs):
    datalist = []
    for item in bs.select(".dw_table > div.el"):   # one div.el per posting row
        data = {}
        item = str(item)

        link = re.findall(findLink, item)
        data['link'] = ''.join(link)

        title = re.findall(findTitle, item)
        data['title'] = ''.join(title)

        area = re.findall(findArea, item)
        data['area'] = ''.join(area)

        com = re.findall(findCom, item)
        data['com'] = ''.join(com)

        comlink = re.findall(findComLink, item)
        data['comlink'] = ''.join(comlink)

        salary = re.findall(findSalary, item)
        data['salary'] = ''.join(salary)

        # only follow 51job-hosted detail pages; skip ads and external postings
        strhtml = 'https://jobs.51job.com/'
        if data["link"].startswith(strhtml) and data['comlink'].startswith(strhtml):
            data.update(getCOM(data['comlink']))   # company type, scale, industry
            data.update(getREC(data['link']))      # experience, education, description, ...
            datalist.append(data)

    return datalist

Company scraping (getCOM): follows the company link and extracts the company type, size, and industry; a worked example of the info-line parse follows the function.

def getCOM(com_link):
    com_html = askURL(com_link)
    bs = BeautifulSoup(com_html, "html.parser")
    # company-type and company-size labels exactly as they appear on the site,
    # so they must stay in Chinese
    CP_TYPE = ['民營公司', '上市公司', '事業單位', '國企', '外資(歐美)', '外資(非歐美)',
               '創業公司', '政府機關', '合資', '外資', '外企代表處', '非營利組織']
    CP_SCALE = ['少於50人', '50-150人', '150-500人', '500-1000人',
                '1000-5000人', '5000-10000人', '10000人以上']

    # the info line reads "type | size | industry", separated by non-breaking spaces
    cp_info = bs.select('.in > p.ltype')[0].text.split('\xa0\xa0|\xa0\xa0')
    com_data = {}
    com_data['cp_type'] = com_data['cp_scale'] = com_data['industry'] = ''
    for i in CP_TYPE:
        if i in cp_info:
            com_data['cp_type'] = i
            break
    for i in CP_SCALE:
        if i in cp_info:
            com_data['cp_scale'] = i
            break
    for i in cp_info:   # whatever is neither type nor size is taken as the industry
        if i not in CP_TYPE and i not in CP_SCALE:
            com_data['industry'] = i

    return com_data
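
For concreteness, here is how a hypothetical company info line (the string is illustrative, not from a real page) moves through the three loops:

# hypothetical p.ltype text on a company page
cp_info = '民營公司\xa0\xa0|\xa0\xa0150-500人\xa0\xa0|\xa0\xa0互聯網'.split('\xa0\xa0|\xa0\xa0')
print(cp_info)   # ['民營公司', '150-500人', '互聯網']
# loop 1 matches '民營公司' as cp_type, loop 2 matches '150-500人' as cp_scale,
# and the leftover '互聯網' falls through to industry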

Job-page scraping (getREC): follows the job link and extracts the experience requirement, education, headcount, publish date, and job description; an illustrative example follows the function.

def getREC(rec_link):
    jobHtml = askURL(rec_link)   # fetch the job detail page
    bs = BeautifulSoup(jobHtml, "html.parser")

    # experience, education, headcount, publish date
    job = {}
    job['exp'] = job['edu'] = job['other'] = job['demand'] = job['pubdate'] = " "
    text = bs.select(".ltype")
    if len(text) != 0:
        info = text[0].text.split('\xa0\xa0|\xa0\xa0')
        # education labels exactly as they appear on the site
        EDU = ['博士', '碩士', '本科', '大專',
               '中專', '中技', '高中', '初中及以下']

        for i in info:
            if '經驗' in i:       # e.g. '3-4年經驗'
                job['exp'] = i
            elif i in EDU:
                job['edu'] = i
            elif '招' in i:       # e.g. '招2人'
                job['demand'] = i
            elif '發佈' in i:     # e.g. '06-01發佈'
                job['pubdate'] = i
            else:
                job['other'] = i

    # job description: concatenate every paragraph under .job_msg
    jobMsgList = bs.select(".job_msg > p")
    job['msg'] = ''.join(p.text for p in jobMsgList) or " "

    return job
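
Analogously to the company page, a hypothetical job-page info line (illustrative only) and the buckets the loop sorts it into:

# hypothetical .ltype text on a job detail page:
#   '上海-浦東新區\xa0\xa0|\xa0\xa03-4年經驗\xa0\xa0|\xa0\xa0本科\xa0\xa0|\xa0\xa0招2人\xa0\xa0|\xa0\xa006-01發佈'
# after the split-and-classify loop above:
#   exp='3-4年經驗', edu='本科', demand='招2人', pubdate='06-01發佈', other='上海-浦東新區'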

Data storage (saveDB): writes each record into SQLite. The init_db helper it calls is not shown in the post; a sketch follows the function.

def saveDB(datalist, dbpath):
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()

    # "insert or ignore" skips rows whose unique key already exists,
    # so re-running the crawler does not duplicate postings
    sql = '''
            insert or ignore into job_quanguo(
            link,title,comlink,com,area,salary,cp_type,cp_scale,industry,exp,edu,other,demand,pubdate,msg
             )
             values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)'''
    for data in datalist:
        cur.execute(sql, (data['link'], data['title'], data['comlink'], data['com'], data['area'],
                          data['salary'], data['cp_type'], data['cp_scale'], data['industry'],
                          data['exp'], data['edu'], data['other'], data['demand'],
                          data['pubdate'], data['msg']))
    conn.commit()   # one commit for the whole batch
    cur.close()
    conn.close()
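
init_db is called above but never defined in the post. A minimal sketch, assuming link acts as the primary key, which is what would make insert or ignore deduplicate re-crawled postings:

def init_db(dbpath):
    # hypothetical schema inferred from the column list in saveDB
    sql = '''
        create table if not exists job_quanguo(
            link text primary key,
            title text, comlink text, com text, area text, salary text,
            cp_type text, cp_scale text, industry text,
            exp text, edu text, other text, demand text, pubdate text, msg text)'''
    conn = sqlite3.connect(dbpath)
    conn.execute(sql)
    conn.commit()
    conn.close()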

Crawl results:
A total of 7,277 records were scraped, covering company name, links, job title, location, salary, company type, company size, industry, hiring requirements, job description, and so on.

3. Data Processing

Salary handling: salaries are posted as a range, so the field is split into three columns: the lower bound, the upper bound, and their average. Records that show no salary are dropped. (The append_other helper used below is not shown in the post; a sketch follows the code.)

import numpy as np
import pandas as pd


def getSalary():
    datalist = []
    con = sqlite3.connect("51job.db")
    cur = con.cursor()
    sql = "SELECT com,title,area,cp_type,cp_scale,industry,exp,edu,salary FROM job_quanguo"
    data_quanguo = cur.execute(sql)
    for item in data_quanguo:
        string = item[8]   # salary text, e.g. '1.5-2萬/月'
        # normalize the three range formats to yuan per month;
        # rows without a parsable range are skipped
        if string.endswith('千/月'):
            sal = pd.to_numeric(string.replace("千/月", "").split("-")) * 1000
        elif string.endswith('萬/月'):
            sal = pd.to_numeric(string.replace("萬/月", "").split("-")) * 10000
        elif string.endswith('萬/年'):
            sal = pd.to_numeric(string.replace("萬/年", "").split("-")) * 10000 / 12
        else:
            continue
        data = dict(append_other(item).items(), **append_salary(sal))
        datalist.append(data)
    cur.close()
    con.close()

    dbpath = "./51job.db"
    # note: these records carry low/high/avg-salary keys rather than the full
    # job_quanguo column set, so they need their own table and a matching saver
    saveDB(datalist, dbpath)


def append_salary(sal):
    # sal is a two-element numeric array: [lower bound, upper bound]
    data1 = {}
    data1['low-salary'] = sal[0].astype(np.int64)
    data1['high-salary'] = sal[1].astype(np.int64)
    data1['avg-salary'] = (data1['low-salary'] + data1['high-salary']) / 2

    return data1
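
append_other is referenced in getSalary but never shown. A minimal sketch, assuming it simply re-keys the columns of the SELECT (the key names are my assumption):

def append_other(item):
    # hypothetical: map the queried columns back to named keys
    keys = ['com', 'title', 'area', 'cp_type', 'cp_scale', 'industry', 'exp', 'edu']
    return dict(zip(keys, item[:8]))

With these helpers, a row whose salary reads '1.5-2萬/月' ends up with low-salary 15000, high-salary 20000, and avg-salary 17500.0.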

4. Flask and ECharts

This part mainly follows the official ECharts documentation, so I won't go through it point by point.
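
For completeness, a minimal sketch of the wiring, assuming an index.html template that loads ECharts and fetches JSON from /salary; every name here (the route, the job_salary table, the template) is illustrative rather than taken from the original project:

import sqlite3

from flask import Flask, jsonify, render_template

app = Flask(__name__)


@app.route('/')
def index():
    return render_template('index.html')   # hypothetical page that initializes the ECharts charts


@app.route('/salary')
def salary():
    # hypothetical endpoint: average salary per city, for an ECharts bar chart
    con = sqlite3.connect('51job.db')
    rows = con.execute(
        'select area, avg("avg-salary") from job_salary group by area').fetchall()
    con.close()
    return jsonify({'areas': [r[0] for r in rows], 'salaries': [r[1] for r in rows]})


if __name__ == '__main__':
    app.run(debug=True)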
