1. Results First
A crawler collects "data analyst" job postings from 51job (every URL in the code targets search.51job.com), and the analysis results are presented with Flask and ECharts.
2. The Crawler
Main function (main): drives the crawl. It builds the search URL, fetches and parses each result page, and stores the extracted data.
import re
import sqlite3
import urllib.error
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup

def main():
    kw = input("Enter the job keyword to search for: ").strip()
    keyword = urllib.parse.quote(urllib.parse.quote(kw))  # encode twice -- see the note below
    # ka = input("Enter the region to search: ").strip()
    # karea = getArea(ka)
    for i in range(1, 165):
        print('Crawling page {}'.format(i))
        # nationwide search ("000000" is the city code) for the keyword, page i
        baseurl = "https://search.51job.com/list/000000,000000,0000,00,9,99," + keyword + ",2," + str(i) + ".html"
        html = askURL(baseurl)
        bs = BeautifulSoup(html, "html.parser")
        datalist = getData(bs)
        dbpath = "./51job.db"
        saveDB(datalist, dbpath)
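Why encode twice? The original comment calls this "double encoding": 51job's search URLs evidently expect the keyword percent-encoded twice, and the second quote pass simply encodes the % signs produced by the first. A quick look at what each pass yields:

from urllib.parse import quote
kw = "数据分析师"
print(quote(kw))         # %E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88
print(quote(quote(kw)))  # %25E6%2595%25B0%25E6%258D%25AE... (the % signs themselves get encoded)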
Page fetching (askURL): wraps urllib.request to download a page and return its HTML; the callers parse it with BeautifulSoup.
def askURL(url):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode('gbk', 'ignore')  # 51job pages are GBK-encoded
        # print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
Data extraction (getData): pulls the posting link, job title, location, company name, company link, and salary out of each row of the result list; it then calls getCOM for the company page and getREC for the posting page, merges everything into one record, and returns the list to main().
def getData(bs):
    datalist = []
    # findLink, findTitle, findArea, findCom, findComLink and findSalary are
    # module-level compiled regexes; a hypothetical sketch of them follows below.
    for item in bs.select(".dw_table > div.el"):
        data = {}
        item = str(item)
        data['link'] = ''.join(re.findall(findLink, item))
        data['title'] = ''.join(re.findall(findTitle, item))
        data['area'] = ''.join(re.findall(findArea, item))
        data['com'] = ''.join(re.findall(findCom, item))
        data['comlink'] = ''.join(re.findall(findComLink, item))
        data['salary'] = ''.join(re.findall(findSalary, item))
        strhtml = 'https://jobs.51job.com/'
        # only follow rows that link to standard job and company pages;
        # rows without them would lack the fields saveDB expects
        if data['link'].startswith(strhtml) and data['comlink'].startswith(strhtml):
            data.update(getCOM(data['comlink']))  # company type, size, industry
            data.update(getREC(data['link']))     # experience, education, headcount, date, description
            datalist.append(data)
    return datalist
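The six find* patterns are defined elsewhere in the original script and are not shown in the post. Their exact form depends on 51job's list-page markup at the time; a hypothetical sketch, assuming the old t1..t4 column classes under div.el, might look like this:

findLink = re.compile(r'class="t1.*?href="(.*?)"', re.S)        # posting URL (hypothetical)
findTitle = re.compile(r'class="t1.*?title="(.*?)"', re.S)      # job title (hypothetical)
findCom = re.compile(r'<span class="t2"><a target="_blank" title="(.*?)"')           # company name
findComLink = re.compile(r'<span class="t2"><a target="_blank" title=".*?" href="(.*?)"')  # company URL
findArea = re.compile(r'<span class="t3">(.*?)</span>')         # location
findSalary = re.compile(r'<span class="t4">(.*?)</span>')       # salary range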
Company scraper (getCOM): extracts the company type, company size, and industry from the company page. The .ltype paragraph on a company page reads like 民营公司 | 150-500人 | 计算机软件, so after splitting on the separator, each piece is matched against the known type and size lists, and whatever remains is taken as the industry.
def getCOM(com_link):
    com_html = askURL(com_link)
    bs = BeautifulSoup(com_html, "html.parser")
    # company attributes; these literals must match the site's
    # simplified-Chinese page text exactly
    CP_TYPE = ['民营公司', '上市公司', '事业单位', '国企', '外资(欧美)', '外资(非欧美)',
               '创业公司', '政府机关', '合资', '外资', '外企代表处', '非营利组织']
    CP_SCALE = ['少于50人', '50-150人', '150-500人', '500-1000人',
                '1000-5000人', '5000-10000人', '10000人以上']
    cp_info = bs.select('.in > p.ltype')[0].text.split('\xa0\xa0|\xa0\xa0')
    com_data = {}
    com_data['cp_type'] = com_data['cp_scale'] = com_data['industry'] = ''
    for i in CP_TYPE:
        if i in cp_info:
            com_data['cp_type'] = i
            break
    for i in CP_SCALE:
        if i in cp_info:
            com_data['cp_scale'] = i
            break
    for i in cp_info:
        if i not in CP_TYPE and i not in CP_SCALE:
            com_data['industry'] = i
    return com_data
Posting scraper (getREC): extracts the experience requirement, education requirement, headcount, publication date, and job description from the posting page.
def getREC(rec_link):
    jobHtml = askURL(rec_link)  # fetch the posting's detail page
    bs = BeautifulSoup(jobHtml, "html.parser")
    # experience, education, headcount, publication date
    text = bs.select(".ltype")
    job = {}
    if len(text) != 0:
        info = text[0].text.split('\xa0\xa0|\xa0\xa0')
        EDU = ['博士', '硕士', '本科', '大专',
               '中专', '中技', '高中', '初中及以下']
        job['exp'] = job['edu'] = job['other'] = job['demand'] = job['pubdate'] = " "
        for i in info:
            if '经验' in i:       # e.g. "3-4年经验"
                job['exp'] = i
            elif i in EDU:
                job['edu'] = i
            elif '招' in i:       # e.g. "招2人"
                job['demand'] = i
            elif '发布' in i:     # e.g. "05-20发布"
                job['pubdate'] = i
            else:
                job['other'] = i
    else:
        job['exp'] = job['edu'] = job['other'] = job['demand'] = job['pubdate'] = " "
    job['msg'] = " "
    jobMsgList = bs.select(".job_msg > p")  # job description paragraphs
    jobMsgStr = ""
    for p in jobMsgList:  # renamed from "str" to avoid shadowing the built-in
        jobMsgStr = jobMsgStr + p.text
    job["msg"] = jobMsgStr
    # jobList.append(job)
    return job
Data storage (saveDB):
def saveDB(datalist, dbpath):
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    for data in datalist:
        print(data)
        sql = '''
            insert or ignore into job_quanguo(
                link,title,comlink,com,area,salary,cp_type,cp_scale,industry,exp,edu,other,demand,pubdate,msg
            )
            values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)'''
        # print(sql)
        cur.execute(sql, (data['link'], data['title'], data['comlink'], data['com'], data['area'], data['salary'],
                          data['cp_type'], data['cp_scale'], data['industry'], data['exp'], data['edu'],
                          data['other'], data['demand'], data['pubdate'], data['msg']))
    conn.commit()
    cur.close()
    conn.close()
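init_db is called above but not shown in the post. A minimal sketch consistent with the INSERT statement, assuming link is the primary key so that insert or ignore deduplicates postings:

def init_db(dbpath):
    # Hypothetical schema inferred from the INSERT above; the original init_db is not shown.
    sql = '''
        create table if not exists job_quanguo(
            link text primary key,
            title text, comlink text, com text, area text, salary text,
            cp_type text, cp_scale text, industry text,
            exp text, edu text, other text, demand text, pubdate text, msg text
        )'''
    conn = sqlite3.connect(dbpath)
    conn.execute(sql)
    conn.commit()
    conn.close()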
Crawl results:
In total, 7,277 records were collected, covering company name, link, job title, location, salary, company type, company size, industry, job requirements, posting details, and more.
3. Data Processing
Salary processing: salaries are shown as a range with varying units (千/月, 万/月, 万/年), so each is normalized to yuan per month and split into three columns: lower bound, upper bound, and average. Records without a parseable salary are dropped.
import numpy as np
import pandas as pd

def getSalary():
    datalist = []
    con = sqlite3.connect("51job.db")
    cur = con.cursor()
    sql = "SELECT com,title,area,cp_type,cp_scale,industry,exp,edu,salary FROM job_quanguo"
    data_quanguo = cur.execute(sql)
    for item in data_quanguo:
        string = "".join(item[8])
        # normalize each unit to yuan per month
        if string.endswith('千/月'):
            num = string.replace('千/月', '').split('-')
            sal = pd.to_numeric(num) * 1000
        elif string.endswith('万/月'):
            num = string.replace('万/月', '').split('-')
            sal = pd.to_numeric(num) * 10000
        elif string.endswith('万/年'):
            num = string.replace('万/年', '').split('-')
            sal = pd.to_numeric(num) * 10000 / 12
        else:
            continue  # drop records without a parseable salary range
        data1 = append_other(item)   # the non-salary columns (sketched below)
        data2 = append_salary(sal)   # low/high/average salary
        data = dict(data1.items(), **data2)
        datalist.append(data)
    cur.close()
    con.close()
    # df_salary = pd.DataFrame(columns=['low-salary','high-salary'])
    dbpath = "./51job.db"
    saveDB(datalist, dbpath)  # presumably a second saveDB variant that writes the processed table
def append_salary(sal):
    data1 = {}
    data1['low-salary'] = sal[0].astype(np.int64)
    data1['high-salary'] = sal[1].astype(np.int64)
    data1['avg-salary'] = (sal[0].astype(np.int64) + sal[1].astype(np.int64)) / 2
    return data1
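append_other is also not shown in the post. Given the SELECT column order (com, title, area, cp_type, cp_scale, industry, exp, edu, salary), it presumably just maps the non-salary columns into a dict; a minimal sketch:

def append_other(item):
    # Hypothetical helper: name the first eight SELECT columns.
    keys = ['com', 'title', 'area', 'cp_type', 'cp_scale', 'industry', 'exp', 'edu']
    return dict(zip(keys, item[:8]))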
4. Flask and ECharts
This part mainly follows the official ECharts documentation, so I won't walk through it point by point.
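For orientation, the usual pattern is a Flask route that aggregates rows out of 51job.db and returns JSON, which the page then feeds into an ECharts option. A minimal sketch, with a hypothetical /top_areas endpoint (the route name and query are illustrative, not the post's actual code):

from flask import Flask, jsonify
import sqlite3

app = Flask(__name__)

@app.route('/top_areas')  # hypothetical endpoint, not from the original post
def top_areas():
    # count postings per location for a simple ECharts bar chart
    conn = sqlite3.connect('51job.db')
    cur = conn.cursor()
    cur.execute("SELECT area, COUNT(*) FROM job_quanguo GROUP BY area ORDER BY 2 DESC LIMIT 10")
    rows = cur.fetchall()
    conn.close()
    return jsonify(areas=[r[0] for r in rows], counts=[r[1] for r in rows])

On the page, areas would go into the xAxis data and counts into a bar series of the ECharts option.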