簡介
- 本文針對北京、上海、深圳、廣州、杭州、武漢這六個城市進行 GIS 就業分析,分析內容包括:GIS 行業需求量、GIS 開發人員需求佔比、GIS 相關技能,以及薪資情況。
- 本文數據來源:智聯招聘(zhaopin.com)。
數據下載
爬蟲方案相對簡單:直接請求智聯招聘的職位搜索接口,逐頁取回結果並存入 MongoDB 即可。
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# __file__: download_zhilian
import requests
import json
import pymongo
# Module-level MongoDB handle: server at localhost:27017, database `lvyou`,
# collection `gis_zhaoping`.
client = pymongo.MongoClient(host='localhost', port=27017)
db = client.lvyou
collection = db.gis_zhaoping
# Search-API URL template. Its four `{}` placeholders are filled positionally
# by str.format and correspond to the query parameters `start`, `cityId`, `rt`
# and `x-zp-page-request-id`. The keyword (`kw=gis`) and the page size
# (`pageSize=90`) are fixed inside the template.
url = "https://fe-api.zhaopin.com/c/i/sou?start={}&pageSize=90&cityId={}&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=gis&kt=3&=0&rt={}&_v=0.53968509&x-zp-page-request-id={}"
# HTTP headers sent with every request to the search API.
headers = {
"Accept": "application/json, text/javascript, */*; q=0.01",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9",
"Connection": "keep-alive",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
}
def insert_mongo(all):
    """Store every downloaded job posting in the module-level ``collection``.

    :param all: iterable of result pages; each page is a list of job
        documents (dicts), in the shape produced by ``get_zhiwei``.
    """
    for page in all:
        # insert_many rejects an empty document list, so skip empty pages.
        if page:
            # One bulk write per page instead of one round-trip per document.
            collection.insert_many(page)
def get_zhiwei(cityid):
    """Download the GIS job listings of one city, page by page.

    Requests up to 20 pages (offsets 0, 90, 180, ...) built from the ``url``
    template and stops at the first page that comes back without results.

    :param cityid: city code substituted into the ``cityId`` query parameter.
    :return: list of pages; each page is the non-empty ``data.results`` list
        of one response.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    all_data = []
    for page_index in range(20):
        req_url = url.format(
            page_index * 90,
            cityid,
            'dc52638bdafe4e749e72a26c3aac727d',
            '6f3e278b4d344939b526c1f21a74b26e-1557706634617-533000',
        )
        # A timeout keeps a stalled connection from hanging the crawl forever.
        response = requests.get(url=req_url, headers=headers, timeout=30)
        # Fail loudly on an HTTP error instead of trying to parse its body.
        response.raise_for_status()
        payload = response.json()
        # A payload lacking "data" or "results" is treated like an empty page.
        page = (payload.get('data') or {}).get('results')
        if not page:
            # Report the index of the first page that had no results.
            print(page_index)
            break
        all_data.append(page)
    # all_data holds pages, so count the postings inside them, not the pages.
    print("職位數量", sum(len(page) for page in all_data))
    return all_data
def spider(cityid):
    """Crawl one city: fetch its job pages, then persist them to MongoDB.

    :param cityid: city code handed to ``get_zhiwei``.
    """
    insert_mongo(get_zhiwei(cityid))
    print("完成", cityid)
def run():
    """Crawl every city listed in the hard-coded name -> city-code table."""
    city_codes = {
        "北京": 530,
        "上海": 538,
        "深圳": 765,
        "廣州": 763,
        "杭州": 653,
        "武漢": 736,
    }
    # Only the numeric codes are needed; the names document the table.
    for code in city_codes.values():
        spider(code)
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    run()
- 下載後其中一條數據如下
{"_id":"5cd8b7acd58cb320088356c5","applyType":"1","refreshMulscore":"0.0","endDate":"2019-07-02 00:00:00","showLicence":0,"g_weight":0,"extractSkillTag":["測繪","地理信息系統","團隊合作精神","gis(地理信息系統)","遙感","工程測量","團隊合作","jquery","javascript","gis","信息管理與信息系統","溝通能力","團隊領導","學習能力","地理信息","信息系統","數據庫"],"welfare":["五險一金","年終分紅","帶薪年假","定期體檢"],"salary":"15K-20K","SOU_POSITION_ID":"CC120525764J00155066913","score":609.2526,"number":"CC120525764J00155066913","recruitCount":1,"workingExp":{"code":"305","name":"3-5年"},"companyScore":0,"tagIntHighend":0,"jobName":"gis開發工程師","manualScore":"0.0","eduLevel":{"code":"4","name":"本科"},"rootOrgId":12052576,"recentAndTotal":{"applyTotal":"4","exposureTotal":"460","clickTotal":"4","exposureRecent":"0","clickRecent":"0","applyRecent":"0"},"tags":[],"businessArea":"馬連窪","positionLabel":"{\"qualifications\":null,\"chatWindow\":20,\"jobLight\":[\"五險一金\",\"年終分紅\",\"帶薪年假\",\"定期體檢\"],\"role\":null,\"companyTag\":null,\"level\":null,\"refreshLevel\":1,\"skill\":null}","jobTag":{"searchTag":"五險一金,年終分紅,帶薪年假,定期體檢"},"updateDate":"2019-05-13 07:49:28","g_sort":"sort-ps-score-pqks-ranking","city":{"display":"北京","items":[{"code":"530","name":"北京"}]},"saleType":0,"positionURL":"https://jobs.zhaopin.com/CC120525764J00155066913.htm","industry":"160400,160000,300300,210500","extractNormalizedTag":["信息管理與信息系統","溝通能力","gis(地理信息系統)","團隊領導","學習能力","工程測量","javascript"],"duplicated":0,"geo":{"lon":"116.280067","lat":"40.031482"},"vipLevel":1002,"company":{"number":"CZ120525760","size":{"code":"2","name":"20-99人"},"name":"北京天匯創業軟件有限公司","type":{"code":"4","name":"合資"},"url":"https://company.zhaopin.com/CZ120525760.htm"},"seo":"0","jobType":{"display":"軟件/互聯網開發/系統集成,軟件工程師","items":[{"code":"160000","name":"軟件/互聯網開發/系統集成"},{"code":"45","name":"軟件工程師"}]},"g_query":"query-ps-score-3","resumeCount":4,"createDate":"2019-05-10 
13:34:24","companyLogo":"","futureJob":false,"emplType":"全職","g_source":"source-solr-position","staff":{"id":100782870},"SOU_POSITION_SOURCE_TYPE":1,"expandCount":0,"feedbackRation":0,"staffId":100782870,"selected":false,"applied":false,"collected":false,"isShow":false,"timeState":"最新","rate":""}
數據分析
- 分析部分利用 MongoDB 的查詢與聚合、pandas 的分組求和,以及 matplotlib(plt)的可視化。
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# __file__: attr
import pymongo
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
# Font properties passed to the chart titles and legends below. The path is a
# raw string so its backslashes are taken literally instead of being parsed as
# (invalid) escape sequences such as "\W" or "\F".
zhfont1 = matplotlib.font_manager.FontProperties(fname=r'C:\Windows\Fonts\STFANGSO.TTF')
def get_collection():
    """Return the ``gis_zhaoping`` collection of the ``lvyou`` database.

    A new client for the MongoDB server at localhost:27017 is created on
    every call.
    """
    mongo = pymongo.MongoClient(host='localhost', port=27017)
    return mongo.lvyou.gis_zhaoping
def city_quantity_demanded():
    """Plot and save the number of job postings per city.

    Postings are grouped by ``city.display`` inside MongoDB. Only the part of
    the display name before the first "-" is kept, and counts of entries that
    share that prefix are summed. The bar chart is written to
    ``gis需求量.png`` and then shown.

    :return: dict mapping city name to its number of postings.
    """
    collection = get_collection()
    grouped = collection.aggregate(
        [
            {'$group': {'_id': "$city.display", 'count': {'$sum': 1}}},
        ]
    )
    cities = []
    counts = []
    for row in grouped:
        # Postings without a city display name group under a null _id; they
        # cannot be attributed to a city, so leave them out.
        if not row['_id']:
            continue
        cities.append(row['_id'].split("-")[0])
        counts.append(row['count'])
    df = pd.DataFrame({"city": cities, "count": counts})
    demand = df.groupby("city")["count"].sum().to_dict()

    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    # Start from a fresh figure so nothing drawn earlier leaks into this chart.
    plt.figure()
    names = list(demand.keys())
    # Materialise the values: a dict view is not a sequence matplotlib can
    # reliably convert to bar heights.
    values = list(demand.values())
    positions = range(len(values))
    plt.bar(positions, values, tick_label=names, label="需求量")
    # Write each bar's value just above it.
    for pos, value in zip(positions, values):
        plt.text(pos, value + 0.1, str(value), ha='center')
    plt.title("gis需求量", fontproperties=zhfont1)
    plt.legend(prop=zhfont1)
    plt.savefig("gis需求量.png")
    plt.show()
    return demand
def is_dev():
    """Plot and save the share of postings whose job name mentions development.

    A posting counts as a developer posting when its ``jobName`` matches the
    regular expression ``.*開發.*``. The pie chart is written to
    ``開發者需求量.png`` and then shown.

    :return: None
    """
    collection = get_collection()
    db_count = collection.count_documents({})
    if db_count == 0:
        # With no postings both slices are zero and the percentages are
        # undefined (0/0), so there is nothing meaningful to draw.
        return
    dev_count = collection.count_documents({'jobName': {'$regex': ".*開發.*"}})
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    # Start from a fresh figure so nothing drawn earlier leaks into this chart.
    plt.figure()
    plt.pie(
        [dev_count, db_count - dev_count],
        labels=["開發者需求", "非開發者需求"],
        autopct='%1.2f%%',
    )
    plt.title("開發者需求量", fontproperties=zhfont1)
    plt.legend(prop=zhfont1)
    plt.savefig("開發者需求量.png")
    plt.show()
def gis_skill_sort():
    """Plot and save the 20 most frequent skill tags across all postings.

    The tags in ``extractSkillTag`` and ``extractNormalizedTag`` of every
    document are pooled and counted, so a tag present in both fields of one
    posting is counted twice. The top-20 table and dict are printed, and the
    bar chart is written to ``gis技能排名.png`` and then shown.

    :return: None
    """
    collection = get_collection()
    tags = []
    for doc in collection.find():
        # Guard against postings that lack (or null out) either tag field.
        tags.extend(doc.get('extractSkillTag') or [])
        tags.extend(doc.get('extractNormalizedTag') or [])
    tag_counts = list_count(tags)
    df = pd.DataFrame({
        "profession": list(tag_counts.keys()),
        "count": list(tag_counts.values()),
    })
    top = df.groupby("profession").sum()
    top = top.sort_values(by=['count'], ascending=False).head(20)
    skill_data = top['count'].to_dict()
    print(top)
    print(skill_data)

    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    # Fresh, large figure: 20 rotated tag labels need the room, and nothing
    # drawn earlier should leak into this chart.
    plt.figure(figsize=(26, 24))
    names = list(skill_data.keys())
    # Materialise the values: a dict view is not a sequence matplotlib can
    # reliably convert to bar heights.
    values = list(skill_data.values())
    positions = range(len(values))
    plt.bar(positions, values, tick_label=names, label="技能排名")
    # Write each bar's value just above it.
    for pos, value in zip(positions, values):
        plt.text(pos, value + 0.1, str(value), ha='center')
    plt.xticks(rotation=270)
    plt.title("技能排名", fontproperties=zhfont1)
    plt.legend(prop=zhfont1)
    plt.savefig("gis技能排名.png")
    plt.show()
def list_count(result):
    """Count how often each item occurs in ``result``.

    :param result: iterable of hashable items.
    :return: plain dict mapping each distinct item to its number of
        occurrences, keyed in order of first appearance.
    """
    # Imported locally so the function does not depend on the file's import block.
    from collections import Counter

    # Convert back to a plain dict so callers get the same type as before.
    return dict(Counter(result))
def salary_bar(data, title):
    """Draw, save and show a bar chart of ``data``.

    :param data: mapping of x-axis label to bar height.
    :param title: chart title; also used as the legend label and as the stem
        of the output file ``<title>.png``.
    :return: None
    """
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    # Start from a fresh figure: this function is called once per chart, and
    # without it a second call could draw on top of the first chart.
    plt.figure()
    labels = list(data.keys())
    # Materialise the values: a dict view is not a sequence matplotlib can
    # reliably convert to bar heights.
    heights = list(data.values())
    positions = range(len(heights))
    plt.bar(positions, heights, tick_label=labels, label=title)
    # Write each bar's value just above it.
    for pos, height in zip(positions, heights):
        plt.text(pos, height + 0.1, str(height), ha='center')
    plt.xticks(rotation=270)
    plt.title(title, fontproperties=zhfont1)
    plt.legend(prop=zhfont1)
    plt.savefig("{}.png".format(title))
    plt.show()
def _top_salary_counts(salaries, limit=20):
    """Return the ``limit`` most frequent salary strings mapped to their counts."""
    counts = list_count(salaries)
    df = pd.DataFrame({
        "profession": list(counts.keys()),
        "count": list(counts.values()),
    })
    top = df.groupby("profession").sum()
    top = top.sort_values(by=['count'], ascending=False).head(limit)
    return top['count'].to_dict()


def salary_plot():
    """Plot the most common salary ranges for developer and other postings.

    A posting is a developer posting when its ``jobName`` contains "開發".
    For each group the 20 most frequent ``salary`` strings are charted via
    ``salary_bar`` under the titles "開發者薪資" and "非開發者薪資".

    :return: None
    """
    collection = get_collection()
    dev_salary = []
    not_dev_salary = []
    for doc in collection.find():
        salary = doc.get('salary')
        if salary is None:
            # A posting without a salary has nothing to contribute here.
            continue
        # A missing job name is treated as a non-developer posting.
        if "開發" in (doc.get('jobName') or ""):
            dev_salary.append(salary)
        else:
            not_dev_salary.append(salary)

    # Same top-20 aggregation for both groups, then one chart each.
    salary_bar(_top_salary_counts(dev_salary), "開發者薪資")
    salary_bar(_top_salary_counts(not_dev_salary), "非開發者薪資")
# Produce all four analyses, in order, when this file is executed as a script.
if __name__ == "__main__":
    city_quantity_demanded()
    is_dev()
    gis_skill_sort()
    salary_plot()
- gis需求量
- gis技能排名
- gis開發人員需求佔比
- gis薪資情況
- 開發者
- 非開發者
- 開發者