爬取百度地圖遷徙數據的方法請參考這篇文章:
python_爬蟲_百度地圖遷徙_遷入地來源_遷出目的地
將json數據處理成excel請參考這篇文章:
python_將爬取的百度地圖遷徙json數據寫入到excel
原始數據格式:
"jsonp_1584195671576_1286958({"errno":0,"errmsg":"SUCCESS","data":{"list":[{"province_name":"山東省","value":42.64},{"province_name":"河南省","value":24.15},
…
{"province_name":"青海省","value":0.02},{"province_name":"新疆維吾爾自治區","value":0.01}]}})"
處理成功的數據格式,矩陣的格式可用於機器學習研究
import os
import re
from utils.read_write import readTXT, writeOneJSON, eachFile, writeOneCSV
# Work inside the folder holding the downloaded Baidu migration JSON files,
# so city_range() can open them by bare filename.
os.chdir(r'D:\data\百度遷徙大數據\最新城市省份流入流出數據\json')
# Read the Baidu city-code table into a list of strings, one "code,name" row per line.
lines = readTXT('D:\project\jianguiyuan\data\BaiduMap_cityCode_1102.txt')
# CSV header row: slot 0 is a placeholder so columns 1..326 line up with
# the 326 city names (second field of each code-table row).
title = [0] + [row.split(',')[1] for row in lines[1:327]]
def writeTitle(riqi):
    """Write the city-name header row to both matrix CSVs for date `riqi`."""
    for matrix in ('各城市遷入矩陣', '各城市遷出矩陣'):
        writeOneCSV(title, dir + matrix + "_" + riqi + '.csv')
# 先將數據下載爲json文件
def _matrix_row(json_file, label):
    """Parse one downloaded migration JSON file into a CSV matrix row.

    Returns ``[label, v_1, ..., v_326]`` where column ``m`` holds the
    migration value of the city at ``title[m]`` (int 0 when that city does
    not appear in the file, matching the original output format).
    """
    # readTXT returns the file as a list of lines; the jsonp payload is the
    # first line.  Strip backslash escapes before regex matching.
    payload = readTXT(json_file)[0].replace('\\', '')
    names = re.findall('{"city_name":"(.*?)","province_name":".*?","value":.*?}', payload)
    values = re.findall('{"city_name":".*?","province_name":".*?","value":(.*?)}', payload)
    # Column index of every city name; title[0] is the int 0 placeholder and
    # can never match a string name, so enumerating from 0 is safe.
    col_of = {city: k for k, city in enumerate(title)}
    # Pair each name with its own value BEFORE filtering out cities that are
    # missing from the code table.  The original filtered first and then
    # indexed the unfiltered value list, which misaligned every value after
    # an unrecognised city name.  Unrecognised cities are still dropped.
    col_value = {}
    for name, value in zip(names, values):
        if name in col_of:
            col_value[col_of[name]] = float(value)
    return [label] + [col_value.get(m, 0) for m in range(1, 327)]

def city_range(n, riqi):
    """Append in/out migration matrix rows for date `riqi`.

    For every city in the code table from index `n` through 326, parses its
    downloaded in-migration and out-migration JSON files and appends one row
    per direction to the matching CSV under the global `dir`.
    """
    for i in range(n, 327):
        # Code-table rows are "cityId,cityName"; the name keys the filenames.
        city = lines[i].split(',')[1]
        qianru = _matrix_row("城市遷入_" + city + "_" + riqi + ".json", city)
        qianchu = _matrix_row("城市遷出_" + city + "_" + riqi + ".json", city)
        writeOneCSV(qianru, dir + '各城市遷入矩陣' + "_" + riqi + '.csv')
        writeOneCSV(qianchu, dir + '各城市遷出矩陣' + "_" + riqi + '.csv')
def date_change(date):
date_list=[]
# 注意這個日期,一個月只有31天,爬取2月份的數據需要重新改
for riqi in range(date, 20200131):
date_list.append(str(riqi))
for riqi in range(20200201, 20200230):
date_list.append(str(riqi))
for riqi in range(20200301, 20200328):
date_list.append(str(riqi))
for riqi in date_list:
print(riqi)
writeTitle(riqi)
city_range(1,riqi)
print("大吉大利,今晚喫雞啊!")
if __name__ == '__main__':
    # Output directory for the generated matrix CSVs.  NOTE(review): `dir`
    # shadows the builtin, but the functions above read this exact global
    # name, so it cannot be renamed here alone.
    dir = 'D:\data\人口數據\百度遷徙大數據\最新城市省份流入流出數據\矩陣\\'
    date_change(20200101)
其中的參考文件請移步到我的下載
我的下載
如需幫忙處理數據和爬取數據請私聊我。。。