爬取百度地圖遷徙數據的方法,請參考這篇文章:
python_爬蟲_百度地圖遷徙_遷入地來源_遷出目的地
import os
import re
from urllib import request
import xlwt
from utils.read_write import readTXT, writeOneJSON, eachFile
# Switch into the directory that holds the downloaded JSON files; the JSON
# file names built in city_range() carry no directory part, so they are
# presumably resolved against this working directory.
JSON_DIR = r'D:\data\百度遷徙大數據\最新城市省份流入流出數據\json'
os.chdir(JSON_DIR)
# Build the Excel cell style
def set_style(name, height, bold=False):
    """Return an ``xlwt.XFStyle`` whose font uses the given settings.

    Args:
        name: Font family name, e.g. ``'Times New Roman'``.
        height: Font height, assigned unchanged to ``xlwt.Font.height``
            (xlwt measures it in 1/20 of a point, so 200 is 10 pt).
        bold: Whether the font is bold.

    Returns:
        A new ``xlwt.XFStyle`` carrying the configured font.
    """
    font = xlwt.Font()
    font.name = name
    font.bold = bold
    # xlwt spells this attribute ``colour_index``. The previous
    # ``color_index`` assignment only created an attribute xlwt never
    # reads, so palette index 4 was silently not applied.
    font.colour_index = 4
    font.height = height

    style = xlwt.XFStyle()
    style.font = font
    return style
# Workbook and sheet shared by the whole script: city_range() writes into
# sheet2 and date_change() saves the workbook.
f = xlwt.Workbook()
# cell_overwrite_ok=True allows the same cells to be written more than once.
sheet2 = f.add_sheet(u'sheet2', cell_overwrite_ok=True)

# Header titles: four groups of three columns, in the order city inflow,
# city outflow, province inflow, province outflow (matching the column
# offsets 0, 3, 6 and 9 used by city_range()).
row0 = [u'遷入城市',u'所在城市',u'佔比',u'遷出城市',u'所在城市',u'佔比',u'遷入省份',u'所在城市',u'佔比',u'遷出省份',u'所在城市',u'佔比']

# Write the header row, building the bold header style once rather than
# once per cell.
_header_style = set_style('Times New Roman', 200, True)
for _col, _title in enumerate(row0):
    sheet2.write(0, _col, _title, _header_style)

# Style used for every data cell.
default = set_style('Times New Roman', 220)

# Read the city-code text file into a list of strings. A raw string keeps
# the Windows backslashes literal without relying on invalid escape
# sequences (which newer Python versions warn about); the resulting path is
# unchanged.
lines = readTXT(r'D:\project\jianguiyuan\data\BaiduMap_cityCode_1102.txt')
# The data must first have been downloaded as JSON files.

# Patterns are compiled once at import time instead of on every loop
# iteration. They are applied to the file content after all backslashes
# have been removed.
_CITY_NAME_RE = re.compile('{"city_name":"(.*?)","province_name":".*?","value":.*?}')
_CITY_VALUE_RE = re.compile('{"city_name":".*?","province_name":".*?","value":(.*?)}')
# NOTE(review): unlike the city pattern, this group is not wrapped in
# quotes, so the captured province name keeps its surrounding double
# quotes. Kept as in the original -- confirm whether that is intended.
_PROVINCE_NAME_RE = re.compile('{"province_name":(.*?),"value":.*?}')
_PROVINCE_VALUE_RE = re.compile('{"province_name":".*?","value":(.*?)}')

# One entry per input file kind:
# (file-name prefix, first sheet column, name pattern, value pattern).
_SECTIONS = (
    ("城市遷入_", 0, _CITY_NAME_RE, _CITY_VALUE_RE),
    ("城市遷出_", 3, _CITY_NAME_RE, _CITY_VALUE_RE),
    ("省份遷入_", 6, _PROVINCE_NAME_RE, _PROVINCE_VALUE_RE),
    ("省份遷出_", 9, _PROVINCE_NAME_RE, _PROVINCE_VALUE_RE),
)


def _write_section(city_index, city_name, riqi, prefix, first_col,
                   name_re, value_re):
    """Write one JSON file's names, city label and values into three columns."""
    file_lines = readTXT(prefix + city_name + "_" + riqi + ".json")
    # Only the first line of the file is parsed, with backslashes removed.
    payload = file_lines[0].replace('\\', '')
    names = name_re.findall(payload)
    values = value_re.findall(payload)

    # Rows are laid out as fixed-size blocks, one block per city index.
    # NOTE(review): the block size is the number of names found in *this*
    # file, so blocks only tile correctly when every city yields the same
    # number of entries; otherwise blocks overlap or leave gaps. A
    # city_index of 0 would produce a negative row.
    first_row = len(names) * (city_index - 1) + 1

    for offset, name in enumerate(names):
        sheet2.write(first_row + offset, first_col, name, default)
        sheet2.write(first_row + offset, first_col + 1, city_name, default)
    for offset, value in enumerate(values):
        sheet2.write(first_row + offset, first_col + 2, value, default)


def city_range(n, riqi, end=327):
    """Fill the shared sheet with the data of every listed city for one date.

    For each entry ``lines[i]`` with ``n <= i < end``, the four files
    ``城市遷入_``, ``城市遷出_``, ``省份遷入_`` and ``省份遷出_`` followed by
    ``<city>_<riqi>.json`` are read with ``readTXT`` and their extracted
    names and values are written to the module-level ``sheet2``.

    Args:
        n: Index into the module-level ``lines`` list to start from.
        riqi: Date string used in the JSON file names, e.g. ``"20200104"``.
        end: Exclusive upper bound of the index range. Defaults to 327,
            the bound that used to be hard-coded.
    """
    for i in range(n, end):
        print(i)
        # Split the city id from the city name; only the name (field 1)
        # is used.
        obj = lines[i].split(',')
        city_name = obj[1]
        print(city_name)
        for prefix, first_col, name_re, value_re in _SECTIONS:
            _write_section(i, city_name, riqi, prefix, first_col,
                           name_re, value_re)
def date_change(date):
    """Export one ``.xls`` workbook per date, starting at ``date``.

    Dates are handled as plain ``YYYYMMDD`` integers, so every month needs
    its own ``range`` to avoid generating invalid day numbers.

    NOTE(review): ``range`` excludes its end value, so 20200131 is not
    processed, while February runs through 20200229 and March through
    20200315. ``date`` only limits the January range; the February and
    March ranges are always processed in full. Confirm both are intended.

    Args:
        date: First date to process, as an integer such as ``20200104``.
    """
    month_ranges = (
        range(date, 20200131),
        range(20200201, 20200230),
        range(20200301, 20200316),
    )
    date_list = []
    for days in month_ranges:
        date_list.extend(str(day) for day in days)

    # A raw string keeps the Windows backslashes literal without relying on
    # invalid escape sequences; the resulting path is identical to before.
    out_dir = r'D:\data\人口數據\百度遷徙大數據\最新城市省份流入流出數據\riqi'

    for riqi in date_list:
        print(riqi)
        city_range(1, riqi)
        print("大吉大利,今晚喫雞啊!")
        # NOTE(review): the same module-level workbook is saved for every
        # date without being cleared, so cells that a later date does not
        # overwrite keep the previous date's values.
        filename = out_dir + '\\' + riqi + '.xls'
        f.save(filename)
if __name__ == '__main__':
    # Run the export starting from 4 January 2020. The JSON input
    # directory is not passed in here; it is selected by the os.chdir()
    # call near the top of the script.
    first_date = 20200104
    date_change(first_date)
其中的參考文件請移步到「我的下載」。
我的下載
如需幫忙處理數據和爬取數據,請私聊我……