需求:
- 前期使用金数据收集数据,现在使用金数据提供的API抓取之前的数据
问题:
curl用法:
curl -u api_key:api_secret https://jinshuju.net/api/v1/forms/ex27t2
中文乱码问题:
curl -u key:secret https://jinshuju.net/api/v1/forms/XXX/entries|iconv -f utf-8 -t gbk
python:
问题解决:
- 认证问题:使用 requests.get(url, auth=(user, password)) 传入 key 和 secret,而不是像官方文档所说的那样放在 headers 里面
- 没有字段名问题:事先下载了一下表格,各取一条数据,然后通过数据比对获取相应的字段名。
- 获取限制问题:数据超过50条时,一次请求拿不全,需要根据返回结果中的 next 继续请求,直到获取全部 data
import requests
import json
import pandas as pd
def get_data():
    """Download the entries of Jinshuju form ``Lo8qfz`` through the v1 REST API.

    The first request goes to ``/forms/Lo8qfz/entries``. When the response
    reports more than 50 entries, follow-up requests are issued with
    ``?next=<cursor>``, always using the cursor from the most recent
    response, until no cursor is returned.

    Returns:
        list: The concatenated ``data`` lists of every page fetched.

    Raises:
        requests.RequestException: If a request fails or returns an HTTP
            error status. "数据获取失败" is printed before re-raising.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Mobile Safari/537.36',
        # NOTE(review): the credentials are passed through ``auth=`` below;
        # this placeholder value is presumably overridden by requests --
        # confirm before removing it.
        'Authorization': 'Basic Auth',
        'Content-Type': 'application/json',
    }
    user = 'key'
    password = 'secret'
    host = 'https://jinshuju.net'
    use_class = '/api/v1'
    entries_path = '/forms/Lo8qfz/entries'
    url = host + use_class + entries_path

    def fetch(page_url):
        """GET one page and return its decoded JSON body."""
        try:
            response = requests.get(page_url, auth=(user, password), headers=headers)
            response.raise_for_status()
        except requests.RequestException:
            # Report the failure, then propagate it: continuing without a
            # valid response would only fail later with a less useful error.
            print("数据获取失败")
            raise
        response.encoding = response.apparent_encoding
        return json.loads(response.text)

    js = fetch(url)
    data = js.get('data') or []

    # NOTE(review): this keeps the original guard, which assumes `count` is
    # the total number of entries rather than the page size -- confirm
    # against the API documentation.
    count = js.get('count')
    if count is not None and count <= 50:
        return data

    # Follow the cursor of the *latest* page; re-using the first page's
    # cursor would request the same page forever.
    next_number = js.get('next')
    while next_number:
        # str() because the cursor may be numeric in the JSON body.
        page = fetch(url + '?' + 'next=' + str(next_number))
        data = data + (page.get('data') or [])
        next_number = page.get('next')
    return data
# Module-level side effect: download the form entries as soon as this script runs.
data = get_data()
def deal_data(data):
    """Flatten raw form entries into a DataFrame with one row per entry.

    For every key of an entry the second ``_``-separated token becomes the
    column name (e.g. ``field_3`` -> ``3``). Values are converted as follows:

    * list of str   -> the items joined with ``,``
    * list of dict  -> ``statement,choice`` per item, items joined with ``;``
    * empty list    -> ``''``
    * anything else -> stored unchanged

    The keys ``creator_name`` and ``serial_number`` are ignored, as are keys
    that contain no underscore. Lists whose first element is neither str nor
    dict produce no value for that column. The input dicts are not modified.

    Args:
        data: Iterable of dicts, one per form entry.

    Returns:
        pandas.DataFrame: One row per entry; empty when ``data`` is empty.
    """
    ignored_keys = {'creator_name', 'serial_number'}
    rows = []
    for dic in data:
        # A fresh dict per entry so values never leak from a previous row.
        new_dic = {}
        for k, v in dic.items():
            if k in ignored_keys:
                continue
            parts = k.split('_')
            if len(parts) < 2:
                # No short name can be derived from this key.
                continue
            k2 = parts[1]
            if isinstance(v, list):
                if not v:
                    new_dic[k2] = ''
                elif isinstance(v[0], str):
                    new_dic[k2] = ','.join(v)
                elif isinstance(v[0], dict):
                    # Built per field, so earlier fields/rows are not repeated.
                    new_dic[k2] = ';'.join(
                        ','.join([item['statement'], item['choice']])
                        for item in v
                    )
            else:
                new_dic[k2] = v
        rows.append(new_dic)
    # Build the frame once; DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(rows)
# Flatten the downloaded entries into a DataFrame (see deal_data).
df2 = deal_data(data)
# Drop the 'at' column (presumably produced from timestamp keys such as
# 'created_at' -- TODO confirm against the API payload).
df3 = df2.drop('at',axis=1)
namelist = df3.columns.to_list()
# Every remaining label is cast to int, so all of them must be digit strings.
namelist2 = [int(i) for i in namelist if i] # sort the numeric column labels
namelist2.sort()
# NOTE(review): `if i` filters on the int value here, so a column labelled
# '0' would be dropped from the list -- confirm this is intended.
namelist = [str(i) for i in namelist2 if i]
# Reorder the columns numerically and discard column '20'.
df4 = df3[namelist].drop('20',axis=1)
## Load the reference table used to recover the real column names
excel_data = pd.read_csv(r'D:\python_code\ruijin_metaboliaze\CODE_API\info_patients.csv')
# Select the same record from both sources by its identifier, so that the
# values of the two rows can be compared column by column.
match_df = excel_data[excel_data['ident_id'].isin(['130403197605251817'])]
oldmatch_df = df4[df4['22'].isin(['130403197605251817'])].reset_index(drop=True)
## Data preprocessing
# Replace missing values with 0 in both rows before they are compared.
oldmatch_df = oldmatch_df.fillna(0)
match_df = match_df.fillna(0)
# Cast these two columns to int before the rows are compared.
match_df[['weight','food_preference']] = match_df[['weight','food_preference']].astype(int)
## Column-name matching function
def match_colname(match_df, oldmatch_df):
    """Map each column of ``match_df`` to the ``oldmatch_df`` column with the same value.

    Only the first row of each frame is compared. A ``match_df`` column
    matches an ``oldmatch_df`` column when either

    * the old value is ``''`` and the new value equals ``0``, or
    * ``str(old value) == str(new value).strip()``.

    Columns of ``match_df`` are processed in order and take the first
    still-unmatched ``oldmatch_df`` column that fits; each ``oldmatch_df``
    column is used at most once. The caller's frames are not modified.

    Args:
        match_df: DataFrame whose column names are the wanted names.
        oldmatch_df: DataFrame whose columns are to be identified.

    Returns:
        dict: ``{match_df column: oldmatch_df column}`` for every matched
        column; empty when either frame is empty.
    """
    m_dict = {}
    if match_df.empty or oldmatch_df.empty:
        return m_dict

    # Read the first row by position (.iloc): the frames may carry an index
    # that does not contain the label 0, e.g. after boolean filtering.
    candidates = [(col, values.iloc[0]) for col, values in oldmatch_df.items()]

    for name, values in match_df.items():
        wanted = values.iloc[0]
        for pos, (old_col, old_value) in enumerate(candidates):
            if (old_value == '' and wanted == 0) or str(old_value) == str(wanted).strip():
                m_dict[name] = old_col
                # Remove the matched column so it cannot be claimed again.
                del candidates[pos]
                break
    return m_dict
# Map each reference column name to the matching numeric column label.
m_dict = match_colname(match_df,oldmatch_df)
f_dict = {v:k for k,v in m_dict.items()} # invert the mapping: numeric label -> reference name
# Rename the numeric columns of the full table to the reference names.
df5 = df4.rename(columns=f_dict)