pandas保存、讀取外部數據
1,pandas保存、讀取csv文件
import numpy as np
import pandas as pd
import numpy as np
# Build a reproducible 500 x 507 table of normally distributed values,
# rounded to one decimal place.
np.random.seed(4)
day_data = np.random.normal(0, 1, (500, 507)).round(1)
n_rows, n_cols = day_data.shape
stock_list = [f"股票{i}" for i in range(n_rows)]
date = [f"第{i}天" for i in range(n_cols)]
df = pd.DataFrame(data=day_data, index=stock_list, columns=date)

# Save only the first two day-columns to a CSV file.
#   index  - True writes the row labels as the first CSV column
#   mode   - 'w' overwrites the file, 'a' appends to it
#   header - whether the column labels are written
csv_path = "./test.csv"
df.to_csv(csv_path, columns=["第0天", "第1天"], index=True, mode="w", header=True)

# Read the CSV back, loading only the column that is actually needed.
df = pd.read_csv(csv_path, usecols=["第0天"])
print(df)
2,hdf文件保存、讀取數據
import numpy as np
import pandas as pd
import numpy as np
# Build a reproducible 500 x 507 table of normally distributed values,
# rounded to one decimal place.
np.random.seed(4)
day_data = np.random.normal(0, 1, (500, 507))
day_data = np.round(day_data, 1)
stock_list = ["股票" + str(i) for i in range(day_data.shape[0])]
date = ["第" + str(i) + "天" for i in range(day_data.shape[1])]
df = pd.DataFrame(day_data, index=stock_list, columns=date)

# Store the frame in an HDF5 file. `key` names the object inside the file;
# a single file can hold several objects under different keys.
df.to_hdf("./hdf1.h5", key="lalala")

# Read the frame back using the same key. to_hdf appends to an existing file
# by default, so the file may contain several keys, and read_hdf without a
# key only works when the file holds exactly one object.
temp = pd.read_hdf("./hdf1.h5", key="lalala")
注意:
# 需要安裝tables模塊,避免不能讀取hdf(h5)文件
pip install tables
# 一個h5文件可以放入多個key,來保存這種三維的結構
優先選擇hdf文件儲存:
- hdf在儲存的時候支持壓縮,使用的方式是blosc,這個是速度最快的也是pandas默認支持的。
- 使用壓縮可以提升磁盤利用率,節省空間
- hdf還是跨平臺的,可以輕鬆遷移到hadoop上面
3,pandas讀取MongoDB數據
# coding=utf-8
import pandas as pd
from pymongo import MongoClient
# Connect to MongoDB; no host or port is given, so pymongo's defaults apply.
client = MongoClient()
# Collection "tv1" in database "douban".
collection = client["douban"]["tv1"]

# Load every document of the collection into memory.
data = list(collection.find())

# Turn the first document into a Series (field name -> value).
# Raises IndexError if the collection is empty.
t1 = pd.Series(data[0])
print(t1)
4,pandas讀取mysql數據
import pandas as pd
from sqlalchemy import create_engine
# Create the engine for the local MySQL database "yoyo" (pymysql driver).
db_url = 'mysql+pymysql://root:root@localhost:3306/yoyo'
engine = create_engine(db_url)

# Run the query and load its result set into a DataFrame.
sql = "select * from role_info;"
df = pd.read_sql_query(sql, con=engine)
print(df, type(df))
參考自:https://www.cnblogs.com/fuqia/p/8996033.html
實例:
import pandas as pd
from sqlalchemy import create_engine
# Create the engine for the local MySQL database "yoyo" (pymysql driver).
engine = create_engine('mysql+pymysql://root:root@localhost:3306/yoyo')

# Load the whole sign-in table into a DataFrame.
sql = "select * from book_management_signin;"
df = pd.read_sql(sql, con=engine)

# Order the rows by sign_number, largest first.
df = df.sort_values("sign_number", ascending=False)

# Selecting rows vs. columns with square brackets:
#   df[2:4]            - a slice selects rows
#   df['sign_present'] - a string selects the column with that label
selected_rows = df[2:4]
sign_present = df['sign_present']
print(selected_rows, sign_present, type(sign_present))