這裏記錄一些使用numpy和pandas進行數據讀取的常用操作。
數據讀取
一般來說,numpy和pandas靈活地交替使用就好。
txt與csv文件讀取
import pandas as pd
# read_csv is used the same way for a .csv file and for a .txt file.
data = pd.read_csv('data.csv')
data = pd.read_csv('data.txt')
# Data = pd.read_csv(file, sep = '')  # pandas: pass sep to choose the field separator
# Note: converting ns3 simulation data.
# Reads the ns3 output, replaces the 'rsrp' and 'sinr' columns by 10*log10 of
# their values, prints summary statistics and appends the result to
# 'data/ns3.txt'.
import numpy as np  # np.log10 is used below; without this import the snippet raises NameError

# NOTE(review): 'ns3.tsxt' looks like a typo for 'ns3.txt' - confirm the real file name.
file = 'ns3.tsxt'
Data = pd.read_csv(file)
# NOTE(review): the +30.0 offset presumably converts dBW to dBm - confirm the input unit.
Data['rsrp'] = 10*np.log10(Data['rsrp']) + 30.0
Data['sinr'] = 10*np.log10(Data['sinr'])
print(Data.describe())
# mode='a+' appends, so running this twice writes a second header row into the file.
Data.to_csv('data/ns3.txt', index=False, header=True, mode='a+')
# Sort the rows by the 'IMSI' column with pandas.
# The original passed ['IMSI'][:3200]; that slice applies to the one-element
# list of column names, not to the rows, so it had no effect and is dropped.
# NOTE(review): if the intent was to sort only the first 3200 rows, slice the
# frame first, e.g. Data.iloc[:3200].sort_values(['IMSI']) - confirm.
Data.sort_values(['IMSI'], inplace=True)
# Read only some of the columns: usecols lists the column positions to keep.
data= pd.read_csv("data.txt", delimiter=',',usecols=[1,2,3,4])
# When the file has no header row, pass header=None so that the first line is
# read as data instead of being used as the column names.
data= pd.read_csv('data.txt', header=None) # pandas
data= np.loadtxt('sensor_data3.csv', delimiter=',', usecols=[0,2,3,4,5,6,7,8]) # numpy
注,delimiter爲分隔符,根據自己的文件格式進行調整,一般都是空格或者逗號。
實測:txt文件與csv文件的讀取時間相同:
import time

# Measure how long a read takes. perf_counter() is a monotonic, high-resolution
# clock intended for timing intervals; time.time() follows the wall clock and
# can jump if the system time is adjusted.
t1 = time.perf_counter()
# ... put the read call being timed here (e.g. pd.read_csv on the file) ...
t2 = time.perf_counter()
print("the time is: ",t2-t1)
csv大文件的讀取
# iterator=True makes read_csv return a reader object instead of loading the
# whole file into memory at once.
chunk = pd.read_csv('sensor_data3.csv', iterator=True)
# Pull the next 50000 rows from the reader as a DataFrame.
data = chunk.get_chunk(50000)
批量文件讀入
讀取某個文件夾下所有帶相同前綴或後綴的文件。
下面例子展示了,讀取文件夾下的所有文件,並存儲到一個‘all_data.txt’中。
import os
import numpy as np
import pandas as pd
def scan_files(directory, prefix=None, postfix=None):
    """Collect the paths of files under *directory* whose names match a filter.

    The directory tree is walked recursively with ``os.walk``.

    Args:
        directory: Root directory to search.
        prefix: Keep files whose name starts with this string. Only
            consulted when *postfix* is not given.
        postfix: Keep files whose name ends with this string. Takes
            precedence over *prefix* when both are given.

    Returns:
        A list of paths (each joined with its containing directory) for the
        matching files. When neither *prefix* nor *postfix* is given the
        list is empty.
    """
    files_list = []
    for root, _sub_dirs, files in os.walk(directory):
        for special_file in files:
            if postfix:
                matched = special_file.endswith(postfix)
            elif prefix:
                matched = special_file.startswith(prefix)
            else:
                # No filter supplied: nothing is selected.
                matched = False
            if matched:
                files_list.append(os.path.join(root, special_file))
    return files_list
def main():
    """Merge selected columns of every '.txt' file under 'luce_station/'.

    Each input file is read line by line, every line is split on single
    spaces, the fields at the positions listed in ``pos`` are kept, and the
    resulting rows are appended to 'all_data.txt' without index or header.
    """
    path = 'luce_station/'
    output_file = "all_data.txt"
    # Column positions to keep; adjust this to drop unwanted columns of the raw data.
    pos = np.array([0, 1, 2, 3, 4, 5, 6, 9, 12])
    for name in scan_files(path, postfix=".txt"):
        print(name)
        rows = []
        # The with-block closes the file even if parsing or writing fails.
        with open(name, 'r') as handle:
            try:
                for text_line in handle:
                    fields = np.asanyarray(text_line.split(" "))
                    # Indexing with an index array already yields a new array.
                    rows.append(fields[pos])
            finally:
                # As in the original, the rows parsed so far are written even
                # when a line fails to parse; mode='a+' appends to the output.
                frame = pd.DataFrame(data=rows, index=None, columns=None)
                frame.to_csv(output_file, index=False, header=False, mode='a+')
# Run main() only when this file is executed as a script. The original
# compared __name__ against the return value of main(), which ran main() on
# every import and never satisfied the comparison.
if __name__ == "__main__":
    main()
數據存儲
output_file = "data.txt"
Data = pd.DataFrame(data=Data, index=None, columns=None) # stored below without the index or the column names
# mode='a+' appends to the file rather than overwriting it.
Data.to_csv(output_file, index=False, header=False, mode='a+')
# For every input file, process 500 consecutive groups of 3200 rows: sort each
# group by 'IMSI', keep its first 1600 rows, append them to
# 'data/ns3_ground.txt' and plot the 'rsrp' column as a 40x40 image.
# NOTE(review): the source had lost its indentation; the nesting below
# (everything inside the inner loop) is reconstructed from the 1600 = 40*40
# row count - confirm against the original script.
for file in file_list:
    Data1 = pd.read_csv(file)
    for i in range(500):
        # Sort a copy of the slice instead of sorting in place, which would
        # operate on a slice of Data1 and trigger SettingWithCopy warnings.
        Data = Data1.iloc[3200*i:3200*i+3200, :].sort_values(['IMSI'])
        Data = Data.iloc[:1600, :]
        # Only the first group of each file writes the header row.
        Data.to_csv('data/ns3_ground.txt', index=False, header=(i == 0), mode='a+')
        # print(Data.describe())
        # print(Data.skew())
        # print(Data.kurt())
        data = np.reshape(np.array(Data['rsrp']), [40, 40])
        plot_image(data)
數據可視化
# Plot four series against the same x values, each with its own line/marker
# style and legend label, then show the figure and save it as a PNG.
# x, data1..data4 and plt are expected to be defined before this snippet.
fig = plt.figure()
plt.plot(x, data1,'r+-' ,label = 'a')
plt.plot(x, data2, 'b*-',label = 'b')
plt.plot(x, data3, '-o',label = 'c')
plt.plot(x, data4, 'g.-', label = 'd')
plt.legend(fontsize='x-large')
plt.xlabel('Missing rate', fontsize='x-large')
plt.ylabel('Mean Absolute Error in dB', fontsize='x-large')
plt.grid()
plt.show()
# NOTE(review): the save runs only after show() returns; consider saving
# before show() - confirm the behaviour with the backend in use.
fig.savefig('result/usrp_cost.png',dpi=300)