这里记录一些使用numpy和pandas进行数据读取的常用操作。
数据读取
一般来说,numpy和pandas灵活地交替使用即可。
txt与csv文件读取
import numpy as np
import pandas as pd
# Load a delimited text file into a DataFrame; the same call is used for
# both .csv and .txt files. The second assignment replaces the first.
data = pd.read_csv('data.csv')
data = pd.read_csv('data.txt')
# pandas: to use a different separator, pass it through sep, e.g.
#   Data = pd.read_csv(file, sep='\t')
# Note: converting ns3 simulation output.
# NOTE(review): the '.tsxt' extension below looks like a typo for '.txt' -- confirm.
file = 'ns3.tsxt'
Data = pd.read_csv(file)
# 10*log10(x) maps a linear value to decibels; the extra +30.0 on rsrp
# presumably converts watts to dBm -- TODO confirm the input units.
Data['rsrp'] = 10*np.log10(Data['rsrp']) + 30.0
Data['sinr'] = 10*np.log10(Data['sinr'])
print(Data.describe())
# mode='a+' appends and header=True writes the column names each time, so
# running this more than once repeats the header row inside the output file.
Data.to_csv('data/ns3.txt', index=False, header=True, mode='a+')
# Sort the rows by the IMSI column, in place.
# The key used to be written as ['IMSI'][:3200]; that slice applies to the
# one-element key list rather than to the frame, so it had no effect and has
# been dropped. To sort only the first 3200 rows, slice the frame instead:
#   Data = Data.iloc[:3200].sort_values(['IMSI'])
Data.sort_values(['IMSI'], inplace=True)
# Read only some of the columns: usecols lists the column positions to keep.
data= pd.read_csv("data.txt", delimiter=',',usecols=[1,2,3,4])
# When the file has no header row, pass header=None so that the first line
# is kept as data instead of being used as the column names.
data= pd.read_csv('data.txt', header=None) # pandas
data= np.loadtxt('sensor_data3.csv', delimiter=',', usecols=[0,2,3,4,5,6,7,8]) # numpy: comma-delimited, keeping only the listed column positions
注:delimiter为分隔符,需根据自己的文件格式进行调整,一般是空格或者逗号。
实测:txt文件与csv文件的读取时间相同:
import time
# Timing template: place the read call to be measured between the two samples.
# perf_counter() is a monotonic, high-resolution clock, so the difference is a
# valid elapsed time even if the system clock is adjusted in between.
t1 = time.perf_counter()
t2 = time.perf_counter()
print("the time is: ",t2-t1)
csv大文件的读取
# iterator=True makes read_csv return a reader object instead of loading the
# whole file at once; get_chunk(50000) then pulls the next 50000 rows from it.
chunk = pd.read_csv('sensor_data3.csv', iterator=True)
data = chunk.get_chunk(50000)
批量文件读入
读取某个文件夹下所有带相同前缀或后缀的文件。
下面例子展示了,读取文件夹下的所有文件,并存储到一个‘all_data.txt’中。
import os
import numpy as np
import pandas as pd
def scan_files(directory, prefix=None, postfix=None):
    """Recursively collect the paths of files found under *directory*.

    Args:
        directory: Root folder to walk; files in sub-folders are included.
        prefix: If given, keep only files whose name starts with it.
        postfix: If given, keep only files whose name ends with it.

    Returns:
        A list of paths, each built by joining the folder a file was found
        in with its name. A file is kept when it satisfies every filter that
        was supplied; when no filter is supplied, every file is returned.
        A missing or empty directory yields an empty list.
    """
    files_list = []
    for root, _sub_dirs, files in os.walk(directory):
        for special_file in files:
            # Guard clauses: reject a file as soon as one supplied filter fails.
            if postfix and not special_file.endswith(postfix):
                continue
            if prefix and not special_file.startswith(prefix):
                continue
            files_list.append(os.path.join(root, special_file))
    return files_list
def main(path='luce_station/', output_file="all_data.txt",
         pos=(0, 1, 2, 3, 4, 5, 6, 9, 12)):
    """Merge selected columns of every ``.txt`` file under *path* into one file.

    Each input line is split on single spaces, the fields at the positions
    in *pos* are kept, and the resulting rows are appended to *output_file*
    as CSV without index or header. Because the output is opened in append
    mode, running this repeatedly keeps adding to the same file.

    Args:
        path: Folder that is searched (recursively) for ``.txt`` files.
        output_file: File that the selected columns are appended to.
        pos: Zero-based positions of the fields to keep from each line.

    Raises:
        IndexError: If a non-blank line has fewer fields than *pos* requires.
            Rows parsed from that file before the bad line are still written.
    """
    columns = np.asarray(pos)
    for name in scan_files(path, postfix=".txt"):
        print(name)
        rows = []
        try:
            with open(name, 'r') as handle:
                for text_line in handle:
                    # Blank lines carry no fields to select from.
                    if not text_line.strip():
                        continue
                    # Drop the line terminator first so the last field does
                    # not carry a newline into the CSV output.
                    fields = np.asanyarray(text_line.rstrip("\r\n").split(" "))
                    rows.append(fields[columns])
        finally:
            # Flush whatever was parsed from this file, even if a later line
            # failed, so each file's rows are written before moving on.
            if rows:
                frame = pd.DataFrame(data=rows, index=None, columns=None)
                # mode='a+' appends to the rows written for earlier files.
                frame.to_csv(output_file, index=False, header=False, mode='a+')
# Run the merge only when this file is executed as a script. The guard
# compares against the string "__main__"; comparing against main() would
# call main() while evaluating the condition and then test its None result.
if __name__ == "__main__":
    main()
数据存储
output_file = "data.txt"
# Wrap the data in a DataFrame; index=None and columns=None leave the row and
# column labels at their defaults.
Data = pd.DataFrame(data=Data, index=None, columns=None) # stored without index or column names (see to_csv below)
# index=False and header=False keep the row labels and column names out of
# the file; mode='a+' appends to whatever the file already contains.
Data.to_csv(output_file, index=False, header=False, mode='a+')
# For every input file: walk it in blocks of 3200 rows, sort each block by
# IMSI, keep the first 1600 rows, append them to data/ns3_ground.txt and plot
# the block's rsrp values as a 40x40 grid.
for file in file_list:
    Data1 = pd.read_csv(file)
    for i in range(500):
        Data = Data1.iloc[3200*i:3200*i + 3200, :]
        # 500 is only an upper bound: stop once the file has no rows left,
        # instead of failing later on an empty reshape.
        if Data.empty:
            break
        # Sort a copy rather than sorting the slice in place, which avoids
        # modifying (or warning about) a view of Data1.
        Data = Data.sort_values(['IMSI']).iloc[:1600, :]
        # Column names are written with the first block of each file only.
        # NOTE(review): with several input files this still puts a header row
        # in the middle of the appended output -- confirm that is intended.
        Data.to_csv('data/ns3_ground.txt', index=False, header=(i == 0), mode='a+')
        # Optional diagnostics:
        # print(Data.describe())
        # print(Data.skew())
        # print(Data.kurt())
        # NOTE(review): the plot step is placed inside the per-block loop
        # because each block holds 1600 = 40*40 rows -- confirm the placement.
        data = np.reshape(np.array(Data['rsrp']), [40, 40])
        plot_image(data)
数据可视化
# Draw four series against the same x values on one figure, then show and
# save it.
# NOTE(review): plt, x and data1..data4 are not defined in this snippet; plt
# is presumably matplotlib.pyplot -- confirm and import it before use.
fig = plt.figure()
plt.plot(x, data1,'r+-' ,label = 'a')
plt.plot(x, data2, 'b*-',label = 'b')
plt.plot(x, data3, '-o',label = 'c')
plt.plot(x, data4, 'g.-', label = 'd')
# The legend entries come from the label= values passed to plot() above.
plt.legend(fontsize='x-large')
plt.xlabel('Missing rate', fontsize='x-large')
plt.ylabel('Mean Absolute Error in dB', fontsize='x-large')
plt.grid()
plt.show()
# Saved through the fig handle created above, at 300 dpi.
# NOTE(review): presumably the 'result/' folder must already exist -- confirm.
fig.savefig('result/usrp_cost.png',dpi=300)