使用numpy和pandas進行數據讀取、存儲

這裏記錄一些使用numpy和pandas進行數據讀取的常用操作。

數據讀取

一般來說,numpy和pandas靈活地交替使用即可。

txt與csv文件讀取

import numpy as np
import pandas as pd

# Basic reads: read_csv is used for both the .csv and the .txt file.
data = pd.read_csv('data.csv')
data = pd.read_csv('data.txt')
# To use a separator other than the default, pass it explicitly:
# Data = pd.read_csv(file, sep='<separator>')

# Example: converting ns-3 simulation output.
# NOTE(review): the '.tsxt' extension looks like a typo for '.txt' — confirm
# against the real input file name before changing it.
file = 'ns3.tsxt'
Data = pd.read_csv(file)
# Both columns are passed through 10*log10; 'rsrp' additionally gets +30
# (presumably a linear-to-dB / W-to-dBm conversion — confirm the raw units).
Data['rsrp'] = 10*np.log10(Data['rsrp']) + 30.0
Data['sinr'] = 10*np.log10(Data['sinr'])
print(Data.describe())
# mode='a+' appends to the output file instead of overwriting it.
Data.to_csv('data/ns3.txt', index=False, header=True, mode='a+')

# Sort the rows by a column with pandas.
Data.sort_values(['IMSI'], inplace=True)

# Read only selected columns (by position).
data = pd.read_csv("data.txt", delimiter=',', usecols=[1, 2, 3, 4])

# When the file has no header row, pass header=None so the first line is
# kept as data instead of being consumed as column names.
data = pd.read_csv('data.txt', header=None)  # pandas
data = np.loadtxt('sensor_data3.csv', delimiter=',', usecols=[0, 2, 3, 4, 5, 6, 7, 8])  # numpy

注:delimiter爲分隔符,請根據自己的文件格式進行調整,一般是空格或者逗號。
實測:txt文件與csv文件的讀取時間相同:

import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
t1 = time.perf_counter()
# ... place the call being timed here, e.g. the read_csv call ...
t2 = time.perf_counter()
print("the time is: ",t2-t1)

csv大文件的讀取

# iterator=True makes read_csv return a reader object instead of loading the
# whole file at once, so a large CSV can be consumed piece by piece.
chunk = pd.read_csv('sensor_data3.csv', iterator=True)
# Pull the next batch of up to 50000 rows from the reader.
data = chunk.get_chunk(50000)

批量文件讀入

讀取某個文件夾下所有帶相同前綴或後綴的文件。
下面的例子展示瞭如何讀取文件夾下所有的txt文件,並把結果存儲到同一個‘all_data.txt’文件中。

import os
import numpy as np
import pandas as pd

def scan_files(directory, prefix=None, postfix=None):
    """Recursively collect file paths under *directory* by file-name pattern.

    Selection rules, in order of precedence:

    * ``postfix`` truthy: keep files whose name ends with it (``prefix`` is
      then ignored).
    * otherwise ``prefix`` truthy: keep files whose name starts with it.
    * neither given: nothing matches and an empty list is returned.

    Returns a list of paths built with ``os.path.join(root, name)``, in the
    order ``os.walk`` yields them.
    """
    if postfix:
        def wanted(name):
            return name.endswith(postfix)
    elif prefix:
        def wanted(name):
            return name.startswith(prefix)
    else:
        def wanted(name):
            return False

    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if wanted(name)
    ]

def main():
    """Merge selected columns of every ``.txt`` file under ``luce_station/``.

    Each file returned by ``scan_files`` is read line by line, split on single
    spaces, reduced to the column positions listed in ``pos``, and appended to
    ``all_data.txt`` as CSV rows without index or header.

    Raises:
        IndexError: if a non-blank line has fewer fields than ``pos`` needs.
            Rows parsed before the failure are still appended to the output.
    """
    path = 'luce_station/'
    output_file = "all_data.txt"
    file_name = scan_files(path, postfix=".txt")

    # Column positions to keep; edit this to filter out unneeded raw columns.
    pos = np.array([0,1,2,3,4,5,6,9,12])

    for name in file_name:
        print(name)
        # The context manager closes the file even if writing the CSV fails.
        with open(name, 'r') as file:
            rows = []
            try:
                for text_line in file:
                    # A blank line has no fields to select; skip it instead of
                    # failing on the column lookup.
                    if not text_line.strip():
                        continue
                    fields = np.asanyarray(text_line.split(" "))
                    # Fancy indexing already returns a fresh array.
                    rows.append(fields[pos])
            finally:
                # Flush whatever was parsed, even when a malformed line raised.
                frame = pd.DataFrame(data=rows, index=None, columns=None)
                # mode='a+' appends, so all input files accumulate in one output.
                frame.to_csv(output_file, index=False, header=False, mode='a+')
                del frame  # release the per-file frame before the next file

# Compare against the literal "__main__" so main() runs only when this file is
# executed directly, not as a side effect of importing it.
if __name__ == "__main__":
    main()

數據存儲

# Destination file for the stored data.
output_file = "data.txt"
Data = pd.DataFrame(data=Data, index=None, columns=None) #store without adding an index or column names
# index=False / header=False: write neither the row index nor the column names;
# mode='a+' appends to the file rather than overwriting it.
Data.to_csv(output_file, index=False, header=False, mode='a+') 

for file in file_list:
    Data1 = pd.read_csv(file)
    for i in range(500):
        # Take the i-th block of 3200 rows, order it by IMSI and keep the
        # first 1600 rows. sort_values returns a new frame here, so the slice
        # of Data1 is never sorted in place (avoids SettingWithCopyWarning).
        Data = Data1.iloc[3200*i:3200*i+3200, :].sort_values(['IMSI']).iloc[:1600, :]
        # Write the column header only with the first block of each file;
        # later blocks are appended as bare rows.
        Data.to_csv('data/ns3_ground.txt', index=False, header=(i == 0), mode='a+')
    # Optional summary statistics for the last block (disabled):
#    print(Data.describe())
#    print(Data.skew())
#    print(Data.kurt())
    # Reshape the 'rsrp' column of the last block (1600 rows) into a 40x40
    # grid and hand it to plot_image.
    data = np.reshape(np.array(Data['rsrp']), [40, 40])
    plot_image(data)

數據可視化

# Draw the four series against the shared x values on one figure, then add
# the legend, axis labels and grid, display the figure and save it to disk.
fig = plt.figure()
for series, fmt, tag in (
    (data1, 'r+-', 'a'),
    (data2, 'b*-', 'b'),
    (data3, '-o', 'c'),
    (data4, 'g.-', 'd'),
):
    plt.plot(x, series, fmt, label=tag)

plt.legend(fontsize='x-large')
plt.xlabel('Missing rate', fontsize='x-large')
plt.ylabel('Mean Absolute Error in dB', fontsize='x-large')
plt.grid()
plt.show()
# Saved through the figure handle, so the save targets this figure.
fig.savefig('result/usrp_cost.png', dpi=300)
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章