源數據如下:含髒數據(price列)
下列源碼用到如下四個包:
import pandas as pd
import glob
import os
import csv
目錄
1.基礎文件讀寫(等同於copy文件)
def base_read_and_write():
    """Copy a CSV file: read it into a DataFrame, then write it back out."""
    source = 'csv_python.csv'              # could come from sys.argv[1]
    destination = 'csv_python_write.csv'   # could come from sys.argv[2]
    pd.read_csv(source).to_csv(destination, index=False)
2.行中的值滿足某個條件(含整理原始文件髒數據)
def write_row_in_col():
    """Clean the dirty ``price`` column and select rows by name + price.

    Reads csv_python.csv, strips the '¥' sign and thousands separators
    from ``price``, normalizes every price to units of 10,000 yuan (万)
    — the raw data mixes yuan and 万 — then writes out only the rows
    whose ``name`` contains '世茂' and whose price exceeds 200 (万).
    """
    input_file = 'csv_python.csv'
    output_file = 'csv_python_write.csv'
    data_frame = pd.read_csv(input_file)
    # Clean dirty data: remove currency sign and commas, parse as float.
    data_frame['price'] = (
        data_frame['price'].str.strip('¥').str.replace(',', '').astype(float)
    )
    # Normalize: values above 10000 are in yuan, divide them by 10000.
    # BUG FIX: the original row loop stored the normalized value back as a
    # *string* ('{}'.format(million)), which broke the numeric comparison
    # below; keep the column numeric and vectorize instead.
    data_frame['price'] = data_frame['price'].where(
        data_frame['price'] <= 10000, data_frame['price'] / 10000
    )
    # .loc[rows, :] — before the comma selects rows, after selects columns.
    condition = (
        data_frame['name'].str.contains('世茂') & (data_frame['price'] > 200)
    )
    data_frame_value_meets_condition = data_frame.loc[condition, :]
    data_frame_value_meets_condition.to_csv(output_file, index=False)
運行,新文件顯示如下:
3.行中的值滿足某個集合
def write_row_in_set():
    """Select the rows whose layout ('style') belongs to a fixed set."""
    source = 'csv_python.csv'
    target = 'csv_python_write.csv'
    frame = pd.read_csv(source)
    # Keep only the 2-bed/2-living and 3-bed/2-living apartments.
    wanted_styles = ['2室2廳', '3室2廳']
    mask = frame['style'].isin(wanted_styles)
    frame.loc[mask, :].to_csv(target, index=False)
運行,新文件顯示如下:
4.行中的值匹配於某個模式
def write_row_in_re():
    """Select the rows whose name starts with '世茂'."""
    source = 'csv_python.csv'
    target = 'csv_python_write.csv'
    frame = pd.read_csv(source)
    starts_with_shimao = frame['name'].str.startswith('世茂')
    frame.loc[starts_with_shimao, :].to_csv(target, index=False)
運行,新文件顯示如下:
5.選取特定的列(列索引值)
def write_row_index():
    """Select specific columns by positional index (0, 2, 3 and 4)."""
    source = 'csv_python.csv'
    target = 'csv_python_write.csv'
    frame = pd.read_csv(source)
    wanted_positions = [0, 2, 3, 4]
    frame.iloc[:, wanted_positions].to_csv(target, index=False)
6.選取特定的列(列標題)
def write_row_colname():
    """Select specific columns by their header names."""
    source = 'csv_python.csv'
    target = 'csv_python_write.csv'
    frame = pd.read_csv(source)
    # Keep the name, style, size and price columns, in that order.
    wanted_columns = ['name', 'style', 'size', 'price']
    frame.loc[:, wanted_columns].to_csv(target, index=False)
5,6運行,新文件顯示相同,如下:
7.選取連續的行(丟棄不需要的行)
源數據如下:
在源數據的頭部和尾部有說明文字,但這並不是我們想要的,我們需要把開頭和結尾的說明文字去掉後再寫入新文件。
def write_row_selectrows():
    """Drop the explanatory rows (index 0 and 61) and save the rest.

    The source file carries free-text notes in its first and last data
    rows; this removes both before writing the new file.
    """
    source = 'csv_python_1.csv'
    target = 'csv_python_write.csv'
    frame = pd.read_csv(source)
    # Same effect as frame.reindex(frame.index.drop([0, 61])).
    frame = frame.drop(index=[0, 61])
    frame.to_csv(target, index=False)
運行,新文件顯示如下,開頭和結尾的說明被去掉了:
8.添加標題行
源數據如下:源數據沒有標題行,我們需要添加一個標題行:
#添加標題行
def write_row_addnames():
    """Read a headerless CSV and write it back with a header row added."""
    source = 'csv_python_2.csv'
    target = 'csv_python_write.csv'
    column_names = ['name', 'loc', 'style', 'size', 'price', 'foc']
    # names= supplies the headers the source file lacks.
    pd.read_csv(source, names=column_names).to_csv(target, index=False)
運行,新文件顯示如下,添加了標題行
9.讀取多個csv文件,包含去重
如下圖結果,在當前運行.py文件目錄同級,有如下文件:
我們要找出以csv_python_3_開頭的csv文件,先做一個簡單的計數,然後經過去掉重複數據操作後,寫入到一個文件裏。
def read_multiple_csv():
    """Count, concatenate and de-duplicate every csv_python_3_* file.

    First prints a per-file row/column count and the number of files
    found, then concatenates all of them into one DataFrame, drops
    duplicate rows, and writes the result to csv_python_write.csv.
    """
    # --- per-file counting -------------------------------------------------
    all_files = glob.glob(os.path.join(os.getcwd(), 'csv_python_3_*'))
    file_counter = 0
    for input_file_path in all_files:
        # Start at 1 so the header line is included in the row count.
        row_counter = 1
        with open(input_file_path, 'r', newline='') as csv_in_file:
            filereader = csv.reader(csv_in_file)
            header = next(filereader, None)
            for row in filereader:
                row_counter += 1
        # BUG FIX: a completely empty file yields header = None, and the
        # original len(header) raised TypeError; report 0 columns instead.
        column_count = len(header) if header is not None else 0
        print('file:{}\trows:{}\tcols:{}'.format(
            os.path.basename(input_file_path), row_counter, column_count))
        file_counter += 1
    print('number of files:{}'.format(file_counter))
    # --- concatenate the data from every file ------------------------------
    output_file = 'csv_python_write.csv'
    all_data_frames = []
    for file in all_files:
        data_frame = pd.read_csv(file, index_col=None)
        all_data_frames.append(data_frame)
    # ignore_index=True renumbers the combined index from 0.
    data_frame_concat = pd.concat(all_data_frames, axis=0, ignore_index=True)
    # Drop rows repeated across files.  Listing every column here is
    # equivalent to drop_duplicates() with no subset (fully identical rows).
    data_frame_concat.drop_duplicates(
        ['name', 'loc', 'style', 'size', 'price', 'foc'],
        inplace=True, ignore_index=True)
    data_frame_concat.to_csv(output_file, index=False)
10.計算每個文件中值的總和與平均值,寫入新文件
利用上面的三張表,我們根據表中foc字段(關注數)來計算每張表內總的foc和平均的foc
def read_sum_csv():
    """Write one summary row (total and average 'foc') per csv_python_3_* file.

    For each matching file, sums and averages the ``foc`` (follower
    count) column and writes one row per file — file name, total,
    average — to csv_python_total_write.csv.
    """
    output_file = 'csv_python_total_write.csv'
    all_files = glob.glob(os.path.join(os.getcwd(), 'csv_python_3_*'))
    all_data_frames = []
    for input_file in all_files:
        data_frame = pd.read_csv(input_file, index_col=None)
        # Values may carry stray whitespace; normalize to int before
        # aggregating.  (The original built throwaway DataFrames from
        # generator expressions just to call .sum()/.mean(); a cleaned
        # Series does the same job directly and yields plain scalars.)
        foc = data_frame.loc[:, 'foc'].astype(str).str.strip().astype(int)
        total_foc = foc.sum()       # total follower count for this file
        average_foc = foc.mean()    # average follower count for this file
        summary = {'file_name': os.path.basename(input_file),
                   'total_foc': total_foc,
                   'average_foc': average_foc}
        all_data_frames.append(pd.DataFrame(
            summary, columns=['file_name', 'total_foc', 'average_foc'],
            index=[0]))
    data_frame_concat = pd.concat(all_data_frames, axis=0, ignore_index=True)
    # Write the three summary columns as a standalone file.
    data_frame_concat.to_csv(output_file, index=False)
運行,結果如下:
11.計算每個文件中值的總和與平均值,寫入原文件(這裏先複製出一份和原來一樣的文件後寫入新文件)
這裏我們在上一個計算總關注數和平均關注數的基礎上,求一個每個樓盤的總關注數,這裏用到了groupby這個分組函數。求出來的結果插入到最下行
def read_sum_csv_2():
    """Append summary rows to a copy of each csv_python_3_* file.

    For every matching file, appends a grand-total 'foc' row, an
    average 'foc' row, and (via groupby) one total-'foc' row per
    distinct ``name``, then writes the result to a new
    csv_python_write_<suffix> file.
    """
    for source_path in glob.glob(os.path.join(os.getcwd(), 'csv_python_3_*')):
        suffix = os.path.basename(source_path).split('csv_python_3_')[1]
        target_path = 'csv_python_write_{}'.format(suffix)
        print(target_path)
        frame = pd.read_csv(source_path, index_col=None)
        # Grand-total row, transposed so it aligns with the six columns.
        totals_row = pd.DataFrame(
            ['關注數總計', '', '', '', '', frame['foc'].sum()]).T
        totals_row.columns = frame.columns
        combined = pd.concat([frame, totals_row], ignore_index=True)
        # Average row, appended below the total.
        average_row = pd.DataFrame(
            ['平均關注數', '', '', '', '', frame['foc'].mean()]).T
        average_row.columns = combined.columns
        combined = pd.concat([combined, average_row], ignore_index=True)
        # Per-community totals; these rows introduce an extra
        # 'community_total_foc' column in the concatenated result.
        per_name = frame.groupby('name')['foc'].sum().reset_index(
            name='community_total_foc')
        combined = pd.concat(
            [combined, pd.DataFrame(per_name)], ignore_index=True)
        combined.to_csv(target_path, index=False)
運行,結果如下:
三個文件,每個文件的格式都如上圖所示。