- 對缺失值比例過高的指標進行剔除
# Missing-value statistics for a single column: the missing ratio and,
# optionally, the number of missing records.
def cal_single_column_nullrate(df, colname, is_len=False):
    """Return the share of missing values in one column of ``df``.

    Args:
        df: DataFrame that holds the column.
        colname: Label of the column to inspect.
        is_len: When True, also return the number of missing entries.

    Returns:
        The missing ratio (missing count divided by ``df.shape[0]``), or the
        tuple ``(missing_count, missing_ratio)`` when ``is_len`` is True.
    """
    # Count once and reuse the result for both the count and the ratio.
    null_count = df[colname].isnull().sum()
    null_rate = null_count / df.shape[0]
    if is_len:
        return null_count, null_rate
    return null_rate
# Small sample frame used by the usage examples; both columns contain
# missing entries.
df = pd.DataFrame(
    data={
        "age": [10, None, 10, 30],
        "name": ["a", None, None, "d"],
    }
)
'''
example
print(cal_single_column_nullrate(df,'name'))
'''
# Missing-value statistics for every column: the missing ratios and,
# optionally, the numbers of missing records.
def cal_all_columns_nullrate(df, is_len=False):
    """Return the share of missing values for each column of ``df``.

    Args:
        df: DataFrame to inspect.
        is_len: When True, also return the per-column missing counts.

    Returns:
        A Series indexed by column label holding each column's missing ratio
        (missing count divided by ``df.shape[0]``), or the tuple
        ``(missing_counts, missing_ratios)`` when ``is_len`` is True.
    """
    # Count once and reuse the result for both the counts and the ratios.
    null_counts = df.isnull().sum()
    null_rates = null_counts / df.shape[0]
    if is_len:
        return null_counts, null_rates
    return null_rates
'''
example:
len1,df1=cal_all_columns_nullrate(df,True)
df1=cal_all_columns_nullrate(df)
'''
# Drop the columns of a DataFrame whose missing ratio reaches the threshold.
def filter_columns_by_nullrate(df, threshold):
    """Return ``df`` restricted to the columns with a low enough missing ratio.

    Args:
        df: DataFrame to filter.
        threshold: Bound on the per-column missing ratio. A column is kept
            only when its ratio is strictly below this value.

    Returns:
        A DataFrame holding the retained columns in their original order.
    """
    null_rates = cal_all_columns_nullrate(df)
    # Select by position instead of looking the surviving labels up again:
    # with repeated column labels a label lookup returns every same-named
    # column once per occurrence, duplicating columns in the result.
    keep_mask = (null_rates < threshold).to_numpy()
    return df.loc[:, keep_mask]
'''
print(filter_columns_by_nullrate(df,0.3))
'''
- 對指標中單一值佔比過高的指標進行剔除
# Value-frequency ratios for a single column (missing values are ignored).
def cal_single_column_repeatrate(df, colname, is_len=False):
    """Return how often each distinct value of one column occurs, as ratios.

    Missing values are excluded (``dropna=True``), so every ratio is relative
    to the number of non-null entries. ``value_counts`` orders its result by
    descending frequency, so position 0 holds the most frequent value's ratio.

    Args:
        df: DataFrame that holds the column.
        colname: Label of the column to inspect.
        is_len: When True, also return the value/count table.

    Returns:
        A Series of ratios with a fresh 0..n-1 index (the values themselves
        are dropped), or the tuple ``(value_freq, ratios)`` when ``is_len`` is
        True, where ``value_freq`` is the ``value_counts`` result turned into
        a two-column DataFrame (distinct value, count) by ``reset_index``.
    """
    counts = df[colname].value_counts(dropna=True)
    # Derive the ratios from the counts alone. Dividing the reset table would
    # also divide the value column, which fails for non-numeric values and is
    # meaningless for numeric ones.
    ratios = (counts / counts.sum()).reset_index(drop=True)
    if is_len:
        return counts.reset_index(drop=False), ratios
    return ratios
# Drop the columns of a DataFrame in which a single value is too dominant.
def filter_columns_by_repeatrate(df, threshold):
    """Return ``df`` without the columns dominated by one repeated value.

    Args:
        df: DataFrame to filter.
        threshold: Bound on the share of the most frequent non-null value.
            A column is kept only when that share is strictly below this
            value.

    Returns:
        A DataFrame holding the retained columns in their original order.
    """
    retain_columns = []
    for column in df.columns.values.tolist():
        ratios = cal_single_column_repeatrate(df, column)
        # A column with no non-null values produces no ratios, so there is no
        # dominant value to test; keep it rather than fail on the lookup.
        # Screening out mostly-missing columns is what
        # filter_columns_by_nullrate is for.
        if ratios.empty or ratios.iloc[0] < threshold:
            retain_columns.append(column)
    return df[retain_columns]