1. 重複值,輸入爲DataFrame,檢測是否有重複的行以及刪除重複的行並生成新的DataFrame
class Duplicated():
    """Detect and drop duplicated rows of a DataFrame.

    :param df: input DataFrame
    :param subset: ``None`` to work on every column, or a non-empty list of
        column names that all exist in ``df``. The frame is narrowed to these
        columns first, so every result only contains the selected columns.
    :param keep: ``'first'``, ``'last'`` or ``False``; forwarded to pandas.
        Validated when a method is called, not at construction time.
    :param inplace: forwarded to ``DataFrame.drop_duplicates``
    :raises Exception: if ``subset`` is neither ``None`` nor a valid list
    """

    # Values accepted for ``keep`` by find_duplicated / drop_duplicated.
    _VALID_KEEP = ('first', 'last', False)

    def __init__(self, df, subset=None, keep='first', inplace=False):
        self.subset = subset
        # Identity test on purpose: ``subset == None`` is evaluated
        # element-wise for array-likes (e.g. ``df.columns``) and would fail
        # with an unrelated "truth value is ambiguous" error.
        if subset is None:
            self.df = df
        elif (isinstance(subset, list) and len(subset) > 0
                and all(col in df.columns for col in subset)):
            self.df = df[subset]
        else:
            raise Exception("subset參數設置有誤,正確的格式爲 None 或者 '長度大於1的list,且list中的值包含在df的列中' ")
        self.keep = keep
        self.inplace = inplace

    def _check_keep(self):
        """Raise if ``self.keep`` is not one of the supported values."""
        if self.keep not in self._VALID_KEEP:
            raise Exception("keep參數超出了範圍")

    def find_duplicated(self):
        """Flag duplicated rows.

        :return: the boolean Series produced by ``DataFrame.duplicated``
        :raises Exception: if ``keep`` is not ``'first'``, ``'last'`` or ``False``
        """
        self._check_keep()
        return self.df.duplicated(keep=self.keep)

    def drop_duplicated(self):
        """Drop duplicated rows; ``keep='first'`` retains the first occurrence.

        :return: whatever ``DataFrame.drop_duplicates`` returns for the
            configured ``keep`` / ``inplace`` (pandas documents ``None`` when
            ``inplace=True``)
        :raises Exception: if ``keep`` is not ``'first'``, ``'last'`` or ``False``
        """
        self._check_keep()
        return self.df.drop_duplicates(keep=self.keep, inplace=self.inplace)
使用方式:
import pandas as pd

# Demo for Duplicated: rows 0 and 1 hold the same values in both columns.
sample_rows = {"a": [1, 1, 1], "b": [2, 2, 3]}
df = pd.DataFrame(sample_rows)
dd = Duplicated(df)
# Show the duplicate flags, the type of that result, then the de-duplicated frame.
print(dd.find_duplicated())
print(type(dd.find_duplicated()))
print(dd.drop_duplicated())
'''
0 False
1 True
2 False
dtype: bool
<class 'pandas.core.series.Series'>
a b
0 1 2
2 1 3
'''
2. 單一值,輸入爲DataFrame,檢測是否有單一值比例過高的列以及根據閾值剔除這些列並生成新的DataFrame
class Single():
    """Find and drop columns dominated by a single value."""

    def __init__(self, df, threshold=0.5, detect_columns=None, subset=None):
        """
        :param df: input DataFrame
        :param threshold: a column is flagged when the share of its most
            frequent value is strictly greater than this number
        :param detect_columns: ``None`` to check every column, or a non-empty
            list of column names present in the (subsetted) frame
        :param subset: ``None`` to keep every column, or a non-empty list of
            column names of ``df`` to narrow the frame to
        :raises Exception: if ``subset`` or ``detect_columns`` is malformed
        """
        self.threshold = threshold
        self.subset = subset
        # Flow: df -> df[subset] -> check detect_columns inside df[subset].
        # ``is None`` on purpose: ``== None`` is element-wise for array-likes.
        if subset is None:
            self.df = df
        elif (isinstance(subset, list) and len(subset) > 0
                and all(col in df.columns for col in subset)):
            self.df = df[subset]
        else:
            raise Exception("subset參數設置有誤,正確的格式爲 None 或者 '長度大於1的list,且list中的值包含在df的列中' ")
        if detect_columns is None:
            self.detect_columns = self.df.columns
        elif (isinstance(detect_columns, list) and len(detect_columns) > 0
                and all(col in self.df.columns for col in detect_columns)):
            self.detect_columns = detect_columns
        else:
            raise Exception("detect_columns參數填寫有誤,正確的格式爲 None 或者 '長度大於1的list,且list中的值包含在df的列中' ")

    def get_single_columns(self):
        """Return the checked columns whose top value exceeds ``threshold``.

        The share is ``count of the most frequent value / number of rows``.
        ``value_counts`` ignores missing values, so a column with no
        non-missing value has no counts at all and is skipped rather than
        crashing on an empty maximum.

        :return: list of column names
        """
        n_rows = len(self.df)
        result = []
        for col in self.detect_columns:
            counts = self.df[col].value_counts()  # frequency of each value
            if counts.empty:
                continue
            if counts.max() / n_rows > self.threshold:
                result.append(col)
        return result

    def drop_single(self):
        """Return a new frame without the flagged columns.

        ``self.df`` itself is left unchanged.

        :return: DataFrame
        """
        exclude_columns = self.get_single_columns()
        return self.df.drop(exclude_columns, axis=1)  # axis=1 drops columns
使用方式:
# Demo for Single: value 1 fills 8 of the 12 rows of column "a".
sample_values = {
    "a": [1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4],
    "b": [1, 2, 34, 5, 6, 7, 8, 9, 12, 5, 7, 8],
}
df = pd.DataFrame(sample_values)
single = Single(df, threshold=0.5)
# Rebind df to the frame with the dominated columns removed.
df = single.drop_single()
print(single.get_single_columns())
print(df.shape)
'''
['a']
(12, 1)
'''
3.異常值,輸入爲DataFrame,檢測某列中是否存在異常值以及刪除含有異常值的行並生成新的DataFrame
class Outlier():
    """Detect outliers per column and drop the rows that contain them."""

    def __init__(self, df, method=None, detect_columns=None, subset=None):
        """
        :param df: input DataFrame
        :param method: outlier rule, ``"exper"`` (IQR fences) or ``"3sigma"``;
            checked when ``drop_outlier`` / ``get_outlier_details`` is called
        :param detect_columns: ``None`` to check every column, or a non-empty
            list of column names present in the (subsetted) frame
        :param subset: ``None`` to keep every column, or a non-empty list of
            column names of ``df`` to narrow the frame to
        :raises Exception: if ``subset`` or ``detect_columns`` is malformed
        """
        self.method = method
        self.subset = subset
        # Flow: df -> df[subset] -> check detect_columns inside df[subset].
        # ``is None`` on purpose: ``== None`` is element-wise for array-likes.
        if subset is None:
            self.df = df
        elif (isinstance(subset, list) and len(subset) > 0
                and all(col in df.columns for col in subset)):
            self.df = df[subset]
        else:
            raise Exception("subset參數設置有誤,正確的格式爲 None 或者 '長度大於1的list,且list中的值包含在df的列中' ")
        if detect_columns is None:
            self.detect_columns = self.df.columns
        elif (isinstance(detect_columns, list) and len(detect_columns) > 0
                and all(col in self.df.columns for col in detect_columns)):
            self.detect_columns = detect_columns
        else:
            raise Exception("detect_columns參數填寫有誤,正確的格式爲 None 或者 '長度大於1的list,且list中的值包含在df的列中' ")

    def get_exper_bound(self, series):
        """Rule-of-thumb fences ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.

        :param series: column to analyse
        :return: ``(low_bound, upper_bound)``
        """
        quartiles = series.quantile([0.25, 0.75])
        iqr = quartiles[0.75] - quartiles[0.25]
        low_bound = quartiles[0.25] - 1.5 * iqr
        upper_bound = quartiles[0.75] + 1.5 * iqr
        return low_bound, upper_bound

    def get_3sigma_bound(self, series):
        """Fences ``[mean - 3*std, mean + 3*std]``.

        :param series: column to analyse
        :return: ``(low_bound, upper_bound)``
        """
        mean = series.mean()
        std = series.std()
        return mean - 3 * std, mean + 3 * std

    def _get_bound_func(self):
        """Return the bound function selected by ``self.method``.

        Raising here keeps an unset or misspelled method from silently
        producing ``None`` in the public methods.
        """
        if self.method == "exper":
            return self.get_exper_bound
        if self.method == "3sigma":
            return self.get_3sigma_bound
        raise Exception("method參數設置有誤,正確的取值爲 'exper' 或者 '3sigma'")

    def drop_outlier(self):
        """Return a new frame without the rows that hold an outlier.

        Bounds are always computed on the unfiltered column of ``self.df``;
        ``self.df`` itself is left unchanged. Rows whose value is missing in
        a checked column fail both comparisons and are removed as well.

        :return: DataFrame
        :raises Exception: if ``method`` is not ``"exper"`` or ``"3sigma"``
        """
        get_bound = self._get_bound_func()
        tempdf = self.df
        for col in self.detect_columns:
            low_bound, upper_bound = get_bound(self.df[col])
            inside = (tempdf[col] <= upper_bound) & (tempdf[col] >= low_bound)
            tempdf = tempdf[inside]
        return tempdf

    def get_outlier_details(self):
        """Collect the outlying values of every checked column.

        :return: dict mapping ``str(column)`` to
            ``{"lower": values below the low bound,
               "upper": values above the upper bound}``; columns without any
            outlier are omitted
        :raises Exception: if ``method`` is not ``"exper"`` or ``"3sigma"``
        """
        get_bound = self._get_bound_func()
        result = {}
        for col in self.detect_columns:
            series = self.df[col]
            low_bound, upper_bound = get_bound(series)
            upper_bound_values = series[series > upper_bound].values
            lower_bound_values = series[series < low_bound].values
            if len(lower_bound_values) > 0 or len(upper_bound_values) > 0:
                result[str(col)] = {"lower": lower_bound_values, "upper": upper_bound_values}
        return result
使用方式:
# Demo for Outlier: the last row carries an extreme value in both columns.
sample_columns = {
    "a": [1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 100],
    "b": [1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 200],
}
data = pd.DataFrame(sample_columns)
outlier = Outlier(data, method="exper", detect_columns=['a', 'b'])
# Print the frame that remains after removing rows outside the IQR fences.
res = outlier.drop_outlier()
print(res)
4.缺失值填補,輸入爲DataFrame,採用knn,均值,中位數,衆數,指定值,前值,後值,線性插補的方式進行填充,並生成新的DataFrame
import pandas as pd
from fancyimpute import KNN
from sklearn.impute import SimpleImputer
import numpy as np
from ML.TOOLS import tools
def get_fill_methods():
    """Return a fresh list with the names of the available fill methods."""
    return [
        'knn',
        'mean',
        'median',
        'most_frequent',
        'fixvalue',
        'ffill',
        'bfill',
        'linear',
    ]
class FillMissingValue():
    """Fill the missing values of a DataFrame with a selectable strategy."""

    def __init__(self, df, method='mode', k=3, toDF=True, missingvalue=np.nan, fixvalue='-999', axis=0, subset=None):
        """
        :param df: input DataFrame
        :param method: one of ``'knn'``, ``'mean'``, ``'median'``,
            ``'most_frequent'`` (``'mode'`` is accepted as an alias),
            ``'fixvalue'``, ``'ffill'``, ``'bfill'``, ``'linear'``;
            checked when ``fill_missing_value`` is called
        :param k: ``k`` passed to the KNN imputer
        :param toDF: stored on the instance; not read by the methods of this class
        :param missingvalue: placeholder treated as missing by ``SimpleImputer``
        :param fixvalue: replacement used by the ``'fixvalue'`` method
        :param axis: axis forwarded to the pandas fill calls
        :param subset: ``None`` to keep every column, or a non-empty list of
            column names of ``df`` to narrow the frame to
        :raises Exception: if ``subset`` is malformed
        """
        self.subset = subset
        # ``is None`` on purpose: ``== None`` is element-wise for array-likes.
        if subset is None:
            self.df = df
        elif (isinstance(subset, list) and len(subset) > 0
                and all(col in df.columns for col in subset)):
            self.df = df[subset]
        else:
            raise Exception("subset參數設置有誤,正確的格式爲 None 或者 '長度大於1的list,且list中的值包含在df的列中' ")
        self.method = method
        self.k = k
        self.all_columns = self.df.columns
        self.toDF = toDF
        self.missingvalue = missingvalue
        self.fixvalue = fixvalue
        self.axis = axis

    def get_missing_dist(self):
        """Summarise missing values per column.

        :return: DataFrame indexed by column name with ``missing_count``
            (number of nulls) and ``missing_rate`` (nulls / number of rows)
        """
        missing_count = self.df.isnull().sum()
        return pd.DataFrame({
            "missing_count": missing_count,
            "missing_rate": missing_count / self.df.shape[0],
        })

    def fill_missing_value(self):
        """Fill missing values using ``self.method``.

        :return: the filled data; pandas-based methods return a new DataFrame,
            the imputer-based ones return what ``tools.array_2df`` builds from
            the imputed array and the original column names
        :raises Exception: if ``self.method`` is not a supported method
        """
        tempdf = self.df
        method = self.method
        if method == 'knn':
            result = KNN(k=self.k).fit_transform(tempdf)
            # NOTE(review): tools.array_2df is a project helper; presumably it
            # wraps the array in a DataFrame with these columns - confirm.
            return tools.array_2df(result, self.all_columns)
        if method in ('mean', 'median', 'most_frequent', 'mode'):
            # 'mode' is the constructor default; SimpleImputer names that
            # strategy 'most_frequent'.
            strategy = 'most_frequent' if method == 'mode' else method
            imputer = SimpleImputer(missing_values=self.missingvalue, strategy=strategy)
            result = imputer.fit_transform(tempdf)
            return tools.array_2df(result, self.all_columns)
        if method == 'fixvalue':
            return tempdf.fillna(self.fixvalue, axis=self.axis)
        # With axis=0, ffill cannot fill a gap in the first row and bfill
        # cannot fill one in the last row: there is no neighbour to copy.
        # DataFrame.ffill/bfill replace the deprecated fillna(method=...).
        if method == 'ffill':
            return tempdf.ffill(axis=self.axis)
        if method == 'bfill':
            return tempdf.bfill(axis=self.axis)
        if method == 'linear':
            return tempdf.interpolate(method=method)
        raise Exception("不在缺失值填補的方法裏面")
使用方式:
# Demo for FillMissingValue: fill the single gap in column "b" with every method.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = pd.DataFrame({"a": [1, 2, 4], "b": [2, 3, np.nan]})
for ele in get_fill_methods():
    df1 = FillMissingValue(df, method=ele).fill_missing_value()
    print(ele)
    print(df1)
'''
knn
a b
0 1.0 2.000000
1 2.0 3.000000
2 4.0 2.692308
mean
a b
0 1.0 2.0
1 2.0 3.0
2 4.0 2.5
median
a b
0 1.0 2.0
1 2.0 3.0
2 4.0 2.5
most_frequent
a b
0 1.0 2.0
1 2.0 3.0
2 4.0 2.0
fixvalue
a b
0 1 2
1 2 3
2 4 -999
ffill
a b
0 1 2.0
1 2 3.0
2 4 3.0
bfill
a b
0 1 2.0
1 2 3.0
2 4 NaN
linear
a b
0 1 2.0
1 2 3.0
2 4 3.0
'''