Python中異常值,單一值,重複值,缺失值的處理

  1. 重複值,輸入爲DataFrame,檢測是否有重複的行以及刪除重複的行並生成新的DataFrame

class Duplicated():
    '''
    Detect and drop duplicated rows of a DataFrame.

    :param df: input DataFrame
    :param subset: None to work on every column, or a non-empty list of
        column names of df; when given, only df[subset] is kept and compared
    :param keep: 'first', 'last' or False, forwarded to pandas
    :param inplace: forwarded to DataFrame.drop_duplicates
    :raises ValueError: if subset is neither None nor a non-empty list of
        existing column names
    '''

    # Values accepted for ``keep`` before delegating to pandas.
    _KEEP_CHOICES = ('first', 'last', False)

    def __init__(self,df,subset=None,keep='first',inplace=False):

        self.subset=subset

        # ``is None`` rather than ``== None``: with an array-like subset
        # (e.g. df.columns) ``==`` is elementwise and its truth value is
        # ambiguous, which used to mask the intended error message below.
        if subset is None:
            self.df = df
        elif isinstance(subset, list) and len(subset) > 0 and all(col in df.columns for col in subset):
            self.df = df[subset]
        else:
            raise ValueError("subset參數設置有誤,正確的格式爲 None 或者 '長度大於1的list,且list中的值包含在df的列中' ")

        self.keep=keep
        self.inplace=inplace

    def _check_keep(self):
        # Shared guard for both public methods; keep is read at call time so
        # that changing self.keep after construction is still validated.
        if self.keep not in self._KEEP_CHOICES:
            raise ValueError("keep參數超出了範圍")

    def find_duplicated(self):
        '''
        Flag the rows of self.df that are duplicates.

        :return: boolean Series produced by DataFrame.duplicated
        :raises ValueError: if keep is not 'first', 'last' or False
        '''
        self._check_keep()
        return self.df.duplicated(keep=self.keep)

    def drop_duplicated(self):
        '''
        Drop duplicated rows; by default (keep='first') the first occurrence
        of each duplicated row is retained.

        :return: whatever DataFrame.drop_duplicates returns for the
            configured keep / inplace (with inplace=True pandas modifies
            self.df and returns None)
        :raises ValueError: if keep is not 'first', 'last' or False
        '''
        self._check_keep()
        return self.df.drop_duplicates(keep=self.keep,inplace=self.inplace)

 使用方式:

import pandas as pd
# Demo: flag and drop duplicated rows of a small frame.
df = pd.DataFrame({"a": [1, 1, 1], "b": [2, 2, 3]})
dd = Duplicated(df)

duplicate_mask = dd.find_duplicated()
print(duplicate_mask)

mask_type = type(dd.find_duplicated())
print(mask_type)

deduplicated = dd.drop_duplicated()
print(deduplicated)


'''

0    False
1     True
2    False
dtype: bool
<class 'pandas.core.series.Series'>
   a  b
0  1  2
2  1  3


'''

2. 單一值,輸入爲DataFrame,檢測是否有單一值比例過高的列以及根據閾值剔除這些列並生成新的DataFrame

class Single():
    '''
    Find and drop columns in which one value dominates.

    Processing order: df -> df[subset] -> inspect detect_columns inside it.
    '''

    def __init__(self,df,threshold=0.5,detect_columns=None,subset=None):
        '''
        :param df: input data as a DataFrame
        :param threshold: a column is flagged when the share of its most
            frequent value is strictly greater than this threshold
        :param detect_columns: None to inspect every column of the (subset)
            frame, or a non-empty list of its column names
        :param subset: None to use df as is, or a non-empty list of column
            names of df to restrict the frame to
        :raises ValueError: if subset or detect_columns is malformed
        '''
        self.threshold = threshold
        self.subset=subset

        # ``is None`` rather than ``== None`` so that array-like arguments
        # reach the explicit error below instead of an ambiguous-truth error.
        if subset is None:
            self.df = df
        elif isinstance(subset, list) and len(subset) > 0 and all(col in df.columns for col in subset):
            self.df = df[subset]
        else:
            raise ValueError("subset參數設置有誤,正確的格式爲 None 或者 '長度大於1的list,且list中的值包含在df的列中' ")

        if detect_columns is None:
            self.detect_columns = self.df.columns
        elif isinstance(detect_columns, list) and len(detect_columns) > 0 and all(col in self.df.columns for col in detect_columns):
            self.detect_columns = detect_columns
        else:
            raise ValueError("detect_columns參數填寫有誤,正確的格式爲 None 或者 '長度大於1的list,且list中的值包含在df的列中' ")

    def get_single_columns(self):
        '''
        Collect the columns of detect_columns whose most frequent value
        covers more than threshold of the rows.

        The share is computed against the total number of rows, so missing
        values count in the denominator. Columns without any non-missing
        value (and empty frames) are skipped instead of raising.

        :return: list of column names
        '''
        row_count = len(self.df)
        if row_count == 0:
            return []

        result = []
        for col in self.detect_columns:
            counts = self.df[col].value_counts()  # occurrences of each value
            if counts.empty:
                # value_counts ignores NaN, so an all-NaN column has no mode.
                continue
            if counts.max() / row_count > self.threshold:
                result.append(col)
        return result

    def drop_single(self):
        '''
        Drop the columns reported by get_single_columns.

        :return: new DataFrame; self.df is left unchanged
        '''
        exclude_columns = self.get_single_columns()
        return self.df.drop(exclude_columns, axis=1)  # axis=1 drops columns

使用方式:

# Demo: column "a" is dominated by the value 1, so it is reported and dropped.
df = pd.DataFrame({
    "a": [1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4],
    "b": [1, 2, 34, 5, 6, 7, 8, 9, 12, 5, 7, 8],
})

single = Single(df, threshold=0.5)

# Rebind df to the reduced frame; the Single instance keeps the original.
df = single.drop_single()

dominated_columns = single.get_single_columns()
print(dominated_columns)
print(df.shape)

'''
['a']
(12, 1)

'''

 

3.異常值,輸入爲DataFrame,檢測某列中是否存在異常值以及刪除含有異常值的行並生成新的DataFrame

class Outlier():
    '''
    Detect and remove rows holding outliers in selected columns.

    Processing order: df -> df[subset] -> inspect detect_columns inside it.
    '''

    def __init__(self,df,method=None,detect_columns=None,subset=None):
        '''
        :param df: input data as a DataFrame
        :param method: outlier rule, "exper" (quartiles +/- 1.5*IQR) or
            "3sigma" (mean +/- 3*std); it is validated when drop_outlier or
            get_outlier_details is called, not here
        :param detect_columns: None to inspect every column of the (subset)
            frame, or a non-empty list of its column names
        :param subset: None to use df as is, or a non-empty list of column
            names of df to restrict the frame to
        :raises ValueError: if subset or detect_columns is malformed
        '''
        self.method = method
        self.subset=subset

        # ``is None`` rather than ``== None`` so that array-like arguments
        # reach the explicit error below instead of an ambiguous-truth error.
        if subset is None:
            self.df = df
        elif isinstance(subset, list) and len(subset) > 0 and all(col in df.columns for col in subset):
            self.df = df[subset]
        else:
            raise ValueError("subset參數設置有誤,正確的格式爲 None 或者 '長度大於1的list,且list中的值包含在df的列中' ")

        if detect_columns is None:
            self.detect_columns = self.df.columns
        elif isinstance(detect_columns, list) and len(detect_columns) > 0 and all(col in self.df.columns for col in detect_columns):
            self.detect_columns = detect_columns
        else:
            raise ValueError("detect_columns參數填寫有誤,正確的格式爲 None 或者 '長度大於1的list,且list中的值包含在df的列中' ")

    def get_exper_bound(self,series):
        '''
        Rule-of-thumb bounds: [Q1 - 1.5*IQR, Q3 + 1.5*IQR].

        :param series: column to analyse
        :return: (low_bound, upper_bound)
        '''
        quantiles = series.quantile([0.25, 0.75])
        first_quartile = quantiles[0.25]
        third_quartile = quantiles[0.75]
        iqr = third_quartile - first_quartile
        return first_quartile - 1.5 * iqr, third_quartile + 1.5 * iqr

    def get_3sigma_bound(self,series):
        '''
        Three-sigma bounds: [mean - 3*std, mean + 3*std].

        :param series: column to analyse
        :return: (low_bound, upper_bound)
        '''
        mean = series.mean()
        std = series.std()
        return mean - 3 * std, mean + 3 * std

    def _select_bound_function(self):
        # Map the configured method name to its bound calculator; an unknown
        # method used to make the public methods silently return None.
        bound_functions = {
            "exper": self.get_exper_bound,
            "3sigma": self.get_3sigma_bound,
        }
        try:
            return bound_functions[self.method]
        except (KeyError, TypeError):
            raise ValueError("method參數填寫有誤,正確的取值爲 'exper' 或者 '3sigma'")

    def drop_outlier(self):
        '''
        Remove every row that has an outlier in one of detect_columns.

        Bounds are always computed on the unfiltered column of self.df, so
        the result does not depend on the order of the columns. Rows whose
        value is NaN in a checked column fail both comparisons and are
        removed as well.

        :return: new filtered DataFrame; self.df is left unchanged
        :raises ValueError: if method is not "exper" or "3sigma"
        '''
        compute_bounds = self._select_bound_function()

        tempdf = self.df
        for col in self.detect_columns:
            low_bound, upper_bound = compute_bounds(self.df[col])
            tempdf = tempdf[tempdf[col] <= upper_bound]
            tempdf = tempdf[tempdf[col] >= low_bound]
        return tempdf

    def get_outlier_details(self):
        '''
        Report the outlying values of each column.

        :return: dict keyed by str(column); each value is
            {"lower": array of values below the low bound,
             "upper": array of values above the upper bound}.
            Columns without outliers are omitted.
        :raises ValueError: if method is not "exper" or "3sigma"
        '''
        compute_bounds = self._select_bound_function()

        result = {}
        for col in self.detect_columns:
            series = self.df[col]
            low_bound, upper_bound = compute_bounds(series)
            upper_bound_values = series[series > upper_bound].values
            lower_bound_values = series[series < low_bound].values
            if len(lower_bound_values) > 0 or len(upper_bound_values) > 0:
                result[str(col)] = {"lower": lower_bound_values, "upper": upper_bound_values}
        return result

使用方式:

# Demo: the last row (100 / 200) lies outside the IQR-based bounds.
data = pd.DataFrame({
    "a": [1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 100],
    "b": [1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 200],
})

outlier = Outlier(data, method="exper", detect_columns=['a', 'b'])

res = outlier.drop_outlier()
print(res)

4.缺失值填補,輸入爲DataFrame,採用knn,均值,中位數,衆數,指定值,前值,後值,線性插補的方式進行填充,並生成新的DataFrame

 

import pandas as pd
from fancyimpute import KNN
from sklearn.impute import SimpleImputer
import numpy as np
from ML.TOOLS import tools


def get_fill_methods():
    '''
    List the names of the available fill methods.

    :return: a new list of method-name strings on every call
    '''
    return [
        'knn',
        'mean',
        'median',
        'most_frequent',
        'fixvalue',
        'ffill',
        'bfill',
        'linear',
    ]


class FillMissingValue():
    '''
    Fill the missing values of a DataFrame with one of several strategies.

    Supported methods: 'knn', 'mean', 'median', 'most_frequent' (alias
    'mode'), 'fixvalue', 'ffill', 'bfill' and 'linear'.
    '''

    def __init__(self,df,method='mode',k=3,toDF=True,missingvalue=np.nan,fixvalue='-999',axis=0,subset=None):
        '''
        :param df: input data as a DataFrame
        :param method: fill strategy; the default 'mode' is treated as
            'most_frequent'
        :param k: value passed as k to the KNN imputer
        :param toDF: stored on the instance; not read by this class
        :param missingvalue: placeholder passed to SimpleImputer as
            missing_values
        :param fixvalue: replacement used by the 'fixvalue' method
        :param axis: axis forwarded to fillna / ffill / bfill
        :param subset: None to use df as is, or a non-empty list of column
            names of df to restrict the frame to
        :raises ValueError: if subset is malformed
        '''
        self.subset=subset

        # ``is None`` rather than ``== None`` so that array-like arguments
        # reach the explicit error below instead of an ambiguous-truth error.
        if subset is None:
            self.df = df
        elif isinstance(subset, list) and len(subset) > 0 and all(col in df.columns for col in subset):
            self.df = df[subset]
        else:
            raise ValueError("subset參數設置有誤,正確的格式爲 None 或者 '長度大於1的list,且list中的值包含在df的列中' ")

        self.method=method
        self.k=k
        self.all_columns=self.df.columns
        self.toDF=toDF
        self.missingvalue=missingvalue
        self.fixvalue=fixvalue
        self.axis=axis

    def get_missing_dist(self):
        '''
        Summarise the missing values per column.

        :return: DataFrame indexed by column name with the columns
            "missing_count" and "missing_rate" (count / number of rows)
        '''
        missing_count = self.df.isnull().sum()
        return pd.DataFrame({
            "missing_count": missing_count,
            "missing_rate": missing_count / self.df.shape[0],
        })

    def fill_missing_value(self):
        '''
        Fill the missing values of self.df with the configured method.

        :return: the filled data; self.df is not modified by the pandas
            based methods ('fixvalue', 'ffill', 'bfill', 'linear')
        :raises ValueError: if method is not one of the supported names
        '''
        tempdf = self.df

        # The constructor default 'mode' used to fall through to the error
        # branch; the mode is what the 'most_frequent' strategy computes.
        method = 'most_frequent' if self.method == 'mode' else self.method

        if method=='knn':
            knn = KNN(k=self.k)
            result = knn.fit_transform(tempdf)
            return tools.array_2df(result,self.all_columns)

        elif method in ['mean','median','most_frequent']:
            imputer = SimpleImputer(missing_values=self.missingvalue, strategy=method)
            result=imputer.fit_transform(tempdf)
            return tools.array_2df(result,self.all_columns)

        elif method=='fixvalue':
            return tempdf.fillna(self.fixvalue,axis=self.axis)

        # A gap in the first row cannot be filled by 'ffill' and a gap in the
        # last row cannot be filled by 'bfill'; such values stay missing.
        # ffill()/bfill() replace the deprecated fillna(method=...) form.
        elif method=='ffill':
            return tempdf.ffill(axis=self.axis)

        elif method=='bfill':
            return tempdf.bfill(axis=self.axis)

        elif method == 'linear':
            return tempdf.interpolate(method='linear')

        else:
            raise ValueError("不在缺失值填補的方法裏面")

使用方式:

# Demo: run every fill method on a frame with one missing value.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df=pd.DataFrame({"a":[1,2,4],"b":[2, 3, np.nan]})

for ele in get_fill_methods():
    df1=FillMissingValue(df,method=ele).fill_missing_value()
    print(ele)
    print(df1)


'''

knn
     a         b
0  1.0  2.000000
1  2.0  3.000000
2  4.0  2.692308
mean
     a    b
0  1.0  2.0
1  2.0  3.0
2  4.0  2.5
median
     a    b
0  1.0  2.0
1  2.0  3.0
2  4.0  2.5
most_frequent
     a    b
0  1.0  2.0
1  2.0  3.0
2  4.0  2.0
fixvalue
   a     b
0  1     2
1  2     3
2  4  -999
ffill
   a    b
0  1  2.0
1  2  3.0
2  4  3.0
bfill
   a    b
0  1  2.0
1  2  3.0
2  4  NaN
linear
   a    b
0  1  2.0
1  2  3.0
2  4  3.0


'''

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章