5-03異常值處理

#處理異常值
import numpy as np
import pandas as pd
# Sample frame used by every demo below: column "A" has a duplicated key,
# and columns "B", "C" and "D" each contain one missing value.
df = pd.DataFrame(
    {
        "A": ["a0", "a1", "a1", "a2", "a3", "a4"],
        "B": ["b0", "b1", "b2", "b2", "b3", None],
        "C": [1, 2, None, 3, 4, 5],
        "D": [0.1, 0.4, 0.4, 0.7, 8.3, None],
        "E": ["e1", "e2", "e3", "e4", "e5", "w2"],
    }
)
# Note: missing numbers are shown as NaN, missing strings as None.
print(df)

# --- Missing values ---------------------------------------------------------
print(df.isnull())  # True marks every missing cell
print(df.dropna())  # drop every row that has at least one missing value
print(df.dropna(subset=["B"]))  # drop only the rows where "B" is missing

# --- Duplicates -------------------------------------------------------------
print(df.duplicated(["A"]))  # row 2 repeats "a1", so it is flagged True
print(df.duplicated(["A", "B"]))  # no (A, B) pair repeats, so all False
# The default is keep="first": the first occurrence is kept and the later
# duplicates are dropped, so the two calls below print the same frame.
print(df.drop_duplicates(["A"]))
print(df.drop_duplicates(["A"], keep="first"))

# --- Filling missing values -------------------------------------------------
print(df.fillna("B*"))  # replace every missing cell with a marker string
# Fill "C" with its own mean. Passing the bare scalar to DataFrame.fillna
# would also write that number into the unrelated columns "B" and "D".
print(df.fillna({"C": df["C"].mean()}))
# Interpolation fills a gap from its neighbours: the missing "C" value that
# sits between 2 and 3 becomes 2.5.
print(df["C"].interpolate())
print(pd.Series([1, None, 3, 4, 5]).interpolate())

# --- Drop values outside the interquartile fences ---------------------------
upper_d = df["D"].quantile(0.75)
lower_d = df["D"].quantile(0.25)
d_int = upper_d - lower_d  # interquartile range of "D"
k = 1.5  # fence width as a multiple of the interquartile range
# Build ONE boolean mask with `&`. Chaining df[mask1][mask2] applies a mask
# indexed by the original frame to an already-filtered frame, which makes
# pandas emit "Boolean Series key will be reindexed to match DataFrame index".
# Rows where "D" is missing compare as False and are therefore dropped too.
within_fences = (df["D"] > lower_d - k * d_int) & (df["D"] < upper_d + k * d_int)
inliers = df[within_fences]
print(inliers)

# --- Filter rows by a condition ---------------------------------------------
# Vectorised string test; na=False treats a missing value as "no match"
# instead of failing on it.
starts_with_e = df["E"].str.startswith("e", na=False)
print(df[starts_with_e])
# .values exposes the underlying array (printed as a NumPy ndarray in the
# sample output); wrap it in list(...) when a plain Python list is needed.
print(df["E"].values)



D:\anaconda\python.exe G:/pycharm/untitled/lesson1/BigData/1.py
    A     B    C    D   E
0  a0    b0  1.0  0.1  e1
1  a1    b1  2.0  0.4  e2
2  a1    b2  NaN  0.4  e3
3  a2    b2  3.0  0.7  e4
4  a3    b3  4.0  8.3  e5
5  a4  None  5.0  NaN  w2
       A      B      C      D      E
0  False  False  False  False  False
1  False  False  False  False  False
2  False  False   True  False  False
3  False  False  False  False  False
4  False  False  False  False  False
5  False   True  False   True  False
    A   B    C    D   E
0  a0  b0  1.0  0.1  e1
1  a1  b1  2.0  0.4  e2
3  a2  b2  3.0  0.7  e4
4  a3  b3  4.0  8.3  e5
    A   B    C    D   E
0  a0  b0  1.0  0.1  e1
1  a1  b1  2.0  0.4  e2
2  a1  b2  NaN  0.4  e3
3  a2  b2  3.0  0.7  e4
4  a3  b3  4.0  8.3  e5
0    False
1    False
2     True
3    False
4    False
5    False
dtype: bool
0    False
1    False
2    False
3    False
4    False
5    False
dtype: bool
    A     B    C    D   E
0  a0    b0  1.0  0.1  e1
1  a1    b1  2.0  0.4  e2
3  a2    b2  3.0  0.7  e4
4  a3    b3  4.0  8.3  e5
5  a4  None  5.0  NaN  w2
    A     B    C    D   E
0  a0    b0  1.0  0.1  e1
1  a1    b1  2.0  0.4  e2
3  a2    b2  3.0  0.7  e4
4  a3    b3  4.0  8.3  e5
5  a4  None  5.0  NaN  w2
    A   B   C    D   E
0  a0  b0   1  0.1  e1
1  a1  b1   2  0.4  e2
2  a1  b2  B*  0.4  e3
3  a2  b2   3  0.7  e4
4  a3  b3   4  8.3  e5
5  a4  B*   5   B*  w2
    A   B    C    D   E
0  a0  b0  1.0  0.1  e1
1  a1  b1  2.0  0.4  e2
2  a1  b2  3.0  0.4  e3
3  a2  b2  3.0  0.7  e4
4  a3  b3  4.0  8.3  e5
5  a4   3  5.0  3.0  w2
0    1.0
1    2.0
2    2.5
3    3.0
4    4.0
5    5.0
Name: C, dtype: float64
0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
dtype: float64
G:/pycharm/untitled/lesson1/BigData/1.py:26: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  print(df[df["D"]>lower_d-k*d_int][df["D"]<upper_d+k*d_int])#報警了,報他個玩意,關我屁事
    A   B    C    D   E
0  a0  b0  1.0  0.1  e1
1  a1  b1  2.0  0.4  e2
2  a1  b2  NaN  0.4  e3
3  a2  b2  3.0  0.7  e4
    A   B    C    D   E
0  a0  b0  1.0  0.1  e1
1  a1  b1  2.0  0.4  e2
2  a1  b2  NaN  0.4  e3
3  a2  b2  3.0  0.7  e4
4  a3  b3  4.0  8.3  e5
['e1' 'e2' 'e3' 'e4' 'e5' 'w2']
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章