#處理異常值
import numpy as np
import pandas as pd
# Demo frame: column A holds a repeated key ("a1"), B/C/D each contain one
# missing value, and D contains one obvious outlier (8.3).
df = pd.DataFrame(
    {
        "A": ["a0", "a1", "a1", "a2", "a3", "a4"],
        "B": ["b0", "b1", "b2", "b2", "b3", None],
        "C": [1, 2, None, 3, 4, 5],
        "D": [0.1, 0.4, 0.4, 0.7, 8.3, None],
        "E": ["e1", "e2", "e3", "e4", "e5", "w2"],
    }
)
print(df)  # a missing number is stored as NaN; the missing string was given as None

# Missing values >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
print(df.isnull())  # True marks every missing cell
print(df.dropna())  # drop every row that has a missing value in any column
print(df.dropna(subset=["B"]))  # drop only rows where B is missing; other gaps stay

# Duplicates >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
print(df.duplicated(["A"]))  # the second "a1" (row 2) is flagged True
print(df.duplicated(["A", "B"]))  # A and B together form the key, so no row repeats
# keep="first" is already the default: the first occurrence is kept and the
# later repeats are dropped, so the next two calls print the same frame.
print(df.drop_duplicates(["A"]))
print(df.drop_duplicates(["A"], keep="first"))

# Filling missing values >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
print(df.fillna("B*"))  # one scalar replaces every missing cell in every column
# NOTE: a scalar fill is not limited to column C -- the mean of C is also
# written into the gaps of B and D.  Pass a dict such as {"C": value} to
# fillna() when only one column should be filled.
print(df.fillna(df["C"].mean()))
# interpolate() fills a gap from its neighbouring values (linear by default):
# the NaN between 2.0 and 3.0 in C becomes 2.5.
print(df["C"].interpolate())
print(pd.Series([1, None, 3, 4, 5]).interpolate())

# Dropping values outside the IQR fences >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
upper_d = df["D"].quantile(0.75)
lower_d = df["D"].quantile(0.25)
d_int = upper_d - lower_d  # inter-quartile range of D
k = 1.5  # fence width as a multiple of the inter-quartile range
# Build ONE boolean mask with `&` rather than chaining df[mask1][mask2].
# In the chained form the second mask is computed on the full frame but
# applied to the already-filtered one, so pandas has to reindex it and emits
# "UserWarning: Boolean Series key will be reindexed to match DataFrame index".
# Comparisons against NaN are False, so the row with a missing D is dropped too.
d_in_range = (df["D"] > lower_d - k * d_int) & (df["D"] < upper_d + k * d_int)
print(df[d_in_range])

# Filtering rows by a condition >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# The vectorised .str accessor produces the boolean mask directly; no Python
# level loop over the values is needed.
starts_with_e = df["E"].str.startswith("e")
print(df[starts_with_e])
print(df["E"].values)  # .values is a NumPy ndarray; wrap it in list() for a plain list
D:\anaconda\python.exe G:/pycharm/untitled/lesson1/BigData/1.py
A B C D E
0 a0 b0 1.0 0.1 e1
1 a1 b1 2.0 0.4 e2
2 a1 b2 NaN 0.4 e3
3 a2 b2 3.0 0.7 e4
4 a3 b3 4.0 8.3 e5
5 a4 None 5.0 NaN w2
A B C D E
0 False False False False False
1 False False False False False
2 False False True False False
3 False False False False False
4 False False False False False
5 False True False True False
A B C D E
0 a0 b0 1.0 0.1 e1
1 a1 b1 2.0 0.4 e2
3 a2 b2 3.0 0.7 e4
4 a3 b3 4.0 8.3 e5
A B C D E
0 a0 b0 1.0 0.1 e1
1 a1 b1 2.0 0.4 e2
2 a1 b2 NaN 0.4 e3
3 a2 b2 3.0 0.7 e4
4 a3 b3 4.0 8.3 e5
0 False
1 False
2 True
3 False
4 False
5 False
dtype: bool
0 False
1 False
2 False
3 False
4 False
5 False
dtype: bool
A B C D E
0 a0 b0 1.0 0.1 e1
1 a1 b1 2.0 0.4 e2
3 a2 b2 3.0 0.7 e4
4 a3 b3 4.0 8.3 e5
5 a4 None 5.0 NaN w2
A B C D E
0 a0 b0 1.0 0.1 e1
1 a1 b1 2.0 0.4 e2
3 a2 b2 3.0 0.7 e4
4 a3 b3 4.0 8.3 e5
5 a4 None 5.0 NaN w2
A B C D E
0 a0 b0 1 0.1 e1
1 a1 b1 2 0.4 e2
2 a1 b2 B* 0.4 e3
3 a2 b2 3 0.7 e4
4 a3 b3 4 8.3 e5
5 a4 B* 5 B* w2
A B C D E
0 a0 b0 1.0 0.1 e1
1 a1 b1 2.0 0.4 e2
2 a1 b2 3.0 0.4 e3
3 a2 b2 3.0 0.7 e4
4 a3 b3 4.0 8.3 e5
5 a4 3 5.0 3.0 w2
0 1.0
1 2.0
2 2.5
3 3.0
4 4.0
5 5.0
Name: C, dtype: float64
0 1.0
1 2.0
2 3.0
3 4.0
4 5.0
dtype: float64
G:/pycharm/untitled/lesson1/BigData/1.py:26: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
print(df[df["D"]>lower_d-k*d_int][df["D"]<upper_d+k*d_int])#報警了,報他個玩意,關我屁事
A B C D E
0 a0 b0 1.0 0.1 e1
1 a1 b1 2.0 0.4 e2
2 a1 b2 NaN 0.4 e3
3 a2 b2 3.0 0.7 e4
A B C D E
0 a0 b0 1.0 0.1 e1
1 a1 b1 2.0 0.4 e2
2 a1 b2 NaN 0.4 e3
3 a2 b2 3.0 0.7 e4
4 a3 b3 4.0 8.3 e5
['e1' 'e2' 'e3' 'e4' 'e5' 'w2']
5-03異常值處理
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.