#处理异常值
import numpy as np
import pandas as pd
# Demo frame with deliberately dirty data: a repeated key in A, missing cells
# in B/C/D, an extreme value in D (8.3) and one E entry that does not start
# with "e". Missing numbers are given as None and stored as NaN.
df = pd.DataFrame(
    {
        "A": ["a0", "a1", "a1", "a2", "a3", "a4"],
        "B": ["b0", "b1", "b2", "b2", "b3", None],
        "C": [1, 2, None, 3, 4, 5],
        "D": [0.1, 0.4, 0.4, 0.7, 8.3, None],
        "E": ["e1", "e2", "e3", "e4", "e5", "w2"],
    }
)
print(df)

# --- Missing values -------------------------------------------------------
print(df.isnull())  # True marks a missing cell
print(df.dropna())  # drop every row that has at least one missing cell
print(df.dropna(subset=["B"]))  # drop only the rows where B is missing

# --- Duplicates -----------------------------------------------------------
print(df.duplicated(["A"]))  # the second "a1" (row 2) is flagged True
print(df.duplicated(["A", "B"]))  # every (A, B) pair is unique -> all False
# The default is keep="first": the first occurrence is kept and the later
# repeats are dropped, so the two calls below print the same frame.
print(df.drop_duplicates(["A"]))
print(df.drop_duplicates(["A"], keep="first"))

# --- Filling missing values -----------------------------------------------
print(df.fillna("B*"))  # one scalar fills the missing cells of every column
# Fill each numeric column with its own mean. Passing the scalar
# df["C"].mean() would also write C's mean into B (strings) and D.
mean_filled = df.fillna(df.mean(numeric_only=True))
print(mean_filled)
# Linear interpolation between the neighbouring valid values.
print(df["C"].interpolate())
print(pd.Series([1, None, 3, 4, 5]).interpolate())

# --- Drop values outside the IQR fences -----------------------------------
upper_d = df["D"].quantile(0.75)
lower_d = df["D"].quantile(0.25)
d_int = upper_d - lower_d  # interquartile range of D
k = 1.5  # fence width as a multiple of the IQR
# Combine both bounds into one mask. Chaining df[mask1][mask2] applies a
# full-length mask to an already filtered frame and raises the
# "Boolean Series key will be reindexed" UserWarning.
inliers = df[(df["D"] > lower_d - k * d_int) & (df["D"] < upper_d + k * d_int)]
print(inliers)

# --- Filter rows by a condition -------------------------------------------
# Vectorized prefix test; na=False treats a missing value as "no match".
starts_with_e = df[df["E"].str.startswith("e", na=False)]
print(starts_with_e)
# .values returns an array, not a Python list; use .tolist() for a list.
print(df["E"].values)
D:\anaconda\python.exe G:/pycharm/untitled/lesson1/BigData/1.py
A B C D E
0 a0 b0 1.0 0.1 e1
1 a1 b1 2.0 0.4 e2
2 a1 b2 NaN 0.4 e3
3 a2 b2 3.0 0.7 e4
4 a3 b3 4.0 8.3 e5
5 a4 None 5.0 NaN w2
A B C D E
0 False False False False False
1 False False False False False
2 False False True False False
3 False False False False False
4 False False False False False
5 False True False True False
A B C D E
0 a0 b0 1.0 0.1 e1
1 a1 b1 2.0 0.4 e2
3 a2 b2 3.0 0.7 e4
4 a3 b3 4.0 8.3 e5
A B C D E
0 a0 b0 1.0 0.1 e1
1 a1 b1 2.0 0.4 e2
2 a1 b2 NaN 0.4 e3
3 a2 b2 3.0 0.7 e4
4 a3 b3 4.0 8.3 e5
0 False
1 False
2 True
3 False
4 False
5 False
dtype: bool
0 False
1 False
2 False
3 False
4 False
5 False
dtype: bool
A B C D E
0 a0 b0 1.0 0.1 e1
1 a1 b1 2.0 0.4 e2
3 a2 b2 3.0 0.7 e4
4 a3 b3 4.0 8.3 e5
5 a4 None 5.0 NaN w2
A B C D E
0 a0 b0 1.0 0.1 e1
1 a1 b1 2.0 0.4 e2
3 a2 b2 3.0 0.7 e4
4 a3 b3 4.0 8.3 e5
5 a4 None 5.0 NaN w2
A B C D E
0 a0 b0 1 0.1 e1
1 a1 b1 2 0.4 e2
2 a1 b2 B* 0.4 e3
3 a2 b2 3 0.7 e4
4 a3 b3 4 8.3 e5
5 a4 B* 5 B* w2
A B C D E
0 a0 b0 1.0 0.1 e1
1 a1 b1 2.0 0.4 e2
2 a1 b2 3.0 0.4 e3
3 a2 b2 3.0 0.7 e4
4 a3 b3 4.0 8.3 e5
5 a4 3 5.0 3.0 w2
0 1.0
1 2.0
2 2.5
3 3.0
4 4.0
5 5.0
Name: C, dtype: float64
0 1.0
1 2.0
2 3.0
3 4.0
4 5.0
dtype: float64
G:/pycharm/untitled/lesson1/BigData/1.py:26: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
print(df[df["D"]>lower_d-k*d_int][df["D"]<upper_d+k*d_int])#报警了,报他个玩意,关我屁事
A B C D E
0 a0 b0 1.0 0.1 e1
1 a1 b1 2.0 0.4 e2
2 a1 b2 NaN 0.4 e3
3 a2 b2 3.0 0.7 e4
A B C D E
0 a0 b0 1.0 0.1 e1
1 a1 b1 2.0 0.4 e2
2 a1 b2 NaN 0.4 e3
3 a2 b2 3.0 0.7 e4
4 a3 b3 4.0 8.3 e5
['e1' 'e2' 'e3' 'e4' 'e5' 'w2']
5-03异常值处理
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.