缺失值處理
找出缺失值
import pandas as pd
import numpy as np
# Build a 10x6 frame of standard-normal samples, then blank out a few
# regions so that the cells below have missing values to work with.
df = pd.DataFrame(np.random.randn(10, 6))
# Each entry is a (row selector, column selector) pair for .iloc.
for rows, cols in (
    (slice(None, 4), 1),            # rows 0-3 of column 1
    (slice(None, 2), slice(4, 6)),  # rows 0-1 of columns 4-5
    (6, slice(3, 5)),               # row 6, columns 3-4
    (8, slice(0, 2)),               # row 8, columns 0-1
):
    df.iloc[rows, cols] = None
print(df)
0 1 2 3 4 5
0 1.905276 NaN -0.978137 1.682227 NaN NaN
1 0.164089 NaN -1.373336 1.388321 NaN NaN
2 0.599135 NaN 2.294221 0.737271 0.238057 0.526875
3 -0.481358 NaN -0.519011 0.214852 0.040489 0.599064
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602 0.726568 1.432684 -1.403756 0.252173
6 -0.842605 1.192915 -0.831709 NaN NaN -0.073455
7 2.223354 -1.164356 -1.222986 0.009452 1.687198 0.344141
8 NaN NaN -1.024819 0.689602 1.210335 -0.714473
9 -0.397766 0.739464 -1.535297 -1.868259 0.724042 -1.714549
# Element-wise mask with the same shape as df: True where a value is missing.
res = pd.isnull(df)
res
|
0 |
1 |
2 |
3 |
4 |
5 |
0 |
False |
True |
False |
False |
True |
True |
1 |
False |
True |
False |
False |
True |
True |
2 |
False |
True |
False |
False |
False |
False |
3 |
False |
True |
False |
False |
False |
False |
4 |
False |
False |
False |
False |
False |
False |
5 |
False |
False |
False |
False |
False |
False |
6 |
False |
False |
False |
True |
True |
False |
7 |
False |
False |
False |
False |
False |
False |
8 |
True |
True |
False |
False |
False |
False |
9 |
False |
False |
False |
False |
False |
False |
# One flag per column: True when that column holds at least one missing value.
res = df.isnull().any(axis=0)
res
0 True
1 True
2 False
3 True
4 True
5 True
dtype: bool
# Select every row that contains at least one missing value.
# A 1-D row mask is used deliberately: indexing with the 2-D
# `df.isnull().values == True` array yields a row once per NaN cell, and the
# drop_duplicates() needed to undo that would also discard rows that are
# genuinely duplicated in the data.
result = df[df.isnull().any(axis=1)]
result
|
0 |
1 |
2 |
3 |
4 |
5 |
0 |
1.905276 |
NaN |
-0.978137 |
1.682227 |
NaN |
NaN |
1 |
0.164089 |
NaN |
-1.373336 |
1.388321 |
NaN |
NaN |
2 |
0.599135 |
NaN |
2.294221 |
0.737271 |
0.238057 |
0.526875 |
3 |
-0.481358 |
NaN |
-0.519011 |
0.214852 |
0.040489 |
0.599064 |
6 |
-0.842605 |
1.192915 |
-0.831709 |
NaN |
NaN |
-0.073455 |
8 |
NaN |
NaN |
-1.024819 |
0.689602 |
1.210335 |
-0.714473 |
# Labels of the columns that contain at least one missing value.
# any() already returns a boolean Series, so it can index df.columns directly
# without the redundant `== True` comparison.
res = df.columns[df.isnull().any()]
res
Int64Index([0, 1, 3, 4, 5], dtype='int64')
# Count of missing values in each column (summing the mask down the rows).
num = df.isnull().sum(axis=0)
num
0 1
1 5
2 0
3 1
4 3
5 2
dtype: int64
# Count of missing values in each row (summing the mask across the columns).
num = df.isnull().sum(axis='columns')
num
0 3
1 3
2 1
3 1
4 0
5 0
6 2
7 0
8 2
9 0
dtype: int64
刪除缺失值所在的行列
# Drop every row that has at least one missing value; the result is a new
# frame and df itself is left untouched.
df.dropna(axis=0, how='any')
|
0 |
1 |
2 |
3 |
4 |
5 |
4 |
-0.358184 |
-0.794557 |
-0.454329 |
-1.024130 |
-0.090786 |
-0.018543 |
5 |
-0.679481 |
-0.126602 |
0.726568 |
1.432684 |
-1.403756 |
0.252173 |
7 |
2.223354 |
-1.164356 |
-1.222986 |
0.009452 |
1.687198 |
0.344141 |
9 |
-0.397766 |
0.739464 |
-1.535297 |
-1.868259 |
0.724042 |
-1.714549 |
# Drop every column that has at least one missing value.
df.dropna(axis='columns')
|
2 |
0 |
-0.978137 |
1 |
-1.373336 |
2 |
2.294221 |
3 |
-0.519011 |
4 |
-0.454329 |
5 |
0.726568 |
6 |
-0.831709 |
7 |
-1.222986 |
8 |
-1.024819 |
9 |
-1.535297 |
# Drop only the rows in which every value is missing.
df.dropna(axis=0, how='all')
|
0 |
1 |
2 |
3 |
4 |
5 |
0 |
1.905276 |
NaN |
-0.978137 |
1.682227 |
NaN |
NaN |
1 |
0.164089 |
NaN |
-1.373336 |
1.388321 |
NaN |
NaN |
2 |
0.599135 |
NaN |
2.294221 |
0.737271 |
0.238057 |
0.526875 |
3 |
-0.481358 |
NaN |
-0.519011 |
0.214852 |
0.040489 |
0.599064 |
4 |
-0.358184 |
-0.794557 |
-0.454329 |
-1.024130 |
-0.090786 |
-0.018543 |
5 |
-0.679481 |
-0.126602 |
0.726568 |
1.432684 |
-1.403756 |
0.252173 |
6 |
-0.842605 |
1.192915 |
-0.831709 |
NaN |
NaN |
-0.073455 |
7 |
2.223354 |
-1.164356 |
-1.222986 |
0.009452 |
1.687198 |
0.344141 |
8 |
NaN |
NaN |
-1.024819 |
0.689602 |
1.210335 |
-0.714473 |
9 |
-0.397766 |
0.739464 |
-1.535297 |
-1.868259 |
0.724042 |
-1.714549 |
# Keep only the rows that have at least 4 non-missing values.
df.dropna(axis=0, thresh=4)
|
0 |
1 |
2 |
3 |
4 |
5 |
2 |
0.599135 |
NaN |
2.294221 |
0.737271 |
0.238057 |
0.526875 |
3 |
-0.481358 |
NaN |
-0.519011 |
0.214852 |
0.040489 |
0.599064 |
4 |
-0.358184 |
-0.794557 |
-0.454329 |
-1.024130 |
-0.090786 |
-0.018543 |
5 |
-0.679481 |
-0.126602 |
0.726568 |
1.432684 |
-1.403756 |
0.252173 |
6 |
-0.842605 |
1.192915 |
-0.831709 |
NaN |
NaN |
-0.073455 |
7 |
2.223354 |
-1.164356 |
-1.222986 |
0.009452 |
1.687198 |
0.344141 |
8 |
NaN |
NaN |
-1.024819 |
0.689602 |
1.210335 |
-0.714473 |
9 |
-0.397766 |
0.739464 |
-1.535297 |
-1.868259 |
0.724042 |
-1.714549 |
# Drop rows with a missing value in column 2 or column 4; missing values in
# the other columns are ignored.
df.dropna(axis=0, subset=[2, 4])
|
0 |
1 |
2 |
3 |
4 |
5 |
2 |
0.599135 |
NaN |
2.294221 |
0.737271 |
0.238057 |
0.526875 |
3 |
-0.481358 |
NaN |
-0.519011 |
0.214852 |
0.040489 |
0.599064 |
4 |
-0.358184 |
-0.794557 |
-0.454329 |
-1.024130 |
-0.090786 |
-0.018543 |
5 |
-0.679481 |
-0.126602 |
0.726568 |
1.432684 |
-1.403756 |
0.252173 |
7 |
2.223354 |
-1.164356 |
-1.222986 |
0.009452 |
1.687198 |
0.344141 |
8 |
NaN |
NaN |
-1.024819 |
0.689602 |
1.210335 |
-0.714473 |
9 |
-0.397766 |
0.739464 |
-1.535297 |
-1.868259 |
0.724042 |
-1.714549 |
缺失值填充
# Replace every missing value with the constant 0.
df.fillna(value=0)
|
0 |
1 |
2 |
3 |
4 |
5 |
0 |
1.905276 |
0.000000 |
-0.978137 |
1.682227 |
0.000000 |
0.000000 |
1 |
0.164089 |
0.000000 |
-1.373336 |
1.388321 |
0.000000 |
0.000000 |
2 |
0.599135 |
0.000000 |
2.294221 |
0.737271 |
0.238057 |
0.526875 |
3 |
-0.481358 |
0.000000 |
-0.519011 |
0.214852 |
0.040489 |
0.599064 |
4 |
-0.358184 |
-0.794557 |
-0.454329 |
-1.024130 |
-0.090786 |
-0.018543 |
5 |
-0.679481 |
-0.126602 |
0.726568 |
1.432684 |
-1.403756 |
0.252173 |
6 |
-0.842605 |
1.192915 |
-0.831709 |
0.000000 |
0.000000 |
-0.073455 |
7 |
2.223354 |
-1.164356 |
-1.222986 |
0.009452 |
1.687198 |
0.344141 |
8 |
0.000000 |
0.000000 |
-1.024819 |
0.689602 |
1.210335 |
-0.714473 |
9 |
-0.397766 |
0.739464 |
-1.535297 |
-1.868259 |
0.724042 |
-1.714549 |
# Forward-fill along each row: a missing value takes the nearest non-missing
# value to its left; leading NaNs in a row stay NaN.
# ffill() is used instead of fillna(method='ffill') because the `method`
# keyword of fillna is deprecated in recent pandas releases.
df.ffill(axis=1)
|
0 |
1 |
2 |
3 |
4 |
5 |
0 |
1.905276 |
1.905276 |
-0.978137 |
1.682227 |
1.682227 |
1.682227 |
1 |
0.164089 |
0.164089 |
-1.373336 |
1.388321 |
1.388321 |
1.388321 |
2 |
0.599135 |
0.599135 |
2.294221 |
0.737271 |
0.238057 |
0.526875 |
3 |
-0.481358 |
-0.481358 |
-0.519011 |
0.214852 |
0.040489 |
0.599064 |
4 |
-0.358184 |
-0.794557 |
-0.454329 |
-1.024130 |
-0.090786 |
-0.018543 |
5 |
-0.679481 |
-0.126602 |
0.726568 |
1.432684 |
-1.403756 |
0.252173 |
6 |
-0.842605 |
1.192915 |
-0.831709 |
-0.831709 |
-0.831709 |
-0.073455 |
7 |
2.223354 |
-1.164356 |
-1.222986 |
0.009452 |
1.687198 |
0.344141 |
8 |
NaN |
NaN |
-1.024819 |
0.689602 |
1.210335 |
-0.714473 |
9 |
-0.397766 |
0.739464 |
-1.535297 |
-1.868259 |
0.724042 |
-1.714549 |
# Forward-fill down each column: a missing value takes the nearest non-missing
# value above it; leading NaNs in a column stay NaN.
# ffill() is used instead of fillna(method='ffill') because the `method`
# keyword of fillna is deprecated in recent pandas releases.
df.ffill(axis=0)
|
0 |
1 |
2 |
3 |
4 |
5 |
0 |
1.905276 |
NaN |
-0.978137 |
1.682227 |
NaN |
NaN |
1 |
0.164089 |
NaN |
-1.373336 |
1.388321 |
NaN |
NaN |
2 |
0.599135 |
NaN |
2.294221 |
0.737271 |
0.238057 |
0.526875 |
3 |
-0.481358 |
NaN |
-0.519011 |
0.214852 |
0.040489 |
0.599064 |
4 |
-0.358184 |
-0.794557 |
-0.454329 |
-1.024130 |
-0.090786 |
-0.018543 |
5 |
-0.679481 |
-0.126602 |
0.726568 |
1.432684 |
-1.403756 |
0.252173 |
6 |
-0.842605 |
1.192915 |
-0.831709 |
1.432684 |
-1.403756 |
-0.073455 |
7 |
2.223354 |
-1.164356 |
-1.222986 |
0.009452 |
1.687198 |
0.344141 |
8 |
2.223354 |
-1.164356 |
-1.024819 |
0.689602 |
1.210335 |
-0.714473 |
9 |
-0.397766 |
0.739464 |
-1.535297 |
-1.868259 |
0.724042 |
-1.714549 |
# Per-column fill values: each column label (0-5) maps to the number that
# replaces the missing values in that column.
info = {label: label for label in range(6)}
df.fillna(info)
|
0 |
1 |
2 |
3 |
4 |
5 |
0 |
1.905276 |
1.000000 |
-0.978137 |
1.682227 |
4.000000 |
5.000000 |
1 |
0.164089 |
1.000000 |
-1.373336 |
1.388321 |
4.000000 |
5.000000 |
2 |
0.599135 |
1.000000 |
2.294221 |
0.737271 |
0.238057 |
0.526875 |
3 |
-0.481358 |
1.000000 |
-0.519011 |
0.214852 |
0.040489 |
0.599064 |
4 |
-0.358184 |
-0.794557 |
-0.454329 |
-1.024130 |
-0.090786 |
-0.018543 |
5 |
-0.679481 |
-0.126602 |
0.726568 |
1.432684 |
-1.403756 |
0.252173 |
6 |
-0.842605 |
1.192915 |
-0.831709 |
3.000000 |
4.000000 |
-0.073455 |
7 |
2.223354 |
-1.164356 |
-1.222986 |
0.009452 |
1.687198 |
0.344141 |
8 |
0.000000 |
1.000000 |
-1.024819 |
0.689602 |
1.210335 |
-0.714473 |
9 |
-0.397766 |
0.739464 |
-1.535297 |
-1.868259 |
0.724042 |
-1.714549 |
# Same per-column fill values, but limit=1 fills at most one missing value
# per column; the remaining ones stay NaN.
df.fillna(info, limit=1)
|
0 |
1 |
2 |
3 |
4 |
5 |
0 |
1.905276 |
1.000000 |
-0.978137 |
1.682227 |
4.000000 |
5.000000 |
1 |
0.164089 |
NaN |
-1.373336 |
1.388321 |
NaN |
NaN |
2 |
0.599135 |
NaN |
2.294221 |
0.737271 |
0.238057 |
0.526875 |
3 |
-0.481358 |
NaN |
-0.519011 |
0.214852 |
0.040489 |
0.599064 |
4 |
-0.358184 |
-0.794557 |
-0.454329 |
-1.024130 |
-0.090786 |
-0.018543 |
5 |
-0.679481 |
-0.126602 |
0.726568 |
1.432684 |
-1.403756 |
0.252173 |
6 |
-0.842605 |
1.192915 |
-0.831709 |
3.000000 |
NaN |
-0.073455 |
7 |
2.223354 |
-1.164356 |
-1.222986 |
0.009452 |
1.687198 |
0.344141 |
8 |
0.000000 |
NaN |
-1.024819 |
0.689602 |
1.210335 |
-0.714473 |
9 |
-0.397766 |
0.739464 |
-1.535297 |
-1.868259 |
0.724042 |
-1.714549 |