數據分析-數據處理-pandas處理缺失值

缺失值處理

找出缺失值

# 處理缺失值
import pandas as pd
import numpy as np

# 創建含有缺失值的DataFrame
df = pd.DataFrame(np.random.randn(10,6))
df.iloc[:4, 1] = None
df.iloc[:2, 4:6] = None
df.iloc[6, 3:5] = None
df.iloc[8, 0:2] = None

print(df)
          0         1         2         3         4         5
0  1.905276       NaN -0.978137  1.682227       NaN       NaN
1  0.164089       NaN -1.373336  1.388321       NaN       NaN
2  0.599135       NaN  2.294221  0.737271  0.238057  0.526875
3 -0.481358       NaN -0.519011  0.214852  0.040489  0.599064
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602  0.726568  1.432684 -1.403756  0.252173
6 -0.842605  1.192915 -0.831709       NaN       NaN -0.073455
7  2.223354 -1.164356 -1.222986  0.009452  1.687198  0.344141
8       NaN       NaN -1.024819  0.689602  1.210335 -0.714473
9 -0.397766  0.739464 -1.535297 -1.868259  0.724042 -1.714549
# 判斷缺失值
# 元素級別的判斷,把對應的所有元素的位置都列出來,元素爲空或者爲NA爲True,否則返回False
res = df.isnull()

res
0 1 2 3 4 5
0 False True False False True True
1 False True False False True True
2 False True False False False False
3 False True False False False False
4 False False False False False False
5 False False False False False False
6 False False False True True False
7 False False False False False False
8 True True False False False False
9 False False False False False False
# 列級別的判斷,只要該列有空或者NA就返回True,否則返回False
res = df.isnull().any()

res
0     True
1     True
2    False
3     True
4     True
5     True
dtype: bool
# 只顯示存在缺失值的行,確定缺失值的位置
# 用二維布爾數組索引時,一行中有幾個缺失值該行就會被選出幾次,因此用 drop_duplicates() 去掉重複的行

result = df[df.isnull().values==True].drop_duplicates()
result
0 1 2 3 4 5
0 1.905276 NaN -0.978137 1.682227 NaN NaN
1 0.164089 NaN -1.373336 1.388321 NaN NaN
2 0.599135 NaN 2.294221 0.737271 0.238057 0.526875
3 -0.481358 NaN -0.519011 0.214852 0.040489 0.599064
6 -0.842605 1.192915 -0.831709 NaN NaN -0.073455
8 NaN NaN -1.024819 0.689602 1.210335 -0.714473
# 獲得爲空或者NA的列索引
res = df.columns[df.isnull().any()==True]
res
Int64Index([0, 1, 3, 4, 5], dtype='int64')
# 獲取每列爲空的數據的個數
num = df.isnull().sum()
num
0    1
1    5
2    0
3    1
4    3
5    2
dtype: int64
# 獲取每行爲空的數據的個數(axis=1 表示沿着列的方向對每一行求和)
num = df.isnull().sum(axis=1)
num
0    3
1    3
2    1
3    1
4    0
5    0
6    2
7    0
8    2
9    0
dtype: int64

刪除缺失值所在的行列

# 刪除具有空值的行(返回新的對象,不改變原DataFrame)
df.dropna()
0 1 2 3 4 5
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602 0.726568 1.432684 -1.403756 0.252173
7 2.223354 -1.164356 -1.222986 0.009452 1.687198 0.344141
9 -0.397766 0.739464 -1.535297 -1.868259 0.724042 -1.714549
# 刪除具有空值的列
df.dropna(axis=1)
2
0 -0.978137
1 -1.373336
2 2.294221
3 -0.519011
4 -0.454329
5 0.726568
6 -0.831709
7 -1.222986
8 -1.024819
9 -1.535297
# 一行中所有值都爲缺失值才刪除該行
df.dropna(how='all')
0 1 2 3 4 5
0 1.905276 NaN -0.978137 1.682227 NaN NaN
1 0.164089 NaN -1.373336 1.388321 NaN NaN
2 0.599135 NaN 2.294221 0.737271 0.238057 0.526875
3 -0.481358 NaN -0.519011 0.214852 0.040489 0.599064
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602 0.726568 1.432684 -1.403756 0.252173
6 -0.842605 1.192915 -0.831709 NaN NaN -0.073455
7 2.223354 -1.164356 -1.222986 0.009452 1.687198 0.344141
8 NaN NaN -1.024819 0.689602 1.210335 -0.714473
9 -0.397766 0.739464 -1.535297 -1.868259 0.724042 -1.714549
# 每行至少有四個非空值才保留該行
df.dropna(thresh=4)
0 1 2 3 4 5
2 0.599135 NaN 2.294221 0.737271 0.238057 0.526875
3 -0.481358 NaN -0.519011 0.214852 0.040489 0.599064
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602 0.726568 1.432684 -1.403756 0.252173
6 -0.842605 1.192915 -0.831709 NaN NaN -0.073455
7 2.223354 -1.164356 -1.222986 0.009452 1.687198 0.344141
8 NaN NaN -1.024819 0.689602 1.210335 -0.714473
9 -0.397766 0.739464 -1.535297 -1.868259 0.724042 -1.714549
# 刪除在subset指定的列(此處爲第2、4列)中含有缺失值的行
df.dropna(subset=[2,4])
0 1 2 3 4 5
2 0.599135 NaN 2.294221 0.737271 0.238057 0.526875
3 -0.481358 NaN -0.519011 0.214852 0.040489 0.599064
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602 0.726568 1.432684 -1.403756 0.252173
7 2.223354 -1.164356 -1.222986 0.009452 1.687198 0.344141
8 NaN NaN -1.024819 0.689602 1.210335 -0.714473
9 -0.397766 0.739464 -1.535297 -1.868259 0.724042 -1.714549

缺失值填充

# 缺失值填充0
df.fillna(0)
0 1 2 3 4 5
0 1.905276 0.000000 -0.978137 1.682227 0.000000 0.000000
1 0.164089 0.000000 -1.373336 1.388321 0.000000 0.000000
2 0.599135 0.000000 2.294221 0.737271 0.238057 0.526875
3 -0.481358 0.000000 -0.519011 0.214852 0.040489 0.599064
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602 0.726568 1.432684 -1.403756 0.252173
6 -0.842605 1.192915 -0.831709 0.000000 0.000000 -0.073455
7 2.223354 -1.164356 -1.222986 0.009452 1.687198 0.344141
8 0.000000 0.000000 -1.024819 0.689602 1.210335 -0.714473
9 -0.397766 0.739464 -1.535297 -1.868259 0.724042 -1.714549
# 橫向用缺失值前面的值替換缺失值
df.fillna(axis=1, method='ffill')
0 1 2 3 4 5
0 1.905276 1.905276 -0.978137 1.682227 1.682227 1.682227
1 0.164089 0.164089 -1.373336 1.388321 1.388321 1.388321
2 0.599135 0.599135 2.294221 0.737271 0.238057 0.526875
3 -0.481358 -0.481358 -0.519011 0.214852 0.040489 0.599064
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602 0.726568 1.432684 -1.403756 0.252173
6 -0.842605 1.192915 -0.831709 -0.831709 -0.831709 -0.073455
7 2.223354 -1.164356 -1.222986 0.009452 1.687198 0.344141
8 NaN NaN -1.024819 0.689602 1.210335 -0.714473
9 -0.397766 0.739464 -1.535297 -1.868259 0.724042 -1.714549
# 縱向用缺失值上面的值替換缺失值
df.fillna(axis=0, method='ffill')
0 1 2 3 4 5
0 1.905276 NaN -0.978137 1.682227 NaN NaN
1 0.164089 NaN -1.373336 1.388321 NaN NaN
2 0.599135 NaN 2.294221 0.737271 0.238057 0.526875
3 -0.481358 NaN -0.519011 0.214852 0.040489 0.599064
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602 0.726568 1.432684 -1.403756 0.252173
6 -0.842605 1.192915 -0.831709 1.432684 -1.403756 -0.073455
7 2.223354 -1.164356 -1.222986 0.009452 1.687198 0.344141
8 2.223354 -1.164356 -1.024819 0.689602 1.210335 -0.714473
9 -0.397766 0.739464 -1.535297 -1.868259 0.724042 -1.714549
# 不同的列用不同的值進行填充
info = {0:0,1:1,2:2,3:3,4:4,5:5}
df.fillna(value=info)
0 1 2 3 4 5
0 1.905276 1.000000 -0.978137 1.682227 4.000000 5.000000
1 0.164089 1.000000 -1.373336 1.388321 4.000000 5.000000
2 0.599135 1.000000 2.294221 0.737271 0.238057 0.526875
3 -0.481358 1.000000 -0.519011 0.214852 0.040489 0.599064
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602 0.726568 1.432684 -1.403756 0.252173
6 -0.842605 1.192915 -0.831709 3.000000 4.000000 -0.073455
7 2.223354 -1.164356 -1.222986 0.009452 1.687198 0.344141
8 0.000000 1.000000 -1.024819 0.689602 1.210335 -0.714473
9 -0.397766 0.739464 -1.535297 -1.868259 0.724042 -1.714549
# 限制每列最多填充的缺失值個數(limit=1 表示每列只填充第一個缺失值)
df.fillna(value=info, limit=1)
0 1 2 3 4 5
0 1.905276 1.000000 -0.978137 1.682227 4.000000 5.000000
1 0.164089 NaN -1.373336 1.388321 NaN NaN
2 0.599135 NaN 2.294221 0.737271 0.238057 0.526875
3 -0.481358 NaN -0.519011 0.214852 0.040489 0.599064
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602 0.726568 1.432684 -1.403756 0.252173
6 -0.842605 1.192915 -0.831709 3.000000 NaN -0.073455
7 2.223354 -1.164356 -1.222986 0.009452 1.687198 0.344141
8 0.000000 NaN -1.024819 0.689602 1.210335 -0.714473
9 -0.397766 0.739464 -1.535297 -1.868259 0.724042 -1.714549
發佈了67 篇原創文章 · 獲贊 24 · 訪問量 1萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章