Pandas處理丟失數據
使用dropna()除去NaN的行或列
dates = pd.date_range('20180901',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A', 'B', 'C', 'D'])
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
print(df)
print("除去異常值:")
# 使用pd.dropna()去掉有NaN的行或列
df1 = df.dropna(axis=1, how='any') # ‘any’:只要存在NaN就drop掉;’all‘:必須全部是NaN才drop掉#0:對行進行操作;1:對列進行操作
print(df1)
A B C D
2018-09-01 0 NaN 2.0 3
2018-09-02 4 5.0 NaN 7
2018-09-03 8 9.0 10.0 11
2018-09-04 12 13.0 14.0 15
2018-09-05 16 17.0 18.0 19
2018-09-06 20 21.0 22.0 23
除去異常值:
A D
2018-09-01 0 3
2018-09-02 4 7
2018-09-03 8 11
2018-09-04 12 15
2018-09-05 16 19
2018-09-06 20 23
使用fillna指定值代替NaN
df2 = df.fillna(value=0) # 使用0代替nan
print(df2)
A B C D
2018-09-01 0 0.0 2.0 3
2018-09-02 4 5.0 0.0 7
2018-09-03 8 9.0 10.0 11
2018-09-04 12 13.0 14.0 15
2018-09-05 16 17.0 18.0 19
2018-09-06 20 21.0 22.0 23
##使用isnull判斷是否是NaN
A B C D
2018-09-01 False True False False
2018-09-02 False False True False
2018-09-03 False False False False
2018-09-04 False False False False
2018-09-05 False False False False
2018-09-06 False False False False
Pandas導入導出
pd.read_csv讀取csv文件
data = pd.read_csv('student.csv') # 讀入csv
print(data)
print(data.index)
print(data.columns)
print(data.values)
print(data.describe())
姓名 性別 分數
0 李召 男 96
1 李明 男 89
2 王紅 女 95
RangeIndex(start=0, stop=3, step=1)
Index(['姓名 性別 分數'], dtype='object')
[['李召 男 96']
['李明 男 89']
['王紅 女 95']]
姓名 性別 分數
count 3
unique 3
top 王紅 女 95
freq 1
將資料存取成pickle
data.to_pickle('student.pickle') # 將資料存取成pickle
Pandas合併concat
pandas
處理多組數據需要用到數據的合併處理,concat
是一種基本的合併方式,concat
中有很多參數可以調整。
axis方向合併
# 定義資料集
df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4))*2, columns=['a', 'b', 'c', 'd'])
# concat縱向
print(df1)
print(df2)
print(df3)
# concat縱向合併
res = pd.concat([df1, df2, df3], axis=0)
print(res)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
a b c d
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
a b c d
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
ignore_index
# 縱向合併,並重置index
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
print(res)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
join(合併方式)
# 定義資料集
df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
# 縱向“外”合併df1和df2,擴展
res = pd.concat([df1, df2], axis=0, join='outer', sort=False)
print(res)
# 縱向“內”合併df1和df2,找交集
res = pd.concat([df1, df2], axis=0, join='inner')
print(res)
a b c d e
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 0.0 0.0 0.0 0.0 NaN
2 NaN 1.0 1.0 1.0 1.0
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
b c d
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 0.0 0.0 0.0
2 1.0 1.0 1.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
join_axes(依照axes合併)
# 依照'df1.index'進行橫向合併
res = pd.concat([df1, df2], axis=1, join_axes=[df1.index])
print(res)
# 移除join_axes
res = pd.concat([df1, df2], axis=1)
print(res)
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
4 NaN NaN NaN NaN 1.0 1.0 1.0 1.0
append(添加數據)
# 定義資料集
df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4))*2, columns=['a', 'b', 'c', 'd'])
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
# 將df2合併到df1的下面,以及重置index,並打印出結果
res = df1.append(df2, ignore_index=True)
print(res)
# 合併多個df,講df2與df3合併至df1的下面,以及重置index
res = df1.append([df2, df3], ignore_index=True)
print(res)
# 合併Series,將s1合併至df1,以及重置index,
res = df1.append(s1, ignore_index=True)
print(res)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 2.0 3.0 4.0
Pandas合併merge
pandas
中的merge
和concat
類似,但主要是用於兩組有key column的數據,統一索引的數據,通常也被用在Database的處理中
依據一組key合併
left = pd.DataFrame({
'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']
})
right = pd.DataFrame({
'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']
})
print(left)
print(right)
# 依據key_column合併
res = pd.merge(left, right, on='key')
print(res)
key A B
0 K0 A0 B0
1 K1 A1 B1
2 K2 A2 B2
3 K3 A3 B3
key C D
0 K0 C0 D0
1 K1 C1 D1
2 K2 C2 D2
3 K3 C3 D3
key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
依據兩組key合併
left = pd.DataFrame({
'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']
})
right = pd.DataFrame({
'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']
})
print(left)
key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K2 K1 A3 B3
print(right)
key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K1 K0 C2 D2
3 K2 K0 C3 D3
# 依據key1與key2 column進行合併
res = pd.merge(left, right, on=['key1', 'key2'], how='inner')
print(res)
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
res = pd.merge(left, right, on=['key1', 'key2'], how='outer')
print(res)
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
5 K2 K0 NaN NaN C3 D3
res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print(res)
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
res = pd.merge(left, right, on=['key1', 'key2'], how='right')
print(res)
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
3 K2 K0 NaN NaN C3 D3
Indicator
Indicator=True
會將合併的記錄放在新的一列
df1 = pd.DataFrame({
'col1': [0, 1], 'col_left': ['a', 'b']
})
df2 = pd.DataFrame({
'col1': [1, 2, 2], 'col_right': [2, 2, 2]
})
print(df1)
print(df2)
col1 col_left
0 0 a
1 1 b
col1 col_right
0 1 2
1 2 2
2 2 2
# 依據col1進行合併,並啓用indicator=True
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
print(res)
col1 col_left col_right _merge
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only
# 自定義indicator column的名稱
res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
print(res)
col1 col_left col_right indicator_column
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only
依據index合併
# 定義資料集
left = pd.DataFrame({
'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({
'C': ['C0', 'C1', 'C2'],
'D': ['D0', 'D1', 'D2']},
index=['K0', 'K2', 'K3']
)
print(left)
print(right)
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
C D
K0 C0 D0
K2 C1 D1
K3 C2 D2
# 依據左右資料集的index進行合併,how='outer'
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
print(res)
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C1 D1
K3 NaN NaN C2 D2
# 依據左右資料集的index進行合併,how='inner'
res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
print(res)
A B C D
K0 A0 B0 C0 D0
K2 A2 B2 C1 D1
解決overlapping
# 解決overlapping的問題
boys = pd.DataFrame({'K': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'K': ['K0', 'K1', 'K2'], 'age': [4, 5, 6]})
print(boys)
print(girls)
K age
0 K0 1
1 K1 2
2 K2 3
K age
0 K0 4
1 K1 5
2 K2 6
# 使用suffixes解決overlapping問題
res = pd.merge(boys, girls, on='K', suffixes=['_boy', '_girl'], how='inner')
print(res)
K age_boy age_girl
0 K0 1 4
1 K1 2 5
2 K2 3 6
Pandas plot繪圖
創建一個Series
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 隨機生成1000個數據
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
# 累加數據
data1 = data.cumsum()
# pandas數據可以直接觀察可視化形式
data1.plot()
plt.show()
Dataframe可視化
data = pd.DataFrame(
np.random.randn(1000, 4),
index=np.arange(1000),
columns=list("ABCD")
)
data1 = data.cumsum()
data1.plot()
plt.show()