1.前言
pandas中的merge和concat類似,但merge主要用於依據key column(或統一的索引)來合併兩組數據,其概念與資料庫(Database)中的join相同,因此也常被用在Database式的資料處理當中。
2.通過key合併
import pandas as pd
# Two frames that share the same "key" column: `left` carries columns A/B,
# `right` carries columns C/D.
left = pd.DataFrame(
    {
        "key": ["K0", "K1", "K2", "K3"],
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
    }
)
right = pd.DataFrame(
    {
        "key": ["K0", "K1", "K2", "K3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    }
)

# Merge the two frames on the shared "key" column.
res = pd.merge(left, right, on="key")

# Show both inputs followed by the merged result; the separator leaves two
# blank lines between consecutive frames.
print(left, right, res, sep="\n\n\n")
#輸出
key A B
0 K0 A0 B0
1 K1 A1 B1
2 K2 A2 B2
3 K3 A3 B3
key C D
0 K0 C0 D0
1 K1 C1 D1
2 K2 C2 D2
3 K3 C3 D3
key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
3.Indicator
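indicator=True會在合併結果中新增一欄(預設名稱為_merge),標記每一筆記錄的來源:left_only(只存在於左表)、right_only(只存在於右表)或both(兩邊都有)。若indicator傳入的是字串,則以該字串作為這一欄的名稱。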
import pandas as pd

# Sample frames for this section, chosen to reproduce the output listed
# below the snippet: key 0 exists only in df1, key 1 in both, keys 2 and 3
# only in df2.
df1 = pd.DataFrame({"col1": [0, 1], "col_left": ["a", "b"]})
df2 = pd.DataFrame({"col1": [1, 2, 3], "col_right": [2, 2, 2]})

# Outer merge on col1. indicator=True appends a "_merge" column whose value
# is left_only / right_only / both, telling where each row came from.
res = pd.merge(df1, df2, on="col1", how="outer", indicator=True)
print(res)
print('\n')

# Passing a string instead of True uses that string as the name of the
# indicator column.
res_self = pd.merge(
    df1, df2, on="col1", how="outer", indicator="indicator_column"
)
print(res_self)
#輸出
col1 col_left col_right _merge
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 3 NaN 2.0 right_only
col1 col_left col_right indicator_column
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 3 NaN 2.0 right_only
4.根據index合併
import pandas as pd
# Two frames keyed by their index labels rather than by a column.
# Labels K0 and K2 appear in both; K1 only in `left`, K3 only in `right`.
left = pd.DataFrame(
    {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]},
    index=["K0", "K1", "K2"],
)
right = pd.DataFrame(
    {"C": ["C0", "C2", "C3"], "D": ["D0", "D2", "D3"]},
    index=["K0", "K2", "K3"],
)

# Keyword arguments that tell pd.merge to join on the index of both sides.
join_on_index = {"left_index": True, "right_index": True}

# how='outer' keeps every index label from either side (missing cells are NaN).
res_outer = pd.merge(left, right, how="outer", **join_on_index)
# how='inner' keeps only the index labels present on both sides.
res_inner = pd.merge(left, right, how="inner", **join_on_index)

# Show the inputs and both results; the separator leaves two blank lines
# between consecutive frames.
print(left, right, res_outer, res_inner, sep="\n\n\n")
#輸出
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
C D
K0 C0 D0
K2 C2 D2
K3 C3 D3
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3
A B C D
K0 A0 B0 C0 D0
K2 A2 B2 C2 D2
5.解決overlapping的問題
import pandas as pd
# Both frames have an "age" column in addition to the join key "k", so the
# merged result would contain two columns with the same name.
boys = pd.DataFrame(dict(k=["K0", "K1", "K2"], age=[1, 2, 3]))
girls = pd.DataFrame(dict(k=["K0", "K0", "K3"], age=[4, 5, 6]))

# `suffixes` resolves the overlap: the left frame's "age" becomes "age_boy"
# and the right frame's becomes "age_girl".
res = pd.merge(
    boys,
    girls,
    how="inner",
    on="k",
    suffixes=["_boy", "_girl"],
)
print(res)
#輸出
k age_boy age_girl
0 K0 1 4
1 K0 1 5