Pandas
1.層級索引
-
MultiIndex 對象
-
#pandas 層級索引
import pandas as pd
import numpy as np

# Build a 12-element Series of random values with a two-level index:
# the outer level holds the letters 'a'..'d' (three rows each) and the
# inner level holds the integers 0..2.
ser_obj = pd.Series(
    np.random.randn(12),
    index=[
        ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd', 'd'],
        [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2],
    ],
)
print(ser_obj)
answer:
a 0 -0.690188
1 0.850422
2 -1.732979
b 0 -1.007294
1 0.141019
2 -1.989199
c 0 -2.082892
1 1.728222
2 0.035156
d 0 0.489372
1 -0.582691
2 1.302097
# MultiIndex 索引對象
print(type(ser_obj.index))
print(ser_obj.index)
<class 'pandas.indexes.multi.MultiIndex'>
MultiIndex(levels=[['a', 'b', 'c', 'd'], [0, 1, 2]],
labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]])
-
選取子集
- 外層選取 ser_obj['outer_label']
- 內層選取 ser_obj[:,'inner_label']
-
#選取子集
#外層
# 外層選取
print(ser_obj['c'])
0 2.579259
1 0.566795
2 -0.796418
# 內層選取
print(ser_obj[:, 2])
a 1.254099
b -0.751972
c -0.796418
d -1.541993
- 交換分層順序
- swaplevel()
- 排序分層
- sortlevel()(pandas 1.0 起已移除,新版請改用 sort_index())
#交換分層順序
print(ser_obj.swaplevel())
0 a 0.078539
1 a 0.643005
2 a 1.254099
0 b 0.569994
1 b -1.267482
2 b -0.751972
0 c 2.579259
1 c 0.566795
2 c -0.796418
0 d 1.444369
1 d -0.013740
2 d -1.541993
#交換並排序分層
# Series.sortlevel() was deprecated in pandas 0.20 and removed in 1.0.
# sort_index(level=0) is the replacement: it sorts by the new outer level
# first and then by the remaining level (sort_remaining defaults to True).
print(ser_obj.swaplevel().sort_index(level=0))
0 a 0.078539
b 0.569994
c 2.579259
d 1.444369
1 a 0.643005
b -1.267482
c 0.566795
d -0.013740
2 a 1.254099
b -0.751972
c -0.796418
d -1.541993
2.分組與聚合
(1)對象 GroupBy DataFrameGroupBy,SeriesGroupBy
GroupBy對象沒有進行實際的運算,只是包含分組的中間數據
使用如mean()方法,對GroupBy對象進行分組運算
size()方法返回每個分組的元素個數
- 對數據進行分組,接着對每組數據進行統計分析
- 分組的運算過程
- split--->apply--->combine
- split:進行分組的根據
- apply:每個分組運行的計算規則
- combine:把每個分組的計算結果合併起來
import pandas as pd
import numpy as np
# Sample data for the groupby examples: two label columns to group on and
# two columns of standard-normal random floats to aggregate.
dict_obj = {
    'key1': ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
    'key2': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'data1': np.random.randn(8),
    'data2': np.random.randn(8),
}
df_obj = pd.DataFrame(data=dict_obj)
print(df_obj)
key1 key2 data1 data2
0 a one 2.238727 -0.130895
1 b one 0.862376 0.539461
2 a two 0.849171 0.259936
3 b three -1.973560 -1.463266
4 a two -0.239325 -0.151813
5 b two -2.230763 0.343739
6 a one 1.278376 -0.878451
7 a three 0.935462 -0.778689
# dataframe根據key1進行分組
print(df_obj.groupby('key1'))
<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x0000023F803DF2B0>
# data1列根據key1進行分組
print(type(df_obj['data1'].groupby(df_obj['key1'])))
<class 'pandas.core.groupby.groupby.SeriesGroupBy'>
# 分組運算
# Grouping the whole frame by a column name gives a DataFrameGroupBy.
# numeric_only=True keeps the string column key2 out of the mean; without it
# pandas >= 2.0 raises TypeError instead of silently dropping that column.
grouped1 = df_obj.groupby('key1')
print(grouped1.mean(numeric_only=True))
# Grouping a single column by another column's values gives a SeriesGroupBy.
grouped2 = df_obj['data1'].groupby(df_obj['key1'])
print(grouped2.mean())
data1 data2
key1
a 0.129186 0.354484
b -0.482010 0.144217
key1
a 0.129186
b -0.482010
Name: data1, dtype: float64
# size
print(grouped1.size())
print(grouped2.size())
# 按自定義key分組,多層列表
df_obj.groupby([df_obj['key1'], df_obj['key2']]).size()
key1 key2
a one 2
three 1
two 2
b one 1
three 1
two 1
# 按多個列多層分組
grouped2 = df_obj.groupby(['key1', 'key2'])
print(grouped2.size())
key1 key2
a one 2
three 1
two 2
b one 1
three 1
two 1
# 多層分組按key的順序進行
grouped3 = df_obj.groupby(['key2', 'key1'])
print(grouped3.mean())
print()
print(grouped3.mean().unstack())
data1 data2
key2 key1
one a 1.432128 0.705949
b -0.103997 -0.552156
three a -0.698301 0.468734
b 0.325473 -2.221337
two a -0.269729 0.208442
b 0.360193 0.324535
data1 data2
key1 a b a b
key2
one 1.432128 -0.103997 0.705949 -0.552156
three -0.698301 0.325473 0.468734 -2.221337
two -0.269729 0.360193 0.208442 0.324535
(2)GroupBy對象分組迭代
- GroupBy對象支持迭代操作
- 每次迭代返回一個元組(group_name,group_data)
- 可用於分組數據的具體運算
- GroupBy對象可轉換爲列表或者字典
print(df_obj)
key1 key2 data1 data2
0 a one 2.238727 -0.130895
1 b one 0.862376 0.539461
2 a two 0.849171 0.259936
3 b three -1.973560 -1.463266
4 a two -0.239325 -0.151813
5 b two -2.230763 0.343739
6 a one 1.278376 -0.878451
7 a three 0.935462 -0.778689
grouped1 = df_obj.groupby('key1')
#GroupBy對象分組迭代
# 單層分組
# Each iteration yields a (group key, sub-DataFrame of that group's rows) pair.
for group_name, group_data in grouped1:
    print(group_name)
    print(group_data)
a
key1 key2 data1 data2
0 a one 0.268936 1.394133
2 a two -1.324961 -0.627993
4 a two 1.834457 0.137958
6 a one -0.813389 0.653271
7 a three 0.680888 0.215051
b
key1 key2 data1 data2
1 b one -1.714305 0.878062
3 b three 0.108732 -0.620527
5 b two 0.159541 0.175115
# 多層分組
# Group by two columns so that every group key is a (key1, key2) tuple, which
# is what the heading and the printed output of this example show. The
# previous single-column grouping only produced the keys 'a' and 'b'.
grouped2 = df_obj.groupby(['key1', 'key2'])
for group_name, group_data in grouped2:
    print(group_name)
    print(group_data)
('a', 'one')
data1 data2 key1 key2
0 -0.943078 0.820645 a one
6 -1.291468 -1.186638 a one
('a', 'three')
data1 data2 key1 key2
7 1.186941 0.809122 a three
('a', 'two')
data1 data2 key1 key2
2 0.832261 0.843898 a two
4 0.541173 0.117232 a two
('b', 'one')
data1 data2 key1 key2
1 -1.429043 0.142617 b one
('b', 'three')
data1 data2 key1 key2
3 0.906262 0.688165 b three
('b', 'two')
data1 data2 key1 key2
5 -0.213385 -0.098734 b two
GroupBy對象轉list dict
# GroupBy對象轉換list
list(grouped1)
[('a', data1 data2 key1 key2
0 -0.943078 0.820645 a one
2 0.832261 0.843898 a two
4 0.541173 0.117232 a two
6 -1.291468 -1.186638 a one
7 1.186941 0.809122 a three), ('b', data1 data2 key1 key2
1 -1.429043 0.142617 b one
3 0.906262 0.688165 b three
5 -0.213385 -0.098734 b two)]
# GroupBy對象轉換dict
dict(list(grouped1))
{'a': data1 data2 key1 key2
0 -0.943078 0.820645 a one
2 0.832261 0.843898 a two
4 0.541173 0.117232 a two
6 -1.291468 -1.186638 a one
7 1.186941 0.809122 a three, 'b': data1 data2 key1 key2
1 -1.429043 0.142617 b one
3 0.906262 0.688165 b three
5 -0.213385 -0.098734 b two}
聚合
# Sample data for the aggregation examples: same label columns as before,
# with small random integers (1 through 9) as the values.
dict_obj = {
    'key1': ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
    'key2': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'data1': np.random.randint(1, 10, size=8),
    'data2': np.random.randint(1, 10, size=8),
}
df_obj5 = pd.DataFrame(data=dict_obj)
print(df_obj5)
data1 data2 key1 key2
0 4 2 a one
1 7 1 b one
2 2 8 a two
3 9 4 b three
4 3 2 a two
5 8 5 b two
6 6 8 a one
7 9 3 a three
# 內置的聚合函數
print(df_obj5.groupby('key1').sum())
data1 data2
key1
a 24 23
b 24 10
print(df_obj5.groupby('key1').max())
data1 data2 key2
key1
a 9 8 two
b 9 5 two
print(df_obj5.groupby('key1').min())
data1 data2 key2
key1
a 2 2 one
b 7 1 one
# numeric_only=True skips the string column key2; pandas >= 2.0 raises
# TypeError when asked for the mean of a non-numeric column.
print(df_obj5.groupby('key1').mean(numeric_only=True))
data1 data2
key1
a 4.8 4.600000
b 8.0 3.333333
print(df_obj5.groupby('key1').size())
key1
a 5
b 3
print(df_obj5.groupby('key1').count())
data1 data2 key2
key1
a 5 5 5
b 3 3 3
print(df_obj5.groupby('key1').describe())
data1 data2
key1
a count 5.000000 5.000000
mean 4.800000 4.600000
std 2.774887 3.130495
min 2.000000 2.000000
25% 3.000000 2.000000
50% 4.000000 3.000000
75% 6.000000 8.000000
max 9.000000 8.000000
b count 3.000000 3.000000
mean 8.000000 3.333333
std 1.000000 2.081666
min 7.000000 1.000000
25% 7.500000 2.500000
50% 8.000000 4.000000
75% 8.500000 4.500000
max 9.000000 5.000000
# 自定義聚合函數
def peak_range(df):
    """Return the spread (maximum minus minimum) of the values in ``df``.

    Used in this file as a custom aggregation function passed to
    ``GroupBy.agg``.

    Args:
        df: Any object exposing ``max()`` and ``min()`` whose results can be
            subtracted, e.g. a pandas Series or a NumPy array. According to
            the original note, this is the set of records belonging to one
            group key.

    Returns:
        ``df.max() - df.min()``.
    """
    return df.max() - df.min()
# Select the numeric columns explicitly: max() - min() on the string column
# key2 raises TypeError, and pandas >= 2.0 no longer drops such columns
# silently during aggregation.
print(df_obj5.groupby('key1')[['data1', 'data2']].agg(peak_range))
print(df_obj.groupby('key1')[['data1', 'data2']].agg(lambda df : df.max() - df.min()))
data1 data2
key1
a 7 6
b 2 4
data1 data2
key1
a 2.478410 2.030536
b 2.335305 0.786899
# 應用多個聚合函數
# 同時應用多個聚合函數
# Restrict to the numeric columns: 'mean', 'std' and peak_range cannot be
# computed on the string column key2 (TypeError on pandas >= 2.0).
print(df_obj.groupby('key1')[['data1', 'data2']].agg(['mean', 'std', 'count', peak_range])) # result columns are named after the functions by default
data1 data2
mean std count peak_range mean std count peak_range
key1
a 0.065166 1.110226 5 2.478410 0.280852 0.875752 5 2.030536
b -0.245389 1.167982 3 2.335305 0.244016 0.403130 3 0.786899
# Restrict to the numeric columns: 'mean', 'std' and peak_range cannot be
# computed on the string column key2 (TypeError on pandas >= 2.0).
print(df_obj.groupby('key1')[['data1', 'data2']].agg(['mean', 'std', 'count', ('range', peak_range)])) # a (name, function) tuple supplies a custom column name
data1 data2
mean std count range mean std count range
key1
a 0.065166 1.110226 5 2.478410 0.280852 0.875752 5 2.030536
b -0.245389 1.167982 3 2.335305 0.244016 0.403130 3 0.786899
# 每列作用不同的聚合函數
dict_mapping = {'data1':'mean',
'data2':'sum'}
print(df_obj.groupby('key1').agg(dict_mapping))
data2 data1
key1
a 1.404259 0.065166
b 0.732047 -0.245389
dict_mapping = {'data1':['mean','max'],
'data2':'sum'}
print(df_obj.groupby('key1').agg(dict_mapping))
data2 data1
sum mean max
key1
a 1.404259 0.065166 1.186941
b 0.732047 -0.245389 0.906262