pandas Basics (3. Indexing and Grouping)


Indexing

import numpy as np
import pandas as pd

# A Series with an explicit index
s = pd.Series(np.arange(6), index=list('BABCDA'))
s
Out[6]: 
B    0
A    1
B    2
C    3
D    4
A    5
dtype: int32

# Is the index free of duplicates?
s.index.is_unique
Out[7]: False

# The unique index values
s.index.unique()
Out[9]: Index(['B', 'A', 'C', 'D'], dtype='object')

# Group by index label and sum
s.groupby(s.index).sum()
Out[11]: 
A    6
B    2
C    3
D    4
dtype: int32

# Build the list of tuples needed for a two-level index
a = [['a','a','a','b','b','c','c'],['1','2','3','1','2','2','3']]
t = list(zip(*a))
t
Out[15]: 
[('a', '1'),
 ('a', '2'),
 ('a', '3'),
 ('b', '1'),
 ('b', '2'),
 ('c', '2'),
 ('c', '3')]

# Build the MultiIndex from the tuples
index = pd.MultiIndex.from_tuples(t, names=['level1', 'level2'])
index
Out[18]: 
MultiIndex([('a', '1'),
            ('a', '2'),
            ('a', '3'),
            ('b', '1'),
            ('b', '2'),
            ('c', '2'),
            ('c', '3')],
           names=['level1', 'level2'])

# Create a Series with the MultiIndex
s = pd.Series(np.random.rand(7), index=index)
s
Out[20]: 
level1  level2
a       1         0.442665
        2         0.864886
        3         0.563471
b       1         0.301778
        2         0.387837
c       2         0.190012
        3         0.901903
dtype: float64

# Select by label slice (works because the MultiIndex is sorted)
s['b':'c']
Out[22]: 
level1  level2
b       1         0.301778
        2         0.387837
c       2         0.190012
        3         0.901903
dtype: float64

s[['a','c']]
Out[23]: 
level1  level2
a       1         0.442665
        2         0.864886
        3         0.563471
c       2         0.190012
        3         0.901903
dtype: float64

# Partial indexing: fix the inner level, select across the outer
s[:, '2']
Out[28]: 
level1
a    0.864886
b    0.387837
c    0.190012
dtype: float64
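The xs() method expresses the same cross-section more explicitly, using the level name defined above; a brief sketch:

# Equivalent cross-section via the level name
s.xs('2', level='level2')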

Multi-level indexing on a DataFrame


df = pd.DataFrame(np.random.randint(1, 10, (4, 3)), 
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]], 
                  columns=[['one', 'one', 'two'], ['blue', 'red', 'blue']])
df.index.names = ['row-1', 'row-2']
df.columns.names = ['col-1', 'col-2']
df
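The output of this hierarchical frame depends on the random draw. The reindex example that follows switches to a flat DataFrame with row index A/D/F/H and columns one through six; its construction is not shown in the source, but a plausible sketch, assuming standard-normal values, is:

df = pd.DataFrame(np.random.randn(4, 6),
                  index=list('ADFH'),
                  columns=['one', 'two', 'three', 'four', 'five', 'six'])
df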
        one       two     three      four      five       six
A -0.049437 -0.526499  1.780662  1.154747  2.434957 -1.579278
D -0.075226  0.552163 -0.462732 -0.936051 -0.590041  0.484505
F  1.486168  0.725907  0.598127 -0.704809 -2.815687 -0.062462
H -0.900819 -0.177751 -0.232796  0.234088 -1.758574  1.255955
df2 = df.reindex(index=list('ABCDEFGH'))
df2
        one       two     three      four      five       six
A -0.049437 -0.526499  1.780662  1.154747  2.434957 -1.579278
B       NaN       NaN       NaN       NaN       NaN       NaN
C       NaN       NaN       NaN       NaN       NaN       NaN
D -0.075226  0.552163 -0.462732 -0.936051 -0.590041  0.484505
E       NaN       NaN       NaN       NaN       NaN       NaN
F  1.486168  0.725907  0.598127 -0.704809 -2.815687 -0.062462
G       NaN       NaN       NaN       NaN       NaN       NaN
H -0.900819 -0.177751 -0.232796  0.234088 -1.758574  1.255955
df.loc['A', 'one'] = 100   # use a single .loc call; chained indexing (df.loc['A']['one']) may assign to a copy
df
          one       two     three      four      five       six
A  100.000000 -0.526499  1.780662  1.154747  2.434957 -1.579278
D   -0.075226  0.552163 -0.462732 -0.936051 -0.590041  0.484505
F    1.486168  0.725907  0.598127 -0.704809 -2.815687 -0.062462
H   -0.900819 -0.177751 -0.232796  0.234088 -1.758574  1.255955

Grouped computation

Grouped computation follows a three-step pattern: split -> apply -> combine.

Split: what do we group by?
Apply: what do we compute within each group?
Combine: how do the per-group results come back together? (A minimal sketch of the three steps follows.)
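As a toy illustration of all three steps on a Series (the data here is made up):

s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'a', 'b'])
s.groupby(level=0).sum()   # split into {a: [1, 3], b: [2, 4]}, apply sum, combine -> a 4, b 6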

df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                  'key2': ['one', 'two', 'one', 'two', 'one'],
                  'data1': np.random.randint(1, 10, 5),
                  'data2': np.random.randint(1, 10, 5)})
df
   data1  data2 key1 key2
0      1      6    a  one
1      5      9    a  two
2      4      7    b  one
3      3      7    b  two
4      3      5    a  one
grouped = df['data1'].groupby(df['key1'])
grouped.mean()

key1
a    3.0
b    3.5
Name: data1, dtype: float64

df['data1'].groupby([df['key1'], df['key2']]).mean()

key1  key2
a     one     2.0
      two     5.0
b     one     4.0
      two     3.0
Name: data1, dtype: float64

df.groupby('key1').mean(numeric_only=True)   # skip the non-numeric key2 column


      data1     data2
key1                 
a       3.0  6.666667
b       3.5  7.000000

means = df.groupby(['key1', 'key2']).mean()['data1']
means

key1  key2
a     one     2.0
      two     5.0
b     one     4.0
      two     3.0
Name: data1, dtype: float64

means.unstack()


key2  one  two
key1          
a     2.0  5.0
b     4.0  3.0
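The same table comes straight from pivot_table, whose default aggregation function is the mean; a sketch:

df.pivot_table('data1', index='key1', columns='key2')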

for name, group in df.groupby('key1'):
    print(name)
    print(group)
    
a
   data1  data2 key1 key2
0      1      6    a  one
1      5      9    a  two
4      3      5    a  one
b
   data1  data2 key1 key2
2      4      7    b  one
3      3      7    b  two

# Convert the groups to a dict
d = dict(list(df.groupby('key1')))
d

{'a':    data1  data2 key1 key2
 0      1      6    a  one
 1      5      9    a  two
 4      3      5    a  one, 'b':    data1  data2 key1 key2
 2      4      7    b  one
 3      3      7    b  two}

d['a']

	data1	data2	key1	key2
0	1	6	a	one
1	5	9	a	two
4	3	5	a	one

Grouping by columns

df.dtypes

data1     int32
data2     int32
key1     object
key2     object
dtype: object

# Group the columns (axis=1) by their dtype
grouped = df.groupby(df.dtypes, axis=1)
dict(list(grouped))

{dtype('int32'):    data1  data2
 0      1      6
 1      5      9
 2      4      7
 3      3      7
 4      3      5, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}
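Recent pandas releases deprecate axis=1 grouping; a sketch that splits the columns by dtype without it (the 'int32' key is platform-dependent and may be 'int64' elsewhere):

parts = {str(dt): df.loc[:, df.dtypes == dt] for dt in df.dtypes.unique()}
parts['int32']   # the integer columns data1 and data2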

Grouping with a dict

df = pd.DataFrame(np.random.randint(1, 10, (5, 5)), 
                  columns=['a', 'b', 'c', 'd', 'e'], 
                  index=['Alice', 'Bob', 'Candy', 'Dark', 'Emily'])
df

       a  b  c  d  e
Alice  7  2  7  9  9
Bob    8  7  7  6  8
Candy  4  2  6  6  5
Dark   6  3  9  4  8
Emily  8  4  9  6  5

df.iloc[1, 1:3] = np.nan   # .ix has been removed; use .iloc for positional assignment
df


       a    b    c  d  e
Alice  7  2.0  7.0  9  9
Bob    8  NaN  NaN  6  8
Candy  4  2.0  6.0  6  5
Dark   6  3.0  9.0  4  8
Emily  8  4.0  9.0  6  5

# Map each column to a group name
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'orange', 'e': 'blue'}
grouped = df.groupby(mapping, axis=1)
grouped.sum()

       blue  orange  red
Alice    16       9    9
Bob       8       6    8
Candy    11       6    6
Dark     17       4    9
Emily    14       6   12

# count() counts only non-NaN cells, so Bob's counts drop to 1; size() below ignores NaN
grouped.count()

       blue  orange  red
Alice     2       1    2
Bob       1       1    1
Candy     2       1    2
Dark      2       1    2
Emily     2       1    2

grouped.size()

blue      2
orange    1
red       2
dtype: int64

Grouping with a function

When a function is used as the grouping key, it is called once for every index value (the row index by default, or the column index with axis=1); its return value becomes the group key, so indexes with the same return value land in the same group.

df = pd.DataFrame(np.random.randint(1, 10, (5, 5)), 
                  columns=['a', 'b', 'c', 'd', 'e'], 
                  index=['Alice', 'Bob', 'Candy', 'Dark', 'Emily'])
df


Out[23]:
       a  b  c  d  e
Alice  7  9  1  9  1
Bob    6  6  7  1  5
Candy  7  8  5  3  8
Dark   3  4  6  8  1
Emily  1  2  2  1  2

def _dummy_group(idx):
    print(idx)
    return idx
    
df.groupby(_dummy_group)

Alice
Bob
Candy
Dark
Emily
Out[24]:
<pandas.core.groupby.DataFrameGroupBy object at 0x07525650>
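Any callable works the same way. For instance, grouping the rows by the length of their name, a sketch using the built-in len ('Bob' has 3 letters, 'Dark' 4, the rest 5):

df.groupby(len).sum()   # group keys become 3, 4 and 5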

Grouping multi-level indexed data by index level

columns = pd.MultiIndex.from_arrays([['China', 'USA', 'China', 'USA', 'China'],
                                     ['A', 'A', 'B', 'C', 'B']], names=['country', 'index'])
df = pd.DataFrame(np.random.randint(1, 10, (5, 5)), columns=columns)
df

country China USA China USA China
index       A   A     B   C     B
0           9   6     9   6     2
1           5   6     1   8     7
2           2   5     4   5     2
3           4   8     9   4     9
4           7   2     9   1     8

# Rows are grouped by default; specify axis=1 to group the columns by level
df.groupby(level='country', axis=1).count()


country  China  USA
0            3    2
1            3    2
2            3    2
3            3    2
4            3    2

df.groupby(level='country', axis=1).sum()

country  China  USA
0           20   12
1           13   14
2            8   10
3           22   12
4           24    3

Aggregation

A grouped computation first splits the data by some rule and then aggregates each piece; mean(), sum() and the like seen above are examples of aggregation. During aggregation, the data belonging to each group key is passed to the aggregate function in turn, and the results are finally combined into the overall output.

Besides the built-in sum(), min(), max(), mean(), and so on, you can supply your own aggregate functions through agg() or aggregate().

df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                  'key2': ['one', 'two', 'one', 'two', 'one'],
                  'data1': np.random.randint(1, 10, 5),
                  'data2': np.random.randint(1, 10, 5)})
df

   data1  data2 key1 key2
0      9      3    a  one
1      3      8    a  two
2      9      5    b  one
3      8      5    b  two
4      9      2    a  one



def peak_verbose(s):
    print(type(s))
    return s.max() - s.min()

def peak(s):
    return s.max() - s.min()

grouped = df.groupby('key1')

grouped.agg(peak_verbose)

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
Out[38]:
      data1  data2
key1              
a         6      6
b         1      0

# Apply several aggregate functions at once
grouped[['data1', 'data2']].agg(['mean', 'std', peak])

     data1                data2                  
      mean       std peak      mean      std peak
key1                                             
a      7.0  3.464102    6  4.333333  3.21455    6
b      8.5  0.707107    1  5.000000  0.00000    0

# Name the aggregated columns with (name, function) tuples
grouped['data1'].agg([('average', 'mean'), ('max-range', peak)])

      average  max-range
key1                    
a         7.0          6
b         8.5          1
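pandas 0.25 and later also accept keyword-style named aggregation, which reads more cleanly; a sketch (keyword names must be valid identifiers, hence max_range rather than max-range):

grouped['data1'].agg(average='mean', max_range=peak)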


Applying different aggregate functions to different columns

Pass a dict mapping each column to its function(s):

d = {'data1': ['mean', peak, 'max', 'min'],
     'data2': 'sum'}
grouped.agg(d)
     data1               data2
      mean peak max min    sum
key1                          
a      7.0    6   9   3     13
b      8.5    1   9   8     10

Group-wise computation and transformation


The aggregations above are a special case of grouped computation; the general pattern is still "split - apply - combine". Here transform() and apply() implement the general case.

df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                  'key2': ['one', 'two', 'one', 'two', 'one'],
                  'data1': np.random.randint(1, 10, 5),
                  'data2': np.random.randint(1, 10, 5)})
df

Out[44]:
   data1  data2 key1 key2
0      4      9    a  one
1      5      2    a  two
2      1      9    b  one
3      3      9    b  two
4      1      8    a  one

# Attach to every row of df the mean of its key1 group
k1_mean = df.groupby('key1').mean(numeric_only=True).add_prefix('mean_')
k1_mean


      mean_data1  mean_data2
key1                        
a       3.333333    6.333333
b       2.000000    9.000000

pd.merge(df, k1_mean, left_on='key1', right_index=True)

   data1  data2 key1 key2  mean_data1  mean_data2
0      4      9    a  one    3.333333    6.333333
1      5      2    a  two    3.333333    6.333333
4      1      8    a  one    3.333333    6.333333
2      1      9    b  one    2.000000    9.000000
3      3      9    b  two    2.000000    9.000000

# transform does this in one step, broadcasting each group's mean back to its rows
k1_mean = df.groupby('key1')[['data1', 'data2']].transform('mean').add_prefix('mean_')
k1_mean

   mean_data1  mean_data2
0    3.333333    6.333333
1    3.333333    6.333333
2    2.000000    9.000000
3    2.000000    9.000000
4    3.333333    6.333333

df[k1_mean.columns] = k1_mean
df

   data1  data2 key1 key2  mean_data1  mean_data2
0      4      9    a  one    3.333333    6.333333
1      5      2    a  two    3.333333    6.333333
2      1      9    b  one    2.000000    9.000000
3      3      9    b  two    2.000000    9.000000
4      1      8    a  one    3.333333    6.333333

Demeaning

That is, subtracting the group mean from each value.

df = pd.DataFrame(np.random.randint(1, 10, (5, 5)), 
                  columns=['a', 'b', 'c', 'd', 'e'], 
                  index=['Alice', 'Bob', 'Candy', 'Dark', 'Emily'])
df

       a  b  c  d  e
Alice  4  8  1  7  6
Bob    4  4  4  9  7
Candy  6  2  2  4  6
Dark   4  2  1  4  5
Emily  4  3  4  2  4


def demean(s):
    return s - s.mean()

key = ['one', 'one', 'two', 'one', 'two']
demeaned = df.groupby(key).transform(demean)
demeaned

       a         b  c         d  e
Alice  0  3.333333 -1  0.333333  0
Bob    0 -0.666667  2  2.333333  1
Candy  1 -0.500000 -1  1.000000  1
Dark   0 -2.666667 -1 -2.666667 -1
Emily -1  0.500000  1 -1.000000 -1

demeaned.groupby(key).mean()   # the group means are now zero, up to floating-point error

     a             b  c             d  e
one  0 -2.960595e-16  0 -2.960595e-16  0
two  0  0.000000e+00  0  0.000000e+00  0

The apply function

We saw earlier that a DataFrame's apply processes data row by row or column by column; GroupBy's apply runs a computation on each group and stitches the results together.

df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a', 'a', 'a', 'b', 'b', 'a'],
                  'key2': ['one', 'two', 'one', 'two', 'one', 'one', 'two', 'one', 'two', 'one'],
                  'data1': np.random.randint(1, 10, 10),
                  'data2': np.random.randint(1, 10, 10)})
df


   data1  data2 key1 key2
0      3      9    a  one
1      5      9    a  two
2      7      4    b  one
3      7      6    b  two
4      9      7    a  one
5      3      7    a  one
6      3      3    a  two
7      4      5    b  one
8      8      2    b  two
9      7      4    a  one



# Sort by the given column and return the n largest rows
def top(df, n=2, column='data1'):
    return df.sort_values(by=column, ascending=False)[:n]

top(df, n=5)


   data1  data2 key1 key2
4      9      7    a  one
8      8      2    b  two
2      7      4    b  one
3      7      6    b  two
9      7      4    a  one

df.groupby('key1').apply(top)

        data1  data2 key1 key2
key1                          
a    4      9      7    a  one
     9      7      4    a  one
b    8      8      2    b  two
     2      7      4    b  one

# Extra keyword arguments are passed through to the applied function
df.groupby('key1').apply(top, n=3, column='data2')

        data1  data2 key1 key2
key1                          
a    0      3      9    a  one
     1      5      9    a  two
     4      9      7    a  one
b    3      7      6    b  two
     7      4      5    b  one
     2      7      4    b  one


# Suppress the group keys in the result index
df.groupby('key1', group_keys=False).apply(top)

   data1  data2 key1 key2
4      9      7    a  one
9      7      4    a  one
8      8      2    b  two
2      7      4    b  one

Loading data into pandas

  • Indexing: which column or columns form the DataFrame, and whether the index and column names come from the file
  • Type inference and data conversion: including user-defined conversions and missing-value markers
  • Date parsing (see the sketch right after this list)
  • Iteration: chunk-by-chunk iteration over large files — the biggest difference between pandas and Python's built-in csv module
  • Messy data: skipping rows, handling comments, and so on
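Date parsing is the one feature above not demonstrated later; a minimal sketch (the file name and the 'date' column are hypothetical):

df = pd.read_csv('data/ts.csv', parse_dates=['date'])                  # parse a column as datetime64
df = pd.read_csv('data/ts.csv', index_col='date', parse_dates=True)   # or build a DatetimeIndex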

Reading a CSV file with these contents:

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

df = pd.read_csv('data/ex1.csv')
df
# The column index comes from the file's first row; the row index is assigned by pandas


   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo

# read_table does the same with an explicit separator
df = pd.read_table('data/ex1.csv', sep=',')

# Do not treat the first row as the column index
pd.read_csv('data/ex1.csv', header=None)

# Supply the column names yourself
pd.read_csv('data/ex1.csv', header=None, names=['a', 'b', 'c', 'd', 'msg'])

# Use one of the columns as the row index
pd.read_csv('data/ex2.csv', header=None, names=['a', 'b', 'c', 'd', 'msg'], index_col='msg')

       a   b   c   d
msg                 
hello  1   2   3   4
world  5   6   7   8
foo    9  10  11  12

# A multi-level row index
pd.read_csv('data/ex2.csv', header=None, names=['a', 'b', 'c', 'd', 'msg'], index_col=['msg', 'a'])

         b   c   d
msg   a           
hello 1  2   3   4
world 5  6   7   8
foo   9 10  11  12

Handling irregular delimiters

The data looks like this:

            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491

The fields are separated by a varying number of spaces, so a regular expression serves as the separator:

# Regular-expression separator
pd.read_table('data/ex3.csv', sep=r'\s+')
# pd.read_table('data/ex3.csv', sep=' ')   # would mis-parse: matches only single spaces
# pd.read_csv('data/ex3.csv')              # would mis-parse: expects commas

            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491

Note that pandas automatically uses the first column as the row index, because the header row has one fewer field than the data rows.

Handling missing values

The data:

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo

  • pandas automatically turns empty fields and NA markers into NaN
pd.read_csv('data/ex5.csv')

  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo

  • Specify which values should be recognised as NaN
pd.read_csv('data/ex5.csv', na_values=['NA', 'NULL', 'foo'])

  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     NaN

  • Specify different missing-value markers per column, via a dict
pd.read_csv('data/ex5.csv', na_values={'message': ['foo', 'NA'], 'something': ['two']})

  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       NaN  5   6   NaN   8   world
2     three  9  10  11.0  12     NaN

Reading data in chunks

The file has 10,000 rows; read only the first 10 to inspect its shape:

pd.read_csv('data/ex6.csv', nrows=10)

        one       two     three      four key
0  0.467976 -0.038649 -0.295344 -1.824726   L
1 -0.358893  1.404453  0.704965 -0.200638   B
2 -0.501840  0.659254 -0.421691 -0.057688   G
3  0.204886  1.074134  1.388361 -0.982404   R
4  0.354628 -0.133116  0.283763 -0.837063   Q
5  1.817480  0.742273  0.419395 -2.251035   Q
6 -0.776764  0.935518 -0.332872 -1.875641   U
7 -0.913135  1.530624 -0.572657  0.477252   K
8  0.358480 -0.497572 -0.367016  0.507702   S
9 -1.740877 -1.160417 -1.637830  2.172201   G

Count how many times each key appears, reading the file in chunks of 1,000 rows:

tr = pd.read_csv('data/ex6.csv', chunksize=1000)

key_count = pd.Series(dtype='float64')   # an empty Series needs an explicit dtype
for pieces in tr:
    key_count = key_count.add(pieces['key'].value_counts(), fill_value=0)
key_count = key_count.sort_values(ascending=False)
key_count[:10]

E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64

Saving data to disk

df.to_csv('data/ex5_out.csv')

df = pd.read_csv('data/ex5_out.csv')
df

   Unnamed: 0 something  a   b     c   d message
0           0       one  1   2   3.0   4     NaN
1           1       two  5   6   NaN   8   world
2           2     three  9  10  11.0  12     foo

# Written naively, the row index becomes a data column and is read back in as one

# Skip the index
df = pd.read_csv('data/ex5.csv')
df.to_csv('data/ex5_out.csv', index=False)

# Skip the column names
df = pd.read_csv('data/ex5.csv')
df.to_csv('data/ex5_out.csv', index=False, header=False)

# Use a custom delimiter
df = pd.read_csv('data/ex5.csv')
df.to_csv('data/ex5_out.csv', index=False, sep='|')

# Write only a subset of the columns
df = pd.read_csv('data/ex5.csv')
df.to_csv('data/ex5_out.csv', index=False, columns=['a', 'b', 'message'])

Binary formats

The advantages of binary formats are small size and fast reads. The drawback is possible incompatibility between versions: after a pandas upgrade, binary data saved by an earlier version may no longer load correctly.

df = pd.read_csv('data/ex1.csv')
df

   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo

pd.to_pickle(df, 'data/ex1_pickle.bin')

pd.read_pickle('data/ex1_pickle.bin')


   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo

pd.to_pickle(pd.read_csv('data/ex6.csv'), 'data/ex6_pickle.bin')

Other formats

  • HDF5: a C library for efficient binary storage of scientific data on disk
  • Excel files: pd.read_excel / pd.ExcelFile / pd.ExcelWriter
  • JSON: convert to a dict via the json module, then to a DataFrame (see the sketch after this list)
  • SQL databases: read via the pd.io.sql module
  • NoSQL (e.g. MongoDB): use the corresponding driver, such as pymongo, read the data out through a cursor, and convert it to a DataFrame
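A minimal sketch of the JSON route described above (the records are invented for illustration):

import json

raw = '[{"a": 1, "b": 2}, {"a": 3, "b": 4}]'
records = json.loads(raw)   # JSON text -> list of dicts
pd.DataFrame(records)       # list of dicts -> DataFrame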