import pandas as pd
import numpy as np
Series和DataFrame提供了豐富的下標存取方法,可以直接使用[],也可以使用如下形式:
.loc[]、.iloc[]、.at[]、.iat[]以及.ix[](其中.ix[]已被棄用,應改用.loc[]或.iloc[])
下標存取
# Seed NumPy's global generator so the random integers below are reproducible.
np.random.seed(42)

# 5x3 frame of random integers drawn from [0, 10), with string labels on
# both the rows (r1..r5) and the columns (c1..c3).
df = pd.DataFrame(
    data=np.random.randint(0, 10, size=(5, 3)),
    index=["r1", "r2", "r3", "r4", "r5"],
    columns=["c1", "c2", "c3"],
)
[]操作符
# Slicing a DataFrame with [] selects rows: an integer slice works by
# position, a label slice works by index label.  Evaluate each slice once
# and reuse it for both the display and the type check.
positional_slice = df[2:4]
label_slice = df["r2":"r4"]

print(df)
print("df的類型爲;{}".format(type(df)))
print("*" * 50)

print(positional_slice)
print("df[2:4]的類型爲;{}".format(type(positional_slice)))
print("*" * 50)

print(label_slice)
print('df["r2":"r4"]的類型爲;{}'.format(type(label_slice)))
c1 c2 c3
r1 6 3 7
r2 4 6 9
r3 2 6 7
r4 4 3 7
r5 7 2 5
df的類型爲;<class 'pandas.core.frame.DataFrame'>
**************************************************
c1 c2 c3
r3 2 6 7
r4 4 3 7
df[2:4]的類型爲;<class 'pandas.core.frame.DataFrame'>
**************************************************
c1 c2 c3
r2 4 6 9
r3 2 6 7
r4 4 3 7
df["r2":"r4"]的類型爲;<class 'pandas.core.frame.DataFrame'>
# Boolean-mask row selection: keep only the rows whose c1 value exceeds 4.
c1_above_4 = df["c1"] > 4
df[c1_above_4]
print("df[df.c1>4]的類型:{}".format(type(df[c1_above_4])))
df[df.c1>4]的類型:<class 'pandas.core.frame.DataFrame'>
# Element-wise mask: the frame keeps its shape, and every entry that is not
# greater than 2 is displayed as NaN.
df[df > 2]
     c1   c2  c3
r1    6    3   7
r2    4    6   9
r3  nan    6   7
r4    4    3   7
r5    7  nan   5
# Same element-wise mask with >=; every value in df is at least 2, so
# nothing is masked out.
df[df >= 2]
    c1  c2  c3
r1   6   3   7
r2   4   6   9
r3   2   6   7
r4   4   3   7
r5   7   2   5
.loc[]和.iloc[]存取器
# .loc with a single row label returns that row as a Series.
df.loc["r2"]
c1 4
c2 6
c3 9
Name: r2, dtype: int32
# .loc with a row label and a column label returns the single value there.
df.loc["r2", "c2"]
6
# .loc with a list of row labels returns those rows as a DataFrame.
df.loc[["r2", "r3"]]
    c1  c2  c3
r2   4   6   9
r3   2   6   7
# Select rows r2 and r3 restricted to columns c1 and c2, all by label.
df.loc[["r2", "r3"], ["c1", "c2"]]
# A label slice includes both endpoints: rows r2 through r4, columns c2 and c3.
df.loc["r2":"r4", ["c2", "c3"]]
    c2  c3
r2   6   9
r3   6   7
r4   3   7
# .loc also accepts a boolean Series as the row selector: keep the rows
# where c1 > 2 and show only columns c1 and c2.
df.loc[df.c1 > 2, ["c1", "c2"]]
    c1  c2
r1   6   3
r2   4   6
r4   4   3
r5   7   2
# .iloc is purely positional: position 2 is the third row (label r3).
df.iloc[2]
c1 2
c2 6
c3 7
Name: r3, dtype: int32
# .iloc with a list of positions returns those rows (here r3 and r5).
df.iloc[[2, 4]]
    c1  c2  c3
r3   2   6   7
r5   7   2   5
# Rows at positions 1 and 3, columns at positions 0 and 2.
df.iloc[[1, 3], [0, 2]]
# Positional row slice 2:4 combined with columns at positions 0 and 2.
df.iloc[2:4, [0, 2]]
# .values hands .iloc a plain NumPy boolean array as the row mask
# instead of a labelled boolean Series.
df.iloc[df.c1.values > 2, [0, 1]]
    c1  c2
r1   6   3
r2   4   6
r4   4   3
r5   7   2
D:\installation\anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
"""Entry point for launching an IPython kernel.
D:\installation\anaconda3\lib\site-packages\pandas\core\indexing.py:822: FutureWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
retval = getattr(retval, self.name)._getitem_axis(key, axis=i)
獲取單個值
.at[]和.iat[]分別使用標籤和整數下標獲取單個值。此外,get_value()和.at[]類似,但執行速度更快;注意get_value()自pandas 0.21起已被棄用,並在1.0中移除,新代碼應使用.at[]。
# Display the frame again for reference before the single-value lookups.
df
    c1  c2  c3
r1   6   3   7
r2   4   6   9
r3   2   6   7
r4   4   3   7
r5   7   2   5
# .at fetches a single value by row label and column label.
df.at["r2", "c2"]
6
# .iat fetches a single value by integer row and column position.
df.iat[1, 1]
6
如果希望獲取兩個列表中每一對標籤所對應的元素,可以使用.lookup(),結果返回一個由指定元素組成的數組。(注意:.lookup()在pandas 1.2中被棄用,並在2.0中移除。)
# Fetch the value at each (row label, column label) pair:
# (r2, c1), (r4, c2), (r3, c1).
# DataFrame.lookup() was deprecated in pandas 1.2 and removed in 2.0, so the
# labels are translated to integer positions with get_indexer() and the
# underlying array is indexed pairwise instead.  The result is a NumPy array
# with one element per label pair.
# NOTE: get_indexer() returns -1 for a label that does not exist (which would
# silently pick the last row/column); all labels used here are present.
df.values[
    df.index.get_indexer(["r2", "r4", "r3"]),
    df.columns.get_indexer(["c1", "c2", "c1"]),
]
array([4, 3, 2])
# Two-level index built from the Cartesian product of the class1 labels
# (A, B, C) and the class2 labels (x, y): six entries in total.
midx = pd.MultiIndex.from_product(
    [["A", "B", "C"], ["x", "y"]],
    names=["class1", "class2"],
)

# 6x6 frame of random integers from [0, 10) that uses the same MultiIndex
# for both its rows and its columns.
df12 = pd.DataFrame(
    data=np.random.randint(0, 10, size=(6, 6)),
    index=midx,
    columns=midx,
)
df12
class1         A     B     C
class2         x  y  x  y  x  y
class1 class2
A      x       0  3  1  7  3  1
       y       5  5  9  3  5  1
B      x       9  1  9  3  7  6
       y       8  7  4  1  4  7
C      x       9  8  8  0  8  6
       y       8  7  0  7  7  2
# A frame with MultiIndex rows and columns is still an ordinary DataFrame.
print(type(df12))
<class 'pandas.core.frame.DataFrame'>
多級標籤的存取
# Load the soil samples.  The first two CSV columns become the row
# MultiIndex and the "Date" column is parsed into datetimes.
soil_df = pd.read_csv(
    "./data/Soils-simple.csv",
    index_col=[0, 1],
    parse_dates=["Date"],
)
soil_df
                   pH  Dens   Ca  Conduc        Date   Name
Depth Contour
0-10  Depression  5.4  0.98   11     1.5  2015-05-26   Lois
      Slope       5.5   1.1   12       2  2015-04-30    Roy
      Top         5.3     1   13     1.4  2015-05-21    Roy
10-30 Depression  4.9   1.4  7.5     5.5  2015-03-21   Lois
      Slope       5.3   1.3  9.5     4.9  2015-02-06  Diana
      Top         4.8   1.3   10     3.6  2015-04-11  Diana
# A single label selects on the outer index level: all rows of the "10-30"
# group, restricted to the pH and Ca columns.
soil_df.loc["10-30", ["pH", "Ca"]]
             pH   Ca
Contour
Depression  4.9  7.5
Slope       5.3  9.5
Top         4.8   10
# Select on the inner index level: slice(None) takes every value of the
# outer level and "Top" picks the inner label.  The tuple written out here
# is the same object that np.s_[:, "Top"] builds.
soil_df.loc[(slice(None), "Top"), ["pH", "Ca"]]
                pH  Ca
Depth Contour
0-10  Top      5.3  13
10-30 Top      4.8  10
query()方法
有時候需要根據一定的條件對行進行過濾,通常需要先創建一個布爾數組,再使用這個數組獲取True值所對應的行。示例如下:
soil_df[(soil_df.pH > 5) & (soil_df.Ca < 11)] 這個是原始形式,使用query()可以寫成:
# query() takes the filter as an expression string; bare names in the
# expression refer to the columns of soil_df.
print(soil_df.query("pH > 5 and Ca < 11"))
pH Dens Ca Conduc Date Name
Depth Contour
0-10 Depression 5.4 0.98 11 1.5 2015-05-26 Lois
10-30 Slope 5.3 1.3 9.5 4.9 2015-02-06 Diana
query()的參數是一個運算表達式字符串,其中可以使用not、and、or等關鍵字進行向量布爾運算,表達式中的變量名代表與其對應的列。在變量名前加上@符號,可以引用全局變量或者局部變量的值,例如:
# Thresholds kept in ordinary Python variables; the "@" prefix inside the
# query string pulls their values into the expression.
pH_low, Ca_hi = 5, 11
print(soil_df.query("pH > @pH_low and Ca < @Ca_hi"))
pH Dens Ca Conduc Date Name
Depth Contour
0-10 Depression 5.4 0.98 11 1.5 2015-05-26 Lois
10-30 Slope 5.3 1.3 9.5 4.9 2015-02-06 Diana