點擊標題即可獲取文章源代碼和筆記
4.1.0 概要
Pandas
基礎處理
Pandas是什麼?爲什麼用?
核心數據結構
DataFrame
Panel
Series
基本操作
運算
畫圖
文件的讀取與存儲
高級處理
4. 1Pandas介紹
4.1 .1 Pandas介紹 - 數據處理工具
panel + data + analysis
panel面板數據 - 計量經濟學 三維數據
4.1 .2 爲什麼使用Pandas
便捷的數據處理能力
讀取文件方便
封裝了Matplotlib、Numpy的畫圖和計算
4.1 .3 DataFrame
結構:既有行索引,又有列索引的二維數組
屬性:
shape
index
columns
values
T
方法:
head( )
tail( )
3 DataFrame索引的設置
1 )修改行列索引值
2 )重設索引
3 )設置新索引
2 Panel
DataFrame的容器
3 Series
帶索引的一維數組
屬性
index
values
總結:
DataFrame是Series的容器
Panel是DataFrame的容器
4.2 基本數據操作
4.2 .1 索引操作
1 )直接索引
先列後行
2 )按名字索引
loc
3 )按數字索引
iloc
4 )組合索引
數字、名字
4.2 .3 排序
對內容排序
dataframe
series
對索引排序
dataframe
series
4.3 DataFrame運算
算術運算
邏輯運算
邏輯運算符
布爾索引
邏輯運算函數
query( )
isin( )
統計運算
min max mean median var std
np. argmax( )
np. argmin( )
自定義運算
apply ( func, axis= 0 ) True
func: 自定義函數
4.4 Pandas畫圖
sr. plot( )
4.5 文件讀取與存儲
4.5 .1 CSV
pd. read_csv( path)
usecols=
names=
dataframe. to_csv( path)
columns= [ ]
index= False
header= False
4.5 .2 HDF5
hdf5 存儲 3 維數據的文件
key1 dataframe1二維數據
key2 dataframe2二維數據
pd. read_hdf( path, key= )
df. to_hdf( path, key= )
4.5 .3 JSON
pd. read_json( path)
orient= "records"
lines= True
df. to_json( patn)
orient= "records"
lines= True
4.1.3 DataFrame
import numpy as np
stock_change = np. random. normal( 0 , 1 , ( 10 , 5 ) )
stock_change
array([[ 0.77072465, 1.30408183, -0.44043464, 0.8900768 , -0.80947118],
[ 0.92407994, 0.01646795, -1.26614793, 1.52393669, -0.85373051],
[-1.68378051, 0.4302981 , 0.8069393 , 0.60557427, -0.03960376],
[ 0.75708007, -0.39899325, 0.23027082, -0.89585658, -1.86590247],
[-0.41516245, -1.31841546, 0.16256478, -0.67449097, -1.26234013],
[-0.27687242, -0.74154521, -0.03755446, 1.24182603, -0.79444361],
[-0.2549323 , -0.41034663, -1.85076521, -1.28663451, -0.28566877],
[ 1.22453612, -1.60200055, -1.83171522, -0.85322799, -1.70950421],
[ 2.00461483, 1.49338564, 0.33928513, -0.1776084 , -0.39698965],
[ 0.2184662 , -0.03868143, -0.21432675, 0.00604093, 1.35011139]])
import pandas as pd
pd. DataFrame( stock_change)
0
1
2
3
4
0
0.770725
1.304082
-0.440435
0.890077
-0.809471
1
0.924080
0.016468
-1.266148
1.523937
-0.853731
2
-1.683781
0.430298
0.806939
0.605574
-0.039604
3
0.757080
-0.398993
0.230271
-0.895857
-1.865902
4
-0.415162
-1.318415
0.162565
-0.674491
-1.262340
5
-0.276872
-0.741545
-0.037554
1.241826
-0.794444
6
-0.254932
-0.410347
-1.850765
-1.286635
-0.285669
7
1.224536
-1.602001
-1.831715
-0.853228
-1.709504
8
2.004615
1.493386
0.339285
-0.177608
-0.396990
9
0.218466
-0.038681
-0.214327
0.006041
1.350111
stock_code = [ '股票' + str ( i) for i in range ( stock_change. shape[ 0 ] ) ]
stock_code
['股票0', '股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9']
data = pd. DataFrame( stock_change, index= stock_code)
data
0
1
2
3
4
股票0
0.770725
1.304082
-0.440435
0.890077
-0.809471
股票1
0.924080
0.016468
-1.266148
1.523937
-0.853731
股票2
-1.683781
0.430298
0.806939
0.605574
-0.039604
股票3
0.757080
-0.398993
0.230271
-0.895857
-1.865902
股票4
-0.415162
-1.318415
0.162565
-0.674491
-1.262340
股票5
-0.276872
-0.741545
-0.037554
1.241826
-0.794444
股票6
-0.254932
-0.410347
-1.850765
-1.286635
-0.285669
股票7
1.224536
-1.602001
-1.831715
-0.853228
-1.709504
股票8
2.004615
1.493386
0.339285
-0.177608
-0.396990
股票9
0.218466
-0.038681
-0.214327
0.006041
1.350111
date = pd. date_range( start= "20200618" , periods= 5 , freq= "B" )
date
DatetimeIndex(['2020-06-18', '2020-06-19', '2020-06-22', '2020-06-23',
'2020-06-24'],
dtype='datetime64[ns]', freq='B')
data = pd. DataFrame( stock_change, index= stock_code, columns= date)
data
2020-06-18
2020-06-19
2020-06-22
2020-06-23
2020-06-24
股票0
0.770725
1.304082
-0.440435
0.890077
-0.809471
股票1
0.924080
0.016468
-1.266148
1.523937
-0.853731
股票2
-1.683781
0.430298
0.806939
0.605574
-0.039604
股票3
0.757080
-0.398993
0.230271
-0.895857
-1.865902
股票4
-0.415162
-1.318415
0.162565
-0.674491
-1.262340
股票5
-0.276872
-0.741545
-0.037554
1.241826
-0.794444
股票6
-0.254932
-0.410347
-1.850765
-1.286635
-0.285669
股票7
1.224536
-1.602001
-1.831715
-0.853228
-1.709504
股票8
2.004615
1.493386
0.339285
-0.177608
-0.396990
股票9
0.218466
-0.038681
-0.214327
0.006041
1.350111
DataFrame屬性
data. shape
(10, 5)
data. index
Index(['股票0', '股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9'], dtype='object')
data. columns
DatetimeIndex(['2020-06-18', '2020-06-19', '2020-06-22', '2020-06-23',
'2020-06-24'],
dtype='datetime64[ns]', freq='B')
data. values
array([[ 0.77072465, 1.30408183, -0.44043464, 0.8900768 , -0.80947118],
[ 0.92407994, 0.01646795, -1.26614793, 1.52393669, -0.85373051],
[-1.68378051, 0.4302981 , 0.8069393 , 0.60557427, -0.03960376],
[ 0.75708007, -0.39899325, 0.23027082, -0.89585658, -1.86590247],
[-0.41516245, -1.31841546, 0.16256478, -0.67449097, -1.26234013],
[-0.27687242, -0.74154521, -0.03755446, 1.24182603, -0.79444361],
[-0.2549323 , -0.41034663, -1.85076521, -1.28663451, -0.28566877],
[ 1.22453612, -1.60200055, -1.83171522, -0.85322799, -1.70950421],
[ 2.00461483, 1.49338564, 0.33928513, -0.1776084 , -0.39698965],
[ 0.2184662 , -0.03868143, -0.21432675, 0.00604093, 1.35011139]])
data. T
股票0
股票1
股票2
股票3
股票4
股票5
股票6
股票7
股票8
股票9
2020-06-18
0.770725
0.924080
-1.683781
0.757080
-0.415162
-0.276872
-0.254932
1.224536
2.004615
0.218466
2020-06-19
1.304082
0.016468
0.430298
-0.398993
-1.318415
-0.741545
-0.410347
-1.602001
1.493386
-0.038681
2020-06-22
-0.440435
-1.266148
0.806939
0.230271
0.162565
-0.037554
-1.850765
-1.831715
0.339285
-0.214327
2020-06-23
0.890077
1.523937
0.605574
-0.895857
-0.674491
1.241826
-1.286635
-0.853228
-0.177608
0.006041
2020-06-24
-0.809471
-0.853731
-0.039604
-1.865902
-1.262340
-0.794444
-0.285669
-1.709504
-0.396990
1.350111
DataFrame方法
data. head( )
2020-06-18
2020-06-19
2020-06-22
2020-06-23
2020-06-24
股票0
0.770725
1.304082
-0.440435
0.890077
-0.809471
股票1
0.924080
0.016468
-1.266148
1.523937
-0.853731
股票2
-1.683781
0.430298
0.806939
0.605574
-0.039604
股票3
0.757080
-0.398993
0.230271
-0.895857
-1.865902
股票4
-0.415162
-1.318415
0.162565
-0.674491
-1.262340
data. tail( )
2020-06-18
2020-06-19
2020-06-22
2020-06-23
2020-06-24
股票5
-0.276872
-0.741545
-0.037554
1.241826
-0.794444
股票6
-0.254932
-0.410347
-1.850765
-1.286635
-0.285669
股票7
1.224536
-1.602001
-1.831715
-0.853228
-1.709504
股票8
2.004615
1.493386
0.339285
-0.177608
-0.396990
股票9
0.218466
-0.038681
-0.214327
0.006041
1.350111
3 DataFrame索引的設置
data. index[ 2 ]
'股票2'
data. index[ 2 ] = "股票88"
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-19-9e95917cc4d9> in <module>
----> 1 data.index[2] = "股票88"
D:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in __setitem__(self, key, value)
3908
3909 def __setitem__(self, key, value):
-> 3910 raise TypeError("Index does not support mutable operations")
3911
3912 def __getitem__(self, key):
TypeError: Index does not support mutable operations
stock_ = [ "股票_{}" . format ( i) for i in range ( 10 ) ]
data. index = stock_
data. index
Index(['股票_0', '股票_1', '股票_2', '股票_3', '股票_4', '股票_5', '股票_6', '股票_7', '股票_8',
'股票_9'],
dtype='object')
重設索引
reset_index(drop=False)
設置新的下標索引
drop:默認爲False,不刪除原來索引,如果爲True,刪除原來的索引值
data. reset_index( )
index
2020-06-18 00:00:00
2020-06-19 00:00:00
2020-06-22 00:00:00
2020-06-23 00:00:00
2020-06-24 00:00:00
0
股票_0
0.770725
1.304082
-0.440435
0.890077
-0.809471
1
股票_1
0.924080
0.016468
-1.266148
1.523937
-0.853731
2
股票_2
-1.683781
0.430298
0.806939
0.605574
-0.039604
3
股票_3
0.757080
-0.398993
0.230271
-0.895857
-1.865902
4
股票_4
-0.415162
-1.318415
0.162565
-0.674491
-1.262340
5
股票_5
-0.276872
-0.741545
-0.037554
1.241826
-0.794444
6
股票_6
-0.254932
-0.410347
-1.850765
-1.286635
-0.285669
7
股票_7
1.224536
-1.602001
-1.831715
-0.853228
-1.709504
8
股票_8
2.004615
1.493386
0.339285
-0.177608
-0.396990
9
股票_9
0.218466
-0.038681
-0.214327
0.006041
1.350111
data. reset_index( drop= True )
2020-06-18
2020-06-19
2020-06-22
2020-06-23
2020-06-24
0
0.770725
1.304082
-0.440435
0.890077
-0.809471
1
0.924080
0.016468
-1.266148
1.523937
-0.853731
2
-1.683781
0.430298
0.806939
0.605574
-0.039604
3
0.757080
-0.398993
0.230271
-0.895857
-1.865902
4
-0.415162
-1.318415
0.162565
-0.674491
-1.262340
5
-0.276872
-0.741545
-0.037554
1.241826
-0.794444
6
-0.254932
-0.410347
-1.850765
-1.286635
-0.285669
7
1.224536
-1.602001
-1.831715
-0.853228
-1.709504
8
2.004615
1.493386
0.339285
-0.177608
-0.396990
9
0.218466
-0.038681
-0.214327
0.006041
1.350111
以某列值設置爲新的索引
set_index(keys,drop=True)
keys:列索引名或者列索引名稱的列表
drop:boolean,default True 當作新的索引,刪除原來的索引列
設置新索引案例
df = pd. DataFrame( {
'month' : [ 1 , 4 , 7 , 10 ] ,
'year' : [ 2012 , 2014 , 2013 , 2014 ] ,
'sale' : [ 55 , 40 , 84 , 31 ]
} )
df
month
year
sale
0
1
2012
55
1
4
2014
40
2
7
2013
84
3
10
2014
31
df. set_index( 'month' )
year
sale
month
1
2012
55
4
2014
40
7
2013
84
10
2014
31
new_df = df. set_index( [ 'year' , 'month' ] )
new_df
sale
year
month
2012
1
55
2014
4
40
2013
7
84
2014
10
31
new_df. index
MultiIndex([(2012, 1),
(2014, 4),
(2013, 7),
(2014, 10)],
names=['year', 'month'])
4.1.4 MultiIndex 與 Panel的關係
1 Multilndex多級或分層索引對象。
names: levels的名稱
levels:每個level的元組值
new_df. index. names
FrozenList(['year', 'month'])
new_df. index. levels
FrozenList([[2012, 2013, 2014], [1, 4, 7, 10]])
2 Panel
p = pd. Panel( )
p
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version
"""Entry point for launching an IPython kernel.
<pandas.__getattr__.<locals>.Panel at 0x203fd31ea08>
data
2020-06-18
2020-06-19
2020-06-22
2020-06-23
2020-06-24
股票_0
0.770725
1.304082
-0.440435
0.890077
-0.809471
股票_1
0.924080
0.016468
-1.266148
1.523937
-0.853731
股票_2
-1.683781
0.430298
0.806939
0.605574
-0.039604
股票_3
0.757080
-0.398993
0.230271
-0.895857
-1.865902
股票_4
-0.415162
-1.318415
0.162565
-0.674491
-1.262340
股票_5
-0.276872
-0.741545
-0.037554
1.241826
-0.794444
股票_6
-0.254932
-0.410347
-1.850765
-1.286635
-0.285669
股票_7
1.224536
-1.602001
-1.831715
-0.853228
-1.709504
股票_8
2.004615
1.493386
0.339285
-0.177608
-0.396990
股票_9
0.218466
-0.038681
-0.214327
0.006041
1.350111
Series
data. iloc[ 1 , : ]
2020-06-18 0.924080
2020-06-19 0.016468
2020-06-22 -1.266148
2020-06-23 1.523937
2020-06-24 -0.853731
Freq: B, Name: 股票_1, dtype: float64
type ( data. iloc[ 1 , : ] )
pandas.core.series.Series
屬性
data. iloc[ 1 , : ] . index
DatetimeIndex(['2020-06-18', '2020-06-19', '2020-06-22', '2020-06-23',
'2020-06-24'],
dtype='datetime64[ns]', freq='B')
data. iloc[ 1 , : ] . values
array([ 0.92407994, 0.01646795, -1.26614793, 1.52393669, -0.85373051])
1. 創建Series
通過已有數據創建
pd. Series( np. arange( 10 ) )
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int32
pd. Series( [ 6.7 , 5.6 , 3 , 10 , 2 ] , index= [ 1 , 2 , 3 , 4 , 5 ] )
1 6.7
2 5.6
3 3.0
4 10.0
5 2.0
dtype: float64
pd. Series( {
'red' : 100 ,
'blue' : 200 ,
'green' : 500 ,
'yellow' : 1000
} )
red 100
blue 200
green 500
yellow 1000
dtype: int64
總結
DataFrame 是 Series的容器
Panel 是 DataFrame的容器
4.2 基本數據操作
datas = pd. read_excel( "./datas/szfj_baoan.xls" )
datas
district
roomnum
hall
AREA
C_floor
floor_num
school
subway
per_price
0
baoan
3
2
89.3
middle
31
0
0
7.0773
1
baoan
4
2
127.0
high
31
0
0
6.9291
2
baoan
1
1
28.0
low
39
0
0
3.9286
3
baoan
1
1
28.0
middle
30
0
0
3.3568
4
baoan
2
2
78.0
middle
8
1
1
5.0769
...
...
...
...
...
...
...
...
...
...
1246
baoan
4
2
89.3
low
8
0
0
4.2553
1247
baoan
2
1
67.0
middle
30
0
0
3.8060
1248
baoan
2
2
67.4
middle
29
1
0
5.3412
1249
baoan
2
2
73.1
low
15
1
0
5.9508
1250
baoan
3
2
86.2
middle
32
0
1
4.5244
1251 rows × 9 columns
datas. columns
Index(['district', 'roomnum', 'hall', 'AREA', 'C_floor', 'floor_num', 'school',
'subway', 'per_price'],
dtype='object')
datas = datas. drop( columns= [ 'school' , 'subway' , ] , axis= 0 )
datas
district
roomnum
hall
AREA
C_floor
floor_num
per_price
0
baoan
3
2
89.3
middle
31
7.0773
1
baoan
4
2
127.0
high
31
6.9291
2
baoan
1
1
28.0
low
39
3.9286
3
baoan
1
1
28.0
middle
30
3.3568
4
baoan
2
2
78.0
middle
8
5.0769
...
...
...
...
...
...
...
...
1246
baoan
4
2
89.3
low
8
4.2553
1247
baoan
2
1
67.0
middle
30
3.8060
1248
baoan
2
2
67.4
middle
29
5.3412
1249
baoan
2
2
73.1
low
15
5.9508
1250
baoan
3
2
86.2
middle
32
4.5244
1251 rows × 7 columns
4.2.1 索引操作
1.直接使用行列索引(先列後行)
datas[ "per_price" ] [ 0 ]
7.0773
2. 按名字索引(先行後列)
datas. loc[ 0 ] [ "per_price" ]
7.0773
datas. loc[ 0 , "per_price" ]
7.0773
3.按數字索引
datas. iloc[ 0 , 6 ]
7.0773
datas. index[ 0 : 4 ]
RangeIndex(start=0, stop=4, step=1)
datas. loc[ datas. index[ 0 : 4 ] , [ "district" , "roomnum" ] ]
district
roomnum
0
baoan
3
1
baoan
4
2
baoan
1
3
baoan
1
datas. columns. get_indexer( [ "district" , "roomnum" ] )
array([0, 1], dtype=int64)
datas. iloc[ 0 : 4 , datas. columns. get_indexer( [ "district" , "roomnum" ] ) ]
district
roomnum
0
baoan
3
1
baoan
4
2
baoan
1
3
baoan
1
4.2.2 賦值操作
datas[ "hall" ] = 5
datas. head( )
district
roomnum
hall
AREA
C_floor
floor_num
per_price
0
baoan
3
5
89.3
middle
31
7.0773
1
baoan
4
5
127.0
high
31
6.9291
2
baoan
1
5
28.0
low
39
3.9286
3
baoan
1
5
28.0
middle
30
3.3568
4
baoan
2
5
78.0
middle
8
5.0769
datas. hall = 1
datas. head( )
district
roomnum
hall
AREA
C_floor
floor_num
per_price
0
baoan
3
1
89.3
middle
31
7.0773
1
baoan
4
1
127.0
high
31
6.9291
2
baoan
1
1
28.0
low
39
3.9286
3
baoan
1
1
28.0
middle
30
3.3568
4
baoan
2
1
78.0
middle
8
5.0769
datas. iloc[ 0 , 0 ] = "zzzz"
datas. head( )
district
roomnum
hall
AREA
C_floor
floor_num
per_price
0
zzzz
3
1
89.3
middle
31
7.0773
1
baoan
4
1
127.0
high
31
6.9291
2
baoan
1
1
28.0
low
39
3.9286
3
baoan
1
1
28.0
middle
30
3.3568
4
baoan
2
1
78.0
middle
8
5.0769
4.2.3 排序
datas. sort_values( by= "per_price" , ascending= False )
district
roomnum
hall
AREA
C_floor
floor_num
per_price
917
baoan
4
1
93.59
high
28
21.9040
356
baoan
8
1
248.99
low
7
21.2860
576
baoan
1
1
21.95
middle
22
19.3622
296
baoan
4
1
93.59
high
28
19.2328
186
baoan
3
1
113.60
middle
31
16.5493
...
...
...
...
...
...
...
...
911
baoan
2
1
89.00
middle
16
1.6854
841
baoan
2
1
75.00
high
7
1.6667
1188
baoan
3
1
110.00
middle
33
1.5909
684
baoan
3
1
89.00
middle
26
1.2247
1047
baoan
3
1
98.90
middle
26
1.1931
1251 rows × 7 columns
datas. sort_values( by= "per_price" )
district
roomnum
hall
AREA
C_floor
floor_num
per_price
1047
baoan
3
1
98.90
middle
26
1.1931
684
baoan
3
1
89.00
middle
26
1.2247
1188
baoan
3
1
110.00
middle
33
1.5909
841
baoan
2
1
75.00
high
7
1.6667
911
baoan
2
1
89.00
middle
16
1.6854
...
...
...
...
...
...
...
...
186
baoan
3
1
113.60
middle
31
16.5493
296
baoan
4
1
93.59
high
28
19.2328
576
baoan
1
1
21.95
middle
22
19.3622
356
baoan
8
1
248.99
low
7
21.2860
917
baoan
4
1
93.59
high
28
21.9040
1251 rows × 7 columns
datas. sort_values( by= [ "district" , "per_price" ] )
district
roomnum
hall
AREA
C_floor
floor_num
per_price
1047
baoan
3
1
98.90
middle
26
1.1931
684
baoan
3
1
89.00
middle
26
1.2247
1188
baoan
3
1
110.00
middle
33
1.5909
841
baoan
2
1
75.00
high
7
1.6667
911
baoan
2
1
89.00
middle
16
1.6854
...
...
...
...
...
...
...
...
296
baoan
4
1
93.59
high
28
19.2328
576
baoan
1
1
21.95
middle
22
19.3622
356
baoan
8
1
248.99
low
7
21.2860
917
baoan
4
1
93.59
high
28
21.9040
0
zzzz
3
1
89.30
middle
31
7.0773
1251 rows × 7 columns
datas. sort_index( )
district
roomnum
hall
AREA
C_floor
floor_num
per_price
0
zzzz
3
1
89.3
middle
31
7.0773
1
baoan
4
1
127.0
high
31
6.9291
2
baoan
1
1
28.0
low
39
3.9286
3
baoan
1
1
28.0
middle
30
3.3568
4
baoan
2
1
78.0
middle
8
5.0769
...
...
...
...
...
...
...
...
1246
baoan
4
1
89.3
low
8
4.2553
1247
baoan
2
1
67.0
middle
30
3.8060
1248
baoan
2
1
67.4
middle
29
5.3412
1249
baoan
2
1
73.1
low
15
5.9508
1250
baoan
3
1
86.2
middle
32
4.5244
1251 rows × 7 columns
sr = datas[ "per_price" ]
sr
0 7.0773
1 6.9291
2 3.9286
3 3.3568
4 5.0769
...
1246 4.2553
1247 3.8060
1248 5.3412
1249 5.9508
1250 4.5244
Name: per_price, Length: 1251, dtype: float64
sr. sort_values( )
1047 1.1931
684 1.2247
1188 1.5909
841 1.6667
911 1.6854
...
186 16.5493
296 19.2328
576 19.3622
356 21.2860
917 21.9040
Name: per_price, Length: 1251, dtype: float64
sr. sort_index( )
0 7.0773
1 6.9291
2 3.9286
3 3.3568
4 5.0769
...
1246 4.2553
1247 3.8060
1248 5.3412
1249 5.9508
1250 4.5244
Name: per_price, Length: 1251, dtype: float64
4.3 DataFrame運算
算術運算
datas[ "roomnum" ] + 3
0 6
1 7
2 4
3 4
4 5
..
1246 7
1247 5
1248 5
1249 5
1250 6
Name: roomnum, Length: 1251, dtype: int64
datas[ "roomnum" ] . add( 3 ) . head( )
0 6
1 7
2 4
3 4
4 5
Name: roomnum, dtype: int64
datas. iloc[ : , 1 : 4 ]
roomnum
hall
AREA
0
3
1
89.3
1
4
1
127.0
2
1
1
28.0
3
1
1
28.0
4
2
1
78.0
...
...
...
...
1246
4
1
89.3
1247
2
1
67.0
1248
2
1
67.4
1249
2
1
73.1
1250
3
1
86.2
1251 rows × 3 columns
datas. iloc[ : , 1 : 4 ] + 10
roomnum
hall
AREA
0
13
11
99.3
1
14
11
137.0
2
11
11
38.0
3
11
11
38.0
4
12
11
88.0
...
...
...
...
1246
14
11
99.3
1247
12
11
77.0
1248
12
11
77.4
1249
12
11
83.1
1250
13
11
96.2
1251 rows × 3 columns
邏輯運算
datas[ 'AREA' ] > 100
0 False
1 True
2 False
3 False
4 False
...
1246 False
1247 False
1248 False
1249 False
1250 False
Name: AREA, Length: 1251, dtype: bool
datas[ datas[ 'AREA' ] > 100 ]
district
roomnum
hall
AREA
C_floor
floor_num
per_price
1
baoan
4
1
127.00
high
31
6.9291
5
baoan
4
1
125.17
middle
15
5.8161
16
baoan
3
1
151.00
high
20
4.9669
25
baoan
3
1
116.00
high
18
5.0000
26
baoan
5
1
151.25
high
30
7.6033
...
...
...
...
...
...
...
...
1232
baoan
5
1
127.17
low
24
5.1113
1238
baoan
4
1
130.74
low
30
13.0029
1239
baoan
3
1
102.10
middle
28
10.8717
1241
baoan
5
1
151.30
high
29
7.2703
1243
baoan
4
1
142.25
high
32
6.3269
322 rows × 7 columns
( datas[ "AREA" ] > 100 ) & ( datas[ "per_price" ] < 40000 )
0 False
1 True
2 False
3 False
4 False
...
1246 False
1247 False
1248 False
1249 False
1250 False
Length: 1251, dtype: bool
datas[ ( datas[ "AREA" ] > 100 ) & ( datas[ "per_price" ] < 40000 ) ]
district
roomnum
hall
AREA
C_floor
floor_num
per_price
1
baoan
4
1
127.00
high
31
6.9291
5
baoan
4
1
125.17
middle
15
5.8161
16
baoan
3
1
151.00
high
20
4.9669
25
baoan
3
1
116.00
high
18
5.0000
26
baoan
5
1
151.25
high
30
7.6033
...
...
...
...
...
...
...
...
1232
baoan
5
1
127.17
low
24
5.1113
1238
baoan
4
1
130.74
low
30
13.0029
1239
baoan
3
1
102.10
middle
28
10.8717
1241
baoan
5
1
151.30
high
29
7.2703
1243
baoan
4
1
142.25
high
32
6.3269
322 rows × 7 columns
邏輯運算函數
datas. query( "AREA>100 & per_price<40000" )
district
roomnum
hall
AREA
C_floor
floor_num
per_price
1
baoan
4
1
127.00
high
31
6.9291
5
baoan
4
1
125.17
middle
15
5.8161
16
baoan
3
1
151.00
high
20
4.9669
25
baoan
3
1
116.00
high
18
5.0000
26
baoan
5
1
151.25
high
30
7.6033
...
...
...
...
...
...
...
...
1232
baoan
5
1
127.17
low
24
5.1113
1238
baoan
4
1
130.74
low
30
13.0029
1239
baoan
3
1
102.10
middle
28
10.8717
1241
baoan
5
1
151.30
high
29
7.2703
1243
baoan
4
1
142.25
high
32
6.3269
322 rows × 7 columns
datas[ "roomnum" ] . isin( [ 4 , 5 ] )
0 False
1 True
2 False
3 False
4 False
...
1246 True
1247 False
1248 False
1249 False
1250 False
Name: roomnum, Length: 1251, dtype: bool
datas[ datas[ "roomnum" ] . isin( [ 4 , 5 ] ) ]
district
roomnum
hall
AREA
C_floor
floor_num
per_price
1
baoan
4
1
127.00
high
31
6.9291
5
baoan
4
1
125.17
middle
15
5.8161
26
baoan
5
1
151.25
high
30
7.6033
29
baoan
4
1
143.45
middle
25
6.9711
36
baoan
4
1
134.60
middle
32
9.1828
...
...
...
...
...
...
...
...
1232
baoan
5
1
127.17
low
24
5.1113
1238
baoan
4
1
130.74
low
30
13.0029
1241
baoan
5
1
151.30
high
29
7.2703
1243
baoan
4
1
142.25
high
32
6.3269
1246
baoan
4
1
89.30
low
8
4.2553
224 rows × 7 columns
統計運算
datas. describe( )
roomnum
hall
AREA
floor_num
per_price
count
1251.000000
1251.0
1251.000000
1251.000000
1251.000000
mean
2.906475
1.0
92.409976
24.598721
6.643429
std
0.940663
0.0
37.798122
9.332119
2.435132
min
1.000000
1.0
21.950000
1.000000
1.193100
25%
2.000000
1.0
75.000000
17.000000
5.075850
50%
3.000000
1.0
87.800000
28.000000
5.906800
75%
3.000000
1.0
101.375000
31.000000
7.761950
max
8.000000
1.0
352.900000
53.000000
21.904000
統計函數
datas. max ( axis= 0 )
district zzzz
roomnum 8
hall 1
AREA 352.9
C_floor middle
floor_num 53
per_price 21.904
dtype: object
datas. var( axis= 0 )
roomnum 0.884846
hall 0.000000
AREA 1428.698032
floor_num 87.088446
per_price 5.929870
dtype: float64
datas. std( axis= 0 )
roomnum 0.940663
hall 0.000000
AREA 37.798122
floor_num 9.332119
per_price 2.435132
dtype: float64
datas. iloc[ : , 3 ]
0 89.3
1 127.0
2 28.0
3 28.0
4 78.0
...
1246 89.3
1247 67.0
1248 67.4
1249 73.1
1250 86.2
Name: AREA, Length: 1251, dtype: float64
datas. iloc[ : , 3 ] . idxmax( axis= 0 )
759
datas. iloc[ 759 , 3 ]
352.9
datas. iloc[ : , 3 ] . idxmin( axis= 0 )
576
datas. iloc[ 576 , 3 ]
21.95
累計統計函數
datas[ "per_price" ]
0 7.0773
1 6.9291
2 3.9286
3 3.3568
4 5.0769
...
1246 4.2553
1247 3.8060
1248 5.3412
1249 5.9508
1250 4.5244
Name: per_price, Length: 1251, dtype: float64
datas[ "per_price" ] . cumsum( )
0 7.0773
1 14.0064
2 17.9350
3 21.2918
4 26.3687
...
1246 8291.3076
1247 8295.1136
1248 8300.4548
1249 8306.4056
1250 8310.9300
Name: per_price, Length: 1251, dtype: float64
datas[ "per_price" ] . sort_index( ) . cumsum( ) . plot( )
<matplotlib.axes._subplots.AxesSubplot at 0x2039a3a3dc8>
import matplotlib. pyplot as plt
datas[ "per_price" ] . sort_index( ) . cumsum( ) . plot( )
plt. show( )
自定義運算
datas[ [ "per_price" ] ] . apply ( lambda x : x. max ( ) - x. min ( ) , axis= 0 )
per_price 20.7109
dtype: float64
4.4 Pandas畫圖
datas. plot( x= "AREA" , y= "per_price" , kind= "scatter" )
<matplotlib.axes._subplots.AxesSubplot at 0x203a343dec8>
datas. plot( x= "floor_num" , y= "per_price" , kind= "scatter" )
<matplotlib.axes._subplots.AxesSubplot at 0x203a3a81bc8>
datas. plot( x= "AREA" , y= "per_price" , kind= "barh" )
<matplotlib.axes._subplots.AxesSubplot at 0x203a2147f08>
4.5 文件的讀取與存儲
1.讀取csv文件 read_csv()
iris_data = pd. read_csv( "./datas/iris.data.csv" )
iris_data. head( )
feature1
feature2
feature3
feature4
result
0
5.1
3.5
1.4
0.2
Iris-setosa
1
4.9
3.0
1.4
0.2
Iris-setosa
2
4.7
3.2
1.3
0.2
Iris-setosa
3
4.6
3.1
1.5
0.2
Iris-setosa
4
5.0
3.6
1.4
0.2
Iris-setosa
iris_data1 = pd. read_csv( "./datas/iris.data.csv" , usecols= [ "feature1" , "feature2" , "result" ] )
iris_data1. head( )
feature1
feature2
result
0
5.1
3.5
Iris-setosa
1
4.9
3.0
Iris-setosa
2
4.7
3.2
Iris-setosa
3
4.6
3.1
Iris-setosa
4
5.0
3.6
Iris-setosa
iris_data2 = pd. read_csv( "./datas/iris.data2.csv" )
iris_data2. head( )
5.1
3.5
1.4
0.2
Iris-setosa
0
4.9
3.0
1.4
0.2
Iris-setosa
1
4.7
3.2
1.3
0.2
Iris-setosa
2
4.6
3.1
1.5
0.2
Iris-setosa
3
5.0
3.6
1.4
0.2
Iris-setosa
4
5.4
3.9
1.7
0.4
Iris-setosa
iris_data2 = pd. read_csv( "./datas/iris.data2.csv" , names= [ "feature1" , "feature2" , "feature3" , "feature4" , "result" ] )
iris_data2. head( )
feature1
feature2
feature3
feature4
result
0
5.1
3.5
1.4
0.2
Iris-setosa
1
4.9
3.0
1.4
0.2
Iris-setosa
2
4.7
3.2
1.3
0.2
Iris-setosa
3
4.6
3.1
1.5
0.2
Iris-setosa
4
5.0
3.6
1.4
0.2
Iris-setosa
datas. head( 5 )
district
roomnum
hall
AREA
C_floor
floor_num
per_price
0
zzzz
3
1
89.3
middle
31
7.0773
1
baoan
4
1
127.0
high
31
6.9291
2
baoan
1
1
28.0
low
39
3.9286
3
baoan
1
1
28.0
middle
30
3.3568
4
baoan
2
1
78.0
middle
8
5.0769
datas[ : - 1 ] . to_csv( "./price_test" , columns= [ 'per_price' ] , index= False , mode= "a" , header= False )
perice_test = pd. read_csv( "./price_test" )
perice_test
per_price
0
7.0773
1
6.9291
2
3.9286
3
3.3568
4
5.0769
...
...
3746
6.1932
3747
4.2553
3748
3.806
3749
5.3412
3750
5.9508
3751 rows × 1 columns