一文搞定Pandas快速入門

1 一維數組Series

>>> import numpy as np
>>> import pandas as pd
>>> s = pd.Series(np.random.rand(4)) # 數據結構類似於字典
>>> print(s,type(s))
0    0.011276
1    0.518487
2    0.404968
3    0.912360
dtype: float64 <class 'pandas.core.series.Series'>
>>> dic = {'a':1 ,'b':2 , 'c':3, '4':4, '5':5} # 可由字典創建
>>> s = pd.Series(dic)
>>> print(s)
4    4
5    5
a    1
b    2
c    3
dtype: int64
>>> s2 = pd.Series(np.random.randn(5),name = 'test') # 可命名
>>> print(s2.name)
test
>>> s3 = s2.rename('valid') # 重命名
>>> print(s3.name) 
valid
>>> print(s2.name) # rename返回新的Series,原Series的name不變
test

2 二維數組DataFrame

>>> data = {'name':['A','B','C'],'age':[1,2,3],'gender':['m','m','m']}
>>> excel = pd.DataFrame(data) # 由數組/列表組成的字典創建這個表格式數據結構
>>> print(excel,type(excel))
   age gender name
0    1      m    A
1    2      m    B
2    3      m    C <class 'pandas.core.frame.DataFrame'>
>>> data = {'one':pd.Series(np.random.rand(2), index = ['a','b']), 'two':pd.Series(np.random.rand(3),index = ['a','b','c'])}
>>> excel = pd.DataFrame(data) # 由一維數組Series創建
>>> print(excel,type(excel))
        one       two
a  0.521976  0.286897
b  0.434287  0.681197
c       NaN  0.100178 <class 'pandas.core.frame.DataFrame'>
>>> ar = np.random.rand(9).reshape(3,3)
>>> df = pd.DataFrame(ar, index = ['a', 'b', 'c'], columns = ['one','two','three']) # 由二維數組創建,可以指定行/列標籤
>>> print(df,type(df))
        one       two     three
a  0.533896  0.158577  0.201476
b  0.877298  0.451443  0.643094
c  0.012091  0.569678  0.778727 <class 'pandas.core.frame.DataFrame'>

3 索引與切片

>>> df = pd.DataFrame(np.random.rand(12).reshape(3,4),index = ['one','two','three'],columns = ['a','b','c','d'])
>>> data1 = df['a'] # 列一維索引,輸出Series
>>> print(data1,type(data1))
one      0.681222
two      0.948255
three    0.244360
Name: a, dtype: float64 <class 'pandas.core.series.Series'>
>>> data2 = df[['a','c']] # 列二維索引,輸出DataFrame
>>> print(data2,type(data2))
              a         c
one    0.681222  0.510396
two    0.948255  0.288504
three  0.244360  0.802351 <class 'pandas.core.frame.DataFrame'>
>>> data3 = df.loc['one'] # 行一維索引,輸出Series
>>> print(data3,type(data3))
a    0.681222
b    0.402983
c    0.510396
d    0.855539
Name: one, dtype: float64 <class 'pandas.core.series.Series'>
>>> data4 = df.loc[['one','two']] # 行二維索引,輸出DataFrame
>>> print(data4,type(data4))
            a         b         c         d
one  0.681222  0.402983  0.510396  0.855539
two  0.948255  0.090073  0.288504  0.984931 <class 'pandas.core.frame.DataFrame'>
>>> print(df.iloc[-1]) # 單行索引
a    0.244360
b    0.375580
c    0.802351
d    0.299126
Name: three, dtype: float64
>>> print(df.iloc[[2,1]]) # 多行索引
              a         b         c         d
three  0.244360  0.375580  0.802351  0.299126
two    0.948255  0.090073  0.288504  0.984931
>>> print(df.iloc[::2]) # 切片索引
              a         b         c         d
one    0.681222  0.402983  0.510396  0.855539
three  0.244360  0.375580  0.802351  0.299126
>>> print(df['a'].loc[['one','three']]) # 行列同時索引
one      0.681222
three    0.244360
Name: a, dtype: float64
>>> print(df[['b','c','d']].iloc[::2]) # 行列同時索引
              b         c         d
one    0.402983  0.510396  0.855539
three  0.375580  0.802351  0.299126

4 常用操作

>>> df = pd.DataFrame(np.random.rand(16).reshape(8,2)*100,columns = ['a','b'])
>>> print(df.head(2)) # 查看頭部
           a          b
0  16.753071  43.025150
1  29.949600  43.230387
>>> print(df.tail()) #查看尾部,默認5行
           a          b
3  36.987161  55.045023
4  45.037641  68.904523
5  86.519609  53.187602
6  93.424580  40.692516
7   3.060607  19.500474
>>> print(df.T) # 轉置
           0          1          2          3          4          5  \
a  16.753071  29.949600  54.280477  36.987161  45.037641  86.519609
b  43.025150  43.230387  72.242892  55.045023  68.904523  53.187602

           6          7
a  93.424580   3.060607
b  40.692516  19.500474
>>> del df['a'] # 刪除列
>>> print(df)
           b
0  43.025150
1  43.230387
2  72.242892
3  55.045023
4  68.904523
5  53.187602
6  40.692516
7  19.500474
>>> print(df.drop(0)) # 刪除行
           b
1  43.230387
2  72.242892
3  55.045023
4  68.904523
5  53.187602
6  40.692516
7  19.500474
>>> print(df.sort_values(['b'], ascending = True)) # 升序按值排序
           b 
7  19.500474
6  40.692516
0  43.025150
1  43.230387
5  53.187602
3  55.045023
4  68.904523
2  72.242892
>>> print(df.sort_index()) # 按索引排序
           b
0  43.025150
1  43.230387
2  72.242892
3  55.045023
4  68.904523
5  53.187602
6  40.692516
7  19.500474

>>> df3 = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],'key2': ['K0', 'K1', 'K0', 'K1']})
>>> df4 = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],'key2': ['K0', 'K0', 'K0', 'K0']})
>>> print(pd.merge(df3, df4,on=['key1','key2'], how = 'inner')) # 交集式合併
  key1 key2
0   K0   K0
1   K1   K0
2   K1   K0
>>> print(pd.merge(df3, df4, on=['key1','key2'], how = 'outer')) # 並集式合併
  key1 key2
0   K0   K0
1   K0   K1
2   K1   K0
3   K1   K0
4   K2   K1
5   K2   K0
>>> s1 = pd.Series([1,2,3])
>>> s2 = pd.Series([2,3,4])
>>> print(pd.concat([s1,s2])) # 垂向連接
0    1
1    2
2    3
0    2
1    3
2    4
dtype: int64
>>> print(pd.concat([s1,s2],axis=1)) # 水平連接
   0  1
0  1  2
1  2  3
2  3  4
>>> print(s1.duplicated()) # 查重
0    False
1    False
2    False
dtype: bool
>>> s_re = s1.drop_duplicates() # 去重
>>> print(s_re)
0    1
1    2
2    3
dtype: int64
>>> print(s2.replace(3, np.nan)) # 替換
0    2.0
1    NaN
2    4.0
dtype: float64

5 統計計算

>>> df = pd.DataFrame({'key1':[4,5,3,np.nan,2],'key2':[1,2,np.nan,4,5],'key3':[1,2,3,'j','k']},index = ['a','b','c','d','e'])
>>> print(df)
   key1  key2 key3
a   4.0   1.0    1
b   5.0   2.0    2
c   3.0   NaN    3
d   NaN   4.0    j
e   2.0   5.0    k
>>> print(df.count(),'→ count統計非Na值的數量\n')
key1    4
key2    4
key3    5
dtype: int64 → count統計非Na值的數量

>>> print(df.min(),'→ min統計最小值\n',df['key2'].max(),'→ max統計最大值\n')
key1    2.0
key2    1.0
dtype: float64 → min統計最小值
 5.0 → max統計最大值

>>> print(df.quantile(q=0.75),'→ quantile統計分位數,參數q確定位置\n')
key1    4.25
key2    4.25
Name: 0.75, dtype: float64 → quantile統計分位數,參數q確定位置

>>> print(df.sum(),'→ sum求和\n')
key1    14.0
key2    12.0
dtype: float64 → sum求和

>>> print(df.mean(),'→ mean求平均值\n')
key1    3.5
key2    3.0
dtype: float64 → mean求平均值

>>> print(df.median(),'→ median求算數中位數,50%分位數\n')
key1    3.5
key2    3.0
dtype: float64 → median求算數中位數,50%分位數

>>> print(df.std(),'\n',df.var(),'→ std,var分別求標準差,方差\n')
key1    1.290994
key2    1.825742
dtype: float64
 key1    1.666667
key2    3.333333
dtype: float64 → std,var分別求標準差,方差

>>> print(df.skew(),'→ skew樣本的偏度\n')
key1    0.0
key2    0.0
dtype: float64 → skew樣本的偏度

>>> print(df.kurt(),'→ kurt樣本的峯度\n')
key1   -1.2
key2   -3.3
dtype: float64 → kurt樣本的峯度

6 字符串方法

>>> s = pd.Series(['A','b','bbhello','123',np.nan])
>>> print(s.str.lower(),'→ lower小寫\n')
0          a
1          b
2    bbhello
3        123
4        NaN
dtype: object → lower小寫

>>> print(s.str.upper(),'→ upper大寫\n')
0          A
1          B
2    BBHELLO
3        123
4        NaN
dtype: object → upper大寫

>>> print(s.str.len(),'→ len字符長度\n')
0    1.0
1    1.0
2    7.0
3    3.0
4    NaN
dtype: float64 → len字符長度

>>> print(s.str.startswith('b'),'→ 判斷起始是否爲b\n')
0    False
1     True
2     True
3    False
4      NaN
dtype: object → 判斷起始是否爲b

>>> print(s.str.endswith('3'),'→ 判斷結束是否爲3\n')
0    False
1    False
2    False
3     True
4      NaN
dtype: object → 判斷結束是否爲3

>>> print(s.str.strip(),'→去除字符串首尾的空白字符\n')
0          A
1          b
2    bbhello
3        123
4        NaN
dtype: object →去除字符串首尾的空白字符

>>> print(s.str.replace('A','bye',n=1),'→替換1個\n')
0        bye
1          b
2    bbhello
3        123
4        NaN
dtype: object →替換1個

>>> print(s.str.split(','),'→類似字符串的split\n')
0          [A]
1          [b]
2    [bbhello]
3        [123]
4          NaN
dtype: object →類似字符串的split
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章