import numpy as np import pandas as pd
創建DataFrame
In [2]: df = pd.DataFrame({'col_a': np.arange(10), 'col_b': np.random.randn(10), 'col_c': np.random.choice(['A', 'B', 'C'], 10), 'col_d': np.random.choice([0, 1], 10)}) df.head(5) # R code: # df <- data.frame(col_a = 0:9, # col_b = rnorm(10), # col_c = sample(c('A', 'B', 'C'), size = 10, replace = TRUE), # col_d = sample(c(0, 1), size = 10, replace = TRUE), # stringsAsFactors = FALSE) # head(df, 5) Out[2]: col_a col_b col_c col_d 0 0 0.308520 C 1 1 1 -1.829450 B 1 2 2 -0.710135 C 0 3 3 1.354760 A 0 4 4 -0.581359 A 1
獲取DataFrame維度
In [3]: print(df.shape, df.shape[0], df.shape[1]) # R code: # dim(df), rnow(df), ncol(df) (10, 4) 10 4
獲取DataFrame列名
In [4]: df.columns # R code: # names(df) Out[4]: Index(['col_a', 'col_b', 'col_c', 'col_d'], dtype='object')
數據選取
In [5]: # 選取前5行數據 df.iloc[:5] # R code: # df[1:5, ] Out[5]: col_a col_b col_c col_d 0 0 0.308520 C 1 1 1 -1.829450 B 1 2 2 -0.710135 C 0 3 3 1.354760 A 0 4 4 -0.581359 A 1
# 選取col_a和col_b列 df[['col_a', 'col_b']] # R code: # df[, c('col_a', 'col_b')] Out[6]: col_a col_b 0 0 0.308520 1 1 -1.829450 2 2 -0.710135 3 3 1.354760 4 4 -0.581359 5 5 1.633542 6 6 -0.253950 7 7 1.799087 8 8 0.412991 9 9 0.374330
# 選取前5行和前2列 df.iloc[:5, :2] # R code: # df[1:5, 1:2] Out[7]: col_a col_b 0 0 0.308520 1 1 -1.829450 2 2 -0.710135 3 3 1.354760 4 4 -0.581359
# 選取單個值(scalar) df.iat[0, 1] # R code: # df[1, 2] Out[8]: 0.3085196186883713
按條件選取數據
In [9]: df[(df['col_a'] > 3) & (df['col_b'] < 0)] # or # df.query('col_a > 3 & col_b < 0') # R code: # df[df$col_a > 3 & df$col_b < 0, ] Out[9]: col_a col_b col_c col_d 4 4 -0.581359 A 1 6 6 -0.253950 B 1 In [10]: df[df['col_c'].isin(['A', 'B'])] # R code: # df[df$col_c %in% c('A', 'B'), ] Out[10]: col_a col_b col_c col_d 1 1 -1.829450 B 1 3 3 1.354760 A 0 4 4 -0.581359 A 1 5 5 1.633542 B 1 6 6 -0.253950 B 1 7 7 1.799087 A 1 9 9 0.374330 A 0
增加新列
In [11]: df['col_e'] = df['col_a'] + df['col_b'] df # df$col_e <- df$col_a + df$col_b Out[11]: col_a col_b col_c col_d col_e 0 0 0.308520 C 1 0.308520 1 1 -1.829450 B 1 -0.829450 2 2 -0.710135 C 0 1.289865 3 3 1.354760 A 0 4.354760 4 4 -0.581359 A 1 3.418641 5 5 1.633542 B 1 6.633542 6 6 -0.253950 B 1 5.746050 7 7 1.799087 A 1 8.799087 8 8 0.412991 C 0 8.412991 9 9 0.374330 A 0 9.374330
刪除列
In [12]: # 刪除col_e列 df = df.drop(columns='col_e') df # R code: # df <- df[, !names(df) == 'col_e'] Out[12]: col_a col_b col_c col_d 0 0 0.308520 C 1 1 1 -1.829450 B 1 2 2 -0.710135 C 0 3 3 1.354760 A 0 4 4 -0.581359 A 1 5 5 1.633542 B 1 6 6 -0.253950 B 1 7 7 1.799087 A 1 8 8 0.412991 C 0 9 9 0.374330 A 0 In [13]: # 刪除第一列 df.drop(columns=df.columns[0]) # R code: # df[, -1] Out[13]: col_b col_c col_d 0 0.308520 C 1 1 -1.829450 B 1 2 -0.710135 C 0 3 1.354760 A 0 4 -0.581359 A 1 5 1.633542 B 1 6 -0.253950 B 1 7 1.799087 A 1 8 0.412991 C 0 9 0.374330 A 0
轉置
In [14]: df.T # R code: # t(df) Out[14]: 0 1 2 3 4 5 6 7 8 9 col_a 0 1 2 3 4 5 6 7 8 9 col_b 0.30852 -1.82945 -0.710135 1.35476 -0.581359 1.63354 -0.25395 1.79909 0.412991 0.37433 col_c C B C A A B B A C A col_d 1 1 0 0 1 1 1 1 0 0
數據類型轉換
In [15]: df['col_a'].astype(str) # as.character(df$col_a) Out[15]: 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 Name: col_a, dtype: object
轉換爲類別(categories)/因子(factor)類型
In [16]: pd.Categorical(df['col_c']) # factor(df$col_d) Out[16]: [C, B, C, A, A, B, B, A, C, A] Categories (3, object): [A, B, C]
數據彙總 按行進行計算
In [17]: df[['col_a', 'col_b']].sum(axis=1) # R code: # apply(df[, c('col_a', 'col_b')], 1, sum) Out[17]: 0 0.308520 1 -0.829450 2 1.289865 3 4.354760 4 3.418641 5 6.633542 6 5.746050 7 8.799087 8 8.412991 9 9.374330 dtype: float64
按列進行計算
In [18]: df[['col_a', 'col_b']].mean(axis=0) # R code: # apply(df[, c('col_a', 'col_b')], 2, mean) Out[18]: col_a 4.500000 col_b 0.250834 dtype: float64 In [19]: df[['col_a', 'col_b']].apply(lambda x: x.mean() + 10) # R code: # apply(df[, c('col_a', 'col_b')], 2, function(x) mean(x) + 10) Out[19]: col_a 14.500000 col_b 10.250834 dtype: float64
數據合併 合併列
In [20]: df2 = pd.DataFrame({'col_x': np.arange(10), 'col_y': np.arange(10)[::-1]}) df2 Out[20]: col_x col_y 0 0 9 1 1 8 2 2 7 3 3 6 4 4 5 5 5 4 6 6 3 7 7 2 8 8 1 9 9 0 In [21]: pd.concat([df, df2], axis=1) # R code: # cbind(df, df2) Out[21]: col_a col_b col_c col_d col_x col_y 0 0 0.308520 C 1 0 9 1 1 -1.829450 B 1 1 8 2 2 -0.710135 C 0 2 7 3 3 1.354760 A 0 3 6 4 4 -0.581359 A 1 4 5 5 5 1.633542 B 1 5 4 6 6 -0.253950 B 1 6 3 7 7 1.799087 A 1 7 2 8 8 0.412991 C 0 8 1 9 9 0.374330 A 0 9 0
合併行
In [22]: df3 = pd.DataFrame({'col_a': [-1, -2], 'col_b' : [0, 1], 'col_c': ['B', 'C'], 'col_d': [1, 0]}) df3 Out[22]: col_a col_b col_c col_d 0 -1 0 B 1 1 -2 1 C 0 In [23]: pd.concat([df, df3], axis=0, ignore_index=True) # R code: # rbind(df, df3) Out[23]: col_a col_b col_c col_d 0 0 0.308520 C 1 1 1 -1.829450 B 1 2 2 -0.710135 C 0 3 3 1.354760 A 0 4 4 -0.581359 A 1 5 5 1.633542 B 1 6 6 -0.253950 B 1 7 7 1.799087 A 1 8 8 0.412991 C 0 9 9 0.374330 A 0 10 -1 0.000000 B 1 11 -2 1.000000 C 0