Pandas數據結構:DataFrame

剛剛接觸pandas的朋友,想了解數據結構,就一定要認識DataFrame,接下來給大家詳細介紹!

初識DataFrame

import numpy as np
import pandas as pd
# Column-oriented input: every key becomes a column label and every list
# supplies that column's values, one entry per row.
data = dict(
    name=["Jack", "Tom", "LiSa"],
    age=[20, 21, 18],
    city=["BeiJing", "TianJin", "ShenZhen"],
)
print(data, end="\n\n")

# Build the DataFrame from the dict.
frame = pd.DataFrame(data)
print(frame, end="\n\n")

# Row labels.
print(frame.index, end="\n\n")

# Column labels.
print(frame.columns, end="\n\n")

# The values as a NumPy array.
print(frame.values)
{'name': ['Jack', 'Tom', 'LiSa'], 'age': [20, 21, 18], 'city': ['BeiJing', 'TianJin', 'ShenZhen']}

   age      city  name
0   20   BeiJing  Jack
1   21   TianJin   Tom
2   18  ShenZhen  LiSa

RangeIndex(start=0, stop=3, step=1)

Index(['age', 'city', 'name'], dtype='object')

[[20 'BeiJing' 'Jack']
 [21 'TianJin' 'Tom']
 [18 'ShenZhen' 'LiSa']]

創建DataFrame

方法一: 由字典創建 字典的key是列索引 值可以是1.列表 2.ndarray 3.Series

# Dict values given as plain lists.
data1 = {
    "a": [1, 2, 3],
    "b": [4, 5, 6],
    "c": [7, 8, 9],
}
print(data1, end="\n\n")

# Build the DataFrame; no labels are given, so rows are numbered from 0.
print(pd.DataFrame(data1), end="\n\n")

# NOTE: with dict input, `index` supplies the row labels, while `columns`
# selects and reorders the dict keys; a requested column that is not a key
# ("d" here) is filled with NaN.
print(
    pd.DataFrame(
        data1,
        index=["m", "n", "p"],
        columns=["b", "c", "a", "d"],
    ),
    end="\n\n",
)
{'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}

   a  b  c
0  1  4  7
1  2  5  8
2  3  6  9

   b  c  a    d
m  4  7  1  NaN
n  5  8  2  NaN
p  6  9  3  NaN
# Dict values given as ndarrays.  NOTE: all arrays must have the same
# length, otherwise building the DataFrame raises an error.
data2 = {label: np.random.rand(3) for label in ("one", "two")}
print(data2, end="\n\n")

print(pd.DataFrame(data2))
{'one': array([ 0.60720023,  0.30838024,  0.30678266]), 'two': array([ 0.21368784,  0.03797809,  0.41698718])}

        one       two
0  0.607200  0.213688
1  0.308380  0.037978
2  0.306783  0.416987
# Dict values given as Series (labelled 1-D arrays).  NOTE: the Series may
# differ in length; labels missing from the shorter one are filled with NaN.
data3 = dict(
    one=pd.Series(np.random.rand(4)),
    two=pd.Series(np.random.rand(5)),
)
print(data3, end="\n\n")

df3 = pd.DataFrame(data3)
print(df3, end="\n\n")
{'one': 0    0.217639
1    0.921641
2    0.898810
3    0.933510
dtype: float64, 'two': 0    0.132789
1    0.099904
2    0.723495
3    0.719173
4    0.477456
dtype: float64}

        one       two
0  0.217639  0.132789
1  0.921641  0.099904
2  0.898810  0.723495
3  0.933510  0.719173
4       NaN  0.477456

方法二: 通過二維數組直接創建

# A random 3x4 matrix that supplies the DataFrame's values.
arr = np.random.rand(3, 4)
print(arr, end="\n\n")

# Without explicit labels, rows and columns are both numbered from 0.
df1 = pd.DataFrame(arr)
print(df1, end="\n\n")

# `index` and `columns` set the row and column labels.
df2 = pd.DataFrame(
    arr,
    index=["a", "b", "c"],
    columns=["one", "two", "three", "four"],
)
print(df2)
[[ 0.85898536  0.40300549  0.80043098  0.87045042]
 [ 0.56918302  0.34589982  0.63733905  0.93012927]
 [ 0.47535281  0.93652147  0.57446896  0.59470213]]

          0         1         2         3
0  0.858985  0.403005  0.800431  0.870450
1  0.569183  0.345900  0.637339  0.930129
2  0.475353  0.936521  0.574469  0.594702

        one       two     three      four
a  0.858985  0.403005  0.800431  0.870450
b  0.569183  0.345900  0.637339  0.930129
c  0.475353  0.936521  0.574469  0.594702

方法三: 由字典組成的列表創建 DataFrame

# Each dict in the list becomes one row of the DataFrame; a key that a row
# does not have shows up as NaN in that row.
data = [
    {"one": 1, "two": 2},
    {"one": 5, "two": 10, "three": 15},
]
print(data, end="\n\n")

df1 = pd.DataFrame(data)
print(df1, end="\n\n")

# Explicit row labels plus a column list; "four" is in no dict, so that
# column is all NaN.
df2 = pd.DataFrame(
    data,
    index=["a", "b"],
    columns=["one", "two", "three", "four"],
)
print(df2)
[{'one': 1, 'two': 2}, {'one': 5, 'two': 10, 'three': 15}]

   one  three  two
0    1    NaN    2
1    5   15.0   10

   one  two  three  four
a    1    2    NaN   NaN
b    5   10   15.0   NaN

方法四: 由字典組成的字典創建

# The outer dict's keys become the columns; the inner dicts' keys become
# the row labels.
data = {
    "Jack": {"age": 1, "country": "China", "sex": "man"},
    "LiSa": {"age": 18, "country": "America", "sex": "women"},
    "Tom": {"age": 20, "country": "English"},
}

df1 = pd.DataFrame(data)
print(df1, end="\n\n")

# NOTE: here `index` cannot rename the inner keys (the row labels); it can
# only reorder them.  A label that is not an inner key yields a row of NaN.
df2 = pd.DataFrame(data, index=["sex", "age", "country"])
print(df2, end="\n\n")

df3 = pd.DataFrame(data, index=["a", "b", "c"])
print(df3, end="\n\n")

# `columns` reorders the outer keys; a name that is not an outer key
# ("TangMu") yields a column of NaN.
df4 = pd.DataFrame(data, columns=["Tom", "LiSa", "Jack", "TangMu"])
print(df4)
          Jack     LiSa      Tom
age          1       18       20
country  China  America  English
sex        man    women      NaN

          Jack     LiSa      Tom
sex        man    women      NaN
age          1       18       20
country  China  America  English

   Jack  LiSa  Tom
a   NaN   NaN  NaN
b   NaN   NaN  NaN
c   NaN   NaN  NaN

             Tom     LiSa   Jack TangMu
age           20       18      1    NaN
country  English  America  China    NaN
sex          NaN    women    man    NaN

DataFrame索引

選擇行與列

選擇列 直接用df["列標籤"]

# 3x4 frame of random floats scaled by 100, with labelled rows and columns.
df = pd.DataFrame(
    np.random.rand(3, 4) * 100,
    index=["one", "two", "three"],
    columns=["a", "b", "c", "d"],
)
print(df, end="\n\n")

# A single column label returns that column as a Series.
print(df["a"], "  ", type(df["a"]), end="\n\n")

# A list of column labels returns a DataFrame with just those columns.
print(df[["a", "c"]], "  ", type(df[["a", "c"]]))
               a          b          c          d
one    92.905464  11.630358  19.518051  77.417377
two    91.107357   0.641600   4.913662  65.593182
three   3.152801  42.324671  14.030304  22.138608

one      92.905464
two      91.107357
three     3.152801
Name: a, dtype: float64    <class 'pandas.core.series.Series'>

               a          c
one    92.905464  19.518051
two    91.107357   4.913662
three   3.152801  14.030304    <class 'pandas.core.frame.DataFrame'>

選擇行 不能通過標籤索引 df["one"] 來選擇行 要用 df.loc["one"], loc就是針對行來操作的

print(df, end="\n\n")

# One row label -> that row as a Series.
print(df.loc["one"], " ", type(df.loc["one"]), end="\n\n")

# A list of row labels -> a DataFrame of those rows (they need not be adjacent).
print(df.loc[["one", "three"]], " ", type(df.loc[["one", "three"]]), end="\n\n")
               a          b          c          d
one    92.905464  11.630358  19.518051  77.417377
two    91.107357   0.641600   4.913662  65.593182
three   3.152801  42.324671  14.030304  22.138608

a    92.905464
b    11.630358
c    19.518051
d    77.417377
Name: one, dtype: float64   <class 'pandas.core.series.Series'>

               a          b          c          d
one    92.905464  11.630358  19.518051  77.417377
three   3.152801  42.324671  14.030304  22.138608   <class 'pandas.core.frame.DataFrame'>

loc支持切片索引--針對行 並包含末端 df.loc["one": "three"]

# 4x4 frame of random floats scaled by 100.
df = pd.DataFrame(
    np.random.rand(4, 4) * 100,
    index=["one", "two", "three", "four"],
    columns=["a", "b", "c", "d"],
)
print(df, end="\n\n")

# Label slice with .loc: the end label "three" is included.
print(df.loc["one":"three"], end="\n\n")

# A plain slice on the frame also selects consecutive rows (best avoided,
# since df[...] otherwise selects columns and the two are easy to confuse).
print(df[:3])
               a          b          c          d
one    65.471894  19.137274  31.680635  41.659808
two    31.570587  45.575849  37.739644   5.140845
three  54.930986  68.232707  17.215544  70.765401
four   45.591798  63.274956  74.056045   2.466652

               a          b          c          d
one    65.471894  19.137274  31.680635  41.659808
two    31.570587  45.575849  37.739644   5.140845
three  54.930986  68.232707  17.215544  70.765401

               a          b          c          d
one    65.471894  19.137274  31.680635  41.659808
two    31.570587  45.575849  37.739644   5.140845
three  54.930986  68.232707  17.215544  70.765401

iloc也是對行來操作的 只不過把行標籤改成了行的整數位置(從0開始) 並且切片是不包含末端的

print(df, end="\n\n")

# One integer position -> that row as a Series.
print(df.iloc[0], end="\n\n")

# A list of positions -> those rows (they need not be adjacent).
print(df.iloc[[0, 2]], end="\n\n")

# Positional slice: the end position is excluded, so this is rows 0, 1, 2.
print(df.iloc[:3])
               a          b          c          d
one    65.471894  19.137274  31.680635  41.659808
two    31.570587  45.575849  37.739644   5.140845
three  54.930986  68.232707  17.215544  70.765401
four   45.591798  63.274956  74.056045   2.466652

a    65.471894
b    19.137274
c    31.680635
d    41.659808
Name: one, dtype: float64

               a          b          c          d
one    65.471894  19.137274  31.680635  41.659808
three  54.930986  68.232707  17.215544  70.765401

               a          b          c          d
one    65.471894  19.137274  31.680635  41.659808
two    31.570587  45.575849  37.739644   5.140845
three  54.930986  68.232707  17.215544  70.765401

布爾型索引

# 4x4 frame of random floats scaled by 100.
df = pd.DataFrame(
    np.random.rand(4, 4) * 100,
    index=["one", "two", "three", "four"],
    columns=["a", "b", "c", "d"],
)
print(df, end="\n\n")

# Element-wise comparison gives a boolean frame of the same shape.
d1 = df > 50
print(d1, end="\n\n")

# Indexing with that boolean frame keeps the cells where it is True and
# puts NaN where it is False.
print(df[d1], end="\n\n")
               a          b          c          d
one    91.503673  74.080822  85.274682  80.788609
two    49.670055  42.221393  36.674490  69.272958
three  78.349843  68.090150  22.326223  93.984369
four   79.057146  77.687246  32.304265   0.567816

           a      b      c      d
one     True   True   True   True
two    False  False  False   True
three   True   True  False   True
four    True   True  False  False

               a          b          c          d
one    91.503673  74.080822  85.274682  80.788609
two          NaN        NaN        NaN  69.272958
three  78.349843  68.090150        NaN  93.984369
four   79.057146  77.687246        NaN        NaN

選取某一列作爲布爾型索引 返回True所在行的所有列 注意: 若選取多列作爲布爾型索引 則不是按行篩選 而是按元素保留True對應的值

# Build a 4x4 frame of random integers in [0, 100).
# The float samples are truncated with an explicit ``astype`` rather than by
# passing ``dtype=np.int64`` to the DataFrame constructor: recent pandas
# releases no longer silently truncate non-integral floats there, so the
# constructor-level dtype does not reliably produce an integer frame.
df = pd.DataFrame(
    (np.random.rand(16).reshape(4, 4) * 100).astype(np.int64),
    index=["one", "two", "three", "four"],
    columns=["a", "b", "c", "d"],
)
print(df)
print("")

# A boolean Series built from ONE column: True marks the rows to keep.
d2 = df["b"] > 50
print(d2)
print("")

# Indexing with the boolean Series keeps every column of the True rows.
print(df[d2])
        a   b   c   d
one    27  18  47  61
two    26  35  16  78
three  80  98  94  41
four   85   3  47  90

one      False
two      False
three     True
four     False
Name: b, dtype: bool

        a   b   c   d
three  80  98  94  41

選取多列作爲布爾型索引 返回True所對應的值 False對應爲NaN 沒有的列全部填充爲NaN

# Build a 4x4 frame of random integers in [0, 100).
# The float samples are truncated with an explicit ``astype`` rather than by
# passing ``dtype=np.int64`` to the DataFrame constructor: recent pandas
# releases no longer silently truncate non-integral floats there, so the
# constructor-level dtype does not reliably produce an integer frame.
df = pd.DataFrame(
    (np.random.rand(16).reshape(4, 4) * 100).astype(np.int64),
    index=["one", "two", "three", "four"],
    columns=["a", "b", "c", "d"],
)
print(df)
print("")

# A boolean DataFrame built from SEVERAL columns (only "a" and "c").
d3 = df[["a", "c"]] > 50
print(d3)
print("")

# Indexing with a boolean DataFrame masks cell by cell: True cells keep
# their value, False cells become NaN, and columns absent from the mask
# ("b" and "d") are NaN throughout.
print(df[d3])
        a   b   c   d
one    49  82  32  39
two    78   2  24  84
three   6  84  84  69
four   21  89  16  77

           a      c
one    False  False
two     True  False
three  False   True
four   False  False

          a   b     c   d
one     NaN NaN   NaN NaN
two    78.0 NaN   NaN NaN
three   NaN NaN  84.0 NaN
four    NaN NaN   NaN NaN

多重索引(鏈式索引: 連續使用多次索引 先取列再取行 或先取行再取列)

# Display the frame that the chained selections in the following snippets use.
print(df)
        a   b   c   d
one    49  82  32  39
two    78   2  24  84
three   6  84  84  69
four   21  89  16  77
# Select a column first, then narrow that Series to two rows by label.
print(df["a"].loc[["one", "three"]], end="\n\n")

# Select two columns first, then take the first three rows by position.
print(df[["a", "c"]].iloc[:3])
one      49
three     6
Name: a, dtype: int64

        a   c
one    49  32
two    78  24
three   6  84
# Select the rows by label first, then keep only two of their columns.
selected_rows = df.loc[["one", "three"]]
print(selected_rows[["a", "c"]])
        a   c
one    49  32
three   6  84
# Element-wise comparison -> boolean frame of the same shape.
print(df > 50, end="\n\n")

# Boolean-frame indexing keeps the True cells; the others become NaN.
print(df[df > 50], end="\n\n")

# Chain a column selection onto the masked frame.
print(df[df > 50][["a", "b"]])
           a      b      c      d
one    False   True  False  False
two     True  False  False   True
three  False   True   True   True
four   False   True  False   True

          a     b     c     d
one     NaN  82.0   NaN   NaN
two    78.0   NaN   NaN  84.0
three   NaN  84.0  84.0  69.0
four    NaN  89.0   NaN  77.0

          a     b
one     NaN  82.0
two    78.0   NaN
three   NaN  84.0
four    NaN  89.0

DataFrame基本技巧

import numpy as np
import pandas as pd
# Eight rows by two columns of random floats, scaled by 10.
arr = np.random.rand(8, 2) * 10
print("")

# len() of the 2-D array is its number of rows.
print(len(arr), end="\n\n")

# Label the rows with consecutive lowercase letters starting at "a",
# one letter per row of arr.
df = pd.DataFrame(
    arr,
    index=list(map(chr, range(ord("a"), ord("a") + len(arr)))),
    columns=["one", "two"],
)
print(df)
8

        one       two
a  2.129959  1.827002
b  8.631212  0.423903
c  6.262012  3.851107
d  6.890305  9.543065
e  6.883742  3.643955
f  2.740878  6.851490
g  6.242513  7.402237
h  9.226572  3.179664

查看數據

print(df, end="\n\n")

# First rows of the frame; head() shows 5 rows unless a count is given.
print(df.head(2), end="\n\n")

# Last rows of the frame; tail() shows 5 rows unless a count is given.
print(df.tail(3))
        one       two
a  2.129959  1.827002
b  8.631212  0.423903
c  6.262012  3.851107
d  6.890305  9.543065
e  6.883742  3.643955
f  2.740878  6.851490
g  6.242513  7.402237
h  9.226572  3.179664

        one       two
a  2.129959  1.827002
b  8.631212  0.423903

        one       two
f  2.740878  6.851490
g  6.242513  7.402237
h  9.226572  3.179664

轉置

# Display the frame before it is transposed.
print(df)
        one       two
a  2.129959  1.827002
b  8.631212  0.423903
c  6.262012  3.851107
d  6.890305  9.543065
e  6.883742  3.643955
f  2.740878  6.851490
g  6.242513  7.402237
h  9.226572  3.179664
# Transposed view: row labels become column labels and vice versa.
print(df.transpose())
            a         b         c         d         e         f         g  \
one  2.129959  8.631212  6.262012  6.890305  6.883742  2.740878  6.242513   
two  1.827002  0.423903  3.851107  9.543065  3.643955  6.851490  7.402237   

            h  
one  9.226572  
two  3.179664  

添加與修改

# 4x4 frame of random floats.
df = pd.DataFrame(
    np.random.rand(4, 4),
    index=["one", "two", "three", "four"],
    columns=["a", "b", "c", "d"],
)
print(df, end="\n\n")

# Assigning to a row label that does not exist yet appends a new row.
df.loc["five"] = 100
print(df, end="\n\n")

# Assigning to a column label that does not exist yet appends a new column.
df["e"] = 10
print(df, end="\n\n")

# Assigning to an existing column overwrites that column.
df["e"] = 101
print(df, end="\n\n")

# Assigning to an existing row overwrites that row.
df.loc["five"] = 111
print(df, end="\n\n")
              a         b         c         d
one    0.708481  0.285426  0.355058  0.990070
two    0.199559  0.733047  0.322982  0.791169
three  0.198043  0.801163  0.356082  0.857501
four   0.430182  0.020549  0.896011  0.503088

                a           b           c           d
one      0.708481    0.285426    0.355058    0.990070
two      0.199559    0.733047    0.322982    0.791169
three    0.198043    0.801163    0.356082    0.857501
four     0.430182    0.020549    0.896011    0.503088
five   100.000000  100.000000  100.000000  100.000000

                a           b           c           d   e
one      0.708481    0.285426    0.355058    0.990070  10
two      0.199559    0.733047    0.322982    0.791169  10
three    0.198043    0.801163    0.356082    0.857501  10
four     0.430182    0.020549    0.896011    0.503088  10
five   100.000000  100.000000  100.000000  100.000000  10

                a           b           c           d    e
one      0.708481    0.285426    0.355058    0.990070  101
two      0.199559    0.733047    0.322982    0.791169  101
three    0.198043    0.801163    0.356082    0.857501  101
four     0.430182    0.020549    0.896011    0.503088  101
five   100.000000  100.000000  100.000000  100.000000  101

                a           b           c           d    e
one      0.708481    0.285426    0.355058    0.990070  101
two      0.199559    0.733047    0.322982    0.791169  101
three    0.198043    0.801163    0.356082    0.857501  101
four     0.430182    0.020549    0.896011    0.503088  101
five   111.000000  111.000000  111.000000  111.000000  111

刪除 del(刪除列)/drop(默認刪除行 指定axis=1刪除列)

# 4x4 frame of random floats.
df = pd.DataFrame(
    np.random.rand(4, 4),
    index=["one", "two", "three", "four"],
    columns=["a", "b", "c", "d"],
)
print(df, end="\n\n")

# `del` removes a column and modifies the frame in place.
del df["a"]
print(df)
              a         b         c         d
one    0.339979  0.577661  0.108308  0.482164
two    0.374043  0.102067  0.660970  0.786986
three  0.384832  0.076563  0.529472  0.358780
four   0.938592  0.852895  0.466709  0.938307

              b         c         d
one    0.577661  0.108308  0.482164
two    0.102067  0.660970  0.786986
three  0.076563  0.529472  0.358780
four   0.852895  0.466709  0.938307
# 4x4 frame of random floats.
df = pd.DataFrame(
    np.random.rand(4, 4),
    index=["one", "two", "three", "four"],
    columns=["a", "b", "c", "d"],
)
print(df, end="\n\n")

# drop() with a row label returns a new frame without that row.
d1 = df.drop("one")
print(d1, end="\n\n")

# The original frame is unchanged.
print(df)
              a         b         c         d
one    0.205438  0.324132  0.401131  0.368300
two    0.471426  0.671785  0.837956  0.097416
three  0.888816  0.451950  0.137032  0.568844
four   0.524813  0.448306  0.875787  0.479477

              a         b         c         d
two    0.471426  0.671785  0.837956  0.097416
three  0.888816  0.451950  0.137032  0.568844
four   0.524813  0.448306  0.875787  0.479477

              a         b         c         d
one    0.205438  0.324132  0.401131  0.368300
two    0.471426  0.671785  0.837956  0.097416
three  0.888816  0.451950  0.137032  0.568844
four   0.524813  0.448306  0.875787  0.479477
# 4x4 frame of random floats.
df = pd.DataFrame(
    np.random.rand(4, 4),
    index=["one", "two", "three", "four"],
    columns=["a", "b", "c", "d"],
)
print(df, end="\n\n")

# drop() with axis=1 returns a new frame without the named column.
d2 = df.drop("a", axis=1)
print(d2, end="\n\n")

# The original frame is unchanged.
print(df)
              a         b         c         d
one    0.939552  0.613218  0.357056  0.534264
two    0.110583  0.602123  0.990186  0.149132
three  0.756016  0.897848  0.176100  0.204789
four   0.655573  0.819009  0.094322  0.656406

              b         c         d
one    0.613218  0.357056  0.534264
two    0.602123  0.990186  0.149132
three  0.897848  0.176100  0.204789
four   0.819009  0.094322  0.656406

              a         b         c         d
one    0.939552  0.613218  0.357056  0.534264
two    0.110583  0.602123  0.990186  0.149132
three  0.756016  0.897848  0.176100  0.204789
four   0.655573  0.819009  0.094322  0.656406

排序

根據指定列的列值排序 同時列值所在的行也會跟着移動 .sort_values(['列'])

# Sort by a single column.
df = pd.DataFrame(
    np.random.rand(4, 4),
    columns=["a", "b", "c", "d"],
)
print(df, end="\n\n")

# Ascending order is the default; whole rows move with their "a" value.
print(df.sort_values(["a"]), end="\n\n")

# ascending=False sorts in descending order.
print(df.sort_values(["a"], ascending=False))
          a         b         c         d
0  0.616386  0.416094  0.072445  0.140167
1  0.263227  0.079205  0.520708  0.866316
2  0.665673  0.836688  0.733966  0.310229
3  0.405777  0.090530  0.991211  0.712312

          a         b         c         d
1  0.263227  0.079205  0.520708  0.866316
3  0.405777  0.090530  0.991211  0.712312
0  0.616386  0.416094  0.072445  0.140167
2  0.665673  0.836688  0.733966  0.310229

          a         b         c         d
2  0.665673  0.836688  0.733966  0.310229
0  0.616386  0.416094  0.072445  0.140167
3  0.405777  0.090530  0.991211  0.712312
1  0.263227  0.079205  0.520708  0.866316

根據索引排序 .sort_index()

# 4x4 frame whose integer row labels are deliberately out of order.
df = pd.DataFrame(
    np.random.rand(4, 4),
    index=[2, 1, 3, 0],
    columns=["a", "b", "c", "d"],
)
print(df, end="\n\n")

# Sort rows by their label; ascending is the default.
print(df.sort_index(), end="\n\n")

# ascending=False sorts the labels in descending order.
print(df.sort_index(ascending=False))
          a         b         c         d
2  0.669311  0.118176  0.635512  0.248388
1  0.752321  0.935779  0.572554  0.274019
3  0.701334  0.354684  0.592998  0.402686
0  0.548317  0.966295  0.191219  0.307908

          a         b         c         d
0  0.548317  0.966295  0.191219  0.307908
1  0.752321  0.935779  0.572554  0.274019
2  0.669311  0.118176  0.635512  0.248388
3  0.701334  0.354684  0.592998  0.402686

          a         b         c         d
3  0.701334  0.354684  0.592998  0.402686
2  0.669311  0.118176  0.635512  0.248388
1  0.752321  0.935779  0.572554  0.274019
0  0.548317  0.966295  0.191219  0.307908
# 4x4 frame whose single-letter row labels are out of order.
df = pd.DataFrame(
    np.random.rand(4, 4),
    index=["x", "z", "y", "t"],
    columns=["a", "b", "c", "d"],
)
print(df, end="\n\n")

# String labels are sorted alphabetically.
print(df.sort_index())
          a         b         c         d
x  0.717421  0.206383  0.757656  0.720580
z  0.969988  0.551812  0.210200  0.083031
y  0.956637  0.759216  0.350744  0.335287
t  0.846718  0.207411  0.936231  0.891330

          a         b         c         d
t  0.846718  0.207411  0.936231  0.891330
x  0.717421  0.206383  0.757656  0.720580
y  0.956637  0.759216  0.350744  0.335287
z  0.969988  0.551812  0.210200  0.083031
# 4x4 frame whose word row labels are out of order.
df = pd.DataFrame(
    np.random.rand(4, 4),
    index=["three", "one", "four", "two"],
    columns=["a", "b", "c", "d"],
)
print(df, end="\n\n")

# Word labels are compared as whole strings, character by character, not
# just by first letter: "three" sorts before "two" because "h" < "w".
print(df.sort_index())
              a         b         c         d
three  0.173818  0.902347  0.106037  0.303450
one    0.591793  0.526785  0.101916  0.884698
four   0.685250  0.364044  0.932338  0.668774
two    0.240763  0.260322  0.722891  0.634825

              a         b         c         d
four   0.685250  0.364044  0.932338  0.668774
one    0.591793  0.526785  0.101916  0.884698
three  0.173818  0.902347  0.106037  0.303450
two    0.240763  0.260322  0.722891  0.634825
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章