Pandas簡單使用2

Pandas處理丟失數據

使用dropna()除去NaN的行或列

dates = pd.date_range('20180901',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A', 'B', 'C', 'D'])
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
print(df)

print("除去異常值:")
# 使用pd.dropna()去掉有NaN的行或列
df1 = df.dropna(axis=1, how='any')  # ‘any’:只要存在NaN就drop掉;’all‘:必須全部是NaN才drop掉#0:對行進行操作;1:對列進行操作
print(df1)
             A     B     C   D
2018-09-01   0   NaN   2.0   3
2018-09-02   4   5.0   NaN   7
2018-09-03   8   9.0  10.0  11
2018-09-04  12  13.0  14.0  15
2018-09-05  16  17.0  18.0  19
2018-09-06  20  21.0  22.0  23
除去異常值:
             A   D
2018-09-01   0   3
2018-09-02   4   7
2018-09-03   8  11
2018-09-04  12  15
2018-09-05  16  19
2018-09-06  20  23

使用fillna指定值代替NaN

df2 = df.fillna(value=0)  # 使用0代替nan
print(df2)
	  A     B     C   D
2018-09-01   0   0.0   2.0   3
2018-09-02   4   5.0   0.0   7
2018-09-03   8   9.0  10.0  11
2018-09-04  12  13.0  14.0  15
2018-09-05  16  17.0  18.0  19
2018-09-06  20  21.0  22.0  23

##使用isnull判斷是否是NaN

                A      B      C      D
2018-09-01  False   True  False  False
2018-09-02  False  False   True  False
2018-09-03  False  False  False  False
2018-09-04  False  False  False  False
2018-09-05  False  False  False  False
2018-09-06  False  False  False  False

Pandas導入導出

pd.read_csv讀取csv文件

data = pd.read_csv('student.csv')   # 讀入csv
print(data)
print(data.index)
print(data.columns)
print(data.values)
print(data.describe())
姓名 性別 分數
0  李召 男 96
1   李明 男 89
2   王紅 女 95
RangeIndex(start=0, stop=3, step=1)
Index(['姓名 性別 分數'], dtype='object')
[['李召 男 96']
 ['李明 男 89']
 ['王紅 女 95']]
       姓名 性別 分數
count         3
unique        3
top     王紅 女 95
freq          1

將資料存取成pickle

data.to_pickle('student.pickle')    # 將資料存取成pickle

Pandas合併concat

pandas處理多組數據需要用到數據的合併處理,concat是一種基本的合併方式,concat中有很多參數可以調整。

axis方向合併

# 定義資料集
df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4))*2, columns=['a', 'b', 'c', 'd'])
# concat縱向
print(df1)
print(df2)
print(df3)
# concat縱向合併
res = pd.concat([df1, df2, df3], axis=0)
print(res)
	a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
     a    b    c    d
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
     a    b    c    d
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0

ignore_index

# 縱向合併,並重置index
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
print(res)
	a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0
7  2.0  2.0  2.0  2.0
8  2.0  2.0  2.0  2.0

join(合併方式)

# 定義資料集
df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])

# 縱向“外”合併df1和df2,擴展
res = pd.concat([df1, df2], axis=0, join='outer', sort=False)
print(res)

# 縱向“內”合併df1和df2,找交集
res = pd.concat([df1, df2], axis=0, join='inner')
print(res)
    a    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
3  0.0  0.0  0.0  0.0  NaN
2  NaN  1.0  1.0  1.0  1.0
3  NaN  1.0  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0  1.0
     b    c    d
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  0.0  0.0  0.0
2  1.0  1.0  1.0
3  1.0  1.0  1.0
4  1.0  1.0  1.0

join_axes(依照axes合併)

# 依照'df1.index'進行橫向合併
res = pd.concat([df1, df2], axis=1, join_axes=[df1.index])
print(res)

# 移除join_axes
res = pd.concat([df1, df2], axis=1)
print(res)
     a    b    c    d    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
     a    b    c    d    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0

append(添加數據)

# 定義資料集
df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4))*2, columns=['a', 'b', 'c', 'd'])
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])

# 將df2合併到df1的下面,以及重置index,並打印出結果
res = df1.append(df2, ignore_index=True)
print(res)

# 合併多個df,講df2與df3合併至df1的下面,以及重置index
res = df1.append([df2, df3], ignore_index=True)
print(res)

# 合併Series,將s1合併至df1,以及重置index,
res = df1.append(s1, ignore_index=True)
print(res)
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0
7  2.0  2.0  2.0  2.0
8  2.0  2.0  2.0  2.0
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  2.0  3.0  4.0

Pandas合併merge

pandas中的mergeconcat類似,但主要是用於兩組有key column的數據,統一索引的數據,通常也被用在Database的處理中

依據一組key合併

left = pd.DataFrame({
                'key': ['K0', 'K1', 'K2', 'K3'],
                'A': ['A0', 'A1', 'A2', 'A3'],
                'B': ['B0', 'B1', 'B2', 'B3']
                })
right = pd.DataFrame({
                'key': ['K0', 'K1', 'K2', 'K3'],
                'C': ['C0', 'C1', 'C2', 'C3'],
                'D': ['D0', 'D1', 'D2', 'D3']
                })
print(left)
print(right)
# 依據key_column合併
res = pd.merge(left, right, on='key')
print(res)
  key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A2  B2
3  K3  A3  B3
  key   C   D
0  K0  C0  D0
1  K1  C1  D1
2  K2  C2  D2
3  K3  C3  D3
  key   A   B   C   D
0  K0  A0  B0  C0  D0
1  K1  A1  B1  C1  D1
2  K2  A2  B2  C2  D2
3  K3  A3  B3  C3  D3

依據兩組key合併

left = pd.DataFrame({
                'key1': ['K0', 'K0', 'K1', 'K2'],
                'key2': ['K0', 'K1', 'K0', 'K1'],
                'A': ['A0', 'A1', 'A2', 'A3'],
                'B': ['B0', 'B1', 'B2', 'B3']
                })
right = pd.DataFrame({
                'key1': ['K0', 'K1', 'K1', 'K2'],
                'key2': ['K0', 'K0', 'K0', 'K0'],
                'C': ['C0', 'C1', 'C2', 'C3'],
                'D': ['D0', 'D1', 'D2', 'D3']
                })
print(left)
 key1 key2   A   B
0   K0   K0  A0  B0
1   K0   K1  A1  B1
2   K1   K0  A2  B2
3   K2   K1  A3  B3
print(right)
  key1 key2   C   D
0   K0   K0  C0  D0
1   K1   K0  C1  D1
2   K1   K0  C2  D2
3   K2   K0  C3  D3
# 依據key1與key2 column進行合併
res = pd.merge(left, right, on=['key1', 'key2'], how='inner')
print(res)
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2

res = pd.merge(left, right, on=['key1', 'key2'], how='outer')
print(res)
  key1 key2    A    B    C    D
0   K0   K0   A0   B0   C0   D0
1   K0   K1   A1   B1  NaN  NaN
2   K1   K0   A2   B2   C1   D1
3   K1   K0   A2   B2   C2   D2
4   K2   K1   A3   B3  NaN  NaN
5   K2   K0  NaN  NaN   C3   D3

res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print(res)
  key1 key2   A   B    C    D
0   K0   K0  A0  B0   C0   D0
1   K0   K1  A1  B1  NaN  NaN
2   K1   K0  A2  B2   C1   D1
3   K1   K0  A2  B2   C2   D2
4   K2   K1  A3  B3  NaN  NaN

res = pd.merge(left, right, on=['key1', 'key2'], how='right')
print(res)
  key1 key2    A    B   C   D
0   K0   K0   A0   B0  C0  D0
1   K1   K0   A2   B2  C1  D1
2   K1   K0   A2   B2  C2  D2
3   K2   K0  NaN  NaN  C3  D3

Indicator

Indicator=True會將合併的記錄放在新的一列

df1 = pd.DataFrame({
            'col1': [0, 1], 'col_left': ['a', 'b']
            })
df2 = pd.DataFrame({
            'col1': [1, 2, 2], 'col_right': [2, 2, 2]
            })
print(df1)
print(df2)
   col1 col_left
0     0        a
1     1        b
   col1  col_right
0     1          2
1     2          2
2     2          2

# 依據col1進行合併,並啓用indicator=True
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
print(res)
   col1 col_left  col_right      _merge
0     0        a        NaN   left_only
1     1        b        2.0        both
2     2      NaN        2.0  right_only
3     2      NaN        2.0  right_only

# 自定義indicator column的名稱
res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
print(res)
   col1 col_left  col_right indicator_column
0     0        a        NaN        left_only
1     1        b        2.0             both
2     2      NaN        2.0       right_only
3     2      NaN        2.0       right_only

依據index合併

# 定義資料集
left = pd.DataFrame({
            'A': ['A0', 'A1', 'A2'],
            'B': ['B0', 'B1', 'B2']},
            index=['K0', 'K1', 'K2'])
right = pd.DataFrame({
            'C': ['C0', 'C1', 'C2'],
            'D': ['D0', 'D1', 'D2']},
            index=['K0', 'K2', 'K3']
            )
print(left)
print(right)
     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2
     C   D
K0  C0  D0
K2  C1  D1
K3  C2  D2

# 依據左右資料集的index進行合併,how='outer'
res = pd.merge(left, right, left_index=True, right_index=True,  how='outer')
print(res)
      A    B    C    D
K0   A0   B0   C0   D0
K1   A1   B1  NaN  NaN
K2   A2   B2   C1   D1
K3  NaN  NaN   C2   D2

# 依據左右資料集的index進行合併,how='inner'
res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
print(res)
     A   B   C   D
K0  A0  B0  C0  D0
K2  A2  B2  C1  D1

解決overlapping

# 解決overlapping的問題
boys = pd.DataFrame({'K': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'K': ['K0', 'K1', 'K2'], 'age': [4, 5, 6]})

print(boys)
print(girls)
    K  age
0  K0    1
1  K1    2
2  K2    3
    K  age
0  K0    4
1  K1    5
2  K2    6

# 使用suffixes解決overlapping問題
res = pd.merge(boys, girls, on='K', suffixes=['_boy', '_girl'], how='inner')
print(res)
    K  age_boy  age_girl
0  K0        1         4
1  K1        2         5
2  K2        3         6

Pandas plot繪圖

創建一個Series

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 隨機生成1000個數據
data = pd.Series(np.random.randn(1000), index=np.arange(1000))

# 累加數據
data1 = data.cumsum()

# pandas數據可以直接觀察可視化形式
data1.plot()
plt.show()

在這裏插入圖片描述

Dataframe可視化

data = pd.DataFrame(
        np.random.randn(1000, 4),
        index=np.arange(1000),
        columns=list("ABCD")
        )
data1 = data.cumsum()
data1.plot()
plt.show()

在這裏插入圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章