Python數據分析實戰【第三章】2.14-數值計算和統計基礎【python】

【課程2.14】 數值計算和統計基礎

常用數學、統計方法

1.基本參數:axis、skipna


import numpy as np
import pandas as pd

# Demo frame: 'key1' and 'key2' are float columns that each hold one NaN;
# 'key3' mixes ints and strings, so pandas stores it with object dtype.
df = pd.DataFrame({'key1':[4,5,3,np.nan,2],
                 'key2':[1,2,np.nan,4,5],
                 'key3':[1,2,3,'j','k']},
                 index = ['a','b','c','d','e'])
print(df)
print(df['key1'].dtype,df['key2'].dtype,df['key3'].dtype)
print('-----')

# np.nan marks a missing value.
# .mean() computes the arithmetic mean of each column.
# numeric_only=True restricts the reduction to the numeric columns. Older
# pandas dropped the object column 'key3' silently; pandas >= 2.0 raises a
# TypeError on it unless numeric_only=True is passed explicitly.
# A single column can also be reduced on its own by selecting it first.
m1 = df.mean(numeric_only=True)
print(m1,type(m1))
print('單獨統計一列:',df['key2'].mean())
print('-----')

# axis parameter: the default 0 reduces down each column; axis=1 reduces
# across each row, so this yields one mean per index label.
m2 = df.mean(axis=1, numeric_only=True)
print(m2)
print('-----')

# skipna parameter: whether NaN is ignored (default True). With
# skipna=False, any column that contains a NaN produces NaN as its result.
m3 = df.mean(skipna=False, numeric_only=True)
print(m3)
print('-----')
----------------------------------------------------------------------
# 基本參數:axis、skipna

import numpy as np
import pandas as pd

# Demo frame: 'key1' and 'key2' are float columns that each hold one NaN;
# 'key3' mixes ints and strings, so pandas stores it with object dtype.
df = pd.DataFrame({'key1':[4,5,3,np.nan,2],
                 'key2':[1,2,np.nan,4,5],
                 'key3':[1,2,3,'j','k']},
                 index = ['a','b','c','d','e'])
print(df)
print(df['key1'].dtype,df['key2'].dtype,df['key3'].dtype)
print('-----')

# np.nan marks a missing value.
# .mean() computes the arithmetic mean of each column.
# numeric_only=True restricts the reduction to the numeric columns. Older
# pandas dropped the object column 'key3' silently; pandas >= 2.0 raises a
# TypeError on it unless numeric_only=True is passed explicitly.
# A single column can also be reduced on its own by selecting it first.
m1 = df.mean(numeric_only=True)
print(m1,type(m1))
print('單獨統計一列:',df['key2'].mean())
print('-----')

# axis parameter: the default 0 reduces down each column; axis=1 reduces
# across each row, so this yields one mean per index label.
m2 = df.mean(axis=1, numeric_only=True)
print(m2)
print('-----')

# skipna parameter: whether NaN is ignored (default True). With
# skipna=False, any column that contains a NaN produces NaN as its result.
m3 = df.mean(skipna=False, numeric_only=True)
print(m3)
print('-----')
----------------------------------------------------------------------

2.主要數學計算方法,可用於Series和DataFrame(1)

# Ten-row sample frame: 'key1' is the integers 0..9 and 'key2' is ten
# uniform random floats scaled by 10.
df = pd.DataFrame({
    'key1': np.arange(10),
    'key2': np.random.rand(10) * 10,
})
print(df)
print('-----')

# Each statistic is computed first and then printed next to its caption.

# count: number of non-NA entries per column.
non_na_counts = df.count()
print(non_na_counts, '→ count統計非Na值的數量\n')

# min over every column, max over the single column 'key2'.
column_minimums = df.min()
key2_maximum = df['key2'].max()
print(column_minimums, '→ min統計最小值\n', key2_maximum, '→ max統計最大值\n')

# quantile: q selects the position (0.75 is the upper quartile).
upper_quartiles = df.quantile(q=0.75)
print(upper_quartiles, '→ quantile統計分位數,參數q確定位置\n')

# sum and mean per column.
column_totals = df.sum()
print(column_totals, '→ sum求和\n')
column_means = df.mean()
print(column_means, '→ mean求平均值\n')

# median: the 50% quantile.
column_medians = df.median()
print(column_medians, '→ median求算數中位數,50%分位數\n')

# std and var: standard deviation and variance.
column_std = df.std()
column_var = df.var()
print(column_std, '\n', column_var, '→ std,var分別求標準差,方差\n')

# skew and kurt: sample skewness and kurtosis.
column_skew = df.skew()
print(column_skew, '→ skew樣本的偏度\n')
column_kurt = df.kurt()
print(column_kurt, '→ kurt樣本的峯度\n')
----------------------------------------------------------------------
   key1      key2
0     0  4.667989
1     1  4.336625
2     2  0.746852
3     3  9.670919
4     4  8.732045
5     5  0.013751
6     6  8.963752
7     7  0.279303
8     8  8.586821
9     9  8.899657
-----
key1    10
key2    10
dtype: int64 → count統計非Na值的數量

key1    0.000000
key2    0.013751
dtype: float64 → min統計最小值
 9.67091932107 → max統計最大值

key1    6.750000
key2    8.857754
dtype: float64 → quantile統計分位數,參數q確定位置

key1    45.000000
key2    54.897714
dtype: float64 → sum求和

key1    4.500000
key2    5.489771
dtype: float64 → mean求平均值

key1    4.500000
key2    6.627405
dtype: float64 → median求算數中位數,50%分位數

key1    3.027650
key2    3.984945
dtype: float64 
 key1     9.166667
key2    15.879783
dtype: float64 → std,var分別求標準差,方差

key1    0.000000
key2   -0.430166
dtype: float64 → skew樣本的偏度

key1   -1.200000
key2   -1.800296
dtype: float64 → kurt樣本的峯度

3.主要數學計算方法,可用於Series和DataFrame(2)



# cumsum: running total of each source column, stored as new '_s' columns.
df['key1_s'], df['key2_s'] = df['key1'].cumsum(), df['key2'].cumsum()
print(df, '→ cumsum樣本的累計和\n')

# cumprod: running product of each source column, stored as new '_p' columns.
df['key1_p'], df['key2_p'] = df['key1'].cumprod(), df['key2'].cumprod()
print(df, '→ cumprod樣本的累計積\n')

# cummax / cummin: running maximum and minimum. They are called on the whole
# frame, so every column is processed, including the original key1 and key2.
running_max = df.cummax()
running_min = df.cummin()
print(running_max, '\n', running_min, '→ cummax,cummin分別求累計最大值,累計最小值\n')
----------------------------------------------------------------------
   key1      key2  key1_s     key2_s
0     0  4.667989       0   4.667989
1     1  4.336625       1   9.004614
2     2  0.746852       3   9.751466
3     3  9.670919       6  19.422386
4     4  8.732045      10  28.154431
5     5  0.013751      15  28.168182
6     6  8.963752      21  37.131934
7     7  0.279303      28  37.411236
8     8  8.586821      36  45.998057
9     9  8.899657      45  54.897714 → cumsum樣本的累計和

   key1      key2  key1_s     key2_s  key1_p       key2_p
0     0  4.667989       0   4.667989       0     4.667989
1     1  4.336625       1   9.004614       0    20.243318
2     2  0.746852       3   9.751466       0    15.118767
3     3  9.670919       6  19.422386       0   146.212377
4     4  8.732045      10  28.154431       0  1276.733069
5     5  0.013751      15  28.168182       0    17.556729
6     6  8.963752      21  37.131934       0   157.374157
7     7  0.279303      28  37.411236       0    43.955024
8     8  8.586821      36  45.998057       0   377.433921
9     9  8.899657      45  54.897714       0  3359.032396 → cumprod樣本的累計積

   key1      key2  key1_s     key2_s  key1_p       key2_p
0   0.0  4.667989     0.0   4.667989     0.0     4.667989
1   1.0  4.667989     1.0   9.004614     0.0    20.243318
2   2.0  4.667989     3.0   9.751466     0.0    20.243318
3   3.0  9.670919     6.0  19.422386     0.0   146.212377
4   4.0  9.670919    10.0  28.154431     0.0  1276.733069
5   5.0  9.670919    15.0  28.168182     0.0  1276.733069
6   6.0  9.670919    21.0  37.131934     0.0  1276.733069
7   7.0  9.670919    28.0  37.411236     0.0  1276.733069
8   8.0  9.670919    36.0  45.998057     0.0  1276.733069
9   9.0  9.670919    45.0  54.897714     0.0  3359.032396 
    key1      key2  key1_s    key2_s  key1_p    key2_p
0   0.0  4.667989     0.0  4.667989     0.0  4.667989
1   0.0  4.336625     0.0  4.667989     0.0  4.667989
2   0.0  0.746852     0.0  4.667989     0.0  4.667989
3   0.0  0.746852     0.0  4.667989     0.0  4.667989
4   0.0  0.746852     0.0  4.667989     0.0  4.667989
5   0.0  0.013751     0.0  4.667989     0.0  4.667989
6   0.0  0.013751     0.0  4.667989     0.0  4.667989
7   0.0  0.013751     0.0  4.667989     0.0  4.667989
8   0.0  0.013751     0.0  4.667989     0.0  4.667989
9   0.0  0.013751     0.0  4.667989     0.0  4.667989 → cummax,cummin分別求累計最大值,累計最小值

4.唯一值:.unique()



# A Series of single characters, several of which repeat.
s = pd.Series(list('asdvasdcfgg'))
print(s)

# .unique() returns the distinct values as an array, not as a Series.
sq = s.unique()
print(sq, type(sq))

# Wrapping that array in pd.Series turns it back into a new Series.
print(pd.Series(sq))

# Sort the array of unique values in place, then show the new order.
sq.sort()
print(sq)
----------------------------------------------------------------------
0     a
1     s
2     d
3     v
4     a
5     s
6     d
7     c
8     f
9     g
10    g
dtype: object
['a' 's' 'd' 'v' 'c' 'f' 'g'] <class 'numpy.ndarray'>
0    a
1    s
2    d
3    v
4    c
5    f
6    g
dtype: object
['a' 'c' 'd' 'f' 'g' 's' 'v']

5.值計數:.value_counts()

# value_counts() builds a new Series that maps each distinct value of `s`
# to the number of times it occurs.
# sort parameter: whether to order the result by count; defaults to True.
# Older pandas also offered the function form: pd.value_counts(s, sort=False)
sc = s.value_counts(sort=False)
print(sc)
----------------------------------------------------------------------
s    2
d    2
v    1
c    1
a    2
g    2
f    1
dtype: int64

6.成員資格:.isin()

# Inputs: an integer Series 10..14 and a frame with one letter column and
# one integer column 4..12.
s = pd.Series(np.arange(10, 15))
df = pd.DataFrame({
    'key1': list('asdcbvasd'),
    'key2': np.arange(4, 13),
})
print(s)
print(df)
print('-----')

# .isin() takes the candidate values as a list and returns a boolean
# Series (or DataFrame) of the same shape, True where an element is
# among the candidates.
series_membership = s.isin([5, 14])
print(series_membership)
frame_membership = df.isin(['a', 'bc', '10', 8])
print(frame_membership)
----------------------------------------------------------------------
0    10
1    11
2    12
3    13
4    14
dtype: int32
  key1  key2
0    a     4
1    s     5
2    d     6
3    c     7
4    b     8
5    v     9
6    a    10
7    s    11
8    d    12
-----
0    False
1    False
2    False
3    False
4     True
dtype: bool
    key1   key2
0   True  False
1  False  False
2  False  False
3  False  False
4  False   True
5  False  False
6   True  False
7  False  False
8  False  False
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章