#導入所需的包
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
# Load the dataset from the Excel workbook, then call describe() to inspect
# the distribution of its values.
data = pd.read_excel(io='./Desktop/實踐一.xlsx')
data.describe()
out:
data
count 699.000000
mean 4.417740
std 2.815741
min 1.000000
25% 2.000000
50% 4.000000
75% 6.000000
max 10.000000
# Convert the 'data' column to a plain list to simplify the calculations below.
data1 = list(data['data'])
# Compute each indicator with numpy / scipy.stats.
print('平均數:', np.mean(data1))
print('下四分位數:', np.quantile(data1, 0.25))
print('中位數:', np.median(data1))
print('上四分位數:', np.quantile(data1, 0.75))
# stats.mode returns length-1 arrays on older SciPy but plain scalars for 1-D
# input on newer releases (keepdims defaults to False since SciPy 1.11), where
# the former `[0][0]` indexing fails. np.atleast_1d accepts both shapes.
print('衆數:', np.atleast_1d(stats.mode(data1)[0])[0])
# np.std uses ddof=0 by default (population standard deviation), which is why
# it differs slightly from the sample std reported by DataFrame.describe().
print('標準差:', np.std(data1))
print('偏態係數:', stats.skew(data1))
# stats.kurtosis uses Fisher's definition by default (normal distribution == 0).
print('峯態係數:', stats.kurtosis(data1))
out:
平均數: 4.417739628040057
下四分位數: 2.0
中位數: 4.0
上四分位數: 6.0
衆數: 1
標準差: 2.8137258170785375
偏態係數: 0.5915855449527385
峯態係數: -0.6278342838815454
# A hand-written function that reproduces the statistics computed above.
def get_data_describe(data):
    """Print descriptive statistics for a collection of numbers.

    The mean, median, variance, standard deviation, skewness and excess
    kurtosis are printed, one per line. Variance and standard deviation are
    the population versions (divided by ``n``, not ``n - 1``); skewness and
    kurtosis are the moment-based estimates built on that population
    standard deviation, with 3 subtracted from the kurtosis.

    Args:
        data: A non-empty iterable of real numbers. It is not modified.

    Returns:
        None. The results are written to standard output.

    Raises:
        ValueError: If ``data`` contains no values.
    """
    ordered = sorted(data)
    n = len(ordered)
    if n == 0:
        raise ValueError('data must contain at least one value')

    data_mean = np.sum(ordered) / n

    # For an even count the median is the mean of the two central values,
    # which sit at indices n/2 - 1 and n/2 of the sorted data.
    mid = n // 2
    if n % 2 == 0:
        data_median = (ordered[mid - 1] + ordered[mid]) / 2
    else:
        data_median = ordered[mid]

    deviations = np.asarray(ordered, dtype=float) - data_mean
    data_sd = np.mean(deviations ** 2)
    data_std = np.sqrt(data_sd)

    if data_std == 0:
        # All values are identical: the standardised moments are 0/0, so
        # report NaN explicitly instead of triggering a division warning.
        data_sk = float('nan')
        data_kurt = float('nan')
    else:
        data_sk = np.mean(deviations ** 3) / data_std ** 3
        data_kurt = np.mean(deviations ** 4) / data_std ** 4 - 3

    print('平均數:', data_mean)
    print('中位數:', data_median)
    print('方差:', data_sd)
    print('標準差:', data_std)
    print('偏態係數:', data_sk)
    print('峯態係數:', data_kurt)
# Run the hand-written implementation on the same list of values so its
# printed results can be compared with the numpy / scipy figures.
get_data_describe(data1)
out:
平均數: 4.417739628040057
中位數: 4
方差: 7.9170529736942825
標準差: 2.8137258170785375
偏態係數: 0.5915855449527385
峯態係數: -0.6278342838815454