python實現
import math
import numpy as np
def calc(data):
    """Return [mean, standard deviation, third raw moment] of *data*.

    Uses math.fsum for numerically accurate accumulation, and clamps the
    variance at zero before the square root: with plain float arithmetic,
    niu2 - niu*niu can come out as a tiny negative number (catastrophic
    cancellation), which would make math.sqrt raise ValueError.
    """
    n = len(data)
    niu = math.fsum(data) / n          # mean (expected value)
    niu2 = math.fsum(a * a for a in data) / n   # mean of squares
    niu3 = math.fsum(a ** 3 for a in data) / n  # mean of cubes (third raw moment)
    # Clamp to 0.0 to guard against floating-point cancellation.
    sigma = math.sqrt(max(niu2 - niu * niu, 0.0))
    return [niu, sigma, niu3]
def calc_stat(data):
    """Return [mean, standard deviation, skewness, kurtosis] of *data*.

    Skewness is derived from the raw moments produced by calc(); kurtosis
    is the plain (non-excess) kind, so a normal distribution yields ~3.
    """
    niu, sigma, niu3 = calc(data)
    n = len(data)
    # Fourth central moment: numerator of the kurtosis formula.
    niu4 = sum((a - niu) ** 4 for a in data) / n
    # Third central moment expressed through raw moments, over sigma^3.
    skew = (niu3 - 3 * niu * sigma ** 2 - niu ** 3) / (sigma ** 3)
    # Divide by sigma^4, i.e. the square of the variance.
    kurt = niu4 / (sigma ** 4)
    return [niu, sigma, skew, kurt]
if __name__ == "__main__":
    # Demo: 10000 samples drawn from a standard normal distribution.
    samples = np.random.randn(10000).tolist()
    mean_v, std_v, skew_v, kurt_v = calc_stat(samples)
    print(mean_v, std_v, skew_v, kurt_v)
pyspark實現
Spark計算的峯度是「超額峯度」:即峯度減去常峯度3,使正態分佈的峯度被重新定義爲0。具體概念參考另一篇博文:https://blog.csdn.net/qq_36653505/article/details/86618648
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext  # BUG FIX: SQLContext was used below but never imported
from pyspark.sql.functions import mean, stddev, skewness, kurtosis

# BUG FIX: the original passed `conf=conf` without ever creating a SparkConf.
conf = SparkConf()
sc = SparkContext(appName='Spark_feature_transform', conf=conf)
sqlContext = SQLContext(sc)

# 10000 standard-normal samples, wrapped as 1-tuples so they form one column.
data = np.random.randn(10000).tolist()
dd = [(i,) for i in data]
ddf = sqlContext.createDataFrame(dd, ['num'])
# NOTE: Spark's kurtosis() returns EXCESS kurtosis (normal distribution -> 0),
# unlike the pure-Python version above which returns ~3 for a normal sample.
ddf.select(mean('num').alias('mean'),
           stddev('num').alias('stddev'),
           skewness('num').alias('skewness'),
           kurtosis('num').alias('kurtosis')).show()
參考
https://blog.csdn.net/u013555719/article/details/78530879
https://blog.csdn.net/suzyu12345/article/details/79673473