python实现
import math
import numpy as np
def calc(data):
    """Return [mean, standard deviation, E[X^3]] for the sample *data*.

    niu  is the sample mean E[X].
    niu2 is the raw second moment E[X^2], used to derive the variance.
    niu3 is the raw third moment E[X^3], consumed later by the skewness
    formula in calc_stat.
    """
    n = len(data)  # sample size (e.g. 10000 draws in the demo below)
    niu = 0.0
    niu2 = 0.0
    niu3 = 0.0
    for a in data:
        niu += a
        niu2 += a**2
        niu3 += a**3
    niu /= n
    niu2 /= n
    niu3 /= n
    # Fix: clamp the variance at 0. For (near-)constant data, floating-point
    # rounding can make niu2 - niu*niu come out slightly negative, which
    # would make math.sqrt raise ValueError.
    sigma = math.sqrt(max(niu2 - niu * niu, 0.0))
    return [niu, sigma, niu3]
def calc_stat(data):
    """Compute [mean, std-dev, skewness, kurtosis] for the sample *data*.

    Skewness is E[(X-mu)^3] / sigma^3, expanded in raw moments; kurtosis
    is the plain (non-excess) E[(X-mu)^4] / sigma^4, so a Gaussian sample
    lands near 3 here rather than near 0.
    """
    mean, sigma, raw3 = calc(data)
    n = len(data)
    # Fourth central moment E[(X - mean)^4] — numerator of the kurtosis.
    central4 = sum((x - mean) ** 4 for x in data) / n
    # Skewness via the raw-moment expansion of E[(X - mean)^3].
    skew = (raw3 - 3 * mean * sigma ** 2 - mean ** 3) / sigma ** 3
    # Kurtosis: sigma**4 is the variance squared.
    kurt = central4 / sigma ** 4
    return [mean, sigma, skew, kurt]
if __name__ == "__main__":
    # Draw 10000 samples from a standard normal distribution and print
    # their mean, standard deviation, skewness and (non-excess) kurtosis.
    samples = list(np.random.randn(10000))
    mean, sigma, skew, kurt = calc_stat(samples)
    print(mean, sigma, skew, kurt)
pyspark实现
Spark里计算的峰度是超额峰度(即普通峰度减去常数3,这样做的目的是让正态分布的峰度重新定义为0),具体概念参考另一篇博文:https://blog.csdn.net/qq_36653505/article/details/86618648
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import mean, stddev, skewness, kurtosis
import numpy as np

# Fix: the original snippet used `conf`, `SQLContext` and `np` without ever
# defining or importing them, so it crashed with NameError on any run.
conf = SparkConf()
sc = SparkContext(appName='Spark_feature_transform', conf=conf)
sqlContext = SQLContext(sc)

# 10000 standard-normal samples, wrapped as 1-tuples so they become a
# single-column DataFrame named 'num'.
data = np.random.randn(10000).tolist()
dd = [(i,) for i in data]
ddf = sqlContext.createDataFrame(dd, ['num'])
# NOTE: Spark's kurtosis() returns the *excess* kurtosis (ordinary kurtosis
# minus 3), so a Gaussian sample reports ~0 here instead of ~3.
ddf.select(mean('num').alias('mean'), stddev('num').alias('stddev'),
           skewness('num').alias('skewness'),
           kurtosis('num').alias('kurtosis')).show()
参考
https://blog.csdn.net/u013555719/article/details/78530879
https://blog.csdn.net/suzyu12345/article/details/79673473