Computing Mean, Variance, Skewness, and Kurtosis with PySpark

Python implementation
import math

import numpy as np

def calc(data):
    n = len(data)  # number of samples (10000 here)
    niu = 0.0   # mean, i.e. the expectation E[X]
    niu2 = 0.0  # mean of squares, E[X^2]
    niu3 = 0.0  # mean of cubes, E[X^3]
    for a in data:
        niu += a
        niu2 += a ** 2
        niu3 += a ** 3
    niu /= n
    niu2 /= n
    niu3 /= n
    sigma = math.sqrt(niu2 - niu * niu)  # standard deviation: sqrt(E[X^2] - E[X]^2)
    return [niu, sigma, niu3]

def calc_stat(data):
    [niu, sigma, niu3] = calc(data)
    n = len(data)
    niu4 = 0.0  # numerator of the kurtosis formula, E[(X - mu)^4]
    for a in data:
        a -= niu
        niu4 += a ** 4
    niu4 /= n

    # skewness: E[(X - mu)^3] / sigma^3, expanded to (E[X^3] - 3*mu*sigma^2 - mu^3) / sigma^3
    skew = (niu3 - 3 * niu * sigma ** 2 - niu ** 3) / (sigma ** 3)
    # kurtosis: E[(X - mu)^4] / sigma^4; the denominator is the variance squared,
    # i.e. the standard deviation to the fourth power
    kurt = niu4 / (sigma ** 4)
    return [niu, sigma, skew, kurt]

if __name__ == "__main__":
    data = list(np.random.randn(10000))  # 10000 samples from a Gaussian distribution
    [niu, sigma, skew, kurt] = calc_stat(data)
    print(niu, sigma, skew, kurt)
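
As a quick sanity check (a sketch, not part of the original post, and assuming scipy is installed), the result can be compared against scipy.stats: with scipy's default bias=True and with fisher=False, scipy uses the same population formulas as calc_stat above.

from scipy import stats

check = np.random.randn(10000)
[niu, sigma, skew, kurt] = calc_stat(list(check))
print(skew, stats.skew(check))                    # both close to 0 for Gaussian data
print(kurt, stats.kurtosis(check, fisher=False))  # both close to 3 for Gaussian data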
PySpark implementation

Note that the kurtosis Spark computes is excess kurtosis: ordinary kurtosis minus the constant 3 (the kurtosis of a normal distribution), which redefines the kurtosis of a normal distribution as 0. For the underlying concepts, see this post: https://blog.csdn.net/qq_36653505/article/details/86618648. A reconciliation of the two implementations is sketched after the example below.

import numpy as np
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import mean, stddev, skewness, kurtosis

conf = SparkConf()
sc = SparkContext(appName='Spark_feature_transform', conf=conf)
sqlContext = SQLContext(sc)

data = np.random.randn(10000).tolist()
dd = [(i,) for i in data]  # wrap each value in a tuple so it becomes a one-column row
ddf = sqlContext.createDataFrame(dd, ['num'])
ddf.select(mean('num').alias('mean'),
           stddev('num').alias('stddev'),
           skewness('num').alias('skewness'),
           kurtosis('num').alias('kurtosis')).show()
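
To tie the two halves together (a sketch, not from the original post, assuming calc_stat from the Python implementation above is available in the same session): subtracting the constant 3 from the pure-Python kurt should land close to Spark's excess kurtosis. One caveat: Spark's stddev is the sample standard deviation (dividing by n - 1), whereas calc uses the population formula (dividing by n); for 10000 samples the gap is negligible.

row = ddf.select(kurtosis('num').alias('kurtosis')).first()
[niu, sigma, skew, kurt] = calc_stat(data)
print(kurt - 3, row['kurtosis'])  # both should be close to 0 for Gaussian data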