1. Spark MLlib SummaryStatistic summary statistics (key topic)
Statistics.colStats provides column summary statistics for an RDD[Vector].
- RDD-based operations import from the mllib package
- DataFrame-based operations import from the ml package (a DataFrame sketch follows the RDD example below)
The available statistics include the mean, variance, maximum, minimum, and so on.
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * @author liu a fu
 * @date 2021/1/27 0027
 * @version 1.0
 * @DESC column summary statistics via Statistics.colStats on an RDD[Vector]
 */
object _06SummeryTest {
  def main(args: Array[String]): Unit = {
    //1 - set up the environment
    val conf: SparkConf = new SparkConf().setAppName(this.getClass.getSimpleName.stripSuffix("$")).setMaster("local[*]")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    //2 - read the data
    val path = "C:\\software\\studysoft\\BigdataCode\\Spark_Code\\Spark_Mllib\\data\\libsvm\\testSummary.txt"
    val dataRDD: RDD[Vector] = sc.textFile(path).map(_.split("\\s+").map(_.toDouble)).map(x => Vectors.dense(x))
    dataRDD.foreach(println(_))
    val summery: MultivariateStatisticalSummary = Statistics.colStats(dataRDD)
    println(s"non zeros: ${summery.numNonzeros}")
    println(s"min value: ${summery.min}")             //minimum of each column
    println(s"max value: ${summery.max}")             //maximum of each column
    println(s"mean value: ${summery.mean}")           //mean of each column
    println(s"variance value: ${summery.variance}")   //variance of each column
    println("=" * 100)
    val dataRDD1: RDD[Vector] = sc.parallelize(Seq(
      Vectors.dense(1.0, 10.0, 100.0),
      Vectors.dense(2.0, 20.0, 200.0),
      Vectors.dense(3.0, 30.0, 300.0)
    ))
    val summery1: MultivariateStatisticalSummary = Statistics.colStats(dataRDD1)
    println(s"non zeros: ${summery1.numNonzeros}")    //number of nonzeros in each column
    println(s"min value: ${summery1.min}")            //minimum of each column; all statistics are computed column-wise
    println(s"max value: ${summery1.max}")
    println(s"mean value: ${summery1.mean}")
    println(s"variance value: ${summery1.variance}")
  }
}
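For the DataFrame API mentioned in the bullet list above, the same statistics are available through org.apache.spark.ml.stat.Summarizer (Spark 2.3+). The following is a minimal sketch on the same three vectors; the object name SummarizerSketch is made up for illustration:
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.stat.Summarizer
import org.apache.spark.sql.SparkSession

object SummarizerSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("SummarizerSketch").getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    import spark.implicits._

    //same three vectors as the RDD example above, wrapped in a DataFrame
    val df = Seq(
      Vectors.dense(1.0, 10.0, 100.0),
      Vectors.dense(2.0, 20.0, 200.0),
      Vectors.dense(3.0, 30.0, 300.0)
    ).map(Tuple1.apply).toDF("features")

    //Summarizer computes all requested column statistics in a single pass
    df.select(Summarizer.metrics("mean", "variance", "min", "max", "numNonZeros")
        .summary($"features").as("summary"))
      .show(false)

    spark.stop()
  }
}
Listing the metrics up front lets Summarizer gather everything in one pass over the data instead of one job per statistic.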
2. Spark MLlib Correlation coefficients in detail (for reference)
Pearson coefficient:
- The Pearson correlation coefficient is essentially a standardized cosine similarity: center each series by its mean and then take the cosine of the centered vectors, as illustrated in the sketch below.
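To make this concrete, here is a small plain-Scala sketch (not from the original notes; the object name is made up) that computes the Pearson correlation as the cosine similarity of two mean-centered series. On the same data as the Spark example below it also yields 1.0:
object PearsonAsCenteredCosine {
  def main(args: Array[String]): Unit = {
    val x = Array(1.0, 2.0, 3.0, 4.0, 5.0)
    val y = Array(11.0, 22.0, 33.0, 44.0, 55.0)
    //subtract the mean from each series
    def center(a: Array[Double]): Array[Double] = {
      val m = a.sum / a.length
      a.map(_ - m)
    }
    val cx = center(x)
    val cy = center(y)
    //cosine similarity of the centered vectors = Pearson correlation
    val dot = cx.zip(cy).map { case (a, b) => a * b }.sum
    val norms = math.sqrt(cx.map(v => v * v).sum) * math.sqrt(cy.map(v => v * v).sum)
    println(dot / norms) //~1.0, matching Statistics.corr below
  }
}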
RDD-based (mllib) code:
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * @author liu a fu
 * @date 2021/1/27 0027
 * @version 1.0
 * @DESC Pearson correlation coefficient with the RDD-based mllib API
 */
object _07PearsonTest {
  def main(args: Array[String]): Unit = {
    //1 - set up the environment
    val conf: SparkConf = new SparkConf().setAppName(this.getClass.getSimpleName.stripSuffix("$")).setMaster("local[8]")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    //2 - prepare the data
    val seriesX: RDD[Double] = sc.parallelize(Array(1, 2, 3, 4, 5))
    //must have the same number of partitions and cardinality as seriesX
    val seriesY: RDD[Double] = sc.parallelize(Array(11, 22, 33, 44, 55))
    val dataRDD1: RDD[Vector] = sc.parallelize(Seq(
      Vectors.dense(1.0, 10.0, 100.0),
      Vectors.dense(2.0, 20.0, 200.0),
      Vectors.dense(3.0, 30.0, 300.0)
    ))
    //correlation between two RDD[Double] series (Pearson by default)
    val corr: Double = Statistics.corr(seriesX, seriesY)
    println(corr) //0.9999999999999999
    //pairwise correlation matrix of the columns of an RDD[Vector]
    val matrix: Matrix = Statistics.corr(dataRDD1)
    println(matrix)
    /**
     * 1.0 1.0 1.0
     * 1.0 1.0 1.0
     * 1.0 1.0 1.0
     */
  }
}
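Statistics.corr also accepts an explicit method name; besides the default "pearson", "spearman" rank correlation is supported. A minimal, hypothetical continuation reusing seriesX and seriesY from the code above:
//Spearman rank correlation on the same two series
val spearman: Double = Statistics.corr(seriesX, seriesY, "spearman")
println(spearman)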
DataFrame-based (ml) code:
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
 * DESC: Pearson correlation computed with the DataFrame-based ml API (Correlation.corr)
 */
object testMlSummary2 {
  def main(args: Array[String]): Unit = {
    //1 - set up the environment
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("testMlSummary2").getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    import spark.implicits._
    //2 - load the data
    val data = Seq(
      Vectors.sparse(4, Seq((0, 1.0), (3, -2.0))),
      Vectors.dense(4.0, 5.0, 0.0, 3.0),
      Vectors.dense(6.0, 7.0, 0.0, 8.0),
      Vectors.sparse(4, Seq((0, 9.0), (3, 1.0))))
    val df = data.map(Tuple1.apply).toDF("features")
    df.printSchema()
    df.show(false)
    println("ml package corr result is:")
    val df1: DataFrame = Correlation.corr(df, "features", "pearson")
    df1.show(false)
    //The "pearson(features)" column holds a 4x4 correlation matrix:
    // 1.0                   0.055641488407465814  NaN  0.4004714203168137
    // 0.055641488407465814  1.0                   NaN  0.9135958615342522
    // NaN                   NaN                   1.0  NaN
    // 0.4004714203168137    0.9135958615342522    NaN  1.0
  }
}
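Correlation.corr likewise accepts "spearman" as the method name for rank correlation. A minimal, hypothetical continuation of the code above, reusing the same df:
//Spearman rank correlation on the same DataFrame
val dfSpearman: DataFrame = Correlation.corr(df, "features", "spearman")
dfSpearman.show(false)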
3. Random numbers (for reference)
When are random numbers used?
- In data science, random numbers are used to generate data when none is available, producing values that follow a chosen distribution. For example, to mimic Shanghai's 2020 college entrance exam scores, which roughly follow a normal distribution, one can generate a set of normally distributed random numbers.
- Random numbers are also used in machine learning to split a dataset into a training set and a test set, as in the snippet below.
//Demonstrate splitting a dataset into a training set and a test set
//weights gives the split proportions: 0.8 means 80% of the data becomes the training set
//seed is the random seed, making the split reproducible
val split: Array[RDD[Int]] = dataSamples.randomSplit(Array(0.8, 0.2), seed = 123L)
val trainingset: RDD[Int] = split(0)
val testset: RDD[Int] = split(1)
trainingset.foreach(println(_))
testset.foreach(println(_))
Full code:
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
/**
 * DESC: generating random RDDs with RandomRDDs and sampling an RDD
 */
object randomNumberTest {
  def main(args: Array[String]): Unit = {
    //1 - set up the environment
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("RandomRDDTest")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    //random numbers drawn from the standard normal distribution
    val double: RDD[Double] = RandomRDDs.normalRDD(sc, 10)
    double.foreach(println(_))
    //sampling
    val dataSamples: RDD[Int] = sc.parallelize(1 to 10)
    //first argument: whether to sample with replacement (can an element be drawn more than once)
    //second argument: the sampling fraction, here 20%
    //third argument: the random seed (seed = 3); the same seed always produces the same sample
    val sample: RDD[Int] = dataSamples.sample(false, 0.2, 3) //e.g. 1 3 5 (with seed = 9: 1 2 3)
    //takeSample returns exactly 2 elements to the driver, again with a fixed seed
    val sample1: Array[Int] = dataSamples.takeSample(false, 2, 40)
  }
}
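RandomRDDs can also draw from distributions other than the standard normal, for example uniform and Poisson. The following is a minimal sketch under that assumption; the object name and parameter values are illustrative:
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object otherDistributionsTest {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("otherDistributionsTest")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")

    //10 values drawn uniformly from [0, 1]
    val uniform: RDD[Double] = RandomRDDs.uniformRDD(sc, 10)
    //10 values drawn from a Poisson distribution with mean 4.0
    val poisson: RDD[Double] = RandomRDDs.poissonRDD(sc, 4.0, 10)
    //5 vectors of length 3 drawn from the standard normal distribution
    val normalVectors = RandomRDDs.normalVectorRDD(sc, 5, 3)

    uniform.foreach(println(_))
    poisson.foreach(println(_))
    normalVectors.foreach(println(_))

    sc.stop()
  }
}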