1. Hands-on: reading data with Spark SQL and computing statistics
- 1 - Reading a single column of data
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author liu a fu
 * @date 2021/2/1
 * @version 1.0
 * @DESC Compute the statistics through the RDD-based Spark MLlib API, because in Spark 2.2.0
 *       the DataFrame API does not yet offer basic summary statistics.
 *       1 - Prepare the environment
 *       2 - Read the data
 *       3 - Convert to Vector
 *       4 - Compute the statistics
 *       5 - Display the results
 */
object _01SpetalLengthStaticesDemo {
  def main(args: Array[String]): Unit = {
    // 1 - Prepare the environment
    val conf: SparkConf = new SparkConf().setAppName("IrisSparkCoreLoader").setMaster("local[*]")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    // 2 - Read the data
    val datapath = "C:\\software\\studysoft\\BigdataCode\\Spark_Code\\Spark_Mllib\\data\\Iris\\length.csv"
    // 3 - Convert each value to a dense Vector
    val data: RDD[Vector] = sc.textFile(datapath).map(_.toDouble).map(x => Vectors.dense(x))
    // 4 - Compute column statistics with Statistics.colStats (see the separate post on MLlib summary
    //     statistics; a DataFrame-based alternative is sketched right after this class)
    val stats: MultivariateStatisticalSummary = Statistics.colStats(data)
    // 5 - Display the results
    println("stats nonzeros: " + stats.numNonzeros)
    println("stats min: " + stats.min)
    println("stats max: " + stats.max)
    println("stats mean: " + stats.mean)
    println("stats variance: " + stats.variance)
  }
}
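The @DESC above points out that the DataFrame API in Spark 2.2.0 did not yet cover basic summary statistics. From Spark 2.3 onward the same figures can be obtained directly on a DataFrame through org.apache.spark.ml.stat.Summarizer. The following is only a minimal sketch, not part of the original demo: it assumes Spark 2.3+ and uses a few illustrative sepal-length values in place of length.csv.

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.stat.Summarizer
import org.apache.spark.sql.SparkSession

object _01SummarizerSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SummarizerSketch").master("local[*]").getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    import spark.implicits._
    // A few illustrative sepal-length values, wrapped as ml (not mllib) dense vectors.
    val df = Seq(5.1, 4.9, 4.7, 7.0, 6.4)
      .map(v => Tuple1(Vectors.dense(v)))
      .toDF("features")
    // metrics(...) picks the statistics; summary(col) returns a struct column with one field per metric.
    df.select(
        Summarizer.metrics("mean", "variance", "min", "max", "numNonZeros")
          .summary($"features").as("summary"))
      .select("summary.mean", "summary.variance", "summary.min", "summary.max")
      .show(false)
    spark.stop()
  }
}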
- 2 - Reading the full dataset and picking specific feature columns
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

/**
 * @author liu a fu
 * @date 2021/2/1
 * @version 1.0
 * @DESC Column statistics and the Pearson correlation on the Iris dataset
 */
object _02irisDataStaticesDemo {
  def main(args: Array[String]): Unit = {
    // 1 - Prepare the environment
    val spark: SparkSession = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName.stripSuffix("$"))
      .master("local[*]")
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    // 2 - Read the data and convert it to Vectors
    val path = "C:\\software\\studysoft\\BigdataCode\\Spark_Code\\Spark_Mllib\\data\\Iris\\iris.data"
    val data: RDD[Vector] = spark.sparkContext.textFile(path)
      .map(x => x.split(",")(0))
      .map(_.toDouble)
      .map(x => Vectors.dense(x))
    // 3 - Compute the statistics
    /**
     * Statistics: API for statistical functions in MLlib.
     * colStats: Computes column-wise summary statistics for the input RDD[Vector].
     */
    val stats: MultivariateStatisticalSummary = Statistics.colStats(data)
    // 4 - Display the results
    println("stats nonzeros: " + stats.numNonzeros)
    println("stats min: " + stats.min)
    println("stats max: " + stats.max)
    println("stats mean: " + stats.mean)
    println("stats variance: " + stats.variance)
    // Extract two Double columns to compute their correlation coefficient: the first column (index 0)
    val data1: RDD[Double] = spark.sparkContext.textFile(path).map(x => x.split(",")(0)).map(_.toDouble)
    // and the third column (index 2)
    val data2: RDD[Double] = spark.sparkContext.textFile(path).map(x => x.split(",")(2)).map(_.toDouble)
    /**
     * corr: Compute the Pearson correlation for the input RDDs.
     * Returns NaN if either vector has 0 variance.
     * (A Spearman variant is sketched right after this class.)
     */
    val corr1: Double = Statistics.corr(data1, data2)
    println("data1 and data2 corr value is: " + corr1) // data1 and data2 corr value is: 0.8717541573048727
  }
}
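Statistics.corr defaults to the Pearson coefficient; the same RDD-based API also accepts a method argument, so a Spearman rank correlation can be computed in exactly the same way. A minimal sketch, using a few illustrative values rather than re-reading the iris.data columns above:

import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object _02SpearmanCorrSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SpearmanCorrSketch").master("local[*]").getOrCreate()
    val sc = spark.sparkContext
    sc.setLogLevel("WARN")
    // Two short illustrative series; in the demo above they come from columns 0 and 2 of iris.data.
    val data1: RDD[Double] = sc.parallelize(Seq(5.1, 4.9, 4.7, 7.0, 6.4))
    val data2: RDD[Double] = sc.parallelize(Seq(1.4, 1.4, 1.3, 4.7, 4.5))
    // The third argument selects the method: "pearson" (the default) or "spearman".
    val spearman: Double = Statistics.corr(data1, data2, "spearman")
    println("data1 and data2 spearman corr value is: " + spearman)
    spark.stop()
  }
}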
- 3 - Computing the statistics via Spark SQL
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * @author liu a fu
 * @date 2021/2/2
 * @version 1.0
 * @DESC Read the data the SQL way
 *       1 - Prepare the environment
 *       2 - Read the data with the option method
 *       3 - Parse the data
 *       4 - Print the schema
 */
object _03IrisSparkSQLStaticesDemo {
  def main(args: Array[String]): Unit = {
    // 1 - Prepare the environment
    val spark: SparkSession = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName.stripSuffix("$"))
      .master("local[*]")
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    // 2 - Read the data with the option method
    val path = "C:\\software\\studysoft\\BigdataCode\\Spark_Code\\Spark_Mllib\\data\\Iris\\iris.csv"
    val valueDF: DataFrame = spark.read.format("csv") // read the CSV file
      .option("header", "true")
      .option("inferSchema", true)
      .load(path)
    valueDF.printSchema()
    valueDF.show()
    /**
     * root
     *  |-- sepal_length: double (nullable = true)
     *  |-- sepal_width: double (nullable = true)
     *  |-- petal_length: double (nullable = true)
     *  |-- petal_width: double (nullable = true)
     *  |-- class: string (nullable = true)
     */
    val vec: VectorAssembler = new VectorAssembler()
      .setInputCols(Array("sepal_length", "sepal_width", "petal_length", "petal_width"))
      .setOutputCol("features")
    val vecResult: DataFrame = vec.transform(valueDF)
    // Compute the Pearson correlation matrix for the input Dataset of Vectors.
    val corr: DataFrame = Correlation.corr(vecResult, "features", "pearson")
    println("corr matrix is:")
    // (pulling the Matrix out of this one-row result is sketched right after this class)
    corr.show(false)
  }
}
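corr.show(false) only prints the one-row result. To use the coefficients programmatically, the single Matrix cell can be pattern-matched out of the head Row, as in the official Correlation example. A short continuation sketch, assuming the corr DataFrame from the code above:

import org.apache.spark.ml.linalg.Matrix
import org.apache.spark.sql.Row

// Continuing from the code above: corr has a single row whose only cell is the correlation Matrix.
val Row(coeff: Matrix) = corr.head
println(s"Pearson correlation matrix:\n$coeff")
// coeff(i, j) is the correlation between input columns i and j, so the diagonal is all 1.0.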
2. Feature engineering in practice
- 1 - Develop a feel for the data (working with big data means having ideas about the data itself)
- 2 - Categories of feature engineering
  - Feature extraction
  - Feature selection
  - Feature transformation ----- important (a minimal scaler sketch follows this list)
  - Dimensionality reduction
    - High-dimensional data is reduced to a lower dimension; the physical meaning of the resulting low-dimensional features is often unclear
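Feature transformation is flagged as important above but is not demonstrated in the case below, which covers label encoding, chi-squared selection, and PCA. As a minimal, hypothetical sketch of one common transformer, the example below rescales the four assembled Iris features to [0, 1] with MinMaxScaler, reusing the same CSV layout as the demos above.

import org.apache.spark.ml.feature.{MinMaxScaler, MinMaxScalerModel, VectorAssembler}
import org.apache.spark.sql.{DataFrame, SparkSession}

object IrisMinMaxScalerSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("IrisMinMaxScalerSketch").master("local[*]").getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    // Same CSV layout as in the statistics demos: four numeric columns plus a class label.
    val data: DataFrame = spark.read.format("csv")
      .option("header", "true")
      .option("inferSchema", "true")
      .load("C:\\software\\studysoft\\BigdataCode\\Spark_Code\\Spark_Mllib\\data\\Iris\\iris.csv")
    val assembled: DataFrame = new VectorAssembler()
      .setInputCols(Array("sepal_length", "sepal_width", "petal_length", "petal_width"))
      .setOutputCol("features")
      .transform(data)
    // MinMaxScaler rescales every feature to [0, 1] based on the column-wise min and max.
    val scaler = new MinMaxScaler().setInputCol("features").setOutputCol("scaled_features")
    val scalerModel: MinMaxScalerModel = scaler.fit(assembled)
    scalerModel.transform(assembled).select("features", "scaled_features").show(5, truncate = false)
    spark.stop()
  }
}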
Feature engineering example:
A sample of the Iris dataset:
import org.apache.spark.ml.feature.{ChiSqSelector, ChiSqSelectorModel, PCA, PCAModel, StringIndexer, StringIndexerModel, VectorAssembler}
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * DESC: Read the data the SQL way
 * Complete data processing and modeling process steps:
 * 1 - Prepare the environment
 * 2 - Read the data with the option method
 * 3 - Parse the data
 * 4 - Print the schema
 */
object IrisSparkSQLFeaturesEngineer {
  def main(args: Array[String]): Unit = {
    // 1 - Prepare the environment
    val conf: SparkConf = new SparkConf().setAppName("IrisSparkCoreLoader").setMaster("local[*]")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    import spark.implicits._
    // 2 - Read the data with the option method
    val datapath = "C:\\software\\studysoft\\BigdataCode\\Spark_Code\\Spark_Mllib\\data\\Iris\\iris.csv"
    val data: DataFrame = spark.read.format("csv").option("header", "true").option("inferSchema", true).load(datapath)
    // 3 - Parse the data
    data.printSchema()
    data.show(false)
    // 4 - The printed schema:
    // root
    //  |-- sepal_length: double (nullable = true)
    //  |-- sepal_width: double (nullable = true)
    //  |-- petal_length: double (nullable = true)
    //  |-- petal_width: double (nullable = true)
    //  |-- class: string (nullable = true)

    // 1 - First, label-encode the class column into 0/1/2
    val strIndex: StringIndexer = new StringIndexer().setInputCol("class").setOutputCol("labelclass")
    val strModel: StringIndexerModel = strIndex.fit(data)
    val strResult: DataFrame = strModel.transform(data)
    strResult.show(false)
    // 2 - Reduce the 4 feature columns to 3
    // 2-1 Feature selection ------ df.select ------ ChiSqSelector
    data.select("sepal_length").show(false)
    data.select($"sepal_length").show(false)
    data.select(col("sepal_length"))
    data.select($"sepal_length", col("sepal_width")).show(false)
    val vec: VectorAssembler = new VectorAssembler()
      .setInputCols(Array("sepal_length", "sepal_width", "petal_length", "petal_width"))
      .setOutputCol("features")
    // Assemble the features on the label-encoded DataFrame so the numeric label column is available
    val vecResult: DataFrame = vec.transform(strResult)
    // Chi-squared feature selection: the label column must be numeric, hence "labelclass" rather than "class"
    val chi: ChiSqSelector = new ChiSqSelector().setFeaturesCol("features").setLabelCol("labelclass").setNumTopFeatures(3)
    val chiModel: ChiSqSelectorModel = chi.fit(vecResult)
    val chiResult: DataFrame = chiModel.transform(vecResult)
    chiResult.show(false)
    // 2-2 Dimensionality reduction ------ PCA, setK(2) reduces to 2 dimensions
    //     (a follow-up on explained variance appears right after this class)
    println("pca transformation:")
    val pca: PCA = new PCA().setInputCol("features").setOutputCol("pca_features").setK(2)
    val pcaModel: PCAModel = pca.fit(vecResult)
    pcaModel.transform(vecResult).show(false)
  }
}
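Whether k = 2 components are enough can be judged from the fitted model itself. A short continuation sketch, assuming the pcaModel value from the code above:

// Continuing from the code above: proportion of variance captured by each of the k = 2 components.
println("explained variance: " + pcaModel.explainedVariance)
// pc is the 4 x 2 principal-components matrix, one column per component.
println("principal components:\n" + pcaModel.pc)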