// Randomly split the data 75/25 into training and test sets, then cache both
val Array(trainDF, testDF) = dataDF.randomSplit(Array(0.75, 0.25))
Seq(trainDF, testDF).foreach(_.persist())
// 预处理训练集
// Every column except the label is an input feature
val inputCols = trainDF.columns.filterNot(_ == "Cover_Type")
// Assemble the feature columns into a single vector column
val assembler = new VectorAssembler()
  .setInputCols(inputCols)
  .setOutputCol("featureVector")
val assemblerTrainDF = assembler.transform(trainDF).persist()
assemblerTrainDF.select("featureVector").show(false)
// 预处理测试集
// Preprocess the test set with an identically configured assembler
val testAssembler = new VectorAssembler()
  .setInputCols(inputCols)
  .setOutputCol("featureVector")
val assemblerTestDF = testAssembler.transform(testDF)
// Build the model: a decision tree predicting Cover_Type from the assembled feature vector
val classifier = new DecisionTreeClassifier()
  .setLabelCol("Cover_Type")
  .setFeaturesCol("featureVector")
  .setPredictionCol("prediction")
  .setSeed(Random.nextLong())
// Train the model on the assembled training set
val model = classifier.fit(assemblerTrainDF)
// Evaluator comparing the Cover_Type label against the model's prediction column
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("Cover_Type")
.setPredictionCol("prediction")
// FIX: predictionDF was referenced below without ever being defined in this
// snippet — generate predictions on the assembled test set first.
val predictionDF = model.transform(assemblerTestDF)
val accuracy = evaluator.setMetricName("accuracy").evaluate(predictionDF)
val f1 = evaluator.setMetricName("f1").evaluate(predictionDF)
println(s"accuracy = $accuracy, f1 = $f1")
// 评分结果:
// accuracy = 0.6986190873428979, f1 = 0.6820440997673965
// 4. 利用网格搜索与交叉验证API
// 构建管道模型
// Chain feature assembly and the classifier into a single Pipeline estimator
val pipeline = new Pipeline().setStages(Array(assembler, classifier))
// 构建网格参数
// Parameter grid: 2 * 2 * 2 * 2 = 16 combinations over impurity, depth, bins and min info gain.
// addGrid mutates and returns the same builder, so plain statements are equivalent to chaining.
val builder = new ParamGridBuilder()
builder.addGrid(classifier.impurity, Seq("gini", "entropy"))
builder.addGrid(classifier.maxDepth, Seq(1, 20))
builder.addGrid(classifier.maxBins, Seq(40, 300))
builder.addGrid(classifier.minInfoGain, Seq(0.0, 0.05))
val paramGrid = builder.build()
// 构建分类模型的评估器
// Evaluator used by the grid search: plain accuracy of prediction vs. Cover_Type
val multiclassEvaluator = new MulticlassClassificationEvaluator()
  .setMetricName("accuracy")
  .setPredictionCol("prediction")
  .setLabelCol("Cover_Type")
// 开始网格搜索+交叉验证
// Build the tuning harness: evaluate each parameter map on an 80/20 train/validation split
val validator = new TrainValidationSplit()
  .setEstimator(pipeline)
  .setEstimatorParamMaps(paramGrid)
  .setEvaluator(multiclassEvaluator)
  .setTrainRatio(0.8)
  .setSeed(Random.nextLong())
// Fit trains one pipeline per parameter combination and keeps the best one
val validatorModel = validator.fit(trainDF)
// Random forest classifier: 100 trees, same label/feature/prediction columns as before
val classifier = new RandomForestClassifier()
  .setLabelCol("Cover_Type")
  .setFeaturesCol("featureVector")
  .setPredictionCol("prediction")
  .setNumTrees(100)
  .setSeed(Random.nextLong())
// 6. 完整代码
import org.apache.spark.SparkConf
import org.apache.spark.ml.classification.{DecisionTreeClassifier, RandomForestClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.sql.{DataFrame, SparkSession}
import scala.util.Random
/**
 * Chapter 4 - Decision trees - predicting forest cover type (Covtype data set).
 *
 * Loads the raw CSV, splits it into train/test sets, assembles feature
 * vectors, trains a tree-based classifier, evaluates accuracy/F1 and the
 * confusion matrix, then tunes the model with a parameter grid and a
 * train/validation split.
 *
 * @author ALion
 */
object RunRDF {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("Demo").setMaster("local[4]")
val spark = SparkSession.builder()
.config(conf)
.enableHiveSupport()
.getOrCreate()
// Reduce console noise from Spark's default INFO logging
org.apache.log4j.Logger.getRootLogger.setLevel(
org.apache.log4j.Level.toLevel("WARN")
)
import spark.implicits._
// 1. Load the raw data
val dataDF = loadData(spark)
dataDF.show()
// 2. Split into training (75%) and test (25%) sets; cache both since each is reused
val Array(trainDF, testDF) = dataDF.randomSplit(Array(0.75, 0.25))
trainDF.persist()
testDF.persist()
// 3. Preprocessing: every column except the label becomes part of one feature vector
val inputCols = trainDF.columns.filter(_ != "Cover_Type")
val assembler = new VectorAssembler()
.setInputCols(inputCols)
.setOutputCol("featureVector")
// Training set
val assemblerTrainDF = assembler.transform(trainDF).persist()
assemblerTrainDF.select("featureVector").show(false)
// Test set, assembled with an identically configured VectorAssembler
val assemblerTestDF = new VectorAssembler()
.setInputCols(inputCols)
.setOutputCol("featureVector")
.transform(testDF)
// 4. Build the classification model
// val classifier = new DecisionTreeClassifier()
// .setSeed(Random.nextLong())
// .setLabelCol("Cover_Type")
// .setFeaturesCol("featureVector")
// .setPredictionCol("prediction")
// Replace the decision tree above with a random forest to improve accuracy
val classifier = new RandomForestClassifier()
.setSeed(Random.nextLong())
.setLabelCol("Cover_Type")
.setFeaturesCol("featureVector")
.setPredictionCol("prediction")
.setNumTrees(100)
// Train the model
val model = classifier.fit(assemblerTrainDF)
println(model.toDebugString) // print the trained model's structure
// Print each feature's importance, largest first
model.featureImportances
.toArray
.zip(inputCols)
.sorted.reverse
.foreach(println)
// 5. Predict cover type on the test set
val predictionDF = model.transform(assemblerTestDF)
predictionDF.persist()
predictionDF.select("Cover_Type", "prediction", "probability")
.show(false)
// Scoring: accuracy and F1 against the held-out labels.
// Note: setMetricName mutates the evaluator in place, so the same instance is reused.
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("Cover_Type")
.setPredictionCol("prediction")
val accuracy = evaluator.setMetricName("accuracy").evaluate(predictionDF)
val f1 = evaluator.setMetricName("f1").evaluate(predictionDF)
println(s"accuracy = $accuracy, f1 = $f1")
// Compute the confusion matrix
// Method 1: via MLlib's MulticlassMetrics on an RDD of (prediction, label) pairs
val predictionRDD = predictionDF
.select("prediction", "Cover_Type")
.as[(Double, Double)]
.rdd
val multiclassMetrics = new MulticlassMetrics(predictionRDD)
println(multiclassMetrics.confusionMatrix)
// Method 2: pivot the DataFrame on the predicted class
// NOTE(review): "prediction" is a double column while the pivot values 1..7 are Ints;
// this relies on Catalyst coercing the literals — confirm they match 1.0..7.0.
val confusionMatrix = predictionDF
.groupBy("Cover_Type")
.pivot("prediction", 1 to 7)
.count()
.na.fill(0.0)
.orderBy("Cover_Type")
confusionMatrix.show()
// 6. Grid search + train/validation split
// Chain feature assembly and the classifier into one Pipeline estimator
val pipeline = new Pipeline().setStages(Array(assembler, classifier))
// Parameter grid: 16 combinations over impurity, depth, bins and min info gain
val paramGrid = new ParamGridBuilder()
.addGrid(classifier.impurity, Seq("gini", "entropy"))
.addGrid(classifier.maxDepth, Seq(1, 20))
.addGrid(classifier.maxBins, Seq(40, 300))
.addGrid(classifier.minInfoGain, Seq(0.0, 0.05))
.build()
// Evaluator used by the search: plain accuracy
val multiclassEvaluator = new MulticlassClassificationEvaluator()
.setLabelCol("Cover_Type")
.setPredictionCol("prediction")
.setMetricName("accuracy")
// Run the search: one pipeline fit per parameter map, scored on a 20% validation slice
val validator = new TrainValidationSplit()
.setSeed(Random.nextLong())
.setEstimator(pipeline)
.setEvaluator(multiclassEvaluator)
.setEstimatorParamMaps(paramGrid)
.setTrainRatio(0.8)
val validatorModel = validator.fit(trainDF)
// Retrieve the best model and print the winning classifier's parameters
// (the classifier is the last pipeline stage)
val bestModel = validatorModel.bestModel
println(bestModel.asInstanceOf[PipelineModel].stages.last.extractParamMap())
// Show every parameter combination with its validation metric, best first
validatorModel.validationMetrics
.zip(validatorModel.getEstimatorParamMaps)
.sortBy(-_._1)
.foreach { case (metric, params) =>
println("-----------------------------------------")
println(metric)
println(params)
}
spark.stop()
}
/**
 * Load the raw Covtype CSV data (no header row) and assign column names.
 *
 * @param spark SparkSession
 * @return DataFrame with named columns and "Cover_Type" cast to double,
 *         as required by the ML label column
 */
def loadData(spark: SparkSession): DataFrame = {
import spark.implicits._
val dataWithoutHeaderDF = spark.read
.option("inferSchema", true)
.option("header", false)
.csv("E:/Data/saa/Chapter4_covtype/covtype.data")
// Assign meaningful column names: 10 numeric features, 4 wilderness-area
// one-hot columns, 40 soil-type one-hot columns, then the label
val colNames = Seq(
"Elevation", "Aspect", "Slope",
"Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology",
"Horizontal_Distance_To_Roadways",
"Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
"Horizontal_Distance_To_Fire_Points") ++
(0 until 4).map(i => s"Wilderness_Area_$i") ++
(0 until 40).map(i => s"Soil_Type_$i") ++
Seq("Cover_Type")
dataWithoutHeaderDF.toDF(colNames: _*)
.withColumn("Cover_Type", $"Cover_Type".cast("double"))
}
}