/*
 * Notice:
 *   The IsolationForest source must first be built into a jar with Maven (mvn)
 *   and put on the classpath before `import org.apache.spark.ml.iforest.IForest`
 *   can be used.
 *
 *   Scala source code:
 *     https://github.com/titicaca/spark-iforest
 *   Official documentation of the Python counterpart, sklearn.ensemble.IsolationForest:
 *     https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html
 *
 * This script is meant to be run inside spark-shell, where `spark` (the
 * SparkSession) is already defined. Multi-line expressions are wrapped in
 * parentheses so they can be pasted into the REPL line by line.
 */
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.iforest.IForest
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

// Wisconsin Breast Cancer Dataset. The CSV has no header, so Spark names the
// columns _c0, _c1, ..., _c10.
val dataset = (spark.read.option("inferSchema", "true")
  .csv("/anomaly-detection/breastw.csv"))

// Index label values: 2 -> 0, 4 -> 1
val indexer = (new StringIndexer()
  .setInputCol("_c10")
  .setOutputCol("label"))

// Use only the measurement columns _c1.._c9 as features.
// The previous filter (`!_.contains("label")`) was a no-op on the raw CSV
// columns, so `_c10` -- the ground-truth class that `indexer` turns into
// "label" -- ended up inside the feature vector (label leakage).
// `_c0` is excluded as well; presumably it is the sample identifier, matching
// the upstream spark-iforest example for this dataset -- TODO confirm.
val featureCols = (1 to 9).map(i => s"_c$i").toArray

val assembler = (new VectorAssembler()
  .setInputCols(featureCols)
  .setOutputCol("features"))

val iForest = (new IForest()
  .setNumTrees(100)
  .setMaxSamples(256)
  .setContamination(0.35)
  .setBootstrap(false)
  .setMaxDepth(100)
  .setSeed(123456L))

val pipeline = new Pipeline().setStages(Array(indexer, assembler, iForest))

// Split the dataset into a training and a test DataFrame (80% / 20%, fixed seed).
val Array(trainDF, testDF) = dataset.randomSplit(Array(0.8, 0.2), seed = 123456L)

val model = pipeline.fit(trainDF)
val predictions = model.transform(testDF)

// Evaluate the model on the held-out split using the area under the ROC curve.
// NOTE(review): "prediction" is used as the raw prediction column. If the
// IForest model also emits a continuous anomaly score column, evaluating on
// it would give a more informative ROC -- confirm against the library docs.
val evaluator = (new BinaryClassificationEvaluator()
  .setLabelCol("label")
  .setRawPredictionCol("prediction")
  .setMetricName("areaUnderROC"))

val auc = evaluator.evaluate(predictions)
println(s"The model's auc: $auc")

/*
 * Sample REPL output recorded with the original script; the exact value
 * depends on the feature columns and the split, so it may differ:
 *
 * scala> val auc = evaluator.evaluate(predictions)
 * auc: Double = 0.9311653116531164
 *
 * scala> println(s"The model's auc: $auc")
 * The model's auc: 0.9311653116531164
 */
// Reference: https://www.liangzl.com/get-article-detail-36344.html