/*
 * Example submission:
 *
 *   spark-submit --class com.ones.soc.cf.KMeansClustering --master yarn \
 *     --num-executors 3 --driver-memory 5g --executor-memory 4g \
 *     /root/bigData.jar /ones/mldata/test1 /ones/mldata/test2 8 30 3 /ones/result/12345
 */
package com.ones.soc.cf

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Path, FileSystem}
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors

/**
 * Created by tom
 *
 * Trains an MLlib K-means model on a tab-separated training file, prints the
 * resulting cluster centers, assigns every row of a test file to a cluster and
 * writes the assignments to `<outpath>/result.txt` on the configured file system.
 *
 * Command-line arguments (all required):
 *   0. trainingDataFilePath - tab-separated numeric rows; lines containing "Channel" are skipped
 *   1. testDataFilePath     - same format as the training file
 *   2. numClusters          - number of clusters to fit
 *   3. numIterations        - maximum number of iterations
 *   4. runTimes             - value passed as the `runs` argument of `KMeans.train`
 *   5. outpath              - output directory; deleted first if it already exists
 */
object KMeansClustering {

  import scala.util.control.NonFatal

  /**
   * Program entry point; see the object documentation for the argument list.
   * Exits with status 1 when fewer than six arguments are supplied.
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 6) {
      println("Usage:KMeansClustering trainingDataFilePath testDataFilePath numClusters numIterations runTimes outpath")
      sys.exit(1)
    }

    val confighdfs = new Configuration()
    val fs = FileSystem.get(confighdfs)
    // Only paths longer than one character are ever removed, so a bare "/" is never deleted.
    if (args(5) != null && args(5).trim().length > 1) {
      val output = new Path(args(5))
      if (fs.exists(output)) {
        // Delete the output directory (recursively) so the result can be rewritten.
        fs.delete(output, true)
      }
    }

    val conf = new SparkConf().setAppName("K-Means")
    val sc = new SparkContext(conf)
    try {
      val parsedTrainingData = sc.textFile(args(0))
        .filter(!isColumnNameLine(_))
        .map(parseVector)
        .cache()

      // Cluster the data into `numClusters` groups using KMeans.
      val numClusters = args(2).toInt
      val numIterations = args(3).toInt
      val runTimes = args(4).toInt
      // NOTE(review): the overload taking `runs` is reported as deprecated in newer
      // Spark releases - confirm against the Spark version this job is built with.
      val clusters: KMeansModel =
        KMeans.train(parsedTrainingData, numClusters, numIterations, runTimes)

      println("Cluster Number:" + clusters.clusterCenters.length)
      println("Cluster Centers Information Overview:")
      clusters.clusterCenters.zipWithIndex.foreach { case (center, clusterIndex) =>
        println("Center Point of Cluster " + clusterIndex + ":")
        println(center)
      }

      // Check which cluster each test row belongs to based on the clustering result.
      // Header lines are skipped here too, so a header row in the test file
      // does not abort the job with a NumberFormatException.
      val parsedTestData = sc.textFile(args(1))
        .filter(!isColumnNameLine(_))
        .map(parseVector)

      // The test set is collected to the driver, so it must fit in driver memory.
      val sb = new StringBuilder()
      parsedTestData.collect().foreach { testDataLine =>
        val predictedClusterIndex: Int = clusters.predict(testDataLine)
        println("The data " + testDataLine.toString + " belongs to cluster " + predictedClusterIndex)
        sb.append(testDataLine.toString).append("\t").append("belongs to cluster ")
          .append(predictedClusterIndex).append("\r\n")
      }

      outputHdfs(fs, sb.toString(), args(5))
      println("Spark MLlib K-means clustering test finished.")
    } finally {
      // Always release the Spark context, even when training or prediction fails.
      sc.stop()
    }
  }

  /** Parses one tab-separated line into a dense vector, ignoring empty fields. */
  private def parseVector(line: String): org.apache.spark.mllib.linalg.Vector =
    Vectors.dense(line.split("\t").map(_.trim).filter(_.nonEmpty).map(_.toDouble))

  /** Returns true when the line is non-null and contains the text "Channel". */
  private def isColumnNameLine(line: String): Boolean =
    line != null && line.contains("Channel")

  /**
   * Writes `text` as UTF-8 to `<textdir>/result.txt`, overwriting an existing file.
   *
   * Failures are reported on standard error instead of being thrown, so callers
   * keep the original non-throwing contract; the stream is always closed.
   *
   * @param fs      file system to write to
   * @param text    content to write
   * @param textdir directory that will contain `result.txt`
   */
  def outputHdfs(fs: FileSystem, text: String, textdir: String): Unit = {
    try {
      val fsDataOutputStream = fs.create(new Path(textdir + "/result.txt"), true)
      try {
        val bytes = text.getBytes("UTF-8")
        fsDataOutputStream.write(bytes, 0, bytes.length)
        fsDataOutputStream.hflush()
      } finally {
        fsDataOutputStream.close()
      }
    } catch {
      case NonFatal(e) =>
        System.err.println("Failed to write clustering result to " + textdir + "/result.txt: " + e)
        e.printStackTrace()
    }
  }
}
Spark Kmeans算法
猜你喜欢
转载自houston123.iteye.com/blog/2317948
今日推荐
周排行