Spark KMeans Algorithm



spark-submit --class com.ones.soc.cf.KMeansClustering --master yarn --num-executors 3 --driver-memory 5g --executor-memory 4g /root/bigData.jar /ones/mldata/test1 /ones/mldata/test2 8 30 3 /ones/result/12345
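
The six positional arguments after the jar path map, in order, to the program's inputs: training data path, test data path, numClusters, numIterations, runTimes, and the HDFS output directory. The parser splits each line on tab characters, converts every field to a Double, and skips header lines containing "Channel" in the training file only, so both files are assumed to hold tab-separated numeric records roughly like the sketch below (column names and values are illustrative, not from the original post; columns are shown space-aligned here, and the test file should contain no header line at all):

Channel  Region  Fresh   Milk   Grocery  Frozen
2        3       12669   9656   7561     214
1        3       7057    9810   9568     1762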

##############################################

package com.ones.soc.cf

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Path, FileSystem}
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors

/**
  * Created by tom
  */
object KMeansClustering {

  def main(args: Array[String]): Unit = {
    if (args.length < 6) {
      println("Usage: KMeansClustering trainingDataFilePath testDataFilePath numClusters numIterations runTimes outputPath")
      sys.exit(1)
    }

    // Delete the HDFS output directory if it already exists
    val confighdfs = new Configuration()
    val fs = FileSystem.get(confighdfs)
    if (args(5) != null && args(5).trim().length > 1) {
      val output = new Path(args(5))
      if (fs.exists(output)) {
        fs.delete(output, true)
      }
    }

    val conf = new SparkConf().setAppName("K-Means")
    val sc = new SparkContext(conf)

    // Parse the training data: skip the header line, split on tabs, drop empty fields,
    // and build dense vectors; cache because KMeans iterates over the data repeatedly
    val rawTrainingData = sc.textFile(args(0))
    val parsedTrainingData = rawTrainingData
      .filter(!isColumnNameLine(_))
      .map(line => Vectors.dense(line.split("\t").map(_.trim).filter(_.nonEmpty).map(_.toDouble)))
      .cache()

    // Cluster the training data into numClusters classes using KMeans
    val numClusters = args(2).toInt
    val numIterations = args(3).toInt
    val runTimes = args(4).toInt
    var clusterIndex: Int = 0
    val clusters: KMeansModel = KMeans.train(parsedTrainingData, numClusters, numIterations, runTimes)

     println("Cluster Number:" + clusters.clusterCenters.length)
     println("Cluster Centers Information Overview:")
     clusters.clusterCenters.foreach(
      x => {
        println("Center Point of Cluster " + clusterIndex + ":")
        println(x)
        clusterIndex += 1
      })
    // Check which cluster each test data point belongs to based on the trained model
    val rawTestData = sc.textFile(args(1))
    val parsedTestData = rawTestData.map(line =>
      Vectors.dense(line.split("\t").map(_.trim).filter(_.nonEmpty).map(_.toDouble))
    )

    // Predict a cluster for every test vector and collect the lines for output
    val sb = new StringBuilder()
    parsedTestData.collect().foreach { testDataLine =>
      val predictedClusterIndex: Int = clusters.predict(testDataLine)
      println("The data " + testDataLine.toString + " belongs to cluster " + predictedClusterIndex)
      sb.append(testDataLine.toString).append("\t").append("belongs to cluster ").append(predictedClusterIndex).append("\r\n")
    }

    outputHdfs(fs, sb.toString(), args(5))
    println("Spark MLlib K-means clustering test finished.")
    sc.stop()
  }

  private def isColumnNameLine(line: String): Boolean =
    line != null && line.contains("Channel")

  // Write the clustering result to <outputPath>/result.txt on HDFS
  def outputHdfs(fs: FileSystem, text: String, textdir: String): Unit = {
    try {
      val fsDataOutputStream = fs.create(new Path(textdir + "/result.txt"), true)
      val s = text.getBytes("UTF-8")
      fsDataOutputStream.write(s, 0, s.length)
      fsDataOutputStream.hflush()
      fsDataOutputStream.close()
    } catch {
      case e: Exception => e.printStackTrace()
    }
  }

}
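
With the StringBuilder format above, the result.txt written to the output directory contains one line per test vector: the vector's toString followed by a tab and the predicted cluster index, roughly like this (values illustrative, not from the original post):

[12669.0,9656.0,7561.0]    belongs to cluster 3
[7057.0,9810.0,9568.0]     belongs to cluster 0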

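As an aside not in the original job: the within-set sum of squared errors returned by KMeansModel.computeCost is a common way to sanity-check the choice of numClusters (8 in the example command). A minimal sketch, assuming the parsedTrainingData, numIterations and runTimes values defined above:

// Hypothetical addition: compare WSSSE for a few candidate cluster counts
for (k <- Seq(4, 8, 12)) {
  val model = KMeans.train(parsedTrainingData, k, numIterations, runTimes)
  println(s"k = $k, WSSSE = " + model.computeCost(parsedTrainingData))
}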

Reposted from houston123.iteye.com/blog/2317948