// 1. Take English article data and train a word2vec model to obtain feature vectors.
// 2. Use that model to classify new (or existing) data.
// 3. If it works well, categories such as sports, games, lifestyle and art should emerge.
// 4. This uses the current spark.ml API, not the older RDD-based mllib API.
// 5. Optional refinements: stop-word removal, result tuning, formatted output, etc.
// Spark setup: local mode with 2 worker threads.
val conf = new SparkConf().setMaster("local[2]").setAppName("word2vec_test03")
// The context is never reassigned below, so it should be a val, not a var.
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)

// Input: comma-separated lines of the form "userID,words".
val path = "D:\\part-00000"
val rdd = sc.textFile(path)

// Optional stop-word filtering, left disabled:
// import scala.collection.JavaConverters._
// val stopWords = sc.textFile("stop_words_eng.txt").collect().toSeq.asJava
// val filter = new StopRecognition().insertStopWords(stopWords)
// filter.insertStopNatures("w", null, "null")

// Split each line on "," and keep only rows with at least two fields,
// producing (userID, words) pairs. Malformed lines are silently dropped.
// `length` is used instead of `size` — it is the native Array member and
// avoids the implicit ArrayOps conversion.
val splitWordRdd = rdd.map(_.split(",")).filter(_.length > 1).map(x => (x(0), x(1)))
//splitWordRdd.foreach(println)

// (userID, words) pairs -> DataFrame with named columns.
val df = sqlContext.createDataFrame(splitWordRdd).toDF("userID", "words")
df.show()
df.createOrReplaceTempView("input_data")
//sqlContext.udf.register("strLen", (str: String) => str.toString.replaceAll("\\]_\\[",", "))

// Collect all distinct words per user into one space-separated string.
// NOTE: collect_set deduplicates and does NOT preserve word order.
val resultValues = sqlContext.sql("select userID, concat_ws(' ',collect_set(cast(words as string))) as values from input_data group by userID")
val df2 = resultValues.toDF()
df2.show()

// Turn the space-separated string back into an Array[String] per user.
// A direct split(" ") replaces the original redundant
// replaceAll(" ", ",").split(",") round-trip — the values were joined with
// single spaces by concat_ws, so the result is identical.
val splitWordRdd2 = df2.rdd.map(row => (row(0).toString, row(1).toString.split(" ")))
// df3: one row per user, "words" is an Array[String] — the shape Word2Vec needs.
val df3 = sqlContext.createDataFrame(splitWordRdd2).toDF("userID", "words")
df3.rdd.foreach(println)
// Learn a mapping from words to 200-dimensional vectors; words occurring
// fewer than 2 times are ignored (setMinCount).
val word2Vec = new Word2Vec()
  .setInputCol("words")
  .setOutputCol("result")
  .setVectorSize(200)
  .setMinCount(2)
val model = word2Vec.fit(df3)

// val synonyms = model.findSynonyms("black", 10)
// synonyms.show()

// BUG FIX: transform must run on df3 — its "words" column is an
// Array[String], which is what Word2Vec requires. The original passed df,
// whose "words" column is a plain String; that fails the input-column
// schema check at runtime.
val result = model.transform(df3)

// Per-word vector table; rename "vector" -> "features" so it can be fed
// directly into clustering estimators such as KMeans.
val vocs = model.getVectors
val vocs2 = vocs.withColumnRenamed("vector", "features")
//vocs.rdd.saveAsTextFile("D:\\soft\\IDEA\\data\\input\\aaa")
//
// ---- Commented-out experiments: KMeans clustering of the word vectors, ----
// ---- plus an elbow-method sweep over k to pick the cluster count.      ----
// val kmeans = new KMeans()
//   .setK(20)
//   .setFeaturesCol("features")
//   .setPredictionCol("prediction")
// val model2 = kmeans.fit(vocs2)
// val predictions = model2.transform(vocs2)
// predictions.show()
///predictions.rdd.saveAsTextFile("D:\\soft\\IDEA\\data\\input\\aaa")
// predictions.createOrReplaceTempView("table_re")
// val result1 = sqlContext.sql("select prediction,concat_ws(',',collect_set(cast(word as string))) as words from table_re group by prediction")
// val result2 = result1.explode( "words" , "words_" ){words: String => words.split( "," )}
// val result3 = result2.select("prediction","words_")
//result1.rdd.foreach(println)
//result1.rdd.repartition(1).saveAsTextFile("D:\\soft\\IDEA\\data\\input\\user_data_re")
//result3.write.partitionBy("prediction").text("D:\\soft\\IDEA\\data\\input\\user_data_re")
// val ks:Array[Int] = Array(3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20)
// ks.foreach(cluster => {
//   val kmeans = new KMeans()
//     .setK(cluster)
//     .setFeaturesCol("features")
//     .setPredictionCol("prediction")
//   val model2 = kmeans.fit(vocs2)
//   val ssd = model2.computeCost(vocs2)
//   println("sum of squared distances of points to their nearest center when k=" + cluster + " -> "+ ssd)
// })
// // Print the cluster centers
// println("Cluster Centers: ")
// model.clusterCenters.foreach(println)
// 有兴趣可以加我的大数据、数据分析、爬虫群:
// 《453908562》