// 1. Take English article data and train a word2vec model to obtain feature vectors.
// 2. Use that model to classify new (or existing) data.
// 3. If it works well, categories such as sports, games, lifestyle and art should emerge.
// 4. This uses the current spark.ml API, not the older RDD-based mllib API.
// 5. Optional refinements: stop-word removal, result tuning, formatted output, etc.
// Spark setup: local mode with 2 worker threads.
val conf = new SparkConf().setMaster("local[2]").setAppName("word2vec_test03")
// The context is never reassigned below, so it should be a val, not a var.
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)

// Input: comma-separated lines of the form "userID,words".
val path = "D:\\part-00000"
val rdd = sc.textFile(path)

// Optional stop-word filtering, left disabled:
// import scala.collection.JavaConverters._
// val stopWords = sc.textFile("stop_words_eng.txt").collect().toSeq.asJava
// val filter = new StopRecognition().insertStopWords(stopWords)
// filter.insertStopNatures("w", null, "null")

// Split each line on "," and keep only rows with at least two fields,
// producing (userID, words) pairs. Malformed lines are silently dropped.
// `length` is used instead of `size` — it is the native Array member and
// avoids the implicit ArrayOps conversion.
val splitWordRdd = rdd.map(_.split(",")).filter(_.length > 1).map(x => (x(0), x(1)))
//splitWordRdd.foreach(println)

// (userID, words) pairs -> DataFrame with named columns.
val df = sqlContext.createDataFrame(splitWordRdd).toDF("userID", "words")
df.show()
df.createOrReplaceTempView("input_data")
//sqlContext.udf.register("strLen", (str: String) => str.toString.replaceAll("\\]_\\[",", "))

// Collect all distinct words per user into one space-separated string.
// NOTE: collect_set deduplicates and does NOT preserve word order.
val resultValues = sqlContext.sql("select userID, concat_ws(' ',collect_set(cast(words as string))) as values from input_data group by userID")
val df2 = resultValues.toDF()
df2.show()

// Turn the space-separated string back into an Array[String] per user.
// A direct split(" ") replaces the original redundant
// replaceAll(" ", ",").split(",") round-trip — the values were joined with
// single spaces by concat_ws, so the result is identical.
val splitWordRdd2 = df2.rdd.map(row => (row(0).toString, row(1).toString.split(" ")))
// df3: one row per user, "words" is an Array[String] — the shape Word2Vec needs.
val df3 = sqlContext.createDataFrame(splitWordRdd2).toDF("userID", "words")
df3.rdd.foreach(println)
// Learn a mapping from words to 200-dimensional vectors; words occurring
// fewer than 2 times are ignored (setMinCount).
val word2Vec = new Word2Vec()
  .setInputCol("words")
  .setOutputCol("result")
  .setVectorSize(200)
  .setMinCount(2)
val model = word2Vec.fit(df3)

// val synonyms = model.findSynonyms("black", 10)
// synonyms.show()

// BUG FIX: transform must run on df3 — its "words" column is an
// Array[String], which is what Word2Vec requires. The original passed df,
// whose "words" column is a plain String; that fails the input-column
// schema check at runtime.
val result = model.transform(df3)

// Per-word vector table; rename "vector" -> "features" so it can be fed
// directly into clustering estimators such as KMeans.
val vocs = model.getVectors
val vocs2 = vocs.withColumnRenamed("vector", "features")
//vocs.rdd.saveAsTextFile("D:\\soft\\IDEA\\data\\input\\aaa")
//
// ---- Commented-out experiments: KMeans clustering of the word vectors, ----
// ---- plus an elbow-method sweep over k to pick the cluster count.      ----
// val kmeans = new KMeans()
//   .setK(20)
//   .setFeaturesCol("features")
//   .setPredictionCol("prediction")
// val model2 = kmeans.fit(vocs2)
// val predictions = model2.transform(vocs2)
// predictions.show()
///predictions.rdd.saveAsTextFile("D:\\soft\\IDEA\\data\\input\\aaa")
// predictions.createOrReplaceTempView("table_re")
// val result1 = sqlContext.sql("select prediction,concat_ws(',',collect_set(cast(word as string))) as words from table_re group by prediction")
// val result2 = result1.explode( "words" , "words_" ){words: String => words.split( "," )}
// val result3 = result2.select("prediction","words_")
//result1.rdd.foreach(println)
//result1.rdd.repartition(1).saveAsTextFile("D:\\soft\\IDEA\\data\\input\\user_data_re")
//result3.write.partitionBy("prediction").text("D:\\soft\\IDEA\\data\\input\\user_data_re")
// val ks:Array[Int] = Array(3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20)
// ks.foreach(cluster => {
//   val kmeans = new KMeans()
//     .setK(cluster)
//     .setFeaturesCol("features")
//     .setPredictionCol("prediction")
//   val model2 = kmeans.fit(vocs2)
//   val ssd = model2.computeCost(vocs2)
//   println("sum of squared distances of points to their nearest center when k=" + cluster + " -> "+ ssd)
// })
// // Print the cluster centers
// println("Cluster Centers: ")
// model.clusterCenters.foreach(println)
// 有兴趣可以加我的大数据、数据分析、爬虫群:
// 《453908562》