Spark SQL 2.x — WordCount example

package com.ws.sparksql
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object SqlWordCount {

  /**
   * Word count implemented with Spark SQL (Spark 2.x).
   *
   * Reads a text file, splits each line into whitespace-separated words,
   * and prints the per-word counts in descending order of frequency.
   *
   * @param args optional; `args(0)` overrides the default input path
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SqlWordCount").master("local[*]").getOrCreate()

    // Input path is configurable from the command line; falls back to the
    // original hard-coded HDFS location when no argument is given.
    val inputPath = if (args.nonEmpty) args(0) else "hdfs://hadoop-01:9000/project"

    // Dataset[String]: a single column, named "value" by default.
    // A Dataset is a typed, Catalyst-optimized abstraction on top of RDDs.
    val lines: Dataset[String] = spark.read.textFile(inputPath)

    // Bring the implicit encoders / column conversions into scope.
    import spark.implicits._

    // Split each line into individual words.
    val words: Dataset[String] = lines.flatMap(_.split(" "))

    // createOrReplaceTempView is idempotent; plain createTempView throws an
    // AnalysisException if a view with this name already exists in the session.
    words.createOrReplaceTempView("t_project")

    // DataFrame API alternative (agg() could also be used):
    // val result = words.groupBy($"value" as "keyWord").count().sort($"count" desc)

    // SQL API: count occurrences of each word, most frequent first.
    val result: DataFrame = spark.sql(
      """select value as keyWord, count(*) as num
        |from t_project
        |group by keyWord
        |order by num desc""".stripMargin)

    result.show()

    spark.stop()
  }
}

Result:

+-------+---+
|keyWord|num|
+-------+---+
|  spark|  7|
| hadoop|  5|
|   hive|  4|
|  hbase|  3|
|  flume|  2|
|  sqoop|  1|
| ssqoop|  1|
+-------+---+

猜你喜欢

转载自blog.csdn.net/bb23417274/article/details/82947762