1、分别给出一张表的数据:student_scores.txt
字段是:学号,姓名,性别,所属班级编号,入学成绩
学号,姓名,性别,所属班级编号,入学成绩
170401011001 ,施礼义,男,0101,467
170401011002 ,王旭,男,0101,518
170401011003 ,肖桢,女,0101,509
170401011004 ,吴佩东,男,0101,508
170401011005 ,魏会,男,0101,494
170401011006 ,曾美,女,0101,500
170401011007 ,邵亚,女,0101,490
170401011008 ,朱燕菊,女,0101,466
使用三种方式
第1种:指定列名添加Schema 查询数据
package SparkSql
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkContext, sql}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
/**
* Created by 一个蔡狗 on 2020/4/13.
* 第一种方法
* 手动指定 添加 Schema
*/
object CreateDFDS {
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession — the entry point for the DataFrame/SQL API.
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("01").getOrCreate()
    // Obtain the underlying SparkContext for RDD operations.
    val sc: SparkContext = spark.sparkContext
    // Load the raw text file; each element is one CSV line.
    val fileRDD: RDD[String] = sc.textFile("E:\\student_scores.txt")
    // Split each line on commas (not spaces — the data is comma-separated) and
    // keep the five columns as a tuple. b(0) is trimmed because the sample ids
    // carry a trailing space ("170401011001 ,...").
    val dataRDD: RDD[(String, String, String, String, String)] =
      fileRDD.map(_.split(",")).map(b => (b(0).trim, b(1), b(2), b(3), b(4)))
    // Implicit conversions required for rdd.toDF.
    import spark.implicits._
    // BUG FIX: this exercise is "add a schema by specifying column names", but
    // the original called toDF() with no arguments, yielding generic _1.._5
    // column names. Name the columns explicitly.
    val dataDF: DataFrame = dataRDD.toDF("id", "name", "sex", "classId", "score")
    dataDF.show()
    dataDF.printSchema()
    // Release resources.
    sc.stop()
    spark.stop()
  }
}
第2种:通过StructType指定Schema 查询数据
package SparkSql
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StructField, _}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
/**
* Created by 一个蔡狗 on 2020/4/13.
*
* 第二种方法
*
*
*
*/
object CreateDFDS_02 {
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession — the entry point for the DataFrame/SQL API.
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("01").getOrCreate()
    // Obtain the underlying SparkContext for RDD operations.
    val sc: SparkContext = spark.sparkContext
    // Load the raw text file; each element is one CSV line.
    val fileRDD: RDD[String] = sc.textFile("E:\\student_scores.txt")
    // Split on commas; each Row must carry exactly as many values as the
    // schema declares fields. The id column is trimmed (sample data has a
    // trailing space after the id).
    val rowRDD: RDD[Row] = fileRDD.map(_.split(",")).map(b => Row(b(0).trim, b(1), b(2), b(3), b(4)))
    // Define the table schema explicitly via StructType.
    // BUG FIX: the original declared only 4 StructFields while each Row holds
    // 5 values, which makes createDataFrame fail at runtime with a
    // row-length/schema mismatch. All five columns are declared here.
    val structType: StructType = StructType(Seq(
      StructField("id", StringType, nullable = true),      // nullable
      StructField("name", StringType, nullable = true),
      StructField("sex", StringType, nullable = true),
      StructField("classId", StringType, nullable = true),
      StructField("score", StringType, nullable = true)
    ))
    val dataDF: DataFrame = spark.createDataFrame(rowRDD, structType)
    // Register as a temporary view and query it with SQL.
    dataDF.createOrReplaceTempView("DataDF")
    spark.sql("select * from DataDF ").show()
    // Release resources.
    sc.stop()
    spark.stop()
  }
}
第3种:编写样例类,利用反射机制推断Schema 查询数据
package SparkSql
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
/**
* Created by 一个蔡狗 on 2020/4/13.
*/
object CreateDFDS_03 {
  // Case class whose fields drive schema inference through reflection.
  // BUG FIX: the original had only 4 fields for the 5 CSV columns, so the
  // entrance score was silently dropped and the field named "score" actually
  // held the class id. Declared all five columns with meaningful names.
  case class tudent_scores(id: String, name: String, sex: String, classId: String, score: String)

  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession — the entry point for the DataFrame/SQL API.
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("01").getOrCreate()
    // Obtain the underlying SparkContext for RDD operations.
    val sc: SparkContext = spark.sparkContext
    // Load the raw text file; each element is one CSV line.
    val fileRDD: RDD[String] = sc.textFile("E:\\student_scores.txt")
    // Split each line on commas into its five columns.
    val dataRDD: RDD[Array[String]] = fileRDD.map(_.split(","))
    // Map every record onto the case class; a(0) is trimmed because the
    // sample ids carry a trailing space.
    val tudent_scoresRDD: RDD[tudent_scores] =
      dataRDD.map(a => tudent_scores(a(0).trim, a(1), a(2), a(3), a(4)))
    // Implicit conversions required for rdd.toDF.
    import spark.implicits._
    // RDD -> DataFrame via reflection on the case class.
    val tudent_scoresDF: DataFrame = tudent_scoresRDD.toDF()
    tudent_scoresDF.show()
    tudent_scoresDF.printSchema()
    // Register as a temporary view and query it with SQL.
    tudent_scoresDF.createOrReplaceTempView("tudent_scoresDF")
    spark.sql("select * from tudent_scoresDF").show()
    // Release resources.
    sc.stop()
    spark.stop()
  }
}