spark sql 之 DSL风格 练习mysql50题

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.2.0
      /_/

Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_221)
Type in expressions to have them evaluated.
Type :help for more information.

scala> //学生表

scala> case class Student(student_id:Int,student_name:String,birth:String,sex:String)
defined class Student

scala> val rdd = sc.makeRDD(Array(
     | (1 , "赵雷" , "1990-01-01" , "男"),
     | (2 , "钱电" , "1990-12-21" , "男"),
     | (3 , "孙风" , "1990-05-20" , "男"),
     | (4 , "李云" , "1990-08-06" , "男"),
     | (5 , "周梅" , "1991-12-01" , "女"),
     | (6 , "吴兰" , "1992-03-01" , "女"),
     | (7 , "郑竹" , "1989-07-01" , "女"),
     | (8 , "王菊" , "1990-01-20" , "女")))
rdd: org.apache.spark.rdd.RDD[(Int, String, String, String)] = ParallelCollectionRDD[0] at makeRDD at <console>:24

scala> val studentDF = rdd.map(x=>Student(x._1,x._2,x._3,x._4)).toDF
studentDF: org.apache.spark.sql.DataFrame = [student_id: int, student_name: string ... 2 more fields]

scala>

scala> //课程表

scala> case class Course(course_id:Int,course_name:String,teacher_id:Int)
defined class Course

scala> val rdd2 = sc.makeRDD(Array(
     | (1 , "语文" , 2),
     | (2 , "数学" , 1),
     | (3 , "英语" , 3)
     | ))
rdd2: org.apache.spark.rdd.RDD[(Int, String, Int)] = ParallelCollectionRDD[2] at makeRDD at <console>:24

scala> val courseDF = rdd2.map(x=>Course(x._1,x._2,x._3)).toDF
courseDF: org.apache.spark.sql.DataFrame = [course_id: int, course_name: string ... 1 more field]

scala> //教师表

scala> case class Teacher(teacher_id:Int,teacher_name:String)
defined class Teacher

scala> val rdd3 = sc.makeRDD(Array(
     | (1 , "张三"),
     | (2 , "李四"),
     | (3 , "王五")
     | ))
rdd3: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[4] at makeRDD at <console>:24

scala> val teacherDF = rdd3.map(x=>Teacher(x._1,x._2)).toDF
teacherDF: org.apache.spark.sql.DataFrame = [teacher_id: int, teacher_name: string]

scala> //成绩表

scala> case class Score(student_id:Int,course_id:Int,score:Int)
defined class Score

scala> val rdd4 = sc.makeRDD(Array(
     | (1 , 1 , 80),
     | (1 , 2 , 90),
     | (1 , 3 , 99),
     | (2 , 1 , 70),
     | (2 , 2 , 60),
     | (2 , 3 , 80),
     | (3 , 1 , 80),
     | (3 , 2 , 80),
     | (3 , 3 , 80),
     | (4 , 1 , 50),
     | (4 , 2 , 30),
     | (4 , 3 , 20),
     | (5 , 1 , 76),
     | (5 , 2 , 87),
     | (6 , 1 , 31),
     | (6 , 3 , 34),
     | (7 , 2 , 89),
     | (7 , 3 , 98)))
rdd4: org.apache.spark.rdd.RDD[(Int, Int, Int)] = ParallelCollectionRDD[6] at makeRDD at <console>:24

scala> val scoreDF = rdd4.map(x=>Score(x._1,x._2,x._3)).toDF
scoreDF: org.apache.spark.sql.DataFrame = [student_id: int, course_id: int ... 1 more field]

scala> import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions._

scala>

scala> //1、查询"01"课程比"02"课程成绩高的学生的信息及课程分数:

scala> scoreDF.as("s1").join(scoreDF.as("s2"),"student_id").filter("s1.course_id=1 and s2.course_id=2 and s1.score>s2.score").join(studentDF,"student_id").show
+----------+---------+-----+---------+-----+------------+----------+---+
|student_id|course_id|score|course_id|score|student_name|     birth|sex|
+----------+---------+-----+---------+-----+------------+----------+---+
|         4|        1|   50|        2|   30|          李云|1990-08-06||
|         2|        1|   70|        2|   60|          钱电|1990-12-21||
+----------+---------+-----+---------+-----+------------+----------+---+


scala> //2.查询"01"课程比"02"课程成绩低的学生的信息及课程分数:

scala> scoreDF.as("s1").join(scoreDF.as("s2"),"student_id").filter("s1.course_id=1 and s2.course_id=2 and s1.score<s2.score").join(studentDF,"student_id").show
+----------+---------+-----+---------+-----+------------+----------+---+
|student_id|course_id|score|course_id|score|student_name|     birth|sex|
+----------+---------+-----+---------+-----+------------+----------+---+
|         1|        1|   80|        2|   90|          赵雷|1990-01-01||
|         5|        1|   76|        2|   87|          周梅|1991-12-01||
+----------+---------+-----+---------+-----+------------+----------+---+


scala> //3.查询平均成绩大于等于60分的同学的学生编号和学生姓名和平均成绩

scala> scoreDF.as("s1").groupBy("student_id").avg("score").join(studentDF.as("s2"),"student_id").filter($"avg(score)">=60).show
+----------+-----------------+------------+----------+---+
|student_id|       avg(score)|student_name|     birth|sex|
+----------+-----------------+------------+----------+---+
|         1|89.66666666666667|          赵雷|1990-01-01||
|         3|             80.0|          孙风|1990-05-20||
|         5|             81.5|          周梅|1991-12-01||
|         7|             93.5|          郑竹|1989-07-01||
|         2|             70.0|          钱电|1990-12-21||
+----------+-----------------+------------+----------+---+


scala> //4.查询平均成绩小于60分的同学的学生编号和学生姓名和平均成绩:(包括有成绩的和无成绩的)

scala> studentDF.as("s2").join((scoreDF.as("s1").groupBy("student_id").avg("score")).as("s3"),Seq("student_id"),"left_outer").as("s").withColumnRenamed("avg(score)","A").where((col("A")<60) || (col("A").isNull)).show
+----------+------------+----------+---+------------------+
|student_id|student_name|     birth|sex|                 A|
+----------+------------+----------+---+------------------+
|         6|          吴兰|1992-03-01||              32.5|
|         4|          李云|1990-08-06||33.333333333333336|
|         8|          王菊|1990-01-20||              null|
+----------+------------+----------+---+------------------+


scala> //5.查询所有同学的学生编号、学生姓名、选课总数、所有课程的总成绩:

scala> //选课数

scala> scoreDF.groupBy("student_id").count
res4: org.apache.spark.sql.DataFrame = [student_id: int, count: bigint]

scala> //总成绩

scala> scoreDF.groupBy("student_id").sum("score")
res5: org.apache.spark.sql.DataFrame = [student_id: int, sum(score): bigint]

scala> //连表

scala> studentDF.join(scoreDF.groupBy("student_id").count,Seq("student_id"),"left_outer").join(scoreDF.groupBy("student_id").sum("score"),Seq("student_id"),"left_outer").show
+----------+------------+----------+---+-----+----------+
|student_id|student_name|     birth|sex|count|sum(score)|
+----------+------------+----------+---+-----+----------+
|         1|          赵雷|1990-01-01||    3|       269|
|         6|          吴兰|1992-03-01||    2|        65|
|         3|          孙风|1990-05-20||    3|       240|
|         5|          周梅|1991-12-01||    2|       163|
|         4|          李云|1990-08-06||    3|       100|
|         8|          王菊|1990-01-20|| null|      null|
|         7|          郑竹|1989-07-01||    2|       187|
|         2|          钱电|1990-12-21||    3|       210|
+----------+------------+----------+---+-----+----------+


scala> //6.查询"李"姓老师的数量:

scala> teacherDF.where("teacher_name like '李%'").select("teacher_id").count
res7: Long = 1

scala> //7.查询学过"张三"老师授课的同学的信息:

scala> scoreDF.join(courseDF,"course_id").join(teacherDF,"teacher_id").filter("teacher_name = '张三'").join(studentDF,"student_id").show
+----------+----------+---------+-----+-----------+------------+------------+----------+---+
|student_id|teacher_id|course_id|score|course_name|teacher_name|student_name|     birth|sex|
+----------+----------+---------+-----+-----------+------------+------------+----------+---+
|         1|         1|        2|   90|         数学|          张三|          赵雷|1990-01-01||
|         3|         1|        2|   80|         数学|          张三|          孙风|1990-05-20||
|         5|         1|        2|   87|         数学|          张三|          周梅|1991-12-01||
|         4|         1|        2|   30|         数学|          张三|          李云|1990-08-06||
|         7|         1|        2|   89|         数学|          张三|          郑竹|1989-07-01||
|         2|         1|        2|   60|         数学|          张三|          钱电|1990-12-21||
+----------+----------+---------+-----+-----------+------------+------------+----------+---+


scala> //8. List the students who have never taken a course taught by teacher "张三":

scala> // Anti-join: keep only the students that have no score row in any course taught by 张三.

scala> studentDF.join(scoreDF.join(courseDF,"course_id").join(teacherDF,"teacher_id").where("teacher_name='张三'"),Seq("student_id"),"left_anti").show
+----------+------------+----------+---+
|student_id|student_name|     birth|sex|
+----------+------------+----------+---+
|         6|          吴兰|1992-03-01|  女|
|         8|          王菊|1990-01-20|  女|
+----------+------------+----------+---+


scala> //9.查询学过编号为"01"并且也学过编号为"02"的课程的同学的信息:

scala> scoreDF.where("course_id in(1,2)").groupBy("student_id").count.where("count=2").join(studentDF,"student_id").show
+----------+-----+------------+----------+---+
|student_id|count|student_name|     birth|sex|
+----------+-----+------------+----------+---+
|         1|    2|          赵雷|1990-01-01||
|         3|    2|          孙风|1990-05-20||
|         5|    2|          周梅|1991-12-01||
|         4|    2|          李云|1990-08-06||
|         2|    2|          钱电|1990-12-21||
+----------+-----+------------+----------+---+


scala> //10.查询学过编号为"01"但是没有学过编号为"02"的课程的同学的信息:

scala> studentDF.join(scoreDF.where("course_id in (2)"),Seq("student_id"),"left_outer").as("s1").where("s1.course_id is null").join(scoreDF.where("course_id in (1)"),"student_id").show
+----------+------------+----------+---+---------+-----+---------+-----+
|student_id|student_name|     birth|sex|course_id|score|course_id|score|
+----------+------------+----------+---+---------+-----+---------+-----+
|         6|          吴兰|1992-03-01||     null| null|        1|   31|
+----------+------------+----------+---+---------+-----+---------+-----+


scala> //11. List the students who have not taken every course:

scala> // count("course_id") skips the null row that the left outer join produces for a student without scores, so such a student counts as 0 courses.

scala> studentDF.join(scoreDF,Seq("student_id"),"left_outer").groupBy("student_id").agg(count("course_id").as("count")).where(s"count != ${courseDF.select("course_id").count}").join(studentDF,"student_id").show
+----------+-----+------------+----------+---+
|student_id|count|student_name|     birth|sex|
+----------+-----+------------+----------+---+
|         6|    2|          吴兰|1992-03-01|  女|
|         5|    2|          周梅|1991-12-01|  女|
|         8|    0|          王菊|1990-01-20|  女|
|         7|    2|          郑竹|1989-07-01|  女|
+----------+-----+------------+----------+---+


scala> //12、查询至少有一门课与学号为"01"的同学所学相同的同学的信息:

scala> studentDF.as("a").join(scoreDF.as("c"),"student_id").as("d").join(scoreDF.where("student_id=1").as("b"),"course_id").select("d.student_id").distinct.where("student_id!=1").join(studentDF,"student_id").show
+----------+------------+----------+---+
|student_id|student_name|     birth|sex|
+----------+------------+----------+---+
|         6|          吴兰|1992-03-01||
|         3|          孙风|1990-05-20||
|         5|          周梅|1991-12-01||
|         4|          李云|1990-08-06||
|         7|          郑竹|1989-07-01||
|         2|          钱电|1990-12-21||
+----------+------------+----------+---+


scala> //13.查询和"01"号的同学学习的课程完全相同的其他同学的信息:

scala> scoreDF.where("student_id=1").as("s1").join(scoreDF.as("s2"),"course_id").groupBy("s2.student_id").count.as("s3").where(s"count = ${scoreDF.where("student_id=1").count} and student_id!=1").join(studentDF,"student_id").show
+----------+-----+------------+----------+---+
|student_id|count|student_name|     birth|sex|
+----------+-----+------------+----------+---+
|         3|    3|          孙风|1990-05-20||
|         4|    3|          李云|1990-08-06||
|         2|    3|          钱电|1990-12-21||
+----------+-----+------------+----------+---+


scala> //14. List the names of the students who have not taken any course taught by teacher "张三":

scala> // Students with no matching score row in 张三's courses come out of the left outer join with a null teacher_id.

scala> studentDF.join(teacherDF.where("teacher_name='张三'").join(courseDF,"teacher_id").join(scoreDF,Seq("course_id"),"left_outer"),Seq("student_id"),"left_outer").as("s1").where("s1.teacher_id is null").show
+----------+------------+----------+---+---------+----------+------------+-----------+-----+
|student_id|student_name|     birth|sex|course_id|teacher_id|teacher_name|course_name|score|
+----------+------------+----------+---+---------+----------+------------+-----------+-----+
|         6|          吴兰|1992-03-01|  女|     null|      null|        null|       null| null|
|         8|          王菊|1990-01-20|  女|     null|      null|        null|       null| null|
+----------+------------+----------+---+---------+----------+------------+-----------+-----+


scala> //15、查询两门及其以上不及格课程的同学的学号,姓名及其平均成绩:

scala> scoreDF.where("score<60").groupBy("student_id").count.where("count>=2").as("s1").join(scoreDF.as("s2"),"student_id").groupBy("student_id").avg("score").join(studentDF,"student_id").show
+----------+------------------+------------+----------+---+
|student_id|        avg(score)|student_name|     birth|sex|
+----------+------------------+------------+----------+---+
|         6|              32.5|          吴兰|1992-03-01||
|         4|33.333333333333336|          李云|1990-08-06||
+----------+------------------+------------+----------+---+


scala> //16、检索"01"课程分数小于60,按分数降序排列的学生信息:

scala> scoreDF.where("course_id=1 and score<60").join(studentDF,"student_id").orderBy($"score".desc).show
+----------+---------+-----+------------+----------+---+
|student_id|course_id|score|student_name|     birth|sex|
+----------+---------+-----+------------+----------+---+
|         4|        1|   50|          李云|1990-08-06||
|         6|        1|   31|          吴兰|1992-03-01||
+----------+---------+-----+------------+----------+---+


scala> //17、按平均成绩从高到低显示所有学生的所有课程的成绩以及平均成绩:

scala> scoreDF.join(scoreDF.groupBy("student_id").avg("score"),Seq("student_id"),"left_outer").join(studentDF,"student_id").orderBy($"avg(score)".desc).show
+----------+---------+-----+------------------+------------+----------+---+
|student_id|course_id|score|        avg(score)|student_name|     birth|sex|
+----------+---------+-----+------------------+------------+----------+---+
|         7|        2|   89|              93.5|          郑竹|1989-07-01||
|         7|        3|   98|              93.5|          郑竹|1989-07-01||
|         1|        1|   80| 89.66666666666667|          赵雷|1990-01-01||
|         1|        2|   90| 89.66666666666667|          赵雷|1990-01-01||
|         1|        3|   99| 89.66666666666667|          赵雷|1990-01-01||
|         5|        1|   76|              81.5|          周梅|1991-12-01||
|         5|        2|   87|              81.5|          周梅|1991-12-01||
|         3|        2|   80|              80.0|          孙风|1990-05-20||
|         3|        3|   80|              80.0|          孙风|1990-05-20||
|         3|        1|   80|              80.0|          孙风|1990-05-20||
|         2|        2|   60|              70.0|          钱电|1990-12-21||
|         2|        1|   70|              70.0|          钱电|1990-12-21||
|         2|        3|   80|              70.0|          钱电|1990-12-21||
|         4|        2|   30|33.333333333333336|          李云|1990-08-06||
|         4|        3|   20|33.333333333333336|          李云|1990-08-06||
|         4|        1|   50|33.333333333333336|          李云|1990-08-06||
|         6|        1|   31|              32.5|          吴兰|1992-03-01||
|         6|        3|   34|              32.5|          吴兰|1992-03-01||
+----------+---------+-----+------------------+------------+----------+---+


scala> //18. Per course, show the highest, lowest and average score plus the pass, medium, good and excellent rates (course ID, course name, max, min, avg, pass rate, medium rate, good rate, excellent rate):

scala> // Per course, count the passing scores. A score of exactly 60 passes: the other queries in this session treat score<60 as failing.

scala> val jige = scoreDF.rdd.map(x=>{
     | if(x.getAs("score").toString.toInt >= 60) (x(1).toString,1) else (x(1).toString,0)}).reduceByKey(_+_).toDF("course_id","jige")
jige: org.apache.spark.sql.DataFrame = [course_id: string, jige: int]

scala> // Per course, count the scores above 70.

scala> val zhongdeng = scoreDF.rdd.map(x=>{
     | if(x.getAs("score").toString.toInt > 70) (x(1).toString,1) else (x(1).toString,0)}).reduceByKey(_+_).toDF("course_id","zhongdeng")
zhongdeng: org.apache.spark.sql.DataFrame = [course_id: string, zhongdeng: int]

scala> // Per course, count the scores above 80.

scala> val youliang = scoreDF.rdd.map(x=>{
     | if(x.getAs("score").toString.toInt > 80) (x(1).toString,1) else (x(1).toString,0)}).reduceByKey(_+_).toDF("course_id","youliang")
youliang: org.apache.spark.sql.DataFrame = [course_id: string, youliang: int]

scala> // Per course, count the scores above 90.

scala> val youxiu = scoreDF.rdd.map(x=>{
     | if(x.getAs("score").toString.toInt > 90) (x(1).toString,1) else (x(1).toString,0)}).reduceByKey(_+_).toDF("course_id","youxiu")
youxiu: org.apache.spark.sql.DataFrame = [course_id: string, youxiu: int]

scala> // Per-course max, min, average and number of scores; the count is the denominator of every rate below.

scala> val s1 = scoreDF.groupBy("course_id").agg("score"->"max","score"->"min","score"->"avg","score"->"count")
s1: org.apache.spark.sql.DataFrame = [course_id: int, max(score): int ... 3 more fields]

scala> // Join the four counters onto the statistics, turn each into a rate, then drop the raw counters.

scala> s1.join(jige,"course_id").join(zhongdeng,"course_id").join(youliang,"course_id").join(youxiu,"course_id").withColumn("jgl",$"jige"/$"count(score)").withColumn("zdl",$"zhongdeng"/$"count(score)").withColumn("yll",$"youliang"/$"count(score)").withColumn("yxl",$"youxiu"/$"count(score)").drop("jige","zhongdeng","youliang","youxiu").show
+---------+----------+----------+-----------------+------------+------------------+------------------+------------------+------------------+
|course_id|max(score)|min(score)|       avg(score)|count(score)|               jgl|               zdl|               yll|               yxl|
+---------+----------+----------+-----------------+------------+------------------+------------------+------------------+------------------+
|        1|        80|        31|             64.5|           6|0.6666666666666666|               0.5|               0.0|               0.0|
|        3|        99|        20|             68.5|           6|0.6666666666666666|0.6666666666666666|0.3333333333333333|0.3333333333333333|
|        2|        90|        30|72.66666666666667|           6|0.8333333333333334|0.6666666666666666|               0.5|               0.0|
+---------+----------+----------+-----------------+------------+------------------+------------------+------------------+------------------+


scala> //19、按各科成绩进行排序,并显示排名:

scala> scoreDF.selectExpr("*","row_number() over(partition by course_id order by score desc) rank").show
+----------+---------+-----+----+
|student_id|course_id|score|rank|
+----------+---------+-----+----+
|         1|        1|   80|   1|
|         3|        1|   80|   2|
|         5|        1|   76|   3|
|         2|        1|   70|   4|
|         4|        1|   50|   5|
|         6|        1|   31|   6|
|         1|        3|   99|   1|
|         7|        3|   98|   2|
|         2|        3|   80|   3|
|         3|        3|   80|   4|
|         6|        3|   34|   5|
|         4|        3|   20|   6|
|         1|        2|   90|   1|
|         7|        2|   89|   2|
|         5|        2|   87|   3|
|         3|        2|   80|   4|
|         2|        2|   60|   5|
|         4|        2|   30|   6|
+----------+---------+-----+----+


scala> //20、查询学生的总成绩并进行排名:

scala> scoreDF.selectExpr("*","sum(score) over(partition by student_id) as sum_score").dropDuplicates("student_id","sum_score").selectExpr("*","row_number() over(order by sum_score desc) rank").show
20/08/16 15:38:23 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
+----------+---------+-----+---------+----+
|student_id|course_id|score|sum_score|rank|
+----------+---------+-----+---------+----+
|         1|        1|   80|      269|   1|
|         3|        1|   80|      240|   2|
|         2|        1|   70|      210|   3|
|         7|        2|   89|      187|   4|
|         5|        1|   76|      163|   5|
|         4|        1|   50|      100|   6|
|         6|        1|   31|       65|   7|
+----------+---------+-----+---------+----+


scala> //21、查询不同老师所教不同课程平均分从高到低显示:

scala> scoreDF.join(courseDF,"course_id").join(teacherDF,"teacher_id").groupBy("teacher_id","course_id").avg("score").orderBy($"avg(score)".desc).show
+----------+---------+-----------------+
|teacher_id|course_id|       avg(score)|
+----------+---------+-----------------+
|         1|        2|72.66666666666667|
|         3|        3|             68.5|
|         2|        1|             64.5|
+----------+---------+-----------------+


scala> //22、查询所有课程的成绩第2名到第3名的学生信息及该课程成绩:

scala> scoreDF.selectExpr("*","row_number() over(partition by course_id order by score desc) rank").where("rank between 2 and 3").join(studentDF,"student_id").show
+----------+---------+-----+----+------------+----------+---+
|student_id|course_id|score|rank|student_name|     birth|sex|
+----------+---------+-----+----+------------+----------+---+
|         3|        1|   80|   2|          孙风|1990-05-20||
|         5|        1|   76|   3|          周梅|1991-12-01||
|         5|        2|   87|   3|          周梅|1991-12-01||
|         7|        3|   98|   2|          郑竹|1989-07-01||
|         7|        2|   89|   2|          郑竹|1989-07-01||
|         2|        3|   80|   3|          钱电|1990-12-21||
+----------+---------+-----+----+------------+----------+---+


scala> //23.统计各科成绩各分数段人数:课程编号,课程名称,[100-85],[85-70],[70-60],[0-60]及所占百分比

scala> val fenduan = scoreDF.rdd.map(x=>{
    
    
     | if(x.getAs("score").toString.toInt < 60) (x(1).toString,1)
     | else if(x.getAs("score").toString.toInt < 70) (x(1).toString,2)
     | else if(x.getAs("score").toString.toInt < 85) (x(1).toString,3)
     | else (x(1).toString,4)
     | }).toDF("course_id","fenduan")
fenduan: org.apache.spark.sql.DataFrame = [course_id: string, fenduan: int]

scala> fenduan.groupBy("course_id").count.as("f1").join(fenduan.groupBy("course_id","fenduan").count.as("f2"),"course_id").withColumn("rate",$"f2.count"/$"f1.count").drop($"f1.count").join(courseDF,"course_id").show
+---------+-------+-----+-------------------+-----------+----------+
|course_id|fenduan|count|               rate|course_name|teacher_id|
+---------+-------+-----+-------------------+-----------+----------+
|        1|      3|    4| 0.6666666666666666|         语文|         2|
|        1|      1|    2| 0.3333333333333333|         语文|         2|
|        3|      1|    2| 0.3333333333333333|         英语|         3|
|        3|      3|    2| 0.3333333333333333|         英语|         3|
|        3|      4|    2| 0.3333333333333333|         英语|         3|
|        2|      2|    1|0.16666666666666666|         数学|         1|
|        2|      4|    3|                0.5|         数学|         1|
|        2|      3|    1|0.16666666666666666|         数学|         1|
|        2|      1|    1|0.16666666666666666|         数学|         1|
+---------+-------+-----+-------------------+-----------+----------+


scala> //24. Show each student's average score and its rank:

scala> // The column name avg(score) has to be back-quoted; in single quotes it is a string literal, every row sorts equal and the ranks are arbitrary.

scala> scoreDF.groupBy("student_id").avg("score").selectExpr("*","row_number() over(order by `avg(score)` desc) as rank").show
20/08/16 15:38:26 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
+----------+------------------+----+
|student_id|        avg(score)|rank|
+----------+------------------+----+
|         7|              93.5|   1|
|         1| 89.66666666666667|   2|
|         5|              81.5|   3|
|         3|              80.0|   4|
|         2|              70.0|   5|
|         4|33.333333333333336|   6|
|         6|              32.5|   7|
+----------+------------------+----+


scala> //25、查询各科成绩前三名的记录

scala> scoreDF.selectExpr("*","row_number() over(partition by course_id order by score desc) rank").where("rank <=3").show
+----------+---------+-----+----+
|student_id|course_id|score|rank|
+----------+---------+-----+----+
|         1|        1|   80|   1|
|         3|        1|   80|   2|
|         5|        1|   76|   3|
|         1|        3|   99|   1|
|         7|        3|   98|   2|
|         2|        3|   80|   3|
|         1|        2|   90|   1|
|         7|        2|   89|   2|
|         5|        2|   87|   3|
+----------+---------+-----+----+


scala> //26、查询每门课程被选修的学生数:

scala> scoreDF.groupBy("course_id").count.show
+---------+-----+
|course_id|count|
+---------+-----+
|        1|    6|
|        3|    6|
|        2|    6|
+---------+-----+


scala> //27.查询出只有两门课程的全部学生的学号和姓名:

scala> scoreDF.groupBy("student_id").count.where("count=2").join(studentDF,"student_id").show
+----------+-----+------------+----------+---+
|student_id|count|student_name|     birth|sex|
+----------+-----+------------+----------+---+
|         6|    2|          吴兰|1992-03-01||
|         5|    2|          周梅|1991-12-01||
|         7|    2|          郑竹|1989-07-01||
+----------+-----+------------+----------+---+


scala> //28、查询男生、女生人数:

scala> studentDF.groupBy("sex").count.show
+---+-----+
|sex|count|
+---+-----+
|  男|    4|
|  女|    4|
+---+-----+


scala> //29、查询名字中含有"风"字的学生信息:

scala> studentDF.where("student_name like '%风%'").show
+----------+------------+----------+---+
|student_id|student_name|     birth|sex|
+----------+------------+----------+---+
|         3|          孙风|1990-05-20||
+----------+------------+----------+---+


scala> //30、查询同名同姓学生名单,并统计同名人数:

scala> studentDF.groupBy("student_name").count.where("count>1").show
+------------+-----+
|student_name|count|
+------------+-----+
+------------+-----+


scala> //31、查询1990年出生的学生名单:

scala> studentDF.where("year(birth) = 1990").show
+----------+------------+----------+---+
|student_id|student_name|     birth|sex|
+----------+------------+----------+---+
|         1|          赵雷|1990-01-01||
|         2|          钱电|1990-12-21||
|         3|          孙风|1990-05-20||
|         4|          李云|1990-08-06||
|         8|          王菊|1990-01-20||
+----------+------------+----------+---+


scala> //32. Show the average score of every course, sorted by average score descending and, for equal averages, by course ID ascending:

scala> // Both sort keys go into a single orderBy; a second orderBy call re-sorts the whole result and discards the first ordering.

scala> scoreDF.groupBy("course_id").avg("score").orderBy($"avg(score)".desc,$"course_id").show
+---------+-----------------+
|course_id|       avg(score)|
+---------+-----------------+
|        2|72.66666666666667|
|        3|             68.5|
|        1|             64.5|
+---------+-----------------+


scala> //33、查询平均成绩大于等于85的所有学生的学号、姓名和平均成绩:

scala> scoreDF.groupBy("student_id").avg("score").where("avg(score)>=85").join(studentDF,"student_id").show
+----------+-----------------+------------+----------+---+
|student_id|       avg(score)|student_name|     birth|sex|
+----------+-----------------+------------+----------+---+
|         1|89.66666666666667|          赵雷|1990-01-01||
|         7|             93.5|          郑竹|1989-07-01||
+----------+-----------------+------------+----------+---+


scala> //34. List the name and score of each student who scored below 60 in the course named "数学":

scala> // Join studentDF to obtain the name, then project only the two columns the question asks for.

scala> scoreDF.where("score<60").join(courseDF,"course_id").where("course_name='数学'").join(studentDF,"student_id").select("student_name","score").show
+------------+-----+
|student_name|score|
+------------+-----+
|          李云|   30|
+------------+-----+


scala> //35、查询所有学生的课程及分数情况:

scala> studentDF.join(scoreDF,Seq("student_id"),"left_outer").show
+----------+------------+----------+---+---------+-----+
|student_id|student_name|     birth|sex|course_id|score|
+----------+------------+----------+---+---------+-----+
|         1|          赵雷|1990-01-01||        1|   80|
|         1|          赵雷|1990-01-01||        2|   90|
|         1|          赵雷|1990-01-01||        3|   99|
|         6|          吴兰|1992-03-01||        1|   31|
|         6|          吴兰|1992-03-01||        3|   34|
|         3|          孙风|1990-05-20||        1|   80|
|         3|          孙风|1990-05-20||        2|   80|
|         3|          孙风|1990-05-20||        3|   80|
|         5|          周梅|1991-12-01||        1|   76|
|         5|          周梅|1991-12-01||        2|   87|
|         4|          李云|1990-08-06||        1|   50|
|         4|          李云|1990-08-06||        2|   30|
|         4|          李云|1990-08-06||        3|   20|
|         8|          王菊|1990-01-20||     null| null|
|         7|          郑竹|1989-07-01||        2|   89|
|         7|          郑竹|1989-07-01||        3|   98|
|         2|          钱电|1990-12-21||        1|   70|
|         2|          钱电|1990-12-21||        2|   60|
|         2|          钱电|1990-12-21||        3|   80|
+----------+------------+----------+---+---------+-----+


scala> //36.查询任何一门课程成绩在70分以上的学生姓名、课程名称和分数:

scala> scoreDF.where("score>70").join(studentDF,"student_id").join(courseDF,"course_id").show
+---------+----------+-----+------------+----------+---+-----------+----------+
|course_id|student_id|score|student_name|     birth|sex|course_name|teacher_id|
+---------+----------+-----+------------+----------+---+-----------+----------+
|        1|         1|   80|          赵雷|1990-01-01||         语文|         2|
|        1|         3|   80|          孙风|1990-05-20||         语文|         2|
|        1|         5|   76|          周梅|1991-12-01||         语文|         2|
|        3|         1|   99|          赵雷|1990-01-01||         英语|         3|
|        3|         3|   80|          孙风|1990-05-20||         英语|         3|
|        3|         7|   98|          郑竹|1989-07-01||         英语|         3|
|        3|         2|   80|          钱电|1990-12-21||         英语|         3|
|        2|         1|   90|          赵雷|1990-01-01||         数学|         1|
|        2|         3|   80|          孙风|1990-05-20||         数学|         1|
|        2|         5|   87|          周梅|1991-12-01||         数学|         1|
|        2|         7|   89|          郑竹|1989-07-01||         数学|         1|
+---------+----------+-----+------------+----------+---+-----------+----------+


scala> //37、查询课程不及格的学生:

scala> scoreDF.where("score<60").join(studentDF,"student_id").show
+----------+---------+-----+------------+----------+---+
|student_id|course_id|score|student_name|     birth|sex|
+----------+---------+-----+------------+----------+---+
|         6|        1|   31|          吴兰|1992-03-01||
|         6|        3|   34|          吴兰|1992-03-01||
|         4|        1|   50|          李云|1990-08-06||
|         4|        2|   30|          李云|1990-08-06||
|         4|        3|   20|          李云|1990-08-06||
+----------+---------+-----+------------+----------+---+


scala> //38、查询课程编号为01且课程成绩在80分以上的学生的学号和姓名:

scala> scoreDF.where("course_id=1 and score>=80").join(studentDF,"student_id").show
+----------+---------+-----+------------+----------+---+
|student_id|course_id|score|student_name|     birth|sex|
+----------+---------+-----+------------+----------+---+
|         1|        1|   80|          赵雷|1990-01-01||
|         3|        1|   80|          孙风|1990-05-20||
+----------+---------+-----+------------+----------+---+


scala> //39.求每门课程的学生人数:

scala> scoreDF.groupBy("course_id").count.show
+---------+-----+
|course_id|count|
+---------+-----+
|        1|    6|
|        3|    6|
|        2|    6|
+---------+-----+


scala> //40. Among the students taking courses taught by teacher "张三", show the top scorer's record and score:

scala> // The window column is aliased max_score, so that is the name the filter compares against score.

scala> scoreDF.join(courseDF,"course_id").join(teacherDF,"teacher_id").where("teacher_name='张三'").selectExpr("*","max(score) over() max_score").where("max_score=score").show
20/08/16 15:38:32 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
+----------+---------+----------+-----+-----------+------------+---------+
|teacher_id|course_id|student_id|score|course_name|teacher_name|max_score|
+----------+---------+----------+-----+-----------+------------+---------+
|         1|        2|         1|   90|         数学|          张三|       90|
+----------+---------+----------+-----+-----------+------------+---------+


scala> //41、查询不同课程成绩相同的学生的学生编号、课程编号、学生成绩:

scala> scoreDF.alias("s1").crossJoin(scoreDF.alias("s2")).filter("s1.score=s2.score and s1.course_id!=s2.course_id").show()
+----------+---------+-----+----------+---------+-----+
|student_id|course_id|score|student_id|course_id|score|
+----------+---------+-----+----------+---------+-----+
|         1|        1|   80|         2|        3|   80|
|         1|        1|   80|         3|        2|   80|
|         1|        1|   80|         3|        3|   80|
|         2|        3|   80|         1|        1|   80|
|         2|        3|   80|         3|        1|   80|
|         2|        3|   80|         3|        2|   80|
|         3|        1|   80|         2|        3|   80|
|         3|        1|   80|         3|        2|   80|
|         3|        1|   80|         3|        3|   80|
|         3|        2|   80|         1|        1|   80|
|         3|        2|   80|         2|        3|   80|
|         3|        2|   80|         3|        1|   80|
|         3|        2|   80|         3|        3|   80|
|         3|        3|   80|         1|        1|   80|
|         3|        3|   80|         3|        1|   80|
|         3|        3|   80|         3|        2|   80|
+----------+---------+-----+----------+---------+-----+


scala> //42、查询每门课程成绩最好的前三名:

scala> scoreDF.selectExpr("*","row_number() over(partition by course_id order by score desc) rank").filter("rank<=3").show()
+----------+---------+-----+----+
|student_id|course_id|score|rank|
+----------+---------+-----+----+
|         1|        1|   80|   1|
|         3|        1|   80|   2|
|         5|        1|   76|   3|
|         1|        3|   99|   1|
|         7|        3|   98|   2|
|         2|        3|   80|   3|
|         1|        2|   90|   1|
|         7|        2|   89|   2|
|         5|        2|   87|   3|
+----------+---------+-----+----+


scala> //43、统计每门课程的学生选修人数(超过5人的课程才统计)要求输出课程号和选修人数,查询结果按人数降序排列,若人数相同,按课程号升序排列

scala> scoreDF.groupBy("course_id").count.withColumnRenamed("count","cnt").where("cnt>5").orderBy($"cnt".desc,$"course_id".asc).show
+---------+---+
|course_id|cnt|
+---------+---+
|        1|  6|
|        2|  6|
|        3|  6|
+---------+---+


scala> //44、检索至少选修两门课程的学生学号:

scala> scoreDF.groupBy($"student_id").count().filter("count>=2").show()
+----------+-----+
|student_id|count|
+----------+-----+
|         1|    3|
|         6|    2|
|         3|    3|
|         5|    2|
|         4|    3|
|         7|    2|
|         2|    3|
+----------+-----+


scala> //45、查询选修了全部课程的学生信息:

scala> scoreDF.dropDuplicates("student_id","course_id").groupBy("student_id").count.where(s"count = ${courseDF.select("course_id").distinct.count}").join(studentDF,"student_id").show
+----------+-----+------------+----------+---+
|student_id|count|student_name|     birth|sex|
+----------+-----+------------+----------+---+
|         1|    3|          赵雷|1990-01-01|  男|
|         3|    3|          孙风|1990-05-20|  男|
|         4|    3|          李云|1990-08-06|  男|
|         2|    3|          钱电|1990-12-21|  男|
+----------+-----+------------+----------+---+


scala> //46. Age of each student in completed years: the year difference, minus one when this year's birthday has not occurred yet (the output below was recorded on 2020-08-16)

scala> studentDF.selectExpr("*","year(current_date())-year(birth)-if(date_format(current_date(),'MMdd')<date_format(birth,'MMdd'),1,0) age").show
+----------+------------+----------+---+---+
|student_id|student_name|     birth|sex|age|
+----------+------------+----------+---+---+
|         1|          赵雷|1990-01-01|  男| 30|
|         2|          钱电|1990-12-21|  男| 29|
|         3|          孙风|1990-05-20|  男| 30|
|         4|          李云|1990-08-06|  男| 30|
|         5|          周梅|1991-12-01|  女| 28|
|         6|          吴兰|1992-03-01|  女| 28|
|         7|          郑竹|1989-07-01|  女| 31|
|         8|          王菊|1990-01-20|  女| 30|
+----------+------------+----------+---+---+


scala> //47. Students whose birthday falls in the current week (Monday through Sunday): next_day(today,'MON') is the first Monday strictly after today, so subtracting 7 days gives this week's Monday and subtracting 1 day gives this week's Sunday (this also holds when today is a Sunday). The birthday is moved into the current year before comparing; weeks that span New Year are not handled.

scala> studentDF.where("cast(concat_ws('-',date_format(current_date(),'yyyy'),date_format(birth,'MM-dd')) as date) between date_sub(next_day(current_date(),'MON'),7) and date_sub(next_day(current_date(),'MON'),1)").show
+----------+------------+-----+---+
|student_id|student_name|birth|sex|
+----------+------------+-----+---+
+----------+------------+-----+---+


scala> //48、查询下周过生日的学生: 下周1到+6天

scala> studentDF.filter("unix_timestamp(cast(concat_ws('-',date_format(current_date(),'yyyy'),date_format(birth,'MM'),date_format(birth,'dd')) as date),'yyyy-MM-dd')between unix_timestamp(next_day(current_date(),'MON'),'yyyy-MM-dd') and unix_timestamp(date_add(next_day(current_date(),'MON'),6),'yyyy-MM-dd')").show()
+----------+------------+-----+---+
|student_id|student_name|birth|sex|
+----------+------------+-----+---+
+----------+------------+-----+---+


scala> //49、查询本月过生日的学生:

scala> studentDF.filter("month(birth) = month(current_date())").show()
+----------+------------+----------+---+
|student_id|student_name|     birth|sex|
+----------+------------+----------+---+
|         4|          李云|1990-08-06|  男|
+----------+------------+----------+---+


scala> //50、查询12月份过生日的学生:

scala> studentDF.filter("month(birth) = 12").show()
+----------+------------+----------+---+
|student_id|student_name|     birth|sex|
+----------+------------+----------+---+
|         2|          钱电|1990-12-21|  男|
|         5|          周梅|1991-12-01|  女|
+----------+------------+----------+---+

猜你喜欢

转载自blog.csdn.net/sun_0128/article/details/108034501