Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 2.2.0
/_/
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_221)
Type in expressions to have them evaluated.
Type :help for more information.
scala> //学生表
scala> case class Student(student_id:Int,student_name:String,birth:String,sex:String)
defined class Student
scala> val rdd = sc.makeRDD(Array(
| (1 , "赵雷" , "1990-01-01" , "男"),
| (2 , "钱电" , "1990-12-21" , "男"),
| (3 , "孙风" , "1990-05-20" , "男"),
| (4 , "李云" , "1990-08-06" , "男"),
| (5 , "周梅" , "1991-12-01" , "女"),
| (6 , "吴兰" , "1992-03-01" , "女"),
| (7 , "郑竹" , "1989-07-01" , "女"),
| (8 , "王菊" , "1990-01-20" , "女")))
rdd: org.apache.spark.rdd.RDD[(Int, String, String, String)] = ParallelCollectionRDD[0] at makeRDD at <console>:24
scala> val studentDF = rdd.map(x=>Student(x._1,x._2,x._3,x._4)).toDF
studentDF: org.apache.spark.sql.DataFrame = [student_id: int, student_name: string ... 2 more fields]
scala>
scala> //课程表
scala> case class Course(course_id:Int,course_name:String,teacher_id:Int)
defined class Course
scala> val rdd2 = sc.makeRDD(Array(
| (1 , "语文" , 2),
| (2 , "数学" , 1),
| (3 , "英语" , 3)
| ))
rdd2: org.apache.spark.rdd.RDD[(Int, String, Int)] = ParallelCollectionRDD[2] at makeRDD at <console>:24
scala> val courseDF = rdd2.map(x=>Course(x._1,x._2,x._3)).toDF
courseDF: org.apache.spark.sql.DataFrame = [course_id: int, course_name: string ... 1 more field]
scala> //教师表
scala> case class Teacher(teacher_id:Int,teacher_name:String)
defined class Teacher
scala> val rdd3 = sc.makeRDD(Array(
| (1 , "张三"),
| (2 , "李四"),
| (3 , "王五")
| ))
rdd3: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[4] at makeRDD at <console>:24
scala> val teacherDF = rdd3.map(x=>Teacher(x._1,x._2)).toDF
teacherDF: org.apache.spark.sql.DataFrame = [teacher_id: int, teacher_name: string]
scala> //成绩表
scala> case class Score(student_id:Int,course_id:Int,score:Int)
defined class Score
scala> val rdd4 = sc.makeRDD(Array(
| (1 , 1 , 80),
| (1 , 2 , 90),
| (1 , 3 , 99),
| (2 , 1 , 70),
| (2 , 2 , 60),
| (2 , 3 , 80),
| (3 , 1 , 80),
| (3 , 2 , 80),
| (3 , 3 , 80),
| (4 , 1 , 50),
| (4 , 2 , 30),
| (4 , 3 , 20),
| (5 , 1 , 76),
| (5 , 2 , 87),
| (6 , 1 , 31),
| (6 , 3 , 34),
| (7 , 2 , 89),
| (7 , 3 , 98)))
rdd4: org.apache.spark.rdd.RDD[(Int, Int, Int)] = ParallelCollectionRDD[6] at makeRDD at <console>:24
scala> val scoreDF = rdd4.map(x=>Score(x._1,x._2,x._3)).toDF
scoreDF: org.apache.spark.sql.DataFrame = [student_id: int, course_id: int ... 1 more field]
scala> import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions._
scala>
scala> //1、查询"01"课程比"02"课程成绩高的学生的信息及课程分数:
scala> scoreDF.as("s1").join(scoreDF.as("s2"),"student_id").filter("s1.course_id=1 and s2.course_id=2 and s1.score>s2.score").join(studentDF,"student_id").show
+----------+---------+-----+---------+-----+------------+----------+---+
|student_id|course_id|score|course_id|score|student_name| birth|sex|
+----------+---------+-----+---------+-----+------------+----------+---+
| 4| 1| 50| 2| 30| 李云|1990-08-06| 男|
| 2| 1| 70| 2| 60| 钱电|1990-12-21| 男|
+----------+---------+-----+---------+-----+------------+----------+---+
scala> //2.查询"01"课程比"02"课程成绩低的学生的信息及课程分数:
scala> scoreDF.as("s1").join(scoreDF.as("s2"),"student_id").filter("s1.course_id=1 and s2.course_id=2 and s1.score<s2.score").join(studentDF,"student_id").show
+----------+---------+-----+---------+-----+------------+----------+---+
|student_id|course_id|score|course_id|score|student_name| birth|sex|
+----------+---------+-----+---------+-----+------------+----------+---+
| 1| 1| 80| 2| 90| 赵雷|1990-01-01| 男|
| 5| 1| 76| 2| 87| 周梅|1991-12-01| 女|
+----------+---------+-----+---------+-----+------------+----------+---+
scala> //3.查询平均成绩大于等于60分的同学的学生编号和学生姓名和平均成绩
scala> scoreDF.as("s1").groupBy("student_id").avg("score").join(studentDF.as("s2"),"student_id").filter($"avg(score)">=60).show
+----------+-----------------+------------+----------+---+
|student_id| avg(score)|student_name| birth|sex|
+----------+-----------------+------------+----------+---+
| 1|89.66666666666667| 赵雷|1990-01-01| 男|
| 3| 80.0| 孙风|1990-05-20| 男|
| 5| 81.5| 周梅|1991-12-01| 女|
| 7| 93.5| 郑竹|1989-07-01| 女|
| 2| 70.0| 钱电|1990-12-21| 男|
+----------+-----------------+------------+----------+---+
scala> //4.查询平均成绩小于60分的同学的学生编号和学生姓名和平均成绩:(包括有成绩的和无成绩的)
scala> studentDF.as("s2").join((scoreDF.as("s1").groupBy("student_id").avg("score")).as("s3"),Seq("student_id"),"left_outer").as("s").withColumnRenamed("avg(score)","A").where((col("A")<60) || (col("A").isNull)).show
+----------+------------+----------+---+------------------+
|student_id|student_name| birth|sex| A|
+----------+------------+----------+---+------------------+
| 6| 吴兰|1992-03-01| 女| 32.5|
| 4| 李云|1990-08-06| 男|33.333333333333336|
| 8| 王菊|1990-01-20| 女| null|
+----------+------------+----------+---+------------------+
scala> //5.查询所有同学的学生编号、学生姓名、选课总数、所有课程的总成绩:
scala> //选课数
scala> scoreDF.groupBy("student_id").count
res4: org.apache.spark.sql.DataFrame = [student_id: int, count: bigint]
scala> //总成绩
scala> scoreDF.groupBy("student_id").sum("score")
res5: org.apache.spark.sql.DataFrame = [student_id: int, sum(score): bigint]
scala> //连表
scala> studentDF.join(scoreDF.groupBy("student_id").count,Seq("student_id"),"left_outer").join(scoreDF.groupBy("student_id").sum("score"),Seq("student_id"),"left_outer").show
+----------+------------+----------+---+-----+----------+
|student_id|student_name| birth|sex|count|sum(score)|
+----------+------------+----------+---+-----+----------+
| 1| 赵雷|1990-01-01| 男| 3| 269|
| 6| 吴兰|1992-03-01| 女| 2| 65|
| 3| 孙风|1990-05-20| 男| 3| 240|
| 5| 周梅|1991-12-01| 女| 2| 163|
| 4| 李云|1990-08-06| 男| 3| 100|
| 8| 王菊|1990-01-20| 女| null| null|
| 7| 郑竹|1989-07-01| 女| 2| 187|
| 2| 钱电|1990-12-21| 男| 3| 210|
+----------+------------+----------+---+-----+----------+
scala> //6.查询"李"姓老师的数量:
scala> teacherDF.where("teacher_name like '李%'").select("teacher_id").count
res7: Long = 1
scala> //7.查询学过"张三"老师授课的同学的信息:
scala> scoreDF.join(courseDF,"course_id").join(teacherDF,"teacher_id").filter("teacher_name = '张三'").join(studentDF,"student_id").show
+----------+----------+---------+-----+-----------+------------+------------+----------+---+
|student_id|teacher_id|course_id|score|course_name|teacher_name|student_name| birth|sex|
+----------+----------+---------+-----+-----------+------------+------------+----------+---+
| 1| 1| 2| 90| 数学| 张三| 赵雷|1990-01-01| 男|
| 3| 1| 2| 80| 数学| 张三| 孙风|1990-05-20| 男|
| 5| 1| 2| 87| 数学| 张三| 周梅|1991-12-01| 女|
| 4| 1| 2| 30| 数学| 张三| 李云|1990-08-06| 男|
| 7| 1| 2| 89| 数学| 张三| 郑竹|1989-07-01| 女|
| 2| 1| 2| 60| 数学| 张三| 钱电|1990-12-21| 男|
+----------+----------+---------+-----+-----------+------------+------------+----------+---+
scala> //8.查询没学过"张三"老师授课的同学的信息:
scala> // Left outer join keeps students without any score; the filter keeps rows taught by someone other than 张三 plus those all-null rows.
scala> // NOTE(review): students who did take 张三's course still appear here through their other courses; excluding them entirely would need a left_anti join.
scala> studentDF.join(scoreDF.join(courseDF,"course_id").join(teacherDF,"teacher_id"),Seq("student_id"),"left_outer").where("teacher_name!='张三' or teacher_name is null").show
+----------+------------+----------+---+----------+---------+-----+-----------+------------+
|student_id|student_name| birth|sex|teacher_id|course_id|score|course_name|teacher_name|
+----------+------------+----------+---+----------+---------+-----+-----------+------------+
| 1| 赵雷|1990-01-01| 男| 3| 3| 99| 英语| 王五|
| 1| 赵雷|1990-01-01| 男| 2| 1| 80| 语文| 李四|
| 6| 吴兰|1992-03-01| 女| 3| 3| 34| 英语| 王五|
| 6| 吴兰|1992-03-01| 女| 2| 1| 31| 语文| 李四|
| 3| 孙风|1990-05-20| 男| 3| 3| 80| 英语| 王五|
| 3| 孙风|1990-05-20| 男| 2| 1| 80| 语文| 李四|
| 5| 周梅|1991-12-01| 女| 2| 1| 76| 语文| 李四|
| 4| 李云|1990-08-06| 男| 3| 3| 20| 英语| 王五|
| 4| 李云|1990-08-06| 男| 2| 1| 50| 语文| 李四|
| 8| 王菊|1990-01-20| 女| null| null| null| null| null|
| 7| 郑竹|1989-07-01| 女| 3| 3| 98| 英语| 王五|
| 2| 钱电|1990-12-21| 男| 3| 3| 80| 英语| 王五|
| 2| 钱电|1990-12-21| 男| 2| 1| 70| 语文| 李四|
+----------+------------+----------+---+----------+---------+-----+-----------+------------+
scala> //9.查询学过编号为"01"并且也学过编号为"02"的课程的同学的信息:
scala> scoreDF.where("course_id in(1,2)").groupBy("student_id").count.where("count=2").join(studentDF,"student_id").show
+----------+-----+------------+----------+---+
|student_id|count|student_name| birth|sex|
+----------+-----+------------+----------+---+
| 1| 2| 赵雷|1990-01-01| 男|
| 3| 2| 孙风|1990-05-20| 男|
| 5| 2| 周梅|1991-12-01| 女|
| 4| 2| 李云|1990-08-06| 男|
| 2| 2| 钱电|1990-12-21| 男|
+----------+-----+------------+----------+---+
scala> //10.查询学过编号为"01"但是没有学过编号为"02"的课程的同学的信息:
scala> studentDF.join(scoreDF.where("course_id in (2)"),Seq("student_id"),"left_outer").as("s1").where("s1.course_id is null").join(scoreDF.where("course_id in (1)"),"student_id").show
+----------+------------+----------+---+---------+-----+---------+-----+
|student_id|student_name| birth|sex|course_id|score|course_id|score|
+----------+------------+----------+---+---------+-----+---------+-----+
| 6| 吴兰|1992-03-01| 女| null| null| 1| 31|
+----------+------------+----------+---+---------+-----+---------+-----+
scala> //11、查询没有学全所有课程的同学的信息:
scala> studentDF.join(scoreDF,Seq("student_id"),"left_outer").groupBy("student_id").count.where(s"count != ${courseDF.select("course_id").count}").join(studentDF,"student_id").show
+----------+-----+------------+----------+---+
|student_id|count|student_name| birth|sex|
+----------+-----+------------+----------+---+
| 6| 2| 吴兰|1992-03-01| 女|
| 5| 2| 周梅|1991-12-01| 女|
| 8| 1| 王菊|1990-01-20| 女|
| 7| 2| 郑竹|1989-07-01| 女|
+----------+-----+------------+----------+---+
scala> //12、查询至少有一门课与学号为"01"的同学所学相同的同学的信息:
scala> studentDF.as("a").join(scoreDF.as("c"),"student_id").as("d").join(scoreDF.where("student_id=1").as("b"),"course_id").select("d.student_id").distinct.where("student_id!=1").join(studentDF,"student_id").show
+----------+------------+----------+---+
|student_id|student_name| birth|sex|
+----------+------------+----------+---+
| 6| 吴兰|1992-03-01| 女|
| 3| 孙风|1990-05-20| 男|
| 5| 周梅|1991-12-01| 女|
| 4| 李云|1990-08-06| 男|
| 7| 郑竹|1989-07-01| 女|
| 2| 钱电|1990-12-21| 男|
+----------+------------+----------+---+
scala> //13.查询和"01"号的同学学习的课程完全相同的其他同学的信息:
scala> scoreDF.where("student_id=1").as("s1").join(scoreDF.as("s2"),"course_id").groupBy("s2.student_id").count.as("s3").where(s"count = ${scoreDF.where("student_id=1").count} and student_id!=1").join(studentDF,"student_id").show
+----------+-----+------------+----------+---+
|student_id|count|student_name| birth|sex|
+----------+-----+------------+----------+---+
| 3| 3| 孙风|1990-05-20| 男|
| 4| 3| 李云|1990-08-06| 男|
| 2| 3| 钱电|1990-12-21| 男|
+----------+-----+------------+----------+---+
scala> //14、查询没学过"张三"老师讲授的任一门课程的学生姓名:
scala> // Left outer join every student against the scores of 张三's courses; a null teacher_id marks a student with no such score.
scala> studentDF.join(teacherDF.where("teacher_name='张三'").join(courseDF,"teacher_id").join(scoreDF,Seq("course_id"),"left_outer"),Seq("student_id"),"left_outer").as("s1").where("s1.teacher_id is null").show
+----------+------------+----------+---+---------+----------+------------+-----------+-----+
|student_id|student_name| birth|sex|course_id|teacher_id|teacher_name|course_name|score|
+----------+------------+----------+---+---------+----------+------------+-----------+-----+
| 6| 吴兰|1992-03-01| 女| null| null| null| null| null|
| 8| 王菊|1990-01-20| 女| null| null| null| null| null|
+----------+------------+----------+---+---------+----------+------------+-----------+-----+
scala> //15、查询两门及其以上不及格课程的同学的学号,姓名及其平均成绩:
scala> scoreDF.where("score<60").groupBy("student_id").count.where("count>=2").as("s1").join(scoreDF.as("s2"),"student_id").groupBy("student_id").avg("score").join(studentDF,"student_id").show
+----------+------------------+------------+----------+---+
|student_id| avg(score)|student_name| birth|sex|
+----------+------------------+------------+----------+---+
| 6| 32.5| 吴兰|1992-03-01| 女|
| 4|33.333333333333336| 李云|1990-08-06| 男|
+----------+------------------+------------+----------+---+
scala> //16、检索"01"课程分数小于60,按分数降序排列的学生信息:
scala> scoreDF.where("course_id=1 and score<60").join(studentDF,"student_id").orderBy($"score".desc).show
+----------+---------+-----+------------+----------+---+
|student_id|course_id|score|student_name| birth|sex|
+----------+---------+-----+------------+----------+---+
| 4| 1| 50| 李云|1990-08-06| 男|
| 6| 1| 31| 吴兰|1992-03-01| 女|
+----------+---------+-----+------------+----------+---+
scala> //17、按平均成绩从高到低显示所有学生的所有课程的成绩以及平均成绩:
scala> scoreDF.join(scoreDF.groupBy("student_id").avg("score"),Seq("student_id"),"left_outer").join(studentDF,"student_id").orderBy($"avg(score)".desc).show
+----------+---------+-----+------------------+------------+----------+---+
|student_id|course_id|score| avg(score)|student_name| birth|sex|
+----------+---------+-----+------------------+------------+----------+---+
| 7| 2| 89| 93.5| 郑竹|1989-07-01| 女|
| 7| 3| 98| 93.5| 郑竹|1989-07-01| 女|
| 1| 1| 80| 89.66666666666667| 赵雷|1990-01-01| 男|
| 1| 2| 90| 89.66666666666667| 赵雷|1990-01-01| 男|
| 1| 3| 99| 89.66666666666667| 赵雷|1990-01-01| 男|
| 5| 1| 76| 81.5| 周梅|1991-12-01| 女|
| 5| 2| 87| 81.5| 周梅|1991-12-01| 女|
| 3| 2| 80| 80.0| 孙风|1990-05-20| 男|
| 3| 3| 80| 80.0| 孙风|1990-05-20| 男|
| 3| 1| 80| 80.0| 孙风|1990-05-20| 男|
| 2| 2| 60| 70.0| 钱电|1990-12-21| 男|
| 2| 1| 70| 70.0| 钱电|1990-12-21| 男|
| 2| 3| 80| 70.0| 钱电|1990-12-21| 男|
| 4| 2| 30|33.333333333333336| 李云|1990-08-06| 男|
| 4| 3| 20|33.333333333333336| 李云|1990-08-06| 男|
| 4| 1| 50|33.333333333333336| 李云|1990-08-06| 男|
| 6| 1| 31| 32.5| 吴兰|1992-03-01| 女|
| 6| 3| 34| 32.5| 吴兰|1992-03-01| 女|
+----------+---------+-----+------------------+------------+----------+---+
scala> //18.查询各科成绩最高分、最低分和平均分:以如下形式显示:课程ID,课程name,最高分,最低分,平均分,及格率,中等率,优良率,优秀率:
scala> // Per-course counters: each score maps to (course_id, 1) when it clears the threshold, else (course_id, 0), then the flags are summed per course.
scala> // Passing is score >= 60, consistent with the `score<60` failing test used by the other queries.
scala> val jige = scoreDF.rdd.map(x=>{
if(x.getAs("score").toString.toInt >= 60) (x(1).toString,1) else (x(1).toString,0)}).reduceByKey(_+_).toDF("course_id","jige")
jige: org.apache.spark.sql.DataFrame = [course_id: string, jige: int]
scala> val zhongdeng = scoreDF.rdd.map(x=>{
if(x.getAs("score").toString.toInt > 70) (x(1).toString,1) else (x(1).toString,0)}).reduceByKey(_+_).toDF("course_id","zhongdeng")
zhongdeng: org.apache.spark.sql.DataFrame = [course_id: string, zhongdeng: int]
scala> val youliang = scoreDF.rdd.map(x=>{
if(x.getAs("score").toString.toInt > 80) (x(1).toString,1) else (x(1).toString,0)}).reduceByKey(_+_).toDF("course_id","youliang")
youliang: org.apache.spark.sql.DataFrame = [course_id: string, youliang: int]
scala> val youxiu = scoreDF.rdd.map(x=>{
if(x.getAs("score").toString.toInt > 90) (x(1).toString,1) else (x(1).toString,0)}).reduceByKey(_+_).toDF("course_id","youxiu")
youxiu: org.apache.spark.sql.DataFrame = [course_id: string, youxiu: int]
scala> // Max / min / average / number of scores per course; count(score) is the denominator of every rate below.
scala> val s1 = scoreDF.groupBy("course_id").agg("score"->"max","score"->"min","score"->"avg","score"->"count")
s1: org.apache.spark.sql.DataFrame = [course_id: int, max(score): int ... 3 more fields]
scala> // Join the counters onto the aggregates, turn each counter into a rate, then drop the raw counters.
scala> s1.join(jige,"course_id").join(zhongdeng,"course_id").join(youliang,"course_id").join(youxiu,"course_id").withColumn("jgl",$"jige"/$"count(score)").withColumn("zdl",$"zhongdeng"/$"count(score)").withColumn("yll",$"youliang"/$"count(score)").withColumn("yxl",$"youxiu"/$"count(score)").drop("jige","zhongdeng","youliang","youxiu").show
+---------+----------+----------+-----------------+------------+------------------+------------------+------------------+------------------+
|course_id|max(score)|min(score)| avg(score)|count(score)| jgl| zdl| yll| yxl|
+---------+----------+----------+-----------------+------------+------------------+------------------+------------------+------------------+
| 1| 80| 31| 64.5| 6|0.6666666666666666| 0.5| 0.0| 0.0|
| 3| 99| 20| 68.5| 6|0.6666666666666666|0.6666666666666666|0.3333333333333333|0.3333333333333333|
| 2| 90| 30|72.66666666666667| 6|0.8333333333333334|0.6666666666666666| 0.5| 0.0|
+---------+----------+----------+-----------------+------------+------------------+------------------+------------------+------------------+
scala> //19、按各科成绩进行排序,并显示排名:
scala> scoreDF.selectExpr("*","row_number() over(partition by course_id order by score desc) rank").show
+----------+---------+-----+----+
|student_id|course_id|score|rank|
+----------+---------+-----+----+
| 1| 1| 80| 1|
| 3| 1| 80| 2|
| 5| 1| 76| 3|
| 2| 1| 70| 4|
| 4| 1| 50| 5|
| 6| 1| 31| 6|
| 1| 3| 99| 1|
| 7| 3| 98| 2|
| 2| 3| 80| 3|
| 3| 3| 80| 4|
| 6| 3| 34| 5|
| 4| 3| 20| 6|
| 1| 2| 90| 1|
| 7| 2| 89| 2|
| 5| 2| 87| 3|
| 3| 2| 80| 4|
| 2| 2| 60| 5|
| 4| 2| 30| 6|
+----------+---------+-----+----+
scala> //20、查询学生的总成绩并进行排名:
scala> scoreDF.selectExpr("*","sum(score) over(partition by student_id) as sum_score").dropDuplicates("student_id","sum_score").selectExpr("*","row_number() over(order by sum_score desc) rank").show
20/08/16 15:38:23 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
+----------+---------+-----+---------+----+
|student_id|course_id|score|sum_score|rank|
+----------+---------+-----+---------+----+
| 1| 1| 80| 269| 1|
| 3| 1| 80| 240| 2|
| 2| 1| 70| 210| 3|
| 7| 2| 89| 187| 4|
| 5| 1| 76| 163| 5|
| 4| 1| 50| 100| 6|
| 6| 1| 31| 65| 7|
+----------+---------+-----+---------+----+
scala> //21、查询不同老师所教不同课程平均分从高到低显示:
scala> scoreDF.join(courseDF,"course_id").join(teacherDF,"teacher_id").groupBy("teacher_id","course_id").avg("score").orderBy($"avg(score)".desc).show
+----------+---------+-----------------+
|teacher_id|course_id| avg(score)|
+----------+---------+-----------------+
| 1| 2|72.66666666666667|
| 3| 3| 68.5|
| 2| 1| 64.5|
+----------+---------+-----------------+
scala> //22、查询所有课程的成绩第2名到第3名的学生信息及该课程成绩:
scala> scoreDF.selectExpr("*","row_number() over(partition by course_id order by score desc) rank").where("rank between 2 and 3").join(studentDF,"student_id").show
+----------+---------+-----+----+------------+----------+---+
|student_id|course_id|score|rank|student_name| birth|sex|
+----------+---------+-----+----+------------+----------+---+
| 3| 1| 80| 2| 孙风|1990-05-20| 男|
| 5| 1| 76| 3| 周梅|1991-12-01| 女|
| 5| 2| 87| 3| 周梅|1991-12-01| 女|
| 7| 3| 98| 2| 郑竹|1989-07-01| 女|
| 7| 2| 89| 2| 郑竹|1989-07-01| 女|
| 2| 3| 80| 3| 钱电|1990-12-21| 男|
+----------+---------+-----+----+------------+----------+---+
scala> //23.统计各科成绩各分数段人数:课程编号,课程名称,[100-85],[85-70],[70-60],[0-60]及所占百分比
scala> val fenduan = scoreDF.rdd.map(x=>{
| if(x.getAs("score").toString.toInt < 60) (x(1).toString,1)
| else if(x.getAs("score").toString.toInt < 70) (x(1).toString,2)
| else if(x.getAs("score").toString.toInt < 85) (x(1).toString,3)
| else (x(1).toString,4)
| }).toDF("course_id","fenduan")
fenduan: org.apache.spark.sql.DataFrame = [course_id: string, fenduan: int]
scala> fenduan.groupBy("course_id").count.as("f1").join(fenduan.groupBy("course_id","fenduan").count.as("f2"),"course_id").withColumn("rate",$"f2.count"/$"f1.count").drop($"f1.count").join(courseDF,"course_id").show
+---------+-------+-----+-------------------+-----------+----------+
|course_id|fenduan|count| rate|course_name|teacher_id|
+---------+-------+-----+-------------------+-----------+----------+
| 1| 3| 4| 0.6666666666666666| 语文| 2|
| 1| 1| 2| 0.3333333333333333| 语文| 2|
| 3| 1| 2| 0.3333333333333333| 英语| 3|
| 3| 3| 2| 0.3333333333333333| 英语| 3|
| 3| 4| 2| 0.3333333333333333| 英语| 3|
| 2| 2| 1|0.16666666666666666| 数学| 1|
| 2| 4| 3| 0.5| 数学| 1|
| 2| 3| 1|0.16666666666666666| 数学| 1|
| 2| 1| 1|0.16666666666666666| 数学| 1|
+---------+-------+-----+-------------------+-----------+----------+
scala> //24、查询学生平均成绩及其名次:
scala> // Backticks reference the generated column avg(score); single quotes would make it a string literal and the ordering meaningless.
scala> scoreDF.groupBy("student_id").avg("score").selectExpr("*","row_number() over(order by `avg(score)` desc) as rank").show
20/08/16 15:38:26 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
+----------+------------------+----+
|student_id| avg(score)|rank|
+----------+------------------+----+
| 7| 93.5| 1|
| 1| 89.66666666666667| 2|
| 5| 81.5| 3|
| 3| 80.0| 4|
| 2| 70.0| 5|
| 4|33.333333333333336| 6|
| 6| 32.5| 7|
+----------+------------------+----+
scala> //25、查询各科成绩前三名的记录
scala> scoreDF.selectExpr("*","row_number() over(partition by course_id order by score desc) rank").where("rank <=3").show
+----------+---------+-----+----+
|student_id|course_id|score|rank|
+----------+---------+-----+----+
| 1| 1| 80| 1|
| 3| 1| 80| 2|
| 5| 1| 76| 3|
| 1| 3| 99| 1|
| 7| 3| 98| 2|
| 2| 3| 80| 3|
| 1| 2| 90| 1|
| 7| 2| 89| 2|
| 5| 2| 87| 3|
+----------+---------+-----+----+
scala> //26、查询每门课程被选修的学生数:
scala> scoreDF.groupBy("course_id").count.show
+---------+-----+
|course_id|count|
+---------+-----+
| 1| 6|
| 3| 6|
| 2| 6|
+---------+-----+
scala> //27.查询出只有两门课程的全部学生的学号和姓名:
scala> scoreDF.groupBy("student_id").count.where("count=2").join(studentDF,"student_id").show
+----------+-----+------------+----------+---+
|student_id|count|student_name| birth|sex|
+----------+-----+------------+----------+---+
| 6| 2| 吴兰|1992-03-01| 女|
| 5| 2| 周梅|1991-12-01| 女|
| 7| 2| 郑竹|1989-07-01| 女|
+----------+-----+------------+----------+---+
scala> //28、查询男生、女生人数:
scala> studentDF.groupBy("sex").count.show
+---+-----+
|sex|count|
+---+-----+
| 男| 4|
| 女| 4|
+---+-----+
scala> //29、查询名字中含有"风"字的学生信息:
scala> studentDF.where("student_name like '%风%'").show
+----------+------------+----------+---+
|student_id|student_name| birth|sex|
+----------+------------+----------+---+
| 3| 孙风|1990-05-20| 男|
+----------+------------+----------+---+
scala> //30、查询同名同姓学生名单,并统计同名人数:
scala> studentDF.groupBy("student_name").count.where("count>1").show
+------------+-----+
|student_name|count|
+------------+-----+
+------------+-----+
scala> //31、查询1990年出生的学生名单:
scala> studentDF.where("year(birth) = 1990").show
+----------+------------+----------+---+
|student_id|student_name| birth|sex|
+----------+------------+----------+---+
| 1| 赵雷|1990-01-01| 男|
| 2| 钱电|1990-12-21| 男|
| 3| 孙风|1990-05-20| 男|
| 4| 李云|1990-08-06| 男|
| 8| 王菊|1990-01-20| 女|
+----------+------------+----------+---+
scala> //32、查询每门课程的平均成绩,结果按平均成绩降序排列,平均成绩相同时,按课程编号升序排列:
scala> // A single orderBy with both keys: average descending, course_id ascending as the tie-breaker. Chaining two orderBy calls keeps only the last ordering.
scala> scoreDF.groupBy("course_id").avg("score").orderBy($"avg(score)".desc,$"course_id".asc).show
+---------+-----------------+
|course_id| avg(score)|
+---------+-----------------+
| 2|72.66666666666667|
| 3| 68.5|
| 1| 64.5|
+---------+-----------------+
scala> //33、查询平均成绩大于等于85的所有学生的学号、姓名和平均成绩:
scala> scoreDF.groupBy("student_id").avg("score").where("avg(score)>=85").join(studentDF,"student_id").show
+----------+-----------------+------------+----------+---+
|student_id| avg(score)|student_name| birth|sex|
+----------+-----------------+------------+----------+---+
| 1|89.66666666666667| 赵雷|1990-01-01| 男|
| 7| 93.5| 郑竹|1989-07-01| 女|
+----------+-----------------+------------+----------+---+
scala> //34、查询课程名称为"数学",且分数低于60的学生姓名和分数:
scala> scoreDF.where("score<60").join(courseDF,"course_id").where("course_name='数学'").show
+---------+----------+-----+-----------+----------+
|course_id|student_id|score|course_name|teacher_id|
+---------+----------+-----+-----------+----------+
| 2| 4| 30| 数学| 1|
+---------+----------+-----+-----------+----------+
scala> //35、查询所有学生的课程及分数情况:
scala> studentDF.join(scoreDF,Seq("student_id"),"left_outer").show
+----------+------------+----------+---+---------+-----+
|student_id|student_name| birth|sex|course_id|score|
+----------+------------+----------+---+---------+-----+
| 1| 赵雷|1990-01-01| 男| 1| 80|
| 1| 赵雷|1990-01-01| 男| 2| 90|
| 1| 赵雷|1990-01-01| 男| 3| 99|
| 6| 吴兰|1992-03-01| 女| 1| 31|
| 6| 吴兰|1992-03-01| 女| 3| 34|
| 3| 孙风|1990-05-20| 男| 1| 80|
| 3| 孙风|1990-05-20| 男| 2| 80|
| 3| 孙风|1990-05-20| 男| 3| 80|
| 5| 周梅|1991-12-01| 女| 1| 76|
| 5| 周梅|1991-12-01| 女| 2| 87|
| 4| 李云|1990-08-06| 男| 1| 50|
| 4| 李云|1990-08-06| 男| 2| 30|
| 4| 李云|1990-08-06| 男| 3| 20|
| 8| 王菊|1990-01-20| 女| null| null|
| 7| 郑竹|1989-07-01| 女| 2| 89|
| 7| 郑竹|1989-07-01| 女| 3| 98|
| 2| 钱电|1990-12-21| 男| 1| 70|
| 2| 钱电|1990-12-21| 男| 2| 60|
| 2| 钱电|1990-12-21| 男| 3| 80|
+----------+------------+----------+---+---------+-----+
scala> //36.查询任何一门课程成绩在70分以上的学生姓名、课程名称和分数:
scala> scoreDF.where("score>70").join(studentDF,"student_id").join(courseDF,"course_id").show
+---------+----------+-----+------------+----------+---+-----------+----------+
|course_id|student_id|score|student_name| birth|sex|course_name|teacher_id|
+---------+----------+-----+------------+----------+---+-----------+----------+
| 1| 1| 80| 赵雷|1990-01-01| 男| 语文| 2|
| 1| 3| 80| 孙风|1990-05-20| 男| 语文| 2|
| 1| 5| 76| 周梅|1991-12-01| 女| 语文| 2|
| 3| 1| 99| 赵雷|1990-01-01| 男| 英语| 3|
| 3| 3| 80| 孙风|1990-05-20| 男| 英语| 3|
| 3| 7| 98| 郑竹|1989-07-01| 女| 英语| 3|
| 3| 2| 80| 钱电|1990-12-21| 男| 英语| 3|
| 2| 1| 90| 赵雷|1990-01-01| 男| 数学| 1|
| 2| 3| 80| 孙风|1990-05-20| 男| 数学| 1|
| 2| 5| 87| 周梅|1991-12-01| 女| 数学| 1|
| 2| 7| 89| 郑竹|1989-07-01| 女| 数学| 1|
+---------+----------+-----+------------+----------+---+-----------+----------+
scala> //37、查询课程不及格的学生:
scala> scoreDF.where("score<60").join(studentDF,"student_id").show
+----------+---------+-----+------------+----------+---+
|student_id|course_id|score|student_name| birth|sex|
+----------+---------+-----+------------+----------+---+
| 6| 1| 31| 吴兰|1992-03-01| 女|
| 6| 3| 34| 吴兰|1992-03-01| 女|
| 4| 1| 50| 李云|1990-08-06| 男|
| 4| 2| 30| 李云|1990-08-06| 男|
| 4| 3| 20| 李云|1990-08-06| 男|
+----------+---------+-----+------------+----------+---+
scala> //38、查询课程编号为01且课程成绩在80分以上的学生的学号和姓名:
scala> scoreDF.where("course_id=1 and score>=80").join(studentDF,"student_id").show
+----------+---------+-----+------------+----------+---+
|student_id|course_id|score|student_name| birth|sex|
+----------+---------+-----+------------+----------+---+
| 1| 1| 80| 赵雷|1990-01-01| 男|
| 3| 1| 80| 孙风|1990-05-20| 男|
+----------+---------+-----+------------+----------+---+
scala> //39.求每门课程的学生人数:
scala> scoreDF.groupBy("course_id").count.show
+---------+-----+
|course_id|count|
+---------+-----+
| 1| 6|
| 3| 6|
| 2| 6|
+---------+-----+
scala> //40、查询选修"张三"老师所授课程的学生中,成绩最高的学生信息及其成绩:
scala> // max(score) over() attaches the overall maximum to every row of 张三's courses as max_score; keep the row(s) whose score equals it.
scala> scoreDF.join(courseDF,"course_id").join(teacherDF,"teacher_id").where("teacher_name='张三'").selectExpr("*","max(score) over() max_score").where("max_score=score").show
20/08/16 15:38:32 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
+----------+---------+----------+-----+-----------+------------+---------+
|teacher_id|course_id|student_id|score|course_name|teacher_name|max_score|
+----------+---------+----------+-----+-----------+------------+---------+
| 1| 2| 1| 90| 数学| 张三| 90|
+----------+---------+----------+-----+-----------+------------+---------+
scala> //41、查询不同课程成绩相同的学生的学生编号、课程编号、学生成绩:
scala> scoreDF.as("s1").crossJoin(scoreDF.as("s2")).where("s1.score=s2.score and s1.course_id!=s2.course_id").show
+----------+---------+-----+----------+---------+-----+
|student_id|course_id|score|student_id|course_id|score|
+----------+---------+-----+----------+---------+-----+
| 1| 1| 80| 2| 3| 80|
| 1| 1| 80| 3| 2| 80|
| 1| 1| 80| 3| 3| 80|
| 2| 3| 80| 1| 1| 80|
| 2| 3| 80| 3| 1| 80|
| 2| 3| 80| 3| 2| 80|
| 3| 1| 80| 2| 3| 80|
| 3| 1| 80| 3| 2| 80|
| 3| 1| 80| 3| 3| 80|
| 3| 2| 80| 1| 1| 80|
| 3| 2| 80| 2| 3| 80|
| 3| 2| 80| 3| 1| 80|
| 3| 2| 80| 3| 3| 80|
| 3| 3| 80| 1| 1| 80|
| 3| 3| 80| 3| 1| 80|
| 3| 3| 80| 3| 2| 80|
+----------+---------+-----+----------+---------+-----+
scala> //42、查询每门课程成绩最好的前三名:
scala> scoreDF.selectExpr("*","row_number() over(partition by course_id order by score desc) rank").where("rank<=3").show
+----------+---------+-----+----+
|student_id|course_id|score|rank|
+----------+---------+-----+----+
| 1| 1| 80| 1|
| 3| 1| 80| 2|
| 5| 1| 76| 3|
| 1| 3| 99| 1|
| 7| 3| 98| 2|
| 2| 3| 80| 3|
| 1| 2| 90| 1|
| 7| 2| 89| 2|
| 5| 2| 87| 3|
+----------+---------+-----+----+
scala> //43、统计每门课程的学生选修人数(超过5人的课程才统计)要求输出课程号和选修人数,查询结果按人数降序排列,若人数相同,按课程号升序排列
scala> // Count score rows per course, keep courses with strictly more than 5, then sort by count descending with course_id ascending as the tie-breaker.
scala> scoreDF.groupBy("course_id").count.withColumnRenamed("count","cnt").where("cnt>5").orderBy($"cnt".desc,$"course_id".asc).show
+---------+---+
|course_id|cnt|
+---------+---+
| 1| 6|
| 2| 6|
| 3| 6|
+---------+---+
scala> //44、检索至少选修两门课程的学生学号:
scala> scoreDF.groupBy("student_id").count.where("count>=2").show
+----------+-----+
|student_id|count|
+----------+-----+
| 1| 3|
| 6| 2|
| 3| 3|
| 5| 2|
| 4| 3|
| 7| 2|
| 2| 3|
+----------+-----+
scala> //45、查询选修了全部课程的学生信息:
scala> studentDF.join(scoreDF,Seq("student_id"),"left_outer").groupBy("student_id").count.where(s"count = ${courseDF.select("course_id").count}").join(studentDF,"student_id").show
+----------+-----+------------+----------+---+
|student_id|count|student_name| birth|sex|
+----------+-----+------------+----------+---+
| 1| 3| 赵雷|1990-01-01| 男|
| 3| 3| 孙风|1990-05-20| 男|
| 4| 3| 李云|1990-08-06| 男|
| 2| 3| 钱电|1990-12-21| 男|
+----------+-----+------------+----------+---+
scala> //46、查询各学生的年龄(周岁):
scala> // Age in completed years: whole months elapsed since birth divided by 12, rounded down, so a birthday later this year is not yet counted.
scala> studentDF.selectExpr("*","cast(floor(months_between(current_date(),birth)/12) as int) age").show
+----------+------------+----------+---+---+
|student_id|student_name| birth|sex|age|
+----------+------------+----------+---+---+
| 1| 赵雷|1990-01-01| 男| 30|
| 2| 钱电|1990-12-21| 男| 29|
| 3| 孙风|1990-05-20| 男| 30|
| 4| 李云|1990-08-06| 男| 30|
| 5| 周梅|1991-12-01| 女| 28|
| 6| 吴兰|1992-03-01| 女| 28|
| 7| 郑竹|1989-07-01| 女| 31|
| 8| 王菊|1990-01-20| 女| 30|
+----------+------------+----------+---+---+
scala> //47. Students whose birthday falls in the current week (Monday through Sunday). next_day(current_date(),'MON') is the first Monday strictly after today, so subtracting 7 days gives this week's Monday and subtracting 1 day gives this week's Sunday; this also holds when today is a Sunday.
scala> studentDF.where("unix_timestamp(cast(concat_ws('-',date_format(current_date(),'yyyy'),date_format(birth,'MM'),date_format(birth,'dd')) as date),'yyyy-MM-dd') between unix_timestamp(date_sub(next_day(current_date(),'MON'),7),'yyyy-MM-dd') and unix_timestamp(date_sub(next_day(current_date(),'MON'),1),'yyyy-MM-dd')").show
+----------+------------+-----+---+
|student_id|student_name|birth|sex|
+----------+------------+-----+---+
+----------+------------+-----+---+
scala> //48、查询下周过生日的学生: 下周1到+6天
scala> studentDF.where("unix_timestamp(cast(concat_ws('-',date_format(current_date(),'yyyy'),date_format(birth,'MM'),date_format(birth,'dd')) as date),'yyyy-MM-dd')between unix_timestamp(next_day(current_date(),'MON'),'yyyy-MM-dd') and unix_timestamp(date_add(next_day(current_date(),'MON'),6),'yyyy-MM-dd')").show
+----------+------------+-----+---+
|student_id|student_name|birth|sex|
+----------+------------+-----+---+
+----------+------------+-----+---+
scala> //49、查询本月过生日的学生:
scala> studentDF.where("month(birth) = month(current_date())").show
+----------+------------+----------+---+
|student_id|student_name| birth|sex|
+----------+------------+----------+---+
| 4| 李云|1990-08-06| 男|
+----------+------------+----------+---+
scala> //50、查询12月份过生日的学生:
scala> studentDF.where("month(birth) = 12").show
+----------+------------+----------+---+
|student_id|student_name| birth|sex|
+----------+------------+----------+---+
| 2| 钱电|1990-12-21| 男|
| 5| 周梅|1991-12-01| 女|
+----------+------------+----------+---+
spark sql 之 DSL风格 练习mysql50题
猜你喜欢
转载自blog.csdn.net/sun_0128/article/details/108034501
今日推荐
周排行