第一种:通过元组(Tuple)配合 toDF 指定列名
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import scala.reflect.io.Path
/**
 * Demonstrates two ways of joining DataFrames built from in-memory data:
 * a SQL query over temporary views, and the DataFrame `join` API.
 */
object joinTest {

  /**
   * Builds a person table and a nation table from hard-coded CSV-like strings,
   * prints both, then prints the result of a SQL join, an inner join and a
   * left join, all on `country = ename`.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName("joinTest")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    try {
      val lines: Dataset[String] =
        spark.createDataset(List("1,jige,china", "2,AJ,Japan", "3,杨玉环,Tang"))

      // Parse each "id,name,country" line into a typed tuple.
      val persons: Dataset[(Int, String, String)] = lines.map { line =>
        val fields: Array[String] = line.split(",")
        (fields(0).toInt, fields(1), fields(2))
      }
      val df1: DataFrame = persons.toDF("id", "name", "country")
      println("----df1----")
      df1.show()

      val lines2: Dataset[String] = spark.createDataset(Seq("china,中国", "Japan,日本"))

      // Parse each "ename,cname" line into a typed tuple.
      val nations: Dataset[(String, String)] = lines2.map { line =>
        val fields: Array[String] = line.split(",")
        (fields(0), fields(1))
      }
      val df2: DataFrame = nations.toDF("ename", "cname")
      println("----df2----")
      df2.show()

      // Approach 1: register temporary views and join with SQL.
      // createOrReplaceTempView is used instead of createTempView so that the
      // program does not fail when a view with the same name already exists in
      // the session.
      df1.createOrReplaceTempView("t1")
      df2.createOrReplaceTempView("t2")

      // Join statement: selected columns + first table JOIN second table + ON condition.
      val r: DataFrame =
        spark.sql("SELECT id,name,cname,ename FROM t1 JOIN t2 ON country = ename")
      println("----关联结果----")
      r.show()

      // Approach 2: the DataFrame API.
      // Supported join types: `inner`, `cross`, `outer`, `full`, `full_outer`,
      // `left`, `left_outer`, `right`, `right_outer`, `left_semi`, `left_anti`.
      // When the join type is omitted it defaults to "inner", i.e. this is
      // equivalent to df1.join(df2, $"country" === $"ename").
      val rj: DataFrame = df1.join(df2, $"country" === $"ename", "inner")
      println("----innerJoin----")
      rj.show()

      // `left` and `left_outer` produce the same result.
      val leftJoined: DataFrame = df1.join(df2, $"country" === $"ename", "left")
      println("----leftJoin----")
      leftJoined.show()
    } finally {
      // Release the local Spark context even if one of the steps above fails.
      spark.stop()
    }
  }
}
运行结果:
----df1----
+---+----+-------+
| id|name|country|
+---+----+-------+
| 1|jige| china|
| 2| AJ| Japan|
| 3| 杨玉环| Tang|
+---+----+-------+
----df2----
+-----+-----+
|ename|cname|
+-----+-----+
|china| 中国|
|Japan| 日本|
+-----+-----+
----关联结果----
+---+----+-----+-----+
| id|name|cname|ename|
+---+----+-----+-----+
| 1|jige| 中国|china|
| 2| AJ| 日本|Japan|
+---+----+-----+-----+
----innerJoin----
+---+----+-------+-----+-----+
| id|name|country|ename|cname|
+---+----+-------+-----+-----+
| 1|jige| china|china| 中国|
| 2| AJ| Japan|Japan| 日本|
+---+----+-------+-----+-----+
----leftJoin----
+---+----+-------+-----+-----+
| id|name|country|ename|cname|
+---+----+-------+-----+-----+
| 1|jige| china|china| 中国|
| 2| AJ| Japan|Japan| 日本|
| 3| 杨玉环| Tang| null| null|
+---+----+-------+-----+-----+
Process finished with exit code 0
第二种:通过创建case class
假数据表1(people.txt):
1,冯瑞,22,男,北京
2,老王,50,男,上海
3,大师,100,女,仙界
假数据表2(peopleaddr.txt):
beijing,北京,京
shanghai,上海,琼
xianjie,仙界,仙
/**
 * One row of the people table; field names become the DataFrame column names.
 *
 * Marked `final` so the case class cannot be extended, which would break the
 * generated `equals`/`hashCode`/`copy`.
 *
 * @param id   numeric identifier of the person
 * @param name person's name
 * @param age  person's age
 * @param sex  person's sex
 * @param addr address; joined against `Peopleaddr.cname`
 */
final case class People(
  id: Int,
  name: String,
  age: Int,
  sex: String,
  addr: String
)
/**
 * One row of the address lookup table; field names become the DataFrame
 * column names.
 *
 * Marked `final` so the case class cannot be extended, which would break the
 * generated `equals`/`hashCode`/`copy`.
 *
 * @param ename     romanized name of the place (e.g. "beijing")
 * @param cname     Chinese name of the place; the join key against `People.addr`
 * @param jiancheng third column of the address file — presumably the short form
 *                  ("简称") of the place name; confirm against the data source
 */
final case class Peopleaddr(
  ename: String,
  cname: String,
  jiancheng: String
)
/**
 * Demonstrates building DataFrames from text files via case classes and
 * joining them with a SQL query over temporary views.
 */
object test111 {

  // Default input locations, used when no paths are passed on the command line.
  private val DefaultPeoplePath: String = "G:/people.txt"
  private val DefaultAddrPath: String = "G:/peopleaddr.txt"

  /**
   * Reads the people file (5 comma-separated fields per line) and the address
   * file (3 comma-separated fields per line), prints both tables, then prints
   * their join on `addr = cname`.
   *
   * @param args optional: `args(0)` is the people file path and `args(1)` is
   *             the address file path; each falls back to its default when absent
   */
  def main(args: Array[String]): Unit = {
    val peoplePath: String = args.lift(0).getOrElse(DefaultPeoplePath)
    val addrPath: String = args.lift(1).getOrElse(DefaultAddrPath)

    val spark: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName("test111")
      .getOrCreate()
    import spark.implicits._

    try {
      // Blank lines (e.g. a trailing newline at the end of the file) are
      // skipped; otherwise `toInt` on an empty field would abort the job.
      val peopleFields: Dataset[Array[String]] = spark.read.textFile(peoplePath)
        .filter(line => line.trim.nonEmpty)
        .map(_.split(","))
      val people: Dataset[People] =
        peopleFields.map(f => People(f(0).toInt, f(1), f(2).toInt, f(3), f(4)))
      val df1: DataFrame = people.toDF()
      df1.show()
      // createOrReplaceTempView does not fail when the view name is already
      // registered in the session, unlike createTempView.
      df1.createOrReplaceTempView("t1")

      val addrFields: Dataset[Array[String]] = spark.read.textFile(addrPath)
        .filter(line => line.trim.nonEmpty)
        .map(_.split(","))
      val addresses: Dataset[Peopleaddr] =
        addrFields.map(f => Peopleaddr(f(0), f(1), f(2)))
      val df2: DataFrame = addresses.toDF()
      df2.show()
      df2.createOrReplaceTempView("t2")

      val r: DataFrame = spark.sql(
        "select id,name,age,sex,addr,ename,cname,jiancheng from t1 join t2 on addr = cname")
      r.show()
    } finally {
      // Release the local Spark context even if reading or parsing fails.
      spark.stop()
    }
  }
}
运行结果:
+---+----+---+---+----+
| id|name|age|sex|addr|
+---+----+---+---+----+
| 1| 冯瑞| 22| 男| 北京|
| 2| 老王| 50| 男| 上海|
| 3| 大师|100| 女| 仙界|
+---+----+---+---+----+
+--------+-----+---------+
| ename|cname|jiancheng|
+--------+-----+---------+
| beijing| 北京| 京|
|shanghai| 上海| 琼|
| xianjie| 仙界| 仙|
+--------+-----+---------+
+---+----+---+---+----+--------+-----+---------+
| id|name|age|sex|addr| ename|cname|jiancheng|
+---+----+---+---+----+--------+-----+---------+
| 1| 冯瑞| 22| 男| 北京| beijing| 北京| 京|
| 2| 老王| 50| 男| 上海|shanghai| 上海| 琼|
| 3| 大师|100| 女| 仙界| xianjie| 仙界| 仙|
+---+----+---+---+----+--------+-----+---------+
Process finished with exit code 0