package scalapackage
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by Germmy on 2018/5/9.
*/
object SparkRddTest {

  /**
   * Exercises common RDD operations: element-wise transforms (map/sortBy/filter),
   * flattening (flatMap), set operations (union/intersection/distinct), and
   * pair-RDD joins (inner / left outer / right outer). Runs locally on all cores.
   */
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setAppName("SparkRdd").setMaster("local[*]")
    val sc: SparkContext = new SparkContext(sparkConf)

    val rdd1: RDD[Int] = sc.parallelize(List(4, 5, 1, 9, 10, 8, 7, 6))
    // 1. Double every element, then sort ascending.
    val rdd2: RDD[Int] = rdd1.map(_ * 2)
    // println(rdd2.collect().toBuffer)
    val rdd3: RDD[Int] = rdd2.sortBy(identity, ascending = true)
    // println(rdd3.collect().toBuffer)
    // Keep only elements >= 10.
    val rdd4: RDD[Int] = rdd3.filter(_ >= 10)
    // println(rdd4.collect().toBuffer)

    // Split each string into tokens, then flatten.
    // FIX: the first element uses commas ("a,b,c") while the others use spaces;
    // splitting on " " alone left "a,b,c" as a single token. Split on either delimiter.
    val rdd5: RDD[String] = sc.parallelize(Array("a,b,c", "d e f", "h i j"))
    val rdd6: RDD[String] = rdd5.flatMap(_.split("[ ,]"))

    // A nested case: flatten a list of lists of sentences into individual words.
    val rdd7: RDD[List[String]] =
      sc.parallelize(List(List("a b c", "a b b"), List("d e f", "a b b"), List("h i j", "a b b")))
    val rdd8: RDD[String] = rdd7.flatMap(_.flatMap(_.split(" ")))
    // println(rdd8.collect().toBuffer)

    val rdd9: RDD[Int] = sc.parallelize(List(5, 6, 4, 3))
    val rdd10: RDD[Int] = sc.parallelize(List(1, 2, 3, 4))
    // Union (duplicates are kept).
    val rdd11: RDD[Int] = rdd9.union(rdd10)
    // println("并集为:" + rdd11.collect().toBuffer)
    // Intersection of the two RDDs.
    val rdd12: RDD[Int] = rdd9.intersection(rdd10)
    // println("交集为:" + rdd12.collect().toBuffer)
    // Deduplicate the union.
    val rdd13: RDD[Int] = rdd11.distinct()
    // println("去重为:" + rdd13.collect().toBuffer)

    val rdd14: RDD[(String, Int)] = sc.parallelize(List(("tom", 1), ("tom", 2), ("jerry", 3), ("kitty", 2)))
    val rdd15: RDD[(String, Int)] = sc.parallelize(List(("jerry", 2), ("tom", 1), ("shuke", 2)))
    // Inner join: only keys present on both sides.
    val rdd16: RDD[(String, (Int, Int))] = rdd14.join(rdd15)
    println("join为:" + rdd16.collect().toBuffer)
    // Left outer join: every key from rdd14; the right-side value is an Option.
    val rdd17: RDD[(String, (Int, Option[Int]))] = rdd14.leftOuterJoin(rdd15)
    println("left join为:" + rdd17.collect().toBuffer)
    // Right outer join: every key from rdd15; the left-side value is an Option.
    val rdd18: RDD[(String, (Option[Int], Int))] = rdd14.rightOuterJoin(rdd15)
    println("right join为:" + rdd18.collect().toBuffer)

    // FIX: release the SparkContext — resource cleanup was missing.
    sc.stop()
  }
}
// Spark RDD operator practice.
// Adapted from: my.oschina.net/windows20/blog/1809817