package cn.spark.study01

import org.apache.spark.{SparkConf, SparkContext}

/**
 * RDD transformation practice: map, sortBy, filter, flatMap, union, distinct,
 * intersection, join variants, groupByKey, reduceByKey, cogroup and cartesian.
 *
 * Created by lp on 2017/11/29.
 */
object Demo01 {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("ForeachDemo").setMaster("local")
    val sc = new SparkContext(conf)

    // Build an initial RDD from a local collection.
    val numbers = sc.parallelize(Array(5, 6, 4, 7, 3, 8, 2, 9, 1, 10))

    // Double every element (lazy transformation; nothing is materialized here).
    sc.parallelize(Array(5, 6, 4, 7, 3, 8, 2, 9, 1, 10)).map(_ * 2)

    // ------------------------------------------------------------------
    // Double, then sort ascending by numeric value.
    val doubledSorted = sc.parallelize(Array(5, 6, 4, 7, 3, 8, 2, 9, 1, 10)).map(_ * 2).sortBy(x => x, true)
    // => 2, 4, 6, 8, 10, 12, 14, 16, 18, 20

    // ------------------------------------------------------------------
    // Keep only values greater than 10.
    doubledSorted.filter(_ > 10) // => 12, 14, 16, 18, 20

    // Sorting by the string form gives lexicographic, not numeric, order.
    val lexSorted = sc.parallelize(Array(5, 6, 4, 7, 3, 8, 2, 9, 1, 10)).map(_ * 2).sortBy(x => x + "", true)
    // => 10, 12, 14, 16, 18, 2, 20, 4, 6, 8

    // ------------------------------------------------------------------
    // Flatten sentence strings into individual words.
    val words = sc.parallelize(Array("a b c", "d e f", "h i j")).flatMap(_.split(" "))
    // => a, b, c, d, e, f, h, i, j

    // Nested lists require a nested flatMap to reach the words.
    val nestedWords = sc
      .parallelize(List(List("a b c", "a b b"), List("e f g", "a f g"), List("h i j", "a a b")))
      .flatMap(_.flatMap(_.split(" ")))
    // => a, b, c, a, b, b, e, f, g, a, f, g, h, i, j, a, a, b

    // union concatenates two RDDs; element types must match, duplicates are kept.
    val leftNums = sc.parallelize(List(5, 6, 4, 7))
    val rightNums = sc.parallelize(List(1, 2, 3, 4))
    val combined = leftNums.union(rightNums) // => 5, 6, 4, 7, 1, 2, 3, 4

    // distinct drops duplicate values.
    val deduped = combined.distinct().sortBy(x => x) // => 1, 2, 3, 4, 5, 6, 7

    // intersection keeps only the values present in both RDDs.
    val common = leftNums.intersection(rightNums) // => 4

    // Inner join on the key: only keys present on both sides survive.
    val scores1 = sc.parallelize(List(("tom", 1), ("jerry", 2), ("kitty", 3)))
    val scores2 = sc.parallelize(List(("jerry", 9), ("tom", 8), ("shuke", 7), ("tom", 100)))
    val joined = scores1.join(scores2) // => (tom,(1,8)), (tom,(1,100)), (jerry,(2,9))

    // Left outer join: an unmatched right side becomes None.
    val leftJoined = scores1.leftOuterJoin(scores2)
    // => (tom,(1,Some(8))), (tom,(1,Some(100))), (jerry,(2,Some(9))), (kitty,(3,None))

    // Right outer join: an unmatched left side becomes None.
    val rightJoined = scores1.rightOuterJoin(scores2)
    // => (tom,(Some(1),8)), (tom,(Some(1),100)), (jerry,(Some(2),9)), (shuke,(None,7))

    // groupByKey gathers every value for a key into one buffer.
    val grouped = scores1.union(scores2).groupByKey()
    // => (tom,CompactBuffer(1, 8, 100)), (jerry,CompactBuffer(2, 9)), (shuke,CompactBuffer(7)), (kitty,CompactBuffer(3))
    val summed = grouped.map(x => (x._1, x._2.sum))
    // => (tom,109), (jerry,11), (shuke,7), (kitty,3)

    // WordCount, two ways.
    // reduceByKey is more efficient than groupByKey because it combines
    // values locally on each partition before shuffling.
    sc.textFile("/root/words.txt").flatMap(x => x.split(" ")).map((_, 1)).reduceByKey(_ + _).sortBy(_._2, false).collect
    sc.textFile("/root/words.txt").flatMap(x => x.split(" ")).map((_, 1)).groupByKey.map(t => (t._1, t._2.sum)).collect

    // cogroup pairs, per key, the buffer of values from each RDD.
    val pairsA = sc.parallelize(List(("tom", 1), ("tom", 2), ("jerry", 3), ("kitty", 2)))
    val pairsB = sc.parallelize(List(("jerry", 2), ("tom", 1), ("shuke", 2)))
    val cogrouped = pairsA.cogroup(pairsB)
    // => Array((tom,(CompactBuffer(1, 2),CompactBuffer(1))), (jerry,(CompactBuffer(3),CompactBuffer(2))), (shuke,(CompactBuffer(),CompactBuffer(2))), (kitty,(CompactBuffer(2),CompactBuffer())))
    val cogroupSums = cogrouped.map(t => (t._1, t._2._1.sum + t._2._2.sum))

    // cartesian builds every (left, right) pair across the two RDDs.
    val names1 = sc.parallelize(List("tom", "jerry"))
    val names2 = sc.parallelize(List("tom", "kitty", "shuke"))
    val crossed = names1.cartesian(names2)
    // => Array((tom,tom), (tom,kitty), (tom,shuke), (jerry,tom), (jerry,kitty), (jerry,shuke))
  }
}
// spark01 - RDD operator practice 01
// Adapted from blog.csdn.net/oracle8090/article/details/78679923