Spark 01 - RDD Operator Exercises 01

package cn.spark.study01

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by lp on 2017/11/29.
  */
object Demo01 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("ForeachDemo").setMaster("local")
    val sc = new SparkContext(conf)
    //Initialize an RDD
    val rdd1 = sc.parallelize(Array(5,6,4,7,3,8,2,9,1,10))
    //Multiply each element by 2
    rdd1.map(_*2)
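    //Transformations are lazy: the line above only builds a plan. An action such
    //as collect() actually runs it; for this dataset:
    //rdd1.map(_*2).collect()  //Array(10, 12, 8, 14, 6, 16, 4, 18, 2, 20)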

    //------------------------------------------------------------------------------------------------------------------
    //Sort the doubled values in ascending order
    val rdd2 = rdd1.map(_*2).sortBy(x=>x,true)
    //2, 4, 6, 8, 10, 12, 14, 16, 18, 20

    //------------------------------------------------------------------------------------------------------------------
    //Filter
    rdd2.filter(_>10)//12, 14, 16, 18, 20
    //Appending "" turns each value into a String, so this sort is lexicographic
    val rdd3 = rdd1.map(_*2).sortBy(x=>x+"",true)
    //10, 12, 14, 16, 18, 2, 20, 4, 6, 8

    //------------------------------------------------------------------------------------------------------------------
    //Flatten every element into a single collection of words
    val rdd4 = sc.parallelize(Array("a b c", "d e f", "h i j")).flatMap(_.split(" "))
    //a, b, c, d, e, f, h, i, j

    //Nested lists need a nested flatMap: the outer call unwraps each inner List, the inner call splits each string into words
    val rdd5 = sc.parallelize(List(List("a b c", "a b b"), List("e f g", "a f g"), List("h i j", "a a b"))).flatMap(_.flatMap(_.split(" ")))
    //a, b, c, a, b, b, e, f, g, a, f, g, h, i, j, a, a, b

    //union: combine two RDDs; the element types must match
    val rdd6 = sc.parallelize(List(5,6,4,7))
    val rdd7 = sc.parallelize(List(1,2,3,4))
    val rdd8 = rdd6.union(rdd7)//5, 6, 4, 7, 1, 2, 3, 4

    //Remove duplicate values
    val rdd9 = rdd8.distinct().sortBy(x=>x)//1, 2, 3, 4, 5, 6, 7
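    //Under the hood, distinct is built from operators seen here, roughly:
    //map(x => (x, null)).reduceByKey((a, _) => a).map(_._1)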

    //Intersection
    val rdd10 = rdd6.intersection(rdd7)//4
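    //Note: intersection also de-duplicates its output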

    //join: inner join on the key; only keys present in both RDDs survive
    val rdd11 = sc.parallelize(List(("tom", 1), ("jerry", 2), ("kitty", 3)))
    val rdd12 = sc.parallelize(List(("jerry", 9), ("tom", 8), ("shuke", 7),("tom",100)))
    val rdd13 = rdd11.join(rdd12)//(tom,(1,8)), (tom,(1,100)), (jerry,(2,9))

    //left join
    val rdd14 = rdd11.leftOuterJoin(rdd12)
    //(tom,(1,Some(8))), (tom,(1,Some(100))), (jerry,(2,Some(9))), (kitty,(3,None))

    //right join
    val rdd15 = rdd11.rightOuterJoin(rdd12)
    //(tom,(Some(1),8)), (tom,(Some(1),100)), (jerry,(Some(2),9)), (shuke,(None,7))
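
    //fullOuterJoin keeps keys from both sides (an added sketch, not part of the original exercise)
    val rddFull = rdd11.fullOuterJoin(rdd12)
    //(tom,(Some(1),Some(8))), (tom,(Some(1),Some(100))), (jerry,(Some(2),Some(9))), (kitty,(Some(3),None)), (shuke,(None,Some(7)))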

    //groupByKey: group all values that share a key
    val rdd16 = rdd11.union(rdd12).groupByKey()
    //(tom,CompactBuffer(1, 8, 100)), (jerry,CompactBuffer(2, 9)), (shuke,CompactBuffer(7)), (kitty,CompactBuffer(3))

    val rdd17 = rdd16.map(x=>(x._1,x._2.sum))
    //(tom,109), (jerry,11), (shuke,7), (kitty,3)
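    //mapValues is a tidier equivalent here, since only the values change:
    //rdd16.mapValues(_.sum)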

    //WordCount
    // reduceByKey is more efficient than groupByKey: it combines values locally within each partition before the shuffle
    sc.textFile("/root/words.txt").flatMap(x=>x.split(" ")).map((_,1)).reduceByKey(_+_).sortBy(_._2,false).collect
    sc.textFile("/root/words.txt").flatMap(x=>x.split(" ")).map((_,1)).groupByKey.map(t=>(t._1, t._2.sum)).collect
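    //Conceptually, reduceByKey(_+_) is combineByKey with the same merge function
    //on the map side and the reduce side, which is what enables the local combine
    //(an equivalent sketch of the reduceByKey step above):
    sc.textFile("/root/words.txt").flatMap(_.split(" ")).map((_,1))
      .combineByKey((v: Int) => v, (a: Int, v: Int) => a + v, (a: Int, b: Int) => a + b)
      .collect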

    //cogroup: for each key, gather its values from both RDDs into a pair of Iterables
    val rdd18 = sc.parallelize(List(("tom", 1), ("tom", 2), ("jerry", 3), ("kitty", 2)))
    val rdd19 = sc.parallelize(List(("jerry", 2), ("tom", 1), ("shuke", 2)))
    val rdd20 = rdd18.cogroup(rdd19)
    //Array((tom,(CompactBuffer(1, 2),CompactBuffer(1))), (jerry,(CompactBuffer(3),CompactBuffer(2))), (shuke,(CompactBuffer(),CompactBuffer(2))), (kitty,(CompactBuffer(2),CompactBuffer())))
    val rdd21 = rdd20.map(t=>(t._1, t._2._1.sum + t._2._2.sum))
    //(tom,4), (jerry,5), (shuke,2), (kitty,2)
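    //join itself is implemented on top of cogroup: cogroup by key, then pair
    //every left value with every right value, roughly:
    //rdd18.cogroup(rdd19).flatMapValues { case (vs, ws) => for (v <- vs; w <- ws) yield (v, w) }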

    //cartesian: Cartesian product (every pairing of elements from the two RDDs)
    val rdd22 = sc.parallelize(List("tom", "jerry"))
    val rdd23 = sc.parallelize(List("tom", "kitty", "shuke"))
    val rdd24 = rdd22.cartesian(rdd23)
    //Array((tom,tom), (tom,kitty), (tom,shuke), (jerry,tom), (jerry,kitty), (jerry,shuke))
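    //cartesian of m- and n-element RDDs yields m * n pairs (here 2 * 3 = 6), so it grows quickly on large inputs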

    sc.stop()
  }
}

Reposted from blog.csdn.net/oracle8090/article/details/78679923