// Spark RDD operator practice: mapPartitionsWithIndex, aggregate, aggregateByKey.
// NOTE(review): this is a flat script (spark-shell / worksheet style). To compile it as a
// regular Scala 2 source file, wrap it in an object with a `def main(args: Array[String]): Unit`.
val conf = new SparkConf().setAppName("ForeachDemo").setMaster("local")
val sc = new SparkContext(conf)

/**
 * mapPartitionsWithIndex operates on each partition and also exposes the partition index.
 * It takes a function of the form (partitionIndex, elementIterator) => resultIterator.
 */
val func = (index: Int, iter: Iterator[Int]) => {
  iter.toList.map(x => "[partID:" + index + ", val: " + x + "]").iterator
}

// Same tagging function, but for String elements and written as a method.
def func2(index: Int, iter: Iterator[String]): Iterator[String] = {
  iter.toList.map(x => "[partID:" + index + ", val: " + x + "]").iterator
}

//------------------------------------------------------------------------------------------
val rdd1 = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7), 2)
val rdd2 = rdd1.mapPartitionsWithIndex(func).collect
// [partID:0, val: 1], [partID:0, val: 2], [partID:0, val: 3], [partID:0, val: 4],
// [partID:1, val: 5], [partID:1, val: 6], [partID:1, val: 7]
//print(rdd2.toBuffer)

/**
 * def aggregate[U: ClassTag](zeroValue: U)(seqOp: (U, T) => U, combOp: (U, U) => U): U
 * zeroValue is the initial value; seqOp runs inside each partition, combOp merges the
 * per-partition results. The zero value is applied once per partition AND once more in the
 * final combine step. E.g. aggregate(5)(_+_, _+_) with two partitions adds 5 three times,
 * and rdd6.aggregate("=")(_ + _, _ + _) yields ==abc=def.
 */
val rdd3 = rdd1.aggregate(0)(_ + _, _ + _) // 28
//println(rdd3)
//print(rdd1.getNumPartitions)
val rdd4 = rdd1.aggregate(5)(_ + _, _ + _) // 43: each of the 2 partitions adds a 5, the final combine adds one more (15 extra total)

//------------------------------------------------------------------------------------------
val rdd5 = rdd1.aggregate(5)((x, y) => math.max(x, y), _ + _)
// The zero value 5 takes part in each partition's max: partition 0's max is 5, partition 1's
// is 7; the combine step then sums zero + partition results: 5 + 5 + 7 = 17.
println(rdd5)

//------------------------------------------------------------------------------------------
val rdd6 = sc.parallelize(List("a", "b", "c", "d", "e", "f"), 2)
//println(rdd6.aggregate("")(_ + _, _ + _))  // abcdef
//println(rdd6.aggregate("=")(_ + _, _ + _)) // ==abc=def

//------------------------------------------------------------------------------------------
val rdd7 = sc.parallelize(List("12", "23", "345", "4567"), 2)
println(rdd7.mapPartitionsWithIndex(func2).collect.toBuffer)
// [partID:0, val: 12], [partID:0, val: 23], [partID:1, val: 345], [partID:1, val: 4567]
println(rdd7.aggregate("")((x, y) => math.max(x.length, y.length).toString, (x, y) => x + y))
// "24" or "42": partitions run in parallel, so the combine order depends on which finishes first.

//------------------------------------------------------------------------------------------
val rdd8 = sc.parallelize(List("12", "23", "345", ""), 2)
// [partID:0, val: 12], [partID:0, val: 23], [partID:1, val: 345], [partID:1, val: ]
//println(rdd8.mapPartitionsWithIndex(func2).collect.toBuffer)
/**
 * After each min comparison the result is converted back to a String via toString, so the
 * NEXT comparison uses that string's length:
 * partition 0: min("".length, "12".length) = 0, then min("0".length, "23".length) = 1
 * partition 1: min("".length, "345".length) = 0, then min("0".length, "".length)  = 0
 * combine: "" + "1" + "0" => "10" or "01" depending on partition completion order
 */
println(rdd8.aggregate("")((x, y) => math.min(x.length, y.length).toString, (x, y) => x + y)) // 10 or 01

//------------------------------------------------------------------------------------------
/**
 * Same idea with max => "21" or "12":
 * partition 0: max("".length, "12".length) = 2, then max("2".length, "23".length) = 2
 * partition 1: max("".length, "345".length) = 3, then max("3".length, "".length)  = 1
 */
println(rdd8.aggregate("")((x, y) => math.max(x.length, y.length).toString, (x, y) => x + y)) // 21 or 12

//------------------------------------------------------------------------------------------
val rdd9 = sc.parallelize(List("12", "23", "", "345"), 2)
println(rdd9.mapPartitionsWithIndex(func2).collect.toBuffer)
// [partID:0, val: 12], [partID:0, val: 23], [partID:1, val: ], [partID:1, val: 345]
// (was `print` in the original; changed to `println` for consistent line-based output)
println(rdd9.aggregate("")((x, y) => math.min(x.length, y.length).toString, (x, y) => x + y)) // 11

//------------------------------------------------------------------------------------------
val rdd10 = sc.parallelize(
  List(("cat", 2), ("cat", 5), ("mouse", 4), ("cat", 12), ("dog", 12), ("mouse", 2)), 2)

// Tagging function for (String, Int) pairs.
def func3(index: Int, iter: Iterator[(String, Int)]): Iterator[String] = {
  iter.toList.map(x => "[partID:" + index + ", val: " + x + "]").iterator
}

rdd10.mapPartitionsWithIndex(func3).collect
// aggregateByKey applies the zero value once per key PER PARTITION (not once globally):
// zero 0:   part0 => cat 5, mouse 4;           part1 => cat 12, dog 12, mouse 2 => cat 17, dog 12, mouse 6
// zero 100: every per-partition max is 100     => cat 200, dog 100, mouse 200
println(rdd10.aggregateByKey(0)(math.max(_, _), _ + _).collect.toBuffer)
// ArrayBuffer((dog,12), (cat,17), (mouse,6))
println(rdd10.aggregateByKey(100)(math.max(_, _), _ + _).collect.toBuffer)
// ArrayBuffer((dog,100), (cat,200), (mouse,200))

// Release the SparkContext — missing in the original, which leaked the local Spark runtime.
sc.stop()
// ---------------------------------------------------------------------------
// Scraped blog-page footer (kept for attribution, translated to English):
// Title: spark01 - RDD operator practice 02
// Reposted from: blog.csdn.net/oracle8090/article/details/78713128
// (Page-navigation boilerplate omitted: "Guess you like", "Today's picks",
//  "Weekly ranking" — these were scrape artifacts, not part of the code.)
// ---------------------------------------------------------------------------