// spark01 - RDD operator exercises 02 (mapPartitionsWithIndex / aggregate / aggregateByKey)

// Driver configuration: application name "ForeachDemo", single-threaded local master.
val conf=new SparkConf().setAppName("ForeachDemo").setMaster("local")
// SparkContext is the entry point for every RDD created below.
val sc=new SparkContext(conf)
/**
  * mapPartitionsWithIndex operates on one whole partition at a time and also
  * receives the partition number. It takes a function of
  * (partitionIndex, elementIterator) => resultIterator.
  * This variant tags every Int element with the partition it came from.
  */
val func = (index: Int, iter: Iterator[(Int)]) => {
  (for (elem <- iter.toList) yield s"[partID:$index, val: $elem]").iterator
}
/** Same as `func` but for String elements: tags each one with its partition index. */
def func2(index: Int, iter: Iterator[(String)]) : Iterator[String] = {
  val tagged = iter.toList.map { item =>
    s"[partID:$index, val: $item]"
  }
  tagged.iterator
}
//------------------------------------------------------------------------------------------------------------------
// 7 elements over 2 partitions. Spark's parallelize slicing gives
// partition 0 = (1,2,3) and partition 1 = (4,5,6,7) — consistent with the
// max-per-partition commentary on rdd5 below.
val rdd1 = sc.parallelize(List(1,2,3,4,5,6,7), 2)
// Tag each element with its partition id (see `func` above).
val rdd2 = rdd1.mapPartitionsWithIndex(func).collect
//[partID:0, val: 1], [partID:0, val: 2], [partID:0, val: 3], [partID:1, val: 4], [partID:1, val: 5], [partID:1, val: 6], [partID:1, val: 7]
//print(rdd2.toBuffer)

/**
  * def aggregate[U: ClassTag](zeroValue: U)(seqOp: (U, T) => U, combOp: (U, U) => U): U
  * zeroValue is the initial value; seqOp folds the elements WITHIN each partition,
  * combOp merges the per-partition results ACROSS partitions.
  * Note: zeroValue is used once per partition AND once more in the final merge.
  * e.g. aggregate(5)(_+_, _+_) with two partitions adds 5 three times in total,
  * and rdd6.aggregate("=")(_ + _, _ + _) yields ==abc=def (see below).
  */
val rdd3 = rdd1.aggregate(0)(_+_,_+_)//28
//println(rdd3)
//print(rdd1.getNumPartitions)


val rdd4 = rdd1.aggregate(5)(_+_,_+_)//43
// 1+...+7 = 28, plus 5 seeded into each of the 2 partitions and 5 in the merge: 28 + 15 = 43.

//-----------------------------------------------------------------------------------------------------------------
val rdd5 = rdd1.aggregate(5)((x,y)=>math.max(x,y),_+_)
// seqOp seeds each partition with 5 and keeps the max: partition 0 (1,2,3) -> 5, partition 1 (4,5,6,7) -> 7.
// combOp then sums the per-partition maxima plus the zero value once more: 5 + 7 + 5 = 17.
println(rdd5)

//-----------------------------------------------------------------------------------------------------------------
// Two partitions: ("a","b","c") and ("d","e","f").
val rdd6 = sc.parallelize(List("a","b","c","d","e","f"),2)
//println(rdd6.aggregate("")(_ + _, _ + _))// abcdef
//println(rdd6.aggregate("=")(_ + _, _ + _))// ==abc=def — "=" prefixes each partition's fold and the final merge

//------------------------------------------------------------------------------------------------------------------
val rdd7 = sc.parallelize(List("12","23","345","4567"),2)
println(rdd7.mapPartitionsWithIndex(func2).collect.toBuffer)
//[partID:0, val: 12], [partID:0, val: 23], [partID:1, val: 345], [partID:1, val: 4567]
// seqOp keeps the max string length, converted back to a String at every step:
// partition 0: max(0,2)=2 -> "2"; max(1,2)=2 -> "2".   partition 1: max(0,3)=3 -> "3"; max(1,4)=4 -> "4".
// combOp concatenates, so the result is "24" or "42" depending on which partition finishes first.
println(rdd7.aggregate("")((x,y) => math.max(x.length, y.length).toString, (x,y) => x + y))

//------------------------------------------------------------------------------------------------------------------
val rdd8 = sc.parallelize(List("12","23","345",""),2)
//[partID:0, val: 12], [partID:0, val: 23], [partID:1, val: 345], [partID:1, val: ]
// println(rdd8.mapPartitionsWithIndex(func2).collect.toBuffer)
/** After each min, the Int result is turned back into a String and becomes the
  * accumulator for the next comparison:
  * partition 0: min("".length, "12".length)=0 -> "0"; min("0".length, "23".length)=1 -> "1".
  * partition 1: min("".length, "345".length)=0 -> "0"; min("0".length, "".length)=0 -> "0".
  * combOp concatenation order is nondeterministic, so the result is "10" or "01".
  */
println(rdd8.aggregate("")((x,y)=>math.min(x.length, y.length).toString, (x,y) => x + y))
// 10 or 01
//------------------------------------------------------------------------------------------------------------------
/** Same fold with max instead of min:
  * partition 0: max("".length, "12".length)=2 -> "2"; max("2".length, "23".length)=2 -> "2".
  * partition 1: max("".length, "345".length)=3 -> "3"; max("3".length, "".length)=1 -> "1".
  * combOp order is nondeterministic: "21" or "12".
  */
println(rdd8.aggregate("")((x,y)=>math.max(x.length, y.length).toString, (x,y) => x + y))
// 21 or 12
//------------------------------------------------------------------------------------------------------------------
val rdd9 = sc.parallelize(List("12","23","","345"),2)
println(rdd9.mapPartitionsWithIndex(func2).collect.toBuffer)
//[partID:0, val: 12], [partID:0, val: 23], [partID:1, val: ], [partID:1, val: 345]
// partition 0: min(0,2)=0 -> "0"; min(1,2)=1 -> "1".   partition 1: min(0,0)=0 -> "0"; min(1,3)=1 -> "1".
// Both partitions yield "1", so the concatenated result is always "11".
print(rdd9.aggregate("")((x,y) => math.min(x.length, y.length).toString, (x,y) => x + y))
// 11

//------------------------------------------------------------------------------------------------------------------
// Pair RDD over 2 partitions: partition 0 = (cat,2),(cat,5),(mouse,4); partition 1 = (cat,12),(dog,12),(mouse,2).
val rdd10 = sc.parallelize(List( ("cat",2), ("cat", 5), ("mouse", 4),("cat", 12), ("dog", 12), ("mouse", 2)), 2)
/** Tags every (String, Int) pair with the index of the partition it belongs to. */
def func3(index: Int, iter: Iterator[(String, Int)]) : Iterator[String] = {
  val labelled = for (pair <- iter.toList) yield s"[partID:$index, val: $pair]"
  labelled.iterator
}
rdd10.mapPartitionsWithIndex(func3).collect
// aggregateByKey applies the zero value once per KEY per PARTITION; seqOp folds
// within a partition, combOp merges across partitions.
// zero 0: cat max(2,5)=5 then +12 = 17; dog 12; mouse 4+2 = 6.
println(rdd10.aggregateByKey(0)(math.max(_, _), _ + _).collect.toBuffer)
//ArrayBuffer((dog,12), (cat,17), (mouse,6))
// zero 100: every per-partition max is 100; keys present in both partitions sum to 200.
println(rdd10.aggregateByKey(100)(math.max(_, _), _ + _).collect.toBuffer)
//ArrayBuffer((dog,100), (cat,200), (mouse,200))

// Source: https://blog.csdn.net/oracle8090/article/details/78713128