Spark Part 1: Random-Number Prefixes for Data Skew in Spark and Spark SQL UDFs
No fluff, straight to the good stuff.
1 Spark RDD
Approach: add a random prefix and aggregate once, then strip the prefix and aggregate a second time. The prefix spreads a hot key across multiple reduce tasks, so no single task has to process all of that key's rows.
package RDDDFDS

import RDDDFDS.初始化.ContextUtils
import RDDDFDS.隐式转换.ImplicitAspect.rdd2RichRDD
import org.apache.spark.rdd.RDD

object sparkWc {
  def main(args: Array[String]): Unit = {
    val sc = ContextUtils.getSC(this.getClass.getSimpleName)
    val readRdd: RDD[String] = sc.textFile("C:\\数据\\WC数据")
    // Split each line into words
    val words: RDD[String] = readRdd.flatMap(_.split(" "))
    // Attach a random 0-8 prefix so a skewed key is spread across multiple partitions
    val randomRDD: RDD[String] = words.map(scala.util.Random.nextInt(9).toString + "_" + _)
    val tuples: RDD[(String, Int)] = randomRDD.map((_, 1))
    // First aggregation, with the prefix still attached
    val sumed: RDD[(String, Int)] = tuples.reduceByKey(_ + _)
    // Strip the random prefix
    val unprefixed: RDD[(String, Int)] = sumed.map { case (prefixedKey, count) =>
      (prefixedKey.split("_")(1), count)
    }
    // Second (final) aggregation on the prefix-free keys
    val result: RDD[(String, Int)] = unprefixed.reduceByKey(_ + _)
    result.printInfo()
    sc.stop()
  }
}
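Note: the two helpers imported above, ContextUtils.getSC and the printInfo() enrichment, live in the author's own utility packages and are not shown in the post. Here is a minimal sketch of what they might look like (the bodies are assumptions; only the names come from the imports):

// Hypothetical reconstructions; the real versions live in RDDDFDS.初始化 and RDDDFDS.隐式转换
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object ContextUtils {
  // Assumed behavior: build a local SparkContext named after the calling class
  def getSC(appName: String): SparkContext =
    new SparkContext(new SparkConf().setAppName(appName).setMaster("local[*]"))
}

object ImplicitAspect {
  // Assumed behavior: enrich any RDD with printInfo(), which collects and prints it
  implicit class rdd2RichRDD[T](rdd: RDD[T]) {
    def printInfo(): Unit = rdd.collect().foreach(println)
  }
}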
2 Spark SQL DataFrame UDF
Approach: the same two-stage aggregation, this time in SQL with UDFs: prefix and aggregate once, then strip the prefix and aggregate again.
package RDDDFDS

import java.util.Random

import RDDDFDS.初始化.ContextUtils
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SQLContext}

object sparkMysqlWc {
  def main(args: Array[String]): Unit = {
    Logger.getRootLogger.setLevel(Level.ERROR)
    val sc: SparkContext = ContextUtils.getSC(this.getClass.getSimpleName)
    val sqlContext = new SQLContext(sc)
    // Load the skewed table from MySQL and cache it for the two-pass query
    val reader: DataFrame = sqlContext.read.format("jdbc")
      .option("url", "jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=utf8&autoReconnect=true&rewriteBatchedStatements=TRUE&useSSL=false")
      .option("driver", "com.mysql.jdbc.Driver")
      .option("user", "root")
      .option("password", "123456")
      .option("dbtable", "boytest")
      .load()
    reader.registerTempTable("boytest")
    sqlContext.cacheTable("boytest")
    // UDF: attach a random 0-9 prefix to the key
    sqlContext.udf.register("random_prefix", (key: String) => {
      val random = new Random()
      random.nextInt(10) + "_" + key
    })
    // UDF: strip the random prefix
    sqlContext.udf.register("remove_prefix", (key: String) => key.split("_")(1))
    // Inner query: first aggregation on the prefixed key.
    // Outer query: strip the prefix and aggregate a second time.
    val datas = sqlContext.sql(
      s"""
         |select sum(ct) as ct, remove_prefix(name) as name
         |from (
         |  select count(age) as ct, random_prefix(name) as name
         |  from boytest
         |  group by random_prefix(name)
         |) t
         |group by remove_prefix(name)
       """.stripMargin)
    datas.show()
    sqlContext.uncacheTable("boytest")
    sc.stop()
  }
}
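For comparison, the same two-stage aggregation can be expressed with the DataFrame API instead of SQL UDFs. This is only a sketch: it assumes df is the boytest DataFrame loaded by the JDBC reader above, that name values contain no underscore of their own, and the names SkewSafe and skewSafeCount are made up for illustration:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

object SkewSafe {
  def skewSafeCount(df: DataFrame): DataFrame = {
    df
      // Stage 1: bucket each name under a random 0-9 prefix, then pre-aggregate
      .withColumn("prefixed", concat((rand() * 10).cast("int").cast("string"), lit("_"), col("name")))
      .groupBy("prefixed")
      .agg(count("age").as("ct"))
      // Stage 2: strip the numeric prefix and aggregate to the final counts
      .withColumn("name", regexp_replace(col("prefixed"), "^\\d+_", ""))
      .groupBy("name")
      .agg(sum("ct").as("ct"))
  }
}

Either way the idea is the same: stage one spreads a hot key over ten different reduce tasks, so no single task carries the whole skewed key.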