Continued from the previous post: https://blog.csdn.net/qq_44868502/article/details/103512533
In the previous post, a Kafka producer generated log messages; here, a Spark Streaming application subscribes to that topic, receives the logs, and processes them.
The implementation is as follows (the code is annotated):
import java.text.SimpleDateFormat
import java.util.{Date, Locale, Properties}
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
object SparkReceiver {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)
    // Create the SparkConf
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("SparkReceiver")
    // Create the StreamingContext with a 2-second batch interval
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    // Set the checkpoint directory (required by stateful operations such as updateStateByKey)
    ssc.checkpoint("E:\\checkpoint")
    // Key parameter 1: the four basic Kafka consumer settings in the Map
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "hadoop:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "day3"
    )
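    // Optional: two settings commonly added to kafkaParams per the
    // spark-streaming-kafka-0-10 integration guide (not part of the original code):
    //   "auto.offset.reset" -> "latest"                      // where to start when no offset is committed
    //   "enable.auto.commit" -> (false: java.lang.Boolean)   // let Spark manage offsets instead of Kafka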
    // Key parameter 2: the list of topic names
    val topics = List("day3")
    // Spark Streaming reads from Kafka using the direct approach (createDirectStream, no receiver)
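    // PreferConsistent distributes Kafka partitions evenly across the available executors;
    // Subscribe lets the consumer group ("day3") subscribe to a fixed collection of topics.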
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )
    // Extract the value (the log line) from each Kafka record
    val mapDs = stream.map(_.value())
    // mapDs.print()
    // transform modifies the DStream one RDD at a time
    val userDS = mapDs.transform(rdd => rdd.map(line => {
      val arr = line.split(" ")
      // Convert the millisecond timestamp (field 0) to a yyyy-MM-dd day string
      val day = new SimpleDateFormat("yyyy-MM-dd").format(new Date(arr(0).toLong))
      val userID = arr(3)
      val adID = arr(4)
      (day + "," + userID + "," + adID, 1)
    }))
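    // Note: SimpleDateFormat is not thread-safe and is instantiated once per record
    // above; for high-volume streams, creating one instance per partition
    // (via mapPartitions) would avoid the repeated allocation.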
    // Within each batch interval, reduceByKey performs a local (per-batch) aggregation
    val reduceDS = userDS.reduceByKey(_ + _)
    // updateStateByKey maintains the running total across all batches
    val totalDS = reduceDS.updateStateByKey((nowValue: Seq[Int], bfValue: Option[Int]) => {
      val now = nowValue.sum
      val bf = bfValue.getOrElse(0)
      // The return value is an Option
      Option(now + bf)
    })
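    // The per-key state survives across batches; it is persisted through the
    // checkpoint directory configured above.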
    totalDS.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
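For reference, here is a minimal standalone sketch of how one log line flows through the parsing step above. The field layout (millisecond timestamp in field 0, user ID in field 3, ad ID in field 4, space-separated) matches what the streaming code assumes; the concrete sample values are made up for illustration.

import java.text.SimpleDateFormat
import java.util.Date

object ParseDemo {
  def main(args: Array[String]): Unit = {
    // Hypothetical sample line: "<timestampMs> <field1> <field2> <userID> <adID>"
    val line = "1576540800000 Jiangsu Nanjing user01 ad99"
    val arr = line.split(" ")
    val day = new SimpleDateFormat("yyyy-MM-dd").format(new Date(arr(0).toLong))
    // Prints the (key, 1) pair the streaming job would emit for this record,
    // e.g. (2019-12-17,user01,ad99,1) in a UTC+8 timezone
    println((day + "," + arr(3) + "," + arr(4), 1))
  }
}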
This preliminary implementation is not yet complete; to be continued...