Real-time ad traffic statistics (Scala): blacklist filtering and real-time per-province/per-city ad click counts

1. Project analysis

Project repository: https://gitee.com/jenrey/adv

Technology choices:

    Spark Streaming or Storm

Data:

    Ad click-stream data

Requirements:

   1) Count, in [real time], the [hot] ads [per province] for [each day] (a grouped Top-N by ad click count)

   2) Track the ad delivery trend over a given period in real time

Data fields:

timestamp: the time at which the user clicked the ad

province: the province in which the user clicked the ad

city: the city in which the user clicked the ad

userid: the unique identifier of the user

advid: the id of the clicked ad

The source data is already available in Kafka.
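
For concreteness, each Kafka record is assumed to be one comma-separated line in the field order above. A minimal parsing sketch (the sample values and the AdClick/parse names are made up for illustration; they are not part of the original project):

    // A minimal sketch, assuming one comma-separated record per Kafka message,
    // in the field order listed above. The sample values are invented.
    case class AdClick(timestamp: Long, province: String, city: String, userid: Long, advid: Long)

    def parse(line: String): AdClick = {
      val f = line.split(",")
      AdClick(f(0).toLong, f(1), f(2), f(3).toLong, f(4).toLong)
    }

    // parse("1527427200000,hebei,shijiazhuang,1,20")
    //   => AdClick(1527427200000L, "hebei", "shijiazhuang", 1L, 20L)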

2. Blacklist filtering

    import kafka.serializer.StringDecoder
    import org.apache.spark.broadcast.Broadcast
    import org.apache.spark.rdd.RDD
    import org.apache.spark.streaming.dstream.DStream
    import org.apache.spark.streaming.kafka.KafkaUtils
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.{SparkConf, SparkContext}

    /**
      * Created by jenrey on 2018/5/27 21:07
      */
    object AdvApplicationTest {
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf()
        conf.setAppName("AdvApplicationTest")
        conf.setMaster("local")
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") // serialization (Kryo assumed; the original left this setting blank)
        val sc = new SparkContext(conf)
        val ssc = new StreamingContext(sc, Seconds(5))

        /**
          * TODO: Step 1: read the data from Kafka (direct approach)
          */
        /* K: ClassTag,
           V: ClassTag,
           KD <: Decoder[K]: ClassTag,
           VD <: Decoder[V]: ClassTag] (
           ssc: StreamingContext,
           kafkaParams: Map[String, String],
           topics: Set[String] */
        val kafkaParams = Map("metadata.broker.list" -> "hadoop04:9092")
        val topics = Set("aura")
        val logDStream: DStream[String] = KafkaUtils
          .createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
          .map(_._2)

        // TODO: if [one user] clicks [one ad] more than [100 times] [within one day], that user is a
        // blacklisted user and his records are excluded from the statistics
        /**
          * TODO: Step 2: blacklist filtering
          */
        val filterLogDStream: DStream[String] = blackListFilter(logDStream, ssc)
        /**
          * TODO: Step 3: dynamically generate the blacklist
          */
        /**
          * TODO: Step 4: real-time count of ad clicks per province and city per day
          */
        /**
          * TODO: Step 5: real-time count of hot ads per province per day
          */
        /**
          * TODO: Step 6: real-time click trend of each ad over the last hour (sliding window)
          */
        ssc.start()
        ssc.awaitTermination()
        ssc.stop()
      }

      /**
        * Filters out the records of blacklisted users.
        *
        * @param logDStream data read from Kafka
        * @return the data left after blacklist filtering
        */
      def blackListFilter(logDStream: DStream[String], ssc: StreamingContext): DStream[String] = {
        // In practice the blacklist should be read from a persistent store; the usual choices are Redis, HBase and MySQL
        val blackList = List((1L, true), (2L, true), (3L, true))
        // Turn the blacklist into an RDD
        val blackListRDD: RDD[(Long, Boolean)] = ssc.sparkContext.parallelize(blackList)
        // Broadcast the blacklist
        val blackListBroadcast: Broadcast[Array[(Long, Boolean)]] = ssc.sparkContext.broadcast(blackListRDD.collect())
        // transform operates on each RDD of the incoming DStream
        logDStream.transform(rdd => {
          // Split each incoming line into key/value form, keyed by userid
          val user_lineRDD: RDD[(Long, String)] = rdd.map(line => {
            val fields: Array[String] = line.split(",")
            (fields(3).toLong, line)
          })
          // Note: after broadcasting, read the broadcast value with .value
          val blackRDD: RDD[(Long, Boolean)] = rdd.sparkContext.parallelize(blackListBroadcast.value)
          /**
            * List((22L, "qwe"), (2L, "asd"), (3L, "zxc"))
            * List((1L, true), (2L, true), (3L, true))
            * leftOuterJoin gives the result below; both sides must be key/value pairs
            * (22,(qwe,None))
            * (3,(zxc,Some(true)))
            * (2,(asd,Some(true)))
            */
          val resultRDD: RDD[(Long, (String, Option[Boolean]))] = user_lineRDD.leftOuterJoin(blackRDD)
          // Return value: only the records whose userid did not match the blacklist
          resultRDD.filter(tuple => {
            tuple._2._2.isEmpty
          }).map(_._2._1)
        })
      }
    }
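
The comment in blackListFilter notes that the blacklist should really come from a persistent store such as Redis, HBase or MySQL. A minimal sketch of loading it with plain JDBC from the aura.black_list table that step 3 writes (the connection settings are assumed to match the ones used later in this post; loadBlackList is a helper name introduced here):

    import java.sql.DriverManager
    import scala.collection.mutable.ListBuffer

    // A minimal sketch: read the blacklisted user ids from MySQL.
    // Assumes the aura.black_list table written in step 3 (a single userid column)
    // and the same connection settings used elsewhere in this post.
    def loadBlackList(): List[(Long, Boolean)] = {
      val connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/aura", "aura", "aura")
      val buffer = ListBuffer[(Long, Boolean)]()
      try {
        val rs = connection.createStatement().executeQuery("select userid from black_list")
        while (rs.next()) {
          buffer += ((rs.getLong("userid"), true))
        }
      } finally {
        connection.close()
      }
      buffer.toList
    }

    // In blackListFilter, the hard-coded list could then be replaced with:
    //   val blackList = loadBlackList()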

3. Dynamically generating the blacklist

    import java.util.{Date, Properties}
    import kafka.serializer.StringDecoder
    import org.apache.spark.broadcast.Broadcast
    import org.apache.spark.rdd.RDD
    import org.apache.spark.sql._
    import org.apache.spark.streaming.dstream.DStream
    import org.apache.spark.streaming.kafka.KafkaUtils
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.{SparkConf, SparkContext}
    import utils.{ConnectionPool, DateUtils}

    /**
      * Created by jenrey on 2018/5/27 21:07
      */
    object AdvApplicationTest {
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf()
        conf.setAppName("AdvApplicationTest")
        conf.setMaster("local")
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") // serialization (Kryo assumed; the original left this setting blank)
        val sc = new SparkContext(conf)
        val ssc = new StreamingContext(sc, Seconds(5))
        val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()

        /**
          * TODO: Step 1: read the data from Kafka (direct approach)
          */
        /* K: ClassTag,
           V: ClassTag,
           KD <: Decoder[K]: ClassTag,
           VD <: Decoder[V]: ClassTag] (
           ssc: StreamingContext,
           kafkaParams: Map[String, String],
           topics: Set[String] */
        val kafkaParams = Map("metadata.broker.list" -> "hadoop04:9092")
        val topics = Set("aura")
        val logDStream: DStream[String] = KafkaUtils
          .createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
          .map(_._2)

        // TODO: if [one user] clicks [one ad] more than [100 times] [within one day], that user is a
        // blacklisted user and his records are excluded from the statistics
        /**
          * TODO: Step 2: blacklist filtering
          */
        val filterLogDStream: DStream[String] = blackListFilter(logDStream, ssc)
        /**
          * TODO: Step 3: dynamically generate the blacklist, in real time
          */
        DynamicGenerationBlacklists(filterLogDStream, spark)
        /**
          * TODO: Step 4: real-time count of ad clicks per province and city per day
          */
        /**
          * TODO: Step 5: real-time count of hot ads per province per day
          */
        /**
          * TODO: Step 6: real-time click trend of each ad over the last hour (sliding window)
          */
        ssc.start()
        ssc.awaitTermination()
        ssc.stop()
      }

      /**
        * TODO: filters out the records of blacklisted users.
        *
        * @param logDStream data read from Kafka
        * @return the data left after blacklist filtering
        */
      def blackListFilter(logDStream: DStream[String], ssc: StreamingContext): DStream[String] = {
        // In practice the blacklist should be read from a persistent store; the usual choices are Redis, HBase and MySQL
        val blackList = List((1L, true), (2L, true), (3L, true))
        // Turn the blacklist into an RDD
        val blackListRDD: RDD[(Long, Boolean)] = ssc.sparkContext.parallelize(blackList)
        // Broadcast the blacklist
        val blackListBroadcast: Broadcast[Array[(Long, Boolean)]] = ssc.sparkContext.broadcast(blackListRDD.collect())
        // transform operates on each RDD of the incoming DStream
        logDStream.transform(rdd => {
          // Split each incoming line into key/value form, keyed by userid
          val user_lineRDD: RDD[(Long, String)] = rdd.map(line => {
            val fields: Array[String] = line.split(",")
            (fields(3).toLong, line)
          })
          // Note: after broadcasting, read the broadcast value with .value
          val blackRDD: RDD[(Long, Boolean)] = rdd.sparkContext.parallelize(blackListBroadcast.value)
          /**
            * List((22L, "qwe"), (2L, "asd"), (3L, "zxc"))
            * List((1L, true), (2L, true), (3L, true))
            * leftOuterJoin gives the result below; both sides must be key/value pairs
            * (22,(qwe,None))
            * (3,(zxc,Some(true)))
            * (2,(asd,Some(true)))
            */
          val resultRDD: RDD[(Long, (String, Option[Boolean]))] = user_lineRDD.leftOuterJoin(blackRDD)
          // Return value: only the records whose userid did not match the blacklist
          resultRDD.filter(tuple => {
            tuple._2._2.isEmpty
          }).map(_._2._1)
        })
      }

      /**
        * TODO: dynamically generates the blacklist.
        *
        * @param filterLogDStream the data left after blacklist filtering
        * If [one user] clicks [one ad] more than [100 times] [within one day], that user is a blacklisted user.
        * Three possible approaches: 1) updateStateByKey  2) reduceByKey plus HBase  3) MySQL
        */
      def DynamicGenerationBlacklists(filterLogDStream: DStream[String], spark: SparkSession): Unit = {
        val date_userid_advid_ds: DStream[(String, Long)] = filterLogDStream.map(line => {
          val fields: Array[String] = line.split(",")
          val time = new Date(fields(0).toLong)
          val date: String = DateUtils.formatDateKey(time)
          val userid: String = fields(3)
          val advid: String = fields(4)
          (date + "_" + userid + "_" + advid, 1L)
        }).reduceByKey(_ + _)

        date_userid_advid_ds.foreachRDD(rdd => {
          rdd.foreachPartition(partition => {
            // ConnectionPool is a pre-written utility class that manages MySQL connections
            val connection = ConnectionPool.getConnection()
            val statement = connection.createStatement()
            partition.foreach {
              case (date_userid_advid, count) => {
                val fields = date_userid_advid.split("_")
                val date = fields(0)
                val userid = fields(1).toLong
                val advid = fields(2).toLong
                val sql = s"insert into aura.tmp_advclick_count values('$date',$userid,$advid,$count)"
                statement.execute(sql)
              }
            }
            ConnectionPool.returnConnection(connection)
          })
        })

        /**
          * Generate the blacklist
          */
        val df: DataFrame = spark.read.format("jdbc")
          .option("url", "jdbc:mysql://localhost:3306/aura")
          .option("user", "aura")
          .option("password", "aura")
          .option("dbtable", "tmp_advclick_count")
          .load()
        df.createOrReplaceTempView("tmp_advclick_count")
        val sql =
          """
            select
              userid
            from
            (
              select
                date, userid, advid, sum(click_count) c_count
              from
                tmp_advclick_count
              group by date, userid, advid
            ) t
            where
              t.c_count > 100
          """
        val blacklistdf = spark.sql(sql).distinct()
        val properties = new Properties()
        properties.put("user", "aura")
        properties.put("password", "aura")
        blacklistdf.write.mode(SaveMode.Append)
          .jdbc("jdbc:mysql://localhost:3306/aura", "black_list", properties)
      }
    }
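
The insert statement and the sum(click_count) query above assume two MySQL tables, tmp_advclick_count and black_list, whose definitions the post never shows. A minimal sketch of a schema that would make that code work (the column names and types are inferred from the statements above, not taken from the original project; createTables is a helper name introduced here):

    import java.sql.DriverManager

    // A minimal sketch of the assumed MySQL schema; columns are inferred from the
    // insert statement and the sum(click_count) query above.
    def createTables(): Unit = {
      val connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/aura", "aura", "aura")
      try {
        val statement = connection.createStatement()
        statement.execute(
          """create table if not exists tmp_advclick_count (
            |  date        varchar(8),
            |  userid      bigint,
            |  advid       bigint,
            |  click_count bigint
            |)""".stripMargin)
        statement.execute(
          """create table if not exists black_list (
            |  userid bigint
            |)""".stripMargin)
      } finally {
        connection.close()
      }
    }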

4. Real-time count of ad clicks per province and city per day

Just keep adding the following code after the code above.

    /**
      * Real-time count of ad clicks per province and city per day.
      *
      * @param filterLogDStream the data left after blacklist filtering
      */
    def ProvinceCityAdvClick_Count(filterLogDStream: DStream[String]): DStream[(String, Long)] = {
      // Update function for updateStateByKey: add this batch's count to the accumulated state
      val f = (input: Seq[Long], state: Option[Long]) => {
        val current_count = input.sum
        val last_count = state.getOrElse(0L)
        Some(current_count + last_count)
      }
      filterLogDStream.map(line => {
        val fields = line.split(",")
        val time = fields(0).toLong
        val mydate = new Date(time)
        val date = DateUtils.formatDateKey(mydate)
        val province = fields(1)
        val city = fields(2)
        val advid = fields(4)
        (date + "_" + province + "_" + city + "_" + advid, 1L)
      }).updateStateByKey(f)
      /**
        * If the business needs it, these results can also be written to MySQL or HBase
        */
    }
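
One detail this snippet relies on: updateStateByKey keeps state across batches, so Spark Streaming requires a checkpoint directory to be set on the StreamingContext before the job starts, otherwise it fails at runtime. A one-line sketch (the path is only an example):

    // Required for updateStateByKey; the directory is an example. Any reliable storage
    // path works (HDFS in production, a local path for local testing).
    ssc.checkpoint("hdfs://hadoop04:9000/spark/checkpoint/adv")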

5. Real-time count of hot ads per province

    /**
      * Real-time count of hot ads per province.
      *
      * transform: rdd -> dataframe -> table -> sql
      *
      * @param date_province_city_advid_count output of the previous step, keyed by date_province_city_advid
      */
    def ProvinceAdvClick_Count(date_province_city_advid_count: DStream[(String, Long)], spark: SparkSession): Unit = {
      date_province_city_advid_count.transform(rdd => {
        // Drop the city dimension and re-aggregate per (date, province, advid)
        val date_province_advid_count = rdd.map {
          case (date_province_city_advid, count) => {
            val fields = date_province_city_advid.split("_")
            val date = fields(0)
            val province = fields(1)
            val advid = fields(3)
            (date + "_" + province + "_" + advid, count)
          }
        }.reduceByKey(_ + _)
        val rowRDD = date_province_advid_count.map(tuple => {
          val fields = tuple._1.split("_")
          val date = fields(0)
          val province = fields(1)
          val advid = fields(2).toLong
          val count = tuple._2
          Row(date, province, advid, count)
        })
        val schema = StructType(
          StructField("date", StringType, true) ::
          StructField("province", StringType, true) ::
          StructField("advid", LongType, true) ::
          StructField("count", LongType, true) :: Nil
        )
        val df = spark.createDataFrame(rowRDD, schema)
        df.createOrReplaceTempView("temp_date_province_adv_count")
        val sql =
          """
            select
              *
            from
            (
              select
                date, province, advid, count,
                row_number() over(partition by province order by count desc) rank
              from
                temp_date_province_adv_count
            ) temp
            where temp.rank < 10
          """
        /**
          * Persist the result to a database
          */
        spark.sql(sql)
        rdd
      })
    }
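
The comment above leaves persisting the Top-N result as an exercise, and because transform is lazy, the SQL in ProvinceAdvClick_Count never actually runs unless the returned DStream is consumed by an output operation. A minimal sketch of a variant that uses foreachRDD, so the query executes each batch and the result is appended to MySQL (the function name persistProvinceTopN, the target table province_top10_adv, and the reuse of the connection settings above are assumptions of this sketch, not the original author's code):

    import java.util.Properties
    import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
    import org.apache.spark.sql.{Row, SaveMode, SparkSession}
    import org.apache.spark.streaming.dstream.DStream

    // A minimal sketch: persist the per-province Top 10 every batch. Assumes the
    // "date_province_city_advid" key format produced above and the MySQL settings
    // used elsewhere; the table name province_top10_adv is hypothetical.
    def persistProvinceTopN(date_province_city_advid_count: DStream[(String, Long)],
                            spark: SparkSession): Unit = {
      date_province_city_advid_count.foreachRDD { rdd =>
        // Drop the city dimension and re-aggregate per (date, province, advid)
        val rowRDD = rdd.map { case (key, count) =>
          val fields = key.split("_")
          ((fields(0), fields(1), fields(3).toLong), count)
        }.reduceByKey(_ + _).map { case ((date, province, advid), count) =>
          Row(date, province, advid, count)
        }
        val schema = StructType(
          StructField("date", StringType, true) ::
          StructField("province", StringType, true) ::
          StructField("advid", LongType, true) ::
          StructField("count", LongType, true) :: Nil)
        val df = spark.createDataFrame(rowRDD, schema)
        df.createOrReplaceTempView("temp_date_province_adv_count")
        val topN = spark.sql(
          """
            select date, province, advid, count
            from (
              select date, province, advid, count,
                     row_number() over(partition by province order by count desc) rank
              from temp_date_province_adv_count
            ) t
            where t.rank <= 10
          """)
        val properties = new Properties()
        properties.put("user", "aura")
        properties.put("password", "aura")
        topN.write.mode(SaveMode.Append)
          .jdbc("jdbc:mysql://localhost:3306/aura", "province_top10_adv", properties)
      }
    }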

6. Complete code

    package sparkstreaming.lesson09

    import java.sql.Date
    import java.util.Properties
    import kafka.serializer.StringDecoder
    import org.apache.spark.rdd.RDD
    import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
    import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession}
    import org.apache.spark.streaming.dstream.DStream
    import org.apache.spark.streaming.kafka.KafkaUtils
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.{SparkConf, SparkContext}
    import sparkstreaming.demo.lesson01.ConnectionPool
    import sparkstreaming.demo.utils.DateUtils

    /**
      * Created by Administrator on 2018/5/12.
      *
      * timestamp:
      *   the time at which the user clicked the ad
      * province:
      *   the province in which the user clicked the ad
      * city:
      *   the city in which the user clicked the ad
      * userid:
      *   the unique identifier of the user
      * advid:
      *   the id of the clicked ad
      */
    object AdvApplicationTest {
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf()
        conf.setMaster("local")
        conf.setAppName("AdvApplicationTest")
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") // serialization (Kryo assumed; the original left this setting blank)
        val sc = new SparkContext(conf)
        val ssc = new StreamingContext(sc, Seconds(5))
        val spark = SparkSession.builder()
          .config(conf).getOrCreate()

        /**
          * Step 1: read the data from Kafka (direct approach)
          * K: ClassTag,
          * V: ClassTag,
          * KD <: Decoder[K]: ClassTag,
          * VD <: Decoder[V]: ClassTag] (
          * ssc: StreamingContext,
          * kafkaParams: Map[String, String],
          * topics: Set[String]
          */
        val kafkaParams = Map("metadata.broker.list" -> "hadoop1:9092")
        val topics = Set("aura")
        val logDstream: DStream[String] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
          ssc, kafkaParams, topics).map(_._2)

        /**
          * Step 2: blacklist filtering
          */
        val filterLogDStream: DStream[String] = blackListFilter(logDstream, ssc)

        /**
          * If [one user] clicks [one ad] more than [100 times] [within one day], that user is a blacklisted user.
          *
          * zhangsan:
          *   A:50 B:60          -> not a blacklisted user
          * lisi:
          *   A:50 A:20 A:40     -> a blacklisted user
          * If a user is blacklisted today, is he still blacklisted tomorrow?
          * That depends on the business rules.
          *
          * Step 3: dynamically generate the blacklist, in real time
          */
        DynamicGenerationBlacklists(filterLogDStream, spark)

        /**
          * Step 4:
          * real-time count of ad clicks per province and city per day
          */
        val dateProvinceCityAdvClick_Count = ProvinceCityAdvClick_Count(filterLogDStream)

        /**
          * Step 5:
          * real-time count of the hot ads per province per day
          * (a grouped Top-N)
          *
          * transform / foreachRDD
          * rdd => dataframe
          * Spark SQL:
          *   SQL
          */
        /**
          * Step 6:
          * real-time click trend of each ad over the last hour (sliding window)
          */
        ssc.start()
        ssc.awaitTermination()
        ssc.stop()
      }

      /**
        * Filters out the records of blacklisted users.
        * @param logDstream data read from Kafka
        * @return the data left after blacklist filtering
        */
      def blackListFilter(logDstream: DStream[String], ssc: StreamingContext): DStream[String] = {
        /**
          * The blacklist should really be read from the black_list table in the database
          */
        val blackList = List((1L, true), (2L, true), (3L, true))
        val blackListRDD = ssc.sparkContext.parallelize(blackList)
        val blackListBroadcast = ssc.sparkContext.broadcast(blackListRDD.collect())
        /**
          * The blacklist should come from a persistent store; the usual choices are:
          * 1) Redis
          * 2) HBase
          * 3) MySQL (demonstrated in class)
          * It can be read the SparkCore way, or via SparkSQL -> dataframe -> rdd
          */
        logDstream.transform(rdd => {
          val user_lineRDD = rdd.map(line => {
            val fields = line.split(",")
            (fields(3).toLong, line)
          })
          val blackRDD = rdd.sparkContext.parallelize(blackListBroadcast.value)
          val resultRDD: RDD[(Long, (String, Option[Boolean]))] = user_lineRDD.leftOuterJoin(blackRDD)
          resultRDD.filter(tuple => {
            tuple._2._2.isEmpty
          }).map(_._2._1)
        })
      }

      /**
        * Dynamically generates the blacklist.
        * @param filterLogDStream the data left after blacklist filtering
        * If [one user] clicks [one ad] more than [100 times] [within one day], that user is a blacklisted user.
        *
        * The idea:
        * this requirement is very similar to word count: count, in real time, how many times each key occurs,
        * and once a key reaches 100 it becomes a blacklist entry.
        * Approach 1:
        *   map => (date_userid_advid, 1)
        *   updateStateByKey to keep a running count (needs more memory),
        *   e.g. zhangsan A 80, lisi B 99, ...
        *   then filter out the keys whose count exceeds 100 and write them to MySQL, Redis or HBase
        * Approach 2:
        *   map => (date_userid_advid, 1)
        *   reduceByKey over the current batch only (needs less memory),
        *   then accumulate the per-batch counts in HBase (rowkey: date_userid_advid) or Redis
        * Approach 3:
        *   accumulate the counts in MySQL (the approach used below)
        */
      def DynamicGenerationBlacklists(filterLogDStream: DStream[String], spark: SparkSession): Unit = {
        val date_userid_advid_ds = filterLogDStream.map(line => {
          val fields = line.split(",")
          val time = new Date(fields(0).toLong)
          val date = DateUtils.formatDateKey(time)
          val userid = fields(3)
          val advid = fields(4)
          // e.g. 20180512_<userid>_<advid>
          (date + "_" + userid + "_" + advid, 1L)
        }).reduceByKey(_ + _)

        date_userid_advid_ds.foreachRDD(rdd => {
          rdd.foreachPartition(partition => {
            val connection = ConnectionPool.getConnection()
            val statement = connection.createStatement()
            partition.foreach {
              case (date_userid_advid, count) => {
                val fields = date_userid_advid.split("_")
                val date = fields(0)
                val userid = fields(1).toLong
                val advid = fields(2).toLong
                val sql = s"insert into aura.tmp_advclick_count values('$date',$userid,$advid,$count)"
                statement.execute(sql)
              }
            }
            ConnectionPool.returnConnection(connection)
          })
        })

        /**
          * Generate the blacklist
          */
        val df: DataFrame = spark.read.format("jdbc")
          .option("url", "jdbc:mysql://localhost:3306/aura")
          .option("user", "aura")
          .option("password", "aura")
          .option("dbtable", "tmp_advclick_count")
          .load()
        df.createOrReplaceTempView("tmp_advclick_count")
        val sql =
          """
            SELECT
              userid
            FROM
            (
              SELECT
                date, userid, advid, sum(click_count) c_count
              FROM
                tmp_advclick_count
              GROUP BY
                date, userid, advid
            ) t
            WHERE
              t.c_count > 100
          """
        // The resulting blacklist
        val blacklistdf = spark.sql(sql).distinct()
        val properties = new Properties()
        properties.put("user", "aura")
        properties.put("password", "aura")
        blacklistdf.write.mode(SaveMode.Append)
          .jdbc("jdbc:mysql://localhost:3306/aura", "black_list", properties)
      }

      /**
        * Real-time count of ad clicks per province and city per day.
        * @param filterLogDStream the data left after blacklist filtering
        */
      def ProvinceCityAdvClick_Count(filterLogDStream: DStream[String]): DStream[(String, Long)] = {
        /**
          * The idea:
          * map => (k, v) => date_province_city_advid -> 1
          * updateStateByKey
          */
        val f = (input: Seq[Long], state: Option[Long]) => {
          val current_count = input.sum
          val last_count = state.getOrElse(0L)
          Some(current_count + last_count)
        }
        filterLogDStream.map(line => {
          val fields = line.split(",")
          val time = fields(0).toLong
          val mydate = new Date(time)
          val date = DateUtils.formatDateKey(mydate)
          val province = fields(1)
          val city = fields(2)
          val advid = fields(4)
          (date + "_" + province + "_" + city + "_" + advid, 1L)
        }).updateStateByKey(f)
        /**
          * If the business needs it, these results can also be written to MySQL or HBase
          */
      }

      /**
        * Real-time count of hot ads per province.
        *
        * transform: rdd -> dataframe -> table -> sql
        * @param date_province_city_advid_count output of the previous step
        */
      def ProvinceAdvClick_Count(date_province_city_advid_count: DStream[(String, Long)], spark: SparkSession): Unit = {
        date_province_city_advid_count.transform(rdd => {
          val date_province_advid_count = rdd.map {
            case (date_province_city_advid, count) => {
              val fields = date_province_city_advid.split("_")
              val date = fields(0)
              val province = fields(1)
              val advid = fields(3)
              (date + "_" + province + "_" + advid, count)
            }
          }.reduceByKey(_ + _)
          val rowRDD = date_province_advid_count.map(tuple => {
            val fields = tuple._1.split("_")
            val date = fields(0)
            val province = fields(1)
            val advid = fields(2).toLong
            val count = tuple._2
            Row(date, province, advid, count)
          })
          val schema = StructType(
            StructField("date", StringType, true) ::
            StructField("province", StringType, true) ::
            StructField("advid", LongType, true) ::
            StructField("count", LongType, true) :: Nil
          )
          val df = spark.createDataFrame(rowRDD, schema)
          df.createOrReplaceTempView("temp_date_province_adv_count")
          val sql =
            """
              select
                *
              from
              (
                select
                  date, province, advid, count,
                  row_number() over(partition by province order by count desc) rank
                from
                  temp_date_province_adv_count
              ) temp
              where temp.rank < 10
            """
          /**
            * Persist the result to a database
            */
          spark.sql(sql)
          rdd
        })
      }
    }
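
Step 6 (the click trend of each ad over the last hour) is left as a TODO throughout the post. A minimal sketch of one way to do it with a sliding window, assuming the same log format and the 5-second batch interval used above; the function name AdvClickTrend and the per-minute bucketing are choices of this sketch, not the original project:

    // A minimal sketch of step 6: per-ad clicks over the last hour, bucketed by minute,
    // recomputed every 5-second batch. Uses the same imports as the full code above.
    def AdvClickTrend(filterLogDStream: DStream[String]): DStream[((Long, Long), Long)] = {
      filterLogDStream.map { line =>
        val fields = line.split(",")
        val advid = fields(4).toLong
        val minuteBucket = fields(0).toLong / 60000L // minute bucket derived from the timestamp
        ((advid, minuteBucket), 1L)
      }.reduceByKeyAndWindow(
        (a: Long, b: Long) => a + b, // sum the clicks inside the window
        Seconds(3600),               // window length: one hour
        Seconds(5)                   // slide interval: one batch
      )
    }

With a one-hour window and a 5-second slide, the overload of reduceByKeyAndWindow that also takes an inverse-reduce function would be more efficient, at the cost of requiring a checkpoint directory.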

Reposted from blog.csdn.net/chixushuchu/article/details/85238401