基于Flink实时项目:用户行为分析(二:实时流量统计)

1.需求:

实时流量统计:每隔 5 秒输出最近 10 分钟内访问量最多的前 N 个 URL(与上一篇的热门商品统计非常类似)。
废话不多说,直接上代码

2.代码实现

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.api.java.tuple.{Tuple, Tuple1}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

import java.sql.Timestamp
import java.text.SimpleDateFormat
import scala.collection.mutable.ListBuffer

/**
 * One parsed access-log record.
 *
 * Job requirement this model serves: real-time traffic statistics — every 5 seconds,
 * output the top N most visited URLs of the last 10 minutes (similar to the
 * hot-items statistics).
 *
 * @param ip        client address field of the log line
 * @param userId    user identifier field of the log line
 * @param eventTime time of the request as epoch milliseconds
 * @param method    HTTP method of the request (e.g. "GET")
 * @param url       requested URL
 */
case class ApacheLogEvent(
  ip: String,
  userId: String,
  eventTime: Long,
  method: String,
  url: String
)

/**
 * Number of visits one URL received within one window.
 *
 * @param url       the URL that was counted
 * @param windowEnd end timestamp of the window the count belongs to
 * @param count     number of visits counted for `url` in that window
 */
case class UrlViewCount(
  url: String,
  windowEnd: Long,
  count: Long
)

/**
 * Real-time traffic statistics job: every 5 seconds it prints the 5 most visited URLs
 * of the last 10 minutes, based on the event time found in the access log.
 */
object NetworkFlow {

  /** Log file that is read when no path is passed on the command line. */
  private val DefaultLogPath = "E:\\WY\\programme\\MusicProject\\src\\main\\resources\\apache.log"

  /**
   * Builds and runs the streaming job.
   *
   * @param args optional; when present, `args(0)` is the path of the log file to read
   *             instead of the built-in default
   */
  def main(args: Array[String]): Unit = {

    val env = StreamExecutionEnvironment.getExecutionEnvironment

    env.setParallelism(1)
    // Flink releases before 1.12 default to processing time. Without this call the
    // sliding windows below would not be driven by the timestamps/watermarks assigned
    // further down, so the result would not reflect the times recorded in the log.
    env.setStreamTimeCharacteristic(org.apache.flink.streaming.api.TimeCharacteristic.EventTime)

    // Read the raw log lines; the path can be overridden by the first program argument.
    val logPath = args.headOption.getOrElse(DefaultLogPath)
    val data = env.readTextFile(logPath)

    // The time in the raw log has the form "dd/MM/yyyy:HH:mm:ss"; it is parsed into an
    // epoch-millisecond timestamp. SimpleDateFormat is not thread-safe, so a fresh
    // instance is created inside the function instead of being shared.
    val dataStream = data.map(line => {
      val arr = line.split(" ")
      val simpleDateFormat = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss")
      val timestamp = simpleDateFormat.parse(arr(3)).getTime
      ApacheLogEvent(arr(0), arr(2), timestamp, arr(5), arr(6))
    })

    // Assign event timestamps and watermarks that tolerate 1 second of out-of-orderness.
    dataStream.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[ApacheLogEvent](Time.milliseconds(1000)) {
      override def extractTimestamp(element: ApacheLogEvent): Long = element.eventTime
    })
      .filter(_.method == "GET")
      .keyBy("url")
      // 10-minute windows that slide every 5 seconds
      .timeWindow(Time.minutes(10),Time.seconds(5))
      .aggregate(new count(),new WindowresultFunction())
      // Group all per-URL counts of the same window together (field index 1 of UrlViewCount).
      .keyBy("windowEnd")
      .process(new TopNHotUrls(5))
      .print()

    env.execute()

  }
}
/**
 * Incremental aggregate that counts the events of a window.
 *
 * The accumulator is the running number of events seen so far; the content of the
 * individual event is never inspected.
 */
class count() extends AggregateFunction[ApacheLogEvent,Long,Long]{

  /** A new window starts with zero events. */
  override def createAccumulator(): Long = 0L

  /** The accumulator already is the final count. */
  override def getResult(accumulator: Long): Long = accumulator

  /** Each incoming event raises the count by one, whatever it contains. */
  override def add(value: ApacheLogEvent, accumulator: Long): Long = {
    1L + accumulator
  }

  /** Two partial counts combine by addition. */
  override def merge(a: Long, b: Long): Long = {
    b + a
  }
}

/**
 * Turns the pre-aggregated count of one (URL, window) pair into a [[UrlViewCount]]
 * that also carries the URL and the end timestamp of the window.
 */
class WindowresultFunction() extends WindowFunction[Long,UrlViewCount,Tuple,TimeWindow] {

  /**
   * @param key    key of the window; a one-field tuple holding the URL
   * @param window the window that fired
   * @param input  aggregation result; only its first element is read
   * @param out    collector receiving the resulting [[UrlViewCount]]
   */
  override def apply(key: Tuple, window: TimeWindow, input: Iterable[Long], out: Collector[UrlViewCount]): Unit = {
    // The key arrives as a generic Tuple and is unwrapped to reach the URL string.
    val urlKey = key.asInstanceOf[Tuple1[String]]
    val visits = input.iterator.next()
    val record = UrlViewCount(urlKey.f0, window.getEnd, visits)
    out.collect(record)
  }
}
/**
 * Buffers the [[UrlViewCount]] records of the current key and, when the timer for their
 * window fires, emits a formatted ranking of the `topSize` most visited URLs.
 *
 * @param topSize maximum number of URLs listed in each emitted ranking
 */
class TopNHotUrls(topSize:Int) extends KeyedProcessFunction[Tuple,UrlViewCount,String] {

  /** Keyed list state holding every count received since the state was last cleared. */
  lazy val urlState:ListState[UrlViewCount] = getRuntimeContext.getListState(new ListStateDescriptor[UrlViewCount]("urlState",classOf[UrlViewCount]))

  /**
   * Stores the incoming count and registers an event-time timer just after the end of
   * the window it belongs to.
   *
   * @param value the per-URL count to buffer
   * @param ctx   context giving access to the timer service
   * @param out   unused here; output is produced in `onTimer`
   */
  override def processElement(value: UrlViewCount, ctx: KeyedProcessFunction[Tuple, UrlViewCount, String]#Context, out: Collector[String]): Unit = {

    // Keep every record in state until the timer fires.
    urlState.add(value)
    ctx.timerService().registerEventTimeTimer(value.windowEnd + 1)
  }

  /**
   * Sorts the buffered counts in descending order, keeps the first `topSize` entries and
   * emits them as one formatted string; the state is cleared in the process.
   *
   * @param timestamp the time the timer was registered for (window end + 1)
   * @param ctx       timer context (not used)
   * @param out       collector receiving the formatted ranking
   */
  override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Tuple, UrlViewCount, String]#OnTimerContext, out: Collector[String]): Unit = {

    // Explicit converters instead of the deprecated implicit JavaConversions.
    import scala.collection.JavaConverters._

    // Snapshot all buffered counts. Option(...) guards against a null result for empty
    // state (assumption: some ListState implementations may return null — TODO confirm).
    val allUrlViews: List[UrlViewCount] =
      Option(urlState.get).map(_.asScala.toList).getOrElse(Nil)

    // The snapshot above is all that is needed, so the state can be released.
    urlState.clear()

    // Highest count first, limited to the requested number of entries.
    val sortedUrlViews = allUrlViews.sortBy(_.count)(Ordering.Long.reverse).take(topSize)

    // Format the ranking as a single String so it is easy to print.
    val result:StringBuilder = new StringBuilder
    result.append("=====================================\n")

    // The timer was registered at windowEnd + 1, so subtract 1 to show the window end.
    result.append("窗口关闭时间:").append(new Timestamp(timestamp - 1)).append("\n")

    sortedUrlViews.zipWithIndex.foreach { case (currentUrlView, index) =>
      result.append("No").append(index + 1).append(":")
        .append("URL=").append(currentUrlView.url)
        .append("流量=").append(currentUrlView.count).append("\n")
    }

    result.append("======================================\n\n")

    // Throttles the output to imitate results scrolling in real time.
    // NOTE(review): this blocks the calling thread for one second per ranking; it is
    // kept only for demo pacing and should be dropped for real workloads.
    Thread.sleep(1000)
    out.collect(result.toString())
  }
}

3.输出结果展示

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

猜你喜欢

转载自blog.csdn.net/weixin_48929324/article/details/117339169