package com.qf.gp1707.day06
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Requirement: within a given time range, compute how long each user stayed at
 * every base station (lac), then keep each user's top 2 stations.
 *
 * Steps:
 *  1. Sum the per-(phone, lac) signed timestamps to get the stay duration.
 *  2. Join the longitude/latitude of each base station.
 *  3. Group the records by phone number.
 *  4. Sort each group by duration (descending) and take the first 2.
 */
object MobileLocation {

  /**
   * Runs the job locally (`local[2]`), printing the per-phone top-2 result to stdout.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("mobilelocation")
      .setMaster("local[2]")
    val sc = new SparkContext(conf)

    try {
      // Load the user/base-station log.
      val files: RDD[String] =
        sc.textFile("./src/main/scala/com/qf/gp1707/day06/lacduration/log")

      val phoneAndLacAndTime: RDD[((String, String), Long)] = files.map { line =>
        val fields: Array[String] = line.split(",") // split one log line
        val phone = fields(0)                       // user's phone number
        val time = fields(1).toLong                 // timestamp
        val lac = fields(2)                         // base station ID
        val eventType = fields(3)                   // event type
        // eventType is a String, so it must be compared with "1". Comparing it with
        // the Int literal 1 is always false and would leave every timestamp positive.
        // Rows flagged "1" are negated so that summing per key yields a difference
        // (presumably "1" marks entering the station -- confirm against the data).
        val signedTime = if (eventType == "1") -time else time
        ((phone, lac), signedTime)
      }

      // Total stay duration of each user at each base station.
      val sumedPhoneAndLacAndTime: RDD[((String, String), Long)] =
        phoneAndLacAndTime.reduceByKey(_ + _)

      // Move lac into the key position so it can be joined with the coordinates.
      val lacAndPhoneAndTime: RDD[(String, (String, Long))] =
        sumedPhoneAndLacAndTime.map { case ((phone, lac), time) => (lac, (phone, time)) }

      // Load the base-station reference data: lac, longitude, latitude.
      val lacInfo: RDD[String] =
        sc.textFile("./src/main/scala/com/qf/gp1707/day06/lacduration/lac_info.txt")
      val lacAndXY: RDD[(String, (String, String))] = lacInfo.map { line =>
        val fields = line.split(",")
        val lac = fields(0) // base station ID
        val x = fields(1)   // longitude
        val y = fields(2)   // latitude
        (lac, (x, y))
      }

      // Attach the coordinates to each (phone, duration) record.
      val joined: RDD[(String, ((String, Long), (String, String)))] =
        lacAndPhoneAndTime.join(lacAndXY)

      // Drop the lac key; keep phone, duration and coordinates.
      val phoneAndTimeAndXY: RDD[(String, Long, (String, String))] =
        joined.map { case (_, ((phone, time), xy)) => (phone, time, xy) }

      // Group by phone number.
      val grouped: RDD[(String, Iterable[(String, Long, (String, String))])] =
        phoneAndTimeAndXY.groupBy(_._1)

      // Sort each group by duration, descending, and keep the top 2.
      val top2: RDD[(String, List[(String, Long, (String, String))])] =
        grouped.mapValues(_.toList.sortBy(_._2)(Ordering[Long].reverse).take(2))

      println(top2.collect().toBuffer)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}
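// Spark implementation: within a given time range, compute how long each user stayed
// at every base station (lac), then take the top 2.
// Reposted from blog.csdn.net/weixin_40903057/article/details/88422609
// (Blog page navigation residue: "You may also like", "Today's picks", "Weekly ranking".)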