版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/someby/article/details/89007275
目录
本篇文章记录广告点击流量实时统计中的一个环节:基于动态黑名单对点击行为进行过滤。
代码
spark.ad
AdClickRealTimeStatSpark.java
/** * 过滤广告黑名单用户日志 * @param adRealTimeLogDStream * @return */ private static JavaPairDStream<String,String> filterByBlacklist(JavaPairInputDStream<String,String> adRealTimeLogDStream){ // 刚刚接受到原始的用户点击行为日志之后 // 根据mysql中的动态黑名单,进行实时的黑名单过滤(黑名单用户的点击行为,直接过滤掉,不要了) // 使用transform算子(将dstream中的每个batch RDD进行处理,转换为任意的其他RDD,功能很强大) JavaPairDStream<String,String> filteredAdRealTimeLogDStream = adRealTimeLogDStream.transformToPair(new Function<JavaPairRDD<String, String>, JavaPairRDD<String, String>>() { private static final long serialVersionUID = 1L; @Override public JavaPairRDD<String, String> call(JavaPairRDD<String, String> rdd) throws Exception { // 首先,从mysql中查询所有黑名单用户,将其转换为一个rdd IAdBlacklistDAO adBlacklistDAO = DAOFactory.getAdBlacklistDAO(); List<AdBlacklist> adBlacklists = adBlacklistDAO.findAll(); List<Tuple2<Long,Boolean>> tuples = new ArrayList<Tuple2<Long, Boolean>>(); for (AdBlacklist adBlacklist : adBlacklists){ tuples.add(new Tuple2<Long,Boolean>(adBlacklist.getUserid(),true)); } JavaSparkContext sc = new JavaSparkContext(rdd.context()); JavaPairRDD<Long,Boolean> blacklistRDD = sc.parallelizePairs(tuples); // 将原始数据rdd映射成<userid, tuple2<string, string>> JavaPairRDD<Long,Tuple2<String,String>> mappedRDD = rdd.mapToPair(new PairFunction<Tuple2<String, String>, Long, Tuple2<String, String>>() { private static final long serialVersionUID = 1L; @Override public Tuple2<Long, Tuple2<String, String>> call(Tuple2<String, String> tuple) throws Exception { String log = tuple._2; String[] logSplited = log.split("_"); long userid = Long.valueOf(logSplited[3]); return new Tuple2<Long,Tuple2<String, String>>(userid,tuple); } }); // 将原始日志数据rdd,与黑名单rdd,进行左外连接 // 如果说原始日志的userid,没有在对应的黑名单中,join不到,左外连接 // 用inner join,内连接,会导致数据丢失 JavaPairRDD<Long, Tuple2<Tuple2<String, String>, Optional<Boolean>>> joinedRDD = mappedRDD.leftOuterJoin(blacklistRDD); JavaPairRDD<Long,Tuple2<Tuple2<String,String>,Optional<Boolean>>> filteredRDD = joinedRDD.filter(new Function<Tuple2<Long, 
Tuple2<Tuple2<String, String>, Optional<Boolean>>>, Boolean>() { private static final long serialVersionUID = 1L; @Override public Boolean call(Tuple2<Long, Tuple2<Tuple2<String, String>, Optional<Boolean>>> tuple) throws Exception { Optional<Boolean> optional = tuple._2._2; // 如果这个值存在,那么说明原始日志中的userid,join到了某个黑名单用户 if (optional.isPresent() && optional.get()){ return false; } return true; } }); JavaPairRDD<String,String> resultRDD = filteredRDD.mapToPair(new PairFunction<Tuple2<Long, Tuple2<Tuple2<String, String>, Optional<Boolean>>>, String, String>() { private static final long serialVersionUID = 1L; @Override public Tuple2<String, String> call(Tuple2<Long, Tuple2<Tuple2<String, String>, Optional<Boolean>>> tuple) throws Exception { return tuple._2._1; } }); return resultRDD; } }); return filteredAdRealTimeLogDStream; }