HIVE SQL 计算留存率 思路

问题

计算每日访问用户在之后各日期(第 1、2、3、7、15、30 日)的留存数。

基础表:每日访问用户 ID(按日期 p_day 分区,每个分区存放当日访问过的用户 ID)

-- Base table of daily visiting users: each p_day partition holds the user ids
-- recorded for that calendar day. Data files are stored as Parquet.
create external table if not exists user_visit_date
(
    user_id bigint comment '用户ID'
)
comment '每日访问用户'
partitioned by
(
    p_day date comment '分区日期'
)
stored as parquet;

解决思考

Step 1. 先算出过去 30 天内每个历史访问日期的用户在昨日的留存数(关键思路):将昨日的访问用户与历史各日的访问用户按 user_id 自关联,再按历史访问日期分组计数。

-- Intermediate retention table. Within a visit_date partition, a row pairs a
-- historical visit date (before_visit_date) with the number of its users that
-- were retained on visit_date (remain_count). Data files are stored as Parquet.
create external table if not exists user_before_visit_date
(
    before_visit_date date   comment '历史访问日期',
    remain_count      bigint comment '在visit_date留存人数'
)
comment '历史访问日期在visit_date的留存人数'
partitioned by
(
    visit_date date comment '分区日期'
)
stored as parquet;

-- Step 1: rebuild the visit_date = ${yesterday} partition of user_before_visit_date.
-- Self-join of user_visit_date on user_id: yday is the set of users seen on
-- ${yesterday}, hist is the same users on each of the 30 preceding days.
-- Output is one row per earlier day with the number of matched rows.
-- NOTE(review): ${yesterday} is used unquoted, so it presumably expands to a
-- quoted date literal. Confirm against the scheduler that substitutes it.
-- NOTE(review): the count equals a user count only if user_visit_date holds at
-- most one row per user per day. Verify against the job that loads it.
insert overwrite table user_before_visit_date partition (visit_date = ${yesterday})
select
    hist.p_day as before_visit_date,
    count(*)   as remain_count
from user_visit_date yday
inner join user_visit_date hist
    on  yday.user_id = hist.user_id
    -- earlier days only, half-open range [yesterday - 30, yesterday)
    and hist.p_day >= date_sub(${yesterday}, 30)
    and hist.p_day <  ${yesterday}
where yday.p_day = ${yesterday}
group by hist.p_day
;

Step 2. 读取最近 31 个 visit_date 分区,按历史访问日期汇总(group by)并行转列,得到各历史日期的 1/2/3/7/15/30 日留存数;结果的最后一列 p_day 可作为动态分区字段写入结果表。

-- Step 2: pivot user_before_visit_date into one row per historical visit date
-- (returned as p_day) with its 1/2/3/7/15/30-day retained-user counts.
-- datediff(visit_date, before_visit_date) is the retention day number. Each
-- (before_visit_date, day number) pair comes from a single visit_date
-- partition, so max() only picks that value, or 0 when no such row was read.
-- A 0 therefore also covers day numbers that have not been reached yet.
-- Aliases start with a digit, so they are back-quoted to stay valid identifiers.
select
    max(if(datediff(visit_date, before_visit_date) = 1,  remain_count, 0)) as `1_day_remain_count`,  -- 1-day retained users
    max(if(datediff(visit_date, before_visit_date) = 2,  remain_count, 0)) as `2_day_remain_count`,  -- 2-day retained users
    max(if(datediff(visit_date, before_visit_date) = 3,  remain_count, 0)) as `3_day_remain_count`,  -- 3-day retained users
    max(if(datediff(visit_date, before_visit_date) = 7,  remain_count, 0)) as `7_day_remain_count`,  -- 7-day retained users
    max(if(datediff(visit_date, before_visit_date) = 15, remain_count, 0)) as `15_day_remain_count`, -- 15-day retained users
    max(if(datediff(visit_date, before_visit_date) = 30, remain_count, 0)) as `30_day_remain_count`, -- 30-day retained users
    before_visit_date as p_day
from user_before_visit_date
where visit_date >= date_sub(${yesterday}, 30)
  and visit_date <= ${yesterday}
  -- The partitions read above also carry historical dates back to yesterday - 60.
  -- For those older dates the partitions holding their early day numbers are
  -- outside the window, so their rows would come out partly zero. Keep only
  -- historical dates inside the same 30-day window.
  and before_visit_date >= date_sub(${yesterday}, 30)
group by before_visit_date
;
发布了53 篇原创文章 · 获赞 50 · 访问量 2万+

猜你喜欢

转载自blog.csdn.net/xw514124202/article/details/93767984
今日推荐