MySQL求连续登陆天数-鹅厂面试题

一.需求

求每个用户的最长连续登录天数,两个日期的间隔小于或等于 3 均视为连续登录。比如 01-01 号登录,最近的下一次登录是 01-04 号,两个日期的间隔等于 3 天,因此这两个日期之间的天数都算作活跃天数,一共 4 天。

因为考虑MySQL版本问题,暂时不能使用分析函数。

二.解决方案

测试数据:

create table test_login(user_id int,login_date timestamp);

insert into test_login values (1,'2020-01-01 00:01:00');
insert into test_login values (1,'2020-01-01 00:02:00');
insert into test_login values (1,'2020-01-01 00:03:00');
insert into test_login values (1,'2020-01-02 00:03:00');
insert into test_login values (1,'2020-01-05 00:03:00');
insert into test_login values (1,'2020-01-07 00:03:00');
insert into test_login values (1,'2020-01-11 00:03:00');
insert into test_login values (1,'2020-01-12 00:03:00');
insert into test_login values (1,'2020-01-13 00:03:00');
insert into test_login values (1,'2020-01-14 00:03:00');
insert into test_login values (1,'2020-01-17 00:03:00');
insert into test_login values (1,'2020-01-18 00:03:00');
insert into test_login values (1,'2020-01-19 00:03:00');
insert into test_login values (1,'2020-01-22 00:03:00');

insert into test_login values (2,'2020-01-01 00:04:00');
insert into test_login values (2,'2020-01-01 00:05:00');
insert into test_login values (2,'2020-01-01 00:06:00');
insert into test_login values (2,'2020-01-02 00:06:00');
insert into test_login values (2,'2020-01-07 00:06:00');
insert into test_login values (2,'2020-01-11 00:06:00');
insert into test_login values (2,'2020-01-12 00:06:00');
insert into test_login values (2,'2020-01-14 00:06:00');
insert into test_login values (2,'2020-01-18 00:06:00');
insert into test_login values (2,'2020-01-19 00:06:00');
insert into test_login values (2,'2020-01-22 00:06:00');

代码

SELECT user_id,
       max(diff_days) + 1 max_diff_days
  from 
(
SELECT user_id,
       flag2,
       min(login_date) min_login_date,
       max(login_date) max_login_date,
       datediff(max(login_date),min(login_date)) diff_days
  from 
(
SELECT tmp3.user_id,
       tmp3.login_date,
       tmp3.flag1,
       sum(tmp4.flag1) flag2
  from 
(
SELECT user_id,
       login_date,
       case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
  from 
(
SELECT  user_id,
        login_date,
        (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
from 
(
SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
  from test_login 
 group by user_id,date_format(login_date,'%Y-%m-%d')
) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
) tmp2  -- 通过标量子查询实现上一天的登陆日期
) tmp3  -- 打标记 如上下间隔<=3 则为1  否则为0
left join 
(
SELECT user_id,
       login_date,
       case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
  from 
(
SELECT  user_id,
        login_date,
        (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
from 
(
SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
  from test_login 
 group by user_id,date_format(login_date,'%Y-%m-%d')
) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
) tmp2  -- 通过标量子查询实现上一天的登陆日期
) tmp4  -- 功能同tmp3 
on  tmp3.user_id = tmp4.user_id
and tmp4.login_date <= tmp3.login_date
group by tmp3.user_id,tmp3.login_date,tmp3.flag1
) tmp5 -- 每一个用户  每一个连续区间设置为不同的flag  标记为flag2
group by user_id,flag2
) tmp6 -- 计算出每个连续区间 最大值和最小值  以及连续间隔天数
group by user_id

这样看起来是不是觉得非常的复杂,下面我们拆解开

2.1 同一天多次登陆去重

考虑同一天会有多次登陆的,此处用group by语句进行去重,同一天只保留一条记录

代码:

SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
  from test_login 
 group by user_id,date_format(login_date,'%Y-%m-%d')

测试记录:

mysql> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
    ->   from test_login 
    ->  group by user_id,date_format(login_date,'%Y-%m-%d');
+---------+------------+
| user_id | login_date |
+---------+------------+
|       1 | 2020-01-01 |
|       1 | 2020-01-02 |
|       1 | 2020-01-05 |
|       1 | 2020-01-07 |
|       1 | 2020-01-11 |
|       1 | 2020-01-12 |
|       1 | 2020-01-13 |
|       1 | 2020-01-14 |
|       1 | 2020-01-17 |
|       1 | 2020-01-18 |
|       1 | 2020-01-19 |
|       1 | 2020-01-22 |
|       2 | 2020-01-01 |
|       2 | 2020-01-02 |
|       2 | 2020-01-07 |
|       2 | 2020-01-11 |
|       2 | 2020-01-12 |
|       2 | 2020-01-14 |
|       2 | 2020-01-18 |
|       2 | 2020-01-19 |
|       2 | 2020-01-22 |
+---------+------------+
21 rows in set (0.00 sec)

mysql> 

2.2 求上一次的登陆日期

我们需要找到上一次登陆日期,与此次的登陆日期对比,才可以判断是否符合小于等于3天的条件
此处,我们可以使用标量子查询来实现
需要记录的每个用户的第一条登陆信息上一天为null

代码:

SELECT  user_id,
        login_date,
        (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
from 
(
SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
  from test_login 
 group by user_id,date_format(login_date,'%Y-%m-%d')
) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条

测试记录:

mysql> SELECT  user_id,
    ->         login_date,
    ->         (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
    -> from 
    -> (
    -> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
    ->   from test_login 
    ->  group by user_id,date_format(login_date,'%Y-%m-%d')
    -> ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
    -> ;
+---------+------------+---------------+
| user_id | login_date | up_login_date |
+---------+------------+---------------+
|       1 | 2020-01-01 | NULL          |
|       1 | 2020-01-02 | 2020-01-01    |
|       1 | 2020-01-05 | 2020-01-02    |
|       1 | 2020-01-07 | 2020-01-05    |
|       1 | 2020-01-11 | 2020-01-07    |
|       1 | 2020-01-12 | 2020-01-11    |
|       1 | 2020-01-13 | 2020-01-12    |
|       1 | 2020-01-14 | 2020-01-13    |
|       1 | 2020-01-17 | 2020-01-14    |
|       1 | 2020-01-18 | 2020-01-17    |
|       1 | 2020-01-19 | 2020-01-18    |
|       1 | 2020-01-22 | 2020-01-19    |
|       2 | 2020-01-01 | NULL          |
|       2 | 2020-01-02 | 2020-01-01    |
|       2 | 2020-01-07 | 2020-01-02    |
|       2 | 2020-01-11 | 2020-01-07    |
|       2 | 2020-01-12 | 2020-01-11    |
|       2 | 2020-01-14 | 2020-01-12    |
|       2 | 2020-01-18 | 2020-01-14    |
|       2 | 2020-01-19 | 2020-01-18    |
|       2 | 2020-01-22 | 2020-01-19    |
+---------+------------+---------------+
21 rows in set (0.00 sec)

2.3 打标记判断是否间隔天数<=3

求出上一个登陆日期,与本次登陆日期比较,如果<=3,则标记为0,否则标记为1
需要注意的是上一步的空值直接标记为1
这样通过flag1 我们就可以看到连续登陆的区间了

代码:

SELECT user_id,
       login_date,
       case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
  from 
(
SELECT  user_id,
        login_date,
        (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
from 
(
SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
  from test_login 
 group by user_id,date_format(login_date,'%Y-%m-%d')
) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
) tmp2  -- 通过标量子查询实现上一天的登陆日期

测试记录:

mysql> SELECT user_id,
    ->        login_date,
    ->        case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
    ->   from 
    -> (
    -> SELECT  user_id,
    ->         login_date,
    ->         (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
    -> from 
    -> (
    -> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
    ->   from test_login 
    ->  group by user_id,date_format(login_date,'%Y-%m-%d')
    -> ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
    -> ) tmp2  -- 通过标量子查询实现上一天的登陆日期
    -> ;
+---------+------------+-------+
| user_id | login_date | flag1 |
+---------+------------+-------+
|       1 | 2020-01-01 |     1 |
|       1 | 2020-01-02 |     0 |
|       1 | 2020-01-05 |     0 |
|       1 | 2020-01-07 |     0 |
|       1 | 2020-01-11 |     1 |
|       1 | 2020-01-12 |     0 |
|       1 | 2020-01-13 |     0 |
|       1 | 2020-01-14 |     0 |
|       1 | 2020-01-17 |     0 |
|       1 | 2020-01-18 |     0 |
|       1 | 2020-01-19 |     0 |
|       1 | 2020-01-22 |     0 |
|       2 | 2020-01-01 |     1 |
|       2 | 2020-01-02 |     0 |
|       2 | 2020-01-07 |     1 |
|       2 | 2020-01-11 |     1 |
|       2 | 2020-01-12 |     0 |
|       2 | 2020-01-14 |     0 |
|       2 | 2020-01-18 |     1 |
|       2 | 2020-01-19 |     0 |
|       2 | 2020-01-22 |     0 |
+---------+------------+-------+
21 rows in set (0.01 sec)

2.4 打分组的标记

因为同一个用户存在多个连续登陆的区间,根据上一个步骤的flag1没办法区分开,此时需要区分开同一个用户的不同连续登陆区间。

因为MySQL版本不支持分析函数,只能通过临时表表连接的方式实现,再考虑去除重复,需要用到分组语句进行去重。

其实flag2是这个解决方案核心所在,同一个用户第一次连续登陆区间标记为1,第二次则累加为2,以此类推。

代码:

SELECT tmp3.user_id,
       tmp3.login_date,
       tmp3.flag1,
       sum(tmp4.flag1) flag2
  from 
(
SELECT user_id,
       login_date,
       case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
  from 
(
SELECT  user_id,
        login_date,
        (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
from 
(
SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
  from test_login 
 group by user_id,date_format(login_date,'%Y-%m-%d')
) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
) tmp2  -- 通过标量子查询实现上一天的登陆日期
) tmp3  -- 打标记 如上下间隔<=3 则为1  否则为0
left join 
(
SELECT user_id,
       login_date,
       case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
  from 
(
SELECT  user_id,
        login_date,
        (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
from 
(
SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
  from test_login 
 group by user_id,date_format(login_date,'%Y-%m-%d')
) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
) tmp2  -- 通过标量子查询实现上一天的登陆日期
) tmp4  -- 功能同tmp3 
on  tmp3.user_id = tmp4.user_id
and tmp4.login_date <= tmp3.login_date
group by tmp3.user_id,tmp3.login_date,tmp3.flag1

测试记录:

mysql> SELECT tmp3.user_id,
    ->        tmp3.login_date,
    ->        tmp3.flag1,
    ->        sum(tmp4.flag1) flag2
    ->   from 
    -> (
    -> SELECT user_id,
    ->        login_date,
    ->        case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
    ->   from 
    -> (
    -> SELECT  user_id,
    ->         login_date,
    ->         (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
    -> from 
    -> (
    -> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
    ->   from test_login 
    ->  group by user_id,date_format(login_date,'%Y-%m-%d')
    -> ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
    -> ) tmp2  -- 通过标量子查询实现上一天的登陆日期
    -> ) tmp3  -- 打标记 如上下间隔<=3 则为1  否则为0
    -> left join 
    -> (
    -> SELECT user_id,
    ->        login_date,
    ->        case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
    ->   from 
    -> (
    -> SELECT  user_id,
    ->         login_date,
    ->         (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
    -> from 
    -> (
    -> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
    ->   from test_login 
    ->  group by user_id,date_format(login_date,'%Y-%m-%d')
    -> ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
    -> ) tmp2  -- 通过标量子查询实现上一天的登陆日期
    -> ) tmp4  -- 功能同tmp3 
    -> on  tmp3.user_id = tmp4.user_id
    -> and tmp4.login_date <= tmp3.login_date
    -> group by tmp3.user_id,tmp3.login_date,tmp3.flag1;
+---------+------------+-------+-------+
| user_id | login_date | flag1 | flag2 |
+---------+------------+-------+-------+
|       1 | 2020-01-01 |     1 |     1 |
|       1 | 2020-01-02 |     0 |     1 |
|       1 | 2020-01-05 |     0 |     1 |
|       1 | 2020-01-07 |     0 |     1 |
|       1 | 2020-01-11 |     1 |     2 |
|       1 | 2020-01-12 |     0 |     2 |
|       1 | 2020-01-13 |     0 |     2 |
|       1 | 2020-01-14 |     0 |     2 |
|       1 | 2020-01-17 |     0 |     2 |
|       1 | 2020-01-18 |     0 |     2 |
|       1 | 2020-01-19 |     0 |     2 |
|       1 | 2020-01-22 |     0 |     2 |
|       2 | 2020-01-01 |     1 |     1 |
|       2 | 2020-01-02 |     0 |     1 |
|       2 | 2020-01-07 |     1 |     2 |
|       2 | 2020-01-11 |     1 |     3 |
|       2 | 2020-01-12 |     0 |     3 |
|       2 | 2020-01-14 |     0 |     3 |
|       2 | 2020-01-18 |     1 |     4 |
|       2 | 2020-01-19 |     0 |     4 |
|       2 | 2020-01-22 |     0 |     4 |
+---------+------------+-------+-------+
21 rows in set (0.01 sec)

2.5 分组求最大的间隔天数

其实有了上一个步骤的flag2,就可以判断区分开每一个用户的每一个连续登陆区间,直接进行分组就可以求出该连续区间 最大和最小的登陆日期,两个日期差值就是间隔天数。最后根据user_id进行分组,求最大的间隔天数就是最终结果。

需要审题 01-01到01-04 间隔是3天,但是算4天连续登陆 所以datediff函数的结果要+1

代码:

SELECT user_id,
       max(diff_days) +1 max_diff_days
  from 
(
SELECT user_id,
       flag2,
       min(login_date) min_login_date,
       max(login_date) max_login_date,
       datediff(max(login_date),min(login_date)) diff_days
  from 
(
SELECT tmp3.user_id,
       tmp3.login_date,
       tmp3.flag1,
       sum(tmp4.flag1) flag2
  from 
(
SELECT user_id,
       login_date,
       case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
  from 
(
SELECT  user_id,
        login_date,
        (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
from 
(
SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
  from test_login 
 group by user_id,date_format(login_date,'%Y-%m-%d')
) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
) tmp2  -- 通过标量子查询实现上一天的登陆日期
) tmp3  -- 打标记 如上下间隔<=3 则为1  否则为0
left join 
(
SELECT user_id,
       login_date,
       case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
  from 
(
SELECT  user_id,
        login_date,
        (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
from 
(
SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
  from test_login 
 group by user_id,date_format(login_date,'%Y-%m-%d')
) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
) tmp2  -- 通过标量子查询实现上一天的登陆日期
) tmp4  -- 功能同tmp3 
on  tmp3.user_id = tmp4.user_id
and tmp4.login_date <= tmp3.login_date
group by tmp3.user_id,tmp3.login_date,tmp3.flag1
) tmp5 -- 每一个用户  每一个连续区间设置为不同的flag  标记为flag2
group by user_id,flag2
) tmp6 -- 计算出每个连续区间 最大值和最小值  以及连续间隔天数
group by user_id

测试记录:

mysql> SELECT user_id,
    ->        flag2,
    ->        min(login_date) min_login_date,
    ->        max(login_date) max_login_date,
    ->        datediff(max(login_date),min(login_date)) diff_days
    ->   from 
    -> (
    -> SELECT tmp3.user_id,
    ->        tmp3.login_date,
    ->        tmp3.flag1,
    ->        sum(tmp4.flag1) flag2
    ->   from 
    -> (
    -> SELECT user_id,
    ->        login_date,
    ->        case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
    ->   from 
    -> (
    -> SELECT  user_id,
    ->         login_date,
    ->         (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
    -> from 
    -> (
    -> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
    ->   from test_login 
    ->  group by user_id,date_format(login_date,'%Y-%m-%d')
    -> ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
    -> ) tmp2  -- 通过标量子查询实现上一天的登陆日期
    -> ) tmp3  -- 打标记 如上下间隔<=3 则为1  否则为0
    -> left join 
    -> (
    -> SELECT user_id,
    ->        login_date,
    ->        case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
    ->   from 
    -> (
    -> SELECT  user_id,
    ->         login_date,
    ->         (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
    -> from 
    -> (
    -> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
    ->   from test_login 
    ->  group by user_id,date_format(login_date,'%Y-%m-%d')
    -> ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
    -> ) tmp2  -- 通过标量子查询实现上一天的登陆日期
    -> ) tmp4  -- 功能同tmp3 
    -> on  tmp3.user_id = tmp4.user_id
    -> and tmp4.login_date <= tmp3.login_date
    -> group by tmp3.user_id,tmp3.login_date,tmp3.flag1
    -> ) tmp5 -- 每一个用户  每一个连续区间设置为不同的flag  标记为flag2
    -> group by user_id,flag2;
+---------+-------+----------------+----------------+-----------+
| user_id | flag2 | min_login_date | max_login_date | diff_days |
+---------+-------+----------------+----------------+-----------+
|       1 |     1 | 2020-01-01     | 2020-01-07     |         6 |
|       1 |     2 | 2020-01-11     | 2020-01-22     |        11 |
|       2 |     1 | 2020-01-01     | 2020-01-02     |         1 |
|       2 |     2 | 2020-01-07     | 2020-01-07     |         0 |
|       2 |     3 | 2020-01-11     | 2020-01-14     |         3 |
|       2 |     4 | 2020-01-18     | 2020-01-22     |         4 |
+---------+-------+----------------+----------------+-----------+
6 rows in set (0.01 sec)

mysql>


mysql> SELECT user_id,
    ->        max(diff_days) +1 max_diff_days
    ->   from 
    -> (
    -> SELECT user_id,
    ->        flag2,
    ->        min(login_date) min_login_date,
    ->        max(login_date) max_login_date,
    ->        datediff(max(login_date),min(login_date)) diff_days
    ->   from 
    -> (
    -> SELECT tmp3.user_id,
    ->        tmp3.login_date,
    ->        tmp3.flag1,
    ->        sum(tmp4.flag1) flag2
    ->   from 
    -> (
    -> SELECT user_id,
    ->        login_date,
    ->        case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
    ->   from 
    -> (
    -> SELECT  user_id,
    ->         login_date,
    ->         (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
    -> from 
    -> (
    -> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
    ->   from test_login 
    ->  group by user_id,date_format(login_date,'%Y-%m-%d')
    -> ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
    -> ) tmp2  -- 通过标量子查询实现上一天的登陆日期
    -> ) tmp3  -- 打标记 如上下间隔<=3 则为1  否则为0
    -> left join 
    -> (
    -> SELECT user_id,
    ->        login_date,
    ->        case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
    ->   from 
    -> (
    -> SELECT  user_id,
    ->         login_date,
    ->         (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
    -> from 
    -> (
    -> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
    ->   from test_login 
    ->  group by user_id,date_format(login_date,'%Y-%m-%d')
    -> ) tmp1  -- tmp1表 分组去重 一天多次登陆的算一条
    -> ) tmp2  -- 通过标量子查询实现上一天的登陆日期
    -> ) tmp4  -- 功能同tmp3 
    -> on  tmp3.user_id = tmp4.user_id
    -> and tmp4.login_date <= tmp3.login_date
    -> group by tmp3.user_id,tmp3.login_date,tmp3.flag1
    -> ) tmp5 -- 每一个用户  每一个连续区间设置为不同的flag  标记为flag2
    -> group by user_id,flag2
    -> ) tmp6 -- 计算出每个连续区间 最大值和最小值  以及连续间隔天数
    -> group by user_id
    -> ;
+---------+---------------+
| user_id | max_diff_days |
+---------+---------------+
|       1 |            12 |
|       2 |             5 |
+---------+---------------+
2 rows in set (0.00 sec)

mysql> 
 

三.MySQL 8.0写法

MySQL 8.0的with语句以及分析窗口函数,可以使上面的解决方案的代码简洁度大大提升

代码:

with tmp1 AS
-- tmp1临时表 一天多次登陆算一次
(
SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
  from test_login 
 group by user_id,date_format(login_date,'%Y-%m-%d')
),
tmp2 as
-- tmp2临时表  通过lag获取上一次登陆时间  如是第一条给默认值
(
select user_id,
       login_date,
       lag(login_date,1,'1900-01-01') over(partition by user_id order by login_date) up_login_date
  from tmp1
),
tmp3 AS
-- 判断是否符合3天内标准  打标记flag1
(
select  user_id,
        login_date,
        case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1  end as flag1
  from  tmp2
),
tmp4 AS
-- 通过分析函数将每个用户 每一个连续登陆期间进行标记 flag2
(
select  user_id,
        login_date,
        flag1,
        sum(flag1) over(partition by user_id order by login_date) as flag2
  from tmp3
),
tmp5 AS
-- 通过user_id flag2进行分组
(
SELECT user_id,
       flag2,
       min(login_date) min_login_date,
       max(login_date) max_login_date,
       datediff(max(login_date),min(login_date)) diff_days
  from tmp4
 group by user_id,flag2
)
SELECT user_id,
       max(diff_days) + 1 as diff_days
  from tmp5
 group by user_id;

测试记录:

mysql> select version();
+-----------+
| version() |
+-----------+
| 8.0.19    |
+-----------+
1 row in set (0.00 sec)

mysql> with tmp1 AS
    -> -- tmp1临时表 一天多次登陆算一次
    -> (
    -> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
    ->   from test_login
    ->  group by user_id,date_format(login_date,'%Y-%m-%d')
    -> ),
    -> tmp2 as
    -> -- tmp2临时表  通过lag获取上一次登陆时间  如是第一条给默认值
    -> (
    -> select user_id,
    ->        login_date,
    ->        lag(login_date,1,'1900-01-01') over(partition by user_id order by login_date) up_login_d
    ->   from tmp1
    -> ),
    -> tmp3 AS
    -> -- 判断是否符合3天内标准  打标记flag1
    -> (
    -> select  user_id,
    ->         login_date,
    ->         case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1  end as flag1
    ->   from  tmp2
    -> ),
    -> tmp4 AS
    -> -- 通过分析函数将每个用户 每一个连续登陆期间进行标记 flag2
    -> (
    -> select  user_id,
    ->         login_date,
    ->         flag1,
    ->         sum(flag1) over(partition by user_id order by login_date) as flag2
    ->   from tmp3
    -> ),
    -> tmp5 AS
    -> -- 通过user_id flag2进行分组
    -> (
    -> SELECT user_id,
    ->        flag2,
    ->        min(login_date) min_login_date,
    ->        max(login_date) max_login_date,
    ->        datediff(max(login_date),min(login_date)) diff_days
    ->   from tmp4
    ->  group by user_id,flag2
    -> )
    -> SELECT user_id,
    ->        max(diff_days) + 1 as diff_days
    ->   from tmp5
    ->  group by user_id;
+---------+-----------+
| user_id | diff_days |
+---------+-----------+
|       1 |        12 |
|       2 |         5 |
+---------+-----------+
2 rows in set (0.00 sec)

猜你喜欢

转载自blog.csdn.net/u010520724/article/details/111834623