大数据面试SQL039-连续登录三天的记录
一、题目
现有用户登录记录表,已经按照用户日期进行去重处理。请查询出用户所有连续三天登录的数据记录
+----------+-------------+
| user_id | login_date |
+----------+-------------+
| aaa | 2023-12-01 |
| aaa | 2023-12-03 |
| aaa | 2023-12-04 |
| aaa | 2023-12-05 |
| aaa | 2023-12-06 |
| bbb | 2023-12-01 |
| bbb | 2023-12-03 |
| bbb | 2023-12-04 |
| bbb | 2023-12-07 |
| bbb | 2023-12-08 |
| bbb | 2023-12-09 |
+----------+-------------+
期望输出结果
+----------+-------------+
| user_id | login_date |
+----------+-------------+
| aaa | 2023-12-03 |
| aaa | 2023-12-04 |
| aaa | 2023-12-05 |
| aaa | 2023-12-04 |
| aaa | 2023-12-05 |
| aaa | 2023-12-06 |
| bbb | 2023-12-07 |
| bbb | 2023-12-08 |
| bbb | 2023-12-09 |
+----------+-------------+
二、分析
依旧是连续问题,这个要求是找出所有连续登录三天的记录,期望输出结果中会看到aaa有3,4,5日记录,也会有4,5,6日的记录,那么4,5就会出现两次,原来数据表中至于一次,所以还是得“生成”数据;
维度 | 评分 |
---|---|
题目难度 | ⭐️⭐️⭐️⭐️ |
题目清晰度 | ⭐️⭐️⭐️⭐️ |
业务常见度 | ⭐️⭐️ |
三、SQL
1.使用lead()函数按照用户分组,日期排序,查询出后面第三行的日期date1;使用date_add()计算出3天后的日期date2
select
user_id,
login_date,
lead(login_date,2)over(partition by user_id order by login_date) as date1,
date_add(login_date,2) as date2
from t_login_39
查询结果
2.判断date1和date2是否相等,如果相等则代表为连续登录,否则非连续登录计算出is_conn,然后根据is_conn的标识筛
with t1 as
(
select
user_id,
login_date,
lead(login_date,2)over(partition by user_id order by login_date) as date1,
date_add(login_date,2) as date2
from t_login_39)
select
user_id,
login_date,
date1,
date2,
if(date1=date2,1,0) as is_conn
from t1
查询结果
3.根据is_conn的标识,进行筛选出所有连续登录3天的开始登录日期
with t1 as
(
select
user_id,
login_date,
lead(login_date,2)over(partition by user_id order by login_date) as date1,
date_add(login_date,2) as date2
from t_login_39),
t2 as
(
select
user_id,
login_date,
date1,
date2,
if(date1=date2,1,0) as is_conn
from t1)
select *
from t2
where is_conn=1
查询结果
4.生成一个包含0,1,2三行记录
select explode(sequence(0,2)) as date_diff
查询结果
5.步骤3,4进行笛卡尔积得到最后结果
with t1 as
(
select
user_id,
login_date,
lead(login_date,2)over(partition by user_id order by login_date) as date1,
date_add(login_date,2) as date2
from t_login_39),
t2 as
(
select
user_id,
login_date,
date1,
date2,
if(date1=date2,1,0) as is_conn
from t1),
t3 as
(
select
user_id,
login_date
from t2
where is_conn=1),
t4 as
(
select
explode(sequence(0,2)) as date_diff)
select
t3.user_id,
t3.login_date,
date_add(t3.login_date,t4.date_diff) as new_logdate
from t3 join t4
查询结果
四、建表语句和数据插入
--建表语句
create table t_login_39
(
user_id string COMMENT '用户ID',
login_date string COMMENT '登录日期'
) COMMENT '用户登录记录表'
stored as orc
;
--插入数据
insert into t_login_39(user_id,login_date)
values
('aaa','2023-12-01'),
('aaa','2023-12-03'),
('aaa','2023-12-04'),
('aaa','2023-12-05'),
('aaa','2023-12-06'),
('bbb','2023-12-01'),
('bbb','2023-12-03'),
('bbb','2023-12-04'),
('bbb','2023-12-07'),
('bbb','2023-12-08'),
('bbb','2023-12-09')