hive仓库之拉链表实现

在数仓建设中，经常需要查找某条数据的历史状态及状态改变的时间点，比如查找某个用户历史上所有的变更记录。业务数据库中的数据会被更新，也会被物理删除，而用户id是主键，所以业务表只保留每个用户最新的一条记录。如果只是每天全量同步业务数据库中的用户表，在最新分区中查不到用户的变更记录；如果把每天的全量快照全部保留，又会浪费很多空间，并且查询效率低、逻辑也复杂。做成拉链表既能节省空间，也能快速查询出某个用户的所有变更记录以及变更类型和变更日期。

此方案相比我以前博客《数据仓库之拉链表的更新方法》中方案的优化点：
此方案支持数据重跑：如果发现某天的数据有问题、ods表重新接入了数据，只需要从那天开始依次重跑拉链表的各个分区即可，因为这次拉链表设计成了分区表。

以下为实现方法：

先在ods层创建与业务数据库表结构相同的表，每天将业务表数据全量同步到当天分区

-- ODS source table for user data, partitioned by day (pt_day).
-- The table is dropped first so this script can be re-run from a clean state.
DROP TABLE IF EXISTS flowtest.tmp_user_test_df;
CREATE TABLE IF NOT EXISTS flowtest.tmp_user_test_df
(
    user_id   STRING COMMENT '用户id',
    user_name STRING COMMENT '用户名称',
    age       BIGINT COMMENT '用户年龄',
    gender    STRING COMMENT '性别'
)
COMMENT '用户信息表'
PARTITIONED BY (pt_day STRING COMMENT '分区时间-天')
-- lifecycle 5  -- keyword available in DataWorks / DTStack to set the table lifecycle (left disabled)
;

在ods创建历史全量拉链表

-- Zipper (history) table for user data, partitioned by day (pt_day).
-- Carries the source columns plus a row hash and change-tracking columns.
-- The table is dropped first so this script can be re-run from a clean state.
DROP TABLE IF EXISTS flowtest.tmp_user_std_test_df;
CREATE TABLE IF NOT EXISTS flowtest.tmp_user_std_test_df
(
    user_id        STRING COMMENT '用户id',
    user_name      STRING COMMENT '用户名称',
    age            BIGINT COMMENT '用户年龄',
    gender         STRING COMMENT '性别',
    md5_key        STRING COMMENT '所有字段md5值',
    create_date    STRING COMMENT '创建日期',
    operation_date STRING COMMENT '操作日期',
    operation_type STRING COMMENT '操作类型(A:第一次全量,U:更新,I:新增插入,D:物理删除)'
)
COMMENT '用户信息拉链表'
PARTITIONED BY (pt_day STRING COMMENT '分区时间-天')
-- lifecycle 5  -- keyword available in DataWorks / DTStack to set the table lifecycle (left disabled)
;

插入模拟数据


-- Mock data: baseline snapshot for 20201212 (users 00001 through 00012).
INSERT INTO TABLE flowtest.tmp_user_test_df PARTITION (pt_day = '20201212')
VALUES
    ("00001", "00001", 26, "男"),
    ("00002", "00002", 27, "男"),
    ("00003", "00003", 21, "女"),
    ("00004", "00004", 22, "女"),
    ("00005", "00005", 26, "女"),
    ("00006", "00006", 26, "男"),
    ("00007", "00007", 26, "男"),
    ("00008", "00008", 26, "男"),
    ("00009", "00009", 26, "男"),
    ("00010", "00010", 26, "男"),
    ("00011", "00011", 26, "男"),
    ("00012", "00012", 26, "男");

-- Mock data: snapshot for 20201213.
-- Compared with the 20201212 rows: user 00012 is removed, user 00013 is added,
-- and user 00006 is modified (user_name "00006" -> "浮云").
INSERT INTO TABLE flowtest.tmp_user_test_df PARTITION (pt_day = '20201213')
VALUES
    ("00001", "00001", 26, "男"),
    ("00002", "00002", 27, "男"),
    ("00003", "00003", 21, "女"),
    ("00004", "00004", 22, "女"),
    ("00005", "00005", 26, "女"),
    ("00006", "浮云", 26, "男"),
    ("00007", "00007", 26, "男"),
    ("00008", "00008", 26, "男"),
    ("00009", "00009", 26, "男"),
    ("00010", "00010", 26, "男"),
    ("00011", "00011", 26, "男"),
    ("00013", "00013", 26, "男");

-- Mock data: snapshot for 20201214.
-- Compared with the 20201213 rows: user 00011 is removed, users 00014 and 00012 are added
-- (00012 is absent from the 20201213 rows, so this is a re-insert of a previously removed id),
-- and user 00001 is modified (user_name "00001" -> "ganling").
-- The partition is dropped first so re-running this step does not append duplicate rows;
-- IF EXISTS keeps the drop from failing when the partition has not been created yet.
alter table flowtest.tmp_user_test_df drop if exists partition (pt_day = '20201214');
insert into table flowtest.tmp_user_test_df partition (pt_day = '20201214') values
("00001", "ganling", 26, "男"),
("00002", "00002", 27, "男"),
("00003", "00003", 21, "女"),
("00004", "00004", 22, "女"),
("00005", "00005", 26, "女"),
("00006", "浮云", 26, "男"),
("00007", "00007", 26, "男"),
("00008", "00008", 26, "男"),
("00009", "00009", 26, "男"),
("00010", "00010", 26, "男"),
("00013", "00013", 26, "男"),
("00014", "00014", 27, "男"),
("00012", "insert", 27, "男")
;

向全量拉链表中导入20201212的全量数据

-- Initial full load: seeds the zipper table's 20201212 partition from the 20201212 source partition.
-- Every row is written with create_date '20201212', operation_date '99991231' and operation_type 'A'.
INSERT OVERWRITE TABLE flowtest.tmp_user_std_test_df PARTITION (pt_day = '20201212')
SELECT
    src.user_id,                        -- user id
    src.user_name,                      -- user name
    src.age,                            -- user age
    src.gender,                         -- gender
    -- hash over all source columns (NULLs replaced by '') used later to detect changed rows
    MD5(CONCAT(NVL(src.user_id, ''),
               NVL(src.user_name, ''),
               NVL(src.age, ''),
               NVL(src.gender, ''))) AS md5_key,
    '20201212' AS create_date,          -- creation date
    '99991231' AS operation_date,       -- operation date
    'A'        AS operation_type        -- operation type (A: initial full load, U: update, I: insert, D: physical delete)
FROM flowtest.tmp_user_test_df src
WHERE src.pt_day = '20201212'
;

后续每天将当天的全量数据与拉链表前一天分区做全量比对（拉链），并将结果写入拉链表的当天分区

-- Daily zipper merge.
-- Full-outer-joins the source snapshot of ${today} (t1) with the zipper partition of
-- ${yesterday} (t2) and overwrites the zipper partition of ${today} with the result.
-- ${today} is the run date (e.g. ${bdp.system.bizdate}); ${yesterday} is the previous partition.
--
-- Row classification (presence is tested via user_id, which is assumed to be a non-null key):
--   t1 only                      -> 'I' (new insert), create_date = ${today}, operation_date = '99991231'
--   t2 only, already 'D'         -> carried over unchanged (historical delete record)
--   t2 only, not yet 'D'         -> 'D' (physical delete), operation_date = ${today}
--   both sides, md5_key differs  -> 'U' (update), operation_date = ${today}
--   both sides, md5_key equal    -> operation_date / operation_type carried over from t2
--
-- NOTE(review): md5_key is built by concatenating fields without a delimiter, so different
-- field splits can hash identically; changing that requires rebuilding all stored md5_key values.
insert overwrite table flowtest.tmp_user_std_test_df partition (pt_day = '${today}')
select coalesce(t1.user_id, t2.user_id) as user_id          --user id
      --attribute columns come from t1 whenever the source row exists; coalesce() would bring back
      --yesterday's value when today's value is NULL, making the row disagree with its md5_key
      ,case when t1.user_id is not null then t1.user_name else t2.user_name end as user_name        --user name
      ,case when t1.user_id is not null then t1.age else t2.age end as age              --user age
      ,case when t1.user_id is not null then t1.gender else t2.gender end as gender           --gender
      ,coalesce(t1.md5_key, t2.md5_key) as md5_key           --md5 value
      --no matching t2 row means the record is new: create_date is the run date, otherwise keep t2.create_date
      ,case when t2.user_id is null then '${today}' --run date, e.g. ${bdp.system.bizdate}
       else t2.create_date end as create_date      --creation date
      --t1 missing and t2 already 'D': keep the recorded delete date
      --t1 missing otherwise: deleted today, even if the row carried an earlier 'U' date
      --t2 missing: '99991231' ('I'); md5_key differs: run date ('U'); otherwise keep t2.operation_date
      ,case when t1.user_id is null and t2.operation_type = 'D' then t2.operation_date
            when t1.user_id is null then '${today}' --run date, e.g. ${bdp.system.bizdate}
            when t2.user_id is null then '99991231'
            when t1.md5_key <> t2.md5_key then '${today}' --run date, e.g. ${bdp.system.bizdate}
       else t2.operation_date end as operation_date   --operation date
      --t1 missing: 'D'; t2 missing: 'I'; md5_key differs: 'U'; otherwise keep t2.operation_type
      ,case when t1.user_id is null then 'D'
            when t2.user_id is null then 'I'
            when t1.md5_key <> t2.md5_key then 'U'
       else t2.operation_type end as operation_type   --operation type (A: initial full load, U: update, I: insert, D: physical delete)
  from 
  ( --today's full source snapshot
    select user_id          --user id
          ,user_name        --user name
          ,age              --user age
          ,gender           --gender
          ,md5(concat(nvl(user_id, '')
                         ,nvl(user_name, '')
                         ,nvl(age, '')
                         ,nvl(gender, ''))) as md5_key
      from flowtest.tmp_user_test_df
     where pt_day = '${today}'
  ) t1 
  full outer join 
  ( --yesterday's zipper partition
    select user_id          --user id
          ,user_name        --user name
          ,age              --user age
          ,gender           --gender
          ,md5_key          --md5 value
          ,create_date      --creation date
          ,operation_date   --operation date
          ,operation_type   --operation type (A: initial full load, U: update, I: insert, D: physical delete)
      from flowtest.tmp_user_std_test_df
     where pt_day = '${yesterday}'
  ) t2 on t1.user_id = t2.user_id 
      --a physically deleted user_id may be re-entered later, so 'D' rows are excluded from the match:
      --they fall through as t2-only rows and are kept as-is, while the re-entered id becomes a new 'I' row
      and t2.operation_type <> 'D'
;

hive仓库之拉链表实现

猜你喜欢