hive拉链表以及退链例子笔记

 拉链表设计:

  在企业中,由于有些流水表每日有几千万条记录,数据仓库保存5年数据的话很容易不堪重负,因此可以使用拉链表的算法来节省存储空间。

 例子:

-- 用户信息表;  采集当日全量数据存储到 (当日) 表中
CREATE TABLE dwd.user_info(
    id string,      
    name string,
    sex string,
    biz_date string  -- 业务日期
)


-- 用户信息整合表
CREATE TABLE dws.user_merge_info(
    id string,
    name string,
    sex string,
    start_date string,
    end_date string
)

-- 测试插入用户信息
INSERT INTO dwd.user_info 
SELECT 
'1','YaoMing','boy','20190701'
UNION ALL 
SELECT 
'2','YaoLinlin','girl','20190701'
UNION ALL 
SELECT 
'3','CaiLili','girl','20190701'
UNION ALL 
SELECT 
'4','ZhangSan','girl','20190702'
UNION ALL 
SELECT 
'5','LiSi','girl','20190702'


-- 查看数据
SELECT * FROM dwd.user_info 

复制代码

复制代码

-- 初始化用户信息整合表
INSERT overwrite TABLE dws.user_merge_info
SELECT
    id,
    name,
    sex,
    '20190701' AS start_date,
    '99991231' AS end_date
FROM (
    SELECT 
        id,
        name,
        sex,
        row_number() over(PARTITION BY id ORDER BY biz_date) AS row_num  -- 初始化时候根据主键id分组,取最新修改的数据
    FROM dwd.user_info 
    ) t
WHERE t.row_num = 1 

-- 查看数据
SELECT * FROM dws.user_merge_info

复制代码

复制代码

-- 现在biz_date='20190702'这天,新跑了一条全新数据id=6,以及修改了一条id=2的数据
INSERT INTO dwd.user_info 
SELECT 
'6','WangWu','boy','20190702'
UNION ALL 
SELECT 
'2','YaoLinlin','boy','20190702'

-- 查看数据
SELECT * FROM dwd.user_info  ORDER BY id,biz_date

复制代码

 

复制代码

-- 新增修改以及完全新增
INSERT overwrite TABLE tmp.user_merge_info_new
-- 修改的数据
SELECT
b.id,
b.name,
b.sex,
'20190702' AS start_date,        -- ${bizdate} 业务日期
'99991231' AS end_date            -- 99991231代表有效数据
FROM dws.user_merge_info     a,
     dwd.user_info              b
WHERE a.id = b.id 
AND a.end_date = '99991231' 
AND b.biz_date = '20190702'     -- ${bizdate}只取当天数据
AND (
       a.name!= b.name
    OR a.sex != b.sex
)

UNION ALL 
-- 全新的数据
SELECT                                                    
    b.id,
    b.name,
    b.sex,
    '20190702'AS start_date,
    '99991231'AS end_date    
FROM         dws.user_merge_info     a
RIGHT JOIN  dwd.user_info         b
ON a.id = b.id
WHERE b.biz_date='20190702'
    AND a.id IS NULL;

复制代码

复制代码

-- 闭链
INSERT overwrite TABLE tmp.user_merge_info_upt
SELECT
    a.id,
    a.name,
    a.sex,
    a.start_date,
    '20190702'            -- 闭链,${biz_date}业务时间
FROM dws.user_merge_info a
LEFT JOIN dwd.user_info b
ON a.id=b.id 
WHERE     a.end_date='99991231' 
    AND b.biz_date='20190702' 
    AND (
        a.name != b.name
        OR a.sex != b.sex
        )

复制代码

复制代码

-- 历史数据
INSERT overwrite TABLE tmp.user_merge_info_new
SELECT
    a.id,
    a.name,
    a.sex,
    a.start_date,
    a.end_date
FROM dws.user_merge_info       a,
     tmp.user_merge_info_upt  b
WHERE a.id != b.id;

复制代码

复制代码

-- 整合数据 
INSERT OVERWRITE TABLE dws.user_merge_info
SELECT
    id,
    name,
    sex,
    start_date,
    end_date
FROM tmp.user_merge_info_new
UNION ALL 
SELECT
    id,
    name,
    sex,
    start_date,
    end_date
FROM tmp.user_merge_info_upt
UNION ALL 
SELECT
    id,
    name,
    sex,
    start_date,
    end_date
FROM tmp.user_merge_info_his


-- 查看下数据
SELECT * FROM dws.user_merge_info ORDER BY id,start_date

复制代码

以上拉链表就实现好了


以下是退链操作模板

复制代码

#!/bin/bash

# 使用说明提示
if [ $# -ne 1 ]; then
    echo "Usage : `basename $0` biz_date"
    exit 1
fi

#业务时间
biz_date=$1

# 判断是数据整合还是回退拉链表
isGoBack=`execHQL "select count(1) from dws.user_merge_info where (end_date>='$biz_date' or start_date>='$biz_date') and biz_date<>'99991231';"`

if [ $isGoBack -ne 0 ];then
    # 回退模式
    Log "\n## 【user_merge_info表回退】 执行开始 ##" 
    execHQL "
        INSERT overwrite TABLE dws.user_merge_info 
        --  完全不变的数据
        SELECT 
             id
            ,name
            ,sex     
            ,start_date
            ,end_date 
        FROM dws.user_merge_info 
        WHERE (start_date<'$biz_date' AND end_date='99991231') OR end_date<'$biz_date'
            
        UNION ALL 
        
        -- 重跑 重新开链的数据
        SELECT 
             id
            ,name
            ,sex     
            ,start_date
            ,'99991231' AS end_date
        FROM dws.user_merge_info 
        WHERE start_date<'$biz_date' AND end_date>='$biz_date' AND end_date<>'99991231';
    "
    if [ $? -ne 0 ];then
        Log "\n## 【user_merge_info表回退】 执行失败 ##"
        exit 1
    fi
    Log "\n## 【user_merge_info表回退】 执行成功 ##"

fi

复制代码

猜你喜欢

转载自blog.csdn.net/qq_39674417/article/details/113732706