虽然已经在知乎上面提了问题,但是没有一个人回答,目前我也把结果做了出来了。
还是需要展示一下.
第一步,将半年的数据分成单月去初始化,如下只是201801的数据,其他月份就不展示了,改下日期就可以了:
--HIVE_CONFIG=set hive.exec.parallel=true;set mapred.max.split.size=50000000;set mapred.min.split.size.per.node=50000000;set mapred.min.split.size.per.rack=50000000;set hive.exec.reducers.bytes.per.reducer=50000000;set hive.hadoop.supports.splittable.combineinputformat=true;set hive.vectorized.execution.enabled = true;set hive.vectorized.execution.reduce.enabled = true;set mapreduce.map.memory.mb=5120;set mapreduce.reduce.memory.mb=5120;
-- Step 1: zip one month (201801) of daily balance snapshots -- collapse runs
-- of consecutive days with an identical balance into [START_DATE, END_DATE]
-- intervals, producing the initial load of the zipper table.
with tmp1 as (
-- Pair each customer/day row with the NEXT day's balance (zc_bal1).
-- zc_bal1 is NULL on the customer's last snapshot day of the month.
select cust_id,zc_bal,cast(dt as string ) as dt
,lead(zc_bal,1,NULL)over(partition by cust_id order by dt) as zc_bal1
from FACT_RPSM.F_CUST_BAL_SUM a
where dt>='20180101' and dt <='20180131' and coalesce(cust_id,'')<>''
),
tmp2 as (
-- Mark rows whose balance equals the next day's as "still open" ('29991231');
-- every other row is a change point that closes on its own DT.  A NULL
-- zc_bal1 (last day) falls into ELSE, so a customer's final row of the month
-- is always a change point.  NOTE(review): a NULL zc_bal also falls into
-- ELSE -- confirm the source never carries NULL balances, otherwise runs
-- would be split spuriously.
SELECT A.cust_id,A.DT,A.zc_bal,zc_bal1,(CASE WHEN zc_bal=zc_bal1 THEN '29991231' ELSE DT END ) DT_END FROM tmp1 a
),
tmp3 as (
-- Keep only the change points.  Because WHERE is applied before the window
-- function is evaluated, lag(dt) here is the PREVIOUS change point's date;
-- +1 day gives the start of the current interval.  Hive string arithmetic
-- yields decimal (see the note after this query), hence the double cast
-- int -> string.  Adding 1 to a yyyyMMdd string is only safe because all
-- dates fall inside a single month, so no month-end rollover can occur.
SELECT A.cust_id,A.DT,A.zc_bal,zc_bal1,DT_END,cast(cast( lag(dt,1)over(partition by cust_id order by dt)+'1' as int) as string) as DATE_2 from tmp2 a WHERE DT_END<>'29991231'
),
tmp4 as (
-- A customer's first interval has no previous change point (DATE_2 is NULL),
-- so it starts on its own first snapshot date DT.
SELECT cust_id,zc_bal,coalesce(DATE_2, DT) AS START_DATE,DT_END AS END_DATE FROM tmp3 a
)
-- Per-customer post-processing: the interval ending on the month's last day
-- is re-opened with the '29991231' sentinel; a customer whose LATEST interval
-- ends before month-end disappeared mid-month and is flagged deleted
-- (IS_FLAG='1').
SELECT a.cust_id,zc_bal,START_DATE
,CASE WHEN max(END_DATE)over(partition by cust_id)=END_DATE and END_DATE='20180131' THEN '29991231' ELSE END_DATE END END_DATE
,CASE WHEN max(END_DATE)over(partition by cust_id)=END_DATE and END_DATE<'20180131' THEN '1' ELSE '0' END IS_FLAG
FROM tmp4 a
hive基本四则运算会将字符转化为decimal类型,日期相加后要先转int类型再转string类型
第二步,拼接。本可以用一条SQL直接实现拼接,但是因为数据倾斜或其他原因跑不出来,就只能把每月末的数据闭链,再全部整合:
--HIVE_CONFIG=set hive.exec.parallel=true;set mapred.max.split.size=50000000;set mapred.min.split.size.per.node=50000000;set mapred.min.split.size.per.rack=50000000;set hive.exec.reducers.bytes.per.reducer=50000000;set hive.hadoop.supports.splittable.combineinputformat=true;set hive.vectorized.execution.enabled = true;set hive.vectorized.execution.reduce.enabled = true;set mapreduce.map.memory.mb=5120;set mapreduce.reduce.memory.mb=5120;
-- Step 2: merge the six monthly zipper tables into one.  Each month was
-- zipped in isolation, so its still-open rows carry the '29991231' sentinel;
-- close them on that month's last calendar day before merging.  Only the
-- latest month (cust_bal_zip_03) keeps its open-ended rows.
select u.cust_id, u.zc_bal, u.start_date, u.end_date, u.is_flag
from
(
select m.cust_id, m.zc_bal, m.start_date,
case when m.end_date = '29991231' then '20171031' else m.end_date end as end_date,
m.is_flag
from tmp.cust_bal_zip_2 m where m.dt = '20180412'
union all
select m.cust_id, m.zc_bal, m.start_date,
case when m.end_date = '29991231' then '20171130' else m.end_date end as end_date,
m.is_flag
from tmp.cust_bal_zip_11 m where m.dt = '20180412'
union all
select m.cust_id, m.zc_bal, m.start_date,
case when m.end_date = '29991231' then '20171231' else m.end_date end as end_date,
m.is_flag
from tmp.cust_bal_zip_12 m where m.dt = '20180412'
union all
select m.cust_id, m.zc_bal, m.start_date,
case when m.end_date = '29991231' then '20180131' else m.end_date end as end_date,
m.is_flag
from tmp.cust_bal_zip_01 m where m.dt = '20180412'
union all
select m.cust_id, m.zc_bal, m.start_date,
case when m.end_date = '29991231' then '20180228' else m.end_date end as end_date,
m.is_flag
from tmp.cust_bal_zip_02 m where m.dt = '20180412'
union all
-- current month: leave its open chains open
select m.cust_id, m.zc_bal, m.start_date, m.end_date, m.is_flag
from tmp.cust_bal_zip_03 m where m.dt = '20180412'
) u
第三步,去除半年不变且zc_bal=0的冗余数据:
--HIVE_CONFIG=set hive.exec.parallel=true;set mapred.max.split.size=50000000;set mapred.min.split.size.per.node=50000000;set mapred.min.split.size.per.rack=50000000;set hive.exec.reducers.bytes.per.reducer=50000000;set hive.hadoop.supports.splittable.combineinputformat=true;set hive.vectorized.execution.enabled = true;set hive.vectorized.execution.reduce.enabled = true;set mapreduce.map.memory.mb=5120;set mapreduce.reduce.memory.mb=5120;set hive.map.aggr=true;set hive.groupby.skewindata=true;
-- Step 3: customers whose balance was zero and unchanged for the whole half
-- year carry six redundant rows; replace them with a single open zero-balance
-- chain covering the full period.
with all_zero_custs as (
-- Customers all six of whose monthly records carry a zero balance.  A single
-- grouped scan with conditional aggregation replaces the original pair of
-- full-partition scans (count-all vs. count-zero) plus an extra join, and
-- still benefits from the skew settings in this job's HIVE_CONFIG.
select cust_id
from tmp.cust_bal_zip_init
where dt = '20180412'
group by cust_id
having count(1) = 6
and sum(case when zc_bal = 0 then 1 else 0 end) = 6
)
select d.cust_id, d.zc_bal, d.start_date, d.end_date, d.is_flag
from
(
-- Everyone else keeps their detailed rows (anti-join on the all-zero set) ...
select b.cust_id ,b.zc_bal,b.start_date,b.end_date,b.is_flag
from tmp.cust_bal_zip_init b
left join all_zero_custs c on b.cust_id = c.cust_id
where b.dt ='20180412' and c.cust_id is null
union all
-- ... while all-zero customers collapse to one open chain for the half year.
select a.cust_id ,cast(0 as decimal(20,2)) as zc_bal,'20171001' as start_date,'29991231' as end_date,'0' as is_flag
from all_zero_custs a
) d
第四步,加入到拉链表,执行每日跑批:
----HIVE_CONFIG=set hive.exec.parallel=true;set mapred.max.split.size=50000000;set mapred.min.split.size.per.node=50000000;set mapred.min.split.size.per.rack=50000000;set hive.exec.reducers.bytes.per.reducer=50000000;set hive.hadoop.supports.splittable.combineinputformat=true;set hive.vectorized.execution.enabled = true;set hive.vectorized.execution.reduce.enabled = true;set mapreduce.map.memory.mb=5120;set mapreduce.reduce.memory.mb=5120;
-- NOTE(review): the directive above starts with four dashes while every other
-- step uses the two-dash "--HIVE_CONFIG=" prefix; if the scheduler matches on
-- that exact prefix these settings are silently ignored -- confirm intent.
--APPEND_RELY=SELF
-- Step 4: daily incremental maintenance of the customer-balance zipper table.
-- Yesterday's zipper state (t1) is merged with today's snapshot (t2):
--   y2 carries already-closed history, y3 keeps unchanged open chains,
--   y4 closes chains of customers that disappeared, y5 closes chains whose
--   balance changed, y6 opens new chains (changed / new / re-appearing).
-- t2: today's balance snapshot from the source table.
with cust_zc_bal_now as (
select
cust_id ,
zc_bal
from FACT_RPSM.F_CUST_BAL_SUM
WHERE DT = '${start|yyyyMMdd}'
and coalesce(cust_id,'')<>''
)
,
-- t1: yesterday's full zipper state.  Currently wired to the one-off
-- initialisation table; the commented-out source below is the intended
-- daily view once regular runs begin (see step 5).
cust_zc_bal_zipper_y1 as
(
select
cust_id ,
zc_bal ,
start_date as start_dt,
end_date as end_dt,
is_flag as del_flag,
'${start-1d|yyyyMMdd}' as data_date
from tmp.cust_bal_zip_init2
where dt ='20180416'
-- from view.CUST_BAL_Z_T
-- WHERE DT = '${start-1d|yyyyMMdd}'
)
,
-- y2: rows already closed in t1, carried through unchanged.
cust_zc_bal_zipper_y2 as
(
select
cust_id ,
zc_bal ,
start_dt,
end_dt,
del_flag
from cust_zc_bal_zipper_y1
WHERE end_dt <> '29991231'
)
,
-- y3: open chains whose balance is unchanged today -- keep them open.
cust_zc_bal_zipper_y3 as (
select
t1.cust_id ,
t1.zc_bal ,
t1.start_dt,
'29991231' as end_dt, -- chain stays open (MAX_DATE sentinel)
'0' as del_flag       -- not deleted
from cust_zc_bal_zipper_y1 t1
left join cust_zc_bal_now t2
on t1.cust_id = t2.cust_id
where t1.end_dt = '29991231'
and t1.del_flag = '0'
and t2.cust_id is not null
and t1.zc_bal = t2.zc_bal -- open + not deleted, present today, same balance
)
,
-- y4: customers in t1 but absent from today's snapshot -- close the chain
-- as of yesterday and mark them deleted.
cust_zc_bal_zipper_y4 as (
select
t1.cust_id ,
t1.zc_bal ,
t1.start_dt,
t1.data_date as end_dt , -- close as of yesterday
'1' as del_flag          -- deleted
from cust_zc_bal_zipper_y1 t1
left join cust_zc_bal_now t2
on t1.cust_id = t2.cust_id
where t1.end_dt = '29991231'
and t1.del_flag = '0'
and t2.cust_id is null -- open + not deleted in t1, missing from t2
)
,
-- y5: open chains whose balance changed today -- close the old chain.
cust_zc_bal_zipper_y5 as (
select
t1.cust_id ,
t1.zc_bal ,
t1.start_dt,
t1.data_date as end_dt , -- close as of yesterday
'0' as del_flag          -- customer still exists
from cust_zc_bal_zipper_y1 t1
left join cust_zc_bal_now t2
on t1.cust_id = t2.cust_id
where t1.end_dt = '29991231'
and t1.del_flag = '0'
and t2.cust_id is not null
and t1.zc_bal <> t2.zc_bal -- balance differs from yesterday
)
,
-- y6: brand-new open chains.
-- Branch 1: customers whose balance changed -- open a chain with TODAY's
-- balance.  BUG FIX: the original selected t1.zc_bal, re-opening the chain
-- with yesterday's already-superseded balance.
cust_zc_bal_zipper_y6 as (
select
t1.cust_id ,
t2.zc_bal ,
'${start|yyyyMMdd}' as start_dt, -- chain opens today
'29991231' as end_dt ,           -- open-ended
'0' as del_flag
from cust_zc_bal_zipper_y1 t1
left join cust_zc_bal_now t2
on t1.cust_id = t2.cust_id
where t1.end_dt = '29991231'
and t1.del_flag = '0'
and t2.cust_id is not null
and t1.zc_bal <> t2.zc_bal -- balance changed today
union all
-- Branch 2: customers with no currently-open, not-deleted chain (brand new,
-- or previously deleted and now back).  BUG FIX: anti-joining against the
-- de-duplicated set of active chains replaces the original formulation,
-- which matched every historical del_flag='1' row and could emit duplicate
-- open chains for a customer deleted and re-added more than once.
select
t1.cust_id ,
t1.zc_bal ,
'${start|yyyyMMdd}' as start_dt, -- chain opens today
'29991231' as end_dt ,           -- open-ended
'0' as del_flag
from cust_zc_bal_now t1
left join (
select cust_id
from cust_zc_bal_zipper_y1
where end_dt = '29991231'
and del_flag = '0'
group by cust_id
) t2
on t1.cust_id = t2.cust_id
where t2.cust_id is null -- no active chain exists for this customer
)
-- Final result: the five partitions together form the complete new zipper.
select cust_id ,
zc_bal ,
start_dt,
end_dt,
del_flag
from(
select cust_id ,
zc_bal ,
start_dt,
end_dt,
del_flag from cust_zc_bal_zipper_y2
union all
select cust_id ,
zc_bal ,
start_dt,
end_dt,
del_flag from cust_zc_bal_zipper_y3
union all
select cust_id ,
zc_bal ,
start_dt,
end_dt,
del_flag from cust_zc_bal_zipper_y4
union all
select cust_id ,
zc_bal ,
start_dt,
end_dt,
del_flag from cust_zc_bal_zipper_y5
union all
select cust_id ,
zc_bal ,
start_dt,
end_dt,
del_flag from cust_zc_bal_zipper_y6
) a
第五步,跑完半年后的第一天,将cust_zc_bal_zipper_y1 中初始化的表 改为 当前表 的昨日表。