虽然已经在知乎上面提了问题,但是没有一个人回答,目前我也把结果做了出来了。
还是需要展示一下.
第一步,将半年的数据分成单月去初始化,如下只是201801的数据,其他月份就不展示了,改下日期就可以了:
--HIVE_CONFIG=set hive.exec.parallel=true;set mapred.max.split.size=50000000;set mapred.min.split.size.per.node=50000000;set mapred.min.split.size.per.rack=50000000;set hive.exec.reducers.bytes.per.reducer=50000000;set hive.hadoop.supports.splittable.combineinputformat=true;set hive.vectorized.execution.enabled = true;set hive.vectorized.execution.reduce.enabled = true;set mapreduce.map.memory.mb=5120;set mapreduce.reduce.memory.mb=5120;
-- Step 1: zip one month (201801) of daily balance snapshots -- collapse runs
-- of consecutive days with an identical balance into [START_DATE, END_DATE]
-- intervals, producing the initial load of the zipper table.
with tmp1 as (
-- Pair each customer/day row with the NEXT day's balance (zc_bal1).
-- zc_bal1 is NULL on the customer's last snapshot day of the month.
select cust_id,zc_bal,cast(dt as string ) as dt
,lead(zc_bal,1,NULL)over(partition by cust_id order by dt) as zc_bal1
from FACT_RPSM.F_CUST_BAL_SUM a
where dt>='20180101' and dt <='20180131' and coalesce(cust_id,'')<>''
),
tmp2 as (
-- Mark rows whose balance equals the next day's as "still open" ('29991231');
-- every other row is a change point that closes on its own DT.  A NULL
-- zc_bal1 (last day) falls into ELSE, so a customer's final row of the month
-- is always a change point.  NOTE(review): a NULL zc_bal also falls into
-- ELSE -- confirm the source never carries NULL balances, otherwise runs
-- would be split spuriously.
SELECT A.cust_id,A.DT,A.zc_bal,zc_bal1,(CASE WHEN zc_bal=zc_bal1 THEN '29991231' ELSE DT END ) DT_END FROM tmp1 a
),
tmp3 as (
-- Keep only the change points.  Because WHERE is applied before the window
-- function is evaluated, lag(dt) here is the PREVIOUS change point's date;
-- +1 day gives the start of the current interval.  Hive string arithmetic
-- yields decimal (see the note after this query), hence the double cast
-- int -> string.  Adding 1 to a yyyyMMdd string is only safe because all
-- dates fall inside a single month, so no month-end rollover can occur.
SELECT A.cust_id,A.DT,A.zc_bal,zc_bal1,DT_END,cast(cast( lag(dt,1)over(partition by cust_id order by dt)+'1' as int) as string) as DATE_2 from tmp2 a WHERE DT_END<>'29991231'
),
tmp4 as (
-- A customer's first interval has no previous change point (DATE_2 is NULL),
-- so it starts on its own first snapshot date DT.
SELECT cust_id,zc_bal,coalesce(DATE_2, DT) AS START_DATE,DT_END AS END_DATE FROM tmp3 a
)
-- Per-customer post-processing: the interval ending on the month's last day
-- is re-opened with the '29991231' sentinel; a customer whose LATEST interval
-- ends before month-end disappeared mid-month and is flagged deleted
-- (IS_FLAG='1').
SELECT a.cust_id,zc_bal,START_DATE
,CASE WHEN max(END_DATE)over(partition by cust_id)=END_DATE and END_DATE='20180131' THEN '29991231' ELSE END_DATE END END_DATE
,CASE WHEN max(END_DATE)over(partition by cust_id)=END_DATE and END_DATE<'20180131' THEN '1' ELSE '0' END IS_FLAG
FROM tmp4 a
hive基本四则运算会将字符转化为decimal类型,日期相加后要先转int类型再转string类型
第二步,拼接。本可以用一条SQL直接实现拼接,但是因为数据倾斜或其他原因跑不出来,就只能把每月末的数据闭链,再全部整合:
--HIVE_CONFIG=set hive.exec.parallel=true;set mapred.max.split.size=50000000;set mapred.min.split.size.per.node=50000000;set mapred.min.split.size.per.rack=50000000;set hive.exec.reducers.bytes.per.reducer=50000000;set hive.hadoop.supports.splittable.combineinputformat=true;set hive.vectorized.execution.enabled = true;set hive.vectorized.execution.reduce.enabled = true;set mapreduce.map.memory.mb=5120;set mapreduce.reduce.memory.mb=5120;
-- Step 2: merge the six monthly zipper tables into one.  Each month was
-- zipped in isolation, so its still-open rows carry the '29991231' sentinel;
-- close them on that month's last calendar day before merging.  Only the
-- latest month (cust_bal_zip_03) keeps its open-ended rows.
select u.cust_id, u.zc_bal, u.start_date, u.end_date, u.is_flag
from
(
select m.cust_id, m.zc_bal, m.start_date,
case when m.end_date = '29991231' then '20171031' else m.end_date end as end_date,
m.is_flag
from tmp.cust_bal_zip_2 m where m.dt = '20180412'
union all
select m.cust_id, m.zc_bal, m.start_date,
case when m.end_date = '29991231' then '20171130' else m.end_date end as end_date,
m.is_flag
from tmp.cust_bal_zip_11 m where m.dt = '20180412'
union all
select m.cust_id, m.zc_bal, m.start_date,
case when m.end_date = '29991231' then '20171231' else m.end_date end as end_date,
m.is_flag
from tmp.cust_bal_zip_12 m where m.dt = '20180412'
union all
select m.cust_id, m.zc_bal, m.start_date,
case when m.end_date = '29991231' then '20180131' else m.end_date end as end_date,
m.is_flag
from tmp.cust_bal_zip_01 m where m.dt = '20180412'
union all
select m.cust_id, m.zc_bal, m.start_date,
case when m.end_date = '29991231' then '20180228' else m.end_date end as end_date,
m.is_flag
from tmp.cust_bal_zip_02 m where m.dt = '20180412'
union all
-- current month: leave its open chains open
select m.cust_id, m.zc_bal, m.start_date, m.end_date, m.is_flag
from tmp.cust_bal_zip_03 m where m.dt = '20180412'
) u
第三步,去除半年不变且zc_bal=0的冗余数据:
--HIVE_CONFIG=set hive.exec.parallel=true;set mapred.max.split.size=50000000;set mapred.min.split.size.per.node=50000000;set mapred.min.split.size.per.rack=50000000;set hive.exec.reducers.bytes.per.reducer=50000000;set hive.hadoop.supports.splittable.combineinputformat=true;set hive.vectorized.execution.enabled = true;set hive.vectorized.execution.reduce.enabled = true;set mapreduce.map.memory.mb=5120;set mapreduce.reduce.memory.mb=5120;set hive.map.aggr=true;set hive.groupby.skewindata=true;
-- Step 3: customers whose balance was zero and unchanged for the whole half
-- year carry six redundant rows; replace them with a single open zero-balance
-- chain covering the full period.
with all_zero_custs as (
-- Customers all six of whose monthly records carry a zero balance.  A single
-- grouped scan with conditional aggregation replaces the original pair of
-- full-partition scans (count-all vs. count-zero) plus an extra join, and
-- still benefits from the skew settings in this job's HIVE_CONFIG.
select cust_id
from tmp.cust_bal_zip_init
where dt = '20180412'
group by cust_id
having count(1) = 6
and sum(case when zc_bal = 0 then 1 else 0 end) = 6
)
select d.cust_id, d.zc_bal, d.start_date, d.end_date, d.is_flag
from
(
-- Everyone else keeps their detailed rows (anti-join on the all-zero set) ...
select b.cust_id ,b.zc_bal,b.start_date,b.end_date,b.is_flag
from tmp.cust_bal_zip_init b
left join all_zero_custs c on b.cust_id = c.cust_id
where b.dt ='20180412' and c.cust_id is null
union all
-- ... while all-zero customers collapse to one open chain for the half year.
select a.cust_id ,cast(0 as decimal(20,2)) as zc_bal,'20171001' as start_date,'29991231' as end_date,'0' as is_flag
from all_zero_custs a
) d
第四步,加入到拉链表,执行每日跑批:
----HIVE_CONFIG=set hive.exec.parallel=true;set mapred.max.split.size=50000000;set mapred.min.split.size.per.node=50000000;set mapred.min.split.size.per.rack=50000000;set hive.exec.reducers.bytes.per.reducer=50000000;set hive.hadoop.supports.splittable.combineinputformat=true;set hive.vectorized.execution.enabled = true;set hive.vectorized.execution.reduce.enabled = true;set mapreduce.map.memory.mb=5120;set mapreduce.reduce.memory.mb=5120;
-- NOTE(review): the directive above starts with four dashes while every other
-- step uses the two-dash "--HIVE_CONFIG=" prefix; if the scheduler matches on
-- that exact prefix these settings are silently ignored -- confirm intent.
--APPEND_RELY=SELF
-- Step 4: daily incremental maintenance of the customer-balance zipper table.
-- Yesterday's zipper state (t1) is merged with today's snapshot (t2):
--   y2 carries already-closed history, y3 keeps unchanged open chains,
--   y4 closes chains of customers that disappeared, y5 closes chains whose
--   balance changed, y6 opens new chains (changed / new / re-appearing).
-- t2: today's balance snapshot from the source table.
with cust_zc_bal_now as (
select
cust_id ,
zc_bal
from FACT_RPSM.F_CUST_BAL_SUM
WHERE DT = '${start|yyyyMMdd}'
and coalesce(cust_id,'')<>''
)
,
-- t1: yesterday's full zipper state.  Currently wired to the one-off
-- initialisation table; the commented-out source below is the intended
-- daily view once regular runs begin (see step 5).
cust_zc_bal_zipper_y1 as
(
select
cust_id ,
zc_bal ,
start_date as start_dt,
end_date as end_dt,
is_flag as del_flag,
'${start-1d|yyyyMMdd}' as data_date
from tmp.cust_bal_zip_init2
where dt ='20180416'
-- from view.CUST_BAL_Z_T
-- WHERE DT = '${start-1d|yyyyMMdd}'
)
,
-- y2: rows already closed in t1, carried through unchanged.
cust_zc_bal_zipper_y2 as
(
select
cust_id ,
zc_bal ,
start_dt,
end_dt,
del_flag
from cust_zc_bal_zipper_y1
WHERE end_dt <> '29991231'
)
,
-- y3: open chains whose balance is unchanged today -- keep them open.
cust_zc_bal_zipper_y3 as (
select
t1.cust_id ,
t1.zc_bal ,
t1.start_dt,
'29991231' as end_dt, -- chain stays open (MAX_DATE sentinel)
'0' as del_flag       -- not deleted
from cust_zc_bal_zipper_y1 t1
left join cust_zc_bal_now t2
on t1.cust_id = t2.cust_id
where t1.end_dt = '29991231'
and t1.del_flag = '0'
and t2.cust_id is not null
and t1.zc_bal = t2.zc_bal -- open + not deleted, present today, same balance
)
,
-- y4: customers in t1 but absent from today's snapshot -- close the chain
-- as of yesterday and mark them deleted.
cust_zc_bal_zipper_y4 as (
select
t1.cust_id ,
t1.zc_bal ,
t1.start_dt,
t1.data_date as end_dt , -- close as of yesterday
'1' as del_flag          -- deleted
from cust_zc_bal_zipper_y1 t1
left join cust_zc_bal_now t2
on t1.cust_id = t2.cust_id
where t1.end_dt = '29991231'
and t1.del_flag = '0'
and t2.cust_id is null -- open + not deleted in t1, missing from t2
)
,
-- y5: open chains whose balance changed today -- close the old chain.
cust_zc_bal_zipper_y5 as (
select
t1.cust_id ,
t1.zc_bal ,
t1.start_dt,
t1.data_date as end_dt , -- close as of yesterday
'0' as del_flag          -- customer still exists
from cust_zc_bal_zipper_y1 t1
left join cust_zc_bal_now t2
on t1.cust_id = t2.cust_id
where t1.end_dt = '29991231'
and t1.del_flag = '0'
and t2.cust_id is not null
and t1.zc_bal <> t2.zc_bal -- balance differs from yesterday
)
,
-- y6: brand-new open chains.
-- Branch 1: customers whose balance changed -- open a chain with TODAY's
-- balance.  BUG FIX: the original selected t1.zc_bal, re-opening the chain
-- with yesterday's already-superseded balance.
cust_zc_bal_zipper_y6 as (
select
t1.cust_id ,
t2.zc_bal ,
'${start|yyyyMMdd}' as start_dt, -- chain opens today
'29991231' as end_dt ,           -- open-ended
'0' as del_flag
from cust_zc_bal_zipper_y1 t1
left join cust_zc_bal_now t2
on t1.cust_id = t2.cust_id
where t1.end_dt = '29991231'
and t1.del_flag = '0'
and t2.cust_id is not null
and t1.zc_bal <> t2.zc_bal -- balance changed today
union all
-- Branch 2: customers with no currently-open, not-deleted chain (brand new,
-- or previously deleted and now back).  BUG FIX: anti-joining against the
-- de-duplicated set of active chains replaces the original formulation,
-- which matched every historical del_flag='1' row and could emit duplicate
-- open chains for a customer deleted and re-added more than once.
select
t1.cust_id ,
t1.zc_bal ,
'${start|yyyyMMdd}' as start_dt, -- chain opens today
'29991231' as end_dt ,           -- open-ended
'0' as del_flag
from cust_zc_bal_now t1
left join (
select cust_id
from cust_zc_bal_zipper_y1
where end_dt = '29991231'
and del_flag = '0'
group by cust_id
) t2
on t1.cust_id = t2.cust_id
where t2.cust_id is null -- no active chain exists for this customer
)
-- Final result: the five partitions together form the complete new zipper.
select cust_id ,
zc_bal ,
start_dt,
end_dt,
del_flag
from(
select cust_id ,
zc_bal ,
start_dt,
end_dt,
del_flag from cust_zc_bal_zipper_y2
union all
select cust_id ,
zc_bal ,
start_dt,
end_dt,
del_flag from cust_zc_bal_zipper_y3
union all
select cust_id ,
zc_bal ,
start_dt,
end_dt,
del_flag from cust_zc_bal_zipper_y4
union all
select cust_id ,
zc_bal ,
start_dt,
end_dt,
del_flag from cust_zc_bal_zipper_y5
union all
select cust_id ,
zc_bal ,
start_dt,
end_dt,
del_flag from cust_zc_bal_zipper_y6
) a
第五步,跑完半年后的第一天,将cust_zc_bal_zipper_y1 中初始化的表 改为 当前表 的昨日表。