系列文章目录
文章目录
前言
上篇文章我把生成的模拟业务数据从MySQL导入到了Hive中。本篇文章我将构建我的第一个主题域——贷款申请主题域。这里已经有构建好的ods层,接下来要构建dim层、dwd层、dws层;ads层我会在最后统一决定需要哪些指标后再进行统计。
一、贷款申请主题域——dim维度表
1.第一张维度表:顾客维度表
(这里我将顾客的信息以及信誉评价连接在一起方便以后使用)
-- Customer dimension table: profile attributes plus credit score and assessment date.
-- NOTE(review): the column name income_levea looks like a misspelling of income_level.
-- It is kept unchanged because the load of dwd_loan_application_dtl selects it by this name.
create table dim_customer (
    customer_id      int     comment '客户编号',
    name             string  comment '客户姓名',
    id_number        string  comment '身份证号',
    contact_info     string  comment '手机号',
    province         string  comment '所属省份',
    occupation       string  comment '职业身份',
    income_levea     string  comment '收入水平',
    credit_score     double  comment '信用评分',
    assessment_date  string  comment '评估时间'
)
comment '顾客维度表'
row format delimited
    fields terminated by '\t'
    lines terminated by '\n';
-- Load dim_customer from ods_customer_info, enriched with the most recent credit assessment
-- of each customer from ods_credit_assessment.
-- The score and the date are taken from the SAME assessment row (the one with the latest
-- assessment_date). Aggregating max(credit_score) and max(assessment_date) independently
-- could pair the highest historical score with the date of a different, later assessment.
-- Ties on assessment_date are broken by the higher credit_score so the result is deterministic.
-- Customers without any assessment are kept by the left join with NULL score and date.
-- NOTE(review): assumes assessment_date sorts chronologically as stored, which the
-- previous max(assessment_date) also relied on.
insert overwrite table dim_customer
select
    c.customer_id,
    c.name,
    c.id_number,
    c.contact_info,
    c.province,
    c.occupation,
    c.income_level,
    a.credit_score,
    a.assessment_date
from ods_customer_info c
left join (
    select
        customer_id,
        credit_score,
        assessment_date
    from (
        select
            customer_id,
            credit_score,
            assessment_date,
            row_number() over (
                partition by customer_id
                order by assessment_date desc, credit_score desc
            ) as rn
        from ods_credit_assessment
    ) ranked
    where rn = 1
) a
    on c.customer_id = a.customer_id;
2.第二张维度表:时间维度表
(这里我构造了业务内的一些时间的详情表方便使用)
注:各位如果也想写这样的表,其实也很简单。完全自己写会耗费很长时间,可以直接搬别人的模板,再修改成自己需要的即可。
-- Calendar (date) dimension table.
-- Every attribute is stored as a string. The *_desc columns hold display text
-- for the corresponding id or number column.
create table dim_calendar (
    dateid                string,
    date_desc             string,
    day_of_month          string,
    day_of_month_desc     string,
    day_of_year           string,
    day_of_year_desc      string,
    week_of_year          string,
    week_of_year_desc     string,
    month_of_year         string,
    month_of_year_desc    string,
    monthid               string,
    month_desc            string,
    yearid                string,
    year_desc             string,
    quarterid             string,
    quarter_desc          string,
    quarter_of_year       string,
    quarter_of_year_desc  string,
    create_time           string,
    update_time           string,
    etl_time              string
)
comment '日期维度表'
row format delimited
    fields terminated by '\t'
    lines terminated by '\n';
-- Load dim_calendar with one row per day from 2024-09-01 through 2025-01-01 inclusive.
--
-- dates:      splitting a string of N separators yields N+1 empty pieces, and posexplode
--             numbers them 0..N, which gives the day offsets added to the start date.
-- date_parts: computes each calendar component once so the final select only formats them.
--
-- Fix: week_of_year_desc now uses the ISO week-year instead of the calendar year.
-- weekofyear() returns ISO week numbers, so 2024-12-30 and 2024-12-31 are week 1 of 2025.
-- Labelling them with year(d) produced the wrong text "2024年第1周".
-- The ISO week-year is the year of the Thursday in the same Monday-based week.
-- 1900-01-01 was a Monday, so pmod(datediff(d, '1900-01-01'), 7) is 0 for Monday .. 6 for Sunday.
--
-- The quarter number is derived once as floor((month - 1) / 3) + 1 instead of four
-- repeated CASE expressions, and the load timestamp is computed once so that
-- create_time, update_time and etl_time are always identical within a run.
with dates as (
    select date_add('2024-09-01', a.pos) as d
    from (
        select posexplode(split(repeat('o', datediff('2025-01-01', '2024-09-01')), 'o'))
    ) a
),
date_parts as (
    select
        d,
        year(d) as y,
        month(d) as m,
        day(d) as dom,
        datediff(d, concat(year(d), '-01-01')) + 1 as doy,
        weekofyear(d) as woy,
        year(date_add(d, 3 - pmod(datediff(d, '1900-01-01'), 7))) as iso_week_year,
        cast(floor((month(d) - 1) / 3) + 1 as int) as q,
        from_unixtime(unix_timestamp(), 'yyyy-MM-dd HH:mm:ss') as load_time
    from dates
)
insert overwrite table dim_calendar
select
    from_unixtime(unix_timestamp(d, 'yyyy-MM-dd'), 'yyyyMMdd') as dateid,
    d as date_desc,
    dom as day_of_month,
    concat(y, '年', m, '月第', dom, '天') as day_of_month_desc,
    doy as day_of_year,
    concat(y, '年第', doy, '天') as day_of_year_desc,
    woy as week_of_year,
    concat(iso_week_year, '年第', woy, '周') as week_of_year_desc,
    m as month_of_year,
    concat(y, '年第', m, '月') as month_of_year_desc,
    from_unixtime(unix_timestamp(d, 'yyyy-MM-dd'), 'yyyyMM') as monthid,
    from_unixtime(unix_timestamp(d, 'yyyy-MM-dd'), 'yyyy-MM') as month_desc,
    y as yearid,
    concat(y, '年') as year_desc,
    concat(y, '0', q) as quarterid,
    concat(y, '-Q', q) as quarter_desc,
    q as quarter_of_year,
    concat(y, '年', q, '季度') as quarter_of_year_desc,
    load_time as create_time,
    load_time as update_time,
    load_time as etl_time
from date_parts
order by dateid;
展示一下时间维度表的效果:
(里面有时间的各种表现形式)
二、贷款申请主题域——dwd明细表
1.贷款申请明细表建表
代码如下:
-- Loan application detail table: one application per row, carrying the applicant
-- attributes alongside the requested amount, term, purpose and application date.
-- NOTE(review): income_levea is declared double here but string in dim_customer,
-- which is where the load of this table reads it from. Confirm the intended type.
-- NOTE(review): occipation looks like a misspelling of occupation. It is kept unchanged
-- because the load of dws_loan_application reads the column by this name.
create table dwd_loan_application_dtl (
    application_id    int     comment '申请编号',
    customer_id       int     comment '申请人id',
    name              string  comment '申请人姓名',
    id_number         string  comment '申请人身份证号',
    contact_info      string  comment '申请人手机号',
    province          string  comment '申请人所属省份',
    occipation        string  comment '申请人身份职业',
    income_levea      double  comment '申请人收入概况',
    credit_score      double  comment '申请人信誉评分',
    amount_requested  double  comment '申请金额',
    loan_term         int     comment '贷款期限',
    loan_purpose      string  comment '贷款用途',
    application_date  string  comment '申请时间'
)
comment '贷款申请表'
row format delimited
    fields terminated by '\t'
    lines terminated by '\n';
2.贷款申请明细表数据
代码如下:
-- Load dwd_loan_application_dtl: every row of ods_loan_application, widened with the
-- applicant attributes from dim_customer.
-- Fix: customer_id is taken from the application row, not from the joined dimension row.
-- With a left join the dimension side is NULL when no customer matches, so the previous
-- t2.customer_id lost the applicant id for exactly those applications.
-- The dimension columns are listed explicitly instead of select * so that a schema change
-- in dim_customer cannot silently alter this load.
-- The insert maps columns by position: c.occupation fills the target column occipation.
insert overwrite table dwd_loan_application_dtl
select
    app.application_id,
    app.customer_id,
    c.name,
    c.id_number,
    c.contact_info,
    c.province,
    c.occupation,
    c.income_levea,
    c.credit_score,
    app.amount_requested,
    app.loan_term,
    app.loan_purpose,
    app.application_date
from ods_loan_application app
left join dim_customer c
    on app.customer_id = c.customer_id;
三、贷款申请主题域——dws轻度汇总表
1.贷款申请轻度汇总表建表
代码如下:
-- Lightly aggregated loan application summary table.
-- The first seven columns are the grouping dimensions. apply_cnt, apply_user_cnt and
-- amount_sum are the measures written by the load statement for this table.
create table dws_loan_application (
    province             string,
    occipation           string,
    income_levea         string,
    credit_score_dengji  string,
    loan_term            int,
    loan_purpose         string,
    application_date     string,
    apply_cnt            int,
    apply_user_cnt       int,
    amount_sum           double
)
comment '贷款申请汇总表'
row format delimited
    fields terminated by '\t'
    lines terminated by '\n';
2.贷款申请轻度汇总表数据
代码如下:
-- Load dws_loan_application: application count, distinct applicant count and requested
-- amount total per province, occupation, income bucket, credit grade, loan term,
-- loan purpose and application date.
--
-- Fix 1: CASE branches are evaluated top-down and the first match wins, so the thresholds
-- must be tested from highest to lowest. With the previous ascending order every income
-- above 5000 matched the first branch and every score above 400 matched the first branch,
-- which made the higher buckets unreachable.
-- Fix 2: rows are grouped by the bucket labels instead of the raw income and score.
-- Grouping by the raw values emitted several output rows for the same bucket combination.
--
-- NULL income or score falls through to the else branch, as before.
with bucketed as (
    select
        province,
        occipation,
        case
            when income_levea > 50000 then '豪'
            when income_levea > 10000 then '富裕'
            when income_levea > 5000  then '小康'
            else '贫困'
        end as income_levea,
        case
            when credit_score > 800 then '优'
            when credit_score > 600 then '好'
            when credit_score > 400 then '良'
            else '较差'
        end as credit_score_dengji,
        loan_term,
        loan_purpose,
        application_date,
        customer_id,
        amount_requested
    from dwd_loan_application_dtl
)
insert overwrite table dws_loan_application
select
    province,
    occipation,
    income_levea,
    credit_score_dengji,
    loan_term,
    loan_purpose,
    application_date,
    count(1) as apply_cnt,
    count(distinct customer_id) as apply_user_cnt,
    sum(amount_requested) as amount_sum
from bucketed
group by
    province,
    occipation,
    income_levea,
    credit_score_dengji,
    loan_term,
    loan_purpose,
    application_date;
总结
本章终于把贷款申请这个主题域基本构建完成了。本来想把底层的MR计算引擎换成Tez,但是折腾了比较长的时间也没有编译好,所以先往前推进项目。