Kettle使用_22 维度更新 Type2 拉链表
需求:通过Kettle实现对员工维度数据的更新和插入。
解决方法:通过维度查询更新组件实现。
注:补充部分有SQL版Type2实现代码。
Previous work:
准备源数据和维度表结构与数据。
Slowly Changing Dimensions(SCD,缓慢变化维)
本文要实现的拉链表,即数据仓库里SCD的Type2
数据仓库里维度的变化较慢,常用的有三类,分别为:
Type1:
cust_id name age
1 张三 30
如果对用户的age由30更新到40,则
cust_id name age
1 张三 40
Type2:
针对Type1中的同一次更新(age由30更新到40),这里假设更新时间是2020-10-21,则保留历史记录并新增一条当前记录:
cust_id name age start_date end_date is_current
1 张三 30 2020-10-10 2020-10-21 0
1 张三 40 2020-10-21 9999-10-21 1
Type3:
直接记录当前最新的值和上一次变化前的值
cust_id name age pre_age
1 张三 40 30
注:Type3的缺点是只能保留上一次变化前的值,无法跟踪完整的历史数据。
-- Source business table holding the raw user records.
-- The statement is terminated explicitly so it can be executed together with
-- the statements that follow it in the same script.
CREATE TABLE tb_user_source (
    user_id   int,          -- user identifier
    name      varchar(12),
    job_title varchar(12),
    phone     varchar(15)
);
-- User dimension table that the walkthrough queries to see Type 2 history.
-- The statement is terminated explicitly so it can be executed together with
-- the statements that follow it in the same script.
CREATE TABLE tb_user_dimupdate (
    -- NOTE(review): "agency_it" looks like a typo for "agency_id"; presumably this
    -- is the technical (surrogate) key configured in the Kettle step. Confirm the
    -- step configuration before renaming it.
    agency_it int,
    user_id   int,          -- user identifier carried over from tb_user_source
    name      varchar(12),
    job_title varchar(12),
    phone     varchar(15),
    -- Presumably the version number and validity range maintained by the
    -- Kettle dimension lookup/update step; verify against the step settings.
    versionno int,
    startdate date,
    enddate   date
);
-- Clear existing data from BOTH tables so the walkthrough starts from an empty
-- state. Truncating the source table as well prevents the seed INSERTs below
-- from creating duplicate source rows when the script is re-run.
TRUNCATE TABLE tb_user_source;
TRUNCATE TABLE tb_user_dimupdate;
-- Seed the source table with the initial employee rows.
-- Columns are listed explicitly so the inserts do not depend on column order.
INSERT INTO tb_user_source (user_id, name, job_title, phone)
VALUES (1, '张三', 'CEO', '13267564321');
INSERT INTO tb_user_source (user_id, name, job_title, phone)
VALUES (2, '王五', 'CTO', '18889897676');
Step1: 新增个转换
Step2:在转换的输入分类下拖个表输入(这里是维度表的源表),并配置如下:
Step3:在转换的"数据仓库"分类下,拖入一个"维度查询/更新"组件,并按住SHIFT键拖动鼠标,将表输入连接到该组件。
Step4:配置维度查询/更新组件
Step5:保存并运行转换验证:
-- Update employee data: change the phone number of user 1 in the source table.
UPDATE tb_user_source
SET phone = '13966666666'
WHERE user_id = 1;

-- After running the transformation, inspect the source table.
SELECT
    user_id,
    name,
    job_title,
    phone
FROM tb_user_source
ORDER BY user_id;

-- Second change: update the job title of user 2 in the source table.
UPDATE tb_user_source
SET job_title = 'COO'
WHERE user_id = 2;

-- Inspect how the dimension data changed. Ordering by versionno within each
-- user keeps the versions of one user in a stable, readable sequence.
SELECT
    agency_it,
    user_id,
    name,
    job_title,
    phone,
    versionno,
    startdate,
    enddate
FROM tb_user_dimupdate
ORDER BY user_id, versionno;
完整流程示意:
补充:SQL方式实现SCD(Type2),当前是SQL Server版:
-- Step 1: prepare the data (create the Customer source table and the
-- DimCustomer dimension table). Existing tables are dropped first so the
-- script can be re-run from scratch.
-- The tables are created in the dbo schema explicitly because the SCD code and
-- the test cases in this script address them as dbo.Customer / dbo.DimCustomer.
-- OBJECT_ID is restricted to user tables (type 'U') so that DROP TABLE is only
-- attempted when the existing object really is a table.
IF OBJECT_ID(N'dbo.Customer', N'U') IS NOT NULL
    DROP TABLE dbo.Customer;
GO
IF OBJECT_ID(N'dbo.DimCustomer', N'U') IS NOT NULL
    DROP TABLE dbo.DimCustomer;
GO
-- Source table.
CREATE TABLE dbo.Customer
(
    ID         INT PRIMARY KEY IDENTITY(1,1),
    FullName   NVARCHAR(50),
    City       NVARCHAR(50),
    Occupation NVARCHAR(50),
    IsCurrent  BIT DEFAULT(1)
);
-- Dimension table: one row per version of a customer.
CREATE TABLE dbo.DimCustomer
(
    CustomerID           INT PRIMARY KEY IDENTITY(1,1),  -- surrogate key of the version row
    CustomerAlternateKey INT,                            -- joined to Customer.ID by the SCD code
    FullName             NVARCHAR(50),
    City                 NVARCHAR(50),
    Occupation           NVARCHAR(50),
    StartDate            DATETIME,
    EndDate              DATETIME,                       -- NULL until the SCD code closes the row
    IsCurrent            BIT DEFAULT(1)
);
-- Step 2: SCD (Type 2) code. Run the whole block after every change to
-- dbo.Customer.
--
-- Tracked attributes are City, Occupation and IsCurrent. FullName is copied on
-- insert but is not compared, so a change to FullName alone creates no new
-- version.
--
-- One timestamp is captured for the whole load so that the EndDate written to
-- a closed row and the StartDate of the row that replaces it are identical.
DECLARE @LoadTime DATETIME = GETDATE();

-- Pass 1 (update state):
--   * insert customers that have no dimension row at all;
--   * close dimension rows whose tracked attributes differ from the source.
-- The EXISTS (... EXCEPT ...) test is a NULL-safe "is different" comparison:
-- EXCEPT treats two NULLs as equal, so a change from or to NULL is detected,
-- which a plain <> comparison would miss.
MERGE INTO dbo.DimCustomer AS Dim
USING dbo.Customer AS Src
    ON Dim.CustomerAlternateKey = Src.ID
WHEN NOT MATCHED BY TARGET THEN
    INSERT (CustomerAlternateKey, FullName, City, Occupation, StartDate, EndDate, IsCurrent)
    VALUES (Src.ID, Src.FullName, Src.City, Src.Occupation, @LoadTime, NULL, Src.IsCurrent)
WHEN MATCHED AND EXISTS (
        SELECT Src.City, Src.Occupation, Src.IsCurrent
        EXCEPT
        SELECT Dim.City, Dim.Occupation, Dim.IsCurrent
    ) THEN
    UPDATE SET
        -- Rows that were closed by an earlier load keep their original EndDate.
        Dim.EndDate   = COALESCE(Dim.EndDate, @LoadTime),
        Dim.IsCurrent = 0;

-- Pass 2 (insert new versions):
-- insert a new open row for every source row that has no OPEN dimension row
-- (EndDate IS NULL) with identical tracked attributes.
--   * The attribute comparisons are NULL-safe; with plain "=" a source row
--     holding a NULL City or Occupation would never match and would be
--     inserted again on every run.
--   * Only open rows may match; otherwise a customer whose attributes revert
--     to an earlier state could match a closed historical row and be left
--     without any open version.
MERGE INTO dbo.DimCustomer AS Dim
USING dbo.Customer AS Src
    ON Dim.CustomerAlternateKey = Src.ID
   AND Dim.EndDate IS NULL
   AND (Dim.City = Src.City
        OR (Dim.City IS NULL AND Src.City IS NULL))
   AND (Dim.Occupation = Src.Occupation
        OR (Dim.Occupation IS NULL AND Src.Occupation IS NULL))
   AND (Dim.IsCurrent = Src.IsCurrent
        OR (Dim.IsCurrent IS NULL AND Src.IsCurrent IS NULL))
WHEN NOT MATCHED BY TARGET THEN
    INSERT (CustomerAlternateKey, FullName, City, Occupation, StartDate, EndDate, IsCurrent)
    VALUES (Src.ID, Src.FullName, Src.City, Src.Occupation, @LoadTime, NULL, Src.IsCurrent);
-- Test case 1: insert three rows, then run the SCD code.
-- ID is an IDENTITY column and is therefore not part of the column list.
INSERT INTO dbo.Customer (FullName, City, Occupation, IsCurrent)
VALUES
    ('BIWORK',   'Beijing',   'IT',        0),
    ('ZhangSan', 'Shanghai',  'Education', 1),
    ('Lisi',     'Guangzhou', 'Student',   1);

-- Inspect the source rows.
SELECT
    ID,
    FullName,
    City,
    Occupation,
    IsCurrent
FROM dbo.Customer
ORDER BY ID;

-- Inspect the dimension rows, grouped per customer in insertion order.
SELECT
    CustomerID,
    CustomerAlternateKey,
    FullName,
    City,
    Occupation,
    StartDate,
    EndDate,
    IsCurrent
FROM dbo.DimCustomer
ORDER BY CustomerAlternateKey, CustomerID;
-- Test case 2: insert one more row and change an attribute of an existing
-- row, then run the SCD code.
INSERT INTO dbo.Customer (FullName, City, Occupation, IsCurrent)
VALUES
    ('Wangwu', 'Beijing', 'Finance', 1);

-- Modify a changing attribute of the customer with ID 3.
UPDATE dbo.Customer
SET Occupation = 'IT'
WHERE ID = 3;

-- Inspect the dimension rows (run the SCD code first to see the new versions).
SELECT
    CustomerID,
    CustomerAlternateKey,
    FullName,
    City,
    Occupation,
    StartDate,
    EndDate,
    IsCurrent
FROM dbo.DimCustomer
ORDER BY CustomerAlternateKey, CustomerID;
-- Test case 3: modify two columns of the same row at once, then run the SCD code.
UPDATE dbo.Customer
SET Occupation = 'Publisher',
    City       = 'Hangzhou'
WHERE ID = 2;

-- Inspect the source rows.
SELECT
    ID,
    FullName,
    City,
    Occupation,
    IsCurrent
FROM dbo.Customer
ORDER BY ID;

-- Inspect the dimension rows, grouped per customer in insertion order.
SELECT
    CustomerID,
    CustomerAlternateKey,
    FullName,
    City,
    Occupation,
    StartDate,
    EndDate,
    IsCurrent
FROM dbo.DimCustomer
ORDER BY CustomerAlternateKey, CustomerID;
GO