– 拉链表实战
– 1.创建一张用于存储每日增量数据的表
drop table test.stu_newdata;
CREATE TABLE test.stu_newdata(
id int,
name string,
age string,
update_time bigint
)
partitioned by (dt string)
stored as parquet
;
– 2.创建一张全量每日快照表
drop table test.stu;
CREATE TABLE test.stu(
id int,
name string,
age string,
update_time bigint
)
partitioned by (dt string)
stored as parquet
;
– 3. 创建一张拉链表
drop table test.stu_zip;
CREATE TABLE test.stu_zip(
id int,
name string,
age string,
update_time bigint,
start_dt string, -- 记录有效起始日
end_dt string -- 记录有效结束日
)
partitioned by (dt string)
stored as parquet
;
– 操作手册
– 操作初始日期:2020-09-03
1. 利用sqoop,初次导入2020-09-03日,业务库中的表
bin/sqoop import \
--connect jdbc:mysql://doitedu01:3306/realtimedw \
--username root \
--password ABC123abc.123 \
--table stu \
--target-dir '/sqoopdata/incr/stu/2020-09-03' \
--delete-target-dir \
--incremental lastmodified \
--check-column update_time \
--last-value '2020-09-01 00:00:00' \
--fields-terminated-by ',' \
--as-parquetfile \
-m 1 \
2. 将本次导入的增量数据,load到hive的增量表中
load data inpath '/sqoopdata/incr/stu/2020-09-03' into table test.stu_newdata partition(dt='2020-09-03');
3.生成2020-09-03日的全量表
bin/sqoop codegen \
--connect jdbc:mysql://doitedu01:3306/realtimedw \
--username root \
--password ABC123abc.123 \
--table stu \
--bindir /opt/apps/code/stu \
--class-name Stu \
--as-parquetfile
模拟准备全量表的2020-09-02分区
insert into table stu partition(dt='2020-09-02')
select 0,'xx',10,1598945495000;
将增量表的2020-09-03数据跟全量表的2020-09-02,合并成全量表的2020-09-03数据
bin/sqoop merge \
--new-data /user/hive/warehouse/test.db/stu_newdata/dt=2020-09-03 \
--onto /user/hive/warehouse/test.db/stu/dt=2020-09-02 \
--target-dir /user/hive/warehouse/test.db/stu/dt=2020-09-03 \
--jar-file /opt/apps/code/stu/Stu.jar \
--class-name Stu \
--merge-key id
将合并好的数据,导入全量表的2020-09-03分区
hive> alter table stu_newdata add partition(dt='2020-09-03');