文章目录
4.6 全量流程
OLTP原始数据(MySQL)→ 数据采集(ODS)→ 清洗转换(DWD)→ 统计分析(DWS)→ 导出至OLAP(MySQL),如图:
4.6.1 数据采集
4.6.1.1 web_chat_ems表
4.6.1.1.1 SQL:
-- Extract query for the July-2019 chat session table.
-- `user` is exposed as user_match and a constant "2019-07-01" is added as starts_time.
SELECT
    e.id,
    e.create_date_time,
    e.session_id,
    e.sid,
    e.create_time,
    e.seo_source,
    e.seo_keywords,
    e.ip,
    e.area,
    e.country,
    e.province,
    e.city,
    e.origin_channel,
    e.user AS user_match,
    e.manual_time,
    e.begin_time,
    e.end_time,
    e.last_customer_msg_time_stamp,
    e.last_agent_msg_time_stamp,
    e.reply_msg_count,
    e.msg_count,
    e.browser_name,
    e.os_info,
    "2019-07-01" AS starts_time
FROM web_chat_ems_2019_07 AS e;
4.6.1.1.2 Sqoop:
# Import the July-2019 chat sessions from MySQL into the Hive ODS table through HCatalog,
# split on id across 100 map tasks.
# The target database is itcast_ods: the DWD load reads itcast_ods.web_chat_ems, so
# loading into a different database would leave that join without input.
# NOTE(review): --password on the command line is visible in the process list and shell
# history; outside of a demo prefer -P or --password-file.
sqoop import \
--connect jdbc:mysql://192.168.52.150:3306/nev \
--username root \
--password 123456 \
--query 'select id, create_date_time, session_id, sid, create_time, seo_source, seo_keywords, ip, area, country, province, city, origin_channel, user as user_match, manual_time, begin_time, end_time, last_customer_msg_time_stamp, last_agent_msg_time_stamp, reply_msg_count, msg_count, browser_name, os_info, "2019-07-01" as starts_time from web_chat_ems_2019_07 where $CONDITIONS' \
--hcatalog-database itcast_ods \
--hcatalog-table web_chat_ems \
-m 100 \
--split-by id
# Same import as above, run from the Sqoop install directory against the 192.168.10.10 host
# (no --password option is passed here).
# The target database is itcast_ods: the DWD load reads itcast_ods.web_chat_ems, so
# loading into a different database would leave that join without input.
bin/sqoop import \
--connect jdbc:mysql://192.168.10.10:3306/nev \
--username root \
--query 'select id, create_date_time, session_id, sid, create_time, seo_source, seo_keywords, ip, area, country, province, city, origin_channel, user as user_match, manual_time, begin_time, end_time, last_customer_msg_time_stamp, last_agent_msg_time_stamp, reply_msg_count, msg_count, browser_name, os_info, "2019-07-01" as starts_time from web_chat_ems_2019_07 where $CONDITIONS' \
--hcatalog-database itcast_ods \
--hcatalog-table web_chat_ems \
-m 100 \
--split-by id
-m 100:指定启动100个map任务并行导入;
--split-by 参数:指定按哪个字段的取值范围切分数据并分配给各个map任务(这里按id切分)。
4.6.1.2 web_chat_text_ems表
4.6.1.2.1 SQL
-- Extract query for the July-2019 chat text table.
-- A constant "2019-07-01" is added as start_time.
SELECT
    t.id,
    t.referrer,
    t.from_url,
    t.landing_page_url,
    t.url_title,
    t.platform_description,
    t.other_params,
    t.history,
    "2019-07-01" AS start_time
FROM web_chat_text_ems_2019_07 AS t;
4.6.1.2.2 Sqoop
# Import the July-2019 chat text rows from MySQL into Hive table itcast_ods.web_chat_text_ems
# through HCatalog, split on id across 100 map tasks.
sqoop import \
--connect jdbc:mysql://192.168.52.150:3306/nev \
--username root \
--password 123456 \
--hcatalog-database itcast_ods \
--hcatalog-table web_chat_text_ems \
--split-by id \
--num-mappers 100 \
--query 'select id,referrer,from_url,landing_page_url,url_title,platform_description,other_params,history, "2019-07-01" as start_time from web_chat_text_ems_2019_07 where $CONDITIONS'
# Same chat text import, run from the Sqoop install directory against the 192.168.10.10 host
# (no --password option is passed here).
bin/sqoop import \
--connect jdbc:mysql://192.168.10.10:3306/nev \
--username root \
--hcatalog-database itcast_ods \
--hcatalog-table web_chat_text_ems \
--split-by id \
--num-mappers 100 \
--query 'select id,referrer,from_url,landing_page_url,url_title,platform_description,other_params,history, "2019-07-01" as start_time from web_chat_text_ems_2019_07 where $CONDITIONS'
4.6.2.4 代码
-- Load the DWD visit/consult detail table by inner-joining the two ODS chat tables on id.
-- create_time is converted to a Unix timestamp; hourinfo, yearinfo, monthinfo and dayinfo are
-- cut out of the create_time string by position, and the last three feed the dynamic partitions.
-- A NULL msg_count is stored as 0.
-- origin_channel is carried through because the DWS aggregations read it from this table.
-- NOTE(review): origin_channel is placed right after msg_count; INSERT ... SELECT maps columns
-- by position, so confirm this matches the visit_consult_dwd DDL.
-- Dynamic-partition settings
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
-- Hive compression
set hive.exec.compress.intermediate=true;
set hive.exec.compress.output=true;
-- Apply compression when writing
set hive.exec.orc.compression.strategy=COMPRESSION;
insert into table itcast_dwd.visit_consult_dwd partition (yearinfo, monthinfo, dayinfo)
select
    wce.session_id,
    wce.sid,
    unix_timestamp(wce.create_time, 'yyyy-MM-dd HH:mm:ss.SSS') as create_time,
    wce.seo_source,
    wce.ip,
    wce.area,
    cast(coalesce(wce.msg_count, 0) as int) as msg_count,
    wce.origin_channel,
    wcte.referrer,
    wcte.from_url,
    wcte.landing_page_url,
    wcte.url_title,
    wcte.platform_description,
    wcte.other_params,
    wcte.history,
    substr(wce.create_time, 12, 2) as hourinfo,
    quarter(wce.create_time) as quarterinfo,
    substr(wce.create_time, 1, 4) as yearinfo,
    substr(wce.create_time, 6, 2) as monthinfo,
    substr(wce.create_time, 9, 2) as dayinfo
from itcast_ods.web_chat_ems wce
inner join itcast_ods.web_chat_text_ems wcte
    on wce.id = wcte.id;
4.6.3 统计分析
4.6.3.1 分析
DWD层之后是DWM中间层和DWS业务层。回顾建模分析阶段,我们已经得到了指标相关的维度:年、季度、月、天、小时、地区、搜索来源、来源渠道、会话来源页面。分两大类:
时间维度:年、季度、月、天、小时
业务属性维度:地区、搜索来源、来源渠道、会话来源页面、总访问量。
在DWS层按照不同维度使用count+distinct来统计指标,形成宽表。
空值处理
事实表中的维度关联键不能存在空值,关联的维度信息必须用代理键(-1)而不是空值表示未知的条件。
4.6.3.2 代码
我们的维度一共有两大类:时间维度和产品属性维度,在DWS层我们可以产出一个宽表,将所有维度的数据都生成出来,供APP层和OLAP应用来使用。
4.6.3.2.1 地区分组
统计地区维度时,需要设置产品属性类型groupType为1(地区),同时将其他产品属性设置为-1(搜索来源、来源渠道、会话来源页面),便于团队理解,减少自己和团队出错率的同时也降低了沟通成本。
在 insert SQL 中,尽量为查询出的字段加上别名,特别是字段多的表,便于识别。
小时维度:
-- Hourly visit metrics per area (grouptype '1', time_type '1').
-- Attribute columns that are not grouped on carry the '-1' placeholder.
-- Dynamic-partition settings
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
SET hive.exec.max.dynamic.partitions.pernode=10000;
SET hive.exec.max.dynamic.partitions=100000;
SET hive.exec.max.created.files=150000;
-- Hive compression
SET hive.exec.compress.intermediate=true;
SET hive.exec.compress.output=true;
-- Apply compression when writing
SET hive.exec.orc.compression.strategy=COMPRESSION;
INSERT INTO TABLE itcast_dws.visit_dws PARTITION (yearinfo, monthinfo, dayinfo)
SELECT
    COUNT(DISTINCT sid)        AS sid_total,
    COUNT(DISTINCT session_id) AS session_total,
    COUNT(DISTINCT ip)         AS ip_total,
    area,
    '-1'                       AS seo_source,
    '-1'                       AS origin_channel,
    hourinfo,
    quarterinfo,
    CONCAT(yearinfo, '-', monthinfo, '-', dayinfo, ' ', hourinfo) AS time_str,
    '-1'                       AS from_url,
    '1'                        AS grouptype,
    '1'                        AS time_type,
    yearinfo,
    monthinfo,
    dayinfo
FROM itcast_dwd.visit_consult_dwd
GROUP BY
    area,
    yearinfo,
    quarterinfo,
    monthinfo,
    dayinfo,
    hourinfo;
天维度:
-- Daily visit metrics per area (grouptype '1', time_type '2'); hourinfo is the '-1' placeholder.
INSERT INTO TABLE itcast_dws.visit_dws PARTITION (yearinfo, monthinfo, dayinfo)
SELECT
    COUNT(DISTINCT sid)        AS sid_total,
    COUNT(DISTINCT session_id) AS session_total,
    COUNT(DISTINCT ip)         AS ip_total,
    area,
    '-1'                       AS seo_source,
    '-1'                       AS origin_channel,
    '-1'                       AS hourinfo,
    quarterinfo,
    CONCAT(yearinfo, '-', monthinfo, '-', dayinfo) AS time_str,
    '-1'                       AS from_url,
    '1'                        AS grouptype,
    '2'                        AS time_type,
    yearinfo,
    monthinfo,
    dayinfo
FROM itcast_dwd.visit_consult_dwd
GROUP BY
    area,
    yearinfo,
    quarterinfo,
    monthinfo,
    dayinfo;
月维度:
-- Monthly visit metrics per area (grouptype '1', time_type '3'); hourinfo and dayinfo are '-1'.
INSERT INTO TABLE itcast_dws.visit_dws PARTITION (yearinfo, monthinfo, dayinfo)
SELECT
    COUNT(DISTINCT sid)        AS sid_total,
    COUNT(DISTINCT session_id) AS session_total,
    COUNT(DISTINCT ip)         AS ip_total,
    area,
    '-1'                       AS seo_source,
    '-1'                       AS origin_channel,
    '-1'                       AS hourinfo,
    quarterinfo,
    CONCAT(yearinfo, '-', monthinfo) AS time_str,
    '-1'                       AS from_url,
    '1'                        AS grouptype,
    '3'                        AS time_type,
    yearinfo,
    monthinfo,
    '-1'                       AS dayinfo
FROM itcast_dwd.visit_consult_dwd
GROUP BY
    area,
    yearinfo,
    quarterinfo,
    monthinfo;
季度维度:
-- Quarterly visit metrics per area (grouptype '1', time_type '4'); hour, month and day are '-1'.
INSERT INTO TABLE itcast_dws.visit_dws PARTITION (yearinfo, monthinfo, dayinfo)
SELECT
    COUNT(DISTINCT sid)        AS sid_total,
    COUNT(DISTINCT session_id) AS session_total,
    COUNT(DISTINCT ip)         AS ip_total,
    area,
    '-1'                       AS seo_source,
    '-1'                       AS origin_channel,
    '-1'                       AS hourinfo,
    quarterinfo,
    CONCAT(yearinfo, '-Q', quarterinfo) AS time_str,
    '-1'                       AS from_url,
    '1'                        AS grouptype,
    '4'                        AS time_type,
    yearinfo,
    '-1'                       AS monthinfo,
    '-1'                       AS dayinfo
FROM itcast_dwd.visit_consult_dwd
GROUP BY
    area,
    yearinfo,
    quarterinfo;
年维度:
-- Yearly visit metrics per area (groupType '1', time_type '5').
-- hour, quarter, month and day are all the '-1' placeholder; time_str is the year itself.
insert into table itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
select
    count(distinct v.sid)        as sid_total,
    count(distinct v.session_id) as sessionid_total,
    count(distinct v.ip)         as ip_total,
    v.area                       as area,
    '-1'                         as seo_source,
    '-1'                         as origin_channel,
    '-1'                         as hourinfo,
    '-1'                         as quarterinfo,
    v.yearinfo                   as time_str,
    '-1'                         as from_url,
    '1'                          as groupType,
    '5'                          as time_type,
    v.yearinfo                   as yearinfo,
    '-1'                         as monthinfo,
    '-1'                         as dayinfo
from itcast_dwd.visit_consult_dwd v
group by
    v.area,
    v.yearinfo;
4.6.3.2.2 搜索来源分组
小时维度:
-- Hourly visit metrics per search source (grouptype '2', time_type '1').
-- Attribute columns that are not grouped on carry the '-1' placeholder.
INSERT INTO TABLE itcast_dws.visit_dws PARTITION (yearinfo, monthinfo, dayinfo)
SELECT
    COUNT(DISTINCT sid)        AS sid_total,
    COUNT(DISTINCT session_id) AS session_total,
    COUNT(DISTINCT ip)         AS ip_total,
    '-1'                       AS area,
    seo_source,
    '-1'                       AS origin_channel,
    hourinfo,
    quarterinfo,
    CONCAT(yearinfo, '-', monthinfo, '-', dayinfo, ' ', hourinfo) AS time_str,
    '-1'                       AS from_url,
    '2'                        AS grouptype,
    '1'                        AS time_type,
    yearinfo,
    monthinfo,
    dayinfo
FROM itcast_dwd.visit_consult_dwd
GROUP BY
    seo_source,
    yearinfo,
    quarterinfo,
    monthinfo,
    dayinfo,
    hourinfo;
天维度:
-- Daily visit metrics per search source (grouptype '2', time_type '2'); hourinfo is '-1'.
INSERT INTO TABLE itcast_dws.visit_dws PARTITION (yearinfo, monthinfo, dayinfo)
SELECT
    COUNT(DISTINCT sid)        AS sid_total,
    COUNT(DISTINCT session_id) AS session_total,
    COUNT(DISTINCT ip)         AS ip_total,
    '-1'                       AS area,
    seo_source,
    '-1'                       AS origin_channel,
    '-1'                       AS hourinfo,
    quarterinfo,
    CONCAT(yearinfo, '-', monthinfo, '-', dayinfo) AS time_str,
    '-1'                       AS from_url,
    '2'                        AS grouptype,
    '2'                        AS time_type,
    yearinfo,
    monthinfo,
    dayinfo
FROM itcast_dwd.visit_consult_dwd
GROUP BY
    seo_source,
    yearinfo,
    quarterinfo,
    monthinfo,
    dayinfo;
月维度:
-- Monthly visit metrics per search source (grouptype '2', time_type '3'); hour and day are '-1'.
INSERT INTO TABLE itcast_dws.visit_dws PARTITION (yearinfo, monthinfo, dayinfo)
SELECT
    COUNT(DISTINCT sid)        AS sid_total,
    COUNT(DISTINCT session_id) AS session_total,
    COUNT(DISTINCT ip)         AS ip_total,
    '-1'                       AS area,
    seo_source,
    '-1'                       AS origin_channel,
    '-1'                       AS hourinfo,
    quarterinfo,
    CONCAT(yearinfo, '-', monthinfo) AS time_str,
    '-1'                       AS from_url,
    '2'                        AS grouptype,
    '3'                        AS time_type,
    yearinfo,
    monthinfo,
    '-1'                       AS dayinfo
FROM itcast_dwd.visit_consult_dwd
GROUP BY
    seo_source,
    yearinfo,
    quarterinfo,
    monthinfo;
季度维度:
-- Quarterly visit metrics per search source (grouptype '2', time_type '4'); hour, month and day are '-1'.
INSERT INTO TABLE itcast_dws.visit_dws PARTITION (yearinfo, monthinfo, dayinfo)
SELECT
    COUNT(DISTINCT sid)        AS sid_total,
    COUNT(DISTINCT session_id) AS session_total,
    COUNT(DISTINCT ip)         AS ip_total,
    '-1'                       AS area,
    seo_source,
    '-1'                       AS origin_channel,
    '-1'                       AS hourinfo,
    quarterinfo,
    CONCAT(yearinfo, '-Q', quarterinfo) AS time_str,
    '-1'                       AS from_url,
    '2'                        AS grouptype,
    '4'                        AS time_type,
    yearinfo,
    '-1'                       AS monthinfo,
    '-1'                       AS dayinfo
FROM itcast_dwd.visit_consult_dwd
GROUP BY
    seo_source,
    yearinfo,
    quarterinfo;
年维度:
-- Yearly visit metrics per search source (groupType '2', time_type '5').
-- hour, quarter, month and day are all the '-1' placeholder; time_str is the year itself.
insert into table itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
select
    count(distinct v.sid)        as sid_total,
    count(distinct v.session_id) as sessionid_total,
    count(distinct v.ip)         as ip_total,
    '-1'                         as area,
    v.seo_source,
    '-1'                         as origin_channel,
    '-1'                         as hourinfo,
    '-1'                         as quarterinfo,
    v.yearinfo                   as time_str,
    '-1'                         as from_url,
    '2'                          as groupType,
    '5'                          as time_type,
    v.yearinfo                   as yearinfo,
    '-1'                         as monthinfo,
    '-1'                         as dayinfo
from itcast_dwd.visit_consult_dwd v
group by
    v.seo_source,
    v.yearinfo;
4.6.3.2.3 来源渠道分组
小时维度:
-- Hourly visit metrics per origin channel (grouptype '3', time_type '1').
-- Attribute columns that are not grouped on carry the '-1' placeholder.
INSERT INTO TABLE itcast_dws.visit_dws PARTITION (yearinfo, monthinfo, dayinfo)
SELECT
    COUNT(DISTINCT sid)        AS sid_total,
    COUNT(DISTINCT session_id) AS session_total,
    COUNT(DISTINCT ip)         AS ip_total,
    '-1'                       AS area,
    '-1'                       AS seo_source,
    origin_channel,
    hourinfo,
    quarterinfo,
    CONCAT(yearinfo, '-', monthinfo, '-', dayinfo, ' ', hourinfo) AS time_str,
    '-1'                       AS from_url,
    '3'                        AS grouptype,
    '1'                        AS time_type,
    yearinfo,
    monthinfo,
    dayinfo
FROM itcast_dwd.visit_consult_dwd
GROUP BY
    origin_channel,
    yearinfo,
    quarterinfo,
    monthinfo,
    dayinfo,
    hourinfo;
天维度:
-- Daily visit metrics per origin channel (grouptype '3', time_type '2'); hourinfo is '-1'.
INSERT INTO TABLE itcast_dws.visit_dws PARTITION (yearinfo, monthinfo, dayinfo)
SELECT
    COUNT(DISTINCT sid)        AS sid_total,
    COUNT(DISTINCT session_id) AS session_total,
    COUNT(DISTINCT ip)         AS ip_total,
    '-1'                       AS area,
    '-1'                       AS seo_source,
    origin_channel,
    '-1'                       AS hourinfo,
    quarterinfo,
    CONCAT(yearinfo, '-', monthinfo, '-', dayinfo) AS time_str,
    '-1'                       AS from_url,
    '3'                        AS grouptype,
    '2'                        AS time_type,
    yearinfo,
    monthinfo,
    dayinfo
FROM itcast_dwd.visit_consult_dwd
GROUP BY
    origin_channel,
    yearinfo,
    quarterinfo,
    monthinfo,
    dayinfo;
月维度:
-- Monthly visit metrics per origin channel (grouptype '3', time_type '3'); hour and day are '-1'.
INSERT INTO TABLE itcast_dws.visit_dws PARTITION (yearinfo, monthinfo, dayinfo)
SELECT
    COUNT(DISTINCT sid)        AS sid_total,
    COUNT(DISTINCT session_id) AS session_total,
    COUNT(DISTINCT ip)         AS ip_total,
    '-1'                       AS area,
    '-1'                       AS seo_source,
    origin_channel,
    '-1'                       AS hourinfo,
    quarterinfo,
    CONCAT(yearinfo, '-', monthinfo) AS time_str,
    '-1'                       AS from_url,
    '3'                        AS grouptype,
    '3'                        AS time_type,
    yearinfo,
    monthinfo,
    '-1'                       AS dayinfo
FROM itcast_dwd.visit_consult_dwd
GROUP BY
    origin_channel,
    yearinfo,
    quarterinfo,
    monthinfo;
季度维度:
-- Quarterly visit metrics per origin channel (grouptype '3', time_type '4'); hour, month and day are '-1'.
INSERT INTO TABLE itcast_dws.visit_dws PARTITION (yearinfo, monthinfo, dayinfo)
SELECT
    COUNT(DISTINCT sid)        AS sid_total,
    COUNT(DISTINCT session_id) AS session_total,
    COUNT(DISTINCT ip)         AS ip_total,
    '-1'                       AS area,
    '-1'                       AS seo_source,
    origin_channel,
    '-1'                       AS hourinfo,
    quarterinfo,
    CONCAT(yearinfo, '-Q', quarterinfo) AS time_str,
    '-1'                       AS from_url,
    '3'                        AS grouptype,
    '4'                        AS time_type,
    yearinfo,
    '-1'                       AS monthinfo,
    '-1'                       AS dayinfo
FROM itcast_dwd.visit_consult_dwd
GROUP BY
    origin_channel,
    yearinfo,
    quarterinfo;
年维度:
-- Yearly visit metrics per origin channel (groupType '3', time_type '5').
-- hour, quarter, month and day are all the '-1' placeholder; time_str is the year itself.
insert into table itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
select
    count(distinct v.sid)        as sid_total,
    count(distinct v.session_id) as sessionid_total,
    count(distinct v.ip)         as ip_total,
    '-1'                         as area,
    '-1'                         as seo_source,
    v.origin_channel,
    '-1'                         as hourinfo,
    '-1'                         as quarterinfo,
    v.yearinfo                   as time_str,
    '-1'                         as from_url,
    '3'                          as groupType,
    '5'                          as time_type,
    v.yearinfo                   as yearinfo,
    '-1'                         as monthinfo,
    '-1'                         as dayinfo
from itcast_dwd.visit_consult_dwd v
group by
    v.origin_channel,
    v.yearinfo;
4.6.3.2.4 会话来源页面分组
小时维度:
-- Hourly visit metrics per session source page (grouptype '4', time_type '1').
-- Attribute columns that are not grouped on carry the '-1' placeholder.
INSERT INTO TABLE itcast_dws.visit_dws PARTITION (yearinfo, monthinfo, dayinfo)
SELECT
    COUNT(DISTINCT sid)        AS sid_total,
    COUNT(DISTINCT session_id) AS session_total,
    COUNT(DISTINCT ip)         AS ip_total,
    '-1'                       AS area,
    '-1'                       AS seo_source,
    '-1'                       AS origin_channel,
    hourinfo,
    quarterinfo,
    CONCAT(yearinfo, '-', monthinfo, '-', dayinfo, ' ', hourinfo) AS time_str,
    from_url,
    '4'                        AS grouptype,
    '1'                        AS time_type,
    yearinfo,
    monthinfo,
    dayinfo
FROM itcast_dwd.visit_consult_dwd
GROUP BY
    from_url,
    yearinfo,
    quarterinfo,
    monthinfo,
    dayinfo,
    hourinfo;
天维度:
-- Daily visit metrics per session source page (grouptype '4', time_type '2'); hourinfo is '-1'.
INSERT INTO TABLE itcast_dws.visit_dws PARTITION (yearinfo, monthinfo, dayinfo)
SELECT
    COUNT(DISTINCT sid)        AS sid_total,
    COUNT(DISTINCT session_id) AS session_total,
    COUNT(DISTINCT ip)         AS ip_total,
    '-1'                       AS area,
    '-1'                       AS seo_source,
    '-1'                       AS origin_channel,
    '-1'                       AS hourinfo,
    quarterinfo,
    CONCAT(yearinfo, '-', monthinfo, '-', dayinfo) AS time_str,
    from_url,
    '4'                        AS grouptype,
    '2'                        AS time_type,
    yearinfo,
    monthinfo,
    dayinfo
FROM itcast_dwd.visit_consult_dwd
GROUP BY
    from_url,
    yearinfo,
    quarterinfo,
    monthinfo,
    dayinfo;
月维度:
-- Monthly visit metrics per session source page (grouptype '4', time_type '3'); hour and day are '-1'.
INSERT INTO TABLE itcast_dws.visit_dws PARTITION (yearinfo, monthinfo, dayinfo)
SELECT
    COUNT(DISTINCT sid)        AS sid_total,
    COUNT(DISTINCT session_id) AS session_total,
    COUNT(DISTINCT ip)         AS ip_total,
    '-1'                       AS area,
    '-1'                       AS seo_source,
    '-1'                       AS origin_channel,
    '-1'                       AS hourinfo,
    quarterinfo,
    CONCAT(yearinfo, '-', monthinfo) AS time_str,
    from_url,
    '4'                        AS grouptype,
    '3'                        AS time_type,
    yearinfo,
    monthinfo,
    '-1'                       AS dayinfo
FROM itcast_dwd.visit_consult_dwd
GROUP BY
    from_url,
    yearinfo,
    quarterinfo,
    monthinfo;
季度维度:
-- Quarterly visit metrics per session source page (grouptype '4', time_type '4'); hour, month and day are '-1'.
INSERT INTO TABLE itcast_dws.visit_dws PARTITION (yearinfo, monthinfo, dayinfo)
SELECT
    COUNT(DISTINCT sid)        AS sid_total,
    COUNT(DISTINCT session_id) AS session_total,
    COUNT(DISTINCT ip)         AS ip_total,
    '-1'                       AS area,
    '-1'                       AS seo_source,
    '-1'                       AS origin_channel,
    '-1'                       AS hourinfo,
    quarterinfo,
    CONCAT(yearinfo, '-Q', quarterinfo) AS time_str,
    from_url,
    '4'                        AS grouptype,
    '4'                        AS time_type,
    yearinfo,
    '-1'                       AS monthinfo,
    '-1'                       AS dayinfo
FROM itcast_dwd.visit_consult_dwd
GROUP BY
    from_url,
    yearinfo,
    quarterinfo;
年维度:
-- Yearly visit metrics per session source page (groupType '4', time_type '5').
-- hour, quarter, month and day are all the '-1' placeholder; time_str is the year itself.
insert into table itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
select
    count(distinct v.sid)        as sid_total,
    count(distinct v.session_id) as sessionid_total,
    count(distinct v.ip)         as ip_total,
    '-1'                         as area,
    '-1'                         as seo_source,
    '-1'                         as origin_channel,
    '-1'                         as hourinfo,
    '-1'                         as quarterinfo,
    v.yearinfo                   as time_str,
    v.from_url,
    '4'                          as groupType,
    '5'                          as time_type,
    v.yearinfo                   as yearinfo,
    '-1'                         as monthinfo,
    '-1'                         as dayinfo
from itcast_dwd.visit_consult_dwd v
group by
    v.from_url,
    v.yearinfo;
4.6.3.2.5 总访问量
小时(小时段区间的基础数据)
因为小时段数据可以直接sum求和,因此OLAP应用可以在小时数据基础上,进行简单的sum操作以获取到区间小时段数据。(注意:这些指标是去重计数,同一访客若出现在多个小时内,跨小时求和时会被重复计算,得到的是近似值。)
-- Hourly overall visit metrics (grouptype '5', time_type '1').
-- Every attribute column carries the '-1' placeholder; grouping is by time only.
INSERT INTO TABLE itcast_dws.visit_dws PARTITION (yearinfo, monthinfo, dayinfo)
SELECT
    COUNT(DISTINCT sid)        AS sid_total,
    COUNT(DISTINCT session_id) AS session_total,
    COUNT(DISTINCT ip)         AS ip_total,
    '-1'                       AS area,
    '-1'                       AS seo_source,
    '-1'                       AS origin_channel,
    hourinfo,
    quarterinfo,
    CONCAT(yearinfo, '-', monthinfo, '-', dayinfo, ' ', hourinfo) AS time_str,
    '-1'                       AS from_url,
    '5'                        AS grouptype,
    '1'                        AS time_type,
    yearinfo,
    monthinfo,
    dayinfo
FROM itcast_dwd.visit_consult_dwd
GROUP BY
    yearinfo,
    quarterinfo,
    monthinfo,
    dayinfo,
    hourinfo;
天
-- Daily overall visit metrics (grouptype '5', time_type '2'); all attribute columns and hourinfo are '-1'.
INSERT INTO TABLE itcast_dws.visit_dws PARTITION (yearinfo, monthinfo, dayinfo)
SELECT
    COUNT(DISTINCT sid)        AS sid_total,
    COUNT(DISTINCT session_id) AS session_total,
    COUNT(DISTINCT ip)         AS ip_total,
    '-1'                       AS area,
    '-1'                       AS seo_source,
    '-1'                       AS origin_channel,
    '-1'                       AS hourinfo,
    quarterinfo,
    CONCAT(yearinfo, '-', monthinfo, '-', dayinfo) AS time_str,
    '-1'                       AS from_url,
    '5'                        AS grouptype,
    '2'                        AS time_type,
    yearinfo,
    monthinfo,
    dayinfo
FROM itcast_dwd.visit_consult_dwd
GROUP BY
    yearinfo,
    quarterinfo,
    monthinfo,
    dayinfo;
月
-- Monthly overall visit metrics (grouptype '5', time_type '3'); attributes, hour and day are '-1'.
INSERT INTO TABLE itcast_dws.visit_dws PARTITION (yearinfo, monthinfo, dayinfo)
SELECT
    COUNT(DISTINCT sid)        AS sid_total,
    COUNT(DISTINCT session_id) AS session_total,
    COUNT(DISTINCT ip)         AS ip_total,
    '-1'                       AS area,
    '-1'                       AS seo_source,
    '-1'                       AS origin_channel,
    '-1'                       AS hourinfo,
    quarterinfo,
    CONCAT(yearinfo, '-', monthinfo) AS time_str,
    '-1'                       AS from_url,
    '5'                        AS grouptype,
    '3'                        AS time_type,
    yearinfo,
    monthinfo,
    '-1'                       AS dayinfo
FROM itcast_dwd.visit_consult_dwd
GROUP BY
    yearinfo,
    quarterinfo,
    monthinfo;
季度
-- Quarterly overall visit metrics (grouptype '5', time_type '4'); attributes, hour, month and day are '-1'.
INSERT INTO TABLE itcast_dws.visit_dws PARTITION (yearinfo, monthinfo, dayinfo)
SELECT
    COUNT(DISTINCT sid)        AS sid_total,
    COUNT(DISTINCT session_id) AS session_total,
    COUNT(DISTINCT ip)         AS ip_total,
    '-1'                       AS area,
    '-1'                       AS seo_source,
    '-1'                       AS origin_channel,
    '-1'                       AS hourinfo,
    quarterinfo,
    CONCAT(yearinfo, '-Q', quarterinfo) AS time_str,
    '-1'                       AS from_url,
    '5'                        AS grouptype,
    '4'                        AS time_type,
    yearinfo,
    '-1'                       AS monthinfo,
    '-1'                       AS dayinfo
FROM itcast_dwd.visit_consult_dwd
GROUP BY
    yearinfo,
    quarterinfo;
年
-- Yearly overall visit metrics (groupType '5', time_type '5').
-- Every attribute and sub-year time column is the '-1' placeholder; time_str is the year itself.
insert into table itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
select
    count(distinct v.sid)        as sid_total,
    count(distinct v.session_id) as sessionid_total,
    count(distinct v.ip)         as ip_total,
    '-1'                         as area,
    '-1'                         as seo_source,
    '-1'                         as origin_channel,
    '-1'                         as hourinfo,
    '-1'                         as quarterinfo,
    v.yearinfo                   as time_str,
    '-1'                         as from_url,
    '5'                          as groupType,
    '5'                          as time_type,
    v.yearinfo                   as yearinfo,
    '-1'                         as monthinfo,
    '-1'                         as dayinfo
from itcast_dwd.visit_consult_dwd v
group by
    v.yearinfo;
4.6.4 导出数据
4.6.4.1 创建mysql表
-- MySQL target for the exported visit metrics; column order mirrors the DWS insert statements.
-- IF NOT EXISTS makes both statements safe to re-run.
CREATE DATABASE IF NOT EXISTS scrm_bi DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
-- The table name is qualified with scrm_bi so it is created in the database the Sqoop export
-- connects to, whatever the session's current database is.
-- from_url holds a page URL, so it is given more room than the 32 characters used for the
-- short code-like columns.
CREATE TABLE IF NOT EXISTS `scrm_bi`.`itcast_visit` (
    sid_total int(11) COMMENT '根据sid去重求count',
    sessionid_total int(11) COMMENT '根据sessionid去重求count',
    ip_total int(11) COMMENT '根据IP去重求count',
    area varchar(32) COMMENT '区域信息',
    seo_source varchar(32) COMMENT '搜索来源',
    origin_channel varchar(32) COMMENT '来源渠道',
    hourinfo varchar(32) COMMENT '小时信息',
    quarterinfo varchar(32) COMMENT '季度',
    time_str varchar(32) COMMENT '时间明细',
    from_url varchar(1024) COMMENT '会话来源页面',
    groupType varchar(32) COMMENT '产品属性类型:1.地区;2.搜索来源;3.来源渠道;4.会话来源页面;5.总访问量',
    time_type varchar(32) COMMENT '时间聚合类型:1、按小时聚合;2、按天聚合;3、按月聚合;4、按季度聚合;5、按年聚合;',
    yearinfo varchar(32) COMMENT '年信息',
    monthinfo varchar(32) COMMENT '月信息',
    dayinfo varchar(32) COMMENT '日信息'
);
4.6.4.2 执行sqoop导出脚本
# Export Hive table itcast_dws.visit_dws (read through HCatalog) into MySQL table
# scrm_bi.itcast_visit using 100 map tasks.
sqoop export \
--connect "jdbc:mysql://192.168.52.150:3306/scrm_bi?useUnicode=true&characterEncoding=utf-8" \
--username root \
--password '123456' \
--hcatalog-database itcast_dws \
--hcatalog-table visit_dws \
--table itcast_visit \
--num-mappers 100