版权声明: https://blog.csdn.net/weixin_37254888/article/details/79884959
一、回顾
-》基于ETL结果进行数据仓库建模
-》ETL结果
true72.46.128.140-2013-09-18 07:58:50/hadoop-zookeeper-intro/20014722"https://www.google.com/""Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.1(KHTML,likeGecko)Chrome/21.0.1174.0Safari/537.1"
valid
ip
user_id
time
request
status
body_size
http_ref
user_agent
-》pageview模型
07f26862-f31c-40dd-ad91-2d61fac91a9c1.80.249.223-2013-09-18 07:57:33/hadoop-hive-intro/160"http://www.google.com.hk/url?sa=t&rct=j&q=hive%E7%9A%84%E5%AE%89%E8%A3%85&source=web&cd=2&ved=0CC4QFjAB&url=%68%74%74%70%3a%2f%2f%62%6c%6f%67%2e%66%65%6e%73%2e%6d%65%2f%68%61%64%6f%6f%70%2d%68%69%76%65%2d%69%6e%74%72%6f%2f&ei=5lw5Uo-2NpGZiQfCwoG4BA&usg=AFQjCNF8EFxPuCMrm7CvqVgzcBUzrJZStQ&bvm=bv.52164340,d.aGc&cad=rjt""Mozilla/5.0(WindowsNT5.2;rv:23.0)Gecko/20100101Firefox/23.0"14764200
session_id
ip
user_id
time
request
step
length
http_ref
user_agent
body_size
status
-》visit模型
004b0a11-081b-402c-be5c-8957f85a5abb180.153.163.1902013-09-18 09:37:412013-09-18 09:37:41/finance-rhive-repurchase//finance-rhive-repurchase/"-"1
session_id
ip
intime
outtime
inpage
outpage
refere
numpage
二、数据仓库的设计
-》表的类型
-》事实表
-》订单
订单id 订单类型 订单名称
-》用户表
-》行为表
-》维度表
-》时间维度
时间维度id 年 季度 月 周 日 小时
-》地域维度
地域维度id 国家 省份 城市
-》构成
某个时间某个地域的订单信息
时间维度id 地域维度id 订单id 订单类型 订单名称
-》常见模型
-》星型模型:存在冗余,但数据处理的效率较高
-》雪花模型:没有冗余,但数据处理时需要进行关联查询
三、构建数据仓库
-》构建ETL以后的源表
-- Source table holding the ETL output (one row per parsed log record).
create database if not exists weblog;
use weblog;

drop table if exists weblog_origin;
create table weblog_origin (
    valid      string,
    ip         string,
    user_id    string,
    time       string,
    request    string,
    status     string,
    body_size  string,
    http_ref   string,
    user_agent string
)
partitioned by (datestr string)
row format delimited fields terminated by '\001';

load data local inpath '/opt/datas/part-r-etl'
into table weblog_origin partition (datestr = '20180409');
-- pageview model: one row per page view within a session.
drop table if exists click_pageviews;
create table click_pageviews (
    session    string,
    ip         string,
    user_id    string,
    time       string,
    request    string,
    step       string,
    staylong   string,
    http_ref   string,
    user_agent string,
    body_size  string,
    status     string
)
partitioned by (datestr string)
row format delimited fields terminated by '\001';

load data local inpath '/opt/datas/part-r-pv'
into table click_pageviews partition (datestr = '20180409');

select session, ip, time, step, staylong from click_pageviews;
-- visit model: one row per session (entry/exit page and time).
-- BUGFIX: original read "drop table if exist", which is a Hive syntax error;
-- the keyword is "exists".
drop table if exists click_visit;
create table click_visit (
    session string,
    ip      string,
    inTime  string,
    outTime string,
    inPage  string,
    outPage string,
    refere  string,
    pageNum int
)
partitioned by (datestr string);

load data local inpath '/opt/datas/part-r-visit'
into table click_visit partition (datestr = '20180409');
-- Detail (fact) table: enriched log rows with derived time and referer fields.
-- BUGFIX: use "drop table if exists" so the script is re-runnable even when
-- the table does not exist yet (a bare drop fails on first run).
drop table if exists weblog_detail;
create table weblog_detail (
    valid        string, -- validity flag from ETL
    ip           string, -- client IP
    user_id      string, -- user identifier
    time         string, -- full access timestamp
    daystr       string, -- access date (yyyy-MM-dd), derived from time
    timestr      string, -- access time (HH:mm:ss), derived from time
    month        string, -- access month, derived from time
    day          string, -- access day of month, derived from time
    hour         string, -- access hour, derived from time
    request      string, -- requested url
    status       string, -- response code
    body_size    string, -- bytes transferred
    http_ref     string, -- referer url
    ref_host     string, -- referer host, parsed from http_ref
    ref_path     string, -- referer path, parsed from http_ref
    ref_query    string, -- referer query string, parsed from http_ref
    ref_query_id string, -- value of the "id" query parameter
    user_agent   string  -- client user-agent string
)
partitioned by (datestr string);
--时间字段:需要通过截取time字段得到以下字段
daystr string, --访问日期
timestr string, --访问时间
month string, --访问月
day string, --访问日
hour string, --访问时
解决:substring:2013-09-18 09:37:41
-- Derive date/time parts from a "yyyy-MM-dd HH:mm:ss" string,
-- e.g. 2013-09-18 09:37:41.
select
    substring(time, 1, 10) as daystr,  -- yyyy-MM-dd
    substring(time, 12)    as timestr, -- HH:mm:ss
    substring(time, 6, 2)  as month,
    substring(time, 9, 2)  as day,
    substring(time, 12, 2) as hour
from weblog_origin;
--来源详情数据:需要通过解析来源url得到
ref_host string, --来源的host
ref_path string, --来源的路径
ref_query string, --来源参数query
ref_query_id string, --来源参数query的值
解决:parse_url_tuple,专门用于解析url
用法:parse_url_tuple(url, partname1, partname2, ..., partnameN)
parse_url_tuple(http_ref,'HOST','PATH','QUERY','QUERY:id')
-》将httpref变成标准的url格式,去除双引号
-》regexp_replace:使用正则表达式进行替换
regexp_replace(str, regexp, rep)
-- Strip the surrounding double quotes from http_ref to get a standard url.
select regexp_replace(http_ref, "\"", "") from weblog_origin limit 10;
-》对http_ref进行解析
-- Parse the cleaned referer url into host / path / query / query-id parts.
select parse_url_tuple(regexp_replace(http_ref, "\"", ""), 'HOST', 'PATH', 'QUERY', 'QUERY:id')
from weblog_origin limit 10;
-》生成详情表
LATERAL VIEW:经常与explode、parse_url_tuple等udtf函数连用,实现行列转换或者一对多
hadoop hive hbase -> explode -> hadoop
hive
hbase
-- Populate the detail table: time parts via substring, referer parts via the
-- parse_url_tuple UDTF joined back to each source row with LATERAL VIEW.
insert overwrite table weblog_detail partition (datestr = '20180409')
select
    a.valid,
    a.ip,
    a.user_id,
    a.time,
    substring(a.time, 1, 10) as daystr,
    substring(a.time, 12)    as timestr,
    substring(a.time, 6, 2)  as month,
    substring(a.time, 9, 2)  as day,
    substring(a.time, 12, 2) as hour,
    a.request,
    a.status,
    a.body_size,
    a.http_ref,
    b.ref_host,
    b.ref_path,
    b.ref_query,
    b.ref_query_id,
    a.user_agent
from weblog_origin a
lateral view parse_url_tuple(regexp_replace(http_ref, "\"", ""),
        'HOST', 'PATH', 'QUERY', 'QUERY:id') b
    as ref_host, ref_path, ref_query, ref_query_id
where a.time != 'null';

select daystr, timestr, month, ref_host, ref_path, ref_query
from weblog_detail limit 10;
四、模块分析
-》SQL语句结构
select 字段条件 from 数据源(表、视图、子查询、join) where 对字段的值进行过滤 group by 分组字段 having 分组后数据进行过滤 order by 排序字段 asc|desc limit
-》浏览分析
-》统计每小时的PV数
create table if not exists pv_hour as select daystr ,hour ,count(request) as pv from weblog_detail group by daystr,hour;
select daystr ,hour ,count(request) as pv from weblog_detail group by daystr,hour order by hour asc,pv desc;
-》统计每天的pv数
create table if not exists pv_day as select daystr ,count(request) as pv from weblog_detail group by daystr;
-》统计每月的pv数
create table if not exists pv_month as select month,count(request) as pv from weblog_detail group by month;
-》统计不同终端维度的PV数
create table if not exists pv_agent as select user_agent,count(request) as pv from weblog_detail group by user_agent limit 10;
-》统计不同终端类型
select distinct(user_agent) from weblog_detail where user_agent like '%Mozilla%' limit 10;
-》统计每个月每个终端的pv数
create table if not exists pv_month_agent as select month,user_agent,count(request) as pv from weblog_detail group by month , user_agent ;
-》统计人均pv数
-》先求每个人的pv数
select ip,count(request) as pv_per from weblog_detail group by ip;
-》平均
select sum(a.pv_per) / count(ip) as pv_avg from (select ip,count(request) as pv_per from weblog_detail group by ip) a;
-》统计不同来源的pv数
create table if not exists pv_ref as select http_ref,ref_host,count(request) as pv from weblog_detail group by http_ref ,ref_host limit 10;
-》统计一天内每个小时来源中产生最多的pv的TOPN
-》每个小时每个来源的pv数
-- PV count per (day, hour, referer url, referer host).
-- BUGFIX: Hive CTAS requires "as" between the table definition and the select.
create table if not exists pv_refer as
select daystr, hour, http_ref, ref_host, count(request) as pvs
from weblog_detail
group by daystr, hour, http_ref, ref_host
limit 10;
-》求topN
concat():用于拼接字符串
-- Top-3 referer hosts by pv within each (day, hour) bucket.
-- BUGFIX: the original was missing "by" after "partition" and had no FROM
-- clause in the inner query; read from the pv_refer table built above.
select
    tmp.hour,
    tmp.ref_host,
    tmp.pvs,
    tmp.od
from (
    select
        concat(daystr, hour) as hour,
        ref_host,
        pvs,
        row_number() over (partition by concat(daystr, hour) order by pvs desc) as od
    from pv_refer
) tmp
where tmp.od <= 3;
-》受访分析
-》统计每日热门的受访页面的topN
-》统计每天每个页面的访问次数
-- Top-3 most visited pages per day.
select
    t2.daystr,
    t2.request,
    t2.pvs,
    t2.od
from (
    select
        t1.daystr,
        t1.request,
        t1.pvs,
        row_number() over (partition by t1.daystr order by t1.pvs desc) as od
    from (
        -- visits per page per day
        select daystr, request, count(request) as pvs
        from weblog_detail
        group by daystr, request
    ) t1
) t2
where od < 4;
-》访客分析
-》每个访客每小时的访问数
select daystr,hour,ip,count(1) as pvs from weblog_detail group by daystr,hour,ip;
-》每个小时的ip访问数
select daystr,hour,count(ip) as pvs from weblog_detail group by daystr,hour;
-》每天的/每月的ip访问数
-- ip hits per day. BUGFIX: missing comma between daystr and count(ip).
select daystr, count(ip) as pvs from weblog_detail group by daystr;
select month,count(ip) as pvs from weblog_detail group by month;
-》统计新增访客
-》历史访客表:用于存储所有访问过的用户的ip
-- History table: every ip ever seen, with the day it was first recorded.
-- BUGFIX: terminate the statement with a semicolon.
create table history(day string, ip string);
-》判断当前ip是否是一个新的用户
select * from weblog_detail a join history b on a.ip = b.ip;
-》将新增访客的id写入history
-- Append newly-seen visitor ips (no match in history) to the history table.
-- BUGFIX: the subquery aliases a.ip as new_ip, so the outer select must
-- reference tmp.new_ip — tmp exposes no column named "ip".
insert into table history
select
    tmp.daystr as day,
    tmp.new_ip as ip
from (
    select a.daystr, a.ip as new_ip, b.ip as old_ip
    from weblog_detail a
    left join history b on a.ip = b.ip
) tmp
where tmp.old_ip is null;
-》visit模型分析
-》单次访客统计
默认使用ip表示用户,使用session表示用户访问的次数
-- Visitors with exactly one visit (one distinct session per ip).
-- BUGFIX: WHERE cannot follow GROUP BY nor reference the aggregate alias;
-- filter groups with HAVING on the aggregate instead.
select ip, count(distinct session) as number
from click_visit
group by ip
having count(distinct session) = 1;
-》回头访客个数
-- Returning visitors (more than one distinct session per ip).
-- BUGFIX: WHERE cannot follow GROUP BY nor reference the aggregate alias;
-- filter groups with HAVING on the aggregate instead.
select ip, count(distinct session) as number
from click_visit
group by ip
having count(distinct session) > 1;
-》人均访问频率
所有用户的总次数/用户的个数
-- Average visit frequency: total sessions across all users / number of users.
select sum(t.number) / count(t.ip)
from (
    select ip, count(distinct session) as number
    from click_visit
    group by ip
) t;
-》用户平均访问页数
总页数/ip数
select sum(pageNum) / count(ip) from click_visit;
-》基于ETL结果进行数据仓库建模
-》ETL结果
true72.46.128.140-2013-09-18 07:58:50/hadoop-zookeeper-intro/20014722"https://www.google.com/""Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.1(KHTML,likeGecko)Chrome/21.0.1174.0Safari/537.1"
valid
ip
user_id
time
request
status
body_size
http_ref
user_agent
-》pageview模型
07f26862-f31c-40dd-ad91-2d61fac91a9c1.80.249.223-2013-09-18 07:57:33/hadoop-hive-intro/160"http://www.google.com.hk/url?sa=t&rct=j&q=hive%E7%9A%84%E5%AE%89%E8%A3%85&source=web&cd=2&ved=0CC4QFjAB&url=%68%74%74%70%3a%2f%2f%62%6c%6f%67%2e%66%65%6e%73%2e%6d%65%2f%68%61%64%6f%6f%70%2d%68%69%76%65%2d%69%6e%74%72%6f%2f&ei=5lw5Uo-2NpGZiQfCwoG4BA&usg=AFQjCNF8EFxPuCMrm7CvqVgzcBUzrJZStQ&bvm=bv.52164340,d.aGc&cad=rjt""Mozilla/5.0(WindowsNT5.2;rv:23.0)Gecko/20100101Firefox/23.0"14764200
session_id
ip
user_id
time
request
step
length
http_ref
user_agent
body_size
status
-》visit模型
004b0a11-081b-402c-be5c-8957f85a5abb180.153.163.1902013-09-18 09:37:412013-09-18 09:37:41/finance-rhive-repurchase//finance-rhive-repurchase/"-"1
session_id
ip
intime
outtime
inpage
outpage
refere
numpage
二、数据仓库的设计
-》表的类型
-》事实表
-》订单
订单id 订单类型 订单名称
-》用户表
-》行为表
-》维度表
-》时间维度
时间维度id 年 季度 月 周 日 小时
-》地域维度
地域维度id 国家 省份 城市
-》构成
某个时间某个地域的订单信息
时间维度id 地域维度id 订单id 订单类型 订单名称
-》常见模型
-》星型模型:存在冗余,但数据处理的效率较高
-》雪花模型:没有冗余,但数据处理时需要进行关联查询
三、构建数据仓库
-》构建ETL以后的源表
create database if not exists weblog;
use weblog;
drop table if exists weblog_origin;
create table weblog_origin(
valid string,
ip string,
user_id string,
time string,
request string,
status string,
body_size string,
http_ref string,
user_agent string)
partitioned by (datestr string)
row format delimited fields terminated by '\001';
load data local inpath '/opt/datas/part-r-etl' into table weblog_origin partition (datestr = '20180409');
--page view 模型
drop table if exists click_pageviews;
create table click_pageviews(
session string,
ip string,
user_id string,
time string,
request string,
step string,
staylong string,
http_ref string,
user_agent string,
body_size string,
status string)
partitioned by (datestr string)
row format delimited
fields terminated by '\001';
load data local inpath '/opt/datas/part-r-pv' into table click_pageviews partition (datestr = '20180409');
select session,ip,time,step,staylong from click_pageviews;
-- visit model: one row per session (entry/exit page and time).
-- BUGFIX: original read "drop table if exist", which is a Hive syntax error;
-- the keyword is "exists".
drop table if exists click_visit;
create table click_visit (
    session string,
    ip      string,
    inTime  string,
    outTime string,
    inPage  string,
    outPage string,
    refere  string,
    pageNum int
)
partitioned by (datestr string);

load data local inpath '/opt/datas/part-r-visit'
into table click_visit partition (datestr = '20180409');
-- Detail (fact) table: enriched log rows with derived time and referer fields.
-- BUGFIX: use "drop table if exists" so the script is re-runnable even when
-- the table does not exist yet (a bare drop fails on first run).
drop table if exists weblog_detail;
create table weblog_detail (
    valid        string, -- validity flag from ETL
    ip           string, -- client IP
    user_id      string, -- user identifier
    time         string, -- full access timestamp
    daystr       string, -- access date (yyyy-MM-dd), derived from time
    timestr      string, -- access time (HH:mm:ss), derived from time
    month        string, -- access month, derived from time
    day          string, -- access day of month, derived from time
    hour         string, -- access hour, derived from time
    request      string, -- requested url
    status       string, -- response code
    body_size    string, -- bytes transferred
    http_ref     string, -- referer url
    ref_host     string, -- referer host, parsed from http_ref
    ref_path     string, -- referer path, parsed from http_ref
    ref_query    string, -- referer query string, parsed from http_ref
    ref_query_id string, -- value of the "id" query parameter
    user_agent   string  -- client user-agent string
)
partitioned by (datestr string);
--时间字段:需要通过截取time字段得到以下字段
daystr string, --访问日期
timestr string, --访问时间
month string, --访问月
day string, --访问日
hour string, --访问时
解决:substring:2013-09-18 09:37:41
select
substring(time,1,10) as daystr,
substring(time,12) as timestr,
substring(time,6,2) as month,
substring(time,9,2) as day,
substring(time,12,2) as hour
from weblog_origin;
--来源详情数据:需要通过解析来源url得到
ref_host string, --来源的host
ref_path string, --来源的路径
ref_query string, --来源参数query
ref_query_id string, --来源参数query的值
解决:parse_url_tuple,专门用于解析url
用法:parse_url_tuple(url, partname1, partname2, ..., partnameN)
parse_url_tuple(http_ref,'HOST','PATH','QUERY','QUERY:id')
-》将httpref变成标准的url格式,去除双引号
-》regexp_replace:使用正则表达式进行替换
regexp_replace(str, regexp, rep)
select regexp_replace(http_ref,"\"","") from weblog_origin limit 10;
-》对http_ref进行解析
select parse_url_tuple(regexp_replace(http_ref,"\"",""),'HOST','PATH','QUERY','QUERY:id') from weblog_origin limit 10;
-》生成详情表
LATERAL VIEW:经常与explode、parse_url_tuple等udtf函数连用,实现行列转换或者一对多
hadoop hive hbase -> explode -> hadoop
hive
hbase
insert overwrite table weblog_detail partition (datestr='20180409')
select
a.valid,
a.ip,
a.user_id,
a.time,
substring(a.time,1,10) as daystr,
substring(a.time,12) as timestr,
substring(a.time,6,2) as month,
substring(a.time,9,2) as day,
substring(a.time,12,2) as hour,
a.request,
a.status,
a.body_size,
a.http_ref,
b.ref_host,
b.ref_path,
b.ref_query,
b.ref_query_id,
a.user_agent
from weblog_origin a LATERAL VIEW parse_url_tuple(regexp_replace(http_ref,"\"",""),'HOST','PATH','QUERY','QUERY:id') b as ref_host,ref_path,ref_query,ref_query_id where a.time != 'null';
select daystr,timestr,month,ref_host,ref_path,ref_query from weblog_detail limit 10;
四、模块分析
-》SQL语句结构
select 字段条件 from 数据源(表、视图、子查询、join) where 对字段的值进行过滤 group by 分组字段 having 分组后数据进行过滤 order by 排序字段 asc|desc limit
-》浏览分析
-》统计每小时的PV数
create table if not exists pv_hour as select daystr ,hour ,count(request) as pv from weblog_detail group by daystr,hour;
select daystr ,hour ,count(request) as pv from weblog_detail group by daystr,hour order by hour asc,pv desc;
-》统计每天的pv数
create table if not exists pv_day as select daystr ,count(request) as pv from weblog_detail group by daystr;
-》统计每月的pv数
create table if not exists pv_month as select month,count(request) as pv from weblog_detail group by month;
-》统计不同终端维度的PV数
create table if not exists pv_agent as select user_agent,count(request) as pv from weblog_detail group by user_agent limit 10;
-》统计不同终端类型
select distinct(user_agent) from weblog_detail where user_agent like '%Mozilla%' limit 10;
-》统计每个月每个终端的pv数
create table if not exists pv_month_agent as select month,user_agent,count(request) as pv from weblog_detail group by month , user_agent ;
-》统计人均pv数
-》先求每个人的pv数
select ip,count(request) as pv_per from weblog_detail group by ip;
-》平均
select sum(a.pv_per) / count(ip) as pv_avg from (select ip,count(request) as pv_per from weblog_detail group by ip) a;
-》统计不同来源的pv数
create table if not exists pv_ref as select http_ref,ref_host,count(request) as pv from weblog_detail group by http_ref ,ref_host limit 10;
-》统计一天内每个小时来源中产生最多的pv的TOPN
-》每个小时每个来源的pv数
-- PV count per (day, hour, referer url, referer host).
-- BUGFIX: Hive CTAS requires "as" between the table definition and the select.
create table if not exists pv_refer as
select daystr, hour, http_ref, ref_host, count(request) as pvs
from weblog_detail
group by daystr, hour, http_ref, ref_host
limit 10;
-》求topN
concat():用于拼接字符串
-- Top-3 referer hosts by pv within each (day, hour) bucket.
-- BUGFIX: the original was missing "by" after "partition" and had no FROM
-- clause in the inner query; read from the pv_refer table built above.
select
    tmp.hour,
    tmp.ref_host,
    tmp.pvs,
    tmp.od
from (
    select
        concat(daystr, hour) as hour,
        ref_host,
        pvs,
        row_number() over (partition by concat(daystr, hour) order by pvs desc) as od
    from pv_refer
) tmp
where tmp.od <= 3;
-》受访分析
-》统计每日热门的受访页面的topN
-》统计每天每个页面的访问次数
select
tmp2.daystr,
tmp2.request,
tmp2.pvs,
tmp2.od
from(
select
tmp.daystr,
tmp.request,
tmp.pvs,
row_number() over (partition by tmp.daystr order by tmp.pvs desc) as od
from(
select daystr,request,count(request) as pvs from weblog_detail group by daystr ,request) as tmp ) as tmp2 where od < 4;
-》访客分析
-》每个访客每小时的访问数
select daystr,hour,ip,count(1) as pvs from weblog_detail group by daystr,hour,ip;
-》每个小时的ip访问数
select daystr,hour,count(ip) as pvs from weblog_detail group by daystr,hour;
-》每天的/每月的ip访问数
-- ip hits per day. BUGFIX: missing comma between daystr and count(ip).
select daystr, count(ip) as pvs from weblog_detail group by daystr;
select month,count(ip) as pvs from weblog_detail group by month;
-》统计新增访客
-》历史访客表:用于存储所有访问过的用户的ip
-- History table: every ip ever seen, with the day it was first recorded.
-- BUGFIX: terminate the statement with a semicolon.
create table history(day string, ip string);
-》判断当前ip是否是一个新的用户
select * from weblog_detail a join history b on a.ip = b.ip;
-》将新增访客的id写入history
-- Append newly-seen visitor ips (no match in history) to the history table.
-- BUGFIX: the subquery aliases a.ip as new_ip, so the outer select must
-- reference tmp.new_ip — tmp exposes no column named "ip".
insert into table history
select
    tmp.daystr as day,
    tmp.new_ip as ip
from (
    select a.daystr, a.ip as new_ip, b.ip as old_ip
    from weblog_detail a
    left join history b on a.ip = b.ip
) tmp
where tmp.old_ip is null;
-》visit模型分析
-》单次访客统计
默认使用ip表示用户,使用session表示用户访问的次数
-- Visitors with exactly one visit (one distinct session per ip).
-- BUGFIX: WHERE cannot follow GROUP BY nor reference the aggregate alias;
-- filter groups with HAVING on the aggregate instead.
select ip, count(distinct session) as number
from click_visit
group by ip
having count(distinct session) = 1;
-》回头访客个数
-- Returning visitors (more than one distinct session per ip).
-- BUGFIX: WHERE cannot follow GROUP BY nor reference the aggregate alias;
-- filter groups with HAVING on the aggregate instead.
select ip, count(distinct session) as number
from click_visit
group by ip
having count(distinct session) > 1;
-》人均访问频率
所有用户的总次数/用户的个数
select sum(tmp.number)/count(tmp.ip)
from(
select ip,count(distinct session) as number from click_visit group by ip ) as tmp;
-》用户平均访问页数
总页数/ip数
select sum(pageNum) / count(ip) from click_visit;