版权声明: https://blog.csdn.net/weixin_37254888/article/details/79884959
一、回顾
-》基于ETL结果进行数据仓库建模
-》ETL结果
true72.46.128.140-2013-09-18 07:58:50/hadoop-zookeeper-intro/20014722"https://www.google.com/""Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.1(KHTML,likeGecko)Chrome/21.0.1174.0Safari/537.1"
valid
ip
user_id
time
request
status
body_size
http_ref
user_agent
-》pageview模型
07f26862-f31c-40dd-ad91-2d61fac91a9c1.80.249.223-2013-09-18 07:57:33/hadoop-hive-intro/160"http://www.google.com.hk/url?sa=t&rct=j&q=hive%E7%9A%84%E5%AE%89%E8%A3%85&source=web&cd=2&ved=0CC4QFjAB&url=%68%74%74%70%3a%2f%2f%62%6c%6f%67%2e%66%65%6e%73%2e%6d%65%2f%68%61%64%6f%6f%70%2d%68%69%76%65%2d%69%6e%74%72%6f%2f&ei=5lw5Uo-2NpGZiQfCwoG4BA&usg=AFQjCNF8EFxPuCMrm7CvqVgzcBUzrJZStQ&bvm=bv.52164340,d.aGc&cad=rjt""Mozilla/5.0(WindowsNT5.2;rv:23.0)Gecko/20100101Firefox/23.0"14764200
session_id
ip
user_id
time
request
step
length
http_ref
user_agent
body_size
status
-》visit模型
004b0a11-081b-402c-be5c-8957f85a5abb180.153.163.1902013-09-18 09:37:412013-09-18 09:37:41/finance-rhive-repurchase//finance-rhive-repurchase/"-"1
session_id
ip
intime
outtime
inpage
outpage
refere
numpage
二、数据仓库的设计
-》表的类型
-》事实表
-》订单
订单id 订单类型 订单名称
-》用户表
-》行为表
-》维度表
-》时间维度
时间维度id 年 季度 月 周 日 小时
-》地域维度
地域维度id 国家 省份 城市
-》构成
某个时间某个地域的订单信息
时间维度id 地域维度id 订单id 订单类型 订单名称
-》常见模型
-》星型模型:存在冗余,但数据处理的效率较高
-》雪花模型:没有冗余,但数据处理时需要进行关联查询
三、构建数据仓库
-》构建ETL以后的源表
-- Source table holding the ETL output (one row per parsed log record).
create database if not exists weblog;
use weblog;

drop table if exists weblog_origin;
create table weblog_origin (
    valid      string,
    ip         string,
    user_id    string,
    time       string,
    request    string,
    status     string,
    body_size  string,
    http_ref   string,
    user_agent string
)
partitioned by (datestr string)
row format delimited fields terminated by '\001';

load data local inpath '/opt/datas/part-r-etl'
into table weblog_origin partition (datestr = '20180409');
-- pageview model: one row per page view within a session.
drop table if exists click_pageviews;
create table click_pageviews (
    session    string,
    ip         string,
    user_id    string,
    time       string,
    request    string,
    step       string,
    staylong   string,
    http_ref   string,
    user_agent string,
    body_size  string,
    status     string
)
partitioned by (datestr string)
row format delimited fields terminated by '\001';

load data local inpath '/opt/datas/part-r-pv'
into table click_pageviews partition (datestr = '20180409');

select session, ip, time, step, staylong from click_pageviews;
-- visit model: one row per session (entry/exit page and time).
-- BUGFIX: original read "drop table if exist", which is a Hive syntax error;
-- the keyword is "exists".
drop table if exists click_visit;
create table click_visit (
    session string,
    ip      string,
    inTime  string,
    outTime string,
    inPage  string,
    outPage string,
    refere  string,
    pageNum int
)
partitioned by (datestr string);

load data local inpath '/opt/datas/part-r-visit'
into table click_visit partition (datestr = '20180409');
-- Detail (fact) table: enriched log rows with derived time and referer fields.
-- BUGFIX: use "drop table if exists" so the script is re-runnable even when
-- the table does not exist yet (a bare drop fails on first run).
drop table if exists weblog_detail;
create table weblog_detail (
    valid        string, -- validity flag from ETL
    ip           string, -- client IP
    user_id      string, -- user identifier
    time         string, -- full access timestamp
    daystr       string, -- access date (yyyy-MM-dd), derived from time
    timestr      string, -- access time (HH:mm:ss), derived from time
    month        string, -- access month, derived from time
    day          string, -- access day of month, derived from time
    hour         string, -- access hour, derived from time
    request      string, -- requested url
    status       string, -- response code
    body_size    string, -- bytes transferred
    http_ref     string, -- referer url
    ref_host     string, -- referer host, parsed from http_ref
    ref_path     string, -- referer path, parsed from http_ref
    ref_query    string, -- referer query string, parsed from http_ref
    ref_query_id string, -- value of the "id" query parameter
    user_agent   string  -- client user-agent string
)
partitioned by (datestr string);
--时间字段:需要通过截取time字段得到以下字段
daystr string, --访问日期
timestr string, --访问时间
month string, --访问月
day string, --访问日
hour string, --访问时
解决:substring:2013-09-18 09:37:41
-- Derive date/time parts from a "yyyy-MM-dd HH:mm:ss" string,
-- e.g. 2013-09-18 09:37:41.
select
    substring(time, 1, 10) as daystr,  -- yyyy-MM-dd
    substring(time, 12)    as timestr, -- HH:mm:ss
    substring(time, 6, 2)  as month,
    substring(time, 9, 2)  as day,
    substring(time, 12, 2) as hour
from weblog_origin;
--来源详情数据:需要通过解析来源url得到
ref_host string, --来源的host
ref_path string, --来源的路径
ref_query string, --来源参数query
ref_query_id string, --来源参数query的值
解决:parse_url_tuple,专门用于解析url
用法:parse_url_tuple(url, partname1, partname2, ..., partnameN)
parse_url_tuple(http_ref,'HOST','PATH','QUERY','QUERY:id')
-》将httpref变成标准的url格式,去除双引号
-》regexp_replace:使用正则表达式进行替换
regexp_replace(str, regexp, rep)
-- Strip the surrounding double quotes from http_ref to get a standard url.
select regexp_replace(http_ref, "\"", "") from weblog_origin limit 10;
-》对http_ref进行解析
-- Parse the cleaned referer url into host / path / query / query-id parts.
select parse_url_tuple(regexp_replace(http_ref, "\"", ""), 'HOST', 'PATH', 'QUERY', 'QUERY:id')
from weblog_origin limit 10;
-》生成详情表
LATERAL VIEW:经常与explode、parse_url_tuple等udtf函数连用,实现行列转换或者一对多
hadoop hive hbase -> explode -> hadoop
hive
hbase
-- Populate the detail table: time parts via substring, referer parts via the
-- parse_url_tuple UDTF joined back to each source row with LATERAL VIEW.
insert overwrite table weblog_detail partition (datestr = '20180409')
select
    a.valid,
    a.ip,
    a.user_id,
    a.time,
    substring(a.time, 1, 10) as daystr,
    substring(a.time, 12)    as timestr,
    substring(a.time, 6, 2)  as month,
    substring(a.time, 9, 2)  as day,
    substring(a.time, 12, 2) as hour,
    a.request,
    a.status,
    a.body_size,
    a.http_ref,
    b.ref_host,
    b.ref_path,
    b.ref_query,
    b.ref_query_id,
    a.user_agent
from weblog_origin a
lateral view parse_url_tuple(regexp_replace(http_ref, "\"", ""),
        'HOST', 'PATH', 'QUERY', 'QUERY:id') b
    as ref_host, ref_path, ref_query, ref_query_id
where a.time != 'null';

select daystr, timestr, month, ref_host, ref_path, ref_query
from weblog_detail limit 10;
四、模块分析
-》SQL语句结构
select 字段条件 from 数据源(表、视图、子查询、join) where 对字段的值进行过滤 group by 分组字段 having 分组后数据进行过滤 order by 排序字段 asc|desc limit
-》浏览分析
-》统计每小时的PV数
create table if not exists pv_hour as select daystr ,hour ,count(request) as pv from weblog_detail group by daystr,hour;
select daystr ,hour ,count(request) as pv from weblog_detail group by daystr,hour order by hour asc,pv desc;
-》统计每天的pv数
create table if not exists pv_day as select daystr ,count(request) as pv from weblog_detail group by daystr;
-》统计每月的pv数
create table if not exists pv_month as select month,count(request) as pv from weblog_detail group by month;
-》统计不同终端维度的PV数
create table if not exists pv_agent as select user_agent,count(request) as pv from weblog_detail group by user_agent limit 10;
-》统计不同终端类型
select distinct(user_agent) from weblog_detail where user_agent like '%Mozilla%' limit 10;
-》统计每个月每个终端的pv数
create table if not exists pv_month_agent as select month,user_agent,count(request) as pv from weblog_detail group by month , user_agent ;
-》统计人均pv数
-》先求每个人的pv数
select ip,count(request) as pv_per from weblog_detail group by ip;
-》平均
select sum(a.pv_per) / count(ip) as pv_avg from (select ip,count(request) as pv_per from weblog_detail group by ip) a;
-》统计不同来源的pv数
create table if not exists pv_ref as select http_ref,ref_host,count(request) as pv from weblog_detail group by http_ref ,ref_host limit 10;
-》统计一天内每个小时来源中产生最多的pv的TOPN
-》每个小时每个来源的pv数
-- PV count per (day, hour, referer url, referer host).
-- BUGFIX: Hive CTAS requires "as" between the table definition and the select.
create table if not exists pv_refer as
select daystr, hour, http_ref, ref_host, count(request) as pvs
from weblog_detail
group by daystr, hour, http_ref, ref_host
limit 10;
-》求topN
concat():用于拼接字符串
-- Top-3 referer hosts by pv within each (day, hour) bucket.
-- BUGFIX: the original was missing "by" after "partition" and had no FROM
-- clause in the inner query; read from the pv_refer table built above.
select
    tmp.hour,
    tmp.ref_host,
    tmp.pvs,
    tmp.od
from (
    select
        concat(daystr, hour) as hour,
        ref_host,
        pvs,
        row_number() over (partition by concat(daystr, hour) order by pvs desc) as od
    from pv_refer
) tmp
where tmp.od <= 3;
-》受访分析
-》统计每日热门的受访页面的topN
-》统计每天每个页面的访问次数
-- Top-3 most visited pages per day.
select
    t2.daystr,
    t2.request,
    t2.pvs,
    t2.od
from (
    select
        t1.daystr,
        t1.request,
        t1.pvs,
        row_number() over (partition by t1.daystr order by t1.pvs desc) as od
    from (
        -- visits per page per day
        select daystr, request, count(request) as pvs
        from weblog_detail
        group by daystr, request
    ) t1
) t2
where od < 4;
-》访客分析
-》每个访客每小时的访问数
select daystr,hour,ip,count(1) as pvs from weblog_detail group by daystr,hour,ip;
-》每个小时的ip访问数
select daystr,hour,count(ip) as pvs from weblog_detail group by daystr,hour;
-》每天的/每月的ip访问数
-- ip hits per day. BUGFIX: missing comma between daystr and count(ip).
select daystr, count(ip) as pvs from weblog_detail group by daystr;
select month,count(ip) as pvs from weblog_detail group by month;
-》统计新增访客
-》历史访客表:用于存储所有访问过的用户的ip
-- History table: every ip ever seen, with the day it was first recorded.
-- BUGFIX: terminate the statement with a semicolon.
create table history(day string, ip string);
-》判断当前ip是否是一个新的用户
select * from weblog_detail a join history b on a.ip = b.ip;
-》将新增访客的id写入history
-- Append newly-seen visitor ips (no match in history) to the history table.
-- BUGFIX: the subquery aliases a.ip as new_ip, so the outer select must
-- reference tmp.new_ip — tmp exposes no column named "ip".
insert into table history
select
    tmp.daystr as day,
    tmp.new_ip as ip
from (
    select a.daystr, a.ip as new_ip, b.ip as old_ip
    from weblog_detail a
    left join history b on a.ip = b.ip
) tmp
where tmp.old_ip is null;
-》visit模型分析
-》单次访客统计
默认使用ip表示用户,使用session表示用户访问的次数
-- Visitors with exactly one visit (one distinct session per ip).
-- BUGFIX: WHERE cannot follow GROUP BY nor reference the aggregate alias;
-- filter groups with HAVING on the aggregate instead.
select ip, count(distinct session) as number
from click_visit
group by ip
having count(distinct session) = 1;
-》回头访客个数
-- Returning visitors (more than one distinct session per ip).
-- BUGFIX: WHERE cannot follow GROUP BY nor reference the aggregate alias;
-- filter groups with HAVING on the aggregate instead.
select ip, count(distinct session) as number
from click_visit
group by ip
having count(distinct session) > 1;
-》人均访问频率
所有用户的总次数/用户的个数
-- Average visit frequency: total sessions across all users / number of users.
select sum(t.number) / count(t.ip)
from (
    select ip, count(distinct session) as number
    from click_visit
    group by ip
) t;
-》用户平均访问页数
总页数/ip数
select sum(pageNum) / count(ip) from click_visit;
-》基于ETL结果进行数据仓库建模
-》ETL结果
true72.46.128.140-2013-09-18 07:58:50/hadoop-zookeeper-intro/20014722"https://www.google.com/""Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.1(KHTML,likeGecko)Chrome/21.0.1174.0Safari/537.1"
valid
ip
user_id
time
request
status
body_size
http_ref
user_agent
-》pageview模型
07f26862-f31c-40dd-ad91-2d61fac91a9c1.80.249.223-2013-09-18 07:57:33/hadoop-hive-intro/160"http://www.google.com.hk/url?sa=t&rct=j&q=hive%E7%9A%84%E5%AE%89%E8%A3%85&source=web&cd=2&ved=0CC4QFjAB&url=%68%74%74%70%3a%2f%2f%62%6c%6f%67%2e%66%65%6e%73%2e%6d%65%2f%68%61%64%6f%6f%70%2d%68%69%76%65%2d%69%6e%74%72%6f%2f&ei=5lw5Uo-2NpGZiQfCwoG4BA&usg=AFQjCNF8EFxPuCMrm7CvqVgzcBUzrJZStQ&bvm=bv.52164340,d.aGc&cad=rjt""Mozilla/5.0(WindowsNT5.2;rv:23.0)Gecko/20100101Firefox/23.0"14764200
session_id
ip
user_id
time
request
step
length
http_ref
user_agent
body_size
status
-》visit模型
004b0a11-081b-402c-be5c-8957f85a5abb180.153.163.1902013-09-18 09:37:412013-09-18 09:37:41/finance-rhive-repurchase//finance-rhive-repurchase/"-"1
session_id
ip
intime
outtime
inpage
outpage
refere
numpage
二、数据仓库的设计
-》表的类型
-》事实表
-》订单
订单id 订单类型 订单名称
-》用户表
-》行为表
-》维度表
-》时间维度
时间维度id 年 季度 月 周 日 小时
-》地域维度
地域维度id 国家 省份 城市
-》构成
某个时间某个地域的订单信息
时间维度id 地域维度id 订单id 订单类型 订单名称
-》常见模型
-》星型模型:存在冗余,但数据处理的效率较高
-》雪花模型:没有冗余,但数据处理时需要进行关联查询
三、构建数据仓库
-》构建ETL以后的源表
create database if not exists weblog;
use weblog;
drop table if exists weblog_origin;
create table weblog_origin(
valid string,
ip string,
user_id string,
time string,
request string,
status string,
body_size string,
http_ref string,
user_agent string)
partitioned by (datestr string)
row format delimited fields terminated by '\001';
load data local inpath '/opt/datas/part-r-etl' into table weblog_origin partition (datestr = '20180409');
--page view 模型
drop table if exists click_pageviews;
create table click_pageviews(
session string,
ip string,
user_id string,
time string,
request string,
step string,
staylong string,
http_ref string,
user_agent string,
body_size string,
status string)
partitioned by (datestr string)
row format delimited
fields terminated by '\001';
load data local inpath '/opt/datas/part-r-pv' into table click_pageviews partition (datestr = '20180409');
select session,ip,time,step,staylong from click_pageviews;
-- visit model: one row per session (entry/exit page and time).
-- BUGFIX: original read "drop table if exist", which is a Hive syntax error;
-- the keyword is "exists".
drop table if exists click_visit;
create table click_visit (
    session string,
    ip      string,
    inTime  string,
    outTime string,
    inPage  string,
    outPage string,
    refere  string,
    pageNum int
)
partitioned by (datestr string);

load data local inpath '/opt/datas/part-r-visit'
into table click_visit partition (datestr = '20180409');
-- Detail (fact) table: enriched log rows with derived time and referer fields.
-- BUGFIX: use "drop table if exists" so the script is re-runnable even when
-- the table does not exist yet (a bare drop fails on first run).
drop table if exists weblog_detail;
create table weblog_detail (
    valid        string, -- validity flag from ETL
    ip           string, -- client IP
    user_id      string, -- user identifier
    time         string, -- full access timestamp
    daystr       string, -- access date (yyyy-MM-dd), derived from time
    timestr      string, -- access time (HH:mm:ss), derived from time
    month        string, -- access month, derived from time
    day          string, -- access day of month, derived from time
    hour         string, -- access hour, derived from time
    request      string, -- requested url
    status       string, -- response code
    body_size    string, -- bytes transferred
    http_ref     string, -- referer url
    ref_host     string, -- referer host, parsed from http_ref
    ref_path     string, -- referer path, parsed from http_ref
    ref_query    string, -- referer query string, parsed from http_ref
    ref_query_id string, -- value of the "id" query parameter
    user_agent   string  -- client user-agent string
)
partitioned by (datestr string);
--时间字段:需要通过截取time字段得到以下字段
daystr string, --访问日期
timestr string, --访问时间
month string, --访问月
day string, --访问日
hour string, --访问时
解决:substring:2013-09-18 09:37:41
select
substring(time,1,10) as daystr,
substring(time,12) as timestr,
substring(time,6,2) as month,
substring(time,9,2) as day,
substring(time,12,2) as hour
from weblog_origin;
--来源详情数据:需要通过解析来源url得到
ref_host string, --来源的host
ref_path string, --来源的路径
ref_query string, --来源参数query
ref_query_id string, --来源参数query的值
解决:parse_url_tuple,专门用于解析url
用法:parse_url_tuple(url, partname1, partname2, ..., partnameN)
parse_url_tuple(http_ref,'HOST','PATH','QUERY','QUERY:id')
-》将httpref变成标准的url格式,去除双引号
-》regexp_replace:使用正则表达式进行替换
regexp_replace(str, regexp, rep)
select regexp_replace(http_ref,"\"","") from weblog_origin limit 10;
-》对http_ref进行解析
select parse_url_tuple(regexp_replace(http_ref,"\"",""),'HOST','PATH','QUERY','QUERY:id') from weblog_origin limit 10;
-》生成详情表
LATERAL VIEW:经常与explode、parse_url_tuple等udtf函数连用,实现行列转换或者一对多
hadoop hive hbase -> explode -> hadoop
hive
hbase
insert overwrite table weblog_detail partition (datestr='20180409')
select
a.valid,
a.ip,
a.user_id,
a.time,
substring(a.time,1,10) as daystr,
substring(a.time,12) as timestr,
substring(a.time,6,2) as month,
substring(a.time,9,2) as day,
substring(a.time,12,2) as hour,
a.request,
a.status,
a.body_size,
a.http_ref,
b.ref_host,
b.ref_path,
b.ref_query,
b.ref_query_id,
a.user_agent
from weblog_origin a LATERAL VIEW parse_url_tuple(regexp_replace(http_ref,"\"",""),'HOST','PATH','QUERY','QUERY:id') b as ref_host,ref_path,ref_query,ref_query_id where a.time != 'null';
select daystr,timestr,month,ref_host,ref_path,ref_query from weblog_detail limit 10;
四、模块分析
-》SQL语句结构
select 字段条件 from 数据源(表、视图、子查询、join) where 对字段的值进行过滤 group by 分组字段 having 分组后数据进行过滤 order by 排序字段 asc|desc limit
-》浏览分析
-》统计每小时的PV数
create table if not exists pv_hour as select daystr ,hour ,count(request) as pv from weblog_detail group by daystr,hour;
select daystr ,hour ,count(request) as pv from weblog_detail group by daystr,hour order by hour asc,pv desc;
-》统计每天的pv数
create table if not exists pv_day as select daystr ,count(request) as pv from weblog_detail group by daystr;
-》统计每月的pv数
create table if not exists pv_month as select month,count(request) as pv from weblog_detail group by month;
-》统计不同终端维度的PV数
create table if not exists pv_agent as select user_agent,count(request) as pv from weblog_detail group by user_agent limit 10;
-》统计不同终端类型
select distinct(user_agent) from weblog_detail where user_agent like '%Mozilla%' limit 10;
-》统计每个月每个终端的pv数
create table if not exists pv_month_agent as select month,user_agent,count(request) as pv from weblog_detail group by month , user_agent ;
-》统计人均pv数
-》先求每个人的pv数
select ip,count(request) as pv_per from weblog_detail group by ip;
-》平均
select sum(a.pv_per) / count(ip) as pv_avg from (select ip,count(request) as pv_per from weblog_detail group by ip) a;
-》统计不同来源的pv数
create table if not exists pv_ref as select http_ref,ref_host,count(request) as pv from weblog_detail group by http_ref ,ref_host limit 10;
-》统计一天内每个小时来源中产生最多的pv的TOPN
-》每个小时每个来源的pv数
-- PV count per (day, hour, referer url, referer host).
-- BUGFIX: Hive CTAS requires "as" between the table definition and the select.
create table if not exists pv_refer as
select daystr, hour, http_ref, ref_host, count(request) as pvs
from weblog_detail
group by daystr, hour, http_ref, ref_host
limit 10;
-》求topN
concat():用于拼接字符串
-- Top-3 referer hosts by pv within each (day, hour) bucket.
-- BUGFIX: the original was missing "by" after "partition" and had no FROM
-- clause in the inner query; read from the pv_refer table built above.
select
    tmp.hour,
    tmp.ref_host,
    tmp.pvs,
    tmp.od
from (
    select
        concat(daystr, hour) as hour,
        ref_host,
        pvs,
        row_number() over (partition by concat(daystr, hour) order by pvs desc) as od
    from pv_refer
) tmp
where tmp.od <= 3;
-》受访分析
-》统计每日热门的受访页面的topN
-》统计每天每个页面的访问次数
select
tmp2.daystr,
tmp2.request,
tmp2.pvs,
tmp2.od
from(
select
tmp.daystr,
tmp.request,
tmp.pvs,
row_number() over (partition by tmp.daystr order by tmp.pvs desc) as od
from(
select daystr,request,count(request) as pvs from weblog_detail group by daystr ,request) as tmp ) as tmp2 where od < 4;
-》访客分析
-》每个访客每小时的访问数
select daystr,hour,ip,count(1) as pvs from weblog_detail group by daystr,hour,ip;
-》每个小时的ip访问数
select daystr,hour,count(ip) as pvs from weblog_detail group by daystr,hour;
-》每天的/每月的ip访问数
-- ip hits per day. BUGFIX: missing comma between daystr and count(ip).
select daystr, count(ip) as pvs from weblog_detail group by daystr;
select month,count(ip) as pvs from weblog_detail group by month;
-》统计新增访客
-》历史访客表:用于存储所有访问过的用户的ip
-- History table: every ip ever seen, with the day it was first recorded.
-- BUGFIX: terminate the statement with a semicolon.
create table history(day string, ip string);
-》判断当前ip是否是一个新的用户
select * from weblog_detail a join history b on a.ip = b.ip;
-》将新增访客的id写入history
-- Append newly-seen visitor ips (no match in history) to the history table.
-- BUGFIX: the subquery aliases a.ip as new_ip, so the outer select must
-- reference tmp.new_ip — tmp exposes no column named "ip".
insert into table history
select
    tmp.daystr as day,
    tmp.new_ip as ip
from (
    select a.daystr, a.ip as new_ip, b.ip as old_ip
    from weblog_detail a
    left join history b on a.ip = b.ip
) tmp
where tmp.old_ip is null;
-》visit模型分析
-》单次访客统计
默认使用ip表示用户,使用session表示用户访问的次数
-- Visitors with exactly one visit (one distinct session per ip).
-- BUGFIX: WHERE cannot follow GROUP BY nor reference the aggregate alias;
-- filter groups with HAVING on the aggregate instead.
select ip, count(distinct session) as number
from click_visit
group by ip
having count(distinct session) = 1;
-》回头访客个数
-- Returning visitors (more than one distinct session per ip).
-- BUGFIX: WHERE cannot follow GROUP BY nor reference the aggregate alias;
-- filter groups with HAVING on the aggregate instead.
select ip, count(distinct session) as number
from click_visit
group by ip
having count(distinct session) > 1;
-》人均访问频率
所有用户的总次数/用户的个数
select sum(tmp.number)/count(tmp.ip)
from(
select ip,count(distinct session) as number from click_visit group by ip ) as tmp;
-》用户平均访问页数
总页数/ip数
select sum(pageNum) / count(ip) from click_visit;