hive数据仓库建设

hive数据仓库建设

1、设计原生日志表

原生日志表用来存放上报的原始日志,数据经过清洗加工后会进入到各个日志表中。

1.1 创建数据库

#创建数据库
$hive>create database umeng_big11 ;

1.2 创建原生日志表

原生表使用分区表设计,分区字段为ym/d/hm,hive使用动态分区表,分区采用非严格模式,即所有分区都可以是动态分区。hive命令行终端打开显式表头设置:

#临时设置,只在当前回话有效
$hive>set hive.cli.print.header=true ;

永久配置hive-site.xml:

...
<property>
  <name>hive.cli.print.header</name>
  <value>true</value>
</property>
...

创建hive原生日志表raw_logs:

$hive>create table raw_logs
(
  servertimems  float ,
  servertimestr string , 
  clientip      string ,
  clienttimems  bigint,
  status        int ,
  log           string 
)
PARTITIONED BY (ym int, day int , hm int)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '#'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

1.3 加载hdfs数据到hive原生表

$hive>use umeng_big11 ;
$hive>load data inpath '/user/centos/umeng/raw-logs/201805/24/1809' into table raw_logs partition(ym=201805 , day = 24 , hm = 1809) ;

2、自定义UDF函数完成数据清洗

2.1 介绍

将原生数据加载原生日志表后,将status码为200的记录查询出来进行清洗,将结果分别插入到5类日志子表中。供以后分析使用。

3、创建日志子表

3.1 准备sql语句

创建/home/centos/umeng/umeng_create_logs_ddl.sql文件,内容如下:

--使用指定库
use umeng_big11 ;

--startuplogs
create table if not exists startuplogs
(
  appChannel    string ,
  appId         string ,
  appPlatform   string ,
  appVersion    string ,
  brand         string ,
  carrier       string ,
  country       string ,
  createdAtMs   bigint ,
  deviceId      string ,
  deviceStyle   string ,
  ipAddress     string ,
  network       string ,
  osType        string ,
  province      string ,
  screenSize    string ,
  tenantId      string
) 
partitioned by (ym int ,day int , hm int) 
stored as parquet ; 

--eventlogs
create table if not exists eventlogs
(
  appChannel        string ,
  appId             string ,
  appPlatform       string ,
  appVersion        string ,
  createdAtMs       bigint ,
  deviceId          string ,
  deviceStyle       string ,
  eventDurationSecs bigint ,
  eventId           string ,
  osType            string ,
  tenantId          string
) 
partitioned by (ym int ,day int , hm int) 
stored as parquet ; 

--errorlogs
create table if not exists errorlogs
(
  appChannel    string ,
  appId         string ,
  appPlatform   string ,
  appVersion    string ,
  createdAtMs   bigint ,
  deviceId      string ,
  deviceStyle   string ,
  errorBrief    string ,
  errorDetail   string ,
  osType        string ,
  tenantId      string
) 
partitioned by (ym int ,day int , hm int) 
stored as parquet ; 

--usagelogs
create table if not exists usgaelogs
(
  appChannel            string ,
  appId                 string ,
  appPlatform           string ,
  appVersion            string ,
  createdAtMs           bigint ,
  deviceId              string ,
  deviceStyle           string ,
  osType                string ,
  singleDownloadTraffic bigint ,
  singleUploadTraffic   bigint ,
  singleUseDurationSecs bigint ,
  tenantId              string
) 
partitioned by (ym int ,day int , hm int) 
stored as parquet ; 

--pagelogs
create table if not exists pagelogs
(
  appChannel            string ,
  appId                 string ,
  appPlatform           string ,
  appVersion            string ,
  createdAtMs           bigint ,
  deviceId              string ,
  deviceStyle           string ,
  nextPage              string ,
  osType                string ,
  pageId                string ,
  pageViewCntInSession  int ,
  stayDurationSecs      bigint ,
  tenantId              string ,
  visitIndex            int
) 
partitioned by (ym int ,day int , hm int) 
stored as parquet ; 

3.2 执行sql脚本

$hive>source /home/centos/umeng/umeng_create_logs_ddl.sql

猜你喜欢

转载自www.cnblogs.com/xupccc/p/9545701.html