版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/u012292754/article/details/86549178
1 项目思路
- 针对不同的业务创建不同的子表
* 数据存储格式 orcfile / parquet
* 数据压缩
* map output 数据压缩 snappy
* 外部表
* 分区表
2 实战
- 创建表
-- Drop any earlier copy of the raw log table so the script can be re-run from
-- scratch. The database name must be `default`, the same one the CREATE and
-- LOAD statements for web_log_src use; otherwise the old table survives and
-- the subsequent LOAD appends the log file to it a second time.
DROP TABLE IF EXISTS default.web_log_src;
-- Raw web access log table: every field is kept as a STRING.
-- RegexSerDe assigns the capture groups of "input.regex" to the columns in
-- declaration order (11 groups for the 11 columns below); a line that does
-- not match the pattern is read back with all columns NULL.
CREATE TABLE IF NOT EXISTS default.web_log_src (
    remote_addr          STRING,
    remote_user          STRING,
    time_local           STRING,
    request              STRING,
    status               STRING,
    body_bytes_sent      STRING,
    request_body         STRING,
    http_referer         STRING,
    http_user_agent      STRING,
    http_x_forwarded_for STRING,
    host                 STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    "input.regex" = "(\"[^ ]*\") (\"-|[^ ]*\") (\"[^\]]*\") (\"[^\"]*\") (\"[0-9]*\") (\"[0-9]*\") (-|[^ ]*) (\"[^ ]*\") (\"[^\"]*\") (-|[^ ]*) (\"[^ ]*\")"
)
STORED AS TEXTFILE;
- 加载数据
- https://cwiki.apache.org/confluence/display/Hive/GettingStarted#GettingStarted-ApacheWeblogData
- https://c.runoob.com/front-end/854
- https://issues.apache.org/jira/browse/HIVE-662
- https://www.cnblogs.com/cxchanpin/p/6911286.html
正则网站
https://www.regexpal.com/
http://www.txt2re.com/
-- Load the access log from the local filesystem (LOCAL) into the raw table.
-- There is no OVERWRITE keyword, so the file is added to whatever the table
-- already holds.
LOAD DATA LOCAL INPATH '/home/hadoop/tempdata/webfb.access.log'
    INTO TABLE default.web_log_src;
- 创建子表
-- Slim sub-table holding only the commonly queried columns of web_log_src.
-- Recreated from scratch on every run.
DROP TABLE IF EXISTS default.web_log_comm;

-- Stored as ORC with Snappy compression. No ROW FORMAT DELIMITED clause is
-- given: field delimiters only apply to text storage, and ORC files carry
-- their own column layout.
CREATE TABLE IF NOT EXISTS default.web_log_comm (
    remote_addr  STRING,
    time_local   STRING,
    request      STRING,
    http_referer STRING
)
STORED AS ORC
TBLPROPERTIES ("orc.compress" = "SNAPPY");
- 为子表导入数据
-- Populate the slim table with the four columns it keeps from the raw log.
-- INSERT INTO appends; it does not replace rows already in the target table.
INSERT INTO TABLE default.web_log_comm
SELECT
    remote_addr,
    time_local,
    request,
    http_referer
FROM default.web_log_src;