1.使用 load 方式加载数据到 Hive 表中,注意分区表加载数据的特殊性
CREATE TABLE IF NOT EXISTS myinfo (id string, name string, city string) PARTITIONED BY (day string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE ;
Load加载OVERWRITE为覆盖:
LOAD DATA LOCAL INPATH '/home/info.txt' OVERWRITE INTO TABLE myinfo PARTITION (day='0308');
2.如何保存 HiveQL 查询结果:保存到表中,保存到本地文件(注意指定列分隔符)
保存到本地文件:
insert overwrite local directory '/home/info' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' select * from myinfo ;
推荐用这种方式:
bin/hive -e " select * from myinfo;" > /home/info /info.txt
保存到HDFS中:
user/hive路径要存在,info路径不可以存在,不可以指定分割符。
insert overwrite directory 'user/hive/info' select * from myinfo ;
保存到表中:
create table newinfo as select * from myinfo ;
3.常见查询练习,如 groupby、having、join、sortby、orderby 等。
查询deptid不等于字符串null按时间倒叙
SELECT cn, namepy, sex, birthday, idcard, deptid FROM EMPLOYEE WHERE deptid <> "null" order by cn asc limit 100;
查询sender和receiver包含@符合并且body不等于空。
SELECT * FROM ofmess WHERE sender LIKE '%@%' and receiver LIKE'%@%' and body IS NOT NULL and body <>"" limit 100;
根据条件查询后创建表
create table ofmess3 as
select logtime, substr(sender,0,instr(sender,'@')-1) as sender, substr(receiver,0,instr(receiver,'@')-1) as receiver ,body from ofmess1 where substr(sender,0,instr(sender,'@')-1) rlike '^\\d+$' and substr(receiver,0,instr(receiver,'@')-1) rlike '^\\d+$' ;
根据条件进行联合查询
SELECT a.logtime, b.deptid,a.sender, c.deptid,a.receiver,a.body FROM ofmess3 a JOIN EMPLOYEE2 b ON (a.sender = b.cn) JOIN EMPLOYEE2 c ON (a.receiver = c.cn) limit 200 ;
根据部门id与工作类型分组查询平均年龄
select e.deptid, e.jobtype, avg(age) avg_age from EMPLOYEE e group by e.deptid, e.jobtype ;
根据部门id分组查询平均年龄大于30的
select e.deptid, e.jobtype, avg(age) avg_age from EMPLOYEE e group by e.deptid having avg_age > 30;
分区的文件按字段排序
Set mapreduce.job.reduces=2;
Select * from myinfo sort by id desc;
Insert overwrite local directory ‘/home/myinfo-res’ select * from myinfo sort by id desc;
每个分区的文件按id倒序排序