Hive Operations - 2

Describe a table:
desc student;
desc formatted student;
 MANAGED_TABLE  -- indicates a managed (internal) table

Rename a table:
alter table student rename to test;
Add a column to a table:
alter table test add columns(age int comment 'age');
Change a column's name and type:
alter table test change age age1 string;
alter table test change age age1 string first;   -- "first" also moves the column to the front

Replace columns (replaces ALL columns of the table):
alter table test replace columns(age string,gg string);
alter table test replace columns(dd string);

truncate only clears the data; the table structure and metadata remain:
truncate table test;
drop removes the table entirely:
drop table test;
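A minimal sketch of the difference, assuming the test table from above:
truncate table test;
select * from test;   -- returns no rows, but the table still exists
desc test;            -- the schema is still there
drop table test;
desc test;            -- now fails: table not found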
 
Hive CLI help: bin/hive -help
1: Log in directly to a given database:
bin/hive --database hadoop6
2: Run a single SQL statement (it must be quoted):
bin/hive -e 'show databases'
3: Redirect the output to a file:
bin/hive -e 'show databases' >> a.txt
4: Run a SQL script file: bin/hive -f a.sql
vi a.sql
select * from hadoop6.test;
5: Set a configuration property on startup:
bin/hive --hiveconf hive.cli.print.current.db=false
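These options can be combined; for example, a sketch (database and table assumed from above) that runs a query with -S (silent mode, suppressing log output) and saves the result:
bin/hive -S --database hadoop6 -e 'select * from test' > result.txt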

Ways to create a table in Hive:
1:create table student(
id int,
name string
)row format delimited fields terminated by '\t';
2: Subquery (CTAS) style: as select
create table test1 as select name from test;
Copies both the queried data and its structure into a new table,
which is handy for saving an intermediate result set.

3: like style
create table test2 like test;
Copies only the table structure (no data) into a new table.

Employee table:
create table emp(
empno int comment 'employee number',
ename string comment 'employee name',
job string comment 'job title',
mgr int comment 'manager number',
hiredate string comment 'hire date',
sal double comment 'salary',
comm double comment 'bonus',
deptno int comment 'department number'
)
row format delimited fields terminated by '\t';
load data local inpath 'emp.txt' into table emp;
Overwrite existing data: overwrite
load data local inpath 'emp.txt' overwrite into table emp;  
  
Table Type:             MANAGED_TABLE
                        (a managed/internal table)
                        
create external table emp1(
empno int comment 'employee number',
ename string comment 'employee name',
job string comment 'job title',
mgr int comment 'manager number',
hiredate string comment 'hire date',
sal double comment 'salary',
comm double comment 'bonus',
deptno int comment 'department number'
)
row format delimited fields terminated by '\t'
location '/user/hive/warehouse/hadoop6.db/emp';
Table Type:             EXTERNAL_TABLE
                
Drop the table: drop table emp1;
--> dropping an external table only deletes its metadata
--> the underlying data directory is not deleted
A common practice is to create a managed table first and then create
several external tables over it as needed; this keeps the data safe.
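A quick sketch to confirm this behavior, assuming the emp1 table above:
drop table emp1;
bin/hdfs dfs -ls /user/hive/warehouse/hadoop6.db/emp   -- the data files are still there
-- recreating emp1 with the same location makes the data queryable again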
Partitioned tables
Example: on a 2000-row table, select name from student where name='zhangsan';
without an index, the execution flow is: scan the whole table first, then filter.
Partitioning restricts the scan to the matching partitions instead.
 create  table emp_part(
 empno int,
 empname string,
 empjob string,
 mgrno int,
 birthday string,
 salary float,
 bonus float,
 deptno   int
 )
partitioned by(`date` string)
row format delimited fields terminated by '\t';                    

load data local inpath 'emp.txt' into table emp_part partition(`date`='20181129');

select * from emp_part where `date`='20181129';
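Because `date` is a partition column, the where clause above prunes the scan down to the 20181129 directory instead of reading the whole table. The loaded partitions can be listed with:
show partitions emp_part;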

create table if not exists track_log(
id              string,
url             string,
referer         string,
keyword         string,
type            string,
guid            string,
pageId          string,
moduleId        string,
linkId          string,
attachedInfo    string,
sessionId       string,
trackerU        string,
trackerType     string,
ip              string,
trackerSrc      string,
cookie          string,
orderCode       string,
trackTime       string,
endUserId       string,
firstLink       string,
sessionViewNo   string,
productId       string,
curMerchantId   string,
provinceId      string,
cityId          string,
fee             string,
edmActivity     string,
edmEmail        string,
edmJobId        string,
ieVersion       string,
platform        string,
internalKeyword string,
resultSum       string,
currentPage     string,
linkPosition    string,
buttonPosition  string
)
partitioned by (`date` string,`hour` string)
row format delimited fields terminated by '\t';


Partitioned tables improve query efficiency.

<property>
    <name>hive.fetch.task.conversion</name>
    <value>more</value>
</property>

With this set to "more", simple selects (no count, sum, group by, ...) are served
as plain fetch tasks and do not launch a MapReduce job, which improves query speed.
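For example, assuming the emp table above:
select * from emp;          -- fetch task only, no MapReduce job
select count(*) from emp;   -- launches a MapReduce job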

    
select hiredate from emp;    
In this column: the year has 4 digits,
the month 1 or 2 digits,
the day 1 or 2 digits.

Possible digit-count combinations (year-month-day):
4-1-1  
4-2-1
4-1-2
4-2-2

rlike

4-1-1   \d{4}-\d{1}-\d{1}
select * from emp where hiredate rlike  '^\\d{4}-\\d{1}-\\d{1}$';
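All four digit-count combinations can be matched at once with bounded quantifiers, e.g.:
select * from emp where hiredate rlike '^\\d{4}-\\d{1,2}-\\d{1,2}$';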


create table test(
id int,
name string
)row format delimited fields terminated by '\t';

vi a.txt 
1    li 
2    zhao

load data local inpath 'a.txt' into table test;

bin/hdfs dfs -put /home/hadoop/a.txt /input/ 
    
load data inpath '/input/a.txt' into table test;
overwrite: add the overwrite keyword to replace the existing data
External table:
create external table test1(
id int,
name string
)row format delimited fields terminated by '\t'
location '/user/hive/warehouse/ibeifeng/test';
Partitioned table:
create  table test1(
id int,
name string
)partitioned by(data string)
row format delimited fields terminated by '\t';

load data inpath '/input/a.txt' into table test1 partition(data='chinese');
(the target must be the partitioned table test1, and '/input/a.txt' is on HDFS, so no local keyword)
For example, with 24 hourly partitions (hour=13, hour=14, ...), a query for hour 13
reads only the pre-filtered hour=13 partition instead of scanning all 24.

CREATE TABLE apachelog (
remote_addr string,
remote_user string,
time_local string,
request string,
status string,
body_bytes_sent string,
request_body string,
http_referer string,
http_user_agent string,
http_x_forwarded_for string,
host string
 )
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
  "input.regex" = "(\"[^ ]*\") (\"-|[^ ]*\") (\"[^\]]*\") (\"[^\]]*\") (\"[0-9]*\") (\"[0-9]*\") (-|[^ ]*) (\"[^ ]*\") (\"[^\"]*\") (\"-|[^ ]*\") (\"[^ ]*\")"
)
STORED AS TEXTFILE;

"27.38.5.159" "-" "31/Aug/2015:00:04:37 +0800" "GET /course/view.php?id=27 HTTP/1.1" "303" "440" - "http://www.ibeifeng.com/user.php?act=mycourse" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36" "-" "learn.ibeifeng.com"    

load data local inpath 'moodle.ibeifeng.access.log'
into table apachelog;
select * from apachelog limit 1;

View all built-in functions:
show functions;

Show a function's description:
desc function substr;

Show a function's extended description:
desc function extended substr;
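For example, substr(str, pos, len) uses 1-based positions:
select substr('abcdefg', 2, 3);   -- returns 'bcd'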

Hive analytic (window) functions
These operate on grouped/partitioned data and output a processed result for every row.
(The examples below reuse the emp table created earlier.)
1: Query all employees in department 10, sorted by salary in descending order
select ename,sal,deptno from emp where deptno='10' order by sal desc;

2: Show each department's highest salary as the last column
select empno,ename,deptno,sal,max(sal) over(
partition by deptno order by sal desc) as rn from 
emp;

3: Show a unique sequence number within each department as the last column
select empno,ename,deptno,sal,row_number() over(
partition by deptno order by sal desc) as rn from emp;

select empno,ename,deptno,sal,dense_rank() over(
partition by deptno order by sal desc) as rn from emp;

4: Get the top two salaries in each department
select empno,ename,deptno,sal from(select empno,ename,deptno,sal,rank() over(
partition by deptno order by sal desc) as rn from emp) tmp where rn <3;
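The three numbering functions differ only in how they treat ties; a side-by-side sketch on the emp table:
select empno, ename, deptno, sal,
row_number() over(partition by deptno order by sal desc) as rn,    -- 1,2,3,4: always unique
rank()       over(partition by deptno order by sal desc) as rk,    -- 1,2,2,4: gaps after ties
dense_rank() over(partition by deptno order by sal desc) as drk    -- 1,2,2,3: no gaps
from emp;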

Importing and exporting data in Hive
Import:
1: load data local
2: load data + overwrite (overwrites the existing data)
3: as select
4: insert
insert into table tb select ...;          -- appends
insert overwrite table tb select ...;     -- overwrites
5: location
Export:
1: insert overwrite [local] directory 'path' select sql;
insert overwrite local directory '/opt/datas/emp_001' row format delimited fields terminated by '\t' select * from emp; 
To HDFS (omit the local keyword):
insert overwrite directory 'path' select * from emp;
2:bin/hdfs dfs -get 
3: bin/hive -e or -f plus the SQL, redirected with >> or > into a file such as a.txt
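A sketch of option 3, with the output file name chosen for illustration:
bin/hive --database hadoop6 -e 'select * from emp' > emp_export.txt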

Common HQL clauses:
where, limit, distinct, is null, is not null,
between ... and ...
select * from emp where sal > 3000;
select * from emp limit 1;
select distinct deptno from emp;
select * from emp where sal between 2000 and 3000;

Aggregate functions:
count, sum, max, min, avg, used with group by and having
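For example, the average salary per department, keeping only departments whose average exceeds 2000 (assuming the emp table):
select deptno, avg(sal) as avg_sal from emp group by deptno having avg(sal) > 2000;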

join 

Equi-join example:
Table A
ID   name
1    zhangsan
2    lisi
3    wangwu
5    chenqi

Table B
id    age 
1    98
2    30
3    18    
4    28    

create table B(
id int,
age string
)row format delimited fields terminated by '\t';

load data local inpath 'b.txt'
into table B;
Equi (inner) join: only rows present in both tables
select a.id,a.name,b.id,b.age from A a join B b on a.id=b.id;
Left join:
select a.id,a.name,b.id,b.age from A a left join B b on a.id=b.id;
Right join:
select a.id,a.name,b.id,b.age from A a right join B b on a.id=b.id;
Full join:
select a.id,a.name,b.id,b.age from A a full join B b on a.id=b.id;
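With the sample data above, the full join returns NULL on whichever side has no match (row order may vary):
1       zhangsan    1       98
2       lisi        2       30
3       wangwu      3       18
NULL    NULL        4       28
5       chenqi      NULL    NULL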

Date and time functions:
Get the current Unix timestamp:
select unix_timestamp();
select unix_timestamp('2017-03-12 15:23:23');
select to_date('2017-03-12 15:23:23');
datediff: difference in days between two dates
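For example (dates chosen for illustration):
select datediff('2017-03-12', '2017-03-01');   -- returns 11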
SELECT date_add('2009-07-30', 1) FROM src LIMIT 1;
  '2009-07-31'
SELECT date_sub('2009-07-30', 1) FROM src LIMIT 1;
  '2009-07-29'

case when 
select ename,deptno,case deptno when '10' then 'your part is 10' when '20' then 'your part is 20'
else 'your part is 30' end from emp;
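case when also has a searched form with arbitrary conditions; a sketch with thresholds chosen for illustration:
select ename, sal,
case when sal > 3000 then 'high'
     when sal > 1500 then 'medium'
     else 'low' end as sal_level
from emp;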
 
