Hive 面试题收集(未完)

目录

排序类

分组类

JOIN类

窗口函数类

参考文章


排序类

1、有1亿个用户,存储在表users中,包含用户uid、用户年龄age、用户消费总金额total,其中以uid唯一标识1个用户,试按照用户年龄从大到小排序,如果年龄相同则以消费总金额从小到大排序。

这是1个全排序问题,首先预估总内存消耗大小:1亿[用户数]*(8B[uid]+4B[age]+8B[total])约等于2GB,在现有计算条件下可以满足全部放入内存的需求,因此可以不必过多考虑优化问题。

-- Global sort: age from high to low; rows with the same age are ordered by
-- total from low to high.
SELECT
    *
FROM Users
ORDER BY
    age DESC,
    total ASC;

-- Local (bucketed) sort: rows are distributed by age, and the age DESC /
-- total ASC ordering is applied within each bucket rather than globally.
SELECT
    *
FROM Users
DISTRIBUTE BY
    age
SORT BY
    age DESC,
    total ASC

分组类

1、有10万个店铺,每个顾客访问任意一个店铺时都会生成1条访问日志,表名为Visit,其中用户id字段为uid,访问的店铺字段为store,试统计每个店铺的uv。

-- uv per store: every uid is counted at most once for each store.
SELECT
    store,
    COUNT(DISTINCT uid) AS uv
FROM Visit
GROUP BY
    store;

2、有1张表示人生阶段的表Lifestage,包含2个字段:用户唯一标识uid、人生阶段组合字段stage,其中stage由","分隔的字符串组成,如“计划买车,已经买房”,试统计每一个细分人生阶段的用户人数。

-- Column to rows: split the comma-separated stage string into one row per
-- individual stage, then count the distinct users found in each stage.
SELECT
    stage_detail,
    -- Aliased so the result column has a usable name instead of Hive's
    -- auto-generated one.
    COUNT(DISTINCT uid) AS user_cnt
FROM Lifestage
LATERAL VIEW EXPLODE(SPLIT(stage, ',')) Lifestage_tmp AS stage_detail
GROUP BY
    stage_detail;

3、有1张表示人生阶段的表Lifestage,包含2个字段:用户唯一标识uid、人生阶段字段stage,每行存储一个用户的人生阶段数据,如一个用户43有2条记录:43,计划买车; 43,已经买房,试将同一个用户的所有人生阶段字段整合成一个用“,”分隔的组合字段,如“计划买车,已经买房”。

-- Rows to column: gather every stage row of a user and join the values into
-- a single comma-separated string.
SELECT
    uid,
    -- COLLECT_LIST keeps duplicates; use COLLECT_SET(stage) instead if the
    -- same stage can be recorded more than once for a user.
    -- Aliased so the result column has a usable name instead of Hive's
    -- auto-generated one.
    CONCAT_WS(',', COLLECT_LIST(stage)) AS stages
FROM Lifestage
GROUP BY
    uid;

4、1张学生成绩表course_t,包含学生sid、课程号course、成绩score几个字段,试得到语文成绩大于数学成绩的学生成绩数据。如

sid course score
1 yuwen 43
1 shuxue 55
2 yuwen 77
2 shuxue 88
3 yuwen 98
3 shuxue 65
-- Pivot the per-course rows into one row per student (one column per course),
-- then keep the students whose yuwen score is higher than their shuxue score.
WITH course_tmp_t AS (
    SELECT
        sid,
        -- A CASE without ELSE yields NULL, which MAX ignores.
        MAX(CASE WHEN course = 'yuwen' THEN score END) AS yuwen_score,
        MAX(CASE WHEN course = 'shuxue' THEN score END) AS shuxue_score
    FROM mart_fsp_security_safetmp.course_t
    GROUP BY
        sid
)
SELECT
    sid,
    yuwen_score,
    shuxue_score
FROM course_tmp_t
WHERE yuwen_score > shuxue_score
;

-- Sample data for the course score question.
CREATE TABLE mart_fsp_security_safetmp.course_t AS
          SELECT 1 AS id, 1 AS sid, 'yuwen'  AS course, 43 AS score
UNION ALL SELECT 2 AS id, 1 AS sid, 'shuxue' AS course, 55 AS score
UNION ALL SELECT 3 AS id, 2 AS sid, 'yuwen'  AS course, 77 AS score
UNION ALL SELECT 4 AS id, 2 AS sid, 'shuxue' AS course, 88 AS score
UNION ALL SELECT 5 AS id, 3 AS sid, 'yuwen'  AS course, 98 AS score
UNION ALL SELECT 6 AS id, 3 AS sid, 'shuxue' AS course, 65 AS score

JOIN类

1、将下面的Address表,转成如后面所示的表

id name parent_id
1 北京市 0
2 山东省 0
3 昌平区 1
4 海淀区 1
5 沙闸镇 3
6 马池口镇 3
7 中关村 4
8 上地 4
9 烟台市 2
10 青岛市 2
11 五通桥区 9
12 马边区 9
13 定文镇 10
14 罗成镇 10

-- Fold the hierarchy into one row per (first level, second level, third level)
-- chain by joining Address to itself twice on id = parent_id.
SELECT
    lvl1.name AS first_name,
    lvl2.name AS second_name,
    lvl3.name AS third_name
FROM Address lvl1
JOIN Address lvl2
    ON lvl1.id = lvl2.parent_id
JOIN Address lvl3
    ON lvl2.id = lvl3.parent_id
-- Start the chain at first-level rows (parent_id = 0 in the sample data).
WHERE lvl1.parent_id = 0
;

-- Sample data for the Address hierarchy question.
-- Every UNION ALL branch names its columns so that both sides of each union
-- expose the same schema (id, name, parent_id), matching the other fixture
-- tables in this document.
CREATE TABLE Address AS
          SELECT 1  AS id, '北京市'   AS name, 0  AS parent_id
UNION ALL SELECT 2  AS id, '山东省'   AS name, 0  AS parent_id
UNION ALL SELECT 3  AS id, '昌平区'   AS name, 1  AS parent_id
UNION ALL SELECT 4  AS id, '海淀区'   AS name, 1  AS parent_id
UNION ALL SELECT 5  AS id, '沙闸镇'   AS name, 3  AS parent_id
UNION ALL SELECT 6  AS id, '马池口镇' AS name, 3  AS parent_id
UNION ALL SELECT 7  AS id, '中关村'   AS name, 4  AS parent_id
UNION ALL SELECT 8  AS id, '上地'     AS name, 4  AS parent_id
UNION ALL SELECT 9  AS id, '烟台市'   AS name, 2  AS parent_id
UNION ALL SELECT 10 AS id, '青岛市'   AS name, 2  AS parent_id
UNION ALL SELECT 11 AS id, '五通桥区' AS name, 9  AS parent_id
UNION ALL SELECT 12 AS id, '马边区'   AS name, 9  AS parent_id
UNION ALL SELECT 13 AS id, '定文镇'   AS name, 10 AS parent_id
UNION ALL SELECT 14 AS id, '罗成镇'   AS name, 10 AS parent_id;

窗口函数类

1、用户访问表vist_t,包含唯一标识用户uid、访问月份month、访问次数vist_cnt字段,试计算每个用户截止到每月为止的最大单月访问次数和累计到该月的总访问次数。

如数据表:

uid month vist_cnt
A 2015-01 5
A 2015-01 15
B 2015-01 5
A 2015-01 8
B 2015-01 25
A 2015-01 5
A 2015-02 4
A 2015-02 6
B 2015-02 10
B 2015-02 5
A 2015-03 16
A 2015-03 22
B 2015-03 23
B 2015-03 10
B 2015-03 1

得到

-- For every user and month, return:
--   vist_cnt_m   : total visits in that month
--   vist_cnt_max : largest monthly total up to and including that month
--   vist_cnt_sum : running sum of the monthly totals up to that month
SELECT
    uid,
    month,
    -- ORDER BY month restricts the window to rows up to the current month.
    -- Without it the window covers the whole partition and every row would
    -- get the user's all-time maximum, which is not what the question asks.
    MAX(vist_cnt_m) OVER (PARTITION BY uid ORDER BY month) AS vist_cnt_max,
    SUM(vist_cnt_m) OVER (PARTITION BY uid ORDER BY month) AS vist_cnt_sum,
    vist_cnt_m
FROM
(
    -- Collapse the raw visit rows to one row per user and month.
    -- Columns are grouped by name: positional GROUP BY only works when Hive's
    -- position-alias setting is enabled.
    SELECT
        uid,
        month,
        SUM(vist_cnt) AS vist_cnt_m
    FROM mart_fsp_security_safetmp.vist_t
    GROUP BY
        uid,
        month
) m_t;

-- Sample data for the monthly visit question.
CREATE TABLE mart_fsp_security_safetmp.vist_t AS
          SELECT 'A' AS uid, '2015-01' AS month, 5  AS vist_cnt
UNION ALL SELECT 'A' AS uid, '2015-01' AS month, 15 AS vist_cnt
UNION ALL SELECT 'B' AS uid, '2015-01' AS month, 5  AS vist_cnt
UNION ALL SELECT 'A' AS uid, '2015-01' AS month, 8  AS vist_cnt
UNION ALL SELECT 'B' AS uid, '2015-01' AS month, 25 AS vist_cnt
UNION ALL SELECT 'A' AS uid, '2015-01' AS month, 5  AS vist_cnt
UNION ALL SELECT 'A' AS uid, '2015-02' AS month, 4  AS vist_cnt
UNION ALL SELECT 'A' AS uid, '2015-02' AS month, 6  AS vist_cnt
UNION ALL SELECT 'B' AS uid, '2015-02' AS month, 10 AS vist_cnt
UNION ALL SELECT 'B' AS uid, '2015-02' AS month, 5  AS vist_cnt
UNION ALL SELECT 'A' AS uid, '2015-03' AS month, 16 AS vist_cnt
UNION ALL SELECT 'A' AS uid, '2015-03' AS month, 22 AS vist_cnt
UNION ALL SELECT 'B' AS uid, '2015-03' AS month, 23 AS vist_cnt
UNION ALL SELECT 'B' AS uid, '2015-03' AS month, 10 AS vist_cnt
UNION ALL SELECT 'B' AS uid, '2015-03' AS month, 1  AS vist_cnt

2、销售表包含商户(merchant)、月份(month)、销售额(money)3个字段,需要求每个店铺的当月销售额和总销售额。

merchant month money
a 1 150
a 1 200
b 1 1000
b 1 800
c 1 250
c 1 220
b 1 6000
a 2 2000
a 2 3000
b 2 1000
b 2 1500
c 2 350
c 2 280
a 3 350
a 3 250
-- Method 1: ROLLUP (see http://lxw1234.com/archives/2015/04/190.htm)
-- ROLLUP over (merchant, month) yields three kinds of rows:
--   (merchant, month) -> monthly sales of the merchant
--   (merchant, NULL)  -> total sales of the merchant
--   (NULL, NULL)      -> grand total; both money columns stay NULL for it
-- Rows are classified by which grouping columns are NULL rather than by
-- literal GROUPING__ID values, because the numbering of GROUPING__ID changed
-- in Hive 2.3.0 and hard-coded 1/3 checks pick different rows before and
-- after that release.
-- NOTE(review): this assumes sale_t has no NULL merchant or month values --
-- confirm against the real data.
SELECT
    merchant,
    month,
    CASE
        WHEN merchant IS NOT NULL AND month IS NULL THEN total
    END AS total_sale_money,
    CASE
        WHEN month IS NOT NULL THEN total
    END AS month_sale_money
FROM
(
    SELECT
        merchant,
        month,
        SUM(money) AS total
    FROM mart_fsp_security_safetmp.sale_t
    GROUP BY
        merchant,
        month
    WITH ROLLUP
) sale_tmp_t;

-- Method 2 (more suitable): window sums at two granularities. The window
-- functions emit one row per input row, so DISTINCT collapses the result to
-- one row per (merchant, month).
SELECT DISTINCT
    merchant,
    month,
    SUM(money) OVER (PARTITION BY merchant, month) AS month_sale_money,
    SUM(money) OVER (PARTITION BY merchant) AS total_sale_money
FROM mart_fsp_security_safetmp.sale_t


-- Test data for the merchant sales question.
CREATE TABLE mart_fsp_security_safetmp.sale_t AS
          SELECT 'a' AS merchant, '1' AS month, 150  AS money
UNION ALL SELECT 'a' AS merchant, '1' AS month, 200  AS money
UNION ALL SELECT 'b' AS merchant, '1' AS month, 1000 AS money
UNION ALL SELECT 'b' AS merchant, '1' AS month, 800  AS money
UNION ALL SELECT 'c' AS merchant, '1' AS month, 250  AS money
UNION ALL SELECT 'c' AS merchant, '1' AS month, 220  AS money
UNION ALL SELECT 'b' AS merchant, '1' AS month, 6000 AS money
UNION ALL SELECT 'a' AS merchant, '2' AS month, 2000 AS money
UNION ALL SELECT 'a' AS merchant, '2' AS month, 3000 AS money
UNION ALL SELECT 'b' AS merchant, '2' AS month, 1000 AS money
UNION ALL SELECT 'b' AS merchant, '2' AS month, 1500 AS money
UNION ALL SELECT 'c' AS merchant, '2' AS month, 350  AS money
UNION ALL SELECT 'c' AS merchant, '2' AS month, 280  AS money
UNION ALL SELECT 'a' AS merchant, '3' AS month, 350  AS money
UNION ALL SELECT 'a' AS merchant, '3' AS month, 250  AS money

参考文章

  1. https://blog.csdn.net/qq_41568597/article/details/84309503
  2. https://www.cnblogs.com/qingyunzong/p/8747656.html#_label0_2
发布了27 篇原创文章 · 获赞 4 · 访问量 5万+

猜你喜欢

转载自blog.csdn.net/hysfwjr/article/details/104062534
今日推荐