sparksql2.0整理-自用

各种运算符

聚合函数：

| collect_set(col) --返回col去重后的元素组成的数组（相当于对下面collect_list的结果去重），元素顺序不保证

collect_set(0) 具有稳定性排序，以后每次跑都是同一个，但性能差一点
first(0) 不是稳定性排序，随机的，但性能好一些，所以影响的关键字段最好具有稳定性

| collect_list --collect_list函数返回的类型是array< ？ >类型，？表示该列的类型（列表）
例如：select id,collect_list(name) from test_yxl group by id order by id;
| approx_count_distinct --近似去重统计，数据量大时结果会与distinct有偏差
| count(*) --统计所有行数（包含值为null的行）
count(expr) --统计expr非null的行数；count(distinct expr) --统计expr非null且去重后的个数
| covar_pop(col1,col2) --返回两组数值的总体协方差
| covar_samp(col1,col2) --返回两组数值的样本协方差
| var_pop(col) --返回集合col的一组数值的总体方差
| var_samp(col) --返回集合col的一组数值的样本方差
| variance(col) --返回集合col的一组数值的样本方差（与var_samp相同）
| corr(col1,col2) --返回两组数值的相关系数
| stddev --样本标准差（与stddev_samp相同）
| stddev_pop --总体标准差
| stddev_samp --样本标准差

表生成函数：
| explode
例如：select explode(split(concat_ws(',','1','2','3','4'),','))

例如：select id,name,explode(split(concat_ws(',','1','2','3','4'),',')) as n from test_yxl;分割一个拼接字段再拓展成几行（stack做不到）

--原来每一行拓展成4行

id1，name1，1

id1，name1，2

id1，name1，3

id1，name1，4

...

****************建表和数据*******************

-- Demo table queried by the example statements in these notes.
-- External, comma-delimited text table with three data columns (id, name, val),
-- partitioned by province code and date.
-- NOTE(review): the LOCATION value looks like a placeholder path; replace it with a
-- real HDFS directory before running.
-- Fix: the original statement ended with a stray, unmatched ")" after LOCATION,
-- which made the DDL unparsable.
CREATE EXTERNAL TABLE `test_yxl`(`id` int, `name` string, `val` double)
PARTITIONED BY (`p_provincecode` int, `p_date` string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS textfile
LOCATION 'hdfs://nameservice/路径/'

数据

3,au,100.0
3,bob,78.0
6,pp,92.0
8,we,57.0
8,jy,27.0
7,ha,27.0
6,,85.0
3,tom,30.0
10,we,24.0
12,,100.0
3,pp,92.0
,jay,49.0
8,jy,27.0
8,jy,28.0

*****************************************************************分析函数*************************************************************
GROUP BY a, b, c WITH CUBE
GROUP BY a, b, c WITH ROLLUP
GROUP BY a, b, c GROUPING SETS (**** )
lag

==============grouping sets==============
-- GROUPING SETS: distinct-id counts per name, per val, and per (name, val) pair.
-- grouping_id() tells which grouping set produced each output row.
SELECT
    name,
    val,
    COUNT(DISTINCT id) AS uv,
    GROUPING_ID()      AS groupid
FROM test_yxl
GROUP BY name, val
    GROUPING SETS ((name), (val), (name, val))
ORDER BY groupid

-- Same grouping sets as the previous example plus the empty set (),
-- which adds one grand-total row over the whole table.
SELECT
    name,
    val,
    COUNT(DISTINCT id) AS uv,
    GROUPING_ID()      AS groupid
FROM test_yxl
GROUP BY name, val
    GROUPING SETS ((name), (val), (name, val), ())
ORDER BY groupid

==============with cube==============
-- WITH CUBE: aggregates over every combination of the grouping columns
-- (name, val), including the grand total.
SELECT
    name,
    val,
    COUNT(DISTINCT id) AS uv,
    GROUPING_ID()      AS groupid
FROM test_yxl
GROUP BY name, val WITH CUBE
ORDER BY groupid

-- Three-column cube: counts distinct province codes for every combination
-- of id, name and val.
SELECT
    id,
    name,
    val,
    COUNT(DISTINCT p_provincecode) AS uv,
    GROUPING_ID()                  AS groupid
FROM test_yxl
GROUP BY id, name, val WITH CUBE
ORDER BY groupid

-- Same three-column cube, but with a plain (non-distinct) count of the
-- non-null province codes in each group.
SELECT
    id,
    name,
    val,
    COUNT(p_provincecode) AS uv,
    GROUPING_ID()         AS groupid
FROM test_yxl
GROUP BY id, name, val WITH CUBE
ORDER BY groupid

==============with rollup==============
-- WITH ROLLUP: hierarchical subtotals for (id, name, val), (id, name), (id)
-- and the grand total.
SELECT
    id,
    name,
    val,
    COUNT(DISTINCT id) AS uv,
    GROUPING_ID()      AS groupid
FROM test_yxl
GROUP BY id, name, val WITH ROLLUP
ORDER BY groupid

==============lag==============
-- LAG within each id, rows ordered by val descending.
--   lag1: value from 1 row back, falling back to 8888.88 when there is no such row
--   lag2: value from 2 rows back, NULL when there is no such row
SELECT
    id,
    name,
    val,
    ROW_NUMBER()          OVER per_id AS row_r,
    LAG(val, 1, 8888.88)  OVER per_id AS lag1,
    LAG(val, 2)           OVER per_id AS lag2
FROM test_yxl
WINDOW per_id AS (PARTITION BY id ORDER BY val DESC)

-- LAG over the whole table (no partitioning), rows ordered by val descending.
--   lag1: value from 1 row back, falling back to 8888.88 when there is no such row
--   lag2: value from 2 rows back, NULL when there is no such row
SELECT
    id,
    name,
    val,
    ROW_NUMBER()          OVER by_val AS row_r,
    LAG(val, 1, 8888.88)  OVER by_val AS lag1,
    LAG(val, 2)           OVER by_val AS lag2
FROM test_yxl
WINDOW by_val AS (ORDER BY val DESC)

*****************************************************************窗口函数*************************************************************
row_number、rank、dense_rank、sum、avg、min、max、ntile、first_value

==============row_number、rank、dense_rank==============
-- Sequential position of each row inside its id group, highest val first.
SELECT
    id,
    name,
    val,
    ROW_NUMBER() OVER (PARTITION BY id ORDER BY val DESC) AS level1
FROM test_yxl

-- Side-by-side comparison of the three ranking functions over the same ordering:
--   row_number : always unique, ties are numbered arbitrarily
--   rank       : ties share a rank and leave gaps after them
--   dense_rank : ties share a rank, no gaps
SELECT
    id,
    name,
    val,
    ROW_NUMBER() OVER by_val AS row_level,
    RANK()       OVER by_val AS rank_level,
    DENSE_RANK() OVER by_val AS dense_level
FROM test_yxl
WINDOW by_val AS (ORDER BY val DESC)

==============sum、avg、min、max==============
-- Compares window frames for SUM and MAX inside each id group:
--   *1: ORDER BY with the default frame (start of partition up to the current row and its peers)
--   *2: explicit ROWS frame from the start of the partition to the current row
--   *3: no ORDER BY, so the aggregate covers the whole partition
--   *4: from 3 rows before the current row to the end of the partition
SELECT
    id,
    name,
    SUM(val) OVER (PARTITION BY id ORDER BY name) AS sum1,
    SUM(val) OVER (
        PARTITION BY id
        ORDER BY name
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS sum2,
    SUM(val) OVER (PARTITION BY id) AS sum3,
    SUM(val) OVER (
        PARTITION BY id
        ORDER BY name
        ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING
    ) AS sum4,
    MAX(val) OVER (PARTITION BY id ORDER BY name) AS max1,
    MAX(val) OVER (
        PARTITION BY id
        ORDER BY name
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS max2,
    MAX(val) OVER (PARTITION BY id) AS max3,
    MAX(val) OVER (
        PARTITION BY id
        ORDER BY name
        ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING
    ) AS max4
FROM test_yxl

==============ntile==============
-- NTILE splits the ordered rows into the requested number of buckets:
--   ntile1 / ntile2: 2 and 3 buckets inside each id group, ordered by val
--   ntile3         : 4 buckets over the whole table, ordered by val
SELECT
    id,
    name,
    val,
    NTILE(2) OVER per_id          AS ntile1,
    NTILE(3) OVER per_id          AS ntile2,
    NTILE(4) OVER (ORDER BY val)  AS ntile3
FROM test_yxl
WINDOW per_id AS (PARTITION BY id ORDER BY val)
ORDER BY id, name

-- Combining NTILE with CASE WHEN: rows are split into 4 buckets by val
-- (highest first) and each bucket number is mapped to a text label.
SELECT
    id,
    name,
    val,
    CASE NTILE(4) OVER (ORDER BY val DESC)
        WHEN 1 THEN "优"
        WHEN 2 THEN "良"
        WHEN 3 THEN "及格"
        WHEN 4 THEN "惨"
    END AS level
FROM test_yxl
ORDER BY id, name

==============first_value==============
-- FIRST_VALUE: the name on the first row of each id group when ordered by
-- val descending, shown next to each row's position in that ordering.
SELECT
    id,
    name,
    val,
    ROW_NUMBER()      OVER per_id AS level1,
    FIRST_VALUE(name) OVER per_id AS first_1
FROM test_yxl
WINDOW per_id AS (PARTITION BY id ORDER BY val DESC)

==============last_value==============
-- Pitfall demo (the author marks this result as "not right"):
-- with ORDER BY and no explicit frame, the default frame ends at the current row,
-- so LAST_VALUE does not reach the last row of the partition.
SELECT
    id,
    name,
    val,
    ROW_NUMBER()     OVER (PARTITION BY id ORDER BY val DESC) AS level1,
    LAST_VALUE(name) OVER (PARTITION BY id ORDER BY val DESC) AS last_1
FROM test_yxl

-- Author's note: when ORDER BY is omitted (and there is no row_number()), rows are
-- taken in the order of their offset in the file, which gives a wrong result.
-- NOTE(review): without ORDER BY the row order inside the partition is presumably
-- not guaranteed at all, so treat this result as non-deterministic; confirm.
SELECT
    id,
    name,
    val,
    LAST_VALUE(name) OVER (PARTITION BY id) AS last_1
FROM test_yxl

-- LAST_VALUE done right: the explicit frame spans the whole partition, so every
-- row sees the name on the last row of its id group (ordered by val descending).
SELECT
    id,
    name,
    val,
    ROW_NUMBER()      OVER per_id AS level1,
    FIRST_VALUE(name) OVER per_id AS first_1,
    LAST_VALUE(name)  OVER (
        PARTITION BY id
        ORDER BY val DESC
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS last_1
FROM test_yxl
WINDOW per_id AS (PARTITION BY id ORDER BY val DESC)

==============cume_dist==============
-- CUME_DIST: fraction of rows whose val is less than or equal to the current
-- row's val, over the whole table (cume_1) and inside each id group (cume_2).
SELECT
    id,
    name,
    val,
    CUME_DIST() OVER (ORDER BY val)                 AS cume_1,
    CUME_DIST() OVER (PARTITION BY id ORDER BY val) AS cume_2
FROM test_yxl

==============percent_rank==============
-- PERCENT_RANK = (rank - 1) / (rows in partition - 1).
-- rank_1 and sum_1 (total row count, obtained by summing 1 over a single
-- constant partition) expose the inputs of percent_1;
-- percent_2 is the same measure inside each id group.
SELECT
    id,
    name,
    val,
    PERCENT_RANK() OVER (ORDER BY val)                 AS percent_1,
    RANK()         OVER (ORDER BY val)                 AS rank_1,
    SUM(1)         OVER (PARTITION BY NULL)            AS sum_1,
    PERCENT_RANK() OVER (PARTITION BY id ORDER BY val) AS percent_2
FROM test_yxl

sparksql2.0整理-自用

猜你喜欢