hive、oracle提供开窗函数,mysql8之前版本不提供,但MySQL 8.0版本支持窗口函数(over)和公用表表达式(with)这两个重要的功能!
hive窗口函数参考《Hive分析函数整理(开窗函数、窗口函数)》
一、分析函数语法
function_name(<argument>,<argument>...) over(<partition_Clause><order by_Clause><windowing_Clause>);
function_name():函数名称
argument:参数
over( ):开窗函数
partition_Clause:分区子句,数据记录集分组,group by...
order by_Clause:排序子句,数据记录集排序,order by...
windowing_Clause:开窗子句,定义分析函数在操作行的集合,三种开窗方式:rows、range、Specifying
注:使用开窗子句时一定要有排序子句!!!
二、分析函数汇总
1、count() over() :
统计分区中各组的行数,partition by 可选,order by 可选
--总计数
select ename,esex,eage,count(*) over() from emp;
--递加计数
select ename,esex,eage,count(*) over(order by eage) from emp;
--分组计数
select ename,esex,eage,count(*) over(partition by esex) from emp;
--分组递加计数
select ename,esex,eage,count(*) over(partition by esex order by eage) from emp;
2、sum() over()
:统计分区中记录的总和,partition by 可选,order by 可选
--总累计求和
select ename,esex,eage,sum(salary) over() from emp;
--递加累计求和
select ename,esex,eage,sum(salary) over(order by eage) from emp;
--分组累计求和
select ename,esex,eage,sum(salary) over(partition by esex) from emp;
--分组递加累计求和
select ename,esex,eage,sum(salary) over(partition by esex order by eage) from emp;
3、avg() over()
:统计分区中记录的平均值,partition by 可选,order by 可选
--总平均值
select ename,esex,eage,avg(salary) over() from emp;
--递加求平均值
select ename,esex,eage,avg(salary) over(order by eage) from emp;
--分组求平均值
select ename,esex,eage,avg(salary) over(partition by esex) from emp;
--分组递加求平均值
select ename,esex,eage,avg(salary) over(partition by esex order by eage) from emp;
4、min() over()
:统计分区中记录的最小值,partition by 可选,order by 可选
max() over()
:统计分区中记录的最大值,partition by 可选,order by 可选
select ename,esex,eage,salary,min(salary) over() from emp; --求总最小值
--递加求最小值
select ename,esex,eage,salary,min(salary) over(order by eage) from emp;
--分组求最小值
select ename,esex,eage,salary,min(salary) over(partition by esex) from emp;
--分组递加求最小值
select ename,esex,eage,salary,min(salary) over(partition by esex order by eage)
from emp;
--求总最大值
select ename,esex,eage,salary,max(salary) over() from emp;
--递加求最大值
select ename,esex,eage,salary,max(salary) over(order by eage) from emp;
--分组求最大值
select ename,esex,eage,salary,max(salary) over(partition by esex) from emp;
--分组递加求最大值
select ename,esex,eage,salary,max(salary) over(partition by esex order by eage)
from emp;
5、rank() over()
:跳跃排序,partition by 可选,order by 必选
select ename,eage,rank() over(partition by job order by eage) from emp;
select ename,eage,rank() over(order by eage) from emp;
6、dense_rank()
:连续排序,partition by 可选,order by 必选
select ename,eage,dense_rank() over(partition by job order by eage) from emp;
select ename,eage,dense_rank() over(order by eage) from emp;
7、row_number() over()
:排序,无重复值,partition by 可选,order by 必选
此方法也可以用作生成数字ID
select ename,eage,row_number() over(partition by job order by eage) from emp;
select ename,eage,row_number() over(order by eage) from emp;
-- 生成数字ID
select row_number()over(order by 1) from emp;
8、ntile(n) over()
:partition by 可选,order by 必选
n表示将分区内记录平均分成n份,多出的按照顺序依次分给前面的组
select ename,salary,ntile(3) over(order by salary desc) from emp;
select ename,salary,ntile(3) over(partition by job order by salary desc) from emp;
9、first_value() over()
:取出分区中第一条记录的字段值,partition by 可选,order by 可选
last_value() over()
:取出分区中最后一条记录的字段值,partition by 可选,order by 可选
select ename,first_value(salary) over() from emp;
select ename,first_value(salary) over(order by salary desc) from emp;
select ename,first_value(salary) over(partition by job) from emp;
select ename,first_value(salary) over(partition by job order by salary desc) from emp;
select ename,last_value(ename) over() from emp;
select ename,last_value(ename) over(order by salary desc) from emp;
select ename,last_value(ename) over(partition by job) from emp;
select ename,last_value(ename) over(partition by job order by salary desc) from emp;
10、first
:从DENSE_RANK返回的集合中取出排在最前面的一个值的行
last
:从DENSE_RANK返回的集合中取出排在最后面的一个值的行
select job,max(salary) keep(dense_rank first order by salary desc),
max(salary) keep(dense_rank last order by salary desc) from emp
group by job;
11、lag() over()
:取出前n行数据,partition by 可选,order by 必选
lead() over()
:取出后n行数据,partition by 可选,order by 必选
select ename,eage,lag(eage,1,0) over(order by salary),
lead(eage,1,0) over(order by salary) from emp;
select ename,eage,lag(eage,1) over(partition by esex order by salary),
lead(eage,1) over(partition by esex order by salary) from emp;
12、ratio_to_report(a) over(partition by b)
:求按照b分组后a的值在所属分组中总值的占比,a的值必须为数值或数值型字段
partition by 可选,order by 不可选
--给每一行赋值1,求当前行在总值的占比,总是0.1
select ename,job,salary,ratio_to_report(1) over() from emp;
--当前行的值在所有数据中的占比
select ename,job,salary,ratio_to_report(salary) over() from emp;
--给每一行赋值1,求当前行在分组后的组内总值的占比
select ename,job,salary,ratio_to_report(1) over(partition by job) from emp;
--当前行的值在分组后组内总值占比
select ename,job,salary,ratio_to_report(salary) over(partition by job) from emp;
13、percent_rank() over()
:partition by 可选,order by 必选
所在组排名序号-1除以该组所有的行数-1,排名跳跃排序
select ename,job,salary,percent_rank() over(order by salary) from emp;
select ename,job,salary,percent_rank() over(partition by job order by salary) from emp;
14、cume_dist() over()
:partition by 可选,order by必选
所在组排名序号除以该组所有的行数,注意对于重复行,计算时取重复行中的最后一行的位置
select ename,job,salary,cume_dist() over(order by salary) from emp;
select ename,job,salary,cume_dist() over(partition by job order by salary) from emp;
15、precentile_cont( x ) within group(order by ...) over()
:over()中partition by可选,order by 不可选
x为输入的百分比,是0-1之间的一个小数,返回该百分比位置的数据,若没有则返回以下计算值(r):
a=1+( x *(N-1) ) x为输入的百分比,N为分区内的记录的行数
b=ceil ( a ) 向上取整
c = floor( a ) 向下取整
r=a * 百分比位置上一条数据 + b * 百分比位置下一条数据
select ename,job,salary,percentile_cont(0.5) within group(order by salary) over()
from emp;
select ename,job,salary,percentile_cont(0.5) within group(order by salary)
over(partition by job) from emp;
16、precentile_disc( x ) within group(order by ...) over()
:over()中partition by可选,order by 不可选
x为输入的百分比,是0-1之间的一个小数,返回百分比位置对应位置上的数据值,若没有对应数据值,就取大于该分布值的下一个值
select ename,job,salary,percentile_disc(0.5) within group(order by salary) over()
from emp;
select ename,job,salary,percentile_disc(0.5) within group(order by salary)
over(partition by job) from emp;
17、stddev() over()
:计算样本标准差,只有一行数据时返回0,partition by 可选,order by 可选
stddev_samp() over()
:计算样本标准差,只有一行数据时返回null,partition by 可选,order by 可选
stddev_pop() over()
:计算总体标准差,partition by 可选,order by 可选
--计算所有记录的样本标准差
select stddev(stu_age) over() from student;
--计算递加的样本标准差
select stddev(stu_age) over(order by stu_age) from student;
--计算分组的样本标准差
select stddev(stu_age) over(partition by stu_major) from student;
--计算分组递加的样本标准差
select stddev(stu_age) over(partition by stu_major order by stu_age) from student;
--计算所有记录的样本标准差
select stddev_samp(stu_age) over() from student;
--计算递加的样本标准差
select stddev_samp(stu_age) over(order by stu_age) from student;
--计算分组的样本标准差
select stddev_samp(stu_age) over(partition by stu_major) from student;
--计算分组递加的样本标准差
select stddev_samp(stu_age) over(partition by stu_major order by stu_age) from student;
--计算所有记录的总体标准差
select stddev_pop(stu_age) over() from student;
--计算递加的总体标准差
select stddev_pop(stu_age) over(order by stu_age) from student;
--计算分组的总体标准差
select stddev_pop(stu_age) over(partition by stu_major) from student;
--计算分组递加的总体标准差
select stddev_pop(stu_age) over(partition by stu_major order by stu_age) from student;
18、variance() over()
:计算样本方差,只有一行数据时返回0,partition by 可选,order by 可选
var_samp() over()
:计算样本方差,只有一行数据时返回null,partition by 可选,order by 可选
var_pop() over()
:计算总体方差,partition by 可选,order by 可选
--计算所有记录的样本方差
select variance(stu_age) over() from student;
--计算递加的样本方差
select variance(stu_age) over(order by stu_age) from student;
-计算分组的样本方差
select variance(stu_age) over(partition by stu_major) from student; -
--计算分组递加的样本方差
select variance(stu_age) over(partition by stu_major order by stu_age) from student;
--计算所有记录的样本方差
select var_samp(stu_age) over() from student;
--计算递加的样本方差
select var_samp(stu_age) over(order by stu_age) from student;
--计算分组的样本方差
select var_samp(stu_age) over(partition by stu_major) from student;
--计算分组递加的样本方差
select var_samp(stu_age) over(partition by stu_major order by stu_age) from student;
--记录所有就的总体方差
select var_pop(stu_age) over() from student;
--计算递加的总体方差
select var_pop(stu_age) over(order by stu_age) from student;
--计算分组的总体方差
select var_pop(stu_age) over(partition by stu_major) from student;
--计算分组递加的样本方差
select var_pop(stu_age) over(partition by stu_major order by stu_age) from student;
stddev()=sqrt( variance() ) sqrt()--求开方
stddev_samp()=sqrt( var_samp() )
stddec_pop=sqrt( var_pop() )
19、covar_samp over()
:返回一对表达式的样本协方差,partition by 可选,order by 可选
covar_pop over()
: 返回一堆表达式的总体协方差,partition by 可选,order by 可选
--计算所有记录的样本协方差
select covar_samp(stu_age,line) over() from student;
--计算递加的样本协方差
select covar_samp(stu_age,line) over(order by stu_age) from student;
--计算分组的样本协方差
select covar_samp(stu_age,line) over(partition by stu_major) from student;
--计算分组递加的样本协方差
select covar_samp(stu_age,line) over(partition by stu_major order by stu_age)
from student;
--计算所有记录的总体协方差
select covar_pop(stu_age,line) over() from student;
--计算递加的总体协方差
select covar_pop(stu_age,line) over(order by stu_age) from student;
--计算分组的总体协方差
select covar_pop(stu_age,line) over(partition by stu_major) from student;
--计算分组递加的总体协方差
select covar_pop(stu_age,line) over(partition by stu_major order by stu_age)
from student;
20、corr() over()
:返回一对表达式的相关系数,partition by 可选,order by 可选
--计算所有记录的相关系数
select corr(stu_age,line) over() from student;
--计算递加的相关系数
select corr(stu_age,line) over(order by stu_age) from student;
--计算分组的相关系数
select corr(stu_age,line) over(partition by stu_major) from student;
--计算分组递加的相关系数
select corr(stu_age,line) over(partition by stu_major order by stu_age) from student;
21、REGR_ (Linear Regression) Functions:
这些线性回归函数适合最小二乘法回归线,有9个不同的回归函数可使用