1.群体稳定性指标
群体稳定性指标(Population Stability Index,PSI),通常用于检验模型及数据分布的稳定性。其计算公式如下:
psi = sum((实际占比-预期占比)* ln(实际占比/预期占比))
2.HiveQL计算PSI
以下HiveQL主要用于计算模型中各个指标(features)的稳定性:取表中最新日期的标签分布,与上一个有效日期(即上一个有数据的日期)的标签分布进行比较,计算两者之间的PSI。
以下代码段假设表psi_bins_rate_data中已经按日期(date_time)计算好了每个features的各个分箱(bins)的实际概率(rate)。
-- Population Stability Index (PSI) per feature, comparing the two most recent
-- date_time values present in psi_bins_rate_data.
--   psi = sum over bins of (last_rate - rate) * ln(last_rate / rate)
-- The expression is symmetric in (rate, last_rate), so which snapshot is treated
-- as "actual" and which as "expected" does not change the result.
-- Output columns: features, features_name, psi.
with ranked_rates as (
    -- Rank snapshot dates across the whole table:
    -- 1 = most recent date_time, 2 = the date_time immediately before it.
    select
        features,
        features_name,
        bins,
        rate,
        dense_rank() over (order by date_time desc) as date_rank
    from psi_bins_rate_data
),
current_rates as (
    -- Bin rates of the most recent snapshot.
    select features, features_name, bins, rate
    from ranked_rates
    where date_rank = 1
),
previous_rates as (
    -- Bin rates of the snapshot before the most recent one.
    select features, features_name, bins, rate
    from ranked_rates
    where date_rank = 2
),
paired_rates as (
    -- Full outer join keeps bins that appear in only one of the two snapshots.
    select
        coalesce(curr.features, prev.features) as features,
        coalesce(curr.features_name, prev.features_name) as features_name,
        coalesce(curr.bins, prev.bins) as bins,
        -- Floor missing AND non-positive rates at 0.0001. A zero rate would
        -- otherwise make ln(last_rate / rate) undefined (NULL in Hive), and
        -- sum() would silently skip that bin, understating the PSI.
        case
            when curr.rate is null or curr.rate <= 0 then 0.0001
            else curr.rate
        end as rate,
        case
            when prev.rate is null or prev.rate <= 0 then 0.0001
            else prev.rate
        end as last_rate
    from current_rates curr
    full outer join previous_rates prev
        on curr.features = prev.features
        and curr.bins = prev.bins
)
select
    features,
    features_name,
    sum((last_rate - rate) * ln(last_rate / rate)) as psi
from paired_rates
group by features, features_name