auc定义,正样本预测概率大于负样本概率的比例
M、N 分别表示正样本和负样本的个数
第一种方法直接根据定义最好理解
第二种方法先排序,对于正负样本概率相同的情况,这里用到了多级排序,先对预测值由小到大排序,如果预测值相等则对标签由小到大排序
第三种方法对正负样本的预测值分别进行分桶,这实际上是一种间接的排序,分桶之后再根据定义进行计算
方法 | 时间复杂度 | 空间复杂度 |
auc | O(n^2) | O(n) |
auc_sort | O(nlogn) | O(n) |
auc_bin | O(n + bins) | O(bins) |
"""
#解释一
auc = sum(I(P(+), P(-))) / (M(+) * N(-))
if P(正样本) > P(负样本) I(P(正样本), P(负样本)) = 1
if P(正样本) = P(负样本) I(P(正样本), P(负样本)) = 0.5
if P(正样本) < P(负样本) I(P(正样本), P(负样本)) = 0
#解释二
auc = (sum_{i ∈ 正样本} rank_i - M*(M+1)/2) / (M*N),其中 rank_i 为样本 i 按预测值从小到大排序后的名次(从 1 开始,预测值相同时取平均名次),M、N 分别为正、负样本个数
"""
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, recall_score, precision_score, f1_score, accuracy_score
def auc(y, y_hat):
    """Compute AUC directly from its pairwise definition.

    Every (positive, negative) pair is compared: the pair scores 1 when the
    positive sample has the higher predicted value, 0.5 when both values are
    equal, and 0 otherwise.  The AUC is the mean score over all pairs.

    Args:
        y: True labels; samples labelled 1 are positives, samples labelled 0
            are negatives.  Samples with any other label are ignored.
        y_hat: Predicted scores, aligned with ``y``.

    Returns:
        The AUC as a float in [0, 1].

    Raises:
        ValueError: If there is no positive or no negative sample, in which
            case there are no pairs to compare and the AUC is undefined.
    """
    pos = []
    neg = []
    # Single pass so that one-shot iterables are not consumed twice.
    for label, score in zip(y, y_hat):
        if label == 1:
            pos.append(score)
        elif label == 0:
            neg.append(score)

    total_pairs = len(pos) * len(neg)
    if total_pairs == 0:
        raise ValueError(
            "AUC is undefined: need at least one positive (1) and one "
            "negative (0) sample"
        )

    numerator = 0.0
    for pos_score in pos:
        for neg_score in neg:
            if pos_score > neg_score:
                numerator += 1.0
            elif pos_score == neg_score:
                # A tie counts as half a correctly ordered pair.
                numerator += 0.5
    return numerator / total_pairs
def auc_sort(y, y_hat):
    """Compute AUC by sorting the samples by predicted score.

    Samples are sorted by ascending score, with negatives placed before
    positives among equal scores.  Walking the sorted list, each positive
    contributes the number of negatives seen so far; negatives that share
    the positive's score were counted as full wins by that rule, so half of
    them is subtracted to give ties a weight of 0.5.

    Args:
        y: True labels; samples labelled 1 are positives, every other label
            is treated as a negative.
        y_hat: Predicted scores, aligned with ``y``.

    Returns:
        The AUC as a float in [0, 1].

    Raises:
        ValueError: If there is no positive or no negative sample, in which
            case the AUC is undefined.
    """
    # Sort by score; within equal scores put negatives first (False < True)
    # so that all tied negatives are already counted when a positive is met.
    ranked = sorted(zip(y, y_hat), key=lambda pair: (pair[1], pair[0] == 1))

    pos_len = sum(1 for label, _ in ranked if label == 1)
    neg_len = len(ranked) - pos_len
    total_case = pos_len * neg_len
    if total_case == 0:
        raise ValueError(
            "AUC is undefined: need at least one positive (1) and one "
            "negative sample"
        )

    accumulated_neg = 0         # negatives seen so far (score <= current)
    tied_neg = 0                # negatives sharing the latest negative score
    last_neg_score = None       # score of the most recent negative
    satisfied_pair = 0.0
    for label, score in ranked:
        if label == 1:
            satisfied_pair += accumulated_neg
            if score == last_neg_score:
                # Tied negatives were counted as 1 each; a tie is worth 0.5.
                satisfied_pair -= 0.5 * tied_neg
        else:
            if score == last_neg_score:
                tied_neg += 1
            else:
                last_neg_score = score
                tied_neg = 1
            accumulated_neg += 1
    return satisfied_pair / total_case
def auc_bin(y, y_hat, bins=100):
    """Compute AUC by bucketing the predicted scores into histograms.

    Positive and negative scores are counted into separate histograms of
    ``bins + 1`` equal-width buckets over [0, 1] (the extra bucket holds the
    score 1.0).  Walking the buckets from low to high, each positive in a
    bucket beats every negative in the lower buckets and ties (weight 0.5)
    with every negative in the same bucket.  Because all scores that fall in
    one bucket are treated as tied, the result is an approximation unless
    distinct scores land in distinct buckets.

    Args:
        y: True labels; samples labelled 1 are positives, every other label
            is treated as a negative.
        y_hat: Predicted scores in [0, 1], aligned with ``y``.
        bins: Number of equal-width buckets covering [0, 1).

    Returns:
        The (bucketed) AUC as a float in [0, 1].

    Raises:
        ValueError: If ``bins`` is smaller than 1, if a score lies outside
            [0, 1], or if there is no positive or no negative sample.
    """
    if bins < 1:
        raise ValueError("bins must be a positive integer")

    pos_histogram = [0] * (bins + 1)
    neg_histogram = [0] * (bins + 1)
    bin_width = 1.0 / bins

    for label, score in zip(y, y_hat):
        if not 0.0 <= score <= 1.0:
            # A negative score would otherwise wrap around to a bucket
            # counted from the end of the histogram.
            raise ValueError("scores must lie in [0, 1], got %r" % (score,))
        # Bucket index in [0, bins]; min() guards against float round-up.
        nth_bin = min(int(score / bin_width), bins)
        if label == 1:
            pos_histogram[nth_bin] += 1
        else:
            neg_histogram[nth_bin] += 1

    total_case = sum(pos_histogram) * sum(neg_histogram)
    if total_case == 0:
        raise ValueError(
            "AUC is undefined: need at least one positive (1) and one "
            "negative sample"
        )

    accumulated_neg = 0     # negatives in strictly lower buckets
    satisfied_pair = 0.0
    # Iterate over ALL bins + 1 buckets, including the last one (score 1.0).
    for pos_count, neg_count in zip(pos_histogram, neg_histogram):
        # pos_count * accumulated_neg  -> pairs with P(pos) > P(neg)
        # pos_count * neg_count * 0.5  -> pairs with P(pos) == P(neg)
        # Pairs with P(pos) < P(neg) contribute 0 and are omitted.
        satisfied_pair += pos_count * (accumulated_neg + 0.5 * neg_count)
        accumulated_neg += neg_count
    return satisfied_pair / total_case
对比自己实现和sklearn的函数进行验证
if __name__ == '__main__':
    # Five positives followed by five negatives.  Two of the positives share
    # the score 0.2 with all five negatives, so tie handling is exercised.
    y = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    y_hat = [0.9, 0.9, 0.9] + [0.2 for _ in range(7)]

    # Evaluate the three hand-written variants, then the sklearn reference.
    auc_definition = auc(y, y_hat)
    auc_sort_ = auc_sort(y, y_hat)
    auc_bin_ = auc_bin(y, y_hat)
    auc_sklearn = roc_auc_score(y, y_hat)

    print(f"auc_definition {auc_definition:.4f}\n auc_sort {auc_sort_:.4f}\n auc_bin {auc_bin_:.4f} \n auc_sklearn {auc_sklearn:.4f}")
如有纰漏,敬请指正!感谢