Implementing the AUC evaluation metric in Python with 3 different methods

Definition of AUC: the fraction of (positive, negative) sample pairs in which the positive sample's predicted probability is greater than the negative sample's.

auc = \frac{I(Pred(\text{positive sample}) > Pred(\text{negative sample}))}{I(\text{positive samples}) \cdot I(\text{negative samples})}          where I denotes a count
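
As a small worked example (numbers chosen here for illustration, not from the original post): with positives scored {0.9, 0.4} and negatives scored {0.5, 0.3}, the pairs (0.9, 0.5), (0.9, 0.3) and (0.4, 0.3) satisfy the condition while (0.4, 0.5) does not, so auc = 3 / (2 \cdot 2) = 0.75.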

The first method follows the definition directly and is the easiest to understand.

The second method sorts first. To handle positive and negative samples with equal predicted probabilities it uses a multi-key sort: sort by the predicted value in ascending order, and break ties by sorting the label in ascending order (a small illustration follows below).
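
A quick sketch of that multi-key sort (toy values chosen here for illustration): a tuple key makes tied predictions order negatives before positives.

pairs = list(zip([1, 0, 1, 0], [0.3, 0.3, 0.8, 0.1]))
# sort by prediction ascending, then by label ascending on ties
print(sorted(pairs, key=lambda x: (x[1], x[0])))
# [(0, 0.1), (0, 0.3), (1, 0.3), (1, 0.8)]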

The third method puts the predicted values of the positive and negative samples into separate buckets, which is effectively an indirect sort; after bucketing, the AUC is computed from the definition.

Complexity comparison

Method      Time complexity    Space complexity
auc         O(n^2)             O(1)
auc_sort    O(n log n)         O(1)
auc_bin     O(n)               O(n)

"""
#解释一
auc = sum(I(P(+) ,  P(-))) / M(+)*N(-)

if P(正样本) > P(负样本) I(P(正样本), P(负样本)) = 1

if P(正样本) = P(负样本) I(P(正样本), P(负样本)) = 0.5

if P(正样本) < P(负样本) I(P(正样本), P(负样本)) = 0


#解释二
auc = sum( index_i \in posiitveclass Rank_index_i) - M*(M+1)/2     / M*N
"""
from sklearn.metrics import roc_auc_score

def auc(y, y_hat):
    """AUC (area under the ROC / FPR-TPR curve) computed directly from the
    pairwise definition.

    y     : true labels (0/1)
    y_hat : predicted probabilities
    """
    # predicted probabilities of the positive samples
    pos = [p for label, p in zip(y, y_hat) if label == 1]
    # predicted probabilities of the negative samples
    neg = [p for label, p in zip(y, y_hat) if label == 0]

    # count every (positive, negative) pair: 1 if the positive is scored
    # higher, 0.5 for a tie, 0 otherwise
    numerator = 0.0
    for i in pos:
        for j in neg:
            if i > j:
                numerator += 1
            elif i == j:
                numerator += 0.5
    M = len(pos)
    N = len(neg)
    return numerator / (M * N)


def auc_sort(y, y_hat):
    """AUC computed with a single pass over the sorted predictions.

    y     : true labels (0/1)
    y_hat : predicted probabilities

    Example of the ordering used:
        y_hat  0.1  0.2  0.3  0.4  0.5   (sorted ascending)
        y       0    1    0    0    1
    """
    pos_len = sum(y)
    neg_len = len(y) - pos_len
    total_case = pos_len * neg_len

    # Multi-key sort: by predicted value ascending, and for equal predictions
    # by label ascending, so tied negatives come before tied positives.
    labels_pred = sorted(zip(y, y_hat), key=lambda x: (x[1], x[0]))

    accumulated_neg = 0   # negatives seen so far (prediction <= current)
    satisfied_pair = 0
    prev = -1             # prediction value of the most recent run of negatives
    prev_num = 0          # how many negatives share that prediction value
    for label, pred in labels_pred:
        if label == 1:
            # Simply adding accumulated_neg would over-count ties: negatives
            # with the same prediction should contribute 0.5 instead of 1,
            # so subtract half of the tied negatives.
            if prev == pred:
                satisfied_pair -= prev_num * 0.5
            satisfied_pair += accumulated_neg
        else:
            if pred != prev:
                prev = pred
                prev_num = 1
            else:
                prev_num += 1
            accumulated_neg += 1

    return satisfied_pair / float(total_case)
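
None of the three implementations uses the rank formula from Explanation 2 directly. As a reference, here is a minimal sketch of it (the helper name auc_rank and the average-rank tie handling are additions for illustration, not part of the original post); on the toy data in the main block below it also evaluates to 0.8, matching the other implementations.

def auc_rank(y, y_hat):
    """Sketch of Explanation 2: AUC from the rank formula.

    Samples are sorted by predicted probability; tied predictions receive
    the average of the ranks they occupy, so tied positive/negative pairs
    count as 0.5, consistent with the other implementations.
    """
    order = sorted(range(len(y)), key=lambda i: y_hat[i])
    ranks = [0.0] * len(y)
    i = 0
    while i < len(order):
        # find the run of tied predictions starting at position i
        j = i
        while j + 1 < len(order) and y_hat[order[j + 1]] == y_hat[order[i]]:
            j += 1
        avg_rank = (i + j) / 2.0 + 1  # ranks are 1-based
        for k in range(i, j + 1):
            ranks[order[k]] = avg_rank
        i = j + 1
    M = sum(y)            # number of positive samples
    N = len(y) - M        # number of negative samples
    pos_rank_sum = sum(r for r, label in zip(ranks, y) if label == 1)
    return (pos_rank_sum - M * (M + 1) / 2.0) / (M * N)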


def auc_bin(y, y_hat, bins=100):
    """AUC computed by bucketing the predictions of positive and negative
    samples into separate histograms and counting the satisfied pairs.

    y     : true labels (0/1)
    y_hat : predicted probabilities in [0, 1]
    """
    pos_len = sum(y)
    neg_len = len(y) - pos_len
    total_case = pos_len * neg_len
    pos_histogram = [0 for _ in range(bins + 1)]
    neg_histogram = [0 for _ in range(bins + 1)]
    bin_width = 1.0 / bins
    # Bucketing is an indirect sort: count how many positive / negative
    # samples fall into each prediction bucket.
    for label, pred in zip(y, y_hat):
        nth_bin = int(pred / bin_width)   # in [0, bins]; pred == 1.0 maps to bin `bins`
        if label == 1:
            pos_histogram[nth_bin] += 1
        else:
            neg_histogram[nth_bin] += 1
    accumulated_neg = 0
    satisfied_pair = 0
    for i in range(bins + 1):   # include the last bucket so pred == 1.0 is counted
        # pos_histogram[i] * accumulated_neg counts pairs with P(pos) > P(neg),
        # since accumulated_neg holds the negatives in strictly lower buckets.
        # pos_histogram[i] * neg_histogram[i] * 0.5 counts same-bucket pairs,
        # treated as ties P(pos) == P(neg).
        # Pairs with P(pos) < P(neg) contribute 0 and need no term.
        satisfied_pair += (pos_histogram[i] * accumulated_neg
                           + pos_histogram[i] * neg_histogram[i] * 0.5)
        accumulated_neg += neg_histogram[i]
    return satisfied_pair / float(total_case)
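
One caveat worth illustrating (example values chosen here, not from the original post): predictions that land in the same bucket are treated as ties, so auc_bin is only exact up to the bucket resolution 1/bins.

y = [1, 0]
y_hat = [0.95, 0.91]
print(auc_bin(y, y_hat, bins=10))    # 0.5  -> the two predictions share a bucket and count as a tie
print(auc_bin(y, y_hat, bins=100))   # 1.0  -> finer buckets separate them, matching auc(y, y_hat)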

Verify the custom implementations against sklearn's roc_auc_score:

if __name__ == '__main__':

    y = [1]*5 + [0]*5
    y_hat = [0.9]*3 + [0.2]*7

    auc_definition = auc(y, y_hat)
    
    auc_sort_ = auc_sort(y, y_hat)
    auc_bin_ = auc_bin(y, y_hat)
    auc_sklearn = roc_auc_score(y, y_hat)

    print(f"auc_definition {auc_definition:.4f}\n auc_sort {auc_sort_:.4f}\n auc_bin {auc_bin_:.4f} \n auc_sklearn {auc_sklearn:.4f}")


If you spot any errors or omissions, please point them out. Thanks!


Reposted from blog.csdn.net/firesolider/article/details/112931348