scorecardpy perf.py The calculation of KS

perf.py The calculation of KS

Open sunrisehang opened this issue 5 years ago • 6 comments

当我使用 `sc.perf_eva(y_train, train_pred, plot_type=["ks"])` 计算 KS 时，发现 KS 曲线的定义似乎不正确。正确的 KS 曲线应当以阈值点为 x 轴、以 TPR 和 FPR 为 y 轴，而源码中的计算显然与此不符。这里附上我参考网上部分代码并自行修改后的实现，希望谢老师能参考一下：

def model_score_pro_ks(df_fact_tag, df_expected_score_or_pro, buckets, type_input='score'):
    """Compute TPR, FPR and KS at equally spaced thresholds over the predictions.

    The range ``[min, max]`` of the predicted values is cut into ``buckets``
    equal-width intervals, giving ``buckets + 1`` thresholds.  At each
    threshold every sample is classified as positive (1) or negative (0),
    and the true-positive rate, false-positive rate and ``|TPR - FPR|`` are
    reported.

    Args:
        df_fact_tag: Actual labels; samples equal to 1 are positives and
            samples equal to 0 are negatives.  Any other value is counted in
            neither class.
        df_expected_score_or_pro: Predicted scores or probabilities, one per
            label.  The two inputs are combined through a ``DataFrame``, so
            two pandas Series are aligned on their index.
        buckets: Number of equal-width intervals (must be at least 1).
        type_input: ``'score'`` classifies values ``<= threshold`` as
            positive (low score means positive); ``'pro'`` classifies values
            ``> threshold`` as positive (high probability means positive).

    Returns:
        A ``DataFrame`` with one row per threshold, in ascending threshold
        order, indexed ``0..buckets``, with columns ``bins_point``, ``TPR``,
        ``FPR`` and ``KS``.  A rate whose denominator is zero (no positives,
        or no negatives, among the labels) is reported as NaN, and so is the
        corresponding ``KS``.

    Raises:
        ValueError: If ``type_input`` is neither ``'score'`` nor ``'pro'``,
            or if ``buckets`` is smaller than 1.
    """
    # Validate once up front instead of once per threshold.
    if type_input not in ('score', 'pro'):
        raise ValueError("Incorrect inputs; the value of type should be choosed between 'score' and 'pro'.")
    n_buckets = int(buckets)
    if n_buckets < 1:
        raise ValueError("buckets must be a positive integer")

    # Equally spaced thresholds spanning the predictions.  linspace puts the
    # end points exactly on min and max, and also copes with min == max.
    lowest = np.min(df_expected_score_or_pro)
    highest = np.max(df_expected_score_or_pro)
    breakpoints = np.linspace(lowest, highest, n_buckets + 1)

    # Pair labels with predictions (aligning pandas inputs on their index),
    # then precompute the class masks, which do not depend on the threshold.
    paired = pd.DataFrame({'fact_tag': df_fact_tag,
                           'expected_score_pro': df_expected_score_or_pro})
    is_positive = (paired['fact_tag'] == 1).to_numpy()
    is_negative = (paired['fact_tag'] == 0).to_numpy()
    predictions = paired['expected_score_pro'].to_numpy()
    n_positive = int(np.count_nonzero(is_positive))
    n_negative = int(np.count_nonzero(is_negative))
    undefined = float('nan')

    rows = []
    for point in breakpoints:
        at_or_below = predictions <= point
        # Which side of the threshold is "positive" only mirrors the TPR/FPR
        # curves; KS is an absolute value and is unaffected.
        flagged = at_or_below if type_input == 'score' else ~at_or_below
        true_pos = int(np.count_nonzero(is_positive & flagged))
        false_pos = int(np.count_nonzero(is_negative & flagged))
        # TP + FN is the number of positives and TN + FP the number of
        # negatives; a missing class makes the rate undefined, not an error.
        tpr = true_pos / n_positive if n_positive else undefined
        fpr = false_pos / n_negative if n_negative else undefined
        rows.append((point, tpr, fpr, abs(tpr - fpr)))

    # Build the frame once; the default RangeIndex keeps row labels unique.
    return pd.DataFrame(rows, columns=['bins_point', 'TPR', 'FPR', 'KS'])