数据处理实用代码库分享

该机器学习工具模块由本人编写,用于加快机器学习开发、降低编码难度,从而把精力集中在模型优化上。代码仅供参考。

开发语言:Python
机器学习框架:scikit-learn(sklearn)
用到的Python工具包:

from math import log
from scipy import stats
from sklearn.metrics import confusion_matrix
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics as metrics

信息熵计算:

def calcShannonEnt(dataSet):
    """
    Compute the base-2 Shannon entropy of the class labels in a data set.

    :param dataSet: sized sequence of samples; the last element of each
        sample is used as its class label
    :return: entropy of the label distribution (0.0 when dataSet is empty)
    """
    total = len(dataSet)
    # Tally how many times each label occurs.
    frequencies = {}
    for row in dataSet:
        label = row[-1]
        frequencies[label] = frequencies.get(label, 0) + 1
    entropy = 0.0
    for count in frequencies.values():
        p = float(count) / total  # empirical probability of this label
        entropy -= p * log(p, 2)
    return entropy

绘制混淆矩阵:

def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    :param cm: 2-D confusion matrix (ndarray or nested sequence)
    :param classes: class names used as tick labels on both axes
    :param title: plot title
    :param cmap: matplotlib colormap used for the heat map
    :return: None; the figure is neither shown nor saved here
    """
    # Accept nested lists as well as ndarrays (.max()/.shape are used below).
    cm = np.asarray(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)
    # Cells above half of the maximum get white text so the number stays
    # readable against the darker background.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Fit the layout last so the axis labels are taken into account.
    plt.tight_layout()

查看训练特征对模型的影响程度:

def featureImportant(X_train, model):
    """
    Print and return the training features ranked by model importance.

    :param X_train: training set exposing ``columns`` (e.g. a pandas
        DataFrame); its column order must match the model's features
    :param model: fitted model exposing ``feature_importances_``
    :return: list of ``(feature_name, importance)`` tuples, most important
        first
    """
    feature_names = X_train.columns
    importances = model.feature_importances_
    # argsort is ascending; reverse it to rank the largest importance first.
    order = np.argsort(importances)[::-1]
    ranking = [(feature_names[idx], importances[idx]) for idx in order]
    for rank, (name, score) in enumerate(ranking, start=1):
        print("%2d) %-*s %f" % (rank, 30, name, score))
    return ranking

模型评估:准确率,召回率:

def evaluation(model, y_test, y_pred_test):
    """
    Print recall, accuracy and the model's out-of-bag score.

    Only the four cells [0, 0], [0, 1], [1, 0] and [1, 1] of the confusion
    matrix are read, with index 1 treated as the positive class.

    :param model: fitted model; its ``oob_score_`` attribute is printed
    :param y_test: true class labels of the test set
    :param y_pred_test: predicted class labels of the test set
    :return: None; the three metrics are printed
    """
    matrix = confusion_matrix(y_test, y_pred_test)
    # Rows are true labels, columns are predictions (sklearn convention).
    true_neg, false_pos = matrix[0, 0], matrix[0, 1]
    false_neg, true_pos = matrix[1, 0], matrix[1, 1]

    recall = true_pos / (false_neg + true_pos)
    accuracy = (true_pos + true_neg) / (true_neg + true_pos + false_neg + false_pos)

    print("召回率: ", recall)
    print("准确率: ", accuracy)
    # Out-of-bag score is read last, after the two metrics are printed.
    print('oob袋外准确率', model.oob_score_)

ROC曲线:

def roc(y_train, y_pred_train, y_test, y_pred_test):
    """
    Plot the train and test ROC curves and report the test AUC.

    :param y_train: true class labels of the training set
    :param y_pred_train: predicted values for the training set
    :param y_test: true class labels of the test set
    :param y_pred_test: predicted values for the test set
    :return: area under the test-set ROC curve (also printed)
    """
    # The thresholds returned by roc_curve are not needed for plotting.
    fpr_test, tpr_test, _ = metrics.roc_curve(y_test, y_pred_test)
    fpr_train, tpr_train, _ = metrics.roc_curve(y_train, y_pred_train)
    auc_test = metrics.auc(fpr_test, tpr_test)
    print('auc:', auc_test)
    plt.figure(figsize=[3, 3])
    # Test curve: blue dashed. The format string already sets the line
    # style, so no separate linestyle keyword is passed.
    plt.plot(fpr_test, tpr_test, 'b--')
    # Train curve: red solid.
    plt.plot(fpr_train, tpr_train, 'r-')
    plt.title('ROC curve')
    plt.show()
    return auc_test

在原始数据集中分离出特征数据集:

def featureColsTrainDataFilter(df_original_data, cols, target):
    """
    Select the feature columns of a data set, leaving out the target column.

    :param df_original_data: original data set, indexable by a list of
        column names (e.g. a pandas DataFrame)
    :param cols: iterable of candidate column names
    :param target: name of the label column to exclude
    :return: ``df_original_data`` restricted to the columns in ``cols``
        that differ from ``target``, in their original order
    """
    feature_columns = [col for col in cols if col != target]
    return df_original_data[feature_columns]

后续其他代码陆续更新。

发布了25 篇原创文章 · 获赞 27 · 访问量 2175

猜你喜欢

转载自blog.csdn.net/m0_38053092/article/details/104411721