该机器学习工具模块由本人编写,用于快速完成机器学习开发、降低开发代码的难度,从而将精力集中在模型优化上。该代码仅供参考。
开发语言:Python
机器学习框架:sklearn
Python 工具包:
from collections import Counter
from math import log
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from scipy import stats
from sklearn.metrics import confusion_matrix
信息熵计算:
def calcShannonEnt(dataSet):
    """
    Compute the Shannon entropy of a dataset's class-label distribution.

    :param dataSet: iterable of samples; each sample's last element is its
                    class label
    :return: Shannon entropy in bits (float); 0.0 when all labels are equal
    """
    numEntries = len(dataSet)  # number of samples
    # Frequency of each class label (last column of every sample).
    labelCounts = Counter(featVec[-1] for featVec in dataSet)
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = count / numEntries          # p(x_i)
        shannonEnt -= prob * log(prob, 2)  # entropy uses log base 2
    return shannonEnt
绘制混淆矩阵:
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat-map on the current figure.

    :param cm: 2-D confusion matrix; rows are true labels, columns predicted
    :param classes: class names used as axis tick labels
    :param title: figure title
    :param cmap: matplotlib colormap for the cells
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    ticks = np.arange(len(classes))
    plt.xticks(ticks, classes, rotation=0)
    plt.yticks(ticks, classes)
    # Flip annotation color at half the max count so text stays readable
    # on both dark and light cells.
    cutoff = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            plt.text(col, row, cm[row, col],
                     horizontalalignment="center",
                     color="white" if cm[row, col] > cutoff else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
查看训练特征对模型的影响程度:
def featureImportant(X_train, model):
    """
    Print each training feature's importance, most important first.

    :param X_train: training feature DataFrame; column names are used as
                    the feature labels
    :param model: fitted model exposing a ``feature_importances_`` attribute
                  (e.g. a fitted random forest)
    :return: None -- prints one "rank) name importance" line per feature
    """
    feat_labels = X_train.columns  # column names double as feature labels
    importances = model.feature_importances_
    # Feature indices ordered by descending importance.
    indices = np.argsort(importances)[::-1]
    for rank, idx in enumerate(indices, start=1):
        print("%2d) %-*s %f" % (rank, 30, feat_labels[idx], importances[idx]))
模型评估:准确率,召回率:
def evaluation(model, y_test, y_pred_test):
    """
    Print recall, accuracy and the model's out-of-bag accuracy.

    Assumes a binary classification problem (2x2 confusion matrix).

    :param model: fitted model exposing ``oob_score_`` (e.g. a random forest
                  trained with ``oob_score=True``)
    :param y_test: true class labels of the test set
    :param y_pred_test: predicted class labels of the test set
    :return: None -- metrics are printed
    """
    cm = confusion_matrix(y_test, y_pred_test)
    # Name the four cells of the binary confusion matrix for clarity.
    tn, fp = cm[0, 0], cm[0, 1]
    fn, tp = cm[1, 0], cm[1, 1]
    print("召回率: ", tp / (fn + tp))
    print("准确率: ", (tp + tn) / (tn + tp + fn + fp))
    # Out-of-bag score estimated on samples left out of each bootstrap.
    print('oob袋外准确率', model.oob_score_)
ROC曲线:
def roc(y_train, y_pred_train, y_test, y_pred_test):
    """
    Print the test-set AUC and draw ROC curves for train and test sets.

    :param y_train: true class labels of the training set
    :param y_pred_train: predicted scores for the training set
    :param y_test: true class labels of the test set
    :param y_pred_test: predicted scores for the test set
    :return: None -- prints the AUC and shows the plot
    """
    # Thresholds returned by roc_curve are not needed here.
    fpr_tr, tpr_tr, _ = metrics.roc_curve(y_train, y_pred_train)
    fpr_te, tpr_te, _ = metrics.roc_curve(y_test, y_pred_test)
    print('auc:', metrics.auc(fpr_te, tpr_te))
    plt.figure(figsize=[3, 3])
    plt.plot(fpr_te, tpr_te, 'b--', linestyle='--')  # test: dashed blue
    plt.plot(fpr_tr, tpr_tr, 'r-')                   # train: solid red
    plt.title('ROC curve')
    plt.show()
在原始数据集中分离出特征数据集:
def featureColsTrainDataFilter(df_original_data, cols, target):
    """
    Select the feature columns (everything except the target) from a dataset.

    :param df_original_data: original DataFrame
    :param cols: column names to consider (typically ``df.columns``)
    :param target: name of the classification-label column to exclude
    :return: DataFrame restricted to the feature columns, in ``cols`` order
    """
    # Keep every listed column except the target label.
    features_labels = [col for col in cols if col != target]
    return df_original_data[features_labels]
后续其他代码陆续更新。