[Série Xiaobai] Visualisation de l'arbre de décision du modèle de prédiction de survie Titanic via pydot + GraphViz

Pour une installation spécifique de GraphViz, veuillez cliquer sur le lien

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier  # 从sklearn中导入决策树分类器模型
from sklearn.feature_extraction import DictVectorizer  # 特征抽取:将特征与值的映射字典组成的列表转换成向量。
from sklearn.model_selection import cross_val_score  # 导入数据交叉验证的数据方法
from sklearn import metrics  # 指明Python sklearn机器学习各种评价指标
import matplotlib.pyplot as plt
import seaborn as sns

# 数据加载
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')


# 使用平均年龄来填充年龄中的nan值
train_data['Age'].fillna(train_data['Age'].mean(), inplace=True)
test_data['Age'].fillna(test_data['Age'].mean(),inplace=True)
# 使用票价的均值填充票价中的nan值
train_data['Fare'].fillna(train_data['Fare'].mean(), inplace=True)
test_data['Fare'].fillna(test_data['Fare'].mean(),inplace=True)
#print(train_data['Embarked'].value_counts())
# 使用登录最多的港口来填充登录港口的nan值
train_data['Embarked'].fillna('S', inplace=True)
test_data['Embarked'].fillna('S',inplace=True)
# 特征选择
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train_features = train_data[features]

# 显示特征之间的相关系数
plt.figure(figsize=(10, 10))
plt.title('Pearson Correlation between Features',y=1.05,size=15)
train_data_hot_encoded = train_features.drop('Embarked',1).join(train_features.Embarked.str.get_dummies())
train_data_hot_encoded = train_data_hot_encoded.drop('Sex',1).join(train_data_hot_encoded.Sex.str.get_dummies())
#  计算特征之间的Pearson系数,即相似度,具体可以看链接:https://blog.csdn.net/KaelCui/article/details/105235136
sns.heatmap(train_data_hot_encoded.corr(),linewidths=0.1,vmax=1.0, fmt= '.2f', square=True,linecolor='white',annot=True)
plt.show()


# 使用饼图来进行Survived取值的可视化
#print(type(train_data["Survived"].value_counts()))
train_data["Survived"].value_counts().plot(kind = "pie", label='Survived')
plt.show()

# 不同的Pclass,幸存人数(条形图)
sns.barplot(x = 'Pclass', y = "Survived", data = train_data);
plt.show()

# 不同的Embarked,幸存人数(条形图)
sns.barplot(x = 'Embarked', y = "Survived", data = train_data);
plt.show()


# 训练并显示特征向量的重要程度
def train(train_features, train_labels):
	# 构造CART决策树
	clf = DecisionTreeClassifier()
	# 决策树训练
	clf.fit(train_features, train_labels)
	# 显示特征向量的重要程度
	coeffs = clf.feature_importances_
	#print(coeffs)
	df_co = pd.DataFrame(coeffs, columns=["importance_"])
	# 下标设置为Feature Name
	df_co.index = train_features.columns
	#print(df_co.index)
	df_co.sort_values("importance_", ascending=True, inplace=True)
	df_co.importance_.plot(kind="barh")

	plt.title("Feature Importance")
	plt.show()
	return clf

clf = train(train_data_hot_encoded, train_data["Survived"])

# 决策树可视化
import pydotplus  # 可以将Sklearn生成dot格式
from sklearn.externals.six import StringIO
from sklearn.tree import export_graphviz  #  #可视化所需的导入工具从sklearn.tree导入export_graphviz导入pydot导入操作系统

def show_tree(clf):
	dot_data = StringIO()
	export_graphviz(clf, out_file=dot_data)
	graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
	graph.write_pdf("titanic_tree.pdf")

show_tree(clf)

Insérez la description de l'image ici
Insérez la description de l'image ici
Insérez la description de l'image ici
Insérez la description de l'image ici
Insérez la description de l'image ici
Insérez la description de l'image ici

Publié 42 articles originaux · loué 28 · visites 4961

Je suppose que tu aimes

Origine blog.csdn.net/KaelCui/article/details/105261749
conseillé
Classement