matplotlib.pyplot绘制kmeans的聚合程度,以及轮廓系数

Kmeans2Pmml.py
# -*- coding:utf-8 -*-
import pandas
from sklearn.model_selection import train_test_split
import numpy as np  # 导入numpy库
import matplotlib.pyplot as plt  # 导入matplotlib库
from sklearn.cluster import KMeans  # 导入sklearn聚类模块
from sklearn import metrics  # 导入sklearn效果评估模块
import random

from sklearn.decomposition import PCA
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml

def make_result_pic(x_train,silhouette_s,pic_kmeans,data_view,n_clusters,result_pic):
    """Draw a summary picture of a KMeans clustering and save it to disk.

    The features are projected to two dimensions with PCA, ``pic_kmeans`` is
    fitted on the projected data, and a 2x2 figure is produced:

    * cell 1 - scatter plot of the projected samples coloured by cluster,
      with each cluster centre drawn as a larger outlined marker;
    * cell 2 - text only: the silhouette score followed by ``data_view``;
    * cell 3 - text only: number of samples assigned to each cluster.

    Args:
        x_train: Feature matrix passed to ``PCA.fit_transform``.
        silhouette_s: Silhouette score to display (converted with ``str``).
        pic_kmeans: Unfitted KMeans-like estimator; it is fitted here on the
            2-D projection, so it is mutated by this call.
        data_view: Free-form text appended after the silhouette score.
        n_clusters: Number of clusters to draw; expected to match the
            estimator's own cluster count.
        result_pic: Path (or file object) handed to ``Figure.savefig``.
    """
    # Function-scope import keeps this block independent of the file header.
    from collections import Counter

    # Reduce the data to 2 dimensions so the clusters can be drawn on a plane.
    reduced_data = PCA(n_components=2).fit_transform(x_train)
    print(len(reduced_data))
    pic_kmeans.fit(reduced_data)
    x_pre = pic_kmeans.predict(reduced_data)

    # Samples per cluster. Labels are converted to plain ints and sorted so
    # the text in the figure reads "{0: 12, 1: 30}" regardless of the NumPy
    # version's scalar repr or the order labels first appear in.
    label_counts = Counter(int(label) for label in x_pre)
    distribution = {label: label_counts[label] for label in sorted(label_counts)}

    centers = pic_kmeans.cluster_centers_  # centre of every cluster

    # One random RGB colour per cluster (each channel rounded to 4 digits).
    color_list = [
        tuple(round(random.uniform(0, 1), 4) for _ in range(3))
        for _ in range(n_clusters)
    ]

    fig = plt.figure(figsize=(10, 10))
    try:
        # Cell 1: the clusters themselves.
        plt.subplot(2, 2, 1)
        for i in range(n_clusters):
            cluster = reduced_data[x_pre == i]  # samples assigned to cluster i
            # ``color=`` (not ``c=``): a bare RGB tuple given through ``c`` is
            # ambiguous with a sequence of values to be colour-mapped.
            plt.scatter(cluster[:, 0], cluster[:, 1], color=color_list[i], marker='.')
            plt.plot(centers[i][0], centers[i][1], 'o', markerfacecolor=color_list[i],
                     markeredgecolor='k', markersize=6)

        # Cell 2: silhouette score and data-set sizes (title text only).
        plt.subplot(2, 2, 2)
        plt.axis('off')
        plt.title('silhouette_s:' + str(silhouette_s)+" "+data_view, loc='center')

        # Cell 3: per-cluster sample counts (title text only).
        plt.subplot(2, 2, 3)
        plt.axis('off')
        plt.title('distribution:' + str(distribution), loc='center')

        # Let matplotlib adjust the spacing, then write the picture.
        fig.tight_layout()
        fig.savefig(result_pic)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)

def main(modelName,trainingFilePath,code,k_test_size,k_clusters,k_random_state,k_max_iter):
    """Train a KMeans model on a CSV file, export it as PMML and save a picture.

    Args:
        modelName: Output path of the PMML file. The evaluation picture is
            written next to it with the extension replaced by ``.png``.
        trainingFilePath: Path of the CSV file to read.
        code: Text encoding passed to ``pandas.read_csv``.
        k_test_size: ``test_size`` passed to ``train_test_split``.
        k_clusters: ``n_clusters`` for KMeans.
        k_random_state: ``random_state`` for KMeans (may be ``None``).
        k_max_iter: ``max_iter`` for KMeans.

    Raises:
        SystemExit: With status 1 when the CSV has fewer than 3 columns.
    """
    # Function-scope import keeps this block independent of the file header.
    import os

    # Only swap the file extension. A plain str.replace("pmml", "png") would
    # also rewrite directory names containing "pmml" (e.g. "kmeans_pmml/")
    # and would leave a path without "pmml" unchanged, so the picture and
    # the model would end up sharing one filename.
    result_pic = os.path.splitext(str(modelName))[0] + ".png"

    iris_df = pandas.read_csv(trainingFilePath, encoding=code)
    columns = iris_df.columns.tolist()

    # The first column is treated as a row id and excluded from the features.
    # NOTE(review): the original comment also says the last column is a label
    # column, but the code keeps it as a feature - confirm which is intended.
    first_colName = columns[0]
    X = iris_df[iris_df.columns.difference([first_colName])]

    # Split into train / test parts; only the train part is used for the picture.
    x_train, x_test = train_test_split(X, test_size=k_test_size, random_state=0)
    data_view = "total: " + str(len(iris_df)) + " train:" + str(len(x_train)) + " test:" + str(len(x_test))
    print(data_view)

    if len(columns) < 3:
        print("columnNum error")
        # ``exit`` is injected by the ``site`` module for interactive use and
        # may be missing (python -S, embedded interpreters); raise directly.
        raise SystemExit(1)
    print("check success")

    # Two separate estimators with identical settings: one is exported, the
    # other is re-fitted on PCA-reduced data purely for drawing.
    model_kmeans = KMeans(n_clusters=k_clusters, random_state=k_random_state, max_iter=k_max_iter)
    pic_kmeans = KMeans(n_clusters=k_clusters, random_state=k_random_state, max_iter=k_max_iter)
    pipeline = PMMLPipeline([
        ("classifier", model_kmeans)
    ])
    pipeline.fit(X)  # train on the full feature set
    y_pre = pipeline.predict(X)

    # Mean silhouette coefficient of the exported model, rounded for display.
    silhouette_s = metrics.silhouette_score(X, y_pre, metric='euclidean')
    silhouette_s = round(silhouette_s, 4)

    # Draw the result picture, then export the fitted pipeline as PMML.
    make_result_pic(x_train,silhouette_s,pic_kmeans,data_view,k_clusters,result_pic)
    sklearn2pmml(pipeline, modelName, with_repr=True)
kmeans_main.py
# -*- coding:utf-8 -*-
import sys
import kmeans_pmml.Kmeans2Pmml as m
#sys.argv[0] 为脚本自身
#m.main(sys.argv[1],sys.argv[2])
from util import codingUtil
try:
    # Command-line arguments (sys.argv[0] is the script itself):
    #   1: output PMML path,       e.g. "E:/data/out/kmeans.pmml"
    #   2: training CSV path,      e.g. "E:/data/cluster2.csv"
    #   3: number of clusters,     e.g. 3 (default is 8)
    #   4: random state, or the literal text "None"
    #   5: maximum iterations,     e.g. 200
    #   6: test-set fraction,      e.g. 0.2
    argv = sys.argv
    model_path = argv[1]
    training_path = argv[2]
    cluster_count = int(argv[3])
    # The text "None" selects an unseeded run; anything else must be an int.
    random_state = None if argv[4] == "None" else int(argv[4])
    max_iter = int(argv[5])
    test_size = float(argv[6])
    # Detect the CSV's text encoding before handing everything to the trainer.
    file_code = codingUtil.file_encoding(training_path)
    m.main(model_path, training_path, file_code, test_size,
           cluster_count, random_state, max_iter)
except Exception as err:
    # Any failure is reported on stdout; the script then ends normally.
    print('Exception :\t\t', str(err))

聚类效果图:包含数据分布和聚合分布情况

总体代码参考 

文章来源-宋天龙.《Python数据分析与数据化运营》

但是书中案例的数据只有两个特征,刚好可以直接画出聚合效果图。

实际数据可能有多个特征,所以需要先用 PCA 降到二维,再去绘图。

猜你喜欢

转载自blog.csdn.net/qq_14865711/article/details/82998785