版权声明:本文为博主原创文章,若需转载,请注明http://blog.csdn.net/qq_30091945 https://blog.csdn.net/qq_30091945/article/details/82184489
前言
K-Means聚类算法学习笔记请移步:斯坦福机器学习笔记(六)——K-Means聚类算法。同时本篇博客的PDF笔记请移步:K-Means聚类算法。
K-Means聚类Python核心代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/8/2910:38
# @Author : DaiPuWei
# E-Mail : [email protected]
# @Site :
# @File : KMeans.py
# @Software: PyCharm
import numpy as np
class KMeans:
    def __init__(self, train_data, k):
        """
        Construct a K-Means clusterer.
        :param train_data: training data, array of shape (n_samples, n_features)
        :param k: number of clusters
        """
        self.Train_Data = train_data
        self.K = k
        row, col = np.shape(train_data)
        self.Label = np.array([0] * row)        # cluster label assigned to each sample
        self.centroids = np.zeros((k, col))     # centroid coordinates, shape (k, n_features)
        # Initialize every centroid coordinate uniformly at random within
        # the per-feature [min, max] range of the training data.
        for i in range(col):
            Min = np.min(self.Train_Data[:, i])
            Max = np.max(self.Train_Data[:, i])
            self.centroids[:, i] = Min + float(Max - Min) * np.random.rand(k)

    def EcludDistance(self, dataA, dataB):
        """
        Compute the Euclidean distance between two data points.
        :param dataA: data point A
        :param dataB: data point B
        :return: Euclidean distance between dataA and dataB
        """
        return np.sqrt(np.sum((dataA - dataB) ** 2))

    def kmeans(self):
        """
        Run K-Means until the total within-cluster sum of squared
        distances (the objective) stops changing (tolerance 1e-6).
        :return: tuple (labels, centroids)
        """
        dist = 0
        olddist = 1
        row, col = np.shape(self.Train_Data)
        while np.abs(dist - olddist) > 1E-6:
            olddist = dist
            # Assignment step: label each sample with its nearest centroid.
            for i in range(row):
                distances = []
                # Squared distance from sample i to every centroid.
                for j in range(self.K):
                    distances.append(self.EcludDistance(self.Train_Data[i], self.centroids[j]) ** 2)
                self.Label[i] = np.argmin(distances)
            # Update step: move each centroid to the mean of its cluster.
            for i in range(self.K):
                cluster_data = self.Train_Data[self.Label == i]
                size = len(cluster_data)
                # BUG FIX: an empty cluster used to divide by zero here and
                # fill the centroid with NaN; keep the previous centroid
                # instead so the algorithm stays numerically valid.
                if size > 0:
                    self.centroids[i] = np.sum(cluster_data, 0) / size
            # Recompute the objective: total squared distance of every
            # sample to its assigned centroid.
            dist = 0
            for (data, label) in zip(self.Train_Data, self.Label):
                dist += self.EcludDistance(data, self.centroids[label]) ** 2
        return self.Label, self.centroids
实验结果代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/8/2911:18
# @Author : DaiPuWei
# E-Mail : [email protected]
# @Site :
# @File : demo.py
# @Software: PyCharm
from KMeans import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans as _KMeans
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
def run_main():
    """
    Demo entry point: cluster the Iris data set with the hand-written
    K-Means and with sklearn's KMeans, print each one's label-aligned
    accuracy, and visualise clusters on the last two features.
    """
    from itertools import permutations

    def _cluster_accuracy(true_label, pred_label, n_clusters):
        # Cluster ids coming out of K-Means are arbitrary, so scoring the
        # raw ids against the true class labels is meaningless. Try every
        # permutation of cluster ids and keep the best accuracy.
        best = 0.0
        for perm in permutations(range(n_clusters)):
            mapped = np.array([perm[p] for p in pred_label])
            best = max(best, accuracy_score(true_label, mapped))
        return best

    # Load the Iris data set
    iris = load_iris()
    label = np.array(iris.target)
    data = np.array(iris.data)
    # Scale every feature into [0, 1]
    data = MinMaxScaler().fit_transform(data)
    # Configure matplotlib so Chinese text in figures renders correctly
    mpl.rcParams['font.sans-serif'] = [u'simHei']
    mpl.rcParams['axes.unicode_minus'] = False
    # --- hand-written K-Means ---
    k = 3
    color = ['r', 'g', 'b']
    kmeans = KMeans(data, k)
    predict_label, centroids = kmeans.kmeans()
    # BUG FIX: align arbitrary cluster ids with the true labels before
    # computing accuracy (raw accuracy_score on cluster ids is wrong).
    print("手写K-Means聚类算法的准确率:%f" % (_cluster_accuracy(label, predict_label, k)))
    for i in range(k):
        plt.scatter(centroids[i][2], centroids[i][3], c=color[i], marker='x', s=40)
    for (_data, _label) in zip(data, predict_label):
        plt.scatter(_data[2], _data[3], color=color[_label], alpha=0.3)
    plt.show()
    # --- sklearn K-Means ---
    k = 3
    kmeans = _KMeans(n_clusters=k, max_iter=1000)
    # KMeans.fit ignores a y argument, so don't pass the labels.
    kmeans.fit(data)
    predict_label = kmeans.predict(data)
    centroids = kmeans.cluster_centers_
    color = ['r', 'g', 'b']
    print("sklearn的K-Means聚类算法的准确率:%f" % (_cluster_accuracy(label, predict_label, k)))
    for i in range(k):
        plt.scatter(centroids[i][2], centroids[i][3], c=color[i], marker='x', s=40)
    for (_data, _label) in zip(data, predict_label):
        plt.scatter(_data[2], _data[3], color=color[_label], alpha=0.3)
    plt.show()

if __name__ == '__main__':
    run_main()
实验结果
上述的程序利用K-Means聚类算法对鸢尾花数据进行聚类,计算了聚类后的分类准确率。实验数据表明,实验结果对质心的初始位置比较敏感。下面是两组不同的实验结果。
实验一:
手写K-Means的数据可视化:
sklearn的K-Means的数据可视化:
实验二:
手写K-Means的数据可视化:
sklearn的K-Means的数据可视化: