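# A Python implementation of k-means clustering
#
# Copyright notice: this is the blogger's original article and may not be reproduced without the blogger's permission. https://blog.csdn.net/u014539580/article/details/78264423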
from numpy import *
import matplotlib.pyplot as plt

# Helper functions
# Load the data set
def loadDataSet(filename):
    """Load a tab-separated numeric text file into a list of float rows.

    Args:
        filename: Path of the text file to read. Each non-blank line holds
            one record whose fields are separated by tab characters.

    Returns:
        A list with one entry per non-blank line; each entry is a list of
        floats parsed from that line's fields.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    dataMat = []
    # The context manager guarantees the handle is closed, even when a
    # malformed field makes float() raise part-way through the file.
    with open(filename) as f:
        for line in f:
            stripped = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record; skipping them avoids float('') raising ValueError.
            if not stripped:
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat

# Return the Euclidean distance between two points
def distEclud(vecA,vecB):
    """Return the Euclidean (L2) distance between two points.

    Args:
        vecA: First point; must support element-wise subtraction with vecB.
        vecB: Second point.

    Returns:
        The square root of the sum of squared coordinate differences.
    """
    delta = vecA - vecB
    squared = power(delta, 2)
    # `sum` is NumPy's sum here (the file star-imports numpy), so every
    # element of `squared` is added up into a single value.
    total = sum(squared)
    return sqrt(total)

# Build a set of k random centroids
def randCent(dataSet,k):
    """Build k random centroids lying inside the bounding box of the data.

    Args:
        dataSet: 2-D NumPy array or matrix with one sample per row and one
            feature per column.
        k: Number of centroids to generate.

    Returns:
        A (k, n) NumPy matrix, where n is the number of columns of dataSet.
        Column j holds values drawn uniformly from
        [min(dataSet[:, j]), max(dataSet[:, j])).
    """
    # Number of features (columns) per sample.
    n = shape(dataSet)[1]
    # asmatrix is used instead of the `mat` alias, which NumPy 2.0 removed;
    # the result is still a matrix, so callers see the same return type.
    centroids = asmatrix(zeros((k, n)))
    for j in range(n):
        column = dataSet[:, j]
        # The .min()/.max() methods return plain scalars for both arrays and
        # matrices, regardless of whether the names min/max refer to the
        # builtins or to NumPy's functions after the star import.
        minJ = column.min()
        rangeJ = float(column.max() - minJ)
        # random.rand yields values in [0, 1); scale them to the column range.
        centroids[:, j] = minJ + rangeJ * random.rand(k, 1)
    return centroids

def kMeans(dataSet,k,dist = distEclud, createCent = randCent):
    """Cluster the rows of dataSet into k groups with the k-means algorithm.

    Args:
        dataSet: 2-D NumPy array (or matrix) with one sample per row.
        k: Number of clusters.
        dist: Callable ``dist(centroid_row, sample_row)`` returning the
            distance between a centroid and a sample.
        createCent: Callable ``createCent(dataSet, k)`` returning the initial
            centroids as an object indexable as ``centroids[j, :]``.

    Returns:
        A tuple ``(centroids, label)``. ``centroids`` is the object returned
        by createCent, updated in place with the final centroid positions;
        ``label`` is a float array of length m (the number of rows) holding
        the index of the centroid assigned to each sample.

    Side effects:
        Prints the current centroids once per iteration.
    """
    m = shape(dataSet)[0]
    # Start every label at -1, which is never a valid cluster index, so the
    # first pass always registers a change. Starting at 0 would hide points
    # whose nearest centroid is centroid 0 and could end the loop early.
    label = full(m, -1.0)
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assign every sample to its nearest centroid.
        for i in range(m):
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = dist(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if label[i] != minIndex:
                clusterChanged = True
                label[i] = minIndex
        print(centroids)

        # Move each centroid to the mean of the samples assigned to it.
        for cent in range(k):
            ptsInClust = dataSet[nonzero(label == cent)[0]]
            # The mean of zero samples is NaN, which would corrupt the
            # centroid; an empty cluster keeps its previous position instead.
            if len(ptsInClust) > 0:
                centroids[cent, :] = mean(ptsInClust, axis=0)

    return centroids, label


if __name__ == '__main__':
    # Demo: cluster the points stored in testSet.txt into k groups and plot
    # each cluster with its own marker and colour, plus the final centroids.
    k = 4
    filename = 'testSet.txt'
    dataArray = array(loadDataSet(filename))
    centroids, label = kMeans(dataArray, k)

    # One marker/colour per cluster index; both strings hold exactly k entries.
    # (Named `markers`, not `str`, so the builtin str is not shadowed.)
    markers = 'o*s^'
    colors = 'bgrc'
    for cluster in range(k):
        # Draw all members of a cluster in one call instead of one call per
        # point; linestyle='none' keeps them as unconnected markers.
        members = dataArray[label == cluster]
        plt.plot(members[:, 0], members[:, 1], linestyle='none',
                 color=colors[cluster], marker=markers[cluster])

    # Mark the final centroids with black plus signs.
    for i in range(len(centroids)):
        plt.plot(centroids[i, 0], centroids[i, 1], 'k+')

    # Without show(), running this file as a plain script exits before the
    # figure is ever displayed.
    plt.show()

# You may also like
#
# Reposted from blog.csdn.net/u014539580/article/details/78264423