PCA主成分分析法 简化数据

通过 PCA 可以查看各主成分的重要性(即对应特征值所代表的方差大小);通过比较可以舍弃重要性低的成分,从而降低数据维度,减少收集和处理数据的成本。

示例:该示例可以将数据中的590个特征缩减为6个。

pca.py

'''
Created on 2018年8月1日

@author: hcl
'''
from numpy import *
import matplotlib.pyplot as plt

# Load a delimiter-separated numeric text file
def loadDataSet(filename,delim = '\t'):
    """Read a delimiter-separated text file of numbers into a NumPy matrix.

    Every non-blank line becomes one row: the line is stripped, split on
    ``delim`` and each field is converted with ``float`` (so a literal
    ``NaN`` field is accepted and becomes a NaN value).

    Args:
        filename: Path of the text file to read.
        delim: Field separator; defaults to a tab.

    Returns:
        A ``numpy.matrix`` with one row per data line.

    Raises:
        ValueError: If a field cannot be parsed as a float.
    """
    # The context manager closes the file even when parsing fails later on.
    with open(filename) as fr:
        # Blank lines are skipped; float('') would otherwise raise ValueError.
        stringArr = [line.strip().split(delim) for line in fr if line.strip()]
    dataArr = [list(map(float, fields)) for fields in stringArr]
    # `mat` was only an alias of `asmatrix` and was removed in NumPy 2.0.
    return asmatrix(dataArr)

def pca(dataMat,topN=999999):
    """Project data onto its leading principal components.

    Args:
        dataMat: 2-D data with one sample per row and one feature per
            column (a ``numpy.matrix`` or anything ``asmatrix`` accepts).
        topN: Number of principal components to keep. Values larger than
            the number of features keep every component.

    Returns:
        A tuple ``(lowDDataMat, reconMat)``: the centered samples expressed
        in the reduced component space, and those samples mapped back into
        the original feature space (with the column means added back).
        The sign of each column of ``lowDDataMat`` is arbitrary, as the sign
        of an eigenvector is not determined.
    """
    # Coerce to a matrix so `*` below is always matrix multiplication.
    dataMat = asmatrix(dataMat)
    # Center the samples on the per-feature mean.
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals
    # Covariance between features (columns are the variables).
    covMat = cov(meanRemoved, rowvar=0)
    # A covariance matrix is symmetric, so use the symmetric solver: it
    # returns real eigenvalues/eigenvectors, whereas the general `eig` may
    # return complex values with zero imaginary part. `asmatrix` also lifts
    # the 0-d result `cov` gives for a single feature to a 1x1 matrix.
    eigVals, eigVects = linalg.eigh(asmatrix(covMat))
    eigVects = asmatrix(eigVects)
    # Indices of the topN largest eigenvalues, largest first.
    eigValInd = argsort(eigVals)
    eigValInd = eigValInd[:-(topN+1):-1]
    redEigVects = eigVects[:, eigValInd]
    # Project onto the kept components, then map back to feature space.
    lowDDataMat = meanRemoved * redEigVects
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat

# Missing-value handling: replace NaN entries with the column mean
def replaceNaNWithMean(filename='secom.data.txt', delim=' '):
    """Load a data file and replace every NaN with its column's mean.

    Args:
        filename: Path of the data file, read with ``loadDataSet``.
            Defaults to ``'secom.data.txt'``.
        delim: Field separator passed to ``loadDataSet``; defaults to a
            single space.

    Returns:
        The loaded matrix with each NaN entry replaced by the mean of the
        non-NaN entries of the same column. A column that is entirely NaN
        has no mean to substitute and is left unchanged.
    """
    # Parse the data file.
    datMat = loadDataSet(filename, delim)
    # Number of features (columns).
    numFeat = shape(datMat)[1]
    for i in range(numFeat):
        # Compute the NaN mask for this column once and reuse it.
        nanMask = isnan(datMat[:, i].A)
        # Nothing to fill, or nothing to average: leave the column as is.
        if not nanMask.any() or nanMask.all():
            continue
        # Mean over the rows whose value in this column is not NaN.
        meanVal = mean(datMat[nonzero(~nanMask)[0], i])
        # Overwrite the NaN rows of this column with that mean.
        datMat[nonzero(nanMask)[0], i] = meanVal
    return datMat

if __name__ == '__main__':
    # Alternative demo (disabled): reduce the 2-D testSet.txt data to one
    # component and plot the original points against the reconstruction.
    #     dataMat = loadDataSet('testSet.txt')
    #     lowMat,reconMat = pca(dataMat,1)
    #     fig = plt.figure()
    #     ax = fig.add_subplot(111)
    #     ax.scatter(dataMat[:,0].flatten().A[0],dataMat[:,1].flatten().A[0],marker='^',s=90)
    #     ax.scatter(reconMat[:,0].flatten().A[0],reconMat[:,1].flatten().A[0],marker='o',s=50,c='red')
    #     plt.show()

    # Load the data set with missing values filled in, then inspect the
    # eigenvalues of its covariance matrix to see how many components matter.
    dataMat = replaceNaNWithMean()
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals
    conMat = cov(meanRemoved, rowvar=0)
    # `asmatrix` replaces the `mat` alias, which was removed in NumPy 2.0.
    eigVals, eigVects = linalg.eig(asmatrix(conMat))
    # Print the eigenvalues themselves (not just their shape) so their
    # relative sizes can be compared.
    print(eigVals)
    

输出:

[ 5.34151979e+07+0.00000000e+00j  2.17466719e+07+0.00000000e+00j
  8.24837662e+06+0.00000000e+00j  2.07388086e+06+0.00000000e+00j
  1.31540439e+06+0.00000000e+00j  4.67693557e+05+0.00000000e+00j
  2.90863555e+05+0.00000000e+00j  2.83668601e+05+0.00000000e+00j
  2.37155830e+05+0.00000000e+00j  2.08513836e+05+0.00000000e+00j
  1.96098849e+05+0.00000000e+00j  1.86856549e+05+0.00000000e+00j
  1.52422354e+05+0.00000000e+00j  1.13215032e+05+0.00000000e+00j
  1.08493848e+05+0.00000000e+00j  1.02849533e+05+0.00000000e+00j
...
  0.00000000e+00+0.00000000e+00j  0.00000000e+00+0.00000000e+00j
  0.00000000e+00+0.00000000e+00j  0.00000000e+00+0.00000000e+00j
  0.00000000e+00+0.00000000e+00j  0.00000000e+00+0.00000000e+00j
  0.00000000e+00+0.00000000e+00j  0.00000000e+00+0.00000000e+00j
  0.00000000e+00+0.00000000e+00j  0.00000000e+00+0.00000000e+00j]

猜你喜欢

转载自blog.csdn.net/zhuisaozhang1292/article/details/81347158