Machine Learning in Action --- Naive Bayes Classification

from numpy import *

def loadDataSet():
    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1]    #1 is abusive, 0 not
    return postingList,classVec
    
def createVocabList(dataSet): # dataSet is the postingList returned by loadDataSet()
    vocabSet = set([])  # create an empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # the '|' operator takes the union of two sets
    return list(vocabSet)  # convert the set back to a list and return it
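
A quick sanity check (a minimal sketch; the ordering of the returned list is arbitrary because it comes from a set):

listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
print len(myVocabList)  # 32: the toy corpus contains 32 distinct words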
		
def setOfWords2Vec(vocabList, inputSet): # vocabList: the vocabulary from createVocabList; inputSet: one document
    returnVec = [0]*len(vocabList)  # create a vector of zeros, one slot per vocabulary word
    for word in inputSet:  # for each word in the document
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1  # set that word's slot in the vector to 1
        else: print "the word: %s is not in my Vocabulary!" % word
    return returnVec
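
Continuing the sketch above, each post converts to a fixed-length presence vector:

vec = setOfWords2Vec(myVocabList, listOPosts[0])
print len(vec), sum(vec)  # 32 7: the first post has 7 distinct words, all in the vocabulary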
	
def trainNB0(trainMatrix, trainCategory): # trainMatrix: matrix of 0/1 document vectors; trainCategory: vector of class labels, one per document
    numTrainDocs = len(trainMatrix)  # number of training documents
    numWords = len(trainMatrix[0])   # length of each document vector (the vocabulary size)
    pAbusive = sum(trainCategory)/float(numTrainDocs)  # (number of class-1 documents) / (total documents)

    p0Num = ones(numWords); p1Num = ones(numWords)  # numerator vectors, initialized to ones (Laplace smoothing)
    p0Denom = 2.0; p1Denom = 2.0                    # denominators, initialized to 2.0 for the same reason

    for i in range(numTrainDocs):   # for each document
        if trainCategory[i] == 1:   # if it belongs to class 1
            p1Num += trainMatrix[i]           # p1Num accumulates, per word, how often it occurs in class-1 documents
            p1Denom += sum(trainMatrix[i])    # p1Denom accumulates the total word count over all class-1 documents
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num/p1Denom)  # log of each word's conditional frequency given class 1
    p0Vect = log(p0Num/p0Denom)  # (taking logs here prevents floating-point underflow in classifyNB)
    return p0Vect, p1Vect, pAbusive
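
Two details in trainNB0 deserve a note. Initializing the counts to ones and the denominators to 2.0 is Laplace smoothing: without it, any word unseen in one class would get probability 0 and zero out the entire product p(w1|c)p(w2|c)... at classification time. Taking the log turns that product into a sum and also avoids floating-point underflow when many small probabilities are multiplied. A minimal sketch of the underflow problem (not part of the original listing):

from numpy import log
probs = [0.01] * 200      # 200 small conditional probabilities
prod = 1.0
for p in probs:
    prod *= p
print prod                # 0.0: the true value 1e-400 underflows double precision
print 200 * log(0.01)     # -921.034...: the log-space sum is perfectly representable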
    
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # vec2Classify: the document vector to classify
    # p0Vec, p1Vec: from trainNB0, the log-probability of each word given each class
    # pClass1: pAbusive from trainNB0, i.e. the prior probability of class 1 (abusive)
    p1 = sum(vec2Classify*p1Vec) + log(pClass1)
    p0 = sum(vec2Classify*p0Vec) + log(1.0-pClass1)

    if p1 > p0:
        return 1
    else:
        return 0
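
classifyNB is Bayes' rule in log space with the shared denominator p(w) dropped, since it does not affect which class wins. Under the naive conditional-independence assumption, each class score is

    log p(c|w) ∝ log p(c) + Σ_j w_j · log p(word_j | c)

and the element-wise product vec2Classify*p1Vec computes exactly the weighted sum on the right: for set-of-words vectors w_j is 0 or 1, so it simply selects the log-probabilities of the words present in the document; for bag-of-words vectors it weights them by their counts.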
			
def testingNB():
    listOPosts, listClasses = loadDataSet()  # each element of listOPosts is one tokenized post
    myVocabList = createVocabList(listOPosts)  # build the vocabulary over all the posts
    trainMat = []
    for postinDoc in listOPosts:  # each element of trainMat is a post converted to a 0/1 word-presence vector
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))  # the training step

    testEntry = ['love', 'my', 'dalmation']  # test document
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))  # first convert the test document into a vector
    print testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb)
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb)
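
On the toy corpus this should print (as in the book's example; the exact spacing may differ):

['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1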
		
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1  # count how many times each word occurs in the document
    return returnVec
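
The only change from setOfWords2Vec is += 1 instead of = 1: the bag-of-words model records how many times each word occurs, not just whether it occurs. A minimal sketch with a hypothetical three-word vocabulary:

vocab = ['spam', 'ham', 'eggs']
doc = ['spam', 'spam', 'eggs']
print setOfWords2Vec(vocab, doc)    # [1, 0, 1]: presence only
print bagOfWords2VecMN(vocab, doc)  # [2, 0, 1]: counts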
		
def textParse(bigString):    # input: one long string; output: a list of lowercase word tokens
    import re
    listOfTokens = re.split(r'\W+', bigString)  # split on runs of non-alphanumeric characters ('\W*' in the book is a known erratum)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]  # drop tokens of 2 characters or fewer
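
A quick check of the tokenizer, using the sample sentence from the book:

print textParse('This book is the best book on Python or M.L. I have ever laid eyes upon.')
# ['this', 'book', 'the', 'best', 'book', 'python', 'have', 'ever', 'laid', 'eyes', 'upon']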
    
def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        # read one spam file and split the string into a list of all its word tokens
        docList.append(wordList)   # append each tokenized document to docList
        fullText.extend(wordList)  # also merge all of the document's words into one flat list
        classList.append(1)        # this document's class label is 1 (spam)

        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)        # this document's class label is 0 (ham)

    vocabList = createVocabList(docList)  # build the vocabulary
    trainingSet = range(50); testSet = []

    for i in range(10):  # randomly hold out 10 documents as the test set; the rest are training samples
        randIndex = int(random.uniform(0, len(trainingSet)))  # pick a random index into the remaining training set
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  # delete the chosen index from the training set

    trainMat = []; trainClass = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClass.append(classList[docIndex])
    # at this point we have the training matrix and the training label vector

    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClass))
    errorCount = 0

    for docIndex in testSet:        # classify the 10 randomly chosen test documents
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print "classification error", docList[docIndex]
    print 'the error rate is: ', float(errorCount)/len(testSet)
    #return vocabList,fullText
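
Because the 10 test documents are drawn at random, the printed error rate varies from run to run; a more stable estimate averages over many random splits (hold-out cross validation). A minimal sketch of such a wrapper, assuming spamTest is modified to return its error rate instead of only printing it:

def averagedSpamTest(numTrials=10):   # hypothetical helper, not in the original listing
    total = 0.0
    for _ in range(numTrials):
        total += spamTest()           # assumes spamTest() returns float(errorCount)/len(testSet)
    print 'mean error rate over %d trials: %f' % (numTrials, total/numTrials)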
		
def calcMostFreq(vocabList, fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)  # count how often each vocabulary word occurs in fullText
    sortedFreq = sorted(freqDict.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]  # return the 30 most frequent (word, count) pairs
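
The returned pairs are mostly stop words ('the', 'and', and so on), which carry little class information; that is why localWords below strips the top 30 from the vocabulary before training. An illustrative call from inside spamTest or localWords (the counts shown are hypothetical; they depend on the actual text):

top30 = calcMostFreq(vocabList, fullText)
print top30[:3]  # e.g. [('the', 137), ('and', 92), ('for', 61)]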
	
def localWords(feed1, feed0):
    import feedparser
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))  # length of the shorter of the two feeds' entry lists

    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])  # visit one entry of the first RSS feed at a time
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)

        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)

    vocabList = createVocabList(docList)
    top30Words = calcMostFreq(vocabList, fullText)  # the 30 words that occur most often across both feeds

    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])  # remove those 30 high-frequency words from the vocabulary
    trainingSet = range(2*minLen); testSet = []
    for i in range(20):  # randomly hold out 20 entries from the two RSS feeds as test documents
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])

    trainMat = []; trainClasses = []
    for docIndex in trainingSet:  # training documents
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])

    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the held-out entries and compute the error rate
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount)/len(testSet)
    return vocabList, p0V, p1V
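
localWords expects two feeds already parsed by feedparser. The book's example uses two Craigslist RSS feeds (these URLs come from the book and may no longer serve RSS; substitute any two feeds):

import feedparser
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
vocabList, pSF, pNY = localWords(ny, sf)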


Reprinted from blog.csdn.net/carl95271/article/details/80765756