# -- coding: utf-8 --
# Driver script: train the naive Bayes classifier on the toy corpus,
# classify two sample entries, then run the spam-filtering hold-out test.
from numpy import array

from bayes import (loadDataSet, creatVocabList, setOfWords2Vec, trainNB0,
                   classifyNB, spamTest)


def main():
    """Exercise the bayes module end to end, printing each intermediate."""
    listOPosts, listClasses = loadDataSet()
    myVocabList = creatVocabList(listOPosts)  # build the word list (vocabulary)
    print(myVocabList)
    # One set-of-words 0/1 vector per training document.
    trainMat = [setOfWords2Vec(myVocabList, post) for post in listOPosts]
    print(trainMat)
    p0v, p1v, pAb = trainNB0(trainMat, listClasses)
    print(p1v)
    print(p0v)
    testEntries = [['love', 'my', 'dalmation'], ['stupid', 'garbage']]
    for entry in testEntries:
        wordVec = setOfWords2Vec(myVocabList, entry)
        print(wordVec)
        print(classifyNB(array(wordVec), array(p0v), array(p1v), pAb))
    spamTest()


if __name__ == '__main__':
    main()
# -- coding: utf-8 --
# bayes: naive Bayes text classifier (set-of-words / bag-of-words models).
from numpy import *
from math import log
import re


def loadDataSet():
    """Return a toy corpus of tokenized posts and their labels.

    Returns:
        postingList: list of token lists, one per document.
        classVec: parallel labels; 1 = abusive text, 0 = normal text.
    """
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive, 0 = normal
    return postingList, classVec


def creatVocabList(dataSet):
    """Build the vocabulary: union of every document's tokens, as a list.

    (Misspelled name kept as-is -- existing callers use 'creatVocabList'.)
    """
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # set union
    return list(vocabSet)


def setOfWords2Vec(vocabList, inputSet):
    """Set-of-words model: 0/1 vector marking which vocab words appear."""
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print('the word : %s is not in my Vocabulary!' % word)
    return returnVec


def bagOfWords2VecMN(vocabList, inputSet):
    """Bag-of-words model: per-vocab-word occurrence counts."""
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec


def trainNB0(trainMatrix, trainCategory):
    """Train naive Bayes: per-word log conditional probabilities + prior.

    Args:
        trainMatrix: 2-D array/list of word vectors, one row per document.
        trainCategory: parallel 0/1 class labels.
    Returns:
        p0Vect: list of log P(word | class 0), one entry per vocab word.
        p1Vect: list of log P(word | class 1).
        pAbusive: P(class == 1), the class prior.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: start counts at 1 (denominators at 2) so no
    # conditional probability is exactly 0, which would zero the product.
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Work in log-space: sums replace products, avoiding float underflow.
    p1Vect = [log(count / p1Denom) for count in p1Num]
    p0Vect = [log(count / p0Denom) for count in p0Num]
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a word vector; return 1 (abusive) or 0 (normal).

    Compares the two log-posteriors sum(x * log P(w|c)) + log P(c).
    """
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0


def textParse(bigString):
    r"""Split raw text on non-word runs into lowercase tokens > 2 chars.

    Bug fix: the original split on r'\w*' (word characters), which
    returned punctuation fragments and empty strings instead of words;
    r'\W+' splits on the separators, yielding the actual tokens.
    """
    ListofTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in ListofTokens if len(tok) > 2]


def spamTest():
    """Hold-out spam test on 50 emails (25 spam / 25 ham).

    Randomly reserves 10 documents for testing, trains on the other 40,
    and prints the test-set error rate.
    """
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):  # 50 samples total, 25 per class
        # 'with' closes each file (the original leaked handles); the classic
        # email corpus contains a few bytes invalid in the default codec,
        # hence errors='ignore'.
        with open(r'E:\file\python\test\test\bayes_data\email\spam/%d.txt' % i,
                  encoding='utf-8', errors='ignore') as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        with open(r'E:\file\python\test\test\bayes_data\email\ham/%d.txt' % i,
                  encoding='utf-8', errors='ignore') as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = creatVocabList(docList)
    # Bug fix: range(50) is immutable in Python 3 -- 'del' below would raise
    # TypeError; materialize it as a list first.
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):  # hold out 10 random documents as the test set
        # random.uniform(x, y) draws a real number in [x, y)
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []      # training word-vector matrix
    trainClasses = []  # training labels
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    erroCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            erroCount += 1
    print('the error rate is ', float(erroCount) / len(testSet))