机器学习笔记 - 学习朴素贝叶斯概念及应用

概念理解

朴素贝叶斯方法是一组监督学习算法，它们以贝叶斯定理为基础，并采用一个“朴素”的假设：在给定类别变量取值的情况下，每对特征之间条件独立。尽管这一假设显然过于简化，朴素贝叶斯分类器在许多实际场景中（著名的例子有文档分类和垃圾邮件过滤）都表现良好，而且只需要少量的训练数据就能估计出必要的参数。另一方面，也正因为假设过于朴素，它对输入数据的准备方式较为敏感。

朴素贝叶斯衍生出了多种变体和用法：

Gaussian Naive Bayes（高斯朴素贝叶斯算法）

Multinomial Naive Bayes（多项式朴素贝叶斯）

Complement Naive Bayes（补集朴素贝叶斯）

Bernoulli Naive Bayes（伯努利朴素贝叶斯）

Categorical Naive Bayes（分类朴素贝叶斯）

Out-of-core naive Bayes model fitting（核外朴素贝叶斯模型拟合，即数据无法一次装入内存时的增量训练）

举例说明

数据集下载：

链接：https://pan.baidu.com/s/1JmfjcLZz3fGb_RUmPG0Rnw
提取码：098k

下面用汽车分类来举例说明

扫描二维码关注公众号，回复： 12726457 查看本文章

import os
import numpy as np
import pandas as pd
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics , model_selection
## Import the Classifier.
from sklearn.naive_bayes import GaussianNB

# The data file is read with explicitly supplied column names; the last
# column ('class') is the label to predict.
column_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
data = pd.read_csv('data/car_quality/car.data', names=column_names)
data.head()

数据

     buying    maint       doors      persons  lug_boot    safety      class
0    vhigh      vhigh      2              2     small       low          unacc
1    vhigh      vhigh      2              2     small       med         unacc
2    vhigh      vhigh      2              2     small       high        unacc
3    vhigh      vhigh      2              2     med         low          unacc
4    vhigh      vhigh      2              2     med         med         unacc

# Replace the textual class labels with integer codes; class_names keeps the
# original label belonging to each code.
class_codes, class_names = pd.factorize(data['class'])
data['class'] = class_codes
print(class_names)

print(data['class'].unique())

打印结果

Index([u'unacc', u'acc', u'vgood', u'good'], dtype='object')
[0 1 2 3]

# Integer-encode every feature column in the same way as the target; the
# label lookup returned by factorize is not needed here and is discarded.
for feature in ('buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'):
    data[feature], _ = pd.factorize(data[feature])

data.head()

	`buying`	`maint`	`doors`	`persons`	`lug_boot`	`safety`	`class`
0	0	0	0	0	0	0	0
1	0	0	0	0	0	1	0
2	0	0	0	0	0	2	0
3	0	0	0	0	1	0	0
4	0	0	0	0	1	1	0

选择预测变量，然后选择目标变量

# Every column except the last one is a predictor; the last column is the target.
target_position = data.shape[1] - 1
X = data.iloc[:, :target_position]

y = data.iloc[:, target_position]

分割训练集和测试集

# Hold out 30% of the rows for testing and train on the remaining 70%;
# random_state is fixed so the split is the same on every run.
split_parts = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=123)
X_train, X_test, y_train, y_test = split_parts

训练

# Create the Gaussian naive Bayes classifier and train it on the training split.
model = GaussianNB().fit(X_train, y_train)

预测

# Predict the class code of every held-out row.
y_pred = model.predict(X_test)

# Compare the predictions with the true labels: first the raw number of
# mistakes, then the accuracy score.
is_wrong = y_test != y_pred
count_misclassified = is_wrong.sum()
print('Misclassified samples: {}'.format(count_misclassified))

accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}'.format(accuracy))

预测结果

Misclassified samples: 150
Accuracy: 0.71

其它代码参考，使用朴素贝叶斯进行敏感词分类

代码来源：图灵程序设计丛书 - 机器学习实战

from numpy import *

# Word-list to vector conversion helpers
def loadDataSet():
    """Return a small hand-labelled corpus used by the demo functions.

    Returns:
        A pair ``(postingList, classVec)``: six tokenised posts (each a list
        of words) and a parallel list of labels, where 1 marks an abusive
        post and 0 a normal one.
    """
    # Keep each post next to its label so the two cannot drift apart.
    labelled_posts = [
        (['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 0),
        (['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 1),
        (['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 0),
        (['stop', 'posting', 'stupid', 'worthless', 'garbage'], 1),
        (['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 0),
        (['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'], 1),
    ]
    postingList = [tokens for tokens, _ in labelled_posts]
    classVec = [label for _, label in labelled_posts]
    return postingList, classVec

# Build the vocabulary: every distinct word that occurs in any document.
def createVocabList(dataSet):
    """Return the distinct words found in ``dataSet`` as a list.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list containing every token exactly once. On Python 3.7+ (where
        dicts keep insertion order) the tokens appear in order of first
        occurrence.
    """
    # dict.fromkeys de-duplicates while remembering first-seen order, so the
    # vocabulary -- and the layout of every feature vector indexed by it --
    # no longer depends on set hash order and is the same on every run.
    return list(dict.fromkeys(word for document in dataSet for word in document))

# Takes the vocabulary and one document and returns the document vector: each
# element is 1 or 0, telling whether the corresponding vocabulary word occurs
# in the input document.
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over ``vocabList``.

    Args:
        vocabList: list of vocabulary words; its order defines the vector
            layout.
        inputSet: iterable of the document's words.

    Returns:
        A list as long as ``vocabList`` with 1 at the position of every
        vocabulary word that occurs in ``inputSet`` and 0 elsewhere. Words
        that are not in the vocabulary are reported on stdout and otherwise
        ignored.
    """
    # Build the word -> position table once instead of scanning vocabList
    # twice for every input word. setdefault keeps the first position of a
    # repeated word, which is what list.index would return.
    position = {}
    for idx, vocab_word in enumerate(vocabList):
        position.setdefault(vocab_word, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in position:
            returnVec[position[word]] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

# Naive Bayes classifier training function
def trainNB0(trainMatrix, trainCategory):
    """Estimate per-class log word probabilities and the class-1 prior.

    Args:
        trainMatrix: sequence of document vectors of equal length (one entry
            per vocabulary word).
        trainCategory: label per document; a label equal to 1 puts the
            document in class 1, anything else in class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the log of the smoothed word
        probabilities for class 0 and class 1, and the fraction
        ``sum(trainCategory) / number of documents``.
    """
    doc_total = len(trainMatrix)
    vocab_size = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(doc_total)

    # Index 0 / 1 of each list holds the running totals of that class.
    # Counts start at 1 and denominators at 2.0 so that a word never seen in
    # a class does not yield a zero probability.
    word_counts = [ones(vocab_size), ones(vocab_size)]
    token_totals = [2.0, 2.0]
    for row_idx in range(doc_total):
        label = 1 if trainCategory[row_idx] == 1 else 0
        word_counts[label] += trainMatrix[row_idx]
        token_totals[label] += sum(trainMatrix[row_idx])

    # Work in log space so that later products of many small probabilities
    # become sums.
    p1Vect = log(word_counts[1] / token_totals[1])
    p0Vect = log(word_counts[0] / token_totals[0])
    return p0Vect, p1Vect, pAbusive

# Naive Bayes classification function
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely class for one document vector.

    Args:
        vec2Classify: document vector to classify.
        p0Vec: log word probabilities of class 0.
        p1Vec: log word probabilities of class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 when the class-1 score is strictly greater than the class-0 score,
        otherwise 0.
    """
    # Each score is the element-wise product of the document vector with the
    # class's log probabilities, summed, plus the log prior of that class.
    score_class1 = sum(vec2Classify * p1Vec) + log(pClass1)
    score_class0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if score_class1 > score_class0 else 0
    
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a vector of word counts over ``vocabList``.

    Args:
        vocabList: list of vocabulary words; its order defines the vector
            layout.
        inputSet: iterable of the document's words (repeats are counted).

    Returns:
        A list as long as ``vocabList`` holding how many times each
        vocabulary word occurs in ``inputSet``; words outside the vocabulary
        are skipped silently.
    """
    counts = [0] * len(vocabList)
    for token in inputSet:
        if token not in vocabList:
            continue
        slot = vocabList.index(token)
        counts[slot] += 1
    return counts

def testingNB():
    """Train on the toy corpus and print the predicted class of two posts.

    Builds the vocabulary and the 0/1 presence vectors from ``loadDataSet``,
    trains with ``trainNB0``, then classifies two fixed sample posts and
    prints each one together with its predicted class.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))

def textParse(bigString):    #input is big string, #output is word list
    """Split raw text into lower-cased word tokens longer than two characters.

    Args:
        bigString: the text to tokenise.

    Returns:
        A list of lower-cased tokens; tokens of length 2 or less (including
        the empty strings produced at the edges of the split) are dropped.
    """
    import re
    # Split on runs of one or more non-word characters. The pattern must not
    # be able to match the empty string: since Python 3.7 re.split also
    # splits on empty matches, which would chop the text into single
    # characters and leave nothing after the length filter below.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
    
def spamTest():
    """Run one hold-out evaluation of the classifier on the e-mail corpus.

    Reads ``email/spam/1.txt`` .. ``25.txt`` (label 1) and
    ``email/ham/1.txt`` .. ``25.txt`` (label 0), sets 10 randomly chosen
    messages aside for testing, trains on the remaining ones using
    bag-of-words vectors, and prints every misclassified document followed
    by the error rate on the test messages.
    """
    docList = []; classList = []
    for i in range(1, 26):
        # Same order as before: the spam message first, then the ham message.
        for pathPattern, label in (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0)):
            # The with-block closes each file as soon as it has been read.
            with open(pathPattern % i) as handle:
                wordList = textParse(handle.read())
            docList.append(wordList)
            classList.append(label)
    vocabList = createVocabList(docList)  # create vocabulary

    # A real list is required: indices are removed from it below, which a
    # Python 3 range object does not support.
    trainingSet = list(range(len(docList))); testSet = []
    for _ in range(10):
        # `random` is numpy.random here (brought in by `from numpy import *`).
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    # Train the classifier on the documents that were not held out.
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    # Classify the held-out documents and count the mistakes.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print('the error rate is: ', float(errorCount) / len(testSet))

def calcMostFreq(vocabList, fullText):
    """Return the 30 most frequent vocabulary words with their counts.

    Args:
        vocabList: iterable of vocabulary words to rank.
        fullText: iterable of all word occurrences to count.

    Returns:
        A list of at most 30 ``(word, count)`` tuples sorted by descending
        count. The sort is stable, so on Python 3.7+ words with equal counts
        keep their vocabulary order.
    """
    import operator
    from collections import Counter
    # Count every occurrence in a single pass instead of calling
    # fullText.count() once per vocabulary word.
    occurrences = Counter(fullText)
    freqDict = {token: occurrences[token] for token in vocabList}
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]

def localWords(feed1, feed0):
    """Train and evaluate the classifier on the summaries of two feeds.

    Args:
        feed1: mapping whose ``['entries'][i]['summary']`` values are the
            class-1 texts (NY in the original example).
        feed0: mapping of the same shape holding the class-0 texts.

    Returns:
        ``(vocabList, p0V, p1V)``: the vocabulary after removing the 30 most
        frequent words, and the log word probabilities of class 0 and
        class 1 learned from the training documents.

    The error rate on the held-out documents is printed.
    NOTE: 20 documents are always held out, so the two feeds must supply more
    than 20 documents in total for any training data to remain.
    """
    docList = []; classList = []; fullText = []
    # Use the same number of entries from both feeds.
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # Same order as before: the class-1 entry first, then the class-0 entry.
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)
    vocabList = createVocabList(docList)  # create vocabulary

    # Remove the 30 most frequent words from the vocabulary.
    for word, _count in calcMostFreq(vocabList, fullText):
        if word in vocabList:
            vocabList.remove(word)

    # A real list is required: indices are removed from it below, which a
    # Python 3 range object does not support.
    trainingSet = list(range(2 * minLen)); testSet = []
    for _ in range(20):
        # `random` is numpy.random here (brought in by `from numpy import *`).
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    # Train the classifier on the documents that were not held out.
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    # Classify the held-out documents and count the mistakes.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V

def getTopWords(ny, sf):
    """Print the most characteristic words of each feed.

    Trains via ``localWords(ny, sf)`` and, for each class, prints the words
    whose log probability is above -6.0, most probable first: the class-0
    (SF) list, then the class-1 (NY) list.

    Args:
        ny: feed passed to ``localWords`` as class 1.
        sf: feed passed to ``localWords`` as class 0.
    """
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []; topSF = []
    # Walk the vocabulary together with both probability vectors.
    for word, logP0, logP1 in zip(vocabList, p0V, p1V):
        if logP0 > -6.0:
            topSF.append((word, logP0))
        if logP1 > -6.0:
            topNY.append((word, logP1))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])

其它参考文章

https://www.24tutorials.com/machine-learning/naive-bayes-algorithm/

https://www.intechopen.com/books/artificial-neural-networks-application/modeling-spammer-behavior-artificial-neural-network-vs-nai-ve-bayesian-classifier

https://www.emerald.com/insight/content/doi/10.1108/JEFAS-02-2017-0039/full/html

https://scikit-learn.org/stable/modules/naive_bayes.html

https://en.wikipedia.org/wiki/Naive_Bayes_classifier

http://www.ruanyifeng.com/blog/2013/12/naive_bayes_classifier.html

https://www.jianshu.com/p/cb5d1a7f9033

https://blog.csdn.net/weixin_43557810/article/details/91350799

https://blog.csdn.net/weixin_43225966/article/details/109909534