版权声明:本文为博主原创文章,若需转载,请注明出处:http://blog.csdn.net/qq_30091945 。原文链接:https://blog.csdn.net/qq_30091945/article/details/82556763
前言
本篇博客所写的算法对应于吴恩达教授的机器学习教程里的多元伯努利事件模型的朴素贝叶斯。
多元伯努利事件模型的Python代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/9/4 15:55
# @Author : DaiPuWei
# E-Mail : [email protected]
# @Site : 北教25实验室
# @File : NaiveBayes.py
# @Software: PyCharm
import numpy as np
"""
这份朴素贝叶斯代码对应于吴恩达机器学习笔记中的多元伯努利事件模型,
即每个词向量的每一维取值为0或1。
"""
class NaiveBayes_Bernoulli(object):
    """Binary naive Bayes classifier using the multivariate Bernoulli event model.

    Every document is encoded as a 0/1 vector with one entry per vocabulary
    word (1 = the word occurs in the document). Class 1 is the set of samples
    whose label equals 1; every other label is treated as class 0.
    """

    def __init__(self, Train_Data, Train_Label):
        """Build the vocabulary and vectorise the training documents.

        :param Train_Data: iterable of documents, each an iterable of hashable tokens
        :param Train_Label: sequence of labels, one per document
        """
        # Build the vocabulary from the training corpus.
        self.VocabularyList = self.CreateVocabularyList(Train_Data)
        # Stored as an array so that element-wise comparison works for lists too.
        self.Train_Label = np.asarray(Train_Label)
        self.Train_Data = Train_Data
        # Vectorise every training document against the vocabulary.
        self.Train_Data_Vector = np.array(
            [self.Word2Vector(data, self.VocabularyList) for data in self.Train_Data]
        )
        # Model parameters; they are filled in by Train().
        self.py1 = 0  # p(y=1)
        self.py0 = 0  # p(y=0)
        col = len(self.VocabularyList)
        self.pj_y1 = np.zeros(col)  # p(x_j=1|y=1)
        self.pj_y0 = np.zeros(col)  # p(x_j=1|y=0)

    def CreateVocabularyList(self, dataset):
        """Collect the distinct tokens of a corpus.

        :param dataset: iterable of documents, each an iterable of hashable tokens
        :return: list of unique tokens in first-seen order
        """
        # A dict is used instead of a set so that the order (and therefore the
        # meaning of every vector index) is the same on every run.
        seen = {}
        for document in dataset:
            for token in document:
                seen.setdefault(token, None)
        return list(seen)

    def Word2Vector(self, input_data, VocabularyList):
        """Encode one document as a 0/1 presence vector over the vocabulary.

        :param input_data: iterable of tokens
        :param VocabularyList: list of vocabulary tokens
        :return: list of len(VocabularyList) ints; entry j is 1 when
                 VocabularyList[j] occurs in input_data, else 0.
                 Tokens that are not in the vocabulary are ignored.
        """
        # Map each vocabulary token to its first index in the vocabulary.
        position = {}
        for idx, token in enumerate(VocabularyList):
            position.setdefault(token, idx)
        vector = [0] * len(VocabularyList)
        for token in input_data:
            idx = position.get(token)
            if idx is not None:
                # The flag goes at the token's vocabulary index, not at its
                # position inside the document.
                vector[idx] = 1
        return vector

    def Train(self):
        """Estimate the priors and the Laplace-smoothed conditionals.

        p(y=c) is the fraction of training documents in class c, and
        p(x_j=1|y=c) = (docs of class c containing word j + 1) / (docs of class c + 2).
        All parameters are recomputed from scratch, so calling Train() more
        than once gives the same result.

        :raises ZeroDivisionError: if there are no training labels
        """
        labels = np.asarray(self.Train_Label)
        is_label1 = labels == 1
        # Count the samples per class (summing the mask, not taking its length).
        num_label1 = int(np.sum(is_label1))
        num_label0 = int(labels.size) - num_label1
        total = float(num_label0 + num_label1)
        self.py1 = num_label1 / total
        self.py0 = num_label0 / total
        # Per-class document frequency of every vocabulary word.
        vectors = np.asarray(self.Train_Data_Vector, dtype=float)
        count_y1 = vectors[is_label1].sum(axis=0)
        count_y0 = vectors[~is_label1].sum(axis=0)
        # Laplace smoothing keeps every probability strictly inside (0, 1).
        self.pj_y1 = (count_y1 + 1.0) / (num_label1 + 2.0)
        self.pj_y0 = (count_y0 + 1.0) / (num_label0 + 2.0)

    def predict(self, test_data):
        """Classify one vectorised document.

        :param test_data: 0/1 vector with one entry per vocabulary word
        :return: 1 if p(y=1|x) >= p(y=0|x), else 0
        """
        x = np.asarray(test_data, dtype=float)
        # A class without training samples has prior 0; log(0) = -inf is the
        # intended value there, so only the warning is silenced.
        with np.errstate(divide="ignore"):
            log_prior0 = np.log(self.py0)
            log_prior1 = np.log(self.py1)
        # Bernoulli log-likelihood: present words contribute log(phi_j),
        # absent words contribute log(1 - phi_j).
        log_p0 = np.sum(x * np.log(self.pj_y0) + (1.0 - x) * np.log(1.0 - self.pj_y0)) + log_prior0
        log_p1 = np.sum(x * np.log(self.pj_y1) + (1.0 - x) * np.log(1.0 - self.pj_y1)) + log_prior1
        # Subtract the larger log-score before exponentiating so that the
        # normalisation cannot underflow to 0/0.
        shift = max(log_p0, log_p1)
        _p0 = np.exp(log_p0 - shift)
        _p1 = np.exp(log_p1 - shift)
        px = _p0 + _p1
        p0 = float(_p0) / float(px)
        p1 = float(_p1) / float(px)
        # Kept from the original implementation: echo the posteriors.
        print(p0)
        print(p1)
        if p1 >= p0:
            return 1
        return 0

    def Test(self, Test_Data):
        """Classify a batch of raw (un-vectorised) documents.

        :param Test_Data: iterable of documents, each an iterable of tokens
        :return: list with one predicted label (0 or 1) per document
        """
        # Vectorise every document first, then classify each vector.
        Test_Data_Vector = [
            self.Word2Vector(test_data, self.VocabularyList) for test_data in Test_Data
        ]
        return [self.predict(vector) for vector in Test_Data_Vector]
测试代码
下面的代码的数据来自于《机器学习实战》。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/9/9 10:23
# @Author : DaiPuWei
# E-Mail : [email protected]
# @Site : 北教25实验室
# @File : Bernoulli_demo.py
# @Software: PyCharm
import numpy as np
from NaiveBayes.NaiveBayes_Bernoulli import NaiveBayes_Bernoulli
def run_main():
    """Train the Bernoulli naive Bayes classifier on a toy corpus and print its predictions.

    Six tokenised posts with labels (1 = abusive, 0 = not abusive) are used
    for training, then the predicted labels of two test posts are printed.
    """
    # Training documents and labels. The documents have different lengths,
    # so they are kept as plain nested lists: recent NumPy versions refuse to
    # build an array from ragged nested sequences.
    traindata = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    trainlabel = np.array([0, 1, 0, 1, 0, 1])  # 1 is abusive, 0 not
    testdata = [['love', 'my', 'dalmation'], ['stupid', 'garbage']]
    # Build the classifier (vocabulary creation and vectorisation happen here).
    NB_Bernoulli = NaiveBayes_Bernoulli(traindata, trainlabel)
    # Fit the model parameters.
    NB_Bernoulli.Train()
    # Predict the labels of the test documents.
    predict_label = NB_Bernoulli.Test(testdata)
    print(predict_label)


if __name__ == '__main__':
    run_main()