Text Classification with Naive Bayes


Reference: Principles of Text Classification with Naive Bayes (朴素贝叶斯实现的文本分类原理)

GitHub repository: https://github.com/CodeforL/Naive-Bayes
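
Before the code, a quick sketch of the scoring rule the script implements: multinomial Naive Bayes with Laplace (add-one) smoothing, computed in log space to avoid floating-point underflow. For a tokenized document w_1, ..., w_n, the P function below scores each class c as

\[
\text{score}(c) = \sum_{i=1}^{n} \log_{10} \frac{\text{count}_c(w_i) + 1}{N_c + |V_c|},
\qquad \hat{c} = \arg\max_c \text{score}(c)
\]

where count_c(w_i) is the frequency of w_i among class c's training words, N_c is the total word count of class c, and |V_c| is the size of class c's vocabulary (len(dic) in the code). Note that the code sums only the likelihood term; the class prior P(c) is dropped, which amounts to assuming equal priors across classes.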

# coding=UTF-8
'''
Created on 2017
@author: XYJ
'''
import jieba
import os
import random
import math


def TextProcessing(folder_path, train_size=0.8):
    '''Read each class sub-folder under folder_path, tokenize every document
    with jieba, and split each class into training and test sets.'''
    folder_list = os.listdir(folder_path)
    train_data_list = []
    train_class_list = []
    test_data_list = []
    test_class_list = []
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        new_folder_list = os.listdir(new_folder_path)
        word_list = []
        for file in new_folder_list:
            with open(os.path.join(new_folder_path, file), 'rb') as f:
                # The Sogou corpus files are GB-encoded; 'ANSI' is not a
                # portable Python codec name, so decode as GBK and ignore
                # undecodable bytes.
                raw = f.read().decode('gbk', 'ignore')
            txt_list = list(jieba.cut(raw, cut_all=False))
            # Drop whitespace-only tokens left over from the raw text.
            txt_list = [w for w in txt_list
                        if w not in ('\u3000', '\r\n', '\x00', '\n')]
            word_list.append(txt_list)
        random.shuffle(word_list)
        size = int(len(word_list) * train_size)
        print(folder, size)

        tem_train_list = word_list[:size]
        tem_test_list = word_list[size:]
        # Merge all training documents of this class into a single word list.
        tem_train_word = []
        for doc in tem_train_list:
            tem_train_word.extend(doc)

        ## Build the training and test data sets.
        train_data_list.append(tem_train_word)
        train_class_list.append(folder)
        test_data_list.append(tem_test_list)
        test_class_list.append(folder)

    return train_data_list, test_data_list, train_class_list, test_class_list

'''
@param stopwords_file: path to the stopwords file, one word per line
@return: the set of stopwords read from that file
'''
def makeStopwordsSet(stopwords_file):
    words_set = set()
    with open(stopwords_file, 'rb') as f:
        lines = f.readlines()
    for line in lines:
        # strip() handles both \n and \r\n line endings.
        word = line.decode('UTF-8').strip()
        if len(word) > 0:
            words_set.add(word)
    return words_set

def listToDict(data_list, stopwords_set=set()):
    '''Count word frequencies, skipping stopwords and pure digits.'''
    data_dict = {}
    for word in data_list:
        if word not in stopwords_set and not word.isdigit():
            data_dict[word] = data_dict.get(word, 0) + 1
    return data_dict

def clearlist(test_list, stopwords_set=set()):
    '''Remove stopwords and pure digits from a tokenized document.'''
    return [word for word in test_list
            if word not in stopwords_set and not word.isdigit()]

def predicted(test_list, train_data_list_dict, train_class_list, train_data_count):
    '''Score a document against every class; return the best class name.'''
    predicte = []
    for dic, count in zip(train_data_list_dict, train_data_count):
        laplace = 0
        for word in test_list:
            laplace += P(word, dic, count)
        predicte.append(laplace)
    # The class with the largest sum of log-probabilities wins.
    return train_class_list[predicte.index(max(predicte))]

def P(word, dic, count):
    '''Log10 of the Laplace-smoothed probability of word given a class whose
    word-frequency dict is dic and whose total word count is count.'''
    return math.log10((dic.get(word, 0) + 1) / (count + len(dic)))


def main():
    abspath = os.path.abspath(os.path.dirname(os.getcwd()))
    ########## Load the stopword set ##########
    stopwords_file = os.path.join(abspath, 'stopwords_cn.txt')
    stopwords_set = makeStopwordsSet(stopwords_file)

    ########## Load the data sets ##########
    folder_path = os.path.join(abspath, 'Reduced')
    train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, train_size=0.8)

    ## Build a word-frequency dictionary for each class of training data
    train_data_list_dict = []
    for word_list in train_data_list:
        train_data_list_dict.append(listToDict(word_list, stopwords_set))
    print('training data processed')

    ## Clean the test documents; rebuild each list so the cleaned
    ## results are actually kept
    for i, test_list in enumerate(test_data_list):
        test_data_list[i] = [clearlist(test, stopwords_set) for test in test_list]
    print('test data processed')

    ## Print each class's top keywords in descending frequency order
    for a in train_data_list_dict:
        internet_list = sorted(a.items(), key=lambda f: f[1], reverse=True)
        print(internet_list[:200])

    ## Total word count per class, needed to compute P(B_i|A)
    train_data_count = []
    for dic in train_data_list_dict:
        train_data_count.append(sum(dic.values()))

    ########## Test ##########
    for li, classtype in zip(test_data_list, test_class_list):
        corr = 0
        count = 0
        for lis in li:
            name = predicted(lis, train_data_list_dict, train_class_list, train_data_count)
            count += 1
            if name == classtype:
                corr += 1

        print('%s: prediction accuracy %.3f%%' % (classtype, corr * 100 / count))


if __name__ == '__main__':
    main()
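
For completeness, a minimal usage sketch of classifying a single new document with the functions above. The helper name classify_text is hypothetical (not part of the original script), and it assumes you already hold the structures built by main()'s training steps:

def classify_text(raw_text, stopwords_set,
                  train_data_list_dict, train_class_list, train_data_count):
    # Tokenize and clean the text exactly as the training pipeline does,
    # then reuse predicted() to pick the highest-scoring class.
    words = clearlist(list(jieba.cut(raw_text, cut_all=False)), stopwords_set)
    return predicted(words, train_data_list_dict, train_class_list, train_data_count)

Keeping the cleaning step identical between training and prediction matters: a token filtered out at training time would otherwise always fall into the unseen-word smoothing branch at prediction time.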
