Machine Learning with sklearn (7): Naive Bayes Classification

import os
import jieba

folder_path = r"C:\Users\meachine learn\sample"
os.listdir(folder_path)

['C000008',
 'C000010',
 'C000013',
 'C000014',
 'C000016',
 'C000020',
 'C000022',
 'C000023',
 'C000024']

Walk each subfolder by joining paths and read the contents of every file inside. Segment each file's text with jieba, store each file's token list as one entry in data_list, and record the file's folder name (its class label) in class_list.

folder_list = os.listdir(folder_path)
data_list = []
class_list = []
for folder in folder_list:
    new_folder_path = os.path.join(folder_path, folder)   # join the class folder path
    files = os.listdir(new_folder_path)
    j = 1
    for file in files:
        if j > 100:
            break
        with open(os.path.join(new_folder_path, file), 'r', encoding='ANSI') as f:
            raw = f.read()
        word_cut = jieba.cut(raw, cut_all=False)  # jieba.cut(): first argument is the string to segment; cut_all switches full mode on/off
        word_list = list(word_cut)
        data_list.append(word_list)
        class_list.append(folder)
        j += 1
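
For reference, jieba.cut returns a generator, and the cut_all flag switches between precise mode and full mode. A minimal sketch of the difference (the sample sentence is made up for illustration):

import jieba

sample = "机器学习是人工智能的一个分支"
print(list(jieba.cut(sample, cut_all=False)))  # precise mode: one best segmentation
print(list(jieba.cut(sample, cut_all=True)))   # full mode: every possible word, with overlaps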

Shuffle the original order of the samples

import numpy as np
data_class_list = list(zip(data_list, class_list))   # pair each document with its class label
np.random.shuffle(data_class_list)   # shuffle in place

# split into training and test sets
index = int(len(data_class_list) * 0.2) + 1
train_list = data_class_list[:-index]
test_list = data_class_list[-index:]
train_data_list, train_class_list = zip(*train_list)   # unzip training data and labels
test_data_list, test_class_list = zip(*test_list)      # unzip test data and labels
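
The same 80/20 split can also be done with sklearn's train_test_split, which shuffles and splits in one call. A sketch of that alternative (random_state is an arbitrary choice, not from the original post):

from sklearn.model_selection import train_test_split

train_data_list, test_data_list, train_class_list, test_class_list = train_test_split(
    data_list, class_list, test_size=0.2, random_state=42)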

# count word frequencies over the training set
all_word_dict = {}
for word_list in train_data_list:
    for word in word_list:
        if word in all_word_dict:
            all_word_dict[word] += 1
        else:
            all_word_dict[word] = 1

Sort the dictionary entries by value in descending order

# dict.items() turns the dictionary entries into (key, value) tuples. The key argument,
# lambda item: item[1], sorts by the second element of each tuple (the count); writing
# key=lambda item: item[0] would sort by the dictionary key instead. sorted() returns a
# list, so the original key-value pairs become tuples inside that list.
all_word_tuple_list = sorted(all_word_dict.items(), key=lambda item:item[1], reverse=True)
all_word_list, all_word_nums = zip(*all_word_tuple_list)
all_word_list = list(all_word_list)   # convert the tuple back to a list
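
The counting and sorting above can also be expressed with collections.Counter, whose most_common() already returns (word, count) pairs in descending order of frequency. An equivalent sketch:

from collections import Counter

all_word_counter = Counter(word for word_list in train_data_list for word in word_list)
all_word_list = [word for word, count in all_word_counter.most_common()]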

Load the common Chinese stop words

import pandas as pd
words_set = set()
words_file = r"C:\Users\meachine learn\stopwords_cn.txt"
with open(words_file, 'r',encoding='ANSI') as f:
    for line in f.readlines():
        word = line.strip()
        if len(word) > 0:
            words_set.add(word)

Text feature selection

def words_dict(all_words_list, N, stopwords_set=set()):
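    # Skip the N most frequent words, then scan at most 1000 of the remaining words,
    # keeping those that are not pure digits, not stop words, and 2-4 characters long.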
    feature_words = []
    n = 1
    for t in range(N, len(all_words_list)):
        if n > 1000:
            break
        if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set and 1 < len(all_words_list[t]) < 5:
            feature_words.append(all_words_list[t])
        n += 1
    return feature_words

feature_words = words_dict(all_word_list, 100, words_set)

Build the feature lists

def textfeatures(text, feature_words):
    text_words = set(text)
    features = [1 if word in text_words else 0 for word in feature_words]
    return features 

train_feature_list = [textfeatures(text, feature_words) for text in train_data_list]   # training feature lists
test_feature_list = [textfeatures(text, feature_words) for text in test_data_list]    # test feature lists
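
sklearn's CountVectorizer can build an equivalent binary bag-of-words matrix when given a fixed vocabulary; because the documents are already tokenized lists, a pass-through analyzer is used. A sketch under those assumptions (not the original post's approach):

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(vocabulary=feature_words, binary=True, analyzer=lambda doc: doc)
train_feature_matrix = vectorizer.fit_transform(train_data_list)   # sparse 0/1 matrix
test_feature_matrix = vectorizer.transform(test_data_list)

MultinomialNB accepts these sparse matrices directly, so they could replace the dense lists below.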

from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt
%matplotlib inline

Naive Bayes classifier

NB = MultinomialNB()    
NB.fit(train_feature_list, train_class_list)
test_accuracy = NB.score(test_feature_list, test_class_list)

print(test_accuracy)   # classifier accuracy on the test set

0.8732394366197183
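
Since the features are 0/1 indicators, BernoulliNB is another natural model to compare against. A small sketch, not part of the original experiment:

from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()
bnb.fit(train_feature_list, train_class_list)
print(bnb.score(test_feature_list, test_class_list))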

Plot how the classifier's test accuracy varies with N, the number of top high-frequency words removed before building the feature list

test_accuracy_list = []
deleteN = range(0, 1000, 20)
for N in deleteN:
    feature_words = words_dict(all_word_list, N, words_set)
    train_feature_list = [textfeatures(text, feature_words) for text in train_data_list]
    test_feature_list = [textfeatures(text, feature_words) for text in test_data_list]
    NB.fit(train_feature_list, train_class_list)
    test_accuracy = NB.score(test_feature_list, test_class_list)
    test_accuracy_list.append(test_accuracy)

plt.plot(deleteN, test_accuracy_list)
plt.title("Relationship of deldteN and test_accuracy_list")
plt.xlabel("deleteN")
plt.ylabel("test_accuracy_list")
plt.show()
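
To read a good value of N off this sweep, one can simply take the argmax of the recorded accuracies (a small sketch):

best_index = int(np.argmax(test_accuracy_list))
print(list(deleteN)[best_index], test_accuracy_list[best_index])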

NB_pred = NB.predict(test_feature_list)   # predict the test set with the classifier (NB was last fit in the final loop iteration above)
print(NB_pred)

['C000023' 'C000008' 'C000016' 'C000010' 'C000023' 'C000020' 'C000010'
 'C000024' 'C000008' 'C000024' 'C000020' 'C000020' 'C000022' 'C000010'
 'C000016' 'C000023' 'C000008' 'C000013' 'C000010' 'C000014' 'C000013'
 'C000020' 'C000020' 'C000014' 'C000010' 'C000013' 'C000008' 'C000010'
 'C000008' 'C000023' 'C000022' 'C000016' 'C000022' 'C000014' 'C000010'
 'C000016' 'C000024' 'C000016' 'C000020' 'C000020' 'C000014' 'C000013'
 'C000008' 'C000022' 'C000014' 'C000023' 'C000008' 'C000016' 'C000014'
 'C000024' 'C000014' 'C000024' 'C000023' 'C000022' 'C000020' 'C000010'
 'C000022' 'C000010' 'C000020' 'C000016' 'C000016' 'C000023' 'C000022'
 'C000016' 'C000020' 'C000010' 'C000022' 'C000024' 'C000024' 'C000014'
 'C000023']

Build a dictionary mapping news category codes to category names, then use it to translate the predicted codes above into readable names

names = ['Code', 'Name']
data_trans = pd.read_csv(r"C:\Users\meachine learn\classlist.txt", sep='\t',engine='python', names=names, encoding='ANSI')
data_trans = np.array(data_trans)
dict_trans = {}
for code, name in data_trans:
    dict_trans[code] = name
print(dict_trans)

{'C000008': '财经', 'C000010': 'IT', 'C000013': '健康', 'C000014': '体育', 'C000016': '旅游', 'C000020': '教育', 'C000022': '招聘', 'C000023': '文化', 'C000024': '军事'}

np.vectorize(dict_trans.get)(NB_pred)  # map each predicted code to its category name

array(['文化', '财经', '旅游', 'IT', '文化', '教育', 'IT', '军事', '财经', '军事', '教育',
       '教育', '招聘', 'IT', '旅游', '文化', '财经', '健康', 'IT', '体育', '健康', '教育',
       '教育', '体育', 'IT', '健康', '财经', 'IT', '财经', '文化', '招聘', '旅游', '招聘',
       '体育', 'IT', '旅游', '军事', '旅游', '教育', '教育', '体育', '健康', '财经', '招聘',
       '体育', '文化', '财经', '旅游', '体育', '军事', '体育', '军事', '文化', '招聘', '教育',
       'IT', '招聘', 'IT', '教育', '旅游', '旅游', '文化', '招聘', '旅游', '教育', 'IT',
       '招聘', '军事', '军事', '体育', '文化'], dtype='<U2')
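
The same mapping can also be done without numpy, for example with pandas (already imported above):

pd.Series(NB_pred).map(dict_trans).tolist()   # list of category names in prediction order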

Reposted from blog.csdn.net/weixin_44530236/article/details/88769351