Chinese Word Segmentation and Dictionary Building

1. Dataset Preparation

The dataset is a root folder containing several sub-directories, each of which holds the plain-text (.txt) files to be segmented; this is the layout the script below expects.

2. Code Implementation

The script walks every sub-folder of the dataset, segments each text file with jieba's full mode, drops punctuation and non-Chinese tokens, and finally writes the remaining tokens out as an index-to-word dictionary.

import csv
import fnmatch
import os
import re
from collections import OrderedDict
import jieba

text_list = []  # collects the segmented text of every input file


def word_frequency_analysis(path):
    files = os.listdir(path)  # every file name inside path
    for filename in files:  # walk the files in this folder
        if not fnmatch.fnmatch(filename.lower(), '*.txt'):  # keep only .txt files, case-insensitively
            continue
        txt_path = os.path.join(path, filename)
        with open(txt_path, 'rb') as f:  # read raw bytes; jieba can segment UTF-8 encoded bytes directly
            txt_content = f.read()

        text_cutted = jiebaCutText(txt_content)  # segment with jieba
        text_list.append(text_cutted)  # collect the segmented result


def jiebaCutText(text):
    seg_list = jieba.cut(text, cut_all=True)  # full mode: emit every possible word
    liststr = '/'.join(seg_list)  # join the tokens with '/'
    return liststr  # the result still contains punctuation and single characters
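
For reference, here is a minimal standalone sketch contrasting jieba's full mode (cut_all=True, used above) with its precise default mode; the sample sentence and the outputs in the comments come from jieba's own README:

import jieba

sentence = '我来到北京清华大学'
# full mode: emits every dictionary word it can find, including overlaps
print('/'.join(jieba.cut(sentence, cut_all=True)))   # 我/来到/北京/清华/清华大学/华大/大学
# precise mode (the default): one best segmentation, no overlaps
print('/'.join(jieba.cut(sentence, cut_all=False)))  # 我/来到/北京/清华大学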


def clearText(text):
    mywordlist = []
    for myword in text.split('/'):
        # keep only tokens longer than one character that contain Chinese
        if len(myword.strip()) > 1 and contain_zh(myword.strip()):
            mywordlist.append(myword.strip())
    return '/'.join(mywordlist)


def contain_zh(word):
    zh = re.compile(u'[\u4e00-\u9fa5]+')  # CJK Unified Ideographs (U+4E00..U+9FA5)
    match = zh.search(word)
    return match
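
A quick illustrative check of the two helpers above; the input strings are made up for the example:

# Assumes clearText and contain_zh are already defined as above.
print(bool(contain_zh('清华')))                   # True: contains Chinese characters
print(bool(contain_zh('abc')))                    # False: no Chinese characters
print(clearText('我/来到/北京/,/abc/清华大学'))   # 来到/北京/清华大学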


def countwords(text, counter_file):
    count_dict = dict()
    for item in text.split('/'):
        if item in count_dict:
            count_dict[item] += 1
        else:
            count_dict[item] = 1

    # sort by frequency, least frequent first (pass reverse=True for the opposite order)
    d_sorted_by_value = OrderedDict(sorted(count_dict.items(), key=lambda x: x[1]))
    # newline='' stops csv.writer from emitting blank lines on Windows;
    # utf-8-sig adds a BOM so spreadsheet tools detect the encoding
    with open(counter_file, 'w', encoding='utf-8-sig', newline='') as f:
        w = csv.writer(f)
        w.writerows(d_sorted_by_value.items())
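
Note that countwords is defined but never called in the __main__ block below; a hypothetical call (the file name counts.csv is an assumption) would look like this:

# Hypothetical usage: writes one "word,count" CSV row per distinct token.
countwords('中文/分词/中文', 'counts.csv')  # rows: 分词,1 then 中文,2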


if __name__ == '__main__':
    rootdir = r"C:\Users\fyyuj\Desktop\实验室练习\大作业\dataset"

    for name in os.listdir(rootdir):  # every sub-folder (category) of the dataset
        path = os.path.join(rootdir, name)
        if os.path.isdir(path):  # skip stray files at the top level
            word_frequency_analysis(path)

    # join the per-file results with '/' so tokens from adjacent files do not fuse
    cleaned = [clearText(text) for text in text_list]
    with open('test1.txt', 'w', encoding='utf-8') as newfile:
        newfile.write('/'.join(cleaned))

    with open('test1.txt', 'r', encoding='utf-8') as f:
        contents = f.read().split('/')

    with open('output.txt', 'w', encoding='utf-8') as f:
        dic = dict(enumerate(contents))  # index -> word mapping
        f.write(str(dic))
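
The last step turns the token list into an index-to-word dictionary; a minimal sketch of what dict(enumerate(...)) produces (repeated tokens simply get separate indices):

tokens = ['中文', '分词', '中文']
print(dict(enumerate(tokens)))  # {0: '中文', 1: '分词', 2: '中文'}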

3. Sample Output

output.txt contains the resulting index-to-word dictionary, written as a single dict literal of the form shown in the sketch above.


Reposted from blog.csdn.net/yjh_SE007/article/details/108297266