具体代码如下
# -*- encoding=utf-8 -*-
import jieba.analyse
import jieba
import pandas as pd
# Load the custom user dictionary so the segmenter knows the extra terms in it.
jieba.load_userdict('dict.txt')
# Register the custom stop-word list with the jieba.analyse keyword extractor.
# NOTE(review): this list is used by jieba.analyse (e.g. extract_tags); plain
# jieba.cut does not appear to filter stop words -- confirm this is intended.
jieba.analyse.set_stop_words('stop_words.txt')
# Punctuation / filler clean-up applied to each text row before segmentation.
# Characters that are deleted outright (includes the particle '了' and '\r').
_CLEARSEN_DELETE = '、,《》~…\r/。()_了➕:'
# Characters that are replaced by a single space.
_CLEARSEN_TO_SPACE = '\t\f?'
# Built once at import time so every call is a single C-level pass.
_CLEARSEN_TABLE = str.maketrans(
    _CLEARSEN_TO_SPACE, ' ' * len(_CLEARSEN_TO_SPACE), _CLEARSEN_DELETE
)


def clearSen(comment):
    """Return *comment* with surrounding whitespace and noise characters removed.

    The text is stripped first, then every character in ``_CLEARSEN_DELETE``
    is dropped and every character in ``_CLEARSEN_TO_SPACE`` becomes a space.
    Because stripping happens before the substitution, a space produced from
    a tab/form-feed/question mark (or exposed by deleting a leading or
    trailing character) is kept in the result.

    Args:
        comment: The raw text of one row; must be a ``str``.

    Returns:
        The cleaned string.

    Raises:
        AttributeError: If *comment* is not a string (e.g. a NaN float).
    """
    # NOTE(review): only the ASCII forms of , ( ) ? : / are handled here;
    # their full-width variants pass through unchanged -- confirm whether
    # they should be removed as well.
    return comment.strip().translate(_CLEARSEN_TABLE)
# Read the input text with pandas, splitting fields on tab characters.
# (An alternative input, '0020.txt' with sep=',', was left disabled here.)
# NOTE(review): read_table treats the first line of the file as the header by
# default, so that line never reaches the loop below; pass header=None if the
# file has no header row -- confirm against the data.
zhengce_content = pd.read_table('./2016_wenben/0007.txt', sep='\t')
# Name the single column; this raises if the file parsed into several columns.
zhengce_content.columns = ['content']
# Append the segmented text to the output file. The with-block closes the
# handle even if cleaning or segmentation raises part-way through.
with open('2016_jieba_output.txt', 'a+', encoding="utf-8") as outputfile:
    for each in zhengce_content['content']:
        # Remove punctuation and filler characters.
        kk = clearSen(each)
        # Segment with jieba's default (accurate) mode.
        seg_list = jieba.cut(kk)
        comment = " ".join(seg_list)
        print(comment)
        # Write the space-joined tokens for this row.
        # NOTE(review): no separator is written between rows, so consecutive
        # rows run together in the output file -- confirm this is intended.
        outputfile.write(comment)