# -*- coding: utf-8 -*-
"""
Created on Tue Apr 17 15:11:44 2018
@author: NAU
"""
############## Segmentation, custom dictionary, stop words ################
import jieba

jieba.load_userdict('E:\\userdict.txt')  # custom dictionary
inputs = open('E:\\wdkb.txt', 'r', encoding='utf8')  # text to segment
outputs = open('E:\\wdkbfenci.txt', 'w', encoding='utf8')  # segmented output file
# stop-word list, one word per line, loaded into a set for fast membership tests
stopwords = set(line.strip() for line in open('E:\\stop.txt', 'r', encoding='utf8'))

def seg_sentence(sentence):  # segment one line of text
    sentence_seged = jieba.cut(sentence.strip())
    outstr = ""
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    return outstr

for line in inputs:  # segment the input file line by line
    line_seg = seg_sentence(line)
    outputs.write(line_seg + '\n')
outputs.close()
inputs.close()
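# jieba.load_userdict expects one entry per line, "word [freq] [POS]" with the
# frequency and POS tag optional, and the stop-word file is read here as one
# word per line. A minimal sketch that builds two sample files in this format
# (file names and entries are illustrative, not from the source):
with open('userdict_sample.txt', 'w', encoding='utf8') as f:
    f.write('自然语言处理 5 n\n南京农业大学 nt\n')
with open('stop_sample.txt', 'w', encoding='utf8') as f:
    f.write('的\n了\n是\n')
jieba.load_userdict('userdict_sample.txt')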
############## Word frequency and weights ################
import jieba
import jieba.analyse
content = u'中国特色社会主义是我们党领导的伟大事业,全面推进党的建设新的伟大工程,是这一伟大事业取得胜利的关键所在。党坚强有力,事业才能兴旺发达,国家才能繁荣稳定,人民才能幸福安康。党的十八大以来,我们党坚持党要管党、从严治党,凝心聚力、直击积弊、扶正祛邪,党的建设开创新局面,党风政风呈现新气象。习近平总书记围绕从严管党治党提出一系列新的重要思想,为全面推进党的建设新的伟大工程进一步指明了方向。'
keywords = jieba.analyse.extract_tags(content, topK=20, withWeight=True, allowPOS=())
for item in keywords:
    print(item[0], item[1])
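# allowPOS=() above keeps every part of speech; passing a tuple of tags
# restricts the result. A sketch that keeps only nouns, person/place names and
# verb-nouns (tag names follow jieba's POS scheme):
keywords_n = jieba.analyse.extract_tags(content, topK=20, withWeight=True,
                                        allowPOS=('n', 'nr', 'ns', 'vn'))
for word, weight in keywords_n:
    print(word, weight)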
############## TF-IDF high-frequency terms ################
inputs = open('C:\\Users\\NAU\\Desktop\\top.txt', 'r', encoding='utf8')  # segmented data
outputs = open('C:\\Users\\NAU\\Desktop\\top_feature.txt', 'w', encoding='utf8')  # output file
nagetive_top_words = inputs.read()  # read the data
inputs.close()  # close the input file
tags = jieba.analyse.extract_tags(nagetive_top_words, topK=100, withWeight=True)  # TF-IDF extraction
print(str(tags) + '\n')  # print all top-100 terms at once
for i in tags:  # print and write one term per line
    print(i)
    outputs.write(str(i) + '\n')
outputs.close()
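# jieba.analyse also ships a TextRank extractor with the same calling
# convention; a sketch on the same input, in case a graph-based ranking is
# preferred over TF-IDF:
tags_tr = jieba.analyse.textrank(nagetive_top_words, topK=100, withWeight=True)
for word, weight in tags_tr:
    print(word, weight)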
############## POS tagging ################
import jieba
import jieba.posseg as pseg

jieba.load_userdict('E:\\userdict.txt')
inputs = open('E:\\negetive_sentence.txt', 'r', encoding='utf8')
outputs = open('E:\\negetive_tag.txt', 'w', encoding='utf8')
negative = inputs.read()
inputs.close()
lines = negative.strip().split('\n')

def seg_sentence(sentence):  # POS-tag one line; each token renders as word/flag
    sentence_seged = pseg.cut(sentence.strip())
    outstr = ""
    for w in sentence_seged:
        if w.word != '\t':
            outstr += str(w)
            outstr += " "
    return outstr

for line in lines:  # tag the text line by line
    line_seg = seg_sentence(line)
    outputs.write(line_seg + '\n')
    print(line_seg + '\n')
outputs.close()
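# Each item yielded by pseg.cut is a pair with .word and .flag attributes, and
# str(w) renders it as "word/flag". A sketch that formats the tag explicitly
# instead of relying on str():
for w in pseg.cut(u'党的建设开创新局面'):
    print('{}/{}'.format(w.word, w.flag))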
############## POS removal ################
import re

inputs = open('C:\\Users\\NAU\\Desktop\\data1.txt', 'r', encoding='utf8')
outputs = open('C:\\Users\\NAU\\Desktop\\data2.txt', 'w', encoding='utf8')
negative = inputs.readlines()
inputs.close()
txtlist = []
remove_word = ["/nz", "/zg", "/m"]  # POS tags whose tokens should be dropped

for line in negative:  # keep only tokens whose POS tag is wanted
    line_list2 = re.split(' ', line)
    line_list = line_list2[:]  # copy, so removal does not disturb iteration
    for segs in line_list2:
        for k in remove_word:
            if k in segs:
                line_list.remove(segs)
                break
    txtlist.append(line_list)

resultlist = txtlist[:]
for sent in resultlist:
    for word in sent:  # strip the "/tag" suffix from each remaining token
        if "/" in word:
            slope = word.index("/")
            letter = word[0:slope] + " "
            outputs.write(letter)
            print(letter)
        else:
            outputs.write(word)
outputs.close()
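# The same two-step filtering and stripping can also be done per line with
# plain string operations; strip_pos below is a hypothetical helper, assuming
# tokens look like "word/tag" and are separated by spaces:
def strip_pos(line, drop=('nz', 'zg', 'm')):
    kept = []
    for token in line.split():
        if '/' not in token:
            kept.append(token)  # untagged token, keep as-is
        elif token.rsplit('/', 1)[1] not in drop:
            kept.append(token.rsplit('/', 1)[0])  # keep the word, drop the tag
    return ' '.join(kept)

print(strip_pos('党/n 的/uj 建设/vn 三/m 个/zg'))  # -> 党 的 建设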
Reposted from blog.csdn.net/wanpi931014/article/details/81088432