#coding:utf-8
from pyltp import Segmentor
from pyltp import Postagger
def read_and_seg_pos(file_dir):
    """Segment and POS-tag every line of ``file_dir`` with LTP.

    Reads ``file_dir``, writes space-separated segmented text to
    ``file_dir + "_seg"`` and ``word pos`` pairs to ``file_dir + "_pos"``
    (one input line per output line).

    :param file_dir: path to the plain-text corpus file to process.
    """
    segmentor = Segmentor()
    postagger = Postagger()
    # The user lexicon is plain text: first column is the word,
    # second column is its POS tag.
    # NOTE(review): the model/lexicon paths below are placeholders from the
    # original post ("模型地址" = "model path", "用户词典" = "user lexicon") —
    # point them at real files before running.
    segmentor.load_with_lexicon("模型地址/cws.model", "用户词典/fulluserdict")
    postagger.load_with_lexicon("模型地址/pos.model", "用户词典/fulluserdict")
    try:
        # Context managers guarantee the handles are closed even if
        # segmentation raises (the original leaked them on error).
        with open(file_dir, "r") as file_read, \
                open(file_dir + "_seg", "w") as file_write_seg, \
                open(file_dir + "_pos", "w") as file_write_pos:
            # readlines() loads the whole corpus at once; for very large
            # corpora iterate the file object line by line instead.
            for text in file_read.readlines():
                words = segmentor.segment(text)  # segmentation first...
                # Write the segmented sentence, words joined by spaces.
                file_write_seg.write(" ".join(words) + "\n")
                postags = postagger.postag(words)  # ...then POS tagging
                # zip() is an iterator on Python 3 and has no .append();
                # materialize it as a list before adding the sentinel.
                words_and_pos = list(zip(words, postags))
                words_and_pos.append(('$', '$'))  # '$' marks end of sentence
                for word, pos in words_and_pos:
                    if word != '$':
                        file_write_pos.write(word + " " + pos + " ")
                    else:
                        file_write_pos.write('\n')
    finally:
        # Release the native LTP model resources unconditionally.
        segmentor.release()
        postagger.release()
read_and_seg_pos("./corpus")
# LTP word segmentation and POS tagging (with a user lexicon).
# Adapted from: blog.csdn.net/liushui94/article/details/78512835