LTP分词与词性标注(使用用户词典)

#coding:utf-8

from pyltp import Segmentor
from pyltp import Postagger

def read_and_seg_pos(file_dir,
                     cws_model="模型地址/cws.model",
                     pos_model="模型地址/pos.model",
                     user_dict="用户词典/fulluserdict"):
    """Segment and POS-tag a corpus file line by line with LTP.

    Two files are written next to the input:

    * ``file_dir + "_seg"`` -- one line per input line, words separated by a
      single space.
    * ``file_dir + "_pos"`` -- one line per input line, each word followed by
      its tag as ``"word tag "`` (note the trailing space after every pair).

    Args:
        file_dir: Path of the corpus file, one sentence per line.
        cws_model: Path of the LTP word-segmentation model.
        pos_model: Path of the LTP POS-tagging model.
        user_dict: Path of the user lexicon passed to both models. It is a
            plain-text file whose first column is the word and whose second
            column is the POS tag.

    NOTE(review): files are opened with the platform default encoding, as in
    the original; LTP presumably expects UTF-8 text -- confirm for the target
    platform before relying on this outside a UTF-8 locale.
    """
    segmentor = Segmentor()
    postagger = Postagger()
    segmentor.load_with_lexicon(cws_model, user_dict)
    postagger.load_with_lexicon(pos_model, user_dict)
    try:
        # The input is opened first so that a missing corpus fails before any
        # output file is created. Context managers close all three handles
        # even if segmentation or tagging raises part-way through.
        with open(file_dir, "r") as file_read, \
                open(file_dir + "_seg", "w") as file_write_seg, \
                open(file_dir + "_pos", "w") as file_write_pos:
            # Iterate the file lazily instead of readlines() so a large
            # corpus is never held in memory as a whole.
            for text in file_read:
                # The line terminator is not part of the sentence; keep it
                # out of the segmenter input.
                sentence = text.rstrip("\r\n")

                words = segmentor.segment(sentence)
                file_write_seg.write(" ".join(words) + "\n")

                # Tagging operates on the segmented words, so it has to run
                # after segmentation.
                postags = postagger.postag(words)

                # Write the pairs directly and terminate the line explicitly.
                # No in-band end marker is used, so a literal "$" token in
                # the corpus is emitted like any other word, and zip() is
                # only iterated (it is not a list on Python 3).
                tagged = "".join(
                    word + " " + pos + " " for word, pos in zip(words, postags)
                )
                file_write_pos.write(tagged + "\n")
    finally:
        # Free the native model memory on both the success and error paths.
        segmentor.release()
        postagger.release()

# Script entry: segment and tag the corpus file in the current directory.
read_and_seg_pos(file_dir="./corpus")

猜你喜欢

转载自blog.csdn.net/liushui94/article/details/78512835
今日推荐