词性标注数据预处理

import numpy as np
import torch
from IPython.display import display, Image
import os
from os import listdir
import re
from tqdm import tqdm

def list_file_path(dirpath):
    """Return the joined path of every entry found directly in ``dirpath``.

    Entries are taken from ``listdir`` in the order it reports them, and
    each name is joined onto ``dirpath`` with ``os.path.join``. No
    filtering is applied, so whatever ``listdir`` returns is included.
    """
    entry_paths = []
    for entry_name in listdir(dirpath):
        entry_paths.append(os.path.join(dirpath, entry_name))
    return entry_paths

def replace_lambda(strings, symbles=(' ', '\ufeff'), Replace_the_symbol="\n"):
    """Replace every occurrence of any symbol in ``symbles`` with one string.

    Args:
        strings: The text to process.
        symbles: Iterable of literal substrings to replace. They are
            regex-escaped, so characters such as ``.`` or ``*`` are matched
            literally. Defaults to a space and the BOM character.
        Replace_the_symbol: The replacement inserted for every match.

    Returns:
        ``strings`` with each matched symbol replaced. When ``symbles`` is
        empty the text is returned unchanged.
    """
    symbols = list(symbles)
    if not symbols:
        # An empty alternation compiles to the empty pattern, which matches
        # at every position; there is nothing to replace, so bail out early.
        return strings
    pattern = re.compile("|".join(re.escape(symbol) for symbol in symbols))
    # A callable replacement keeps backslashes in Replace_the_symbol literal
    # (a plain string would be interpreted as a regex replacement template).
    return pattern.sub(lambda _match: Replace_the_symbol, strings)

def clean_page(filepath):
    """Read one tagged text file and return its ``/``-separated items as rows.

    Every line is prefixed with the literal marker ``sentence/sentence``,
    then ``replace_lambda`` (default symbols: space and BOM) turns the
    separators into newlines. Each non-empty resulting item is split on
    ``/``. A leading ``['page', 'page']`` row is prepended to the result.

    Args:
        filepath: Path of the file to read. It is opened in text mode with
            the platform default encoding.

    Returns:
        A 2-D string array of shape ``(n, 2)`` when every row has exactly
        two fields. Otherwise a 1-D object array whose elements are the row
        lists, so callers can still iterate and inspect ``len(row)``.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(filepath, 'r') as handle:
        lines = handle.readlines()

    # NOTE(review): the marker is concatenated directly onto the line, so
    # unless a line starts with a space or BOM the marker's second field fuses
    # with the first token of that line - confirm against the corpus format.
    marked_text = ''.join(
        replace_lambda('{}{}'.format('sentence/sentence', line)) for line in lines
    )

    rows = [['page', 'page']]
    rows.extend(
        item.split('/') for item in re.split(r'\n', marked_text) if item != ''
    )

    if all(len(row) == 2 for row in rows):
        return np.array(rows)

    # Rows of differing length: np.array() on ragged input raises ValueError
    # on recent NumPy releases, so fill an object array element by element.
    ragged = np.empty(len(rows), dtype=object)
    for index, row in enumerate(rows):
        ragged[index] = row
    return ragged

def load_pos_tags(dirpath):
    """Run ``clean_page`` on every entry of ``dirpath`` and sort the rows.

    Args:
        dirpath: Directory whose entries (as reported by ``listdir``) are
            each passed to ``clean_page``.

    Returns:
        A tuple ``(pairs, leftovers)``: ``pairs`` is ``np.array`` applied to
        all rows whose length is exactly 2, and ``leftovers`` is a plain
        list of every other row.
    """
    two_field_rows, other_rows = [], []
    progress = tqdm(listdir(dirpath), desc='加载词性标注文件')
    for entry_name in progress:
        page_rows = clean_page(os.path.join(dirpath, entry_name))
        for row in page_rows:
            # Route each row by its field count.
            bucket = two_field_rows if len(row) == 2 else other_rows
            bucket.append(row)
    return np.array(two_field_rows), other_rows

# Absolute, machine-specific directory path. Presumably meant to be passed as
# the ``dirpath`` argument of load_pos_tags (no call is visible here) - adjust
# to the local corpus location.
dirpath = "/NLP数据集合/词性标注数据集/国家语委人工词性标注语料"

在这里插入图片描述

猜你喜欢

转载自blog.csdn.net/weixin_43069769/article/details/107877487