结巴并行分词

源文件有4列,以制表符分隔,依次为:id、title、content、label。

import os
import sys


import pandas as pd
from joblib import Parallel, delayed
import jieba

import yaml
# Load the pipeline settings once at import time.
# ``safe_load`` replaces ``yaml.load``: calling ``yaml.load`` without an
# explicit Loader is deprecated and rejected by PyYAML >= 6, and the safe
# loader is sufficient for plain configuration data. The ``with`` block
# closes the file handle instead of leaking it.
with open('config.yaml', 'r', encoding='utf-8') as _config_file:
    config = yaml.safe_load(_config_file)


def read_df(trainfile, nrows=60000):
    """Load the header-less, tab-separated training file into a DataFrame.

    Args:
        trainfile: Path of the TSV file to read (decoded as UTF-8).
        nrows: Maximum number of rows to read; ``None`` reads the whole
            file. Defaults to 60000, the limit that used to be hard-coded.

    Returns:
        A DataFrame whose columns are named ``id``, ``title``, ``content``
        and ``label``.
    """
    return pd.read_csv(
        trainfile,
        # Two-character separator: pandas interprets it as a regular
        # expression matching a tab, which only the python engine supports.
        sep='\\t',
        # Select the python engine explicitly; otherwise pandas falls back
        # to it anyway and emits a ParserWarning on every call.
        engine='python',
        header=None,
        nrows=nrows,
        encoding='utf-8',
        names=['id', 'title', 'content', 'label'],
    )


def word_cut(df):
    """Segment one record with jieba and append it to the output file.

    The title and content fields are tokenised with ``jieba.cut`` and the
    tokens re-joined with single spaces. The four fields are then written as
    one tab-separated line to the path stored in ``config['train_cut']``.

    Args:
        df: A single record indexable as ``(id, title, content, label)``.
            Despite the name, this is one row, not a whole DataFrame.
    """
    # NOTE(review): title and content are presumed to be str; a missing
    # value (e.g. NaN produced by pandas) would presumably make jieba.cut
    # fail -- confirm against the input data.
    title_tokens = ' '.join(jieba.cut(df[1]))
    content_tokens = ' '.join(jieba.cut(df[2]))

    # str() keeps the join working when the id or label column was parsed
    # as a number; for values that are already strings it is a no-op.
    line = '\t'.join([str(df[0]), title_tokens, content_tokens, str(df[3])])

    # Write UTF-8 explicitly so the output matches the encoding the input was
    # read with, independent of the platform default. The file is opened only
    # after segmentation is done, and the record is emitted in a single write.
    with open(config['train_cut'], 'a', encoding='utf-8') as f:
        f.write(line + '\n')


def applyParallel(content, func, n_thread):
    """Call ``func`` once per item of ``content`` through joblib.

    Args:
        content: Iterable of items; each one is passed to ``func`` as its
            only argument.
        func: Callable applied to every item.
        n_thread: Value handed to joblib's ``Parallel`` as ``n_jobs``.

    Returns:
        None. Whatever ``func`` returns is discarded, so ``func`` is only
        useful here for its side effects.
    """
    with Parallel(n_jobs=n_thread) as pool:
        # Lazily wrap each item in a joblib task description.
        tasks = (delayed(func)(item) for item in content)
        pool(tasks)


def main(trainfile='data/train_fusai.tsv', n_jobs=22, *, overwrite=True):
    """Segment the training file in parallel and write the result to disk.

    Args:
        trainfile: Path of the raw TSV file handed to ``read_df``. Defaults
            to the path that used to be hard-coded.
        n_jobs: Worker count handed to ``applyParallel``. Defaults to the
            previously hard-coded 22.
        overwrite: When true (the default, matching the old behaviour), an
            existing output file at ``config['train_cut']`` is deleted
            first. ``word_cut`` appends, so leaving the file in place adds
            the new records to the old content.
    """
    if overwrite:
        # Remove directly and ignore a missing file, rather than checking
        # for existence first and racing with other processes.
        try:
            os.remove(config['train_cut'])
        except FileNotFoundError:
            pass

    df = read_df(trainfile)
    # ``df.values`` yields one array per record: (id, title, content, label).
    applyParallel(df.values, word_cut, n_jobs)


if __name__ == '__main__':
    main()

猜你喜欢

转载自www.cnblogs.com/zle1992/p/8967644.html