Word2Vec-Based Similarity Computation

The script below trains a skip-gram Word2Vec model on a Chinese earnings-call transcript and then queries the model for related words.

# coding: utf-8
import time
import csv
import glob
import jieba
import gensim.models.word2vec as w2v

def loadPoorEnt(path2='G:/project/sentimation_analysis/data/stopwords.csv'):
    # Load the stop-word list, one word per line
    with open(path2, encoding='UTF-8') as csvfile:
        stopwords = [line.strip() for line in csvfile.readlines()]
    return stopwords

def get_all_content():
    # Collect every company transcript CSV under the data directory
    all_files = glob.glob(r'D:/GFZQ/GFZQ/xuesu2018/xuesu/*.csv')
    return all_files

def get_wenben(path):
    # Read one transcript CSV into a list of rows
    with open(path, 'r', encoding='UTF-8') as csvfile:
        return list(csv.reader(csvfile))

def set_ourdict(all_files, length):
    # Parse the first `length` company transcripts into one list
    all_file = []
    for i in range(length):
        print("Parsing company %d" % i)
        file_path = all_files[i]
        wenben = get_wenben(file_path)
        all_file.append(wenben)
    return all_file

def cal_time(seconds):
    # Format a duration in seconds as a human-readable string
    if seconds < 60:
        return str(seconds) + ' secs'
    if seconds < 60 * 60:
        return str(seconds / 60.0) + ' mins'
    return str(seconds / 3600.0) + ' hours'

if __name__ == '__main__':
    start = time.time()
    stop_words = loadPoorEnt()

    # Test on a single earnings-call transcript
    path = 'G:/project/GFZQ1/跨境通2017年度业绩网上说明会.csv'
    with open(path, 'r', encoding='UTF-8') as f:
        sentences = f.read()

    # Keep non-empty lines of at least two characters, skipping separator lines
    sentences = [i.strip() for i in sentences.split('\n') if i.strip()
                 and len(i) >= 2 and i[0:3] != '---']
    # Segment each sentence with jieba, dropping stop words and single characters
    words = list()
    for sentence in sentences:
        words.append([word.strip() for word in jieba.lcut(sentence)
                      if len(word.strip()) > 1 and word not in stop_words])
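    # For illustration only (hypothetical input and output): a sentence such as
    # "公司与合作伙伴共同发展" would typically be segmented by jieba into tokens
    # like ['公司', '合作伙伴', '共同', '发展'] after stop-word filtering, so
    # `words` ends up as a list of token lists, one per sentence.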



    # Word2Vec hyperparameters
    features = 400  # vector dimensionality; larger is more expressive but slower to train
    min_word_count = 3  # minimum term frequency; rarer words are dropped from training
    context_size = 7  # context window size
    # Write the segmented corpus to disk, one space-separated sentence per line
    with open("G:/project/sentimation_analysis/data/corpus.csv", "w", encoding="utf-8") as f:
        for word in words:
            f.write("{}\n".format(" ".join(word)))
    # Train the model (Text8Corpus also handles space-separated text;
    # w2v.LineSentence is the more idiomatic reader for one-sentence-per-line files)
    corpus = w2v.Text8Corpus('G:/project/sentimation_analysis/data/corpus.csv')
    book2vec = w2v.Word2Vec(corpus,
                            sg=1,  # 1 = skip-gram; 0 = CBOW
                            vector_size=features,  # named `size` before gensim 4.0
                            min_count=min_word_count,
                            window=context_size)
    end = time.time()
    print('Model training took: %s' % cal_time(end - start))

    # Inspect the trained model
    ## Find related words
    sim_word = book2vec.wv.most_similar(positive=['合作'], topn=3)
    print("Words most similar to '合作': ", sim_word)

    ## similarity() returns the cosine similarity between two words
    # sim = book2vec.wv.similarity('阿里', '朝鲜')
    # print("Similarity between '阿里' and '朝鲜': ", sim)
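To reuse the vectors without retraining, the model can be persisted and reloaded with gensim's standard save/load API. A minimal sketch, assuming the script above has already run; the file name below is illustrative, not from the original post:

# Persist the trained model, then reload it and query word similarities.
book2vec.save('G:/project/sentimation_analysis/data/book2vec.model')
model = w2v.Word2Vec.load('G:/project/sentimation_analysis/data/book2vec.model')

# Cosine similarity between two words, assuming both survived the min_count filter
print(model.wv.similarity('合作', '发展'))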

Output:

The larger the corpus, the more accurate the results; more training iterations also help.
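On the point about iterations: gensim exposes the number of training passes through the epochs parameter (called iter before gensim 4.0). A minimal sketch of raising it, reusing the corpus object from above; the value 15 is arbitrary:

# More passes over a small corpus often improve vector quality (gensim's default is 5)
book2vec = w2v.Word2Vec(corpus,
                        sg=1,
                        vector_size=features,
                        min_count=min_word_count,
                        window=context_size,
                        epochs=15)  # `iter=15` in gensim < 4.0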

Reposted from blog.csdn.net/weixin_40411446/article/details/81072291