#coding=UTF-8
import re
import time
import csv
import sys
import os
import gensim.models.word2vec as w2v
import jieba
import glob
def loadPoorEnt(path2='G:/project/sentimation_analysis/data/stopwords.csv'):
    # Load the stopword list, one word per line.
    with open(path2, encoding='UTF-8') as csvfile:
        stopwords = [line.strip() for line in csvfile.readlines()]
    return stopwords
def get_all_content():
    # Collect the paths of all company CSV files.
    all_files = glob.glob(r'D:/GFZQ/GFZQ/xuesu2018/xuesu/*.csv')
    return all_files
def get_wenben(path):
    # Read one CSV transcript and return its rows
    # (materialized as a list so the file handle can be closed).
    with open(path, 'r', encoding='UTF-8') as csvfile:
        return list(csv.reader(csvfile))
def set_ourdict(all_files, length):
    # Parse the first `length` company files into memory.
    all_file = []
    for i in range(length):
        print("Parsing company %d" % i)
        file_path = all_files[i]
        wenben = get_wenben(file_path)
        all_file.append(wenben)
    return all_file
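# A minimal usage sketch for the two helpers above; the count of 10 files is
# an arbitrary assumption, not taken from the original script:
# all_file = set_ourdict(get_all_content(), 10)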
def cal_time(secs):
    # Format a duration in seconds as a human-readable string.
    # (Parameter renamed from `time` to avoid shadowing the time module;
    # the hours branch now divides by 3600 instead of 60.)
    if secs < 60:
        return str(secs) + ' secs'
    if secs < 60 * 60:
        return str(secs / 60.0) + ' mins'
    return str(secs / 3600.0) + ' hours'
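# For example, cal_time(90) returns '1.5 mins' and cal_time(7200) returns '2.0 hours'.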
if __name__ == '__main__':
    start = time.time()
    stop_words = loadPoorEnt()
    ## test
    path = 'G:/project/GFZQ1/跨境通2017年度业绩网上说明会.csv'
    with open(path, 'r', encoding='UTF-8') as f:
        sentences = f.read()
    # Keep non-empty lines of at least two characters, skipping '---' separator lines.
    sentences = [i.strip() for i in sentences.split('\n') if i.strip()
                 and len(i) >= 2 and i[0:3] != '---']
    words = list()
    for sentence in sentences:
        # Segment each sentence with jieba, dropping stopwords and single characters.
        words.append([word.strip() for word in jieba.lcut(sentence)
                      if len(word.strip()) > 1 and word not in stop_words])
    # Initialize the word2vec parameters.
    features = 400        # dimensionality of the word vectors; larger values tend to be more precise but take longer to train
    min_word_count = 3    # minimum word frequency; rarer words are filtered out and excluded from training
    context_size = 7      # context window size
    # Write the segmentation results to disk: one sentence per line, tokens separated by spaces.
    with open("G:/project/sentimation_analysis/data/corpus.csv", "w", encoding="utf-8") as f:
        for word in words:
            f.write("{}\n".format(" ".join(word)))
    # Train the model. Note: gensim >= 4.0 renamed `size` to `vector_size`;
    # the call below follows the gensim 3.x API used here.
    corpus = w2v.Text8Corpus('G:/project/sentimation_analysis/data/corpus.csv')
    book2vec = w2v.Word2Vec(corpus,
                            sg=1,
                            size=features,
                            min_count=min_word_count,
                            window=context_size)
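    # Since the corpus file is written one sentence per line, LineSentence is a
    # more natural reader than Text8Corpus (which treats the whole file as a
    # single stream of space-separated words). This variant is a sketch, not
    # part of the original script; `iter` sets the number of training epochs
    # in gensim 3.x (renamed `epochs` in gensim >= 4.0):
    # corpus = w2v.LineSentence('G:/project/sentimation_analysis/data/corpus.csv')
    # book2vec = w2v.Word2Vec(corpus, sg=1, size=features,
    #                         min_count=min_word_count, window=context_size,
    #                         iter=10)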
    end = time.time()
    print('Model training took: %s' % cal_time(end - start))
    # Inspect the trained model.
    ## Find related words.
    sim_word = book2vec.wv.most_similar(positive=['合作'], topn=3)
    print("Words most similar to '合作': ", sim_word)
    ## Use similarity() to check how similar two words are.
    # sim = book2vec.wv.similarity('阿里', '朝鲜')
    # print("Similarity between '阿里' and '朝鲜': ", sim)
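    # A minimal persistence sketch (the .model path is an assumption, not from
    # the original script); save()/load() and wv[<word>] are standard gensim APIs.
    # book2vec.save('G:/project/sentimation_analysis/data/book2vec.model')
    # book2vec = w2v.Word2Vec.load('G:/project/sentimation_analysis/data/book2vec.model')
    # vec = book2vec.wv['合作']   # the 400-dimensional vector learned for '合作'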
Run result: the bigger the corpus, the more accurate the model; it also still needs more training iterations.