模型训练
# 引入 word2vec
from gensim.models import word2vec
# Set up logging at INFO level with a timestamped "time : level : message" layout.
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
# Toy corpus: one raw sentence per list element.
raw_sentences = ["the quick brown fox jumps over the lazy dogs", "yoyoyo you go home now to sleep"]
# Tokenize each sentence on whitespace.
# Iterate over `raw_sentences` (the name defined just above) and keep the tokens
# as `str`: encoding to UTF-8 first would produce `bytes` tokens on Python 3,
# which never match the plain-string lookups ('dogs', 'you') used later.
sentences = [s.split() for s in raw_sentences]
# Build the model. min_count=1 keeps words that occur only once, which this
# tiny corpus needs (the library default would drop words seen fewer than 5 times).
model = word2vec.Word2Vec(sentences, min_count=1)
# Compare two words. Query through `model.wv` (the trained word vectors):
# the model-level `similarity()` shortcut was deprecated and is gone in gensim 4.0.
# Print the score so it is visible when this runs as a script, not only in a REPL.
print(model.wv.similarity('dogs', 'you'))
模型参数
- min_count
# Ignore words that occur fewer than 10 times in the corpus.
# Uses `word2vec.Word2Vec` because only the `word2vec` module is imported in this file.
model = word2vec.Word2Vec(sentences, min_count=10)  # default value is 5
在较大的语料集中，我们希望忽略那些只出现过一两次的单词，这可以通过设置 min_count 参数来控制：出现次数低于该值的单词会被忽略。一般而言，合理的参数值会设置在 0~100 之间。
- size 词向量维度
# Train 200-dimensional word vectors.
# Uses `word2vec.Word2Vec` because only the `word2vec` module is imported in this file.
# NOTE(review): gensim 4.0 renamed this keyword to `vector_size`; use that name when running gensim >= 4.0.
model = word2vec.Word2Vec(sentences, size=200)  # default value is 100
- workers参数用于设置并发训练时候的线程数
# Train with 4 worker threads.
# Uses `word2vec.Word2Vec` because only the `word2vec` module is imported in this file.
# NOTE(review): current gensim releases appear to default to workers=3 (early versions
# defaulted to a single worker, i.e. no parallelization) — confirm against the installed version.
model = word2vec.Word2Vec(sentences, workers=4)
外部语料
如果输入语料集很大，或者需要整合磁盘上多个文件夹下的数据，我们可以用迭代器逐行读取，而不是一次性将全部内容读入内存，从而节省 RAM 空间。
import os


class MySentences(object):
    """Iterable that streams tokenized sentences from the files in a directory.

    Every line of every file is treated as one sentence and split on
    whitespace. Each call to ``iter()`` starts a fresh pass over the
    directory, so the object can be iterated more than once.
    """

    def __init__(self, dirname):
        """Remember the directory to read from.

        Args:
            dirname: Path of the directory whose files hold the corpus.
        """
        self.dirname = dirname

    def __iter__(self):
        """Yield each line of each file in ``self.dirname`` as a list of tokens."""
        for fname in os.listdir(self.dirname):
            # `with` closes each file deterministically instead of leaving
            # the handle open until it happens to be garbage-collected.
            with open(os.path.join(self.dirname, fname)) as corpus_file:
                for line in corpus_file:
                    yield line.split()
# Stream sentences from disk through the iterable instead of loading them all at once.
sentences = MySentences('/some/directory')  # a memory-friendly iterator
# Use the `word2vec` module that this file imports; the bare name `gensim` is never imported.
model = word2vec.Word2Vec(sentences)
模型保存与读取
# Save the model in gensim's native format and load it back.
model.save('text8.model')
# `Word2Vec` is reached through the imported `word2vec` module; the bare name is not in scope.
model1 = word2vec.Word2Vec.load('text8.model')
# Export / import the word vectors in the original word2vec binary format.
# These helpers live on the word-vector object (`model.wv`, class KeyedVectors);
# the model-level versions were deprecated and are gone in gensim 4.0.
# NOTE: after this line `model1` holds the vectors only, not a trainable Word2Vec model.
from gensim.models import KeyedVectors
model.wv.save_word2vec_format('text.model.bin', binary=True)
model1 = KeyedVectors.load_word2vec_format('text.model.bin', binary=True)
模型预测
# Analogy query (woman + king - man); topn=1 returns only the best match.
# All queries go through `model.wv`: the model-level shortcuts were deprecated and are gone in gensim 4.0.
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
[('queen', 0.50882536)]
# Find the word that does not fit with the others (stray ';' before .split() removed).
model.wv.doesnt_match("breakfast cereal dinner lunch".split())
'cereal'
# Similarity score between two words.
model.wv.similarity('woman', 'man')
0.73723527
# Words most similar to a single word; the pasted result follows.
model.wv.most_similar(['man'])
[(u'woman', 0.5686948895454407),
(u'girl', 0.4957364797592163),
(u'young', 0.4457539916038513),
(u'luckiest', 0.4420626759529114),
(u'serpent', 0.42716869711875916),
(u'girls', 0.42680859565734863),
(u'smokes', 0.4265017509460449),
(u'creature', 0.4227582812309265),
(u'robot', 0.417464017868042),
(u'mortal', 0.41728296875953674)]