(7) Training the word2vec skip-gram model in TensorFlow

This post trains the word2vec skip-gram model (the center word is used to predict the words in its surrounding window). Data preprocessing: download the corpus, count word frequencies with collections.Counter, and build (center-word, target-word) training pairs. Model construction: data placeholders, the embedding (projection) matrix, the NCE loss, the choice of optimizer, and loss summaries for visualization. Training: mainly wiring up the summaries for TensorBoard and saving model checkpoints.

All constants used by the script

import os
import random
import zipfile
import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
from urllib.request import urlretrieve
from collections import Counter

VOCAB_SIZE = 50000        # vocabulary size (number of most frequent words kept)
# Note: the embedding size happens to equal the batch size here (both 128); they are independent hyperparameters
BATCH_SIZE = 128          # batch size
EMBED_SIZE = 128          # embedding dimension
SKIP_WINDOW = 1           # context window size on each side of the center word
NUM_SAMPLED = 64          # number of negative samples (negative sampling is used instead of a hierarchical softmax over a Huffman tree)
LEARNING_RATE = 1.0       # learning rate
NUM_TRAIN_STEPS = 100000  # number of training steps
init_iterator_step = 55000  # step to resume from when a checkpoint already exists

WEIGHTS_FLD = 'processed/'
SKIP_STEP = 1000          # report the average loss and save the model every SKIP_STEP steps
# path for saving the model
MODEL_PATH = './model'
MODEL_NAME = "model.ckpt"

DOWNLOAD_URL = 'http://mattmahoney.net/dc/'
EXPECTED_BYTES = 31344016
DATA_FOLDER = 'data/'
FILE_NAME = 'text8.zip'

1. Data processing module

# download the dataset if it is not already present
def download(file_name, expected_bytes):
    make_dir(DATA_FOLDER)  # make sure the data/ folder exists before downloading into it
    file_path = DATA_FOLDER + file_name
    if os.path.exists(file_path):
        print("Dataset ready")
        return file_path
    file_name, _ = urlretrieve(DOWNLOAD_URL + file_name, file_path)
    file_stat = os.stat(file_path)
    if file_stat.st_size == expected_bytes:
        print('Successfully downloaded the file', file_name)
    else:
        raise Exception('File ' + file_name +
                        ' might be corrupted. You should try downloading it with a browser.')
    return file_path

# read the corpus out of the zip archive as a list of words
def read_data(file_path):
    with zipfile.ZipFile(file_path) as f:
        words = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return words

# build the vocabulary: the (vocab_size - 1) most frequent words plus 'UNK' at index 0 for everything else
def build_vocab(words, vocab_size):
    dictionary = dict()
    count = [('UNK', -1)]
    count.extend(Counter(words).most_common(vocab_size - 1))
    index = 0
    make_dir('processed')
    with open('processed/vocab_1000.tsv', "w") as f:
        for word, _ in count:
            dictionary[word] = index
            if index < 1000:
                f.write(word + "\n")
            index += 1
    index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, index_dictionary
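For intuition, here is what build_vocab produces on a tiny made-up word list (index 0 is reserved for 'UNK', then words follow in descending frequency, ties keeping their order of first occurrence). This assumes the whole script, including make_dir defined further below, has been loaded:

toy_words = ['the', 'cat', 'sat', 'on', 'the', 'mat', 'the']
dictionary, index_dictionary = build_vocab(toy_words, vocab_size=4)
# dictionary       -> {'UNK': 0, 'the': 1, 'cat': 2, 'sat': 3}
# index_dictionary -> {0: 'UNK', 1: 'the', 2: 'cat', 3: 'sat'}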

# map each word to its vocabulary index (unknown words map to 0, i.e. 'UNK')
def convert_words_to_index(words, dictionary):
    return [dictionary[word] if word in dictionary else 0 for word in words]

# reorganize the data into skip-gram training pairs: the center word predicts its neighbours
def generate_sample(index_words, context_window_size):
    # index is the position of the center word, center is the center word itself
    for index, center in enumerate(index_words):
        context = random.randint(1, context_window_size)  # random window size, as in the original word2vec
        # targets before the center word
        for target in index_words[max(0, index - context): index]:
            yield center, target
        # targets after the center word
        for target in index_words[index + 1: index + context + 1]:
            yield center, target
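Since SKIP_WINDOW is 1 here, the random window is always 1 and each center word is simply paired with its immediate neighbours. A quick illustration on a hypothetical index sequence:

toy_indices = [5, 12, 7, 20]
pairs = list(generate_sample(toy_indices, context_window_size=1))
# pairs -> [(5, 12), (12, 5), (12, 7), (7, 12), (7, 20), (20, 7)]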

# assemble one batch at a time as numpy arrays
def get_batch(iterator, batch_size):
    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros([batch_size, 1], dtype=np.int32)
        for index in range(batch_size):
            center_batch[index], target_batch[index] = next(iterator)
        yield center_batch, target_batch

# create the directory if it does not already exist
def make_dir(path):
    try:
        os.mkdir(path)
    except OSError:
        pass

# full data pipeline: download, read, build the vocabulary, index the words, and produce skip-gram batches
def process_data(vocab_size, batch_size, skip_window):
    # download the data
    file_path = download(FILE_NAME, EXPECTED_BYTES)
    # read it into a list of words
    words = read_data(file_path)
    # build the vocabulary
    dictionary, _ = build_vocab(words, vocab_size)
    # convert words to vocabulary indices
    index_words = convert_words_to_index(words, dictionary)
    del words  # save memory
    # generate skip-gram samples
    single_gen = generate_sample(index_words, skip_window)
    return get_batch(single_gen, batch_size)
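Calling process_data once returns an endless batch generator; each batch already has the shapes that the model placeholders below expect:

batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
centers, targets = next(batch_gen)
# centers.shape -> (128,)    matches the center_words placeholder
# targets.shape -> (128, 1)  matches the target_words placeholder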

2. The skip-gram model

# the skip-gram word-embedding model
class SkipModel:
    def __init__(self, vocab_size, batch_size, embed_size, num_sample, learning_rate):
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.batch_size = batch_size
        self.num_sampled = num_sample
        self.learning_rate = learning_rate
        # step counter for bookkeeping/visualization; trainable=False so the optimizer does not update it directly
        self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name="global_step")

    def _create_placeholders(self):
        """Define placeholders for the input data."""
        with tf.name_scope("data"):  # note the shapes: center words and target words
            self.center_words = tf.placeholder(tf.int32, [self.batch_size], name="center_words")
            self.target_words = tf.placeholder(tf.int32, [self.batch_size, 1], name="target_words")

    def _create_embedding(self):
        """Define the embedding (projection) matrix: one row per vocabulary word, embed_size columns."""
        with tf.device("/cpu:0"):
            with tf.name_scope("embed"):
                # initialized from a uniform distribution
                self.embed_matrix = tf.Variable(
                    tf.random_uniform([self.vocab_size, self.embed_size], minval=-1.0, maxval=1.0),
                    name="embed_matrix")
    def _create_loss(self):
        """Look up the embeddings of the center words and define the NCE loss."""
        with tf.device("/cpu:0"):
            with tf.name_scope("loss"):
                # look up the embeddings of the center words in the projection matrix;
                # returns a Tensor with the same type as the tensors in `params`
                embed = tf.nn.embedding_lookup(self.embed_matrix, self.center_words, name="embed")
                # The vocabulary is usually very large, so a full softmax output layer is too expensive;
                # either a hierarchical softmax (Huffman tree) or negative sampling is used instead.
                # The NCE weights have shape (vocab_size, embed_size): one output class per vocabulary word,
                # connected to the embedding vector. The bias has shape (vocab_size,), one per class.
                nce_weight = tf.Variable(
                    tf.truncated_normal([self.vocab_size, self.embed_size], stddev=1 / (self.embed_size ** 0.5)),
                    name="nce_weight")
                nce_biase = tf.Variable(tf.zeros([self.vocab_size]), name="nce_biase")
                self.loss = tf.reduce_mean(tf.nn.nce_loss(
                    weights=nce_weight, biases=nce_biase, labels=self.target_words, inputs=embed,
                    num_sampled=self.num_sampled, num_classes=self.vocab_size, name="loss"))
        return

    def _create_optimizer(self):
        """Define the optimizer. Note: LEARNING_RATE = 1.0 is typical for plain SGD;
        for AdamOptimizer a much smaller rate (e.g. 0.001) is usually appropriate."""
        with tf.device("/cpu:0"):
            self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(
                self.loss, global_step=self.global_step)
        return

    def _create_summary(self):
        """Define the summaries so that the loss can be visualized in TensorBoard."""
        with tf.name_scope("summary"):
            tf.summary.scalar("loss", self.loss)
            tf.summary.histogram("histogram", self.loss)
            # merge all summaries into a single op
            self.summary_op = tf.summary.merge_all()
        return

    def build_graph(self):
        """Assemble all the pieces of the graph defined above."""
        self._create_placeholders()
        self._create_embedding()
        self._create_loss()
        self._create_optimizer()
        self._create_summary()

3. Training module

# train the word embedding model
def train(model, batch_data, num_train_step):
    saver = tf.train.Saver()
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        if os.path.exists(MODEL_PATH):
            # resume from the checkpoint saved at init_iterator_step
            path = os.path.join(MODEL_PATH, MODEL_NAME) + "-" + str(init_iterator_step)
            saver.restore(sess, path)
        else:
            os.mkdir(MODEL_PATH)
            sess.run(init)
        writer = tf.summary.FileWriter("./graph", sess.graph)
        loss_total = 0
        for i in range(init_iterator_step, num_train_step):
            centers, targets = next(batch_data)
            feed_dict = {model.center_words: centers, model.target_words: targets}
            _, l, summary = sess.run([model.train, model.loss, model.summary_op], feed_dict=feed_dict)
            loss_total += l
            writer.add_summary(summary, global_step=i)
            # report the average loss and save a checkpoint every SKIP_STEP steps
            # (skip the very first step, where loss_total only contains a single value)
            if i % SKIP_STEP == 0 and i != init_iterator_step:
                print("Iteration:{}, loss:{}".format(i, loss_total / SKIP_STEP))
                loss_total = 0
                saver.save(sess, os.path.join(MODEL_PATH, MODEL_NAME), global_step=i)
        writer.close()
    return
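The projector module imported at the top is never actually used in this script. As a minimal sketch (not part of the original code), the trained embedding matrix could be hooked into the TensorBoard embedding projector roughly like this, reusing the processed/vocab_1000.tsv file written by build_vocab as the label metadata:

def add_embedding_visualization(model, sess, writer):
    # associate the embedding variable with the vocabulary metadata file
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = model.embed_matrix.name
    # the path may need to be absolute, or relative to the summary directory
    embedding.metadata_path = 'processed/vocab_1000.tsv'
    # writes projector_config.pbtxt into the summary directory
    projector.visualize_embeddings(writer, config)
    # the projector reads the embedding values from a checkpoint
    tf.train.Saver([model.embed_matrix]).save(sess, os.path.join(MODEL_PATH, 'embed.ckpt'))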

4. Entry point

if __name__ == '__main__':
    model = SkipModel(VOCAB_SIZE, BATCH_SIZE, EMBED_SIZE, NUM_SAMPLED, LEARNING_RATE)
    model.build_graph()
    data = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
    train(model, data, NUM_TRAIN_STEPS)
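While training is running (or afterwards), the loss summaries written to ./graph can be viewed by running tensorboard --logdir=./graph in a terminal and opening the printed URL in a browser.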

Reposted from blog.csdn.net/taka_is_beauty/article/details/89077860