Show and Tell: Lessons learned from the 2015 MSCOCO Image Captioning Challenge (Code Walkthrough)


The image captioning task: given an image, generate a natural-language sentence that describes its content. It involves two components, image feature extraction and sentence generation, in which a CNN and an RNN respectively play the central roles.

The model feeds the 4096-dimensional image feature extracted by the CNN as the first input to the LSTM, and the words of the caption are fed as the inputs at the subsequent time steps. The LSTM learns the dependencies between successive words of the sentence.


The model is defined by the following expressions (shown as a figure in the original post).
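For reference, this is the formulation as given in the Show and Tell paper, with image I, caption S = (S_0, ..., S_N), and word embedding matrix W_e:

    x_{-1}  = CNN(I)
    x_t     = W_e S_t,       t = 0, ..., N-1
    p_{t+1} = LSTM(x_t),     t = 0, ..., N-1
    L(I,S)  = - \sum_{t=1}^{N} \log p_t(S_t)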

Both the 4096-dimensional image feature and each one-hot word vector must first be projected into a fixed embedding dimension, and the embedded caption words are then used as the LSTM inputs at each time step; a minimal sketch of these two projections follows.
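A small NumPy sketch of the two projections, using made-up sizes (dim_embed = dim_hidden = 256 and a 3000-word vocabulary are assumptions for illustration); the Caption_Generator class below does the same thing with TensorFlow variables:

import numpy as np

# hypothetical sizes; the post uses dim_image = 4096 (e.g. VGG fc7 features)
dim_image, dim_embed, n_words, batch_size = 4096, 256, 3000, 2

encode_img_W = np.random.uniform(-0.1, 0.1, (dim_image, dim_embed))  # image projection
Wemb = np.random.uniform(-0.1, 0.1, (n_words, dim_embed))            # word embedding table

image_feat = np.random.randn(batch_size, dim_image)  # CNN features for a batch of images
word_ids = np.array([5, 42])                         # integer word indices for the batch

image_emb = image_feat.dot(encode_img_W)  # (batch_size, dim_embed)
word_emb = Wemb[word_ids]                 # (batch_size, dim_embed); lookup == one-hot matmul
print(image_emb.shape, word_emb.shape)    # (2, 256) (2, 256)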


The full code is as follows:

#-*- coding: utf-8 -*-
import math
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import cPickle

from tensorflow.models.rnn import rnn_cell
import tensorflow.python.platform
from keras.preprocessing import sequence
from collections import Counter
from cnn_util import * 

class Caption_Generator():
    def init_weight(self, dim_in, dim_out, name=None, stddev=1.0):
        return tf.Variable(tf.truncated_normal([dim_in, dim_out], stddev=stddev/math.sqrt(float(dim_in))), name=name)

    def init_bias(self, dim_out, name=None):
        return tf.Variable(tf.zeros([dim_out]), name=name)

    def __init__(self, dim_image, dim_embed, dim_hidden, batch_size, n_lstm_steps, n_words, bias_init_vector=None):

        self.dim_image = np.int(dim_image)   # dimensionality of the CNN image feature
        self.dim_embed = np.int(dim_embed)   # word embedding size (same as dim_hidden here)
        self.dim_hidden = np.int(dim_hidden) # number of units in the LSTM cell
        self.batch_size = np.int(batch_size)
        self.n_lstm_steps = np.int(n_lstm_steps) # sequence length
        self.n_words = np.int(n_words)

        with tf.device("/cpu:0"):#编码词向量到固定为度
            self.Wemb = tf.Variable(tf.random_uniform([n_words, dim_embed], -0.1, 0.1), name='Wemb')

        self.bemb = self.init_bias(dim_embed, name='bemb')

        self.lstm = rnn_cell.BasicLSTMCell(dim_hidden) # initialize the LSTM cell

        #self.encode_img_W = self.init_weight(dim_image, dim_hidden, name='encode_img_W')
        # encode the image feature into the fixed hidden dimension
        self.encode_img_W = tf.Variable(tf.random_uniform([dim_image, dim_hidden], -0.1, 0.1), name='encode_img_W')
        self.encode_img_b = self.init_bias(dim_hidden, name='encode_img_b')
        # map the LSTM output back to the vocabulary space
        self.embed_word_W = tf.Variable(tf.random_uniform([dim_hidden, n_words], -0.1, 0.1), name='embed_word_W')

        if bias_init_vector is not None:
            self.embed_word_b = tf.Variable(bias_init_vector.astype(np.float32), name='embed_word_b')
        else:
            self.embed_word_b = self.init_bias(n_words, name='embed_word_b')

    def build_model(self): # builds the training graph

        image = tf.placeholder(tf.float32, [self.batch_size, self.dim_image])     # image features
        sentence = tf.placeholder(tf.int32, [self.batch_size, self.n_lstm_steps]) # caption word indices for each image
        mask = tf.placeholder(tf.float32, [self.batch_size, self.n_lstm_steps])   # 1 for real words, 0 for padding; used in the loss
        
        # project the image feature into the fixed hidden dimension
        image_emb = tf.matmul(image, self.encode_img_W) + self.encode_img_b # (batch_size, dim_hidden)

        state = tf.zeros([self.batch_size, self.lstm.state_size]) # initial LSTM state (c and h concatenated), zero-initialized

        loss = 0.0
        with tf.variable_scope("RNN"):
            for i in range(self.n_lstm_steps): # maxlen + 1
                if i == 0:
                    current_emb = image_emb # the LSTM input at t = -1 is the encoded image feature
                else:
                    with tf.device("/cpu:0"): #词向量编码固定维度,这里sentence[:,i-1]的i-1表示第一个词语
                        current_emb = tf.nn.embedding_lookup(self.Wemb, sentence[:,i-1]) + self.bemb

                if i > 0 : tf.get_variable_scope().reuse_variables()
                # run one LSTM step
                output, state = self.lstm(current_emb, state) # (batch_size, dim_hidden)

                if i > 0: 
                    # labels: the i-th word of each caption in the batch
                    labels = tf.expand_dims(sentence[:, i], 1) # (batch_size) --> (batch_size, 1)
                    # indices: the row index of each example in the batch
                    indices = tf.expand_dims(tf.range(0, self.batch_size, 1), 1) # (batch_size) --> (batch_size, 1)
                    # pairs of (batch_index, label)
                    concated = tf.concat(1, [indices, labels])
                    # scatter into a dense one-hot matrix: 1.0 at each (batch_index, label) position, 0.0 elsewhere
                    onehot_labels = tf.sparse_to_dense(
                            concated, tf.pack([self.batch_size, self.n_words]), 1.0, 0.0) # (batch_size, n_words)
                    # project the LSTM output onto the vocabulary
                    logit_words = tf.matmul(output, self.embed_word_W) + self.embed_word_b # (batch_size, n_words)
                    # per-example cross-entropy loss
                    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logit_words, onehot_labels)
                    cross_entropy = cross_entropy * mask[:,i] # mask out padding positions

                    current_loss = tf.reduce_sum(cross_entropy)
                    loss = loss + current_loss

            loss = loss / tf.reduce_sum(mask[:,1:])
            return loss, image, sentence, mask

    def build_generator(self, maxlen): # builds the inference graph; maxlen is the maximum caption length to generate
        image = tf.placeholder(tf.float32, [1, self.dim_image])
        image_emb = tf.matmul(image, self.encode_img_W) + self.encode_img_b

        state = tf.zeros([1, self.lstm.state_size])
        #last_word = image_emb 
        generated_words = []

        with tf.variable_scope("RNN"):
            # at t = -1 the input is the encoded image feature
            output, state = self.lstm(image_emb, state)
            # look up index 0 in the embedding table Wemb (n_words, dim_embed) as the start-of-sentence input
            last_word = tf.nn.embedding_lookup(self.Wemb, [0]) + self.bemb

            for i in range(maxlen):
                tf.get_variable_scope().reuse_variables()
                # LSTM output at the current time step
                output, state = self.lstm(last_word, state)
                # project the output back to the vocabulary space
                logit_words = tf.matmul(output, self.embed_word_W) + self.embed_word_b
                # greedy decoding: pick the most probable next word given the image and the words generated so far; max_prob_word is a word index
                max_prob_word = tf.argmax(logit_words, 1)

                with tf.device("/cpu:0"):
                    last_word = tf.nn.embedding_lookup(self.Wemb, max_prob_word)

                last_word += self.bemb

                generated_words.append(max_prob_word)

        return image, generated_words
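For completeness, here is a minimal usage sketch, assuming the same TensorFlow 0.x environment as the class above; the hyper-parameter values and the feats / captions / nonzeros arrays are hypothetical placeholders, not part of the original post:

n_words, maxlen = 3000, 30    # hypothetical vocabulary size and maximum caption length

caption_generator = Caption_Generator(
        dim_image=4096, dim_embed=256, dim_hidden=256,
        batch_size=128, n_lstm_steps=maxlen+1, n_words=n_words)

loss, image, sentence, mask = caption_generator.build_model()
train_op = tf.train.AdamOptimizer(0.001).minimize(loss)

sess = tf.InteractiveSession()
tf.initialize_all_variables().run()

# One training step; feats (128, 4096), captions (128, maxlen+1) and the 0/1 matrix
# nonzeros (128, maxlen+1) are assumed to be NumPy arrays prepared beforehand:
# _, loss_value = sess.run([train_op, loss],
#         feed_dict={image: feats, sentence: captions, mask: nonzeros})

# For decoding, build_generator(maxlen) is typically built in a fresh graph with
# batch_size=1 and the trained weights restored from a checkpoint; running the
# returned generated_words ops yields word indices that are mapped back to words.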


Reposted from blog.csdn.net/liyaohhh/article/details/53955715