Learning Phrase Representations using RNN Encoder–Decoder for Statistical Machine Translation (2014): code implementation

Task: machine translation (without attention)

import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
# S: symbol marking the start of the decoder input
# E: symbol marking the end of the decoder output
# P: padding symbol used when a sequence is shorter than n_step time steps

seq_data = [['我', '爱', '中国'], ['我', '很', '喜欢', '你'], ['猫', '正在', '跑']] # training set: 3 source sentences
seq_label = [['I', 'love', 'china'], ['I', 'like', 'you', 'very', 'much'], ['the', 'cat', 'is', 'running']] # training labels (target sentences)
num_dic = {}
num_dic['S'] = 0 # S: start symbol prepended to the decoder input
num_dic['E'] = 1 # E: end symbol appended to the decoder target
num_dic['P'] = 2 # P: padding symbol used when a sequence is shorter than n_step
index = 3
for seq in seq_data:
    for i in seq:
        if i not in num_dic.keys():
            num_dic[i] = index
            index += 1
for seq in seq_label:
    for i in seq:
        if i not in num_dic.keys():
            num_dic[i] = index
            index += 1

print(num_dic)
    
char_arr = num_dic.keys()
print(char_arr)
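
Predictions from the network come back as vocabulary indices, so it is convenient to also keep the reverse mapping. A minimal sketch (the name idx2word is my own addition, not part of the original code):

idx2word = {i: w for w, i in num_dic.items()} # index -> word, the inverse of num_dic
print(idx2word[3]) # '我'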

# Seq2Seq Parameter
n_step = 5 # number of time steps; every sequence is padded to this length
n_hidden = 128
n_class = len(num_dic) # vocabulary size (23)
batch_size = len(seq_data) # 3 samples
{'S': 0, 'E': 1, 'P': 2, '我': 3, '爱': 4, '中国': 5, '很': 6, '喜欢': 7, '你': 8, '猫': 9, '正在': 10, '跑': 11, 'I': 12, 'love': 13, 'china': 14, 'like': 15, 'you': 16, 'very': 17, 'much': 18, 'the': 19, 'cat': 20, 'is': 21, 'running': 22}
dict_keys(['S', 'E', 'P', '我', '爱', '中国', '很', '喜欢', '你', '猫', '正在', '跑', 'I', 'love', 'china', 'like', 'you', 'very', 'much', 'the', 'cat', 'is', 'running'])
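
make_batch below pads every sequence to n_step tokens, so n_step must be at least as long as the longest sequence; here the longest target has 5 tokens, so n_step = 5 just fits. A quick sanity check (my own addition, not in the original code):

assert all(len(s) <= n_step for s in seq_data + seq_label), 'n_step is shorter than the longest sequence'
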
def make_batch(seq_data, seq_label):
    input_batch, output_batch, target_batch = [], [], [] # input_batch: encoder input; output_batch: decoder input; target_batch: labels

    for seq in seq_data:
        seq.extend((n_step - len(seq)) * 'P') # pad in place with 'P' up to n_step tokens
        input = [num_dic[i] for i in seq]
        input_batch.append(np.eye(n_class)[input]) # np.eye builds an identity matrix; selecting its rows by index gives one-hot vectors
    for seq in seq_label:
        seq.extend((n_step - len(seq)) * 'P')
        output = [num_dic['S']] + [num_dic[i] for i in seq] # decoder input: 'S' + padded target
        target = [num_dic[i] for i in seq] + [num_dic['E']] # label: padded target + 'E'
        output_batch.append(np.eye(n_class)[output])
        target_batch.append(target) # indices, not one-hot; CrossEntropyLoss expects class indices

    # make tensors
    return Variable(torch.Tensor(input_batch)), Variable(torch.Tensor(output_batch)), Variable(torch.LongTensor(target_batch))
input_batch, output_batch, target_batch = make_batch(seq_data, seq_label) # input_batch and output_batch are one-hot encoded
print(input_batch.shape, output_batch.shape, target_batch.shape)
# input_batch.shape: 3 is the number of samples (batch_size), 5 is the number of time steps (n_step), 23 is n_class
print(input_batch[0])
# input_batch[0]: the column holding the 1 is the token's index; for the first sample the rows read 我 爱 中国 P P, hence the second dimension is 5. It is simply a one-hot (vectorised) representation of each token.
print(target_batch)
torch.Size([3, 5, 23]) torch.Size([3, 6, 23]) torch.Size([3, 6])
tensor([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.]])
tensor([[12, 13, 14,  2,  2,  1],
        [12, 15, 16, 17, 18,  1],
        [19, 20, 21, 22,  2,  1]])
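
As a readability check (my own addition, reusing the idx2word mapping from above), the one-hot rows can be mapped back to tokens:

print([idx2word[i] for i in input_batch[0].argmax(dim=1).tolist()]) # ['我', '爱', '中国', 'P', 'P']
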
# Model
class Seq2Seq(nn.Module):
    def __init__(self):
        super(Seq2Seq, self).__init__()

        self.enc_cell = nn.RNN(input_size=n_class, hidden_size=n_hidden, dropout=0.5) # encoder (dropout has no effect with a single layer; PyTorch only warns)
        self.dec_cell = nn.RNN(input_size=n_class, hidden_size=n_hidden, dropout=0.5) # decoder
        self.fc = nn.Linear(n_hidden, n_class)

    def forward(self, enc_input, enc_hidden, dec_input):
        enc_input = enc_input.transpose(0, 1) # reshape to PyTorch's default RNN layout: (seq_len, batch_size, input_size)
        dec_input = dec_input.transpose(0, 1) # transpose swaps dim 0 and dim 1

        _, enc_states = self.enc_cell(enc_input, enc_hidden)
        outputs, _ = self.dec_cell(dec_input, enc_states) # the encoder's final hidden state serves as the decoder's h0

        output = self.fc(outputs) # [n_step + 1, batch_size, n_class]
        return output
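
The 2014 paper referenced in the title actually proposes the GRU cell rather than the vanilla RNN used here, and nn.GRU is a drop-in replacement. A sketch of that variant (the class name Seq2SeqGRU is my own, not part of the original post):

class Seq2SeqGRU(nn.Module):
    def __init__(self):
        super(Seq2SeqGRU, self).__init__()
        self.enc_cell = nn.GRU(input_size=n_class, hidden_size=n_hidden) # GRU encoder, as in the 2014 paper
        self.dec_cell = nn.GRU(input_size=n_class, hidden_size=n_hidden) # GRU decoder
        self.fc = nn.Linear(n_hidden, n_class)

    def forward(self, enc_input, enc_hidden, dec_input):
        enc_input = enc_input.transpose(0, 1) # (seq_len, batch_size, n_class)
        dec_input = dec_input.transpose(0, 1)
        _, enc_states = self.enc_cell(enc_input, enc_hidden)
        outputs, _ = self.dec_cell(dec_input, enc_states) # decoder starts from the encoder's final hidden state
        return self.fc(outputs) # (seq_len, batch_size, n_class)
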
model = Seq2Seq()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
for epoch in range(5000):

    hidden = Variable(torch.zeros(1, batch_size, n_hidden))
    optimizer.zero_grad()
    output = model(input_batch, hidden, output_batch)
    output = output.transpose(0, 1) # [batch_size, n_step + 1, n_class]
    loss = 0
    for i in range(0, len(target_batch)):
        # output[i] : [max_len+1, n_class], target_batch[i] : [max_len+1]
        loss += criterion(output[i], target_batch[i])
    if (epoch + 1) % 1000 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
    loss.backward()
    optimizer.step()

# Test
def translate(test_seq, test_label):
    input_batch, output_batch, _ = make_batch(test_seq, test_label)

    # make hidden shape [num_layers * num_directions, batch_size, n_hidden]
    hidden = Variable(torch.zeros(1, len(test_seq), n_hidden)) # batch dimension equals the number of test sentences
    output = model(input_batch, hidden, output_batch)
    
    predict = output.data.max(2, keepdim=True)[1] # argmax over the n_class dimension
    predict = predict.transpose(0,1)
  
    return predict

test_seq = [['我', '喜欢', '猫'], ['我', '爱', '你']]
test_label = [['P'] * len(i) for i in test_seq] # the target is unknown at test time, so the decoder input is just 'S' followed by padding

result = translate(test_seq, test_label)
print(result.shape)
print(result)
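
The returned indices are hard to read on their own; a small decoding step (my own addition, reusing the idx2word mapping defined earlier) turns them back into words and drops the special symbols:

for i, src in enumerate(test_seq):
    pred_ids = result[i].squeeze(-1).tolist() # predicted index at each decoder step for sample i
    words = [idx2word[idx] for idx in pred_ids]
    words = [w for w in words if w not in ('S', 'E', 'P')] # strip special symbols
    print([w for w in src if w != 'P'], '->', words) # make_batch padded test_seq in place, so drop the padding here too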

To summarize the code above. For the training set, the network's inputs are: the encoder receives the one-hot source batch of shape [batch_size, n_step, n_class]; the decoder receives 'S' followed by the padded one-hot target, of shape [batch_size, n_step + 1, n_class]; the loss is computed against the padded target indices followed by 'E'.

For the test set, the encoder input is built the same way from the source sentences, while the decoder input is 'S' followed only by padding symbols, since the target sentence is unknown; the argmax of the output at each step is taken as the translation.


Reposted from blog.csdn.net/weixin_43178406/article/details/102779880