"""Seq2seq autoencoder over word2vec character embeddings.

Uses the word2vec vectors of a text sequence as both the input and the
target of a seq2seq model (LSTM encoder and LSTM decoder); the encoder's
final state is a learned text feature representation that can be used
for downstream tasks such as classification.
"""
import re
import time
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from gensim.models import Word2Vec

warnings.filterwarnings("ignore")

# Pre-trained word2vec model providing 100-dim character embeddings.
model = Word2Vec.load('model/daixia_w2c_char_100.model')

# Hyperparameters
num_units = 256    # LSTM hidden-state size
input_size = 100   # word2vec embedding dimension
batch_size = 5
vocab_size = 946   # number of characters in the dictionary (incl. EOS)


def get_dict():
    """Read the character dictionary: one character per line, including EOS.

    Returns:
        (dict_char, dict_id): char -> index and index -> char mappings.
    """
    dict_char = {}
    dict_id = {}
    # Context manager guarantees the file handle is closed
    # (the original opened it and never closed it).
    with open('data/char_dict.txt', 'r', encoding='utf-8') as f:
        for i in range(vocab_size):
            word = f.readline().rstrip('\n')
            dict_char[word] = i
            dict_id[i] = word
    return dict_char, dict_id


def get_batches(filename, dict_char, batch_size):
    """Yield (texts, targets) batches forever, wrapping around the dataset.

    Args:
        filename: CSV file with 'text' (space-separated chars) and 'label'
            columns; only 'text' is consumed here.
        dict_char: char -> index mapping from get_dict().
        batch_size: number of samples per yielded batch.

    Yields:
        (texts, targets): lists of char sequences and their index sequences.
    """
    texts = []
    targets = []
    data = pd.read_csv(filename, delimiter=',', encoding='utf-8')
    for i in range(data.shape[0]):
        char_list = re.split(' ', data['text'].loc[i])
        texts.append(char_list)
        targets.append([dict_char[char] for char in char_list])
    # Infinite batch generator; the index resets to 0 whenever the NEXT
    # slice would run past the end, so every yielded batch is full-sized.
    i = 0
    while True:
        yield texts[i:i + batch_size], targets[i:i + batch_size]
        i += batch_size
        if i + batch_size > len(texts):
            i = 0


def make_batch(texts, isTargets=False, max_sequence_length=None):
    """Convert sequences to a zero-padded, time-major numpy batch.

    Args:
        texts: list of char sequences (or index sequences when isTargets).
        isTargets: if True build an int32 index batch, otherwise a float32
            embedding batch via the word2vec model.
        max_sequence_length: pad length; defaults to the longest sequence.

    Returns:
        Array of shape [max_time, batch, input_size] (embeddings) or
        [max_time, batch] (targets), time-major for dynamic_rnn.
    """
    sequence_lengths = [len(text) for text in texts]
    batch_size = len(texts)
    if max_sequence_length is None:
        max_sequence_length = max(sequence_lengths)
    if not isTargets:
        inputs_batch_major = np.zeros(
            shape=[batch_size, max_sequence_length, input_size],
            dtype=np.float32)
        for i, text in enumerate(texts):
            for j, char in enumerate(text):
                # model.wv[...] works on both gensim 3.x and 4.x; direct
                # indexing of the model (model[char]) was removed in gensim 4.
                inputs_batch_major[i, j] = model.wv[char]
    else:
        inputs_batch_major = np.zeros(
            shape=[batch_size, max_sequence_length], dtype=np.int32)
        for i, target in enumerate(texts):
            for j, t in enumerate(target):
                inputs_batch_major[i, j] = t
    # Transpose to time-major [max_time, batch, ...].
    return inputs_batch_major.swapaxes(0, 1)


# Build the training graph (TF1-style API; requires tf.contrib).
train_graph = tf.Graph()
with train_graph.as_default():
    # Time-major placeholders: [max_time, batch, embedding_dim].
    encoder_inputs = tf.placeholder(shape=[None, batch_size, input_size],
                                    dtype=tf.float32, name='encoder_inputs')
    decoder_inputs = tf.placeholder(shape=[None, batch_size, input_size],
                                    dtype=tf.float32, name='decoder_inputs')
    decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32,
                                     name='decoder_targets')

    # LSTM encoder.
    encoder_cell = tf.contrib.rnn.LSTMCell(num_units)
    encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
        encoder_cell, encoder_inputs,
        dtype=tf.float32,
        time_major=True,
    )

    # LSTM decoder, initialised from the encoder's final state; that state
    # is the learned text representation this autoencoder is trained for.
    decoder_cell = tf.contrib.rnn.LSTMCell(num_units)
    decoder_outputs, decoder_final_state = tf.nn.dynamic_rnn(
        decoder_cell, decoder_inputs,
        initial_state=encoder_final_state,
        dtype=tf.float32,
        time_major=True,
        scope="plain_decoder",
    )

    # Projection to vocabulary logits and per-step prediction.
    decoder_logits = tf.contrib.layers.linear(decoder_outputs, vocab_size)
    decoder_prediction = tf.argmax(decoder_logits, 2)

    stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32),
        logits=decoder_logits,
    )
    # Accuracy: compare predictions with the integer targets directly.
    # (The original took argmax over a one-hot of the targets, which is
    # just the targets again — same result, needless extra ops.)
    correct_prediction = tf.equal(decoder_prediction,
                                  tf.cast(decoder_targets, tf.int64))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # Mean cross-entropy loss and Adam optimizer.
    loss = tf.reduce_mean(stepwise_cross_entropy)
    train_op = tf.train.AdamOptimizer().minimize(loss)

    # Saver to checkpoint the trained model.
    saver = tf.train.Saver()


if __name__ == '__main__':
    loss_track = []
    epochs = 10001
    dict_char, dict_id = get_dict()
    gen_batches = get_batches('data/data_char.csv', dict_char, batch_size)

    with tf.Session(graph=train_graph) as sess:
        sess.run(tf.global_variables_initializer())
        time_start = time.time()
        for epoch in range(epochs):
            texts, targets = next(gen_batches)
            # EOS (dictionary index 0) marks sentence end: the decoder
            # input starts with EOS, the decoder target ends with it.
            encoder_inputs_ = make_batch(texts)
            decoder_inputs_ = make_batch([['EOS'] + text for text in texts])
            decoder_targets_ = make_batch(
                [target + [0] for target in targets], True, None)
            feed_dict = {encoder_inputs: encoder_inputs_,
                         decoder_inputs: decoder_inputs_,
                         decoder_targets: decoder_targets_}
            _, l, acc = sess.run([train_op, loss, accuracy], feed_dict)
            loss_track.append(l)
            # epoch % 10 == 0 already covers epoch == 0.
            if epoch % 10 == 0:
                # Reuse l/acc fetched with the training step instead of
                # re-running two extra forward passes as the original did.
                print('loss: {}'.format(l))
                print('acc: {}'.format(acc))
                predict_ = sess.run(decoder_prediction, feed_dict)
                # predict_ is time-major; transpose to iterate per sample.
                for i, (inp, pred) in enumerate(zip(texts, predict_.T)):
                    print('input > {}'.format(inp))
                    print('predicted > {}'.format(
                        [dict_id[idx] for idx in pred]))
                    if i >= 2:
                        break
        time_span = time.time() - time_start
        print('训练花费了{}'.format(time_span))
        saver.save(sess, 'model/dl/model.ckpt')
        plt.plot(loss_track)
        plt.show()