Training a memory network on the bAbI dataset
bAbI: a synthetic reading-comprehension and question-answering dataset from FAIR (Facebook AI Research).
Official site: https://research.fb.com/downloads/babi/
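Each "story" in a bAbI task file is a block of numbered sentences; a question line additionally carries the answer and the index of the supporting sentence, separated by tabs, which is the layout the parsing code below relies on. An illustrative QA1-style fragment (shown for the format only, not quoted verbatim from the dataset):

1 Mary moved to the bathroom.
2 John went to the hallway.
3 Where is Mary?	bathroom	1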
Annotated code
'''Trains a memory network on the bAbI dataset.

References:

- Jason Weston, Antoine Bordes, Sumit Chopra, Tomas Mikolov,
  Alexander M. Rush,
  "Towards AI-Complete Question Answering: A Set of Prerequisite Toy Tasks",
  http://arxiv.org/abs/1502.05698

- Sainbayar Sukhbaatar, Arthur Szlam, Jason Weston, Rob Fergus,
  "End-To-End Memory Networks",
  http://arxiv.org/abs/1503.08895

Reaches 98.6% accuracy on task 'single_supporting_fact_10k' after 120 epochs.
Time per epoch: 3s on CPU (core i7).
'''
from __future__ import print_function

from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate
from keras.layers import LSTM
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
from functools import reduce
import tarfile
import numpy as np
import re


def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.

    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    return [x.strip() for x in re.split(r'(\W+)', sent) if x.strip()]


def parse_stories(lines, only_supporting=False):
    '''Parse stories provided in the bAbI tasks format.

    If only_supporting is true,
    only the sentences that support the answer are kept.
    '''
    data = []
    story = []
    for line in lines:
        line = line.decode('utf-8').strip()
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            substory = None
            if only_supporting:
                # Only select the related substory
                supporting = map(int, supporting.split())
                substory = [story[i - 1] for i in supporting]
            else:
                # Provide all the substories
                substory = [x for x in story if x]
            data.append((substory, q, a))
            story.append('')
        else:
            sent = tokenize(line)
            story.append(sent)
    return data


def get_stories(f, only_supporting=False, max_length=None):
    '''Given a file name, read the file, retrieve the stories,
    and then convert the sentences into a single story.

    If max_length is supplied,
    any stories longer than max_length tokens will be discarded.
    '''
    data = parse_stories(f.readlines(), only_supporting=only_supporting)
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    data = [(flatten(story), q, answer) for story, q, answer in data
            if not max_length or len(flatten(story)) < max_length]
    return data


def vectorize_stories(data):
    inputs, queries, answers = [], [], []
    for story, query, answer in data:
        inputs.append([word_idx[w] for w in story])
        queries.append([word_idx[w] for w in query])
        answers.append(word_idx[answer])
    return (pad_sequences(inputs, maxlen=story_maxlen),
            pad_sequences(queries, maxlen=query_maxlen),
            np.array(answers))


try:
    path = get_file('babi-tasks-v1-2.tar.gz',
                    origin='https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz')
except Exception:
    print('Error downloading dataset, please download it manually:\n'
          '$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz\n'
          '$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz')
    raise

challenges = {
    # QA1 with 10,000 samples
    'single_supporting_fact_10k': 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt',
    # QA2 with 10,000 samples
    'two_supporting_facts_10k': 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt',
}
challenge_type = 'single_supporting_fact_10k'
challenge = challenges[challenge_type]

print('Extracting stories for the challenge:', challenge_type)
with tarfile.open(path) as tar:
    train_stories = get_stories(tar.extractfile(challenge.format('train')))
    test_stories = get_stories(tar.extractfile(challenge.format('test')))

vocab = set()
for story, q, answer in train_stories + test_stories:
    vocab |= set(story + q + [answer])
vocab = sorted(vocab)

# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
story_maxlen = max(map(len, (x for x, _, _ in train_stories + test_stories)))
query_maxlen = max(map(len, (x for _, x, _ in train_stories + test_stories)))

print('-')
print('Vocab size:', vocab_size, 'unique words')
print('Story max length:', story_maxlen, 'words')
print('Query max length:', query_maxlen, 'words')
print('Number of training stories:', len(train_stories))
print('Number of test stories:', len(test_stories))
print('-')
print('Here\'s what a "story" tuple looks like (input, query, answer):')
print(train_stories[0])
print('-')
print('Vectorizing the word sequences...')

word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
inputs_train, queries_train, answers_train = vectorize_stories(train_stories)
inputs_test, queries_test, answers_test = vectorize_stories(test_stories)

print('-')
print('inputs: integer tensor of shape (samples, max_length)')
print('inputs_train shape:', inputs_train.shape)
print('inputs_test shape:', inputs_test.shape)
print('-')
print('queries: integer tensor of shape (samples, max_length)')
print('queries_train shape:', queries_train.shape)
print('queries_test shape:', queries_test.shape)
print('-')
print('answers: integer tensor of shape (samples,)')
print('answers_train shape:', answers_train.shape)
print('answers_test shape:', answers_test.shape)
print('-')
print('Compiling...')

# placeholders
input_sequence = Input((story_maxlen,))
question = Input((query_maxlen,))

# encoders
# embed the input sequence into a sequence of vectors
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size,
                              output_dim=64))
input_encoder_m.add(Dropout(0.3))
# output: (samples, story_maxlen, embedding_dim)

# embed the input into a sequence of vectors of size query_maxlen
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size,
                              output_dim=query_maxlen))
input_encoder_c.add(Dropout(0.3))
# output: (samples, story_maxlen, query_maxlen)

# embed the question into a sequence of vectors
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size,
                               output_dim=64,
                               input_length=query_maxlen))
question_encoder.add(Dropout(0.3))
# output: (samples, query_maxlen, embedding_dim)

# encode input sequence and questions (which are indices)
# to sequences of dense vectors
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

# compute a 'match' between the first input vector sequence
# and the question vector sequence
# shape: `(samples, story_maxlen, query_maxlen)`
match = dot([input_encoded_m, question_encoded], axes=(2, 2))
match = Activation('softmax')(match)

# add the match matrix with the second input vector sequence
response = add([match, input_encoded_c])  # (samples, story_maxlen, query_maxlen)
response = Permute((2, 1))(response)  # (samples, query_maxlen, story_maxlen)

# concatenate the match matrix with the question vector sequence
answer = concatenate([response, question_encoded])

# the original paper uses a matrix multiplication for this reduction step.
# we choose to use a RNN instead.
answer = LSTM(32)(answer)  # (samples, 32)

# one regularization layer -- more would probably be needed.
answer = Dropout(0.3)(answer)
answer = Dense(vocab_size)(answer)  # (samples, vocab_size)
# we output a probability distribution over the vocabulary
answer = Activation('softmax')(answer)

# build the final model
model = Model([input_sequence, question], answer)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# train
model.fit([inputs_train, queries_train], answers_train,
          batch_size=32,
          epochs=120,
          validation_data=([inputs_test, queries_test], answers_test))
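The listing above does not persist the trained network. If you want to keep the trained model (the one referred to in the download section at the end of this post), a minimal sketch is a single extra line at the end of the script; the file name babi_memnn.h5 is an assumption, not part of the original example.

# Hypothetical addition at the end of the script: save the trained model
# so it can be reloaded later without re-training.
model.save('babi_memnn.h5')  # file name is an assumption

# Reloading in a later session:
# from keras.models import load_model
# model = load_model('babi_memnn.h5')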
Running the code
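Assuming the listing above is saved as babi_memnn.py (a hypothetical file name), running python babi_memnn.py downloads the data, trains for 120 epochs, and prints the validation accuracy after each epoch. The sketch below, which assumes the variables from the script are still in scope, shows one way to check the trained model; the inverse index idx_word is an assumption built from the word_idx dictionary defined in the listing.

# Minimal sketch: evaluate on the held-out stories and decode one prediction.
loss, acc = model.evaluate([inputs_test, queries_test], answers_test, verbose=0)
print('Test loss / accuracy: {:.4f} / {:.4f}'.format(loss, acc))

idx_word = {i: w for w, i in word_idx.items()}  # hypothetical inverse vocabulary
probs = model.predict([inputs_test[:1], queries_test[:1]])
print('Predicted answer:', idx_word[int(np.argmax(probs[0]))])
print('True answer:     ', idx_word[int(answers_test[0])])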
Detailed Keras documentation
Chinese documentation: http://keras-cn.readthedocs.io/en/latest/
Example downloads
https://github.com/keras-team/keras
https://github.com/keras-team/keras/tree/master/examples
Full project download
For readers without download credits, add QQ 452205574 to access the shared folder.
It includes the code, the dataset (and images), the trained model, library installation files, and so on.