# Implementing an RNN in TensorFlow
#
# We implement an RNN in TensorFlow to predict spam/ham from SMS texts.

import os
import re
import io
import requests
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from zipfile import ZipFile
from tensorflow.python.framework import ops

ops.reset_default_graph()

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Start a graph session
sess = tf.Session()

# Set RNN parameters
epochs = 20
batch_size = 250
max_sequence_length = 25  # texts longer than 25 words are truncated; shorter ones are zero-padded
rnn_size = 10             # number of neurons in the RNN cell
embedding_size = 50       # each word is embedded in a word vector of length 50
min_word_frequency = 10   # only keep words that appear at least 10 times
learning_rate = 0.0005
dropout_keep_prob = tf.placeholder(tf.float32)

# Download or open data
data_dir = 'temp'
data_file = 'text_data.txt'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
if not os.path.isfile(os.path.join(data_dir, data_file)):
    zip_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
    r = requests.get(zip_url)
    z = ZipFile(io.BytesIO(r.content))
    file = z.read('SMSSpamCollection')
    # Format data
    text_data = file.decode()
    text_data = text_data.encode('ascii', errors='ignore')
    text_data = text_data.decode().split('\n')
    # Save data to text file
    with open(os.path.join(data_dir, data_file), 'w') as file_conn:
        for text in text_data:
            file_conn.write("{}\n".format(text))
else:
    # Open data from text file
    text_data = []
    with open(os.path.join(data_dir, data_file), 'r') as file_conn:
        for row in file_conn:
            text_data.append(row)
    text_data = text_data[:-1]

text_data = [x.split('\t') for x in text_data if len(x) >= 1]
[text_data_target, text_data_train] = [list(x) for x in zip(*text_data)]

# Create a text cleaning function
def clean_text(text_string):
    text_string = re.sub(r'([^\s\w]|_|[0-9])+', '', text_string)
    text_string = " ".join(text_string.split())
    text_string = text_string.lower()
    return text_string

# Clean texts
text_data_train = [clean_text(x) for x in text_data_train]
print(text_data_train)
print('_________________________________')

# Change texts into numeric vectors: each text becomes a list of word indices
# (see the VocabularyProcessor documentation for usage details)
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
    max_sequence_length, min_frequency=min_word_frequency)
text_processed = np.array(list(vocab_processor.fit_transform(text_data_train)))

# Shuffle and split data
text_processed = np.array(text_processed)
text_data_target = np.array([1 if x == 'ham' else 0 for x in text_data_target])
shuffled_ix = np.random.permutation(np.arange(len(text_data_target)))
x_shuffled = text_processed[shuffled_ix]
y_shuffled = text_data_target[shuffled_ix]

# Split train/test set
ix_cutoff = int(len(y_shuffled) * 0.80)
x_train, x_test = x_shuffled[:ix_cutoff], x_shuffled[ix_cutoff:]
y_train, y_test = y_shuffled[:ix_cutoff], y_shuffled[ix_cutoff:]
vocab_size = len(vocab_processor.vocabulary_)
print("Vocabulary Size: {:d}".format(vocab_size))
print("80-20 Train Test split: {:d} -- {:d}".format(len(y_train), len(y_test)))

# Create placeholders
x_data = tf.placeholder(tf.int32, [None, max_sequence_length])
y_output = tf.placeholder(tf.int32, [None])

# Create the embedding matrix and the embedding lookup operation for the input data x_data
embedding_mat = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))
embedding_output = tf.nn.embedding_lookup(embedding_mat, x_data)
# embedding_output has shape [None, max_sequence_length, embedding_size]
# embedding_output_expanded = tf.expand_dims(embedding_output, -1)

# Define the RNN cell (num_units is the number of neurons in the cell)
cell = tf.nn.rnn_cell.BasicRNNCell(num_units=rnn_size)
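# Illustrative note (not in the original post): BasicRNNCell can be swapped for a
# gated cell with the same interface if the plain RNN underfits; the rest of the
# graph is unchanged. A hedged sketch, left commented out:
# cell = tf.nn.rnn_cell.LSTMCell(num_units=rnn_size)
# cell = tf.nn.rnn_cell.GRUCell(num_units=rnn_size)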
output, state = tf.nn.dynamic_rnn(cell, embedding_output, dtype=tf.float32)
# Because dynamic_rnn is used here, the number of time steps need not be fixed:
# it can handle variable-length sequences.
output = tf.nn.dropout(output, dropout_keep_prob)

# Get output of the RNN sequence: transpose to [time, batch, units] and take the
# last time step's output as the input to the fully connected layer
output = tf.transpose(output, [1, 0, 2])
last = tf.gather(output, int(output.get_shape()[0]) - 1)

weight = tf.Variable(tf.truncated_normal([rnn_size, 2], stddev=0.1))
bias = tf.Variable(tf.constant(0.1, shape=[2]))
# sparse_softmax_cross_entropy_with_logits expects raw logits, so no softmax here
# (the original applied tf.nn.softmax first, which double-applies the softmax)
logits_out = tf.matmul(last, weight) + bias

# Loss function
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits_out, labels=y_output)
loss = tf.reduce_mean(losses)

accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(logits_out, 1),
                                           tf.cast(y_output, tf.int64)), tf.float32))

optimizer = tf.train.RMSPropOptimizer(learning_rate)
train_step = optimizer.minimize(loss)

init = tf.global_variables_initializer()  # initialize_all_variables() is deprecated
sess.run(init)

train_loss = []
test_loss = []
train_accuracy = []
test_accuracy = []

# Start training
for epoch in range(epochs):
    # Shuffle training data
    shuffled_ix = np.random.permutation(np.arange(len(x_train)))
    x_train = x_train[shuffled_ix]
    y_train = y_train[shuffled_ix]
    num_batches = int(len(x_train) / batch_size) + 1

    # Loop over mini-batches
    for i in range(num_batches):
        # Select train data
        min_ix = i * batch_size
        max_ix = np.min([len(x_train), ((i + 1) * batch_size)])
        x_train_batch = x_train[min_ix:max_ix]
        y_train_batch = y_train[min_ix:max_ix]

        # Run train step
        train_dict = {x_data: x_train_batch, y_output: y_train_batch, dropout_keep_prob: 0.5}
        sess.run(train_step, feed_dict=train_dict)

    # Run loss and accuracy for training
    temp_train_loss, temp_train_acc = sess.run([loss, accuracy], feed_dict=train_dict)
    train_loss.append(temp_train_loss)
    train_accuracy.append(temp_train_acc)

    # Run eval step
    test_dict = {x_data: x_test, y_output: y_test, dropout_keep_prob: 1.0}
    temp_test_loss, temp_test_acc = sess.run([loss, accuracy], feed_dict=test_dict)
    test_loss.append(temp_test_loss)
    test_accuracy.append(temp_test_acc)
    print('Epoch: {}, Test Loss: {:.2}, Test Acc: {:.2}'.format(epoch + 1, temp_test_loss, temp_test_acc))

# Plot loss over time
epoch_seq = np.arange(1, epochs + 1)
plt.plot(epoch_seq, train_loss, 'k--', label='Train Set')
plt.plot(epoch_seq, test_loss, 'r-', label='Test Set')
plt.title('Softmax Loss')
plt.xlabel('Epochs')
plt.ylabel('Softmax Loss')
plt.legend(loc='upper left')
plt.show()

# Plot accuracy over time
plt.plot(epoch_seq, train_accuracy, 'k--', label='Train Set')
plt.plot(epoch_seq, test_accuracy, 'r-', label='Test Set')
plt.title('Test Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='upper left')
plt.show()

# Summary:
# 1. To train an RNN, first obtain the dataset and convert it into vector form.
# 2. Then define the RNN model. The main parameters are the number of neurons per cell
#    and the number of time steps; with dynamic_rnn the number of steps need not be
#    fixed, because dynamic_rnn can handle variable-length sequences.
# 3. Define the loss function and optimize it. For details, see the source code and
#    another blog post on using an RNN with the MNIST dataset.
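# Illustrative addition (not in the original post): a minimal sketch of scoring new
# messages with the trained graph, reusing the names defined above. The two sample
# strings below are made-up examples.
sample_texts = [clean_text('winner claim your free prize now'),
                clean_text('are we still on for lunch today')]
sample_processed = np.array(list(vocab_processor.transform(sample_texts)))
sample_logits = sess.run(logits_out, feed_dict={x_data: sample_processed,
                                                dropout_keep_prob: 1.0})
# In this script's label encoding, 1 = 'ham' and 0 = 'spam'
print(np.argmax(sample_logits, axis=1))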
# Reposted from blog.csdn.net/tianguiyuyu/article/details/80176877