Following on from the previous article, this is the CNN recognition code. Much of the code available online can be run as-is, but I typed it out myself so that I would understand it properly.
# -*- coding: utf-8 -*-
"""Train a four-layer CNN image classifier from a TFRecords file.

The script is written against the TensorFlow 1.x graph API (placeholders,
sessions and queue runners).
"""
import numpy as np
import tensorflow as tf

# Scale factors applied to the random initial weights and biases.
w_alpha = 0.01
b_alpha = 0.1

IMAGE_HEIGHT = 240
IMAGE_WIDTH = 320
MAX_CAPTCHA = 1
# Number of image classes: 37
CHAR_SET_LEN = 37
# Value fed to ``keep_prob`` during training.
dropout = 0.7

conv_dict = {
    # First convolution layer, 3x3 kernel. The input is a colour image, so
    # the input channel count is 3; the output channel count is 32.
    "w_1": tf.Variable(w_alpha * tf.random_normal([3, 3, 3, 32]), name='w_1'),
    "b_1": tf.Variable(b_alpha * tf.random_normal([32]), name='b_1'),
    # Second convolution layer
    "w_2": tf.Variable(w_alpha * tf.random_normal([3, 3, 32, 64]), name='w_2'),
    "b_2": tf.Variable(b_alpha * tf.random_normal([64]), name='b_2'),
    # Third convolution layer
    "w_3": tf.Variable(w_alpha * tf.random_normal([3, 3, 64, 128]), name='w_3'),
    "b_3": tf.Variable(b_alpha * tf.random_normal([128]), name='b_3'),
    # Fourth convolution layer
    "w_4": tf.Variable(w_alpha * tf.random_normal([3, 3, 128, 128]), name='w_4'),
    "b_4": tf.Variable(b_alpha * tf.random_normal([128]), name='b_4'),
    # Output layer weights and bias
    'out': tf.Variable(tf.random_normal([1024, CHAR_SET_LEN])),
    'out_add': tf.Variable(tf.random_normal([CHAR_SET_LEN])),
}


def batch_normal(wx_plus_b, out_size):
    """Apply batch normalization to a convolution output.

    Args:
        wx_plus_b: tensor to normalize.
        out_size: number of channels of ``wx_plus_b``; the learned scale and
            shift variables are created with this length.

    Returns:
        The normalized tensor.
    """
    # Axes to normalize over. [0] alone would be the batch dimension; for
    # image data [0, 1, 2] takes the mean/variance over batch, height and
    # width. The channel dimension must not be included.
    fc_mean, fc_var = tf.nn.moments(wx_plus_b, axes=[0, 1, 2])
    scale = tf.Variable(tf.ones([out_size]))
    shift = tf.Variable(tf.zeros([out_size]))
    epsilon = 0.001
    return tf.nn.batch_normalization(
        wx_plus_b, fc_mean, fc_var, shift, scale, epsilon)


X = tf.placeholder(tf.float32, [None, IMAGE_HEIGHT, IMAGE_WIDTH, 3])
Y = tf.placeholder(tf.float32, [None, MAX_CAPTCHA * CHAR_SET_LEN])
NOR = tf.placeholder(tf.float32)
keep_prob = tf.placeholder(tf.float32)  # dropout keep probability


def one_hot_n(x, n):
    """Convert integer label(s) ``x`` into one-hot row(s) of length ``n``."""
    x = np.array(x)
    return np.eye(n)[x]


def conv2d(conv, cd1, cd2, out_size, nor):
    """Build one convolution -> ReLU -> 2x2 max-pool -> dropout stage.

    Args:
        conv: input tensor.
        cd1: convolution kernel variable.
        cd2: bias variable.
        out_size: number of output channels (only needed by batch_normal).
        nor: batch-normalization switch; currently unused.

    Returns:
        The output tensor of the stage.
    """
    conv = tf.nn.bias_add(
        tf.nn.conv2d(conv, cd1, strides=[1, 1, 1, 1], padding='SAME'), cd2)
    # NOTE: batch_normal(conv, out_size) is intentionally not applied here.
    # A Python ``if nor > 1`` cannot branch on a placeholder value at graph
    # construction time; enabling it needs either a plain Python bool or
    # tf.cond.
    conv = tf.nn.relu(conv)
    conv = tf.nn.max_pool(
        conv, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    # Dropout to reduce overfitting.
    conv = tf.nn.dropout(conv, keep_prob)
    return conv


def crack_captcha_cnn():
    """Define the CNN and return the logits tensor for input ``X``."""
    # Four convolution + pooling stages.
    conv1 = conv2d(X, conv_dict['w_1'], conv_dict['b_1'], 32, NOR)
    conv2 = conv2d(conv1, conv_dict['w_2'], conv_dict['b_2'], 64, NOR)
    conv3 = conv2d(conv2, conv_dict['w_3'], conv_dict['b_3'], 128, NOR)
    conv4 = conv2d(conv3, conv_dict['w_4'], conv_dict['b_4'], 128, NOR)

    # Fully connected layer. Four 2x2 'SAME' poolings shrink each spatial
    # dimension to ceil(n / 16): 240 -> 15 and 320 -> 20.
    pooled_height = -(-IMAGE_HEIGHT // 16)
    pooled_width = -(-IMAGE_WIDTH // 16)
    flat_size = pooled_height * pooled_width * 128
    w_d = tf.Variable(w_alpha * tf.random_normal([flat_size, 1024]))
    b_d = tf.Variable(b_alpha * tf.random_normal([1024]))
    dense = tf.reshape(conv4, [-1, flat_size])
    dense = tf.nn.relu(tf.add(tf.matmul(dense, w_d), b_d))

    out = tf.add(tf.matmul(dense, conv_dict['out']), conv_dict['out_add'])
    return out


def read_and_decode(filename):
    """Read one (image, label) example from a TFRecords file.

    Args:
        filename: path of the TFRecords file.

    Returns:
        A tuple ``(img, label)``: ``img`` is a float32 tensor of shape
        [IMAGE_HEIGHT, IMAGE_WIDTH, 3] scaled to [-0.5, 0.5]; ``label`` is an
        int32 scalar tensor.
    """
    filename_queue = tf.train.string_input_producer([filename])
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(
        serialized_example,
        features={
            'label': tf.FixedLenFeature([], tf.int64),
            'img_raw': tf.FixedLenFeature([], tf.string),
        })
    img = tf.decode_raw(features['img_raw'], tf.uint8)
    img = tf.reshape(img, [IMAGE_HEIGHT, IMAGE_WIDTH, 3])
    # Normalize pixel values.
    img = tf.cast(img, tf.float32) * (1. / 255) - 0.5
    label = tf.cast(features['label'], tf.int32)
    return img, label


def train_crack_captcha_cnn():
    """Train the CNN until a sampled batch accuracy exceeds 0.5.

    Accuracy is checked every 50 steps on the batch just trained on. Once it
    exceeds 0.5 a checkpoint is saved and training stops.
    """
    output = crack_captcha_cnn()
    # Softmax cross-entropy fits a single-label result; sigmoid
    # cross-entropy would be the choice for multi-label results.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=output, labels=Y))
    # To speed up training the learning rate should ideally start large and
    # then decay; a fixed rate is used here.
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)

    # A prediction is correct when the index of the largest logit matches
    # the index of the one-hot label.
    correct_pred = tf.equal(tf.argmax(output, 1), tf.argmax(Y, 1))
    # Cast the booleans to floats and average them.
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    img, label = read_and_decode("anm_pic_train.tfrecords")
    img_batch, label_batch = tf.train.shuffle_batch(
        [img, label], batch_size=30, capacity=7000, min_after_dequeue=1000)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # To resume training, restore the latest checkpoint here with
        # saver.restore(sess, tf.train.latest_checkpoint(<checkpoint dir>)).
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)
        step = 0
        try:
            while True:
                imgs, labs = sess.run([img_batch, label_batch])
                # Convert in NumPy: building a tf.cast op here would add a
                # new node to the graph on every iteration.
                one_hot_labs = one_hot_n(labs, CHAR_SET_LEN).astype(np.float32)
                sess.run(
                    optimizer,
                    feed_dict={X: imgs, Y: one_hot_labs,
                               keep_prob: dropout, NOR: 1.})
                if step % 50 == 0:
                    acc = sess.run(
                        accuracy,
                        feed_dict={X: imgs, Y: one_hot_labs,
                                   keep_prob: 1., NOR: 1.})
                    print(step, acc)
                    if acc > 0.5:
                        saver.save(
                            sess, "crack_capcha.model", global_step=step)
                        print("Complete!!")
                        break
                step += 1
        finally:
            # Always stop the input threads, including on error or Ctrl-C.
            coord.request_stop()
            coord.join(threads)


if __name__ == "__main__":
    train_crack_captcha_cnn()
1. batch_normal is meant to prevent vanishing gradients, but I am not sure whether it should be placed before the activation, or how to write the if statement ...
2. This code was run on a CPU. Don't ask me why I use a CPU: I can't afford a GPU. If you have the means, it is best to run it on a GPU.
The result of running for half a day:
Recommended blog address:
http://blog.topspeedsnail.com
https://morvanzhou.github.io/tutorials/machine-learning/tensorflow/5-13-BN/