import argparse
import sys
import tensorflow as tf
import model
import icdar
import time
import numpy as np
import tensorflow as tf
from tensorflow.contrib import slim
# ---------------------------------------------------------------------------
# Command-line flags (TF 1.x `tf.app.flags`, absl-backed).
# Fixed: "resotre" typo in the restore flag's help text; filled in the
# previously empty help strings so --help output is useful.
# ---------------------------------------------------------------------------
tf.app.flags.DEFINE_integer('input_size', 512, 'Side length of the square crops fed to the network.')
tf.app.flags.DEFINE_integer('batch_size_per_gpu', 14, 'Number of images per GPU per step.')
tf.app.flags.DEFINE_integer('num_readers', 2, 'Parallel data-loading workers for icdar.get_batch.')
tf.app.flags.DEFINE_float('learning_rate', 0.0001, 'Adam learning rate.')
tf.app.flags.DEFINE_integer('max_steps', 100000, 'Maximum number of training steps.')
tf.app.flags.DEFINE_float('moving_average_decay', 0.997, 'Decay for variable moving averages.')
tf.app.flags.DEFINE_integer('num_gpus', 1, 'Number of GPUs on each worker.')
# NOTE(review): default is a single space -- almost certainly a placeholder
# path that must be overridden on the command line.
tf.app.flags.DEFINE_string('checkpoint_path', ' ', 'Directory in which checkpoints are written.')
tf.app.flags.DEFINE_boolean('restore', False, 'whether to restore from checkpoint')
tf.app.flags.DEFINE_integer('save_checkpoint_steps', 1000, 'Save a checkpoint every N steps.')
tf.app.flags.DEFINE_integer('save_summary_steps', 100, 'Write summaries every N steps.')
tf.app.flags.DEFINE_string('pretrained_model_path', None, 'Optional pretrained backbone checkpoint.')
tf.app.flags.DEFINE_enum('job_name', '', ('ps', 'worker', 'controller', ''),
                         'One of "ps", "worker", "controller", "". Empty for local '
                         'training')
tf.app.flags.DEFINE_string('ps_hosts', '', 'Comma-separated list of target hosts')
tf.app.flags.DEFINE_string('worker_hosts', '', 'Comma-separated list of target hosts')
tf.app.flags.DEFINE_integer('task_index', 0, 'Index of task within the job')

FLAGS = tf.app.flags.FLAGS
def tower_loss(images, score_maps, geo_maps, training_masks, reuse_variables=None):
    """Build one model tower and return its ``(total_loss, model_loss)``.

    NOTE(review): the body has been elided in this copy of the file --
    ``total_loss`` and ``model_loss`` are never assigned, so calling this
    as-is raises ``NameError``. The real implementation (building the
    network via ``model`` and computing the detection loss, presumably plus
    regularization) must be restored before training.
    """
    return total_loss, model_loss
def average_gradients(tower_grads):
    """Average per-tower gradients into a single (grad, var) list.

    NOTE(review): the body has been elided in this copy -- ``average_grads``
    is never assigned, so calling this raises ``NameError``. It is also never
    called from ``main`` below (which sums tower *losses* instead); either
    restore the implementation and use it, or remove the stub.
    """
    return average_grads
def main(argv=None):
    """Entry point for distributed (between-graph replicated) training.

    The role is selected by ``--job_name``:
      * ``"ps"``     -- join the server and block forever, hosting variables.
      * ``"worker"`` -- build the multi-GPU model graph, wrap the optimizer
        in ``SyncReplicasOptimizer``, and train in a
        ``MonitoredTrainingSession`` against ``server.target``.

    NOTE(review): original indentation was lost; block structure below is
    reconstructed from the code's logic -- confirm against the upstream file.
    """
    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")
    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    # Create and start a server for the local task.
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)
    if FLAGS.job_name == "ps":
        # Parameter servers only host variables; block here forever.
        server.join()
    elif FLAGS.job_name == "worker":
        # worker_device = "/job:worker/replica:0/task:%d/gpu:0" % FLAGS.task_index
        # Assigns ops to the local worker by default; variables are placed on
        # the PS by the replica_device_setter.
        with tf.device(tf.train.replica_device_setter(
                ps_device = "/job:ps/cpu:0",
                worker_device="/job:worker/replica:0/task:%d" % FLAGS.task_index,
                cluster=cluster)):
            global_step = tf.Variable(0, name="global_step", trainable=False)
            # Build model inputs; shapes are (batch, height, width, channels).
            input_images = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_images')
            input_score_maps = tf.placeholder(tf.float32, shape=[None, None, None, 1], name='input_score_maps')
            # NOTE(review): FLAGS.geometry is not defined in this file --
            # presumably registered by the imported `model` or `icdar`
            # module; verify, otherwise this line raises at runtime.
            if FLAGS.geometry == 'RBOX':
                # RBOX geometry: 4 distances + 1 angle per pixel.
                input_geo_maps = tf.placeholder(tf.float32, shape=[None, None, None, 5], name='input_geo_maps')
            else:
                # QUAD geometry: 8 coordinate offsets per pixel.
                input_geo_maps = tf.placeholder(tf.float32, shape=[None, None, None, 8], name='input_geo_maps')
            input_training_masks = tf.placeholder(tf.float32, shape=[None, None, None, 1], name='input_training_masks')
            # Split each input batch evenly across the local GPUs.
            gpus = range(0,FLAGS.num_gpus)
            input_images_split = tf.split(input_images, len(gpus))
            input_score_maps_split = tf.split(input_score_maps, len(gpus))
            input_geo_maps_split = tf.split(input_geo_maps, len(gpus))
            input_training_masks_split = tf.split(input_training_masks, len(gpus))
            num_workers = len(worker_hosts)
            opt = tf.train.AdamOptimizer(FLAGS.learning_rate)
            # Synchronous training: aggregate updates from all workers before
            # applying a single step to the shared variables.
            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=num_workers,
                total_num_replicas=num_workers,
                name="sync_replicas")
            # NOTE(review): despite its name, `tower_grads` accumulates the
            # *sum of tower losses* (a scalar), not per-tower gradients;
            # opt.minimize() below differentiates that summed loss. The
            # average_gradients() helper defined above is never used.
            tower_grads = 0
            reuse_variables = None
            for gpu_id in gpus:
                with tf.device("/job:worker/replica:0/task:%d/gpu:%d" % (FLAGS.task_index, gpu_id)):
                    # multi_gpu: one model tower per GPU; variables are shared
                    # after the first tower via reuse_variables.
                    with tf.name_scope('model_%d' % gpu_id) as scope:
                        i = gpu_id
                        # NOTE(review): placeholder debug message -- replace
                        # with something meaningful or remove.
                        tf.logging.info('aaaaaaaa')
                        iis = input_images_split[i]
                        isms = input_score_maps_split[i]
                        igms = input_geo_maps_split[i]
                        itms = input_training_masks_split[i]
                        total_loss, model_loss = tower_loss(iis, isms, igms, itms, reuse_variables)
                        # Keep only this tower's batch-norm moving-average
                        # update ops (overwritten each iteration, so only the
                        # last tower's updates are actually run).
                        batch_norm_updates_op = tf.group(*tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope))
                        reuse_variables = True
                        tower_grads+=total_loss
            apply_gradient_op = opt.minimize(tower_grads, global_step=global_step)
            # The StopAtStepHook handles stopping after running given steps.
            with tf.control_dependencies([apply_gradient_op, batch_norm_updates_op]):
                train_op = tf.no_op(name='no')
            # Chief-only hook that initializes the sync-replicas token queue.
            sync_replicas_hook = opt.make_session_run_hook((FLAGS.task_index == 0))
            # NOTE(review): last_step=10 stops the session after 10 global
            # steps, yet the inner loop below tries FLAGS.max_steps -- confirm
            # which limit is intended.
            hook = tf.train.StopAtStepHook(last_step=10)
            hooks = [hook,
                     sync_replicas_hook,
                     tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': total_loss}, every_n_iter=1)]
            # The MonitoredTrainingSession takes care of session initialization,
            # restoring from a checkpoint, saving to a checkpoint, and closing when done
            # or an error occurs.
            # NOTE(review): this Saver is never used -- MonitoredTrainingSession
            # manages its own; remove it or wire it in via a scaffold.
            saver = tf.train.Saver(tf.global_variables(),save_relative_paths=True)
            config = tf.ConfigProto()
            config.allow_soft_placement = True
            # NOTE(review): checkpoint_dir is a single space -- placeholder;
            # should presumably be FLAGS.checkpoint_path.
            with tf.train.MonitoredTrainingSession(master=server.target,
                                                   is_chief=(FLAGS.task_index == 0),
                                                   checkpoint_dir=" ",
                                                   hooks = hooks,
                                                   config = config) as mon_sess:
                data_generator = icdar.get_batch(num_workers=FLAGS.num_readers,
                                                 input_size=FLAGS.input_size,
                                                 batch_size=FLAGS.batch_size_per_gpu * len(gpus))
                while not mon_sess.should_stop():
                    # Run a training step asynchronously.
                    # See tf.train.SyncReplicasOptimizer for additional details on how to
                    # perform *synchronous* training.
                    # mon_sess.run handles AbortedError in case of preempted PS.
                    # NOTE(review): `start` is never read; the inner for-loop
                    # also bypasses should_stop() until a hook raises -- the
                    # nested while/for structure deserves a cleanup.
                    start = time.time()
                    for step in range(FLAGS.max_steps):
                        print(step)
                        # data: generator yields a tuple; indices 0/2/3/4 are
                        # images, score maps, geo maps, training masks --
                        # presumably index 1 is image metadata; verify against
                        # icdar.get_batch.
                        data = next(data_generator)
                        mon_sess.run([train_op], feed_dict={input_images: data[0],
                                                            input_score_maps: data[2],
                                                            input_geo_maps: data[3],
                                                            input_training_masks: data[4]})
if __name__ == "__main__":
    # Let the TF 1.x app wrapper parse the flags, then invoke main().
    tf.app.run(main=main)
# ---------------------------------------------------------------------------
# Issue log (was raw un-commented text, which is a syntax error in Python;
# converted to comments and translated):
# 1. with tf.device(tf.train.replica_device_setter(
#        ps_device="/job:ps/cpu:0",
#        worker_device="/job:worker/replica:0/task:%d" % FLAGS.task_index,
#        cluster=cluster)):
#    pay attention to how worker_device is set.
# 2. This program performs no explicit parameter initialization, while both
#    the benchmark and the distributed MNIST examples do (the benchmark in
#    particular is thorough) -- this should be improved next.
# 3. The global_step setup, etc.
# 4. tf.summary and related tooling should be used more.
# Update:
#    configure tf.logging first, then call tf.logging.info etc.
# If an expert happens to read this, advice is very welcome!
# ---------------------------------------------------------------------------