前面 找了一份540M的猫狗大战的数据集,想使用这个数据集在小型数据集上从头开始训练一个卷积神经网络,使用了其中的2500个样本,这个猫狗大战的数据集总的是25000张图片,所以在前面2500张图片缺失的时候我就自己从后面的数据集中拷贝图片补齐前面的,但是发现缺失图片比较多,手动去查找太麻烦,所以干脆还是判断一下文件是不是存在的,写代码来解决这个问题。
其实这个代码比较简单,但是考虑到前面自己在CSDN上分享过这个数据集(https://download.csdn.net/download/lxiao428/10747658),不想坑别人,就把代码贴出来,供参考。如果想用全部的数据集作训练,不能从后面拷图片,但是应该可以自己去搜集图片来补全缺失,只要将这一部分的代码修改一下,打印出来其中缺失的index,然后去对应地补全,免得自己去找,还是有一定的小作用的。
代码如下:
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 27 16:25:41 2018
@author: Lxiao217

Build the small cats-vs-dogs dataset layout used by the "training a convnet
from scratch on a small dataset" exercise:

    cats_and_dogs_small/
        train/cats, train/dogs          (index 0-999)
        validation/cats, validation/dogs (index 1000-1499)
        test/cats, test/dogs            (index 1500-1999)

Some pictures are missing from the source dump; whenever
'<prefix>.<i>.jpg' does not exist, the picture '<prefix>.<i+6666>.jpg'
from later in the full 25000-image dataset is copied in its place (the
destination keeps the original name so the directory stays contiguous).
"""
import os, shutil

original_dataset_dir = 'F:\\python\\DeepLearning\\train'
base_dir = 'F:\\python\\DeepLearning\\cats_and_dogs_small'

# Offset added to a missing picture's index to pick a substitute from the
# unused tail of the full dataset.
MISSING_INDEX_OFFSET = 6666


def _ensure_dir(path):
    """Create directory *path* if it does not already exist; return it."""
    if not os.path.exists(path):
        os.mkdir(path)
    return path


def copy_images(src_dir, dst_dir, prefix, start, stop,
                offset=MISSING_INDEX_OFFSET):
    """Copy '<prefix>.<i>.jpg' for i in [start, stop) from src_dir to dst_dir.

    If a source picture is missing, the picture with index i + offset is
    copied instead, but the destination file keeps the original name.

    Raises FileNotFoundError if neither the original nor the substitute
    picture exists.
    """
    for i in range(start, stop):
        fname = '{}.{}.jpg'.format(prefix, i)
        src = os.path.join(src_dir, fname)
        if not os.path.exists(src):
            # Substitute a picture from later in the dataset; build the
            # fallback name directly from the index instead of re-parsing
            # the filename.
            src = os.path.join(src_dir,
                               '{}.{}.jpg'.format(prefix, i + offset))
        dst = os.path.join(dst_dir, fname)
        shutil.copyfile(src, dst)


def main():
    """Create the directory tree and fill each split for both classes."""
    _ensure_dir(base_dir)
    # split name -> [start, stop) range of picture indices
    splits = (
        ('train', 0, 1000),
        ('validation', 1000, 1500),
        ('test', 1500, 2000),
    )
    for split_name, start, stop in splits:
        split_dir = _ensure_dir(os.path.join(base_dir, split_name))
        for prefix in ('cat', 'dog'):
            # directory names are plural ('cats'), file prefixes singular
            class_dir = _ensure_dir(os.path.join(split_dir, prefix + 's'))
            copy_images(original_dataset_dir, class_dir, prefix, start, stop)


if __name__ == '__main__':
    main()