我的猫狗大战数据集图片缺失处理

前面找了一份540M的猫狗大战数据集,想用它在小型数据集上从头开始训练一个卷积神经网络,只使用其中的2500个样本。这个猫狗大战数据集总共有25000张图片,所以在前面2500张图片有缺失的时候,我就自己从后面的数据中拷贝图片补齐前面的。但是发现缺失的图片比较多,手动去查找太麻烦,所以干脆用代码判断一下文件是否存在,写代码来解决这个问题。

其实这个代码比较简单,但是考虑到前面自己在CSDN上分享过这个数据集(https://download.csdn.net/download/lxiao428/10747658),不想坑别人,就把代码贴出来,供参考。如果想用全部的数据集作训练,就不能从后面拷贝图片了,但是应该可以自己去搜集图片来补全缺失的部分:只要将这一部分的代码修改一下,打印出其中缺失的index,然后对应地补全,免得自己一个个去找,还是有一定的小作用的。

代码如下:

# -*- coding: utf-8 -*-
"""
Created on Sat Oct 27 16:25:41 2018

@author: Lxiao217
"""
import os, shutil
# Directory holding the full cats-vs-dogs training set, and the root of the
# smaller train/validation/test subset that this script builds from it.
original_dataset_dir = 'F:\\python\\DeepLearning\\train'
base_dir = 'F:\\python\\DeepLearning\\cats_and_dogs_small'

# One directory per split ...
train_dir = os.path.join(base_dir, 'train')
test_dir = os.path.join(base_dir, 'test')
validation_dir = os.path.join(base_dir, 'validation')

# ... and one sub-directory per class inside each split.
train_cats_dir = os.path.join(train_dir, 'cats')
train_dogs_dir = os.path.join(train_dir, 'dogs')
validation_cats_dir = os.path.join(validation_dir, 'cats')
validation_dogs_dir = os.path.join(validation_dir, 'dogs')
test_cats_dir = os.path.join(test_dir, 'cats')
test_dogs_dir = os.path.join(test_dir, 'dogs')

# exist_ok=True makes re-running the script harmless and avoids the
# check-then-create race of a separate os.path.exists() test; makedirs also
# creates any missing parent directory.
for _directory in (
    base_dir,
    train_dir,
    test_dir,
    validation_dir,
    train_cats_dir,
    train_dogs_dir,
    validation_cats_dir,
    validation_dogs_dir,
    test_cats_dir,
    test_dogs_dir,
):
    os.makedirs(_directory, exist_ok=True)

# When '<label>.<index>.jpg' is missing from the original dataset, the picture
# '<label>.<index + FALLBACK_INDEX_OFFSET>.jpg' is copied in its place.
FALLBACK_INDEX_OFFSET = 6666


def _copy_image_range(src_dir, dst_dir, label, indexes,
                      offset=FALLBACK_INDEX_OFFSET):
    """Copy ``<label>.<index>.jpg`` from *src_dir* to *dst_dir* for each index.

    If a picture is missing from *src_dir*, the picture whose index is
    ``index + offset`` is used instead; the copy is still written to
    *dst_dir* under the originally requested file name.

    Args:
        src_dir: Directory containing the original pictures.
        dst_dir: Existing directory the pictures are copied into.
        label: File-name prefix, e.g. ``'cat'`` or ``'dog'``.
        indexes: Iterable of integer picture indexes to copy.
        offset: Index offset of the substitute picture.

    Raises:
        FileNotFoundError: If neither the requested picture nor its
            substitute exists in *src_dir*.
    """
    for index in indexes:
        fname = '{}.{}.jpg'.format(label, index)
        src = os.path.join(src_dir, fname)
        if not os.path.exists(src):
            substitute = os.path.join(
                src_dir, '{}.{}.jpg'.format(label, index + offset))
            if not os.path.exists(substitute):
                # Name both files so it is clear which index needs to be
                # filled in by hand.
                raise FileNotFoundError(
                    'missing picture {!r} and its substitute {!r}'.format(
                        src, substitute))
            src = substitute
        shutil.copyfile(src, os.path.join(dst_dir, fname))


# (label, picture indexes, destination directory) for every split:
# 1000 training, 500 validation and 500 test pictures per class.
for _label, _indexes, _dst_dir in (
    ('cat', range(1000), train_cats_dir),
    ('cat', range(1000, 1500), validation_cats_dir),
    ('cat', range(1500, 2000), test_cats_dir),
    ('dog', range(1000), train_dogs_dir),
    ('dog', range(1000, 1500), validation_dogs_dir),
    ('dog', range(1500, 2000), test_dogs_dir),
):
    _copy_image_range(original_dataset_dir, _dst_dir, _label, _indexes)

猜你喜欢

转载自blog.csdn.net/lxiao428/article/details/83479379