1.训练集的数据量已经足够了,所以我只下载了训练集,下载地址:
2.数据集中图片类别是用wordnet编码进行命名的,wordnet编码与实际的语义映射,参考以下教程:
3.对下载好的数据集图片进行缩小,划分训练/验证/测试集
# Build a class-wise train/val/test split from per-class tar archives.
#
# Steps, all relative to the current working directory:
#   1. Extract every ``*.tar`` archive into ``imagenet2012/<archive name>/``.
#   2. Resize every extracted image to 84x84, overwriting it in place.
#   3. Randomly assign whole class directories to ``train`` (64%),
#      ``val`` (16%) and ``test`` (the remainder).
#   4. Zip the resulting ``imagenet2012`` tree into ``imagenet2012.zip``.
#
# Requires Pillow plus the external ``tar`` and ``zip`` command-line tools.
import glob
import os
import random
import shutil
import subprocess

from PIL import Image

# Root directory that receives the extracted archives and, later, the splits.
uncompress_path = 'imagenet2012'
os.makedirs(uncompress_path, exist_ok=True)

# Extract each archive into its own directory, then shrink its images.
all_tars = glob.glob('*.tar')
for tar_file in all_tars:
    print('uncompress '+tar_file+' ...')
    # splitext only strips the final extension, so archive names that
    # contain additional dots no longer break the unpacking below.
    s1, _ = os.path.splitext(tar_file)
    class_dir = os.path.join(uncompress_path, s1)
    os.makedirs(class_dir, exist_ok=True)
    # Argument list (no shell) keeps odd file names intact; check=True stops
    # the script instead of carrying on after a failed extraction.
    subprocess.run(['tar', '-xf', tar_file, '-C', class_dir], check=True)

    for image_file in glob.glob(os.path.join(class_dir, '*')):
        # resize() returns a new image, so the source file can be closed
        # before it is overwritten.
        with Image.open(image_file) as im:
            resized = im.resize((84, 84), resample=Image.LANCZOS)
        # TODO: rename image_file
        resized.save(image_file)

# Collect the class directories. The split directories themselves (left over
# from a previous run) and stray regular files are not classes.
split_names = ('train', 'val', 'test')
all_classes = [
    path for path in glob.glob(os.path.join(uncompress_path, '*'))
    if os.path.isdir(path) and os.path.basename(path) not in split_names
]
all_classes_num = len(all_classes)
train_classes_num = int(all_classes_num*0.64)
val_classes_num = int(all_classes_num*0.16)
val_end = train_classes_num + val_classes_num

# One shuffle followed by three disjoint slices gives a random partition;
# the test split takes every class that train and val did not.
random.shuffle(all_classes)
splits = (
    # (directory name, label used in the progress message, classes);
    # the 'trian_directory' spelling is the script's existing output text.
    ('train', 'trian_directory', all_classes[:train_classes_num]),
    ('val', 'val_directory', all_classes[train_classes_num:val_end]),
    ('test', 'test_directory', all_classes[val_end:]),
)
for split_name, label, split_classes in splits:
    split_dir = os.path.join(uncompress_path, split_name)
    os.makedirs(split_dir, exist_ok=True)
    for class_path in split_classes:
        print('mv '+class_path+' to '+label+'...')
        # split_dir exists, so the class directory is moved inside it.
        shutil.move(class_path, split_dir)

# Package the whole tree; a failing zip now raises instead of being ignored.
print('compress result...')
subprocess.run(
    ['zip', '-r', uncompress_path+'.zip', uncompress_path],
    check=True,
)

print('proc success!!!')