如何将原始数据集分为训练集与测试集

我们拿到的通常是一个尚未划分的完整数据集,往往需要将其拆分为训练集与测试集,拆分代码如下。该代码从原始数据集中随机取出一定比例的图像及其标签,移动到测试集文件夹中,剩下的数据即可作为训练集使用。当然,也可以在此代码的基础上稍作修改,改为取出训练集。

import os
import shutil
import random

'''
image_path:原始数据集图像文件夹路径
label_path:原始数据集标签文件夹路径
test_image_path:测试集图像文件夹路径
test_label_path:测试集标签文件夹路径
test_percent:测试集数据占比,默认0.2,一般数据拆分时测试集与训练集是2:8
'''

def split_train_test_data(image_path, label_path, test_image_path, test_label_path, test_percent=0.2):
    """Move a random fraction of image/label pairs into test-set folders.

    Only ``<name>.txt`` label files that have a matching ``<name>.jpg`` image
    are treated as samples. Stray files, sub-directories and labels without
    an image are ignored, so the split cannot abort halfway and leave the two
    test folders out of sync. Whatever remains in ``image_path`` and
    ``label_path`` afterwards can be used as the training set.

    Args:
        image_path: Folder containing the original ``.jpg`` images.
        label_path: Folder containing the original ``.txt`` label files.
        test_image_path: Destination folder for test images (created if
            missing).
        test_label_path: Destination folder for test labels (created if
            missing).
        test_percent: Fraction of the valid pairs to move, in ``[0, 1]``.
            The resulting count is truncated towards zero.

    Raises:
        ValueError: If ``test_percent`` is outside ``[0, 1]``.
    """
    if not 0 <= test_percent <= 1:
        raise ValueError(
            'test_percent must be between 0 and 1, got {!r}'.format(test_percent)
        )

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(test_image_path, exist_ok=True)
    os.makedirs(test_label_path, exist_ok=True)

    # Collect only complete (image, label) pairs. The listing is sorted so
    # that the shuffle below is reproducible when the caller seeds `random`.
    pairs = []
    for label_name in sorted(os.listdir(label_path)):
        stem, ext = os.path.splitext(label_name)
        if ext != '.txt':
            continue
        label_file = os.path.join(label_path, label_name)
        image_file = os.path.join(image_path, '{}.jpg'.format(stem))
        if os.path.isfile(label_file) and os.path.isfile(image_file):
            pairs.append((image_file, label_file))

    random.shuffle(pairs)

    # Size the test set from the usable pairs, not from the raw listing.
    test_count = int(test_percent * len(pairs))
    for image_file, label_file in pairs[:test_count]:
        shutil.move(image_file, test_image_path)
        shutil.move(label_file, test_label_path)
    print('split complete,test_image number=%d' % test_count)

猜你喜欢

转载自blog.csdn.net/m0_54361461/article/details/127740789