统计机器学习标注图片中各个类别的样本数以及检查特殊样本数量

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/lingyunxianhe/article/details/82913499

       在进行机器学习或深度学习时,对于那种边训练边增加图片样本的情况,我们经常需要获知目前数据中各类别样本的分布,并处理特殊情况(比如标注框面积小于指定阈值的标注等)。为此写了个简单程序,方便后面使用,特记录于此。由于程序简明扼要,有些python基础的童鞋都能看得懂,在此不多说。具体见如下代码:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
#2018/09/12 by DQ
import os
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

# Side length (in pixels) of the square that defines the "small box" limit.
BoxLenTol=30
# A box whose area is strictly below this value is counted as a small sample.
BoxAreaTol=BoxLenTol**2
# Not referenced by the visible code; presumably the image width/height -- TODO confirm.
ImSize=[640,480]
# Width of the zero-padded numeric annotation file ids (e.g. 000001).
fileIdLen=6
# Image file extension; not referenced by the visible code.
ImExpName='.jpg'
# Annotation file extension appended to every image id.
AnotExpName='.xml'
# Class names that are counted; objects with any other name are ignored.
ClsNameSet=('blis','cosd','nake','break')
# Folder that holds one annotation XML per image.
AnotFolder='/home/KingMe/project/py-faster-rcnn/data/FABdevkit2017/FAB2017/Annotations'
# Folder that holds the train/val/test image-id list files.
TrainValTestAssignFolder='/home/KingMe/project/py-faster-rcnn/data/FABdevkit2017/FAB2017/ImageSets/Main'
# Split name -> id list file name inside TrainValTestAssignFolder.
TrainValTestFiles={'train':'train.txt','val':'val.txt','test':'test.txt'}

def GetAnnotBoxLoc(AnotPath):
    """Read one annotation XML file and group its bounding boxes by class.

    The file is expected to contain ``<object>`` elements, each with a
    ``<name>`` child and a ``<bndbox>`` child holding ``xmin``, ``ymin``,
    ``xmax`` and ``ymax``.

    Args:
        AnotPath: path of the annotation ``.xml`` file.

    Returns:
        dict mapping every object name found in the file to a list of
        ``[x1, y1, x2, y2]`` boxes. Each coordinate is the integer stored
        in the XML minus 1.

    Raises:
        IOError / OSError: the file cannot be opened.
        xml.etree.ElementTree.ParseError: the file is not well-formed XML.
        AttributeError: an ``<object>`` lacks ``<name>``, ``<bndbox>`` or
            one of the four coordinate elements.
        ValueError: a coordinate is not an integer literal.
    """
    Root = ET.parse(AnotPath).getroot()
    ObjBndBoxSet = {}
    for Object in Root.findall('object'):
        ObjName = Object.find('name').text
        BndBox = Object.find('bndbox')
        # Same order as the original: [xmin, ymin, xmax, ymax], each minus 1.
        BndBoxLoc = [int(BndBox.find(Tag).text) - 1
                     for Tag in ('xmin', 'ymin', 'xmax', 'ymax')]
        # setdefault replaces the removed dict.has_key() branch.
        ObjBndBoxSet.setdefault(ObjName, []).append(BndBoxLoc)
    return ObjBndBoxSet


def CalSampleNum(BoxSet,BoxNumSet):
    """Add the per-class box counts of one annotation to running totals.

    Args:
        BoxSet: dict mapping class name -> list of boxes.
        BoxNumSet: dict mapping class name -> running count. It is updated
            in place; classes that are not already keys of this dict are
            ignored.

    Returns:
        None.
    """
    for Key, Val in BoxSet.items():
        if Key in BoxNumSet:
            BoxNumSet[Key] += len(Val)

# Count the small-area boxes among the labelled samples (hand-made labels
# may contain such boxes; a standard dataset should not).
def CalSmallAreaSampleNum(BoxSet, SmallBoxNumSet, AreaTol=None):
    """Add the number of small-area boxes of one annotation to running totals.

    Args:
        BoxSet: dict mapping class name -> list of ``[x1, y1, x2, y2]`` boxes.
        SmallBoxNumSet: dict mapping class name -> running count of small
            boxes. It is updated in place; classes that are not already
            keys of this dict are ignored.
        AreaTol: area threshold; a box counts as small when
            ``(x2 - x1) * (y2 - y1)`` is strictly below it. Defaults to the
            module-level ``BoxAreaTol`` when omitted.

    Returns:
        None.
    """
    if AreaTol is None:
        # Looked up at call time so existing two-argument callers keep
        # using the module-wide threshold.
        AreaTol = BoxAreaTol
    for Key, Val in BoxSet.items():
        if Key not in SmallBoxNumSet:
            continue
        for Box in Val:
            X1, Y1, X2, Y2 = Box[:4]
            if (X2 - X1) * (Y2 - Y1) < AreaTol:
                SmallBoxNumSet[Key] += 1



############################################
def GetTotalSampleNum():
    """Print per-class sample statistics over every annotation in AnotFolder.

    Every file in ``AnotFolder`` whose name ends with ``AnotExpName`` is
    parsed. The following are then printed: the number of annotation
    files, the total box count per class, the mean box count per
    annotation file (rounded to 2 decimals), the small-box threshold, and
    the per-class counts of small and of remaining ("big") boxes.

    The actual directory listing is used instead of assuming consecutively
    numbered files, so gaps in the ids and unrelated files in the folder
    no longer break the run or distort the image count.

    Returns:
        None. The results are only printed.
    """
    AnotNames = sorted(Name for Name in os.listdir(AnotFolder)
                       if Name.endswith(AnotExpName))
    AnotFileNum = len(AnotNames)
    TotalSampleNum = dict.fromkeys(ClsNameSet, 0)
    SmallBoxNumSet = dict.fromkeys(ClsNameSet, 0)

    for AnotName in AnotNames:
        AnotBoxSet = GetAnnotBoxLoc(os.path.join(AnotFolder, AnotName))
        CalSampleNum(AnotBoxSet, TotalSampleNum)
        CalSmallAreaSampleNum(AnotBoxSet, SmallBoxNumSet)

    MeanSampleNum = dict.fromkeys(ClsNameSet, 0)
    BigAreaSampleNum = dict.fromkeys(ClsNameSet, 0)
    for Key, Val in TotalSampleNum.items():
        # With no annotation files the mean stays 0 instead of dividing by zero.
        if AnotFileNum:
            MeanSampleNum[Key] = round(Val * 1.0 / AnotFileNum, 2)
        BigAreaSampleNum[Key] = Val - SmallBoxNumSet[Key]

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('ImNum=' + str(AnotFileNum))
    print('TotalSampleNum=' + str(TotalSampleNum))
    print('MeanSampleNum=' + str(MeanSampleNum))
    print('BoxAreaTol=' + str(BoxLenTol) + '*' + str(BoxLenTol))
    print('SmallAreaSampleNum=' + str(SmallBoxNumSet))
    print('BigAreaSampleNum=' + str(BigAreaSampleNum))



def GetTrainValTestSample(SampleNumSet,ImIdFilePath):
    """Accumulate per-class box counts for the images listed in one id file.

    Each non-blank line of ``ImIdFilePath`` is taken as an image id; the
    annotation ``<id> + AnotExpName`` inside ``AnotFolder`` is parsed and
    its boxes are added to ``SampleNumSet``. Afterwards
    ``"<list file name> ImageNum=<count>; "`` is written to stdout without
    a newline, so the caller can finish the line.

    Args:
        SampleNumSet: dict mapping class name -> running count, updated in
            place.
        ImIdFilePath: path of the text file with one image id per line.

    Returns:
        None.
    """
    # Function-scope import keeps this block self-contained.
    import sys

    ImageNum = 0
    with open(ImIdFilePath, 'r') as FId:
        for LineStr in FId:
            PureStr = LineStr.strip()
            if not PureStr:
                # A blank line would otherwise resolve to the bogus path ".xml".
                continue
            AnotFilePath = os.path.join(AnotFolder, PureStr + AnotExpName)
            CalSampleNum(GetAnnotBoxLoc(AnotFilePath), SampleNumSet)
            ImageNum += 1
    FileName = os.path.basename(ImIdFilePath)
    # Equivalent of the Python 2 "print text," form: no newline, and one
    # space separating it from whatever is printed next.
    sys.stdout.write(FileName + ' ImageNum=' + str(ImageNum) + '; ')


def GetTrainValTestSampleMain():
    """Print the per-class box counts of every split in TrainValTestFiles.

    For each id list file named in ``TrainValTestFiles`` a fresh counter
    over ``ClsNameSet`` is filled by ``GetTrainValTestSample`` and then
    printed as ``"<file name without extension>SampleNumSet=<counts>"``.

    Returns:
        None. The results are only printed.
    """
    for FileName in TrainValTestFiles.values():
        ImIdFilePath = os.path.join(TrainValTestAssignFolder, FileName)
        SampleNumSet = dict.fromkeys(ClsNameSet, 0)
        GetTrainValTestSample(SampleNumSet, ImIdFilePath)
        # splitext drops the real extension, whatever its length.
        SplitName = os.path.splitext(FileName)[0]
        print(SplitName + 'SampleNumSet=' + str(SampleNumSet))


# Script entry: first print the statistics over the whole annotation folder,
# then the per-class counts of each train/val/test split.
GetTotalSampleNum()
GetTrainValTestSampleMain()

猜你喜欢

转载自blog.csdn.net/lingyunxianhe/article/details/82913499