版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/lingyunxianhe/article/details/82913499
在进行机器学习或深度学习时,对于那种边训练边增加图片样本的情况,我们经常需要获知当前数据中各类样本的分布,并处理一些特殊情况(比如标注框面积小于指定阈值的标注等)。为此写了个简单程序,方便以后使用,特记录于此。由于程序简明扼要,有些python基础的童鞋都能看得懂,在此不多说。具体见如下代码:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
#2018/09/12 by DQ
import os
import sys
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
# Side length (in pixels) of the square whose area is the "small box" threshold.
BoxLenTol = 30
# Boxes with an area strictly below this value are counted as small.
BoxAreaTol = BoxLenTol * BoxLenTol
# Presumably [width, height] of the dataset images -- not used by the
# functions visible in this file; confirm before relying on it.
ImSize = [640, 480]
# Width that numeric image ids are zero-padded to (e.g. 000001).
fileIdLen = 6
# File extensions of image files and of annotation files.
ImExpName = '.jpg'
AnotExpName = '.xml'
# Class names that the statistics are collected for; any other class found
# in an annotation is ignored by the counting helpers.
ClsNameSet = ('blis', 'cosd', 'nake', 'break')
# Folder holding one annotation XML per image.
AnotFolder = '/home/KingMe/project/py-faster-rcnn/data/FABdevkit2017/FAB2017/Annotations'
# Folder holding the train/val/test image-id list files.
TrainValTestAssignFolder = '/home/KingMe/project/py-faster-rcnn/data/FABdevkit2017/FAB2017/ImageSets/Main'
# Subset name -> image-id list file inside TrainValTestAssignFolder.
TrainValTestFiles = {
    'train': 'train.txt',
    'val': 'val.txt',
    'test': 'test.txt',
}
##get object annotation bndbox loc start
def GetAnnotBoxLoc(AnotPath):
    """Read one annotation XML file and group its bounding boxes by class.

    Every ``<object>`` element directly under the root contributes one box,
    taken from its ``<bndbox>`` child (``xmin``, ``ymin``, ``xmax``,
    ``ymax``). 1 is subtracted from each coordinate read from the file.

    Args:
        AnotPath: Path of the annotation XML file.

    Returns:
        A plain dict mapping each object name to a list of
        ``[x1, y1, x2, y2]`` integer boxes, in document order. The dict is
        empty when the file contains no ``<object>`` element.

    Raises:
        IOError/OSError: If the file cannot be opened.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        AttributeError: If an object lacks ``name``, ``bndbox`` or one of
            the four coordinate tags.
        ValueError: If a coordinate is not an integer literal.
    """
    Root = ET.parse(AnotPath).getroot()
    ObjBndBoxSet = {}
    for Object in Root.findall('object'):
        ObjName = Object.find('name').text
        BndBox = Object.find('bndbox')
        BndBoxLoc = []
        for Tag in ('xmin', 'ymin', 'xmax', 'ymax'):
            BndBoxLoc.append(int(BndBox.find(Tag).text) - 1)
        # setdefault replaces the has_key/else branch, which only worked on
        # Python 2, and still returns an ordinary dict.
        ObjBndBoxSet.setdefault(ObjName, []).append(BndBoxLoc)
    return ObjBndBoxSet
##get object annotation bndbox loc end
def CalSampleNum(BoxSet, BoxNumSet):
    """Add the per-class box counts of one annotation to running totals.

    Args:
        BoxSet: Dict mapping class name to a list of boxes.
        BoxNumSet: Dict mapping class name to a running count. It is
            updated in place; classes that are not already keys of this
            dict are ignored.

    Returns:
        None.
    """
    # items() and "in" work on both Python 2 and 3, unlike
    # iteritems()/has_key().
    for Key, Val in BoxSet.items():
        if Key in BoxNumSet:
            BoxNumSet[Key] += len(Val)
# Count the labelled samples whose box area is small (these can occur in
# hand-labelled data; standard datasets should not contain them).
def CalSmallAreaSampleNum(BoxSet, SmallBoxNumSet, AreaTol=None):
    """Add the per-class number of small boxes of one annotation to totals.

    A box ``[x1, y1, x2, y2]`` is small when ``(x2 - x1) * (y2 - y1)`` is
    strictly less than the threshold.

    Args:
        BoxSet: Dict mapping class name to a list of ``[x1, y1, x2, y2]``
            boxes.
        SmallBoxNumSet: Dict mapping class name to a running count. It is
            updated in place; classes that are not already keys of this
            dict are ignored.
        AreaTol: Area threshold. When omitted (None) the module-level
            ``BoxAreaTol`` is used, which is the original behaviour.

    Returns:
        None.
    """
    if AreaTol is None:
        AreaTol = BoxAreaTol
    for Key, Val in BoxSet.items():
        if Key not in SmallBoxNumSet:
            continue
        for X1, Y1, X2, Y2 in (Box[:4] for Box in Val):
            if (X2 - X1) * (Y2 - Y1) < AreaTol:
                SmallBoxNumSet[Key] += 1
############################################
def GetTotalSampleNum():
    """Print per-class box statistics over every annotation in AnotFolder.

    Reads each file in ``AnotFolder`` whose name ends with ``AnotExpName``
    and prints: the number of annotation files, the total number of boxes
    per class, the mean number of boxes per file (rounded to 2 decimals),
    the small-box threshold, and the per-class counts of small and of
    remaining ("big") boxes. Only classes listed in ``ClsNameSet`` are
    counted.

    Returns:
        None.
    """
    # List the annotation files that actually exist instead of assuming
    # they are numbered 1..N without gaps; other directory entries are
    # neither counted nor opened.
    AnotNames = sorted(
        Name for Name in os.listdir(AnotFolder) if Name.endswith(AnotExpName)
    )
    AnotFileNum = len(AnotNames)

    TotalSampleNum = dict.fromkeys(ClsNameSet, 0)
    SmallBoxNumSet = dict.fromkeys(ClsNameSet, 0)
    for AnotName in AnotNames:
        AnotBoxSet = GetAnnotBoxLoc(os.path.join(AnotFolder, AnotName))
        CalSampleNum(AnotBoxSet, TotalSampleNum)
        CalSmallAreaSampleNum(AnotBoxSet, SmallBoxNumSet)

    MeanSampleNum = dict.fromkeys(ClsNameSet, 0)
    BigAreaSampleNum = dict.fromkeys(ClsNameSet, 0)
    for Key in ClsNameSet:
        # With no annotation files the mean stays 0 rather than dividing
        # by zero.
        if AnotFileNum:
            MeanSampleNum[Key] = round(TotalSampleNum[Key] * 1.0 / AnotFileNum, 2)
        BigAreaSampleNum[Key] = TotalSampleNum[Key] - SmallBoxNumSet[Key]

    # print(<single expression>) produces the same output on Python 2 and 3.
    print('ImNum=' + str(AnotFileNum))
    print('TotalSampleNum=' + str(TotalSampleNum))
    print('MeanSampleNum=' + str(MeanSampleNum))
    print('BoxAreaTol=' + str(BoxLenTol) + '*' + str(BoxLenTol))
    print('SmallAreaSampleNum=' + str(SmallBoxNumSet))
    print('BigAreaSampleNum=' + str(BigAreaSampleNum))
def GetTrainValTestSample(SampleNumSet, ImIdFilePath):
    """Accumulate per-class box counts for the images listed in one id file.

    Each non-blank line of ``ImIdFilePath`` is taken as an image id; the
    annotation ``<id> + AnotExpName`` inside ``AnotFolder`` is read and its
    boxes are added to ``SampleNumSet``. Afterwards the list file's base
    name and the number of images processed are written to stdout without
    a trailing newline, so the caller's next print continues the same line.

    Args:
        SampleNumSet: Dict mapping class name to a running count; updated
            in place.
        ImIdFilePath: Path of the text file holding one image id per line.

    Returns:
        None.
    """
    ImNum = 0
    with open(ImIdFilePath, 'r') as FId:
        for LineStr in FId:
            PureStr = LineStr.strip()
            if not PureStr:
                # A blank line (e.g. at the end of the file) would otherwise
                # resolve to the non-existent annotation ".xml".
                continue
            AnotFilePath = os.path.join(AnotFolder, PureStr + AnotExpName)
            CalSampleNum(GetAnnotBoxLoc(AnotFilePath), SampleNumSet)
            ImNum += 1
    FileName = os.path.basename(ImIdFilePath)
    # Same output as the Python 2 statement "print ...,": the text followed
    # by a single space and no newline, but valid on Python 3 as well.
    sys.stdout.write(FileName + ' ImageNum=' + str(ImNum) + ';' + ' ')
def GetTrainValTestSampleMain():
    """Print per-class box counts for each list file in TrainValTestFiles.

    For every file name in ``TrainValTestFiles`` a fresh counter over
    ``ClsNameSet`` is filled by ``GetTrainValTestSample`` and printed,
    prefixed with the file name without its extension.

    Returns:
        None.
    """
    # Only the file names are needed; values() also works on Python 3,
    # unlike iteritems().
    for FileName in TrainValTestFiles.values():
        ImIdFilePath = os.path.join(TrainValTestAssignFolder, FileName)
        SampleNumSet = dict.fromkeys(ClsNameSet, 0)
        GetTrainValTestSample(SampleNumSet, ImIdFilePath)
        # splitext drops the extension whatever its length; for the
        # configured "*.txt" names the result equals FileName[:-4].
        SubsetName = os.path.splitext(FileName)[0]
        print(SubsetName + 'SampleNumSet=' + str(SampleNumSet))
# Run the statistics only when executed as a script, so that importing this
# module (e.g. to reuse GetAnnotBoxLoc) does not scan the whole dataset.
if __name__ == '__main__':
    GetTotalSampleNum()
    GetTrainValTestSampleMain()