Speech Emotion Recognition --- Feature Extraction

1. First, reading in the speech signal:

       First, know that speech data commonly comes in two forms: *.txt text files and *.wav audio files.

       Why would there be *.txt files?

       This is easy to understand: a *.wav file can be viewed as a one-dimensional time signal; after sampling, it becomes a sequence of discrete points, which is exactly the pile of numbers a *.txt file stores. Next, let's look at two ways of reading a speech signal (note that MATLAB code is used here):

Example 1: (*.txt -- read the sample points directly)

fid=fopen('happy.txt','rt');     % open the text file
y=fscanf(fid,'%f'); fclose(fid); % read all sample values into a column vector, then close

Example 2: (*.wav)

[y,fs,nbits]=wavread('happy.wav');

Note: wavread() is only available in older MATLAB releases (such as R2010); it has been removed from later versions (around 2017 it no longer worked in up-to-date installations), so audioread() must be used instead. However, audioread() returns only two outputs; see Example 3.

Example 3: (*.wav)

[y, fs]=audioread('happy.wav');

Note: the bits-per-sample value (bit depth) can be found in the detailed properties of the *.wav file. MATLAB's continual updates are clearly aimed at greater simplicity and speed.
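If you need nbits programmatically after switching to audioread(), MATLAB's audioinfo() reports it; a minimal sketch (assuming happy.wav is on the MATLAB path):

info=audioinfo('happy.wav');   % query the file's metadata
nbits=info.BitsPerSample;      % bit depth, e.g. 16 for standard PCM wav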


Among the output arguments, y holds the sample points and fs is the sampling rate.
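These two outputs already suffice to rebuild the time axis and listen to the file; a quick sanity-check sketch:

t=(0:length(y)-1)/fs;          % time axis in seconds
plot(t, y); xlabel('Time (s)'); ylabel('Amplitude');
sound(y, fs);                  % play the signal back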


2. Next comes preprocessing (endpoint detection, pre-emphasis, framing and windowing)

Two key functions are used here:

①epdByVolZcr.m

function [epInSampleIndex, epInFrameIndex, soundSegment, zeroOneVec, volume] = epdByVolZcr(y, fs, nbits, epdParam, plotOpt)
% epdByVolZcr: EPD based on volume and zero-crossing rate
%	Usage: [epInSampleIndex, epInFrameIndex, soundSegment, zeroOneVec, volume] = epdByVolZcr(y, fs, nbits, epdParam, plotOpt)
%		epInSampleIndex: two-element end-points in sample index
%		epInFrameIndex: two-element end-points in frame index
%		soundSegment: resulting sound segments
%		zeroOneVec: zero-one vector for each frame
%		volume: volume
%		y: input audio signals
%		fs: sampling rate
%		epdParam: parameters for EPD
%		plotOpt: 0 for silent operation, 1 for plotting
%
%	Example:
%		waveFile='SingaporeIsAFinePlace.wav';
%		[y, fs, nbits]=wavReadInt(waveFile);
%		epdParam=epdParamSet(fs);
%		plotOpt=1;
%		out=epdByVolZcr(y, fs, nbits, epdParam, plotOpt);

%	Roger Jang, 20040413, 20070320

if nargin<1, selfdemo; return; end
if nargin<2, fs=16000; end
if nargin<3, nbits=16; end
if nargin<4 || isempty(epdParam), epdParam=epdParamSet(fs); end	% || short-circuits so epdParam is not referenced when missing
if nargin<5, plotOpt=0; end

if size(y, 2)~=1, error('Wave is not mono!'); end

frameSize=epdParam.frameSize;
overlap=epdParam.overlap;
minSegment=round(epdParam.minSegment*fs/(frameSize-overlap));
maxSilBetweenWord=round(epdParam.maxSilBetweenWord*fs/(frameSize-overlap));
%minLastWordDuration=round(epdParam.minLastWordDuration*fs/(frameSize-overlap));

y = double(y);					% convert to double data type
frameMat=buffer2(y, frameSize, overlap);	% frame blocking
frameMat=frameZeroMean(frameMat, 2);
frameNum=size(frameMat, 2);			% no. of frames
volume=frame2volume(frameMat, 1);		% compute volume
temp=sort(volume);
index=round(frameNum/32); if index==0, index=1; end
volMin=temp(index);
volMax=temp(frameNum-index+1);			% use a near-max value to avoid outliers
volTh1=(volMax-volMin)/epdParam.volRatio+volMin;	% compute volume threshold
volTh2=(volMax-volMin)/epdParam.volRatio2+volMin;	% compute volume threshold

% ====== Identify voiced parts whose volume is larger than volTh1
soundSegment=findSegment(volume>volTh1);

% ====== Compute ZCR
[minVol, index]=min(volume);
shiftAmount=epdParam.zcrShiftGain*max(abs(frameMat(:,index)));		% shiftAmount is equal to epdParam.zcrShiftGain times the max. abs. sample within the frame of min. volume
shiftAmount=max(shiftAmount, 2);	
zcr=frame2zcr(frameMat, 1, shiftAmount);
zcrTh=max(zcr)*epdParam.zcrRatio;

% ====== Expansion 1: Expand end points to volume level1 (lower level)
for i=1:length(soundSegment),
	head = soundSegment(i).begin;
	while (head-1)>=1 && volume(head-1)>=volTh1,	% && avoids evaluating volume(0)
		head=head-1;
	end
	soundSegment(i).begin = head;
	tail = soundSegment(i).end;
	while (tail+1)<=length(volume) && volume(tail+1)>=volTh1,
		tail=tail+1;
	end
	soundSegment(i).end = tail;
end
% ====== Expansion 2: Expand end points to include high zcr region
for i=1:length(soundSegment),
	head = soundSegment(i).begin;
	while (head-1)>=1 && zcr(head-1)>zcrTh			% Extend at beginning
		head=head-1;
	end
	soundSegment(i).begin = head;
	tail = soundSegment(i).end;
	while (tail+1)<=length(zcr) && zcr(tail+1)>zcrTh	% Extend at ending
		tail=tail+1;
	end
	soundSegment(i).end = tail;
end

% ====== Delete repeated sound segments
index = [];
for i=1:length(soundSegment)-1,
	if soundSegment(i).begin==soundSegment(i+1).begin && soundSegment(i).end==soundSegment(i+1).end,
		index=[index, i];
	end
end
soundSegment(index) = [];

% ====== Delete short sound clips
index = [];
for i=1:length(soundSegment),
	if soundSegment(i).duration<=minSegment
		index = [index, i];
	end
end
soundSegment(index) = [];

zeroOneVec=0*volume;
for i=1:length(soundSegment)
	for j=soundSegment(i).begin:soundSegment(i).end
		zeroOneVec(j)=1;
	end
end

if isempty(soundSegment)
	epInSampleIndex=[];
	epInFrameIndex=[];
	fprintf('Warning: No sound segment found in %s.m.\n', mfilename);
else
	epInFrameIndex=[soundSegment(1).begin, soundSegment(end).end];
	epInSampleIndex=frame2sampleIndex(epInFrameIndex, frameSize, overlap);		% conversion from frame index to sample index
	for i=1:length(soundSegment),
		soundSegment(i).beginSample = frame2sampleIndex(soundSegment(i).begin, frameSize, overlap);
		soundSegment(i).endSample   = min(length(y), frame2sampleIndex(soundSegment(i).end, frameSize, overlap));
		soundSegment(i).beginFrame = soundSegment(i).begin;
		soundSegment(i).endFrame = soundSegment(i).end;
	end
	soundSegment=rmfield(soundSegment, 'begin');
	soundSegment=rmfield(soundSegment, 'end');
	soundSegment=rmfield(soundSegment, 'duration');
end

% Plotting...
if plotOpt,
	axes1H=subplot(4,1,1);
	time=(1:length(y))/fs;
	plot(time, y);
	axisLimit=[min(time) max(time) -2^nbits/2, 2^nbits/2];
	if -1<=min(y) && max(y)<=1
		axisLimit=[min(time) max(time) -1, 1];
	end
	axis(axisLimit);
	ylabel('Amplitude'); title('Waveform'); grid on
	% Plot end points
	yBound=axisLimit(3:4);
	for i=1:length(soundSegment),
		line(frame2sampleIndex(soundSegment(i).beginFrame, frameSize, overlap)/fs*[1,1], yBound, 'color', 'm');
		line(frame2sampleIndex(  soundSegment(i).endFrame, frameSize, overlap)/fs*[1,1], yBound, 'color', 'g');
	end

	axes2H=subplot(4,1,2);
	frameTime = frame2sampleIndex(1:frameNum, frameSize, overlap)/fs;
	plot(frameTime, volume, '.-');
	line([min(frameTime), max(frameTime)], volTh1*[1 1], 'color', 'r');
	line([min(frameTime), max(frameTime)], volTh2*[1 1], 'color', 'r');
	axis tight
	ylabel('Volume'); title('Volume'); grid on
	% Plot end points
	yBound = [min(volume) max(volume)];
	for i=1:length(soundSegment),
		line(frame2sampleIndex(soundSegment(i).beginFrame, frameSize, overlap)/fs*[1,1], yBound, 'color', 'm');
		line(frame2sampleIndex(  soundSegment(i).endFrame, frameSize, overlap)/fs*[1,1], yBound, 'color', 'g');
	end

	axes3H=subplot(4,1,3);
	plot(frameTime, zcr, '.-');
	line([min(frameTime), max(frameTime)], zcrTh*[1 1], 'color', 'c');
	axis([min(frameTime), max(frameTime), 0, max(zcr)]);
	ylabel('ZCR'); title('Zero crossing rate'); grid on
	% Plot end points
	yBound = [0 max(zcr)];
	for i=1:length(soundSegment),
		line(frame2sampleIndex(soundSegment(i).beginFrame, frameSize, overlap)/fs*[1,1], yBound, 'color', 'm');
		line(frame2sampleIndex(  soundSegment(i).endFrame, frameSize, overlap)/fs*[1,1], yBound, 'color', 'g');
	end

	%axes4H=subplot(4,1,4);
	%voicedIndex=epInSampleIndex(1):epInSampleIndex(2);
	%voicedTime=time(voicedIndex);
	%voicedY=y(voicedIndex);
	%voicedH=plot(voicedTime, voicedY);
	%axis([time(epInSampleIndex(1)), time(epInSampleIndex(2)), -2^nbits/2, 2^nbits/2]);
	%ylabel('Amplitude'); title('Voiced waveform'); grid on
	
	%U.y=y; U.fs=fs; U.nbits=nbits;
	%U.axes1H=axes1H; U.axes2H=axes2H; U.axes3H=axes3H; U.axes4H=axes4H;
	%U.voicedIndex=voicedIndex; U.voicedH=voicedH;
	%U.voicedY=voicedY; U.voicedTime=voicedTime;
	%set(gcf, 'userData', U);
	%uicontrol('string', 'Play all', 'callback', 'U=get(gcf, ''userData''); sound(U.y/(2^U.nbits/2), U.fs);');
	%uicontrol('string', 'Play voiced', 'callback', 'U=get(gcf, ''userData''); sound(U.voicedY/(2^U.nbits/2), U.fs);', 'position', [100, 20, 60, 20]);

	% Play the segmented sound
%	head = soundSegment(1).beginFrame*(frameSize-overlap);
%	tail = min(length(y), soundSegment(end).endFrame*(frameSize-overlap));
%	thisY = y(head:tail);
%	fprintf('Hit return to hear the segmented sound %g:', i);
%	pause;
%	fprintf('\n');
%	wavplay(thisY, fs, 'sync');
%	fprintf('\n');
end

% ====== Self demo
function selfdemo
waveFile='SingaporeIsAFinePlace.wav';
[y, fs, nbits]=wavReadInt(waveFile);
epdParam=epdParamSet(fs);
plotOpt=1;
out=feval(mfilename, y, fs, nbits, epdParam, plotOpt);

②buffer2.m

function out = buffer2(y, frameSize, overlap)
% buffer2: Frame blocking
%	Usage: out = buffer2(y, frameSize, overlap)
%	This is almost the same as "buffer" except that there are no leading/trailing zeros

%	Roger Jang, 20010908

if nargin<3, overlap=0; end
if nargin<2, frameSize=256; end

y = y(:);
step = frameSize-overlap;
frameCount = floor((length(y)-overlap)/step);

out = zeros(frameSize, frameCount);
for i=1:frameCount,
	startIndex = (i-1)*step+1;
	out(:, i) = y(startIndex:(startIndex+frameSize-1));
end

A common setting is a frame length of 256 samples and a frame shift of 128 samples (the frame shift is generally between 0 and 1/2 of the frame length).
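A minimal usage sketch of buffer2 under these settings (assuming y has already been read in as above):

frameSize=256; overlap=128;              % half-frame overlap, i.e. frame shift of 128
frameMat=buffer2(y, frameSize, overlap); % one frame per column
frameNum=size(frameMat, 2);              % number of complete frames obtained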

The preprocessed (endpoint-detected) signal y1 is obtained as follows:

plotOpt1=0;
[endPoint, epInFrameIndex, soundSegment, zeroOneVec, volume]=epdByVolZcr(y, fs, nbits, [], plotOpt1);
y1=y(endPoint(1):endPoint(2));

Note: the value of plotOpt1 determines whether the figures are plotted; anyone who has read the epdByVolZcr() function carefully will see this.
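The other preprocessing steps named above, pre-emphasis and windowing, are not handled by the two functions; here is a minimal sketch (the coefficient 0.97, the Hamming window from the Signal Processing Toolbox, and the variable names are our own illustrative choices, not from the original post):

a=0.97;                                              % typical pre-emphasis coefficient (assumed)
y2=filter([1 -a], 1, y1);                            % pre-emphasis: y2(n)=y1(n)-a*y1(n-1)
frameMat=buffer2(y2, 256, 128);                      % frame blocking, as above
win=hamming(256);                                    % Hamming window (Signal Processing Toolbox)
frameMat=frameMat.*repmat(win, 1, size(frameMat,2)); % window each frame (column)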

3. Then comes the feature-extraction stage, where the code depends on the extraction method you choose.

     For extracting the common traditional acoustic features, the books by Professor Zhang Xueying recommended in 《情感语音信号入门解析》 (an introductory guide to emotional speech signals) are a good reference.
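As a simple baseline, the frame-level volume and zero-crossing rate already computed inside epdByVolZcr can themselves serve as features; a minimal sketch (the utterance-level statistics are illustrative, and the two-argument frame2zcr call assumes its default shift):

frameMat=buffer2(y1, 256, 128);                  % frame the voiced segment
vol=frame2volume(frameMat, 1);                   % short-time volume per frame
zcr=frame2zcr(frameMat, 1);                      % zero-crossing rate per frame
feat=[mean(vol), std(vol), mean(zcr), std(zcr)]; % utterance-level feature vector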



Reposted from blog.csdn.net/songchunxiao1991/article/details/80197049