【python】《多媒体技术与应用》实验报告「数字音频处理」

《多媒体技术与应用》实验报告

实验名称

数字视频处理

实验时间

2022/4/25

姓名

班级

计非201

学号

成绩

一. 实验目的

1.掌握数字音频的读取与打开；

2.掌握数字音频信号的频谱分析；

3.验证 PCM 编码算法。

二．实验原理

声音是由物体振动而产生的，声波的三要素是频率、振幅和波形，频率代表音阶的高低，振幅代表响度（音量），波形代表音色，采样频率越高所能描述的声波频率就越高。采样率决定声音频率的范围（相当于音调），可以用数字波形表示。以波形表示的频率范围通常被称为带宽。

采样频率是单位时间内对媒体对象的采样次数，单位为赫兹 Hz。采样频率是指录音设备在一秒钟内对声音信号的采样次数，采样频率越高声音的还原就越真实越自然。脉冲编码调制(PCM，Pulse Code Modulation)在通信系统中完成将语音信号数字化功能。是一种对模拟信号数字化的取样技术，将模拟信号变换为数字信号的编码方式，特别是对于音频信号。

PCM 的实现主要包括三个步骤完成：抽样、量化、编码。分别完成时间上离散、幅度上离散、及量化信号的二进制表示。根据 CCITT 的建议，为改善小信号量化性能，采用压扩非均匀量化，有两种建议方式，分别为 A 律和 m 律方式，本设计采用了 A 律方式。

三．实验内容与关键步骤

1. 打开音频信号文件，获取音高、音量等信息；

2. 对打开的文件进行频谱分析；

3. 验证 PCM 算法。

四．代码

4-1 音量

import math
import wave
import pylab as pl
import numpy as np


# method 1: absSum
def calVolume(waveData, frameSize, overLap):
    wlen = len(waveData)
    step = frameSize - overLap
    frameNum = int(math.ceil(wlen * 1.0 / step))
    volume = np.zeros((frameNum, 1))
    for i in range(frameNum):
        curFrame = waveData[np.arange(i * step, min(i * step + frameSize, wlen))]
        curFrame = curFrame - np.median(curFrame)  # zero-justified
        volume[i] = np.sum(np.abs(curFrame))
    return volume


# method 2: 10 times log10 of square sum
def calVolumeDB(waveData, frameSize, overLap):
    wlen = len(waveData)
    step = frameSize - overLap
    frameNum = int(math.ceil(wlen * 1.0 / step))
    volume = np.zeros((frameNum, 1))
    for i in range(frameNum):
        curFrame = waveData[np.arange(i * step, min(i * step + frameSize, wlen))]
        curFrame = curFrame - np.mean(curFrame)  # zero-justified
        volume[i] = 10 * np.log10(np.sum(curFrame * curFrame))
    return volume


# ============ test the algorithm =============
# read wave file and get parameters.
fw = wave.open('sunday.wav', 'r')
params = fw.getparams()
print(params)
nchannels, sampwidth, framerate, nframes = params[:4]
strData = fw.readframes(nframes)
waveData = np.frombuffer(strData, dtype=np.int16)
waveData = waveData * 1.0 / max(abs(waveData))  # normalization
fw.close()

# calculate volume
frameSize = 256
overLap = 128
volume11 = calVolume(waveData, frameSize, overLap)
volume12 = calVolumeDB(waveData, frameSize, overLap)

# plot the wave
time = np.arange(0, nframes) * (1.0 / framerate)
time2 = np.arange(0, len(volume11)) * (frameSize - overLap) * 1.0 / framerate
pl.figure(figsize=(15, 9))
pl.subplot(311)
pl.plot(time, waveData)
pl.ylabel("Amplitude")
pl.subplot(312)
pl.plot(time2, volume11)
pl.ylabel("absSum")
pl.subplot(313)
pl.plot(time2, volume12, c="g")
pl.ylabel("Decibel(dB)")
pl.xlabel("time (seconds)")
pl.show()

4-2 音高

import wave
import numpy as np
import pylab as pl

# ============ test the algorithm =============
# read wave file and get parameters.
fw = wave.open('Ring08.wav', 'rb')
params = fw.getparams()
print(params)
nchannels, sampwidth, framerate, nframes = params[:4]
strData = fw.readframes(nframes)
# waveData = np.fromstring(strData, dtype=np.int16)
waveData = np.frombuffer(strData, np.int16)
waveData = waveData * 1.0 / max(abs(waveData))  # normalization
fw.close()

# plot the wave
time = np.arange(0, len(waveData)) * (1.0 / framerate)

index1 = 10000.0 / framerate
index2 = 10512.0 / framerate
index3 = 15000.0 / framerate
index4 = 15512.0 / framerate

pl.subplot(311)
pl.plot(time, waveData)
pl.plot([index1, index1], [-1, 1], 'r')
pl.plot([index2, index2], [-1, 1], 'r')
pl.plot([index3, index3], [-1, 1], 'g')
pl.plot([index4, index4], [-1, 1], 'g')
pl.xlabel("time (seconds)")
pl.ylabel("Amplitude")

pl.subplot(312)
pl.plot(np.arange(512), waveData[10000:10512], 'r')
pl.plot([59, 59], [-1, 1], 'b')
pl.plot([169, 169], [-1, 1], 'b')
print(1 / ((169 - 59) * 1.0 / framerate))
pl.xlabel("index in 1 frame")
pl.ylabel("Amplitude")

pl.subplot(313)
pl.plot(np.arange(512), waveData[15000:15512], 'g')
pl.xlabel("index in 1 frame")
pl.ylabel("Amplitude")
pl.show()

4-3 波形频谱

import librosa
import librosa.display
import matplotlib.pyplot as plt

# Load a wav file
y, sr = librosa.load('PCM.wav', sr=None)
# extract mel spectrogram feature
melspec = librosa.feature.melspectrogram(y, sr, n_fft=1024, hop_length=512, n_mels=128)
# convert to log scale
logmelspec = librosa.power_to_db(melspec)
plt.figure(figsize=(15, 10))
# plot a wavform
plt.subplot(2, 1, 1)
librosa.display.waveshow(y, sr)
plt.title('Beat wavform')
# plot mel spectrogram
plt.subplot(2, 1, 2)
librosa.display.specshow(logmelspec, sr=sr, x_axis='time', y_axis='mel')
plt.title('Mel spectrogram')
plt.tight_layout()  # 保证图不重叠
plt.show()

4-4 PCM

import numpy as np
import librosa
import matplotlib.pyplot as plt


def PCM_encode(x):
    n = len(x)
    out = np.zeros((n, 8))
    for i in range(n):
        # 符号位
        if x[i] > 0:
            out[i, 0] = 1
        else:
            out[i, 0] = 0
        # 数据位
        if abs(x[i]) < 32:
            out[i, 1], out[i, 2], out[i, 3], step, st = 0, 0, 0, 2, 0
        elif abs(x[i]) < 64:
            out[i, 1], out[i, 2], out[i, 3], step, st = 0, 0, 1, 2, 32
        elif abs(x[i]) < 128:
            out[i, 1], out[i, 2], out[i, 3], step, st = 0, 1, 0, 4, 64
        elif abs(x[i]) < 256:
            out[i, 1], out[i, 2], out[i, 3], step, st = 0, 1, 1, 8, 128
        elif abs(x[i]) < 512:
            out[i, 1], out[i, 2], out[i, 3], step, st = 1, 0, 0, 16, 256
        elif abs(x[i]) < 1024:
            out[i, 1], out[i, 2], out[i, 3], step, st = 1, 0, 1, 32, 512
        elif abs(x[i]) < 2048:
            out[i, 1], out[i, 2], out[i, 3], step, st = 1, 1, 0, 64, 1024
        else:
            out[i, 1], out[i, 2], out[i, 3], step, st = 1, 1, 1, 128, 2048

        if abs(x[i]) >= 4096:
            out[i, 1:] = np.array([1, 1, 1, 1, 1, 1])
        else:
            tmp = bin(int((abs(x[i]) - st) / step)).replace('0b', '')
            tmp = '0' * (4 - len(tmp)) + tmp
            t = [int(k) for k in tmp]
            out[i, 4:] = t
    return out.reshape(8 * n)


def PCM_decode(ins, v):
    inn = ins.reshape(len(ins) // 8, 8)
    slot = np.array([0, 32, 64, 128, 256, 512, 1024, 2048])
    step = np.array([2, 2, 4, 8, 16, 32, 64, 128])
    out = np.zeros(len(ins) // 8)
    for i in range(inn.shape[0]):
        sgn = 2 * inn[i, 0] - 1
        tmp = int(inn[i, 1] * 4 + inn[i, 2] * 2 + inn[i, 3])
        st = slot[tmp]
        dt = (inn[i, 4] * 8 + inn[i, 5] * 4 + inn[i, 6] * 2 + inn[i, 7]) * step[tmp] + 0.5 * step[tmp]
        out[i] = sgn * (st + dt) / 4096 * v
    return out


plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

data, sr = librosa.load('Ring08.wav')

sxx = np.array(list(map(int, data * 4096)))

y = PCM_encode(sxx)

yy = PCM_decode(y, 1)

plt.subplot(3, 1, 1)
plt.plot(data)
plt.title('编码前')

plt.subplot(3, 1, 2)
plt.plot(yy)
plt.title('解码后')

plt.subplot(3, 1, 3)
plt.plot(yy - data)
plt.title('误差')
plt.show()

五. 实验结果与分析

4-1 音量

aeiou.wav

sunday.wav

结果