１　语音预处理

本章主要内容是基于ｗａｖｅ框架进行操作，主要步骤包括：

????????音频数据属性查看

????????音频数据转换矩阵

????????音频频谱

????????音频波形

1.1　音频的基本属性　

import wave as we
import numpy as np
from scipy.io import wavfile
import warnings
warnings.filterwarnings("ignore")


fileName = './data/thchs30/train/A2_0.wav'

# 输出信息 （声道，采样宽度，帧速率，帧数，唯一标识，无损）
WAVE = we.open(fileName)  # 打开一个音频文件

for item in enumerate(WAVE.getparams()):
    print(item)


a = WAVE.getparams().nframes
print('帧总数：',a)
f = WAVE.getparams().framerate
print('采样频率：',f)
sample_time = 1/f
print('采样点的时间间隔：',sample_time)
time = a/f
print('声音信号的长度：',time)
sample_frequency, audio_sequence = wavfile.read(fileName)
print('声音信号每一帧的大小：',audio_sequence,len(audio_sequence ))
x_seq = np.arange(0,time,sample_time)
print(x_seq,len(x_seq))

"""
(0, 1)
(1, 2)
(2, 16000)
(3, 157000)
(4, 'NONE')
(5, 'not compressed')
帧总数： 157000
采样频率： 16000
采样点的时间间隔： 6.25e-05
声音信号的长度： 9.8125
声音信号每一帧的大小： [-296 -424 -392 ... -394 -379 -390] 157000
[0.0000000e+00 6.2500000e-05 1.2500000e-04 ... 9.8123125e+00 9.8123750e+00
?9.8124375e+00] 157000

"""

1.2　波形序列

import wave as we
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile
import warnings
warnings.filterwarnings("ignore")


fileName = './data/thchs30/train/A2_1.wav'

WAVE = we.open(fileName)  # 打开一个音频文件

a = WAVE.getparams().nframes
print('帧总数：',a)
f = WAVE.getparams().framerate
print('采样频率：',f)
sample_time = 1/f
print('采样点的时间间隔：',sample_time)
time = a/f
sample_frequency, audio_sequence = wavfile.read(fileName)
x_seq = np.arange(0,time,sample_time)

# 查看波形序列
plt.plot(x_seq,audio_sequence,'blue')
plt.xlabel('time(s)')
plt.show()

?1.3　文件信息的获取

import wave as we
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile
from matplotlib.backend_bases import RendererBase
from scipy import signal
from scipy.io import wavfile
import os
from PIL import Image
from scipy.fftpack import fft
import warnings
warnings.filterwarnings("ignore")

audio_path = './data/train/audio/'
pict_Path = './data/picts/train/'
samples = []


# 验证文件是否存在，若不在这，则创建
if not os.path.exists(pict_Path):
    os.makedirs(pict_Path)

subFolderList= []
for x in os.listdir(audio_path):  # 遍历audio_path目录下的目录
    if os.path.isdir(audio_path+'/'+x):  # 判断是否是一个目录，是目录则添加到subFolderList
        subFolderList.append(x)
        if not os.path.exists(pict_Path + '/' + x):  # 判断x目录在pict_Path路径下是否存在
            os.mkdir(pict_Path + x) # 不存在则创建

# 查看子文件名称和数量
print('子文件名称：\n',subFolderList)
print('子文件数量：\n',len(subFolderList))

1.4　统计每个子文件夹语音文件数量

# 统计每个子文件夹语音文件数量
sample_audio=[]
total=0
for x in subFolderList:
    # 获取所有wav文件
    all_files=[y for y in os.listdir(audio_path+x) if '.wav' in y]
    total+=len(all_files)

    sample_audio.append(audio_path+x+'/'+all_files[0])
    # print(sample_audio)
    # 查看每个子文件夹文件数量
    print('%s:count:%d'%(x,len(all_files)))

# 查看wav文件总数
print("TOTAL:",total)

运行：

bed:count:10
bird:count:15
cat:count:17
dog:count:20
down:count:36
eight:count:16
five:count:16
four:count:22
go:count:18
happy:count:16
house:count:15
left:count:20
marvin:count:19
nine:count:14
no:count:16
off:count:20
on:count:11
one:count:18
right:count:22
seven:count:20
sheila:count:17
six:count:15
stop:count:12
three:count:19
tree:count:14
two:count:12
up:count:10
wow:count:18
yes:count:17
zero:count:20
_background_noise_:count:6
TOTAL: 521

?1.5　查看每个子文件当中第一个文件

# 查看每个子文件当中第一个文件
for x in sample_audio:
    print(x)

运行结果：

./data/train/audio/bed/00f0204f_nohash_0.wav
./data/train/audio/bird/00b01445_nohash_0.wav
./data/train/audio/cat/00b01445_nohash_0.wav
./data/train/audio/dog/fc2411fe_nohash_0.wav
./data/train/audio/down/fbdc07bb_nohash_0.wav
./data/train/audio/eight/fd395b74_nohash_0.wav
./data/train/audio/five/fd395b74_nohash_2.wav
./data/train/audio/four/fd32732a_nohash_0.wav
./data/train/audio/go/00b01445_nohash_0.wav
./data/train/audio/happy/fbf3dd31_nohash_0.wav
./data/train/audio/house/fcb25a78_nohash_0.wav
./data/train/audio/left/00b01445_nohash_0.wav
./data/train/audio/marvin/fc2411fe_nohash_0.wav
./data/train/audio/nine/00b01445_nohash_0.wav
./data/train/audio/no/fe1916ba_nohash_0.wav
./data/train/audio/off/00b01445_nohash_0.wav
./data/train/audio/on/00b01445_nohash_0.wav
./data/train/audio/one/00f0204f_nohash_0.wav
./data/train/audio/right/00b01445_nohash_0.wav
./data/train/audio/seven/0a0b46ae_nohash_0.wav
./data/train/audio/sheila/00f0204f_nohash_0.wav
./data/train/audio/six/00b01445_nohash_0.wav
./data/train/audio/stop/0ab3b47d_nohash_0.wav
./data/train/audio/three/00b01445_nohash_0.wav
./data/train/audio/tree/00b01445_nohash_0.wav
./data/train/audio/two/00b01445_nohash_0.wav
./data/train/audio/up/00b01445_nohash_0.wav
./data/train/audio/wow/00f0204f_nohash_0.wav
./data/train/audio/yes/00f0204f_nohash_0.wav
./data/train/audio/zero/0ab3b47d_nohash_0.wav
./data/train/audio/_background_noise_/doing_the_dishes.wav

1.6　多样本单个频谱可视化　


# 构建频谱处理函数
def log_specgram(audio,sample_rate,window_size=20,step_size=10,eps=1e-10):
    nperseg=int(round(window_size*sample_rate/1e3))
    noverlap=int(round(step_size*sample_rate/1e3))
    freqs,_,spec=signal.spectrogram(audio,
                       fs=sample_rate,
                       window='hann',
                       nperseg=nperseg,
                       noverlap=noverlap,
                       detrend=False
                       )
    return freqs,np.log(spec.T.astype(np.float32)+eps)
# 多样本单个频谱可视化
fig=plt.figure(figsize=(20,20))

for i,filepath in enumerate(sample_audio[:16]):
    #make subplots
    plt.subplot(4,4,i+1)
    #pull the labels
    label=filepath.split('/')[-2]
    plt.title(label)
    #create spectrogram
    samplerate,test_sound=wavfile.read(filepath)
    _,spectrogram=log_specgram(test_sound,samplerate)

    plt.imshow(spectrogram.T,aspect='auto',origin='lower')
    plt.axis("off")
plt.show()

实验二 GMM+HMM?

# 导入依赖
from python_speech_features import mfcc
from scipy.io import wavfile
from hmmlearn import hmm
import joblib
import numpy as np
import os


# 定义数据目录
trainPath = 'training_data'
testPath = 'test_data'

def get_wavlist(wavpath):
    '''
    获取数据
    :param wavpath:  路径
    :return:
    '''
    wavdict = {}
    labeldict = {}
    '''
        dirpath：当前遍历的目录
        dirnames：当前遍历的目录下的目录列表
        filenames：当前遍历的目录下的文件列表
    '''
    for (dirpath,dirnames,filenames) in os.walk(wavpath):

        for filename in filenames:
            if filename.endswith('.wav'): # 判断是否是wav文件
                filepath = os.sep.join([dirpath,filename])
                fileid = filename.strip('.wav')
                wavdict[fileid] = filepath
                label = fileid.split('_')[1]
                labeldict[fileid] = label
    return wavdict,labeldict


def compute_mfcc(file):
    '''
    特征提取
    :param file:
    :return:
    '''
    fs,audio = wavfile.read(file)  # 读取语音文件
    '''
        mfcc()  特征提取
            参数：
                signal ： 待操作的文件
                samplerate：我们用来工作的信号的采样率
                numcep: 倒频谱返回的数量 默认是13
    '''
    mfcc_feat = mfcc(audio,samplerate=(fs/2),numcep=26)
    return mfcc_feat

class Model():
    def __init__(self,CATEGORY,n_comp=3,n_mix =3,cov_type = 'diag',ecpe= 1000):
        '''
        :param CATEGORY:  标签列表
        :param n_comp:  每个孤立词中的状态数
        :param n_minx:   每个状态包含的混合高斯数量
        :param cov_type:  协方差矩阵的类型
        :param ecpe:   训练迭代的次数
        '''
        self.CATEGORY = CATEGORY
        self.category = len(CATEGORY)
        self.n_comp = n_comp
        self.n_mix = n_mix
        self.cov_type = cov_type
        self.ecpe = ecpe

        # 初始化models,返回特定参数的模型列表
        self.models = []
        for k in range(self.category):
            model = hmm.GMMHMM(n_components=self.n_comp,n_mix=self.n_mix,covariance_type=self.cov_type,
                               n_iter=ecpe)  # 创建GMM-HMM模型
            self.models.append(model)

    def train(self,wavdict=None,labeldict=None):
        '''
        进行模型训练
        :param wavdic:  训练数据
        :param labeldict:   训练目标值
        :return:
        '''
        for k in range(self.category):
            subdata = []
            model = self.models[k]  # 把前面定义好的模型获取出来
            for x in wavdict:
                if labeldict[x] == self.CATEGORY[k]:  # 判断CATEGORY列表中是否存在labeldict[x]标签
                    mfcc_feat = compute_mfcc(wavdict[x])  #提取数据特征
                    model.fit(mfcc_feat)

    def test(self,wavdict=None,labeldict=None):
        '''
        测试模型
        :param wavdict:
        :param labeldict:
        :return:
        '''
        resul = []  # 存放测试结果
        for k in range(self.category):
            subre = []
            label = []
            model = self.models[k]
            for x in wavdict:
                mfcc_feat = compute_mfcc(wavdict[x])
                # 生成每个数据在当前模型下的得分情况
                re = model.score(mfcc_feat)
                subre.append(re)
                label.append(labeldict[x])
            # 汇总得分情况
            resul.append(subre)
        # 选取得分最高的种类
        result = np.vstack(resul).argmax(axis=0)
        # 返回种类的类别标签
        result = [self.CATEGORY[l] for l in result]
        print('识别得到的结果：\n',result)
        print('原始标签类别：\n',label)

        # 检查识别率
        totalnum = len(label)  # 统计总数
        correctnum = 0   # 用于记录识别正确的个数
        for i in range(totalnum):
            if result[i] == label[i]:  # 判断识别的结果和真实结果是否相等
                correctnum += 1
        print('识别率：',correctnum/totalnum)


    def save(self,path='model.pkl'):
        '''
        保存模型
        :param path:  模型保存路径
        :return:
        '''
        joblib.dump(self.models,path)

    def load(self,path='model.pkl'):
        '''
        加载模型
        :param path:
        :return:
        '''
        self.models = joblib.load(path)

# 准备训练所需要的数据
CATEGORY = ['1','2','3','4','5','6','7','8','9','10']   # 标签列表

wavdict,labeldict = get_wavlist(trainPath)  # 获取训练的语音数据和标签
testdict,testlabel = get_wavlist(testPath)  # 获取测试的语音数据和标签

# print(wavdict)  "{'10_1': 'training_data\\10_1.wav', '10_10': 'training_data\\10_10.wav', ...}"
# print(labeldict) "{'10_1': '1', '10_10': '10', ..."
# 进行模型训练
# models = Model(CATEGORY=CATEGORY)
# if not os.path.exists('model.pkl'):
#     models.train(wavdict=wavdict, labeldict=labeldict)
#     models.save()
# else:
#     models.load()
#     models.test(wavdict=testdict,labeldict=testlabel)