# import Speaker Recognition Module
import speaker.recognition as SR
# Global recognizer; 'Female' and 'Male' speakers are enrolled into it below.
Gender = SR.GMMRec()
import scipy.io.wavfile as wav
from speaker.silence import remove_silence
# MFCC extractor (python_speech_features-style API: mfcc(signal, samplerate, numcep=...))
from features import mfcc
def get_mfcc(audio_path):
    """Read a WAV file, strip silent frames, and return its MFCC vectors.

    Returns an (N, 15) array-like of MFCC feature vectors (15 cepstral
    coefficients per frame).
    """
    rate, samples = wav.read(audio_path)
    # Multi-channel audio: keep only the first channel.
    if samples.ndim > 1:
        samples = samples[:, 0]
    voiced = remove_silence(rate, samples)
    return mfcc(voiced, rate, numcep=15)
import numpy as np
# Here we use mfcc as the audio features, but in theory, other audio features should work as well, e.g. lpc
female_mfcc = np.array(get_mfcc('./Audio/female.wav')) # female_mfcc.shape = (N1, D); N1 vectors and D dimension
male_mfcc = np.array(get_mfcc('./Audio/male.wav')) # male_mfcc.shape = (N2, D);
Gender.enroll('Female', female_mfcc) # enroll the female audio features
Gender.enroll('Male', male_mfcc) # enroll the male audio features
Gender.train() # train the GMMs with PyCASP
Gender.dump('gender.model') # save the trained model into a file named "gender.model" for future use
# NOTE(review): rebinds the module-level Gender to the freshly loaded model;
# this is not necessary if you just trained the model.
Gender = SR.GMMRec.load('gender.model')
# NOTE(review): machine-specific absolute path — replace with a local test file to reproduce.
test_mfcc = np.array(get_mfcc('/Users/xuhe/Downloads/SpectrogramInversion1.02b/tapestr_rec.wav')) # test_mfcc.shape = (N3, D)
# Result (discarded here) is (label, log_lkld): the most probable speaker label and
# the log likelihood that test_mfcc came from that speaker's GMM.
Gender.predict(test_mfcc)
# Example output: ('Female', -22.373874909185876)
def totime(secs):
    """Split a duration in seconds into an (hours, minutes, seconds) tuple."""
    hours = secs // 3600
    remainder = secs % 3600
    return hours, remainder // 60, remainder % 60
def showresult(recognizer, sig, sr, head):
    """Print a timestamped prediction for one audio segment.

    Strips silence from sig (sampled at sr Hz), extracts 15-coefficient
    MFCCs, and prints "H:MM:SS <prediction>" where the timestamp is the
    segment's start offset `head` in seconds.
    """
    voiced = remove_silence(sr, sig)
    feats = mfcc(voiced, sr, numcep=15)
    timestamp = "%d:%02d:%02d" % totime(head)
    print(timestamp, recognizer.predict(feats))
def recognize(recognizer, audio_path, step = 1, duration = 1.5):
    """Slide a window over a WAV file and print a prediction per segment.

    Parameters
    ----------
    recognizer : trained model exposing .predict(mfcc_vectors)
    audio_path : path to the WAV file to analyze
    step : hop between consecutive window starts, in seconds
    duration : window length, in seconds

    Prints one timestamped prediction line per window via showresult().
    """
    (fs, signal) = wav.read(audio_path)
    # Multi-channel audio: keep only the first channel.
    if len(signal.shape) > 1:
        signal = signal[:, 0]
    totallen = np.round(signal.shape[0] / fs).astype(int)
    print('Recognition results:')
    head = 0
    while head < totallen:
        tail = head + duration
        if tail > totallen:
            tail = totallen
        # Bug fix: with non-integer step/duration (the default duration is 1.5),
        # fs * head / fs * tail are floats, and NumPy raises TypeError on float
        # slice indices — cast to int before slicing. min() also never exceeds
        # the rounded total length.
        start = int(fs * head)
        stop = min(int(fs * tail), int(fs * totallen))
        showresult(recognizer, signal[start:stop], fs, head)
        head += step
# Demo: classify a mixed female/male recording in non-overlapping 5-second windows.
recognize(Gender, './Audio/female-male.wav', step = 5, duration = 5)
# Example output:
# Recognition results: ('0:00:00', ('Male', -19.65672572544716)) ('0:00:05', ('Male', -19.389260191396541)) ('0:00:10', ('Male', -19.886238792273502)) ('0:00:15', ('Male', -19.988046642253273)) ('0:00:20', ('Male', -20.857762606257122)) ('0:00:25', ('Female', -20.056092628403363)) ('0:00:30', ('Female', -19.888043075692561)) ('0:00:35', ('Female', -19.657557661472801)) ('0:00:40', ('Female', -19.745223859738523)) ('0:00:45', ('Female', -19.680926940400678)) ('0:00:50', ('Female', -19.458031006355842)) ('0:00:55', ('Female', -19.553981803248707)) ('0:01:00', ('Female', -20.053499089615951)) ('0:01:05', ('Male', -19.686199644242794)) ('0:01:10', ('Male', -19.852808517223)) ('0:01:15', ('Male', -20.039521601708593)) ('0:01:20', ('Male', -19.904757723357431)) ('0:01:25', ('Male', -20.143563372546421)) ('0:01:30', ('Male', -19.966010831665649)) ('0:01:35', ('Male', -19.826530139561765)) ('0:01:40', ('Male', -19.912105539081182)) ('0:01:45', ('Male', -19.848151795975433)) ('0:01:50', ('Male', -19.684047168185359)) ('0:01:55', ('Male', -19.983463014416124)) ('0:02:00', ('Male', -19.397841075840084)) ('0:02:05', ('Male', -19.766136825379665)) ('0:02:10', ('Female', -19.768935512293602))