In [ ]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
In [ ]:
from local.torch_basics import *
from local.test import *
from local.basics import *
from local.data.all import *
from local.vision.core import *
from local.notebook.showdoc import show_doc
from local.audio.core import *
from local.audio.augment import *
from local.vision.learner import *
from local.vision.models.xresnet import *
from local.metrics import *
from local.callback.schedule import *
import torchaudio

Tutorial: Training a Voice Recognition Model

In [ ]:
p10speakers = Config()['data_path'] / 'ST-AEDS-20180100_1-OS'
In [ ]:
#Warning: this dataset is ~8GB
p250speakers = Config()['data_path'] / '250_speakers'
untar_data(URLs.SPEAKERS250, fname=str(p250speakers)+'.tar', dest=p250speakers)
Out[ ]:
PosixPath('/home/jupyter/.fastai/data/250_speakers/250-speakers')
In [ ]:
x = AudioGetter("", recurse=True, folders=None)
files_10  = x(p10speakers)
files_250 = x(p250speakers)
#original_aud = AudioItem.create(files_10[0])

DataBlock and Basic End-to-End Training on 10 Speakers

In [ ]:
def AudioBlock(cls=AudioItem): return TransformBlock(type_tfms=cls.create, batch_tfms=IntToFloatTensor)
In [ ]:
auds = DataBlock(blocks=(AudioBlock, CategoryBlock),  
                 get_items=get_audio_files, 
                 splitter=RandomSplitter(),
                 get_y=lambda x: str(x).split('/')[-1][:5])
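The get_y above simply takes the first five characters of the filename as the speaker label. A quick, hypothetical sanity check (not part of the original notebook) that prints a few filenames next to the label the lambda would assign:
In [ ]:
# Hypothetical check: show what the get_y lambda extracts from a few real files
for f in files_10[:3]:
    print(f.name, '->', str(f).split('/')[-1][:5])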
In [ ]:
cats = [y for _,y in auds.datasource(p10speakers)]
In [ ]:
#verify categories are being correctly assigned
test_eq(min(cats).item(), 0)
test_eq(max(cats).item(), 9)
In [ ]:
#crop 2s from the signal and convert it to a MelSpectrogram with no augmentation
cfg_voice = AudioConfig.Voice()
a2s = AudioToSpec.from_cfg(cfg_voice)
crop_2000ms = CropSignal(2000)
tfms = [crop_2000ms, a2s]
dbunch = auds.databunch(p10speakers, item_tfms=tfms, bs=64)
Broken:
show_batch is currently broken: it grabs the underlying data rather than the AudioSpectrogram object itself, yet still calls the spectrogram's show method. That method relies on nchannels, which lives in the AudioSpectrogram's settings (we overrode __getattr__ so it reads like a normal attribute), so for the moment the items can't show themselves at the batch level. Training still works.
In [ ]:
#dbunch.show_batch(max_n=9)
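As a stopgap while show_batch is broken, a single spectrogram can be plotted straight from a batch tensor with matplotlib. This is a minimal sketch, not part of the original notebook:
In [ ]:
# Workaround sketch: grab one batch and display the first spectrogram's tensor directly
import matplotlib.pyplot as plt
xb, yb = dbunch.one_batch()
plt.imshow(xb[0][0].cpu(), origin='lower', aspect='auto')
plt.title(f"label index: {yb[0].item()}")
plt.show()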
In [ ]:
#batch shape is (batch size, channels, mel bins, time frames)
dbunch.one_batch()[0].shape
Out[ ]:
torch.Size([64, 1, 128, 251])
In [ ]:
# credit to Kevin Bird and Hiromi Suenaga for these two lines to adjust a CNN model to take 1-channel input
def alter_learner(learn, channels=1):
    # point the first conv layer at the new number of input channels
    learn.model[0][0].in_channels = channels
    # keep a single slice of the original 3-channel weights and restore the channel dimension
    learn.model[0][0].weight = torch.nn.parameter.Parameter(learn.model[0][0].weight[:,1,:,:].unsqueeze(1))
In [ ]:
learn = Learner(dbunch, 
                xresnet18(),
                torch.nn.CrossEntropyLoss(), 
                metrics=[accuracy])
nchannels = dbunch.one_batch()[0].shape[1]
alter_learner(learn, nchannels)
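A quick check (not in the original notebook) that the first conv layer of the xresnet now expects a single input channel:
In [ ]:
# The first conv should report in_channels=1 and a weight with a channel dimension of 1
first_conv = learn.model[0][0]
print(first_conv.in_channels, first_conv.weight.shape)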
In [ ]:
learn.lr_find()
In [ ]:
#epochs are a bit longer due to the chosen melspectrogram settings
learn.fit_one_cycle(10, lr_max=slice(1e-2))
epoch  train_loss  valid_loss  accuracy  time
0      1.996048    4.532247    0.233073  00:12
1      0.829569    6.905653    0.281250  00:11
2      0.422691    0.503128    0.843750  00:11
3      0.217929    0.382775    0.884115  00:11
4      0.140743    0.969582    0.765625  00:11
5      0.105445    0.505722    0.861979  00:11
6      0.076082    0.061513    0.979167  00:11
7      0.049086    0.038608    0.992188  00:11
8      0.032918    0.029157    0.993490  00:11
9      0.025196    0.023869    0.993490  00:11

Training on 250 Speakers

Baseline

In [ ]:
len(files_250)
Out[ ]:
44655
In [ ]:
for i in range(10):
    print(random.choice(files_250))
/home/jupyter/.fastai/data/250_speakers/250-speakers/id09051/jc-o8Ra0WPg/00314.wav
/home/jupyter/.fastai/data/250_speakers/250-speakers/id09269/qua5gFTUPvU/00317.wav
/home/jupyter/.fastai/data/250_speakers/250-speakers/id09190/fMiIZC8EFsI/00118.wav
/home/jupyter/.fastai/data/250_speakers/250-speakers/id09269/uti81NExz4E/00345.wav
/home/jupyter/.fastai/data/250_speakers/250-speakers/id09215/yHiDOytgM70/00053.wav
/home/jupyter/.fastai/data/250_speakers/250-speakers/id09065/aL-u-4exX6M/00031.wav
/home/jupyter/.fastai/data/250_speakers/250-speakers/id09033/JsuM8CLM6WA/00185.wav
/home/jupyter/.fastai/data/250_speakers/250-speakers/id09225/sdnCDYiQUBg/00366.wav
/home/jupyter/.fastai/data/250_speakers/250-speakers/id09160/n4jJCVbK5-I/00073.wav
/home/jupyter/.fastai/data/250_speakers/250-speakers/id09269/l_dPwDhJGRY/00247.wav
In [ ]:
get_250speakers_label = lambda x: str(x).split('/')[-3][3:]
In [ ]:
for i in range(10):
    f = random.choice(files_250)
    print("File:",f )
    print("Label:", get_250speakers_label(f))
File: /home/jupyter/.fastai/data/250_speakers/250-speakers/id09071/evswEAQC1w0/00234.wav
Label: 9071
File: /home/jupyter/.fastai/data/250_speakers/250-speakers/id09049/a1LSgSTO-lQ/00400.wav
Label: 9049
File: /home/jupyter/.fastai/data/250_speakers/250-speakers/id09269/1-ycYhStJ4U/00073.wav
Label: 9269
File: /home/jupyter/.fastai/data/250_speakers/250-speakers/id08983/P3M8OQZEPUQ/00221.wav
Label: 8983
File: /home/jupyter/.fastai/data/250_speakers/250-speakers/id09069/NfSFfEgKABg/00059.wav
Label: 9069
File: /home/jupyter/.fastai/data/250_speakers/250-speakers/id08902/_SVTdgG6oaQ/00165.wav
Label: 8902
File: /home/jupyter/.fastai/data/250_speakers/250-speakers/id08999/_r2RfU9wZfM/00270.wav
Label: 8999
File: /home/jupyter/.fastai/data/250_speakers/250-speakers/id08937/vKLxqqt7hfw/00429.wav
Label: 8937
File: /home/jupyter/.fastai/data/250_speakers/250-speakers/id09171/VobJGb4r1nA/00306.wav
Label: 9171
File: /home/jupyter/.fastai/data/250_speakers/250-speakers/id09052/FGBRDssRvs4/00161.wav
Label: 9052
In [ ]:
auds = DataBlock(blocks=(AudioBlock, CategoryBlock),  
                 get_items=get_audio_files, 
                 splitter=RandomSplitter(),
                 get_y=get_250speakers_label)
In [ ]:
dbunch250 = auds.databunch(p250speakers, item_tfms=tfms, bs=64)
In [ ]:
from fastprogress import progress_bar as pb
In [ ]:
cats = [y for _,y in pb(auds.datasource(p250speakers))]
100.00% [44655/44655 12:12<00:00]
In [ ]:
#verify categories are being correctly assigned for 250 speakers
test_eq(min(cats).item(), 0)
test_eq(max(cats).item(), 249)
In [ ]:
# Use torchaudio default MelSpectrogram to get a baseline
a2s = AudioToSpec()
crop_4000ms = CropSignal(4000)
tfms = [crop_4000ms, a2s]
dbunch = auds.databunch(p250speakers, item_tfms=tfms, bs=256)
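As a quick check (not in the original notebook), the batch shape shows what the default settings produce for the 4s crop:
In [ ]:
# batch shape is (batch size, channels, mel bins, time frames)
dbunch.one_batch()[0].shape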
In [ ]:
learn = Learner(dbunch, 
                xresnet18(),
                torch.nn.CrossEntropyLoss(), 
                metrics=[accuracy])
nchannels = dbunch.one_batch()[0].shape[1]
alter_learner(learn, nchannels)
In [ ]:
learn.lr_find()
In [ ]:
learn.fit_one_cycle(5, lr_max=slice(2e-2))
epoch  train_loss  valid_loss  accuracy  time
0      3.120876    4.190259    0.215653  01:02
1      1.230557    2.119953    0.515620  01:02
2      0.607101    0.898253    0.770462  01:03
3      0.242554    0.375953    0.904378  01:03
4      0.092045    0.292983    0.929795  01:03
In [ ]:
learn.lr_find()
In [ ]:
learn.unfreeze()
learn.fit_one_cycle(5, lr_max=slice(1e-3))
epoch  train_loss  valid_loss  accuracy  time
0      0.065769    0.290888    0.930579  01:04
1      0.061112    0.286083    0.931027  01:03
2      0.044617    0.268056    0.938081  01:03
3      0.035814    0.263419    0.938305  01:03
4      0.029348    0.265146    0.939201  01:04

Customizing Our AudioToSpec Function Using a Config

In [ ]:
voice_cfg = AudioConfig.Voice()
a2s = AudioToSpec.from_cfg(voice_cfg)
tfms = [crop_4000ms, a2s]
# tfms = Pipeline([CropSignal(4000),  a2s, MaskFreq(size=12), MaskTime(size=15), SGRoll()], as_item=True)
dbunch = auds.databunch(p250speakers, item_tfms=tfms, bs=128)
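Before training, it can help to see what the Voice config actually sets. Printing the config object is a simple way to inspect it; the exact fields depend on the library version (this cell is not in the original notebook):
In [ ]:
# Inspect the Voice config's settings; field names vary with the library version
print(voice_cfg)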
In [ ]:
learn = Learner(dbunch, 
                xresnet18(),
                torch.nn.CrossEntropyLoss(), 
                metrics=[accuracy])
nchannels = dbunch.one_batch()[0].shape[1]
alter_learner(learn, nchannels)
In [ ]:
learn.lr_find()
In [ ]:
# Better results even without fine-tuning, but much slower. We need to move a2s to the GPU and
# then add data augmentation! (A sketch of one way to do this follows the results below.)
learn.fit_one_cycle(5, lr_max=slice(2e-2))
epoch  train_loss  valid_loss  accuracy  time
0      2.512746    5.194818    0.195723  03:54
1      1.019303    1.549011    0.629605  03:52
2      0.481578    0.749039    0.809316  03:51
3      0.189635    0.300978    0.926100  03:51
4      0.064987    0.223960    0.947486  03:52
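One way to move a2s onto the GPU, as the comment above suggests, is to push AudioToSpec from the item level to the batch level. The sketch below assumes AudioToSpec can operate on a batch of equal-length cropped signals; it is untested here and the AudioBlockGPU helper is hypothetical, not part of the library:
In [ ]:
# Sketch only: keep the cheap crop as an item transform so clips batch to equal length,
# then run int-to-float conversion and the spectrogram as batch transforms on the GPU
def AudioBlockGPU(cls=AudioItem):
    return TransformBlock(type_tfms=cls.create,
                          item_tfms=CropSignal(4000),
                          batch_tfms=[IntToFloatTensor, AudioToSpec.from_cfg(voice_cfg)])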

Training on MFCCs with Delta

In [ ]:
# only grab 1500ms of each clip: speaker identity can be determined from shorter sections, and it speeds things up
# this is really slow for MFCC, even for ~45k files; we need to figure out what's going on here. The results also
# shouldn't be this much worse than the melspectrogram
a2mfcc = AudioToMFCC(n_mffc=20, melkwargs={"n_fft":2048, "hop_length":256, "n_mels":128})
tfms = [CropSignal(1500), a2mfcc, Delta()]
# tfms = Pipeline([CropSignal(4000),  a2s, MaskFreq(size=12), MaskTime(size=15), SGRoll()], as_item=True)
dbunch = auds.databunch(p250speakers, item_tfms=tfms, bs=1024)
In [ ]:
#n_mfcc isn't getting passed down because the kwarg above is misspelled (n_mffc), so the default of 40 is used;
#the 3 channels presumably come from Delta stacking the coefficients with their deltas and delta-deltas
dbunch.one_batch()[0].shape
Out[ ]:
torch.Size([1024, 3, 40, 94])
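If the kwarg is spelled n_mfcc (the name the comment above expects), the coefficient dimension should come out as 20 rather than the 40 seen in the shape. A hedged sketch of the corrected call, not re-run here:
In [ ]:
# Sketch only: same settings with the kwarg spelled n_mfcc
a2mfcc_fixed = AudioToMFCC(n_mfcc=20, melkwargs={"n_fft":2048, "hop_length":256, "n_mels":128})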
In [ ]:
learn = Learner(dbunch, 
                xresnet18(),
                torch.nn.CrossEntropyLoss(), 
                metrics=[accuracy])
In [ ]:
learn.lr_find()
In [ ]:
learn.fit_one_cycle(10, lr_max=slice(2e-2))
epoch  train_loss  valid_loss  accuracy  time
0      4.673136    5.774166    0.081514  01:17
1      3.332381    7.908496    0.076251  01:12
2      2.457861    2.660875    0.410256  01:12
3      1.832729    1.970718    0.523906  01:13
4      1.383896    1.934217    0.540029  01:12
5      1.039462    1.495416    0.642033  01:13
6      0.779172    0.864115    0.786362  01:12
7      0.575719    0.692647    0.831038  01:12
8      0.423464    0.584255    0.861046  01:12
9      0.328440    0.544373    0.872019  01:12
In [ ]:
learn.unfreeze()
learn.lr_find()
In [ ]:
learn.fit_one_cycle(7, lr_max=slice(3e-3, 4e-3))
epoch  train_loss  valid_loss  accuracy  time
0      0.230455    0.616691    0.852648  01:12
1      0.258911    0.788072    0.806181  01:13
2      0.260530    0.723811    0.824992  01:12
3      0.232052    0.623543    0.850185  01:12
4      0.197650    0.525190    0.877281  01:13
5      0.161846    0.494700    0.889262  01:12
6      0.140014    0.481736    0.889710  01:12
From Here:
1. Get the transforms onto the GPU
2. Once that's faster, test signal and spectrogram augmentations for speed and efficacy
3. Fine-tune and see how high we can push results on 250 speakers