%reload_ext autoreload
%autoreload 2
%matplotlib inline
from local.torch_basics import *
from local.test import *
from local.basics import *
from local.data.all import *
from local.vision.core import *
from local.notebook.showdoc import show_doc
from local.audio.core import *
from local.audio.augment import *
from local.vision.learner import *
from local.vision.models.xresnet import *
from local.metrics import *
from local.callback.schedule import *
import torchaudio
p10speakers = Config()['data_path'] / 'ST-AEDS-20180100_1-OS'
#Warning: this dataset is ~8GB
p250speakers = Config()['data_path'] / '250_speakers'
untar_data(URLs.SPEAKERS250, fname=str(p250speakers)+'.tar', dest=p250speakers)
PosixPath('/home/jupyter/.fastai/data/250_speakers/250-speakers')
x = AudioGetter("", recurse=True, folders=None)
files_10 = x(p10speakers)
files_250 = x(p250speakers)
#original_aud = AudioItem.create(files_10[0])
def AudioBlock(cls=AudioItem): return TransformBlock(type_tfms=cls.create, batch_tfms=IntToFloatTensor)
auds = DataBlock(blocks=(AudioBlock, CategoryBlock),
                 get_items=get_audio_files,
                 splitter=RandomSplitter(),
                 get_y=lambda x: str(x).split('/')[-1][:5])
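# a quick check of the labelling lambda above; the ST-AEDS filename pattern is assumed here,
# and this sample path is hypothetical:
sample = p10speakers/'f0001_us_f0001_00001.wav'
str(sample).split('/')[-1][:5]
'f0001'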
cats = [y for _,y in auds.datasource(p10speakers)]
#verify categories are being correctly assigned
test_eq(min(cats).item(), 0)
test_eq(max(cats).item(), 9)
#crop 2s from the signal and turn it into a MelSpectrogram with no augmentation
cfg_voice = AudioConfig.Voice()
a2s = AudioToSpec.from_cfg(cfg_voice)
crop_2000ms = CropSignal(2000)
tfms = [crop_2000ms, a2s]
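# to preview what these transforms produce, apply them to a single file by hand
# (a sketch; assumes AudioItem.create and the transforms can be called on one item):
ai = AudioItem.create(files_10[0])
a2s(crop_2000ms(ai)).shape  # expect (1, 128, time) for the 128-mel Voice config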
dbunch = auds.databunch(p10speakers, item_tfms=tfms, bs=64)
#dbunch.show_batch(max_n=9)
dbunch.one_batch()[0].shape
torch.Size([64, 1, 128, 251])
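# (batch, channels, n_mels, time): 64 clips, 1 channel, 128 mel bins; 251 frames is consistent
# with 2s of 16kHz audio and a 128-sample hop (32000/128 + 1), assuming the Voice config uses
# those settings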
# credit to Kevin Bird and Hiromi Suenaga for these two lines, which adjust a CNN model to take 1-channel input
def alter_learner(learn, channels=1):
    learn.model[0][0].in_channels = channels
    learn.model[0][0].weight = torch.nn.parameter.Parameter(learn.model[0][0].weight[:,1,:,:].unsqueeze(1))
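# the slice-and-unsqueeze above reshapes the stem conv's weight from (out_ch, 3, kh, kw) to
# (out_ch, 1, kh, kw); a minimal check of that reshaping (a sketch; assumes fastai's xresnet,
# where model[0][0] is the first Conv2d):
w = xresnet18()[0][0].weight
w.shape, w[:,1,:,:].unsqueeze(1).shape  # e.g. (torch.Size([32, 3, 3, 3]), torch.Size([32, 1, 3, 3]))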
learn = Learner(dbunch,
                xresnet18(),
                torch.nn.CrossEntropyLoss(),
                metrics=[accuracy])
nchannels = dbunch.one_batch()[0].shape[1]
alter_learner(learn, nchannels)
learn.lr_find()
#epochs are a bit longer due to the chosen melspectrogram settings
learn.fit_one_cycle(10, lr_max=slice(1e-2))
(#5) [0,1.996047854423523,4.532247066497803,0.2330729216337204,00:12]
(#5) [1,0.8295692801475525,6.905653476715088,0.28125,00:11]
(#5) [2,0.4226909577846527,0.5031284689903259,0.84375,00:11]
(#5) [3,0.21792858839035034,0.38277533650398254,0.8841145634651184,00:11]
(#5) [4,0.14074330031871796,0.9695820808410645,0.765625,00:11]
(#5) [5,0.10544496029615402,0.5057219862937927,0.8619791865348816,00:11]
(#5) [6,0.07608238607645035,0.061512526124715805,0.9791666865348816,00:11]
(#5) [7,0.04908550903201103,0.03860814869403839,0.9921875,00:11]
(#5) [8,0.03291779011487961,0.0291567575186491,0.9934895634651184,00:11]
(#5) [9,0.02519594505429268,0.0238693505525589,0.9934895634651184,00:11]
len(files_250)
44655
for i in range(10):
    print(random.choice(files_250))
/home/jupyter/.fastai/data/250_speakers/250-speakers/id09051/jc-o8Ra0WPg/00314.wav
/home/jupyter/.fastai/data/250_speakers/250-speakers/id09269/qua5gFTUPvU/00317.wav
/home/jupyter/.fastai/data/250_speakers/250-speakers/id09190/fMiIZC8EFsI/00118.wav
/home/jupyter/.fastai/data/250_speakers/250-speakers/id09269/uti81NExz4E/00345.wav
/home/jupyter/.fastai/data/250_speakers/250-speakers/id09215/yHiDOytgM70/00053.wav
/home/jupyter/.fastai/data/250_speakers/250-speakers/id09065/aL-u-4exX6M/00031.wav
/home/jupyter/.fastai/data/250_speakers/250-speakers/id09033/JsuM8CLM6WA/00185.wav
/home/jupyter/.fastai/data/250_speakers/250-speakers/id09225/sdnCDYiQUBg/00366.wav
/home/jupyter/.fastai/data/250_speakers/250-speakers/id09160/n4jJCVbK5-I/00073.wav
/home/jupyter/.fastai/data/250_speakers/250-speakers/id09269/l_dPwDhJGRY/00247.wav
get_250speakers_label = lambda x: str(x).split('/')[-3][3:]
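# e.g. for '.../250-speakers/id09051/jc-o8Ra0WPg/00314.wav', split('/')[-3] gives the speaker
# folder 'id09051', and [3:] drops the leading 'id0', leaving '9051'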
for i in range(10):
    f = random.choice(files_250)
    print("File:", f)
    print("Label:", get_250speakers_label(f))
File: /home/jupyter/.fastai/data/250_speakers/250-speakers/id09071/evswEAQC1w0/00234.wav
Label: 9071
File: /home/jupyter/.fastai/data/250_speakers/250-speakers/id09049/a1LSgSTO-lQ/00400.wav
Label: 9049
File: /home/jupyter/.fastai/data/250_speakers/250-speakers/id09269/1-ycYhStJ4U/00073.wav
Label: 9269
File: /home/jupyter/.fastai/data/250_speakers/250-speakers/id08983/P3M8OQZEPUQ/00221.wav
Label: 8983
File: /home/jupyter/.fastai/data/250_speakers/250-speakers/id09069/NfSFfEgKABg/00059.wav
Label: 9069
File: /home/jupyter/.fastai/data/250_speakers/250-speakers/id08902/_SVTdgG6oaQ/00165.wav
Label: 8902
File: /home/jupyter/.fastai/data/250_speakers/250-speakers/id08999/_r2RfU9wZfM/00270.wav
Label: 8999
File: /home/jupyter/.fastai/data/250_speakers/250-speakers/id08937/vKLxqqt7hfw/00429.wav
Label: 8937
File: /home/jupyter/.fastai/data/250_speakers/250-speakers/id09171/VobJGb4r1nA/00306.wav
Label: 9171
File: /home/jupyter/.fastai/data/250_speakers/250-speakers/id09052/FGBRDssRvs4/00161.wav
Label: 9052
auds = DataBlock(blocks=(AudioBlock, CategoryBlock),
                 get_items=get_audio_files,
                 splitter=RandomSplitter(),
                 get_y=get_250speakers_label)
dbunch250 = auds.databunch(p250speakers, item_tfms=tfms, bs=64)
from fastprogress import progress_bar as pb
cats = [y for _,y in pb(auds.datasource(p250speakers))]
#verify categories are being correctly assigned for 250 speakers
test_eq(min(cats).item(), 0)
test_eq(max(cats).item(), 249)
# Use torchaudio default MelSpectrogram to get a baseline
a2s = AudioToSpec()
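# with no config passed, this presumably falls back to torchaudio's MelSpectrogram defaults
# (n_fft=400, hop_length=200, n_mels=128)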
crop_4000ms = CropSignal(4000)
tfms = [crop_4000ms, a2s]
dbunch = auds.databunch(p250speakers, item_tfms=tfms, bs=256)
learn = Learner(dbunch,
                xresnet18(),
                torch.nn.CrossEntropyLoss(),
                metrics=[accuracy])
nchannels = dbunch.one_batch()[0].shape[1]
alter_learner(learn, nchannels)
learn.lr_find()
learn.fit_one_cycle(5, lr_max=slice(2e-2))
(#5) [0,3.120875597000122,4.190258502960205,0.21565334498882294,01:02]
(#5) [1,1.2305573225021362,2.119953155517578,0.5156197547912598,01:02]
(#5) [2,0.6071012616157532,0.8982534408569336,0.7704624533653259,01:03]
(#5) [3,0.2425542175769806,0.37595334649086,0.9043779969215393,01:03]
(#5) [4,0.09204515814781189,0.29298263788223267,0.9297950863838196,01:03]
learn.lr_find()
learn.unfreeze()
learn.fit_one_cycle(5, lr_max=slice(1e-3))
(#5) [0,0.06576913595199585,0.2908884286880493,0.930578887462616,01:04]
(#5) [1,0.06111183762550354,0.28608283400535583,0.9310267567634583,01:03]
(#5) [2,0.0446171835064888,0.26805636286735535,0.9380808472633362,01:03]
(#5) [3,0.035814180970191956,0.2634187936782837,0.9383047819137573,01:03]
(#5) [4,0.029347669333219528,0.26514604687690735,0.9392005205154419,01:04]
voice_cfg = AudioConfig.Voice()
a2s = AudioToSpec.from_cfg(voice_cfg)
tfms = [crop_4000ms, a2s]
# tfms = Pipeline([CropSignal(4000), a2s, MaskFreq(size=12), MaskTime(size=15), SGRoll()], as_item=True)
dbunch = auds.databunch(p250speakers, item_tfms=tfms, bs=128)
learn = Learner(dbunch,
                xresnet18(),
                torch.nn.CrossEntropyLoss(),
                metrics=[accuracy])
nchannels = dbunch.one_batch()[0].shape[1]
alter_learner(learn, nchannels)
learn.lr_find()
# Better results even without fine-tuning, but much slower. We need to move a2s to the GPU and
# then add data augmentation!
learn.fit_one_cycle(5, lr_max=slice(2e-2))
(#5) [0,2.5127463340759277,5.194817543029785,0.195722758769989,03:54]
(#5) [1,1.0193027257919312,1.549010992050171,0.6296047568321228,03:52]
(#5) [2,0.48157766461372375,0.749039351940155,0.8093158602714539,03:51]
(#5) [3,0.1896345317363739,0.300977885723114,0.9261000752449036,03:51]
(#5) [4,0.0649874284863472,0.22395987808704376,0.9474862813949585,03:52]
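# a sketch of the augmented pipeline hinted at in the commented-out line above; MaskFreq,
# MaskTime and SGRoll are the library's SpecAugment-style transforms, and these sizes are
# illustrative rather than tuned:
# tfms = [crop_4000ms, a2s, MaskFreq(size=12), MaskTime(size=15), SGRoll()]
# dbunch = auds.databunch(p250speakers, item_tfms=tfms, bs=128)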
# only grab 1500ms of each clip; speaker identity can be determined from shorter sections, and it speeds things up
# this is really slow for MFCC, even for ~45k files; need to figure out what's going on here. Also the results
# shouldn't be this much worse than melspectrogram
a2mfcc = AudioToMFCC(n_mffc=20, melkwargs={"n_fft":2048, "hop_length":256, "n_mels":128}) # bug: kwarg should be n_mfcc; the typo means torchaudio's default (40) is used
tfms = [CropSignal(1500), a2mfcc, Delta()]
# tfms = Pipeline([CropSignal(4000), a2s, MaskFreq(size=12), MaskTime(size=15), SGRoll()], as_item=True)
dbunch = auds.databunch(p250speakers, item_tfms=tfms, bs=1024)
#n_mfcc isn't getting passed down: the kwarg above is misspelled (n_mffc), so the default of 40 shows up in the shape below
dbunch.one_batch()[0].shape
torch.Size([1024, 3, 40, 94])
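# (batch, channels, coeffs, frames): Delta() presumably stacks the MFCCs with their first and
# second derivatives, giving 3 channels; 40 coefficients is torchaudio's default n_mfcc (see the
# typo note above); 94 frames matches 1500ms at a 256-sample hop, assuming 16kHz audio
# (16000 * 1.5 / 256 ≈ 94)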
learn = Learner(dbunch,
                xresnet18(),
                torch.nn.CrossEntropyLoss(),
                metrics=[accuracy])
learn.lr_find()
learn.fit_one_cycle(10, lr_max=slice(2e-2))
(#5) [0,4.673135757446289,5.774165630340576,0.08151382952928543,01:17]
(#5) [1,3.332381248474121,7.908496379852295,0.07625126093626022,01:12]
(#5) [2,2.4578614234924316,2.660874605178833,0.41025641560554504,01:12]
(#5) [3,1.832728624343872,1.9707183837890625,0.5239055156707764,01:13]
(#5) [4,1.3838961124420166,1.9342174530029297,0.5400291085243225,01:12]
(#5) [5,1.0394622087478638,1.495416283607483,0.6420333385467529,01:13]
(#5) [6,0.7791723012924194,0.8641145825386047,0.7863621115684509,01:12]
(#5) [7,0.5757192373275757,0.6926474571228027,0.8310379385948181,01:12]
(#5) [8,0.4234640896320343,0.5842545628547668,0.861045777797699,01:12]
(#5) [9,0.32844018936157227,0.5443729758262634,0.8720188140869141,01:12]
learn.unfreeze()
learn.lr_find()
learn.fit_one_cycle(7, lr_max=slice(3e-3, 4e-3))
(#5) [0,0.23045498132705688,0.6166905760765076,0.8526480793952942,01:12]
(#5) [1,0.2589111030101776,0.7880717515945435,0.8061807155609131,01:13]
(#5) [2,0.2605298161506653,0.723810613155365,0.8249915838241577,01:12]
(#5) [3,0.23205164074897766,0.6235433220863342,0.8501847386360168,01:12]
(#5) [4,0.19764986634254456,0.5251900553703308,0.8772813677787781,01:13]
(#5) [5,0.16184552013874054,0.4946998059749603,0.8892621397972107,01:12]
(#5) [6,0.1400141716003418,0.48173633217811584,0.889710009098053,01:12]