PyTorch VGGish¶

In [ ]:

%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [ ]:

import os

In [ ]:

from local.torch_basics import *
from local.test import *
from local.basics import *
from local.data.all import *
from local.vision.core import *
from local.notebook.showdoc import show_doc
from local.audio.core import *
from local.audio.augment import *
from local.vision.learner import *
from local.vision.models.xresnet import *
from local.metrics import *
from local.callback.schedule import *
import torchaudio
from IPython.display import Audio, display

In [ ]:

#def arc_extract(fname, dest): Archive(fname).extractall(dest)
URLs.ESC50 = 'https://github.com/karoldvl/ESC-50/archive/master.zip'
pESC50 = Config()['data_path'] / 'ESC-50/ESC-50-master'
PATH_AUDIO = pESC50/"audio"
PATH_CSV  = pESC50/"meta/esc50.csv"
DF = pd.read_csv(PATH_CSV)
#untar_data(URLs.ESC50, fname=str(pESC50)+'.zip', dest=pESC50, extract_func=arc_extract)

In [ ]:

x = AudioGetter("", recurse=True, folders=None)
files_ESC50 = x(pESC50)
#original_aud = AudioItem.create(files[0])

In [ ]:

DF.head()

Out[ ]:

	filename	fold	target	category	esc10	src_file	take
0	1-100032-A-0.wav	1	0	dog	True	100032	A
1	1-100038-A-14.wav	1	14	chirping_birds	False	100038	A
2	1-100210-A-36.wav	1	36	vacuum_cleaner	False	100210	A
3	1-100210-B-36.wav	1	36	vacuum_cleaner	False	100210	B
4	1-101296-A-19.wav	1	19	thunderstorm	False	101296	A

In [ ]:

ESC_10 = DF[DF["esc10"] == True]["filename"].values.tolist()

In [ ]:

files_ESC50[0]

Out[ ]:

PosixPath('/home/jupyter/.fastai/data/ESC-50/ESC-50-master/audio/1-60676-A-34.wav')

In [ ]:

files_ESC10 = [f for f in files_ESC50 if str(f).split('/')[-1] in ESC_10]

In [ ]:

len(files_ESC10)

Out[ ]:

In [ ]:

#! pip install -i https://test.pypi.org/simple/ torchvggish==0.1

In [ ]:

from torchvggish import vggish, vggish_input

In [ ]:

# Initialise model and download weights
embedding_model = vggish()
embedding_model.eval()

Out[ ]:

VGG(
  (features): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU(inplace=True)
    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): ReLU(inplace=True)
    (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (embeddings): Sequential(
    (0): Linear(in_features=12288, out_features=4096, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=4096, out_features=4096, bias=True)
    (3): ReLU(inplace=True)
    (4): Linear(in_features=4096, out_features=128, bias=True)
    (5): ReLU(inplace=True)
  )
)

In [ ]:

path_example = "/home/jupyter/rob/pytorch_videos/RNN-walkthrough/audio_sample/album.wav"

In [ ]:

example = vggish_input.wavfile_to_examples(files_ESC10[0]).detach()

In [ ]:

example.shape

Out[ ]:

torch.Size([5, 1, 96, 64])

In [ ]:

def get_embedding(p):
    example = vggish_input.wavfile_to_examples(p)
    embedding = embedding_model.forward(example)
    return embedding

In [ ]:

def get_embedding_batch(paths):
    pass

In [ ]:

def get_esc_classes(files):
    return list({get_esc_label(f) for f in files})

def get_esc_label(f):
    return str(f).split('/')[-1].split('.')[0].split('-')[-1]

def get_esc_fold(f):
    return str(f).split('/')[-1].split('-')[0]

def i2o_esc(files):
    return dict(enumerate(get_esc_classes(files)))

def o2i_esc(files):
    return {o:i for i,o in i2o_esc(files).items()}

def get_esc_embedding_data(files, valid_fold):
    i2o_dict = i2o_esc(files)
    o2i_dict = o2i_esc(files)
    x_train = torch.stack([get_embedding(f).detach() for f in files if get_esc_fold(f) != str(valid_fold)], dim=0)
    x_valid = torch.stack([get_embedding(f).detach() for f in files if get_esc_fold(f) == str(valid_fold)], dim=0)
    y_train = torch.tensor([o2i_dict[get_esc_label(f)] for f in files if get_esc_fold(f) != str(valid_fold)]) 
    y_valid = torch.tensor([o2i_dict[get_esc_label(f)] for f in files if get_esc_fold(f) == str(valid_fold)])
    return x_train, y_train, x_valid, y_valid

ESC-10 train and fit¶

In [ ]:

x_train, y_train, x_valid, y_valid = get_esc_embedding_data(files_ESC10, 5)

In [ ]:

x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

Out[ ]:

(torch.Size([320, 5, 128]),
 torch.Size([80, 5, 128]),
 torch.Size([320]),
 torch.Size([80]))

In [ ]:

x_train = x_train.reshape(x_train.shape[0], -1)
x_valid = x_valid.reshape(x_valid.shape[0], -1)

In [ ]:

x_train.shape, y_train.shape

Out[ ]:

(torch.Size([320, 640]), torch.Size([320]))

In [ ]:

from sklearn.linear_model import RidgeClassifierCV
classifier = RidgeClassifierCV(alphas=np.logspace(-8, 8, 17), normalize=True)

In [ ]:

classifier.fit(x_train, y_train);

In [ ]:

classifier.score(x_valid, y_valid)

Out[ ]:

0.8375

What if, instead of flattening, we mean across channel to keep embedding context¶

In [ ]:

x_train, y_train, x_valid, y_valid = get_esc_embedding_data(files_ESC10, 5)

In [ ]:

x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

Out[ ]:

(torch.Size([320, 5, 128]),
 torch.Size([80, 5, 128]),
 torch.Size([320]),
 torch.Size([80]))

In [ ]:

x_train.mean(dim=1).shape

Out[ ]:

torch.Size([320, 128])

In [ ]:

x_train = x_train.mean(dim=1)
x_valid = x_valid.mean(dim=1)

In [ ]:

x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

Out[ ]:

(torch.Size([320, 128]),
 torch.Size([80, 128]),
 torch.Size([320]),
 torch.Size([80]))

In [ ]:

from sklearn.linear_model import RidgeClassifierCV
classifier = RidgeClassifierCV(alphas=np.logspace(-8, 8, 17), normalize=True)

In [ ]:

classifier.fit(x_train, y_train);

In [ ]:

classifier.score(x_valid, y_valid)

Out[ ]:

0.875

ESC50 train and fit¶

In [ ]:

x_train, y_train, x_valid, y_valid = get_esc_embedding_data(files_ESC50, 5)

In [ ]:

x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

Out[ ]:

(torch.Size([1600, 5, 128]),
 torch.Size([400, 5, 128]),
 torch.Size([1600]),
 torch.Size([400]))

In [ ]:

x_train = x_train.reshape(x_train.shape[0], -1)
x_valid = x_valid.reshape(x_valid.shape[0], -1)

In [ ]:

classifier = RidgeClassifierCV(alphas=np.logspace(-8, 8, 17), normalize=True)

In [ ]:

classifier.fit(x_train, y_train);

In [ ]:

classifier.score(x_valid, y_valid)

Out[ ]:

0.6225

Train with channel mean instead¶

In [ ]:

x_train, y_train, x_valid, y_valid = get_esc_embedding_data(files_ESC50, 5)

In [ ]:

x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

Out[ ]:

(torch.Size([1600, 5, 128]),
 torch.Size([400, 5, 128]),
 torch.Size([1600]),
 torch.Size([400]))

In [ ]:

x_train = x_train.mean(dim=1)
x_valid = x_valid.mean(dim=1)

In [ ]:

x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

Out[ ]:

(torch.Size([1600, 128]),
 torch.Size([400, 128]),
 torch.Size([1600]),
 torch.Size([400]))

In [ ]:

classifier = RidgeClassifierCV(alphas=np.logspace(-8, 8, 17), normalize=True)

In [ ]:

classifier.fit(x_train, y_train);

In [ ]:

classifier.score(x_valid, y_valid)

Out[ ]:

0.6025

Let's try data augmentation¶

In [ ]:

#waveform_to_examples(data, sample_rate)+