In [ ]:
#default_exp audio.augment
#default_cls_lvl 3
In [ ]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

Data Augmentation for Audio

Transforms to apply data augmentation to AudioSpectrograms and audio signals

In [ ]:
#export
from local.torch_basics import *
from local.test import *
from local.data.all import *
from local.vision.all import *
from local.notebook.showdoc import show_doc
from local.audio.core import *
from local.learner import *
from local.vision.models.xresnet import *
from local.metrics import *
from local.basics import *
from local.callback.all import *
In [ ]:
# export
import torch.nn
from torch import stack, zeros_like as t0, ones_like as t1
from torch.distributions.bernoulli import Bernoulli
from librosa.effects import split
from dataclasses import asdict
from scipy.signal import resample_poly
from scipy.ndimage.interpolation import shift
import librosa
import colorednoise as cn
In [ ]:
##export
#_all_ = ['AudioGetter', 'get_audio_files', 'AudioItem', 'OpenAudio', 'AudioSpectrogram', 'AudioToSpec',
 #       'SpectrogramConfig', 'AudioConfig', 'audio_extensions']

Setup Examples

In [ ]:
p = Config()['data_path'] / 'ST-AEDS-20180100_1-OS'
untar_data(URLs.SPEAKERS10, fname=str(p)+'.tar', dest=p)
x = AudioGetter("", recurse=True, folders=None)
files = x(p)
In [ ]:
# files will load differently on different machines so we specify examples by name
ex_files = [p/f for f in ['m0005_us_m0005_00218.wav',
                          'f0003_us_f0003_00279.wav',
                          'f0001_us_f0001_00168.wav',
                          'f0005_us_f0005_00286.wav']]
In [ ]:
audio_orig = AudioItem.create(ex_files[0])
a2s = AudioToSpec(n_fft=1024, hop_length=256)
sg_orig = a2s(audio_orig)
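
With hop_length=256, the spectrogram's time dimension should come out to roughly nsamples/256 frames (the exact count depends on AudioToSpec's padding and centering); a quick look:

In [ ]:
# rough sanity check: time frames should be close to nsamples // hop_length
print(sg_orig.shape, audio_orig.nsamples // 256)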
In [ ]:
# get 3 equal-length portions of 3 different signals so we can stack them
# for a fake multichannel example
ai0, ai1, ai2 = map(AudioItem.create, ex_files[1:4])
min_samples = min(ai0.nsamples, ai1.nsamples, ai2.nsamples)
s0, s1, s2 = map(lambda x: x[:,:min_samples], (ai0.sig, ai1.sig, ai2.sig))
fake_multichannel = AudioItem((torch.stack((s0, s1, s2), dim=1).squeeze(0), 16000, None))
sg_multi = a2s(fake_multichannel)
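
Assuming each source clip loads as a mono [1, n] tensor, the stacked signal should come out as 3 channels by min_samples samples; a quick check:

In [ ]:
# the fake multichannel signal should have one row per stacked channel
test_eq(fake_multichannel.sig.shape, torch.Size([3, min_samples]))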

Preprocessing Functions

TO-DO:
1. Add in longer clips (whale) and do more extensive testing; the current clip only lets us test Trim, not All or Split.

Remove Silence

In [ ]:
#export
mk_class('RemoveType', **{o:o.lower() for o in ['Trim', 'All', 'Split']},
         doc="All methods of removing silence as attributes to get tab-completion and typo-proofing")
In [ ]:
#export
def _merge_splits(splits, pad):
    "Merge adjacent splits whose padded boundaries overlap (i.e. gaps of at most 2*pad samples)"
    clip_end = splits[-1][1]
    merged = []
    i = 0
    while i < len(splits):
        start = splits[i][0]
        # absorb any following splits that fall within 2*pad samples of this one
        while splits[i][1] < clip_end and splits[i][1] + pad >= splits[i+1][0] - pad:
            i += 1
        end = splits[i][1]
        merged.append(np.array([max(start-pad, 0), min(end+pad, clip_end)]))
        i += 1
    return np.stack(merged)

def RemoveSilence(remove_type=RemoveType.Trim, threshold=20, pad_ms=20):
    def _inner(ai:AudioItem)->AudioItem:
        "Remove sections quieter than `threshold` dB, keeping `pad_ms` of padding around the non-silent sections"
        if remove_type is None: return ai
        padding = int(pad_ms/1000*ai.sr)
        if padding > ai.nsamples: return ai
        actual = ai.sig.clone()
        splits = split(actual.numpy(), top_db=threshold, hop_length=padding)
        if remove_type == "split":
            # one clip per non-silent section
            sig = [actual[:,(max(a-padding,0)):(min(b+padding,ai.nsamples))]
                   for (a, b) in _merge_splits(splits, padding)]
        elif remove_type == "trim":
            # drop leading and trailing silence only
            sig = [actual[:,(max(splits[0, 0]-padding,0)):splits[-1, -1]+padding]]
        elif remove_type == "all":
            # drop every silent section and concatenate what remains
            sig = [torch.cat([actual[:,(max(a-padding,0)):(min(b+padding,ai.nsamples))]
                              for (a, b) in _merge_splits(splits, padding)], dim=1)]
        else:
            raise ValueError(f"Valid options for silence removal are None, 'split', 'trim', and 'all', not '{remove_type}'.")
        return AudioItem((*sig, ai.sr, ai.path))
    return _inner
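
A small worked example of how `_merge_splits` behaves on toy data: with pad=50, adjacent sections separated by at most 2*pad samples are merged into one, and every kept section is padded outward by 50 samples (clipped to the ends of the signal).

In [ ]:
# toy splits (start, end) in samples; the first two are close enough to merge with pad=50
toy_splits = np.array([[0, 100], [150, 300], [1000, 1200]])
merged = _merge_splits(toy_splits, pad=50)
assert (merged == np.array([[0, 350], [950, 1200]])).all()
merged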

Trim Silence

In [ ]:
silence_audio = RemoveSilence(threshold=20, pad_ms=20)(audio_orig)
audio_orig.show()
silence_audio.show()
# test that at least a half second (8000 samples at 16kHz) of silence is being removed
test(silence_audio.nsamples + 8000, audio_orig.nsamples, operator.le)
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
In [ ]:
# test that nothing is removed from audio that doesn't contain silence
test_aud = AudioItem((torch.rand_like(audio_orig.sig), 16000, None))
print("Random Noise, no silence")
test_aud.hear()
for rm_type in [RemoveType.All, RemoveType.Trim, RemoveType.Split]:
    silence_audio_trim = RemoveSilence(rm_type, threshold=20, pad_ms=20)(test_aud)
    test_eq(test_aud.nsamples, silence_audio_trim.nsamples)
Random Noise, no silence
In [ ]:
# trim silence from a multichannel clip; needs more extensive testing
silence_mc = RemoveSilence(threshold=20, pad_ms=20)(fake_multichannel)
print(silence_mc.sig.shape)  # still 3 channels
fake_multichannel.hear()
silence_mc.hear()
torch.Size([3, 40640])