#default_exp audio.core
#default_cls_lvl 3
%reload_ext autoreload
%autoreload 2
%matplotlib inline
#export
from local.torch_basics import *
from local.test import *
from local.data.all import *
from local.notebook.showdoc import show_doc
import torchaudio
import torchaudio.transforms as torchaud_tfm
import warnings
from IPython.display import display, Audio
from dataclasses import dataclass, asdict, is_dataclass, make_dataclass
from torchaudio.transforms import Spectrogram, AmplitudeToDB, MFCC
from librosa.display import specshow, waveplot
#export
_all_ = ['AudioGetter', 'get_audio_files', 'AudioItem', 'OpenAudio', 'AudioSpectrogram', 'AudioToSpec',
'SpectrogramConfig', 'AudioConfig', 'audio_extensions']
This section regroups the basic types used in audio along with the transforms that create objects of those types.
#export
audio_extensions = tuple(str.lower(k) for k, v in mimetypes.types_map.items() if v.startswith('audio/'))
#export
def get_audio_files(path, recurse=True, folders=None):
"Get image files in `path` recursively, only in `folders`, if specified."
return get_files(path, extensions=audio_extensions, recurse=recurse, folders=folders)
#export
def AudioGetter(suf='', recurse=True, folders=None):
"Create `get_image_files` partial function that searches path suffix `suf` and passes along `kwargs`, only in `folders`, if specified."
def _inner(o, recurse=recurse, folders=folders):
return get_audio_files(o/suf, recurse, folders)
return _inner
#export
URLs.SPEAKERS10 = 'http://www.openslr.org/resources/45/ST-AEDS-20180100_1-OS'
URLs.SPEAKERS250 = 'https://public-datasets.fra1.digitaloceanspaces.com/250-speakers.tar'
URLs.ESC50 = 'https://github.com/karoldvl/ESC-50/archive/master.zip'
p = Config()['data_path'] / 'ST-AEDS-20180100_1-OS'
untar_data(URLs.SPEAKERS10, fname=str(p)+'.tar', dest=p)
PosixPath('/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/ST-AEDS-20180100_1-OS')
p.ls()
(#3843) [/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0004_us_f0004_00446.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0002_us_m0002_00128.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0003_us_f0003_00279.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0001_us_f0001_00168.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0005_us_f0005_00286.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00282.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0005_us_f0005_00432.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0005_us_f0005_00054.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0004_us_m0004_00110.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0003_us_m0003_00180.wav...]
audio_get_func = AudioGetter("", recurse=True, folders=None)
files = audio_get_func(p)
#files will load differently on different machines so we specify examples by name
ex_files = [p/f for f in ['m0005_us_m0005_00218.wav',
'f0003_us_f0003_00279.wav',
'f0001_us_f0001_00168.wav',
'f0005_us_f0005_00286.wav',]]
#export
class AudioItem(tuple):
def show(self, ctx=None, **kwargs):
"Show image using `merge(self._show_args, kwargs)`"
print(f"File: {self.path}")
self.hear()
show_audio_signal(self, ctx=ctx, **kwargs)
plt.show()
@classmethod
def create(cls, fn, **kwargs):
sig, sr = torchaudio.load(fn)
return cls((sig, sr, fn))
sig, sr, path = add_props(lambda i, self: self[i], n=3)
nchannels, nsamples = add_props(lambda i, self: self.sig.shape[i])
@property
def duration(self): return self.nsamples/float(self.sr)
def hear(self):
display(Audio(self.sig, rate=self.sr))
#export
def show_audio_signal(ai, ctx, **kwargs):
if(ai.nchannels > 1):
_,axs = plt.subplots(ai.nchannels, 1, figsize=(6,4*ai.nchannels))
for i,channel in enumerate(ai.sig):
waveplot(channel.numpy(), ai.sr, ax=axs[i], **kwargs)
else:
axs = plt.subplots(ai.nchannels, 1)[1] if ctx is None else ctx
waveplot(ai.sig.squeeze(0).numpy(), ai.sr, ax=axs, **kwargs)
type(AudioItem((None, None, ex_files[0])))
__main__.AudioItem
item0 = AudioItem.create(ex_files[0])
item0.sig.shape
torch.Size([1, 58240])
item0.sr, item0.nchannels, item0.nsamples, item0.duration
(16000, 1, 58240, 3.64)
test_eq(type(item0.sig), torch.Tensor)
test_eq(item0.sr, 16000)
test_eq(item0.nchannels, 1)
test_eq(item0.nsamples, 58240)
test_eq(item0.duration, 3.64)
item0[0]
tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -9.1553e-05, -6.1035e-05, 0.0000e+00]])
item0.show()
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
item1 = AudioItem.create(files[1]);
item0.show()
item1.show()
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0002_us_m0002_00128.wav
#get 3 equal length portions of 3 different signals so we can stack them
#for a fake multichannel example
ai0, ai1, ai2 = map(AudioItem.create, ex_files[1:4]);
min_samples = min(ai0.nsamples, ai1.nsamples, ai2.nsamples)
s0, s1, s2 = map(lambda x: x[:,:min_samples], (ai0.sig, ai1.sig, ai2.sig))
test_eq(s0.shape, s1.shape)
test_eq(s1.shape, s2.shape)
fake_multichannel = AudioItem((torch.stack((s0, s1, s2), dim=1).squeeze(0), 16000, None))
test_eq(fake_multichannel.nchannels, 3)
test_eq(fake_multichannel.nsamples, 53760)
fake_multichannel.show()
File: None
#export
class OpenAudio(Transform):
def __init__(self, items):
self.items = items
def encodes(self, i):
o = self.items[i]
return AudioItem.create(o)
def decodes(self, i)->Path:
return self.items[i]
The repr of a Transform is:
classname: self.use_as_item {self.encodes} {self.decodes}
`encodes` and `decodes` are `TypeDispatch` objects whose reprs are the string of a dict, where each key/value pair is a type name and the function that handles that type.
oa = OpenAudio(files); oa
OpenAudio: True (object,object) -> encodes (object,object) -> decodes
#demonstrate functionality of OpenAudio.encodes, the rest of the nb will
#use files that are opened by name for reproducibility/testing
oa = OpenAudio(files)
item100 = oa.encodes(100)
item100.show()
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0003_us_m0003_00340.wav
#test open audio on a random set of files
for i in range(10):
idx = random.randint(0, len(files))
test_eq_type(oa.encodes(idx), AudioItem.create(files[idx]))
test_eq_type(oa.decodes(idx), files[idx])
type(oa)
__main__.OpenAudio
oa.encodes(0)
(tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.5259e-04, -6.1035e-05, -1.8311e-04]]), 16000, PosixPath('/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0004_us_f0004_00446.wav'))
oa.decodes(0)
PosixPath('/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0004_us_f0004_00446.wav')
oa.items[0]
PosixPath('/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0004_us_f0004_00446.wav')
#export
_GenSpec = torchaudio.transforms.Spectrogram
_GenMelSpec = torchaudio.transforms.MelSpectrogram
_GenMFCC = torchaudio.transforms.MFCC
_ToDB = torchaudio.transforms.AmplitudeToDB
get_usable_kwargs
takes a function and a dictionary of kwargs that may or may not be relevant to that function, and returns that function's default values updated with whichever kwargs can actually be applied. This serves three purposes: first, it lets us combine multiple functions into a single AudioToSpec Transform while passing each one only the appropriate kwargs; second, it lets us keep a dictionary of the settings used to create the Spectrogram, which is sometimes needed for its display and cropping; and third, it lets us warn the user when they pass improper or unused kwargs. A sketch using MelSpectrogram follows the tests below.
#export
def get_usable_kwargs(func, kwargs, exclude):
exclude = ifnone(exclude, [])
defaults = {k:v.default for k, v in inspect.signature(func).parameters.items() if k not in exclude}
usable = {k:v for k,v in kwargs.items() if k in defaults}
return {**defaults, **usable}
# def add_func(func, kwargs):
# func_args = get_usable_kwargs(func, kwargs, [])
# return func(**func_args)
kwargs = {'a':1, 'b':2}
extra_kwargs = {'z':0, 'a':1, 'b':2, 'c':3}
test_eq(get_usable_kwargs(operator.add, kwargs, []), kwargs)
test_eq(get_usable_kwargs(operator.add, extra_kwargs, []), kwargs)
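As a more realistic hedged sketch, the same filtering applied to torchaudio's MelSpectrogram keeps known parameters such as n_mels and drops anything the constructor does not accept ('bogus_arg' here is just an illustrative name):
# filter kwargs against MelSpectrogram's signature: known args survive, unknown ones are dropped
mel_kwargs = get_usable_kwargs(_GenMelSpec, {'n_mels':64, 'bogus_arg':1}, [])
test_eq(mel_kwargs['n_mels'], 64)
assert 'bogus_arg' not in mel_kwargs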
#export
class AudioSpectrogram(TensorImageBase):
@classmethod
def create(cls, sg, settings=None):
x = cls(sg)
x.settings = settings
return x
@property
def duration(self):
# spectrograms round up their length to fill incomplete columns,
# so we subtract 0.5 columns to compensate; this won't be exact
return (self.hop_length*(self.shape[-1]-0.5))/self.sr
height, width = add_props(lambda i, self: self.shape[i+1], n=2)
#using the line below instead of above will fix show_batch but break multichannel/delta display
#nchannels, height, width = add_props(lambda i, self: self.shape[i], n=3)
def __getattr__(self, name):
if name == "settings": return None
if self.settings is not None and name in self.settings: return self.settings[name]
raise AttributeError(f"{self.__class__.__name__} object has no attribute {name}")
def show(self, ctx=None, ax=None, figsize=None, **kwargs):
show_spectrogram(self, ctx=ctx, ax=ax, figsize=figsize,**kwargs)
plt.show()
#export
def show_spectrogram(sg, ax, ctx, figsize, **kwargs):
ax = ifnone(ax,ctx)
nchannels = sg.nchannels
r, c = nchannels, sg.data.shape[0]//nchannels
proper_kwargs = get_usable_kwargs(specshow, sg.settings, exclude=["ax", "kwargs", "data",])
fname = str(sg.path).split('/')[-1] if sg.path is not None else "Unknown File"
if (r == 1 and c == 1):
_show_spectrogram(sg, ax, proper_kwargs, **kwargs)
plt.title(f"{fname}: Channel 0 Image 0")
else:
if figsize is None: figsize = (4*c, 3*r)
if ax is None: _,ax = plt.subplots(r, c, figsize=figsize)
for i, channel in enumerate(sg.data):
if r == 1:
cur_ax = ax[i%c]
elif c == 1:
cur_ax = ax[i%r]
else:
cur_ax = ax[i//c,i%c]
cur_ax.set_title(f"{fname}: Channel {i//c} Image {i%c}")
z = specshow(channel.numpy(), ax=cur_ax, **sg._show_args, **proper_kwargs)
#plt.colorbar(z, ax=cur_ax)
#ax=plt.gca() #get the current axes
#PCM=ax.get_children()[2] #get the mappable, the 1st and the 2nd are the x and y axes
#plt.colorbar(PCM, ax=ax, format='%+2.0f dB')
def _show_spectrogram(sg, ax, proper_kwargs, **kwargs):
if "mel" not in sg.settings: y_axis = None
else: y_axis = "mel" if sg.mel else "linear"
proper_kwargs.update({"x_axis":"time", "y_axis":y_axis,})
_ = specshow(sg.data.squeeze(0).numpy(), **sg._show_args, **proper_kwargs)
fmt = '%+2.0f dB' if "to_db" in sg.settings and sg.to_db else '%+2.0f'
plt.colorbar(format=fmt)
#export
@delegates(_GenSpec.__init__)
@delegates(_GenMelSpec.__init__, keep=True)
@delegates(_ToDB.__init__, keep=True)
class AudioToSpec(Transform):
def __init__(self, mel=True, to_db=True, **kwargs):
self._validate_kwargs(mel, to_db, kwargs)
transforms = L()
kwargs = self.add_local_defaults(dict(kwargs))
if mel: transforms += self.add_func(_GenMelSpec, kwargs)
else: transforms += self.add_func(_GenSpec, kwargs)
if to_db: transforms += self.add_func(_ToDB, kwargs)
#would it be better to use Pipeline here than nn.Sequential?
self.transformer = nn.Sequential(*transforms)
store_attr(self, 'to_db,mel')
self.__dict__.update(kwargs)
@classmethod
def from_cfg(cls, audio_cfg):
cfg = asdict(audio_cfg) if is_dataclass(audio_cfg) else audio_cfg
return cls(**cfg)
def encodes(self, x:AudioItem):
settings = dict(self.__dict__)
settings.update({'sr':x.sr, 'nchannels':x.nchannels, 'path':x.path})
return AudioSpectrogram.create(self.transformer(x.sig).flip(1).detach(), settings=settings)
def add_func(self, func, kwargs):
func_args = get_usable_kwargs(func, kwargs, [])
self.__dict__.update(func_args)
return func(**func_args)
# Torchaudio overrides None values internally for these arguments, so their logic is copied here for now
# so that the settings stored in the spectrogram accurately reflect what is happening.
# We also override torchaudio's default n_fft of 400 because it leaves mel bins empty when n_mels > 64
def add_local_defaults(self, kwargs):
if "n_fft" not in kwargs or kwargs["n_fft"] is None: kwargs["n_fft"] = 1024
if "win_length" not in kwargs or kwargs["win_length"] is None: kwargs["win_length"] = kwargs["n_fft"]
if "hop_length" not in kwargs or kwargs["hop_length"] is None: kwargs["hop_length"] = int(kwargs["win_length"]/2)
return kwargs
@staticmethod
def _validate_kwargs(mel, to_db, kwargs):
funcs = [_GenMelSpec, _GenSpec, _ToDB]
all_args = set().union(*map(lambda x: set(inspect.signature(x).parameters.keys()), funcs))
for k, v in kwargs.items():
if k not in all_args:
warnings.warn(f"{k} is not a valid arg name, usable kwargs are {all_args}")
if mel: AudioToSpec._warn_kwargs(_GenMelSpec, _GenSpec, kwargs)
else : AudioToSpec._warn_kwargs(_GenSpec, _GenMelSpec, kwargs)
if not to_db: AudioToSpec._warn_kwargs(noop, _ToDB, kwargs)
@staticmethod
def _warn_kwargs(used, unused, kwargs):
def get_bad_args(f1, f2):
a1, a2 = map(lambda x: set(inspect.signature(x).parameters.keys()), (f1, f2))
return a2 - a1
bad_args = get_bad_args(used, unused)
for k, v in kwargs.items():
if(k in bad_args):
warnings.warn(f"{k} passed in but unused, your settings use {used} not {unused}")
# get a sg with weird settings for testing
a2s = AudioToSpec(f_max=20000, n_mels=137)
sg = a2s(item0)
sg2 = a2s(item100)
sg_mc = a2s(fake_multichannel)
sg.show()
sg.show()
sg2.show()
sg_mc.show()
sg.settings
{'sample_rate': 16000, 'n_fft': 1024, 'win_length': 1024, 'hop_length': 512, 'f_min': 0.0, 'f_max': 20000, 'pad': 0, 'n_mels': 137, 'window_fn': <function _VariableFunctions.hann_window>, 'wkwargs': None, 'stype': 'power', 'top_db': None, 'transformer': Sequential( (0): MelSpectrogram( (spectrogram): Spectrogram() (mel_scale): MelScale() ) (1): AmplitudeToDB() ), 'to_db': True, 'mel': True, 'sr': 16000, 'nchannels': 1, 'path': PosixPath('/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav')}
sg.nchannels, sg.height, sg.width
(1, 137, 114)
#test the explicit settings were properly stored in the spectrogram object and can be accessed as attributes
test_eq(sg.f_max, 20000)
test_eq(sg.hop_length, 512)
test_eq(sg.sr, item100.sr)
test_eq(sg.mel, True)
test_eq(sg.to_db, True)
test_eq(sg.nchannels, 1)
test_eq(sg.height, 137)
test_eq(sg.n_mels, sg.height)
test_eq(sg.width, 114)
defaults = {k:v.default for k, v in inspect.signature(_GenMelSpec).parameters.items()}
a2s = AudioToSpec(f_max=20000, hop_length = 345)
sg = a2s(item100)
test_eq(sg.n_mels, defaults["n_mels"])
test_eq(sg.n_fft , 1024)
test_eq(sg.shape[1], sg.n_mels)
test_eq(sg.hop_length, 345)
# test the spectrogram and audio have same duration, both are computed
# on the fly as transforms can change their duration
test_close(sg.duration, item100.duration, eps=0.1)
SHOW_W=False
#test warning for unused argument 'power' for melspec
#tests AudioToSpec and its from_cfg class method
voice_mel_cfg = {'n_fft':2560, 'f_max':22050., 'n_mels':128, 'hop_length':256, 'power':2}
test_warns(lambda: AudioToSpec(**voice_mel_cfg), show=SHOW_W)
test_warns(lambda: AudioToSpec.from_cfg(voice_mel_cfg), show=SHOW_W)
#test for unused arguments 'f_max' and 'n_mels' for non-mel Spectrogram
voice_mel_cfg = {'f_max':22050., 'n_mels':128, 'n_fft':2560, 'hop_length':256, 'power':2}
test_warns(lambda: AudioToSpec(mel=False, **voice_mel_cfg), show=SHOW_W)
#test warning for unused argument 'top_db' when db conversion not done
voice_mel_cfg = {'top_db':20, 'n_fft':2560, 'f_max':22050., 'n_mels':128, 'hop_length':256}
test_warns(lambda: AudioToSpec(to_db=False, **voice_mel_cfg), show=SHOW_W)
#test warning for invalid argument 'doesntexist'
voice_mel_cfg = {'doesntexist':True, 'n_fft':2560, 'f_max':22050., 'n_mels':128, 'hop_length':256}
test_warns(lambda: AudioToSpec(to_db=False, **voice_mel_cfg), show=SHOW_W)
a_to_db_mel = AudioToSpec()
a_to_nondb_mel = AudioToSpec(to_db=False)
a_to_db_nonmel = AudioToSpec(mel=False)
a_to_nondb_non_mel = AudioToSpec(mel=False, to_db=False)
a_to_db_mel_hyperparams = AudioToSpec(n_fft=8192, hop_length=128)
%%timeit -n10
a_to_db_mel(item0)
1.73 ms ± 1.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
a_to_nondb_mel(item0)
1.54 ms ± 963 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
a_to_db_nonmel(item0)
1.69 ms ± 235 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
a_to_nondb_non_mel(item0)
1.38 ms ± 215 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
# Time grows as a function of n_fft and hop_length. n_fft is best kept to a power of two;
# a smaller hop_length means more time because there are more chunks to run STFTs on
a_to_db_mel_hyperparams(item0)
14.7 ms ± 1.75 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
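As the comments above note, a smaller hop_length means more STFT frames to compute. A rough, hedged back-of-the-envelope sketch of the frame counts for item0 (assuming the center-padded STFT torchaudio uses, which adds one extra frame):
# approximate spectrogram width (number of STFT frames) for a few hop lengths
print("hop_length=512:", item0.nsamples//512 + 1, "frames")
print("hop_length=256:", item0.nsamples//256 + 1, "frames")
print("hop_length=128:", item0.nsamples//128 + 1, "frames")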
import time
def time_variable_length_audios(f, max_seconds=30, sr=16000, channels=1):
times = []
audios = [AudioItem((torch.randn(channels, sr*i), sr, None)) for i in range(1,max_seconds+1,2)]
for a in audios:
start = time.time()
out = f(a)
end = time.time()
times.append(round(1000*(end-start), 2))
return times
%%time
a2s = AudioToSpec()
max_seconds = 180
times_mono = time_variable_length_audios(f=a2s, max_seconds=max_seconds)
times_stereo = time_variable_length_audios(f=a2s, max_seconds=max_seconds, channels=2)
plt.plot(np.arange(0,max_seconds,2), times_mono, label="mono")
plt.plot(np.arange(0,max_seconds,2), times_stereo, label="stereo")
plt.legend(['mono','stereo'])
plt.title("Time Taken by AudioToSpec")
plt.xlabel("Audio Duration in Seconds")
plt.ylabel("Processing Time in ms")
CPU times: user 25.5 s, sys: 4.89 s, total: 30.4 s Wall time: 9.69 s
Text(0, 0.5, 'Processing Time in ms')
#export
@delegates(_GenMFCC.__init__)
class AudioToMFCC(Transform):
def __init__(self,**kwargs):
func_args = get_usable_kwargs(_GenMFCC, kwargs, [])
self.transformer = _GenMFCC(**func_args)
self.settings = func_args
@classmethod
def from_cfg(cls, audio_cfg):
cfg = asdict(audio_cfg) if is_dataclass(audio_cfg) else audio_cfg
return cls(**cfg)
def encodes(self, x:AudioItem):
sg_settings = {"sr":x.sr, 'nchannels':x.nchannels,'path':x.path, **self.settings}
return AudioSpectrogram.create(self.transformer(x.sig).detach(), settings=sg_settings)
a2mfcc = AudioToMFCC()
mfcc = a2mfcc(item0)
test_eq(mfcc.n_mfcc, mfcc.data.shape[1])
mfcc.show()
mfcc.settings
{'sr': 16000, 'nchannels': 1, 'path': PosixPath('/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav'), 'sample_rate': 16000, 'n_mfcc': 40, 'dct_type': 2, 'norm': 'ortho', 'log_mels': False, 'melkwargs': None}
mfcc.height
40
mfcc.width
292
#n_mfcc specified should determine the height of the mfcc
n_mfcc = 67
a2mfcc67 = AudioToMFCC(n_mfcc=n_mfcc)
mfcc67 = a2mfcc67(item100)
test_eq(mfcc67.shape[1], n_mfcc)
print(mfcc67.shape)
mfcc67.show()
torch.Size([1, 67, 567])
a2mfcc_kwargs = AudioToMFCC(melkwargs={"hop_length":1024, "n_fft":1024})
mfcc_kwargs = a2mfcc_kwargs(item100)
mfcc_kwargs.show()
# make sure a new hop_length changes the resulting width
test_ne(mfcc_kwargs.width, mfcc.width)
%%time
a2mfcc = AudioToMFCC()
max_seconds = 180
times_mono = time_variable_length_audios(f=a2mfcc, max_seconds=max_seconds)
times_stereo = time_variable_length_audios(f=a2mfcc, max_seconds=max_seconds, channels=2)
plt.plot(np.arange(0,max_seconds,2), times_mono, label="mono")
plt.plot(np.arange(0,max_seconds,2), times_stereo, label="stereo")
plt.legend(['mono','stereo'])
plt.title("Time Taken by AudioToMFCC")
plt.xlabel("Audio Duration in Seconds")
plt.ylabel("Processing Time in ms")
CPU times: user 30.5 s, sys: 3.24 s, total: 33.7 s Wall time: 10.4 s
Text(0, 0.5, 'Processing Time in ms')
mel_cfg = {'n_fft':2560,'hop_length':64}
oa = OpenAudio(files)
a2s = AudioToSpec(**mel_cfg)
db_mel_pipe = Pipeline([oa,a2s], as_item=True)
for i in range(5):
print("Shape:", db_mel_pipe(i).shape)
db_mel_pipe.show(db_mel_pipe(i))
Shape: torch.Size([1, 128, 821])
Shape: torch.Size([1, 128, 1101])
Shape: torch.Size([1, 128, 1331])
Shape: torch.Size([1, 128, 841])
Shape: torch.Size([1, 128, 951])
cfg = {'hop_length':128, 'n_fft':400}
oa = OpenAudio(files)
db_mel_pipe = Pipeline([oa, AudioToSpec(mel=False, to_db=False, **cfg)], as_item=True)
for i in range(3):
print("Shape:", db_mel_pipe(i).shape)
db_mel_pipe.show(db_mel_pipe(i))
test_eq(db_mel_pipe(i).hop_length, cfg["hop_length"])
Shape: torch.Size([1, 201, 411])
Shape: torch.Size([1, 201, 551])
Shape: torch.Size([1, 201, 666])
oa = OpenAudio(files)
db_mel_pipe = Pipeline([oa, AudioToSpec(mel=False)], as_item=True)
for i in range(3):
print("Shape:", db_mel_pipe(i).shape)
db_mel_pipe.show(db_mel_pipe(i))
Shape: torch.Size([1, 513, 103])
Shape: torch.Size([1, 513, 138])
Shape: torch.Size([1, 513, 167])
#non-mel db-scale spectrogram
cfg = {'mel':False, 'n_fft':260, 'f_max':22050., 'hop_length':128}
oa = OpenAudio(files)
db_mel_pipe = Pipeline([oa, AudioToSpec.from_cfg(cfg)], as_item=True)
for i in range(3):
db_mel_pipe.show(db_mel_pipe(i))
/opt/anaconda3/envs/dev/lib/python3.7/site-packages/ipykernel_launcher.py:61: UserWarning: f_max passed in but unused, your settings use <class 'torchaudio.transforms.Spectrogram'> not <class 'torchaudio.transforms.MelSpectrogram'>
db_mfcc_pipe = Pipeline([oa, AudioToMFCC(n_mfcc=40),], as_item=True)
for i in range(3):
db_mfcc_pipe.show(db_mfcc_pipe(i))
#export
def config_from_func(func, name, **kwargs):
params = inspect.signature(func).parameters.items()
namespace = {k:v.default for k, v in params}
namespace.update(kwargs)
return make_dataclass(name, namespace.keys(), namespace=namespace)
#export
class AudioConfig():
#default configurations from the wrapped function
#make sure to pass in mel=False as kwarg for non-mel spec, and to_db=False for non db spec
BasicSpectrogram = config_from_func(_GenSpec, "BasicSpectrogram", mel=False)
BasicMelSpectrogram = config_from_func(_GenMelSpec, "BasicMelSpectrogram")
BasicMFCC = config_from_func(_GenMFCC, "BasicMFCC")
#special configs with domain-specific defaults
Voice = config_from_func(_GenMelSpec, "Voice", f_min=50., f_max=8000., n_fft=1024, n_mels=128, hop_length=128)
# BasicMelSpectrogram is just the torchaudio defaults, which are currently poor, hence
# the empty mel bins in the spectrogram below. We can make our own domain-specific presets
# like Voice; a sketch of rolling your own preset follows the examples below.
mel_cfg = AudioConfig.BasicMelSpectrogram()
a2mel = AudioToSpec.from_cfg(mel_cfg)
mel_bad = a2mel(oa(42))
mel_bad.show()
voice_cfg = AudioConfig.Voice()
a2mel = AudioToSpec.from_cfg(voice_cfg)
mel_good = a2mel(oa(42))
mel_good.show()
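As noted above, we can build our own domain-specific presets the same way Voice is built. Here is a hedged sketch using a hypothetical Music preset (the name and values are illustrative, not part of the library):
# hypothetical preset built with config_from_func, overriding the torchaudio defaults
Music = config_from_func(_GenMelSpec, "Music", f_min=20., f_max=8000., n_fft=2048, n_mels=128, hop_length=512)
a2music = AudioToSpec.from_cfg(Music())
mel_music = a2music(oa(42))
test_eq(mel_music.n_mels, 128)
test_eq(mel_music.hop_length, 512)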
test_eq(mel_bad.n_fft, mel_cfg.n_fft)
# hop_length defaults to None in torchaudio and is only filled in later in their code; we override
# this None default internally in AudioToSpec (see add_local_defaults) to ensure the hop_length
# actually used is stored as a sg attribute
print("MelConfig Default Hop:", mel_cfg.hop_length)
print("Resulting Hop:",mel_bad.hop_length)
MelConfig Default Hop: None
Resulting Hop: 200
sg_cfg = AudioConfig.BasicSpectrogram()
# make sure mel setting is passed down and is false for normal spectro
test_eq(sg_cfg.mel, False)
#Grab a random file, test that the n_fft are passed successfully via config and stored in sg settings
oa = OpenAudio(files)
f_num = random.randint(0, len(files))
sg_cfg = AudioConfig.BasicSpectrogram(n_fft=2000, hop_length=155)
a2sg = AudioToSpec.from_cfg(sg_cfg)
sg = a2sg(oa(f_num))
test_eq(sg.n_fft, sg_cfg.n_fft)
test_eq(sg.width, int(oa(f_num).nsamples/sg_cfg.hop_length)+1)
oa = OpenAudio(files)
db_mel_pipe = Pipeline([oa, AudioToSpec.from_cfg(sg_cfg)], as_item=True)
for i in range(3):
db_mel_pipe.show(db_mel_pipe(i))
voice_config = AudioConfig.Voice(); voice_config
Voice(sample_rate=16000, n_fft=1024, win_length=None, hop_length=128, f_min=50.0, f_max=8000.0, pad=0, n_mels=128, window_fn=<built-in method hann_window of type object at 0x7fe2955810a0>, wkwargs=None)
oa = OpenAudio(files)
db_mel_pipe = Pipeline([oa, AudioToSpec.from_cfg(voice_config)], as_item=True)
for i in range(3):
db_mel_pipe.show(db_mel_pipe(i))
mfcc_cfg = AudioConfig.BasicMFCC()
oa = OpenAudio(files)
mfcc_pipe = Pipeline([oa, AudioToMFCC.from_cfg(mfcc_cfg)], as_item=True)
for i in range(44,47):
print("Shape", mfcc_pipe(i).shape)
mfcc_pipe(i).show()
Shape torch.Size([1, 40, 183])
Shape torch.Size([1, 40, 490])
Shape torch.Size([1, 40, 260])
#hide
from local.notebook.export import notebook2script
notebook2script(all_fs=True)
Converted 00_test.ipynb. Converted 01_core_foundation.ipynb. Converted 01a_core_utils.ipynb. Converted 01b_core_dispatch.ipynb. Converted 01c_core_transform.ipynb. Converted 02_core_script.ipynb. Converted 03_torchcore.ipynb. Converted 03a_layers.ipynb. Converted 04_data_load.ipynb. Converted 05_data_core.ipynb. Converted 06_data_transforms.ipynb. Converted 07_data_block.ipynb. Converted 08_vision_core.ipynb. Converted 09_vision_augment.ipynb. Converted 09a_vision_data.ipynb. Converted 09b_vision_utils.ipynb. Converted 10_pets_tutorial.ipynb. Converted 11_vision_models_xresnet.ipynb. Converted 12_optimizer.ipynb. Converted 13_learner.ipynb. Converted 13a_metrics.ipynb. Converted 14_callback_schedule.ipynb. Converted 14a_callback_data.ipynb. Converted 15_callback_hook.ipynb. Converted 15a_vision_models_unet.ipynb. Converted 16_callback_progress.ipynb. Converted 17_callback_tracker.ipynb. Converted 18_callback_fp16.ipynb. Converted 19_callback_mixup.ipynb. Converted 20_interpret.ipynb. Converted 20a_distributed.ipynb. Converted 21_vision_learner.ipynb. Converted 22_tutorial_imagenette.ipynb. Converted 23_tutorial_transfer_learning.ipynb. Converted 30_text_core.ipynb. Converted 31_text_data.ipynb. Converted 32_text_models_awdlstm.ipynb. Converted 33_text_models_core.ipynb. Converted 34_callback_rnn.ipynb. Converted 35_tutorial_wikitext.ipynb. Converted 36_text_models_qrnn.ipynb. Converted 37_text_learner.ipynb. Converted 38_tutorial_ulmfit.ipynb. Converted 40_tabular_core.ipynb. Converted 41_tabular_model.ipynb. Converted 42_tabular_rapids.ipynb. Converted 50_data_block_examples.ipynb. Converted 60_medical_imaging.ipynb. Converted 65_medical_text.ipynb. Converted 70_audio_core.ipynb. Converted 70_callback_wandb.ipynb. Converted 71_audio_augment.ipynb. Converted 71_callback_tensorboard.ipynb. Converted 72_audio_tutorial.ipynb. Converted 90_notebook_core.ipynb. Converted 91_notebook_export.ipynb. Converted 92_notebook_showdoc.ipynb. Converted 93_notebook_export2html.ipynb. Converted 94_notebook_test.ipynb. Converted 95_index.ipynb. Converted 96_data_external.ipynb. Converted 97_utils_test.ipynb. Converted notebook2jekyll.ipynb. Converted xse_resnext.ipynb.