Slice clips into frames, apply a constant-Q transform (CQT), then local contrast normalization (LCN). The processed audio is stored in HDF5 datasets.
import os, time
import numpy as np
import librosa
import h5py

# Report package versions for reproducibility.
print('Software versions:')
for pkg in [np, librosa]:
    print(' {}: {}'.format(pkg.__name__, pkg.__version__))

# Much faster computation of the CQT if available.
# Provided by scikits.samplerate through libsamplerate (SRC).
# NOTE(review): _HAS_SAMPLERATE is a private librosa attribute and has been
# removed in newer librosa releases -- confirm the pinned version exposes it.
print('librosa HAS_SAMPLERATE: {}'.format(librosa.core._HAS_SAMPLERATE))
Audio data from the GTZAN dataset has previously been stored in HDF5 format, which lets us read and write the data through memory mapping without loading the whole dataset into memory.
# Open the pre-built GTZAN HDF5 file read-only (closed at the end of the script).
filename = os.path.join('data', 'gtzan.hdf5')
gtzan = h5py.File(filename, 'r')
# Display HDF5 attributes.
print('Attributes:')
for name, value in gtzan.attrs.items():
    print(' {} = {}'.format(name, value))
# List the stored datasets (iterating an h5py File yields dataset names).
print('Datasets: {}'.format(', '.join(gtzan)))
Dimensionality increase:
# Frame length in samples.
na = 1024
# Aligned frames: number of whole na-sample frames per clip.
N1 = int(np.floor(float(gtzan.attrs['Nsamples']) / na))
# Overlapping (redundant) frames, shifted by half a frame.
N2 = int(np.floor(float(gtzan.attrs['Nsamples']) / na - 0.5))
# Keep the common count so the aligned and overlapping sets have equal length.
Nframes = min(N1, N2)
Nclips = gtzan.attrs['Nclips']
Ngenres = gtzan.attrs['Ngenres']
# 'del' is a statement, not a function: no parentheses needed.
del N1, N2
# Data dimensionality and size.
print('Dimensionality increase from {:,} samples '
'to {} frames x 2 x {} samples = {:,} per clip'
.format(gtzan.attrs['Nsamples'], Nframes, na, Nframes*2*na))
print('Data size N = {:,} frames of na = {} samples -> {:,} floats'
.format(Ngenres*Nclips*Nframes*2, na, Ngenres*Nclips*Nframes*2*na))
We use the CQT as a dimensionality reduction (from $n_a=1024$ to $n_s=96$) and a feature extraction tool:
Open questions:
# CQT filters.
ns = 96
No = 4
print('ns = {} filters spanning No = {} octaves'.format(ns, No))
# Floor division: the resolution is an exact integer (96/4 = 24) and
# librosa expects bins_per_octave to be an int ('/' is a float in Python 3).
print(' --> resolution of {} bins per octave'.format(ns // No))
# This MIDI implementation assigns middle C (note 60) to C5 not C4 !
# It may also be C3, there is no standardisation.
# It is not consistent with the scientific pitch notation.
assert librosa.note_to_midi('C5') == 60
# Tuning standard A4 = 440 Hz (A440) becomes A5 = 440Hz.
assert librosa.midi_to_hz(librosa.note_to_midi('A5')) == 440
assert librosa.midi_to_hz(69) == 440
# We should thus use C3 and C7 instead of C2 and C6...
nmin, nmax = 'C3', 'C7'
fmin = librosa.midi_to_hz(librosa.note_to_midi(nmin))
fmax = librosa.midi_to_hz(librosa.note_to_midi(nmax))
assert fmax / fmin == 2**No # By definition of an octave.
# NOTE(review): indexing fmin[0] assumes midi_to_hz returns an array
# (older librosa); newer releases return a scalar -- confirm the version.
print('fmin = {:.2f} Hz ({}), fmax = {:.2f} Hz ({})'.format(
fmin[0], nmin, fmax[0], nmax))
# librosa CQT parameters.
rosaparams = {'sr':gtzan.attrs['sr'], 'hop_length':na, 'fmin':fmin,
'n_bins':ns, 'bins_per_octave':ns // No}
# Data dimensionality and size.
print('Dimensionality decrease from {0} frames x 2 x {1} samples = {3:,} '
'to {0} frames x 2 x {2} frequency bins = {4:,} per clip'
.format(Nframes, na, ns, Nframes*2*na, Nframes*2*ns))
print('Data size N = {:,} frames of ns = {} samples -> {:,} floats'
.format(Ngenres*Nclips*Nframes*2, ns, Ngenres*Nclips*Nframes*2*ns))
Five dimensions:
Three 5-dimensional HDF5 datasets: 1. Xa: raw audio of the frame, dimensionality $n=n_a$ 2. Xs: CQT spectrogram, dimensionality $n=n_s$ 3. Xn: LCN-normalized spectrogram, dimensionality $n=n_s$
filename = os.path.join('data', 'audio.hdf5')
# Remove existing HDF5 file; ignore the error if it does not exist.
try:
    os.remove(filename)
except OSError:
    pass
# Create HDF5 file and datasets.
audio = h5py.File(filename, 'w')
# Metadata.
audio.attrs['sr'] = gtzan.attrs['sr']
# Materialize the keys: under Python 3, h5py returns a view-like object
# that numpy cannot convert and that does not support indexing (needed
# later by process()).
genres = list(gtzan.keys())
# Fixed-length byte-string dtype wide enough for the longest genre name.
dtype = 'S{}'.format(max(len(genre) for genre in genres))
audio.attrs['labels'] = np.array(genres, dtype=dtype)
# Data.
Xa = audio.create_dataset('Xa', (Ngenres, Nclips, Nframes, 2, na), dtype='float32')
Xs = audio.create_dataset('Xs', (Ngenres, Nclips, Nframes, 2, ns), dtype='float32')
#Xn = f.create_dataset('Xn', (ns, N), dtype='float32')
# Show datasets, their dimensionality and data type.
print('Datasets:')
for dname, dset in audio.items():
    print(' {:2}: {:16}, {}'.format(dname, dset.shape, dset.dtype))
# Display HDF5 attributes.
print('Attributes:')
for name, value in audio.attrs.items():
    print(' {} = {}'.format(name, value))
# Reshape parameters: cut a 1-D signal into Nframes rows of na samples.
params = {'newshape':(Nframes, na), 'order':'C'}

def process(genre, clip):
    """Compute and store the raw frames and CQT spectrograms of one clip.

    Writes into the module-level Xa (raw audio) and Xs (spectrogram)
    datasets at [genre, clip, ...].

    Usage: process(1, 2)
    """
    # Load audio.
    y1 = gtzan[genres[genre]][:,clip] # Aligned frames.
    # Floor division: na//2 stays an int (a float slice index raises
    # TypeError under Python 3).
    y2 = y1[na//2:] # Overlapped frames, shifted by half a frame.
    # Store raw audio.
    Xa[genre,clip,:,0,:] = np.reshape(y1[:na*Nframes], **params)
    Xa[genre,clip,:,1,:] = np.reshape(y2[:na*Nframes], **params)
    # Spot-check a random frame to ensure the signal is correctly reshaped.
    i = int(np.floor(Nframes * np.random.uniform()))
    assert np.alltrue(Xa[genre,clip,i,0,:] == y1[i*na:i*na+na])
    assert np.alltrue(Xa[genre,clip,i,1,:] == y1[na//2+i*na:na//2+i*na+na])
    # Store spectrogram. Drop the last one which consists mostly
    # of padded data (and keep the same size as Xa).
    Xs[genre,clip,:,0,:] = librosa.cqt(y1, **rosaparams)[:,:-1].T
    Xs[genre,clip,:,1,:] = librosa.cqt(y2, **rosaparams)[:,:-1].T
Process a single clip:
#process(1, 2)
Process the entire GTZAN dataset:
#Ngenres, Nclips = 2, 100
# Process every (genre, clip) pair and report the elapsed wall-clock time.
tstart = time.time()
for genre in range(Ngenres):
    for clip in range(Nclips):
        process(genre, clip)
t = time.time() - tstart
print('Elapsed time: {:.0f} seconds ({:.1f} seconds per clip)'.format(
t, t/Ngenres/Nclips))
# Release both HDF5 files (flushes pending writes to audio.hdf5).
gtzan.close()
audio.close()