The GTZAN dataset comes in a ZIP archive which contains one folder per musical genre. Each folder contains all the clips that belong to the genre. While easily browsable, this format is not appropriate for data analysis. We thus first convert the dataset to HDF5 before any pre-processing step.
class gtzan:
    """Namespace of constants describing the GTZAN dataset as stored here."""

    # Number of musical genres.
    Ngenres = 10

    # Number of clips stored for each genre.
    Nclips = 100

    # Sampling rate of the clips, in Hz.
    sr = 22050

    # Number of samples kept per clip. The raw clips range from 660000 to
    # 675808 samples; all of them are truncated to the shortest length so
    # that clip duration cannot bias the classifier in any way.
    Nsamples = 660000
# Report the common clip length, both in seconds and in samples.
duration = float(gtzan.Nsamples) / gtzan.sr
print('Clip duration: {0:.2f} seconds ({1} samples at {2} Hz)'.format(
    duration, gtzan.Nsamples, gtzan.sr))
import os, time
import numpy as np
import librosa
import h5py
import IPython.display
# Log the versions of the numerical packages for reproducibility.
print('Software versions:')
for pkg in (np, librosa):
    pkg_name, pkg_version = pkg.__name__, pkg.__version__
    print(' {}: {}'.format(pkg_name, pkg_version))
# Root folder of the extracted archive: one sub-folder per genre.
folder = os.path.join('data', 'genres')

# os.listdir() returns entries in arbitrary order and also lists plain files
# (hidden files, leftovers from the archive). Keep only directories and sort
# them so the genre list is deterministic across runs and platforms.
genres = sorted(
    name for name in os.listdir(folder)
    if os.path.isdir(os.path.join(folder, name))
)

# Explicit check instead of an assert, which would be skipped under -O.
if len(genres) != gtzan.Ngenres:
    raise ValueError('{}: expected {} genre folders, found {}'.format(
        folder, gtzan.Ngenres, len(genres)))
Define some helper functions.
def genpath(genre, clip):
    """Build the path of the audio file for one clip of one genre.

    The file is looked up under ``folder/<genre>/`` and is named
    ``<genre>.<clip>.au`` with the clip index zero-padded to five digits.

    Usage: genpath('rock', 0)
    """
    basename = '{0}.{1:0>5d}.au'.format(genre, clip)
    return os.path.join(folder, genre, basename)
def read(genre, clip):
    """Load one clip and truncate it to ``gtzan.Nsamples`` samples.

    Usage: read('rock', 0)

    Args:
        genre: Name of the genre (sub-folder name).
        clip: Index of the clip within the genre.

    Returns:
        A 1-D array holding the first ``gtzan.Nsamples`` samples of the clip.

    Raises:
        ValueError: If the file's sampling rate differs from ``gtzan.sr``,
            if the audio is not single-channel, or if the clip is shorter
            than ``gtzan.Nsamples``.
    """
    # Load audio file at its native sampling rate (sr=None: no resampling).
    path = genpath(genre, clip)
    y, sr = librosa.load(path, sr=None, mono=False)

    # Sanity checks.
    if sr != gtzan.sr:
        raise ValueError('{}: wrong sampling rate of {}'.format(path, sr))
    # With mono=False a multi-channel file is returned as a 2-D array, for
    # which y.size counts samples across all channels and y[:N] would slice
    # the channel axis instead of time. Reject it explicitly.
    if y.ndim != 1:
        raise ValueError('{}: expected mono audio, got shape {}'.format(
            path, y.shape))
    if y.size < gtzan.Nsamples:
        raise ValueError('{}: too short, {} samples'.format(path, y.size))

    # Truncate so that every clip has the same length.
    return y[:gtzan.Nsamples]
HDF5 data store:
# Location of the HDF5 data store.
filename = os.path.join('data', 'gtzan.hdf5')

# Delete a previous data store, if any. A failure to remove it (typically
# because it does not exist yet) is deliberately ignored.
try:
    os.remove(filename)
except OSError:
    pass

# Open a fresh HDF5 file and allocate one float32 dataset per genre, with
# one column per clip and one row per audio sample.
f = h5py.File(filename, 'w')
dsize = (gtzan.Nsamples, gtzan.Nclips)
for genre in genres:
    # Assign to _ so that a notebook does not echo the returned object.
    _ = f.create_dataset(genre, shape=dsize, dtype='float32')
Fill the datasets with actual audio data.
# Copy every clip of every genre into the data store and time the whole run.
tstart = time.time()
for genre in genres:
    dataset = f[genre]
    for clip in range(gtzan.Nclips):
        # Each clip fills one column of its genre's dataset.
        dataset[:, clip] = read(genre, clip)
elapsed = time.time() - tstart
print('Elapsed time: {:.0f} seconds'.format(elapsed))
Store the GTZAN metadata along with the audio data.
# Copy each class attribute of gtzan into the file-level HDF5 attributes,
# skipping the double-underscore names Python adds to every class.
for name, value in vars(gtzan).items():
    if name.startswith("__"):
        continue
    f.attrs[name] = value

# Display HDF5 attributes.
print('Attributes:')
for name, value in f.attrs.items():
    print(' {} = {}'.format(name, value))
# Spot-check the data store: pick ten random (genre, clip) pairs and verify
# that the stored column matches what read() returns for that clip.
for _ in range(10):
    # randint(n) draws uniformly from [0, n).
    genre = genres[int(np.random.randint(gtzan.Ngenres))]
    clip = int(np.random.randint(gtzan.Nclips))
    # np.alltrue was removed in NumPy 2.0; array_equal also checks shapes.
    assert np.array_equal(f[genre][:, clip], read(genre, clip)), \
        'stored data differs from source for {} clip {}'.format(genre, clip)
From the HDF5 data store.
# Genre and clip index of the example used for playback.
genre, clip = 'pop', 20
# Playback of the column stored in the HDF5 file (left disabled).
#IPython.display.Audio(f[genre][:,clip], rate=gtzan.sr)
Via librosa.
# Decode the clip with read() and play the samples at the dataset's rate.
IPython.display.Audio(read(genre, clip), rate=gtzan.sr)
Directly from the file. Unfortunately, it does not support .au audio files.
# Playback from the absolute path of the audio file (left disabled).
#IPython.display.Audio(os.path.abspath(genpath(genre, clip)))
# Close the HDF5 file.
f.close()