Goal: Measure classification accuracy with 20% noise. Final result for the thesis report and presentation.
Conclusion: Structure provides robustness to noise.
Observations:
# Hyper-parameter under test: the graph regularization weight lg is swept
# over the values below (None disables the term entirely).
Pname = 'lg'
Pvalues = [None, 100]

# Regenerate the graph or the features at each iteration.
regen_graph = False
regen_features = True
regen_baseline = False

# All experiment parameters, grouped by pipeline stage.
p = {
    # Preprocessing: graph construction.
    'data_scaling_graph': 'features',
    'K': 10 + 1,  # 5 to 10 + 1 for self-reference
    'dm': 'euclidean',
    'Csigma': 1,
    'diag': True,
    'laplacian': 'normalized',
    # Feature extraction.
    'm': 128,  # 64, 128, 512
    'ls': 1,
    'ld': 10,
    'le': None,
    'lg': 100,
    # Classification.
    'scale': None,
    'Nvectors': 6,
    'svm_type': 'C',
    'kernel': 'linear',
    'C': 1,
    'nu': 0.5,
    'majority_voting': False,
    # HDF5 data stores.
    'folder': 'data',
    'filename_gtzan': 'gtzan.hdf5',
    'filename_audio': 'audio.hdf5',
    'filename_graph': 'graph.hdf5',
    'filename_features': 'features.hdf5',
    # Dataset (10,100,644 | 5,100,149 | 2,10,644).
    'Ngenres': 5,
    'Nclips': 100,
    'Nframes': 149,
    # Added white noise.
    'noise_std': 0.2,
    # Graph.
    'tol': 1e-5,
    # Feature extraction.
    'rtol': 1e-5,  # 1e-3, 1e-5, 1e-7
    'N_inner': 500,
    'N_outer': 50,
    # Classification.
    'test_size': 0.1,
    'Ncv': 20,
    'dataset_classification': 'Z',
}
import numpy as np
import time

# Wall-clock reference to report the total experiment duration at the end.
texperiment = time.time()

# Result dictionary: one empty list per collected metric; each list gets one
# entry appended per swept hyper-parameter value.
_res_keys = ['accuracy', 'accuracy_std',
             'sparsity', 'atoms_D',
             'objective_g', 'objective_h', 'objective_i', 'objective_j',
             'time_features', 'iterations_inner', 'iterations_outer']
# Dict comprehension replaces dict.fromkeys + a reassignment loop and
# guarantees each key gets its own independent list.
res = {key: [] for key in _res_keys}
def separator(name, parameter=False):
    """Print a dash-framed section header on its own paragraph.

    When ``parameter`` is true, the current value of the swept
    hyper-parameter (globals ``Pname`` and ``p``) is appended to the name.
    """
    if parameter:
        name = '{}, {} = {}'.format(name, Pname, p[Pname])
    rule = '-' * 20
    print('\n {0} {1} {0} \n'.format(rule, name))
# Fair comparison when tuning parameters.
# Randomnesses: dictionary initialization, training and testing sets.
np.random.seed(1)
# Upstream notebooks (dataset download, audio preprocessing) are assumed to
# have been run already; kept commented-out for reference.
#%run gtzan.ipynb
#%run audio_preprocessing.ipynb
# NOTE(review): this is an IPython notebook export (%run magics); the bodies
# of the if/for statements below lost their indentation in the export and
# must be restored from context before re-running.
# Compute the graph and/or features once up-front when they are not
# regenerated for each swept value.
if not regen_graph:
separator('Graph')
%run audio_graph.ipynb
if not regen_features:
separator('Features')
%run audio_features.ipynb
# Hyper-parameter under test: the for target assigns each swept value
# directly into p[Pname].
for p[Pname] in Pvalues:
if regen_graph:
separator('Graph', True)
%run audio_graph.ipynb
if regen_features:
separator('Features', True)
# One features file per swept value, e.g. 'features_lg_100.hdf5'.
p['filename_features'] = 'features_{}_{}.hdf5'.format(Pname, p[Pname])
%run audio_features.ipynb
separator('Classification', True)
%run audio_classification.ipynb
# Collect results: each %run leaves its metrics (accuracy, sparsity, ...)
# in the interactive namespace; append every tracked metric to res.
for key in res:
res[key].append(globals()[key])
# Baseline, i.e. classification with spectrograms.
p['dataset_classification'] = 'X'
p['scale'] = 'minmax' # Todo: should be done in pre-processing.
if regen_baseline:
res['baseline'] = []
res['baseline_std'] = []
for p[Pname] in Pvalues:
separator('Baseline', True)
%run audio_classification.ipynb
res['baseline'].append(accuracy)
res['baseline_std'].append(accuracy_std)
else:
# Baseline does not depend on the swept parameter: run once and replicate
# the accuracy per swept value (the std stays a scalar).
separator('Baseline')
%run audio_classification.ipynb
res['baseline'] = len(Pvalues) * [accuracy]
res['baseline_std'] = accuracy_std
-------------------- Graph -------------------- Data: (149000, 96), float32 Elapsed time: 610.14 seconds All self-referenced in the first column: True dist in [0.0, 2.99558210373] w in [0.252469688654, 1.0] Ones on the diagonal: 149000 (over 149000) assert: True W in [0.0, 1.0] Datasets: L_data : (2899474,), float32 L_indices : (2899474,), int32 L_indptr : (149001,) , int32 L_shape : (2,) , int64 W_data : (2899474,), float32 W_indices : (2899474,), int32 W_indptr : (149001,) , int32 W_shape : (2,) , int64 Attributes: K = 11 dm = euclidean Csigma = 1 diag = True laplacian = normalized Overall time: 620.26 seconds -------------------- Features, lg = None -------------------- Attributes: sr = 22050 labels = ['blues' 'classical' 'country' 'disco' 'hiphop' 'jazz' 'metal' 'pop' 'reggae' 'rock'] Datasets: Xa: (10, 100, 644, 2, 1024) , float32 Xs: (10, 100, 644, 2, 96) , float32 Full dataset: size: N=1,288,000 x n=96 -> 123,648,000 floats dim: 123,648 features per clip shape: (10, 100, 644, 2, 96) <class 'h5py._hl.dataset.Dataset'> Reduced dataset: size: N=149,000 x n=96 -> 14,304,000 floats dim: 28,608 features per clip shape: (5, 100, 149, 2, 96) <type 'numpy.ndarray'> Data: (149000, 96), float32 Attributes: K = 11 dm = euclidean Csigma = 1 diag = True laplacian = normalized Datasets: L_data : (2899474,), float32 L_indices : (2899474,), int32 L_indptr : (149001,) , int32 L_shape : (2,) , int64 W_data : (2899474,), float32 W_indices : (2899474,), int32 W_indptr : (149001,) , int32 W_shape : (2,) , int64 Size X: 13.6 M --> 54.6 MiB Size Z: 18.2 M --> 72.8 MiB Size D: 12.0 k --> 48.0 kiB Size E: 12.0 k --> 48.0 kiB Elapsed time: 379 seconds
Inner loop: 292 iterations g(Z) = ||X-DZ||_2^2 = 4.562890e+05 rdiff: 0.000424278815903 i(Z) = ||Z||_1 = 1.178744e+06
Global objective: 1.635033e+06
Outer loop: 9 iterations Z in [-1.15228462219, 1.03771519661] Sparsity of Z: 9,483,486 non-zero entries out of 19,072,000 entries, i.e. 49.7%.
D in [-0.286309689283, 0.320648640394] d in [0.999999642372, 1.00000023842] Constraints on D: True
Datasets: D : (128, 96) , float32 X : (5, 100, 149, 2, 96) , float32 Z : (5, 100, 149, 2, 128) , float32 Attributes: sr = 22050 labels = ['blues' 'classical' 'country' 'disco' 'hiphop' 'jazz' 'metal' 'pop' 'reggae' 'rock'] Overall time: 387 seconds -------------------- Classification, lg = None -------------------- Software versions: numpy: 1.8.2 sklearn: 0.14.1 Attributes: sr = 22050 labels = ['blues' 'classical' 'country' 'disco' 'hiphop' 'jazz' 'metal' 'pop' 'reggae' 'rock'] Datasets: D : (128, 96) , float32 X : (5, 100, 149, 2, 96) , float32 Z : (5, 100, 149, 2, 128) , float32 Full dataset: size: N=149,000 x n=128 -> 19,072,000 floats dim: 38,144 features per clip shape: (5, 100, 149, 2, 128) <class 'h5py._hl.dataset.Dataset'> Reduced dataset: size: N=149,000 x n=128 -> 19,072,000 floats dim: 38,144 features per clip shape: (5, 100, 149, 2, 128) <type 'numpy.ndarray'> Flattened frames: size: N=149,000 x n=128 -> 19,072,000 floats dim: 38,144 features per clip shape: (5, 100, 298, 128) Truncated and grouped: size: N=135,000 x n=128 -> 17,280,000 floats dim: 34,560 features per clip shape: (5, 100, 6, 45, 128) Truncated and grouped: size: N=135,000 x n=128 -> 17,280,000 floats dim: 34,560 features per clip shape: (5, 100, 6, 45, 128) Feature vectors: size: N=6,000 x n=128 -> 768,000 floats dim: 1,536 features per clip shape: (5, 100, 6, 2, 128)
5 genres: blues, classical, country, disco, hiphop Training data: (3600, 128), float64 Testing data: (2400, 128), float64 Training labels: (3600,), uint8 Testing labels: (2400,), uint8 Accuracy: 40.1 % 5 genres: blues, classical, country, disco, hiphop Training data: (300, 1536), float64 Testing data: (200, 1536), float64 Training labels: (300,), uint8 Testing labels: (200,), uint8 Feature vectors accuracy: 36.7 % Clips accuracy: 46.5 % 5 genres: blues, classical, country, disco, hiphop Data: (6000, 128), float64 Labels: (6000,), uint8 Ratio: 5400.0 training, 600.0 testing 42 (+/- 2.5) <- [41 41 43 44 39 44 38 45 40 39] 42 (+/- 2.0) <- [41 45 40 41 44 44 39 40 39 43] 43 (+/- 2.4) <- [44 46 46 39 40 42 43 39 44 40] 42 (+/- 1.3) <- [40 41 41 43 43 44 43 42 40 41] 44 (+/- 1.3) <- [46 42 43 45 43 45 43 42 44 42] 42 (+/- 2.0) <- [42 43 44 40 43 45 41 42 38 41] 43 (+/- 1.9) <- [43 42 43 42 44 38 43 42 40 45] 43 (+/- 2.3) <- [44 42 40 43 43 42 37 43 46 43] 43 (+/- 2.3) <- [47 43 41 42 46 39 43 42 40 41] 41 (+/- 1.3) <- [41 40 40 41 40 42 41 39 44 39] 44 (+/- 2.1) <- [47 44 44 43 43 42 39 44 42 45] 42 (+/- 1.7) <- [42 39 42 39 42 42 42 42 45 39] 43 (+/- 1.9) <- [40 45 44 43 42 42 46 41 45 40] 42 (+/- 2.3) <- [44 43 45 38 41 44 40 40 40 43] 44 (+/- 1.0) <- [42 43 43 44 41 43 45 43 44 44] 42 (+/- 2.1) <- [42 44 42 42 45 42 40 37 43 41] 43 (+/- 2.1) <- [42 41 42 42 46 45 40 43 46 39] 43 (+/- 1.6) <- [42 43 45 42 44 45 42 45 41 41] 43 (+/- 2.0) <- [46 40 40 41 43 43 43 40 45 43] 42 (+/- 1.5) <- [40 42 42 40 42 40 41 42 45 44] Accuracy: 42.6 (+/- 2.04) Mean time (20 cv): 119.90 seconds Overall time: 2414.47 seconds -------------------- Features, lg = 100 -------------------- Attributes: sr = 22050 labels = ['blues' 'classical' 'country' 'disco' 'hiphop' 'jazz' 'metal' 'pop' 'reggae' 'rock'] Datasets: Xa: (10, 100, 644, 2, 1024) , float32 Xs: (10, 100, 644, 2, 96) , float32 Full dataset: size: N=1,288,000 x n=96 -> 123,648,000 floats dim: 123,648 features per clip shape: (10, 
100, 644, 2, 96) <class 'h5py._hl.dataset.Dataset'> Reduced dataset: size: N=149,000 x n=96 -> 14,304,000 floats dim: 28,608 features per clip shape: (5, 100, 149, 2, 96) <type 'numpy.ndarray'> Data: (149000, 96), float32 Attributes: K = 11 dm = euclidean Csigma = 1 diag = True laplacian = normalized Datasets: L_data : (2899474,), float32 L_indices : (2899474,), int32 L_indptr : (149001,) , int32 L_shape : (2,) , int64 W_data : (2899474,), float32 W_indices : (2899474,), int32 W_indptr : (149001,) , int32 W_shape : (2,) , int64 Size X: 13.6 M --> 54.6 MiB Size Z: 18.2 M --> 72.8 MiB Size D: 12.0 k --> 48.0 kiB Size E: 12.0 k --> 48.0 kiB Elapsed time: 643 seconds
Inner loop: 238 iterations g(Z) = ||X-DZ||_2^2 = 2.389637e+06 rdiff: 3.97549967552e-05 i(Z) = ||Z||_1 = 2.168704e+05 j(Z) = tr(Z^TLZ) = 1.597216e+05
Global objective: 2.766229e+06
Outer loop: 4 iterations Z in [-0.120160095394, 0.157769978046] Sparsity of Z: 13,222,812 non-zero entries out of 19,072,000 entries, i.e. 69.3%.
D in [-0.214498415589, 0.321214944124] d in [0.999999761581, 1.00000023842] Constraints on D: True
Datasets: D : (128, 96) , float32 X : (5, 100, 149, 2, 96) , float32 Z : (5, 100, 149, 2, 128) , float32 Attributes: sr = 22050 labels = ['blues' 'classical' 'country' 'disco' 'hiphop' 'jazz' 'metal' 'pop' 'reggae' 'rock'] Overall time: 652 seconds -------------------- Classification, lg = 100 -------------------- Software versions: numpy: 1.8.2 sklearn: 0.14.1 Attributes: sr = 22050 labels = ['blues' 'classical' 'country' 'disco' 'hiphop' 'jazz' 'metal' 'pop' 'reggae' 'rock'] Datasets: D : (128, 96) , float32 X : (5, 100, 149, 2, 96) , float32 Z : (5, 100, 149, 2, 128) , float32 Full dataset: size: N=149,000 x n=128 -> 19,072,000 floats dim: 38,144 features per clip shape: (5, 100, 149, 2, 128) <class 'h5py._hl.dataset.Dataset'> Reduced dataset: size: N=149,000 x n=128 -> 19,072,000 floats dim: 38,144 features per clip shape: (5, 100, 149, 2, 128) <type 'numpy.ndarray'> Flattened frames: size: N=149,000 x n=128 -> 19,072,000 floats dim: 38,144 features per clip shape: (5, 100, 298, 128) Truncated and grouped: size: N=135,000 x n=128 -> 17,280,000 floats dim: 34,560 features per clip shape: (5, 100, 6, 45, 128) Truncated and grouped: size: N=135,000 x n=128 -> 17,280,000 floats dim: 34,560 features per clip shape: (5, 100, 6, 45, 128) Feature vectors: size: N=6,000 x n=128 -> 768,000 floats dim: 1,536 features per clip shape: (5, 100, 6, 2, 128)
5 genres: blues, classical, country, disco, hiphop Training data: (3600, 128), float64 Testing data: (2400, 128), float64 Training labels: (3600,), uint8 Testing labels: (2400,), uint8 Accuracy: 52.0 % 5 genres: blues, classical, country, disco, hiphop Training data: (300, 1536), float64 Testing data: (200, 1536), float64 Training labels: (300,), uint8 Testing labels: (200,), uint8 Feature vectors accuracy: 45.9 % Clips accuracy: 53.5 % 5 genres: blues, classical, country, disco, hiphop Data: (6000, 128), float64 Labels: (6000,), uint8 Ratio: 5400.0 training, 600.0 testing 51 (+/- 1.2) <- [51 52 49 50 49 50 51 53 51 49] 51 (+/- 2.1) <- [53 54 50 48 53 50 47 50 49 52] 52 (+/- 2.0) <- [52 54 52 51 48 53 53 50 55 51] 51 (+/- 1.9) <- [51 47 53 50 49 53 52 49 49 52] 52 (+/- 1.4) <- [50 53 53 52 51 52 50 52 54 50] 52 (+/- 1.7) <- [50 53 50 52 52 53 50 51 48 53] 53 (+/- 1.8) <- [55 52 53 54 51 50 54 49 53 54] 51 (+/- 1.7) <- [50 48 50 53 51 53 49 51 54 52] 51 (+/- 1.5) <- [53 50 48 51 50 50 50 52 51 50] 51 (+/- 1.9) <- [51 54 52 50 51 48 50 49 54 50] 53 (+/- 1.8) <- [51 51 51 51 53 54 56 53 50 55] 52 (+/- 2.1) <- [52 50 52 53 50 54 49 48 49 54] 51 (+/- 2.0) <- [49 56 50 51 51 50 52 49 53 50] 51 (+/- 1.6) <- [50 49 52 46 51 52 50 49 52 51] 51 (+/- 1.6) <- [50 52 54 50 49 49 51 53 52 51] 51 (+/- 1.9) <- [52 53 49 51 49 52 48 51 50 55] 51 (+/- 1.2) <- [49 51 52 51 51 52 50 50 53 49] 52 (+/- 1.7) <- [54 53 51 50 52 50 54 49 50 52] 52 (+/- 1.6) <- [53 49 49 51 50 53 53 53 51 51] 52 (+/- 1.5) <- [51 52 50 51 53 51 52 50 55 52] Accuracy: 51.6 (+/- 1.84) Mean time (20 cv): 31.16 seconds Overall time: 629.11 seconds -------------------- Baseline -------------------- Software versions: numpy: 1.8.2 sklearn: 0.14.1 Attributes: sr = 22050 labels = ['blues' 'classical' 'country' 'disco' 'hiphop' 'jazz' 'metal' 'pop' 'reggae' 'rock'] Datasets: D : (128, 96) , float32 X : (5, 100, 149, 2, 96) , float32 Z : (5, 100, 149, 2, 128) , float32 Full dataset: size: N=149,000 x n=96 -> 
14,304,000 floats dim: 28,608 features per clip shape: (5, 100, 149, 2, 96) <class 'h5py._hl.dataset.Dataset'> Reduced dataset: size: N=149,000 x n=96 -> 14,304,000 floats dim: 28,608 features per clip shape: (5, 100, 149, 2, 96) <type 'numpy.ndarray'> Flattened frames: size: N=149,000 x n=96 -> 14,304,000 floats dim: 28,608 features per clip shape: (5, 100, 298, 96) Truncated and grouped: size: N=135,000 x n=96 -> 12,960,000 floats dim: 25,920 features per clip shape: (5, 100, 6, 45, 96) Truncated and grouped: size: N=135,000 x n=96 -> 12,960,000 floats dim: 25,920 features per clip shape: (5, 100, 6, 45, 96) Feature vectors: size: N=6,000 x n=96 -> 576,000 floats dim: 1,152 features per clip shape: (5, 100, 6, 2, 96)
5 genres: blues, classical, country, disco, hiphop Training data: (3600, 96), float64 Testing data: (2400, 96), float64 Training labels: (3600,), uint8 Testing labels: (2400,), uint8 Accuracy: 48.1 % 5 genres: blues, classical, country, disco, hiphop Training data: (300, 1152), float64 Testing data: (200, 1152), float64 Training labels: (300,), uint8 Testing labels: (200,), uint8 Feature vectors accuracy: 44.0 % Clips accuracy: 55.0 % 5 genres: blues, classical, country, disco, hiphop Data: (6000, 96), float64 Labels: (6000,), uint8 Ratio: 5400.0 training, 600.0 testing 46 (+/- 2.2) <- [44 45 49 47 42 49 46 48 46 43] 47 (+/- 1.2) <- [49 49 47 46 46 46 46 47 46 48] 48 (+/- 1.3) <- [46 50 48 48 46 48 49 46 48 47] 45 (+/- 1.5) <- [48 42 44 47 45 45 46 44 44 45] 46 (+/- 1.9) <- [46 46 47 45 43 48 47 49 46 43] 48 (+/- 2.7) <- [49 50 49 50 48 48 46 49 41 45] 47 (+/- 1.9) <- [47 46 46 49 47 42 49 46 48 45] 46 (+/- 2.0) <- [44 46 45 45 42 51 47 46 45 46] 47 (+/- 1.6) <- [46 48 44 49 45 48 47 49 46 46] 47 (+/- 1.4) <- [47 47 46 49 47 45 44 46 48 45] 47 (+/- 1.6) <- [46 48 47 43 47 45 48 46 48 44] 47 (+/- 1.3) <- [47 49 46 47 45 46 48 46 48 49] 47 (+/- 2.3) <- [50 50 46 46 44 50 46 49 46 43] 46 (+/- 2.2) <- [46 46 45 40 46 48 47 46 45 48] 47 (+/- 1.2) <- [48 47 47 44 47 46 45 45 47 47] 47 (+/- 1.5) <- [47 47 43 47 48 47 48 45 48 45] 46 (+/- 1.3) <- [45 47 47 47 48 45 45 47 45 44] 48 (+/- 1.5) <- [48 46 46 47 49 50 45 47 47 47] 47 (+/- 1.3) <- [46 47 44 47 47 47 50 47 46 46] 47 (+/- 1.8) <- [46 48 49 45 46 47 44 47 50 45] Accuracy: 46.9 (+/- 1.85) Mean time (20 cv): 27.70 seconds Overall time: 559.30 seconds
# Print the collected results so the numbers can be copy-pasted into the
# report; skip 'atoms_D', which holds figure objects rather than numbers.
print('{} = {}'.format(Pname, Pvalues))
for key, value in res.items():
    # Fixed: original used `key is not 'atoms_D'` — identity comparison on a
    # string literal only works by CPython interning; use != for correctness.
    if key != 'atoms_D':
        print('res[\'{}\'] = {}'.format(key, value))
def plot(*args, **kwargs):
    """Plot the listed result metrics against the swept hyper-parameter.

    Parameters
    ----------
    *args : str
        Keys of ``res`` to plot, one line per key.
    **kwargs
        log : bool — use a semilog-y scale.
        err : list of str — keys of ``res`` providing error bars, one per arg
            (switches the plotting function to ``plt.errorbar``).
        ylim : sequence — y-axis limits.

    NOTE(review): relies on ``plt`` (presumably matplotlib.pyplot, imported by
    one of the %run notebooks), ``res``, ``Pname`` and ``Pvalues`` being
    defined in the interactive namespace — confirm before reuse.
    """
    plt.figure(figsize=(8, 5))
    x = range(len(Pvalues))
    use_log = kwargs.get('log') is True
    pltfunc = plt.semilogy if use_log else plt.plot
    params = {'linestyle': '-', 'marker': '.', 'markersize': 10}
    for i, var in enumerate(args):
        if 'err' in kwargs:
            # Error bars take precedence over the plain/semilog line plot.
            pltfunc = plt.errorbar
            params['yerr'] = res[kwargs['err'][i]]
            params['capsize'] = 5
        pltfunc(x, res[var], label=var, **params)
        # Annotate every point with its value.
        for xi, yi in zip(x, res[var]):
            plt.annotate('{:.2f}'.format(yi), xy=(xi, yi),
                         xytext=(5, 5), textcoords='offset points')
    margin = 0.25
    # (Removed a dead duplicate `params['markersize'] = 10` from the original.)
    plt.xlim(-margin, len(Pvalues) - 1 + margin)
    if 'ylim' in kwargs:
        plt.ylim(kwargs['ylim'])
    plt.title('{} vs {}'.format(', '.join(args), Pname))
    plt.xlabel(Pname)
    plt.ylabel(', '.join(args))  # fixed: was ' ,'.join(args)
    plt.xticks(x, Pvalues)
    plt.grid(True)
    plt.legend(loc='best')
    plt.show()
def div(l):
    """Return the weight(s) used to un-normalize an objective term.

    If ``l`` is the swept hyper-parameter (``Pname``), return one weight per
    swept value; otherwise the single configured weight ``p[l]``. ``None``
    (term disabled) maps to 1 to avoid division by zero.
    """
    # Fixed: original used `Pname is l` — identity comparison on strings only
    # works by interning; use == for a correct equality test.
    weights = Pvalues if Pname == l else [p[l]]
    return np.array([1 if w is None else w for w in weights])
# Classification results.
# Chance level: uniform guessing over Ngenres classes, replicated per swept
# value so it plots as a flat reference line.
res['chance'] = len(Pvalues) * [100./p['Ngenres']]
res['chance_std'] = 0
err=['accuracy_std', 'baseline_std', 'chance_std']
plot('accuracy', 'baseline', 'chance', err=err, ylim=[0,100])
# Features extraction results.
# NOTE(review): the body of the `if` below lost its indentation in the
# notebook export; restore it from context before re-running.
if regen_features:
plot('objective_g', 'objective_h', 'objective_i', 'objective_j', log=True)
# Unweighted objectives.
print('g(Z) = ||X-DZ||_2^2, h(Z) = ||Z-EX||_2^2, i(Z) = ||Z||_1, j(Z) = tr(Z^TLZ)')
# Divide each weighted objective by its weight (ld, le, ls, lg) to compare
# the raw magnitudes of the terms; div() maps None weights to 1.
res['objective_g_un'] = res['objective_g'] / div('ld')
res['objective_h_un'] = res['objective_h'] / div('le')
res['objective_i_un'] = res['objective_i'] / div('ls')
res['objective_j_un'] = res['objective_j'] / div('lg')
plot('objective_g_un', 'objective_h_un', 'objective_i_un', 'objective_j_un', log=True)
plot('sparsity', ylim=[0,100])
plot('time_features')
plot('iterations_inner')
plot('iterations_outer')
# Show the dictionary-atoms figure stored for each swept value.
for i, fig in enumerate(res['atoms_D']):
print('Dictionary atoms for {} = {}'.format(Pname, Pvalues[i]))
fig.show()
print('Experiment time: {:.0f} seconds'.format(time.time() - texperiment))
lg = [None, 100] res['accuracy_std'] = [2.040427515116487, 1.8377031255830689] res['objective_j'] = [0, 159721.6064453125] res['objective_i'] = [1178744.5, 216870.4375] res['objective_h'] = [0, 0] res['objective_g'] = [456288.984375, 2389636.71875] res['baseline'] = [46.866666666666667, 46.866666666666667] res['time_features'] = [379.20300483703613, 643.2467639446259] res['baseline_std'] = 1.85202591775 res['sparsity'] = [49.72465394295302, 69.33101929530201] res['iterations_inner'] = [292, 238] res['iterations_outer'] = [9, 4] res['accuracy'] = [42.606666666666655, 51.575000000000024]
/usr/lib/python2.7/dist-packages/numpy/ma/core.py:3847: UserWarning: Warning: converting a masked element to nan. warnings.warn("Warning: converting a masked element to nan.")
g(Z) = ||X-DZ||_2^2, h(Z) = ||Z-EX||_2^2, i(Z) = ||Z||_1, j(Z) = tr(Z^TLZ)
Dictionary atoms for lg = None Dictionary atoms for lg = 100 Experiment time: 5270 seconds
/usr/lib/pymodules/python2.7/matplotlib/figure.py:371: UserWarning: matplotlib is currently using a non-GUI backend, so cannot show the figure "matplotlib is currently using a non-GUI backend, "