from scipy.io import wavfile
import scipy as sp
import pylab as pl
from glob import glob
from os import walk
from os import path
import numpy as np
from sklearn.decomposition import RandomizedPCA
from sklearn.decomposition import FastICA
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.utils import shuffle
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import pandas as pd
%pylab inline --no-import-all
Populating the interactive namespace from numpy and matplotlib
# Connect to the IPython.parallel cluster; its engines are used below to load
# and featurise the audio clips in parallel.
from IPython.parallel import Client
client = Client()
# View over all engines at once (used further down to push the fitted scaler
# into every engine's namespace via dv['ss']).
dv = client[:]
# Load-balanced view used for all the lb_view.map(...) calls below.
lb_view = client.load_balanced_view()
# Size of the cluster (24 in the recorded run).
print len(client)
24
# Load one blues clip and look at the dimensions of its spectrogram.
sample_rate, x = wavfile.read('wavs/blues/blues.00000.wav')
print(sample_rate)
print(x.shape)
# specgram returns four values; only the spectrum and its frequency axis are
# inspected here.
sx, freqs = pl.specgram(x, Fs=sample_rate, xextent=(0, 30))[:2]
print("%s %s" % (sx.shape, freqs.shape))
22050 (661794,) (129, 5169) (129,)
# Repeat the inspection on a classical clip, this time keeping all four
# values returned by specgram (spectrum, frequencies, time bins, image).
sample_rate, x = wavfile.read('wavs/classical/classical.00000.wav')
print sample_rate
print x.shape
sx, freqs, bins, img= pl.specgram(x, Fs=sample_rate, xextent=(0, 30), )
# Trailing comma: Python 2 print statement without a final newline.
print sx.shape, freqs.shape, bins.shape,
22050 (661794,) (129, 5169) (129,) (5169,)
# Row 0 of the spectrogram (rows line up with `freqs`, columns with the time
# bins, as the shapes printed above show).
print sx[0]
[ 283.29317962 10.74862745 281.31660468 ..., 30.92338931 95.98095322 229.4967336 ]
# Collect every WAV file below ./wavs together with its genre label.
# The label is the name of the directory that directly contains the file,
# e.g. wavs/blues/blues.00000.wav -> 'blues'.
wavfiles = []
for base, dirs, files in walk('wavs'):
    print(base)
    for f in files:
        # Require the real extension; a bare 'wav' suffix would also accept
        # names such as 'rawwav' that are not WAV files.
        if f.endswith('.wav'):
            wavfiles.append(path.abspath(path.join(base, f)))
# Derive the label with os.path rather than splitting on a literal '/', so
# the result does not depend on the platform's path separator.
labels = [path.basename(path.dirname(f)) for f in wavfiles]
wavs wavs/disco wavs/classical wavs/rock wavs/jazz wavs/country wavs/blues wavs/hiphop wavs/metal wavs/pop wavs/reggae
# Sanity check: there must be exactly one label per discovered file.
print len(wavfiles)
print len(labels)
1000 1000
def load_wav(fname):
    """Read the WAV file ``fname`` and return ``(sample_rate, samples)``.

    The import lives inside the function, presumably because the function is
    mapped over the cluster engines and must not rely on this notebook's
    module-level imports.
    """
    from scipy.io import wavfile
    sample_rate, samples = wavfile.read(fname)
    return sample_rate, samples
# Load every clip in parallel on the cluster engines.
result = lb_view.map(load_wav, wavfiles)
sample_rates, wav_list = zip(*result)
sample_rates = np.asarray(sample_rates)
# The clips do not all have the same number of samples, so they are kept as a
# 1-D object array holding one sample vector per element.  Filling a
# pre-allocated object array does not depend on how np.asarray treats ragged
# input (newer NumPy releases refuse it) and keeps the shape (n_clips,) even
# if every clip happens to have the same length.
wavs = np.empty(len(wav_list), dtype=object)
for clip_index, clip in enumerate(wav_list):
    wavs[clip_index] = clip
print(sample_rates.shape)
print(wavs.shape)
(1000,) (1000,)
# Distinct genre names (np.unique returns them sorted).
label_names = np.unique(labels)
# Turn the label list into an array so boolean masks like `labels == lb`
# can be used for indexing below.
labels = np.asarray(labels)
# Takes the smallest sample rate found.
# NOTE(review): this silently assumes every clip shares a single rate
# (22050 in the recorded run) — confirm, or assert that only one value exists.
sample_rate = np.unique(sample_rates)[0]
print label_names
print sample_rate
['blues' 'classical' 'country' 'disco' 'hiphop' 'jazz' 'metal' 'pop' 'reggae' 'rock'] 22050
# Plot the spectrograms of the first `ncols` clips of every genre, one genre
# per row of the figure.
nrows = len(label_names)
ncols = 3
# Slice with `ncols` (not a second literal 3) so the grid width is defined in
# exactly one place.
example_songs = {lb: wavs[labels == lb][:ncols] for lb in label_names}
fig, axes = pl.subplots(nrows=nrows, ncols=ncols, figsize=(6*ncols, 3*nrows))
for i, label in enumerate(label_names):
    # Walk over the clips that actually exist: a genre with fewer than
    # `ncols` clips leaves its remaining axes empty instead of raising.
    for j, song in enumerate(example_songs[label]):
        _ = axes[i, j].specgram(song, Fs=sample_rate, xextent=(0, 30))
        axes[i, j].set_title("%s %i" % (label, j))
def extract_fft(wav, n_coeffs=1000):
    """Return the magnitudes of the leading FFT coefficients of ``wav``.

    Parameters
    ----------
    wav : 1-D array-like
        Audio samples.
    n_coeffs : int, optional
        Number of leading coefficients to keep.  Defaults to 1000, the value
        that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        Non-negative magnitudes; the length is ``n_coeffs`` or the signal
        length, whichever is smaller.
    """
    # Imported locally so the function does not depend on module-level names
    # when it is mapped over the cluster engines.
    import numpy as np
    # np.fft.fft is used instead of calling `scipy.fft(...)`: in current
    # SciPy `scipy.fft` is a sub-module, so calling it raises TypeError.
    spectrum = np.fft.fft(wav)
    return np.abs(spectrum[:n_coeffs])
# Compute the FFT-magnitude features of every clip on the cluster and stack
# the results into a single array.
fft_rows = lb_view.map(extract_fft, list(wavs), block=True)
fft_X = np.asarray(fft_rows)
print(fft_X.shape)
(1000, 1000)
# Cache the FFT features and the genre labels on disk so they can be
# reloaded without recomputing.
np.save('fft.npy', fft_X)
np.save('labels.npy', labels)
print wavs.shape
# Encode the genre names as integer class ids; `classes_` holds the names
# indexed by id and replaces the earlier `label_names`.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
label_names = label_encoder.classes_
(1000,)
# Reload the cached features and labels.
fft_X = np.load('fft.npy')
labels = np.load('labels.npy')
# Standardise the FFT features.
# NOTE(review): the scaler is fitted on all clips, before any train/test
# split, so test-set statistics leak into the preprocessing — confirm this is
# acceptable for the experiment.
ss = StandardScaler()
fft_X = ss.fit_transform(fft_X)
print fft_X.shape
(1000, 1000)
## PCA analysis
# Fit a PCA with default settings on the standardised FFT features so the
# explained-variance spectrum can be inspected below.  `%time` is the IPython
# magic that reports how long the statement takes.
pca = RandomizedPCA()
%time pca.fit(fft_X)
CPU times: user 3.54 s, sys: 1.23 s, total: 4.77 s Wall time: 2.45 s
RandomizedPCA(copy=True, iterated_power=3, n_components=None, random_state=None, whiten=False)
# Bar chart of the share of variance explained by the 50 leading components.
n_shown = 50
component_names = ['pca%i' % k for k in range(n_shown)]
leading_ratios = pca.explained_variance_ratio_[:n_shown]
var_analysis = pd.DataFrame(leading_ratios, index=component_names)
var_analysis.plot(kind='bar', figsize=(16, 4))
<matplotlib.axes.AxesSubplot at 0x5b9e9990>
# Project the standardised FFT features onto 200 principal components.
# NOTE(review): this is a new PCA fitted on every clip, test clips included —
# same leakage caveat as for the scaler.
pca_fft = RandomizedPCA(n_components=200).fit_transform(fft_X)
print pca_fft.shape
(1000, 200)
# Hold out 20% of the clips for testing, then tune the SVC's `gamma` with
# 5-fold cross-validation on the remaining 80%.
data_X, data_y = shuffle(pca_fft, y, random_state=0)
# Seed the split as well: train_test_split draws its own random partition, so
# seeding only the shuffle above still gave a different split on every run.
train_X, test_X, train_y, test_y = train_test_split(
    data_X, data_y, test_size=0.2, random_state=0)
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
# n_jobs=-1 runs the candidate fits on all available cores.
gs = GridSearchCV(svc, {'gamma': gammas}, cv=5, n_jobs=-1)
gs.fit(train_X, train_y)
print(gs.best_params_)
print(gs.best_score_)
{'gamma': 0.03} 0.3325
# Refit an SVC with the selected gamma on the whole training split and report
# the held-out accuracy in two equivalent ways.
chosen_params = dict(gs.best_params_)
svc = SVC(**chosen_params)
svc.fit(train_X, train_y)
print(svc.score(test_X, test_y))
test_pred = svc.predict(test_X)
print(np.mean(test_pred == test_y))
0.335 0.335
# Per-class precision / recall / F1 on the held-out clips, followed by the
# class-id -> genre mapping needed to read the report rows.
print metrics.classification_report(test_y, svc.predict(test_X))
print list(enumerate(label_names))
precision recall f1-score support 0 0.78 0.24 0.37 29 1 0.74 0.61 0.67 23 2 0.67 0.15 0.25 26 3 0.38 0.25 0.30 20 4 0.00 0.00 0.00 25 5 0.69 0.60 0.64 15 6 0.33 0.33 0.33 15 7 0.16 0.80 0.27 15 8 0.06 0.09 0.07 11 9 0.28 0.48 0.35 21 avg / total 0.44 0.34 0.33 200 [(0, 'blues'), (1, 'classical'), (2, 'country'), (3, 'disco'), (4, 'hiphop'), (5, 'jazz'), (6, 'metal'), (7, 'pop'), (8, 'reggae'), (9, 'rock')]
# Confusion matrix as a labelled table; true labels are passed first, so by
# scikit-learn's convention rows are true genres and columns predictions.
confusion = metrics.confusion_matrix(test_y, svc.predict(test_X))
pd.DataFrame(confusion, columns=label_names, index=label_names)
/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated. warnings.warn(d.msg, DeprecationWarning) /usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated. warnings.warn(d.msg, DeprecationWarning)
blues | classical | country | disco | hiphop | jazz | metal | pop | reggae | rock | |
---|---|---|---|---|---|---|---|---|---|---|
blues | 7 | 1 | 0 | 0 | 0 | 1 | 1 | 15 | 1 | 3 |
classical | 0 | 14 | 0 | 1 | 0 | 0 | 0 | 6 | 0 | 2 |
country | 1 | 0 | 4 | 1 | 0 | 2 | 0 | 4 | 4 | 10 |
disco | 0 | 1 | 2 | 5 | 0 | 1 | 4 | 4 | 3 | 0 |
hiphop | 1 | 0 | 0 | 2 | 0 | 0 | 2 | 14 | 3 | 3 |
jazz | 0 | 1 | 0 | 0 | 0 | 9 | 0 | 2 | 1 | 2 |
metal | 0 | 0 | 0 | 1 | 0 | 0 | 5 | 9 | 0 | 0 |
pop | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 12 | 1 | 0 |
reggae | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 6 |
rock | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 6 | 2 | 10 |
# Try the MFCC extractor on two clips.  It returns three arrays; per the
# recorded output the first has 13 columns and one row per frame, and the
# number of frames differs between clips.
from scikits.talkbox.features import mfcc
ceps, mspec, spec = mfcc(wavs[0], )
print ceps.shape, mspec.shape, spec.shape
ceps, mspec, spec = mfcc(wavs[500], )
print ceps.shape, mspec.shape, spec.shape
(4173, 13) (4173, 40) (4173, 512) (4135, 13) (4135, 40) (4135, 512)
def extract_mfcc_avg(wav, mfcc_fn=None):
    """Return the per-coefficient mean of a clip's MFCC frames.

    Parameters
    ----------
    wav : 1-D array
        Audio samples of one clip.
    mfcc_fn : callable, optional
        Feature extractor called as ``mfcc_fn(wav)``; it must return a
        sequence whose first item is the ``(n_frames, n_coefficients)``
        cepstral matrix.  Defaults to ``scikits.talkbox.features.mfcc``.

    Returns
    -------
    numpy.ndarray
        One value per coefficient.  NaN and infinite frame values are left
        out of the average, so a single bad frame no longer turns the whole
        clip's feature into NaN/inf.  A coefficient with no finite value at
        all averages to 0.
    """
    # Local imports keep the function self-contained when it is mapped over
    # the cluster engines.
    import numpy as np
    if mfcc_fn is None:
        from scikits.talkbox.features import mfcc as mfcc_fn
    ceps = np.asarray(mfcc_fn(wav)[0], dtype=float)
    finite = np.isfinite(ceps)
    totals = np.where(finite, ceps, 0.).sum(axis=0)
    counts = finite.sum(axis=0)
    # max(count, 1) avoids 0/0 for a column without any finite value.
    return totals / np.maximum(counts, 1)
# Compute the averaged MFCC vector of every clip on the cluster and stack the
# results into one array with a row per clip.
mfcc_avg_rows = lb_view.map(extract_mfcc_avg, list(wavs), block=True)
mfcc_avg_feats = np.asarray(mfcc_avg_rows)
print(mfcc_avg_feats.shape)
(1000, 13)
# np.nan_to_num replaces NaN with 0 and infinities with very large finite
# numbers, so the scaler applied afterwards receives finite input only.
mfcc_avg_feats = np.nan_to_num(mfcc_avg_feats)
# Sanity check shown as the cell output: no NaN may remain.
np.any(np.isnan(mfcc_avg_feats))
False
# Standardise the averaged MFCC features and hold out 20% of the clips.
mfcc_avg_feats = StandardScaler().fit_transform(mfcc_avg_feats)
# Seed both the shuffle and the split (as the FFT experiment seeds its
# shuffle) so the partition, and therefore the reported scores, are
# reproducible between runs.
data_X, data_y = shuffle(mfcc_avg_feats, y, random_state=0)
train_X, test_X, train_y, test_y = train_test_split(
    data_X, data_y, test_size=0.2, random_state=0)
print("%s %s %s %s" % (train_X.shape, test_X.shape, train_y.shape, test_y.shape))
(800, 13) (200, 13) (800,) (200,)
# Tune the SVC's gamma with 5-fold cross-validation on the training split of
# the averaged-MFCC features; n_jobs=-1 uses all available cores.
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 5, n_jobs=-1,)
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_
{'gamma': 0.3} 0.54
# Refit with the selected gamma on the full training split, then print the
# held-out accuracy twice: via score() and as the mean of correct predictions.
svc = SVC(**gs.best_params_)
svc.fit(train_X, train_y)
print svc.score(test_X, test_y)
print np.mean(test_y == svc.predict(test_X))
0.545 0.545
# Per-class report and labelled confusion matrix for the averaged-MFCC model.
# The predictions are computed once and reused for both.
test_pred = svc.predict(test_X)
print(metrics.classification_report(test_y, test_pred))
print(list(enumerate(label_names)))
confusion = metrics.confusion_matrix(test_y, test_pred)
pd.DataFrame(confusion, index=label_names, columns=label_names)
precision recall f1-score support 0 0.83 0.62 0.71 24 1 0.84 0.84 0.84 19 2 0.52 0.48 0.50 23 3 0.41 0.53 0.46 17 4 0.43 0.43 0.43 21 5 0.41 0.50 0.45 18 6 0.71 0.81 0.76 21 7 0.63 0.75 0.69 16 8 0.27 0.14 0.19 21 9 0.35 0.40 0.37 20 avg / total 0.55 0.55 0.54 200 [(0, 'blues'), (1, 'classical'), (2, 'country'), (3, 'disco'), (4, 'hiphop'), (5, 'jazz'), (6, 'metal'), (7, 'pop'), (8, 'reggae'), (9, 'rock')]
/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated. warnings.warn(d.msg, DeprecationWarning) /usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated. warnings.warn(d.msg, DeprecationWarning)
blues | classical | country | disco | hiphop | jazz | metal | pop | reggae | rock | |
---|---|---|---|---|---|---|---|---|---|---|
blues | 15 | 0 | 4 | 0 | 0 | 0 | 1 | 0 | 2 | 2 |
classical | 0 | 16 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
country | 0 | 0 | 11 | 3 | 0 | 4 | 1 | 1 | 2 | 1 |
disco | 0 | 1 | 0 | 9 | 1 | 0 | 1 | 1 | 1 | 3 |
hiphop | 0 | 0 | 1 | 3 | 9 | 2 | 3 | 1 | 1 | 1 |
jazz | 0 | 2 | 0 | 1 | 3 | 9 | 0 | 2 | 0 | 1 |
metal | 2 | 0 | 0 | 1 | 1 | 0 | 17 | 0 | 0 | 0 |
pop | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 12 | 1 | 1 |
reggae | 1 | 0 | 1 | 3 | 3 | 2 | 0 | 2 | 3 | 6 |
rock | 0 | 0 | 2 | 1 | 4 | 4 | 0 | 0 | 1 | 8 |
def extract_mfcc(wav, mfcc_fn=None, trim=0.1):
    """Return the cleaned MFCC frames from the middle part of a clip.

    Parameters
    ----------
    wav : 1-D array
        Audio samples of one clip.
    mfcc_fn : callable, optional
        Feature extractor called as ``mfcc_fn(wav)``; it must return a
        sequence whose first item is the ``(n_frames, n_coefficients)``
        cepstral matrix.  Defaults to ``scikits.talkbox.features.mfcc``.
    trim : float, optional
        Fraction of frames dropped at each end of the clip.  Defaults to
        0.1, i.e. the central 80% is kept as before.

    Returns
    -------
    numpy.ndarray
        The kept frames with every value finite: +inf becomes the largest
        finite value of the clip, -inf the smallest, NaN becomes 0.

    Raises
    ------
    ValueError
        If ``trim`` is not in ``[0, 0.5)``.
    """
    # Local imports keep the function self-contained when it is mapped over
    # the cluster engines.
    import numpy as np
    if not 0 <= trim < 0.5:
        raise ValueError("trim must be in [0, 0.5), got %r" % (trim,))
    if mfcc_fn is None:
        from scikits.talkbox.features import mfcc as mfcc_fn
    # Work on a float copy so the extractor's own array is never modified.
    ceps = np.array(mfcc_fn(wav)[0], dtype=float)

    # Take the replacement bounds from the finite entries only.  The previous
    # code used ceps.max()/ceps.min() directly, which are themselves inf/NaN
    # whenever such values are present, and it tested np.isinf first, which
    # also matches -inf and therefore sent -inf to the maximum.
    finite = np.isfinite(ceps)
    if finite.any():
        upper = ceps[finite].max()
        lower = ceps[finite].min()
    else:
        upper = lower = 0.
    ceps[np.isposinf(ceps)] = upper
    ceps[np.isneginf(ceps)] = lower
    ceps[np.isnan(ceps)] = 0.

    num_ceps = ceps.shape[0]
    return ceps[int(num_ceps * trim):int(num_ceps * (1 - trim))]
# Extract the per-frame MFCC matrix of every clip on the cluster.
mfcc_feats = lb_view.map(extract_mfcc, [w for w in wavs], block = True)
# Stack the frames of all clips into one matrix (3309019 x 13 in the recorded
# run); it is used below to fit the scaler and the k-means codebook.
all_mfcc_feats = np.concatenate(mfcc_feats)
print all_mfcc_feats.shape
# Check that no non-finite value survived the cleaning; the trailing commas
# keep the three flags on one output line (Python 2 print statement).
print np.any(np.isnan(all_mfcc_feats)),
print np.any(np.isinf(all_mfcc_feats)),
print np.any(np.isneginf(all_mfcc_feats))
(3309019, 13) False False False
# Same -inf check once more, shown as the cell's output value.
np.any(np.isneginf(all_mfcc_feats))
False
# Fit one scaler on the frames of all clips and standardise the stacked
# matrix with it.
ss = StandardScaler()
all_mfcc_feats = ss.fit_transform(all_mfcc_feats)
# Push the fitted scaler into every engine's namespace under the name `ss`,
# which is the global that normalize_mfcc reads.
dv['ss'] = ss
def normalize_mfcc(mx):
    """Standardise one clip's MFCC frame matrix ``mx`` with the scaler ``ss``.

    ``ss`` is not a parameter: it is read from the global namespace of
    wherever the function runs.
    """
    scaled_frames = ss.transform(mx)
    return scaled_frames
# Standardise each clip's frames on the cluster with the shared scaler; the
# result stays a per-clip sequence of frame matrices.
mfcc_feats = lb_view.map(normalize_mfcc, mfcc_feats, block = True)
# Smallest standardised value of the first coefficient (cell output).
all_mfcc_feats[:, 0].min()
-9.1834703543812086
## find bag of words
# Learn a codebook of 200 "audio words" by clustering the standardised MFCC
# frames of all clips with mini-batch k-means.  `%time` is the IPython magic
# that reports how long the fit takes.
from sklearn.cluster import MiniBatchKMeans
kmeans = MiniBatchKMeans(n_clusters = 200, batch_size = 5000, max_iter = 30, n_init=1)
%time kmeans.fit(all_mfcc_feats)
CPU times: user 56.2 s, sys: 3.36 s, total: 59.6 s Wall time: 59.6 s
MiniBatchKMeans(batch_size=5000, compute_labels=True, init='k-means++', init_size=None, max_iter=30, max_no_improvement=10, n_clusters=200, n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0, verbose=0)
## tf translate of signals
# Term-frequency encoding: for every clip, count how many of its frames are
# assigned to each k-means cluster.
mfcc_bow = []
for mx in mfcc_feats:
    clusters = kmeans.predict(mx)
    # np.bincount tallies all clusters in a single pass over the frame
    # assignments (instead of one full scan per cluster); `minlength` keeps a
    # zero entry for clusters this clip never hits, so every row has
    # kmeans.n_clusters columns.
    mfcc_bow.append(np.bincount(clusters, minlength=kmeans.n_clusters))
mfcc_bow = np.asarray(mfcc_bow)
print(mfcc_bow.shape)
(1000, 200)
# The bag-of-words matrix holds integer counts; cast to float explicitly,
# because StandardScaler expects floating-point input and otherwise warns
# about the int64 dtype.
mfcc_bow = StandardScaler().fit_transform(mfcc_bow.astype(np.float64))
/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py:278: UserWarning: StandardScaler assumes floating point values as input, got int64 "got %s" % (estimator, X.dtype))
# Re-weight the bag-of-words columns with TF-IDF.
# NOTE(review): mfcc_bow has just been standardised, so the transformer sees
# centred (partly negative) values rather than raw counts — confirm that this
# ordering is intended.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(sublinear_tf=False)
mfcc_bow = tfidf.fit_transform(mfcc_bow)
y = LabelEncoder().fit_transform(labels)
# Seed both the shuffle and the split (as the FFT experiment seeds its
# shuffle) so the partition, and therefore the reported scores, are
# reproducible between runs.
data_X, data_y = shuffle(mfcc_bow, y, random_state=0)
train_X, test_X, train_y, test_y = train_test_split(
    data_X, data_y, test_size=0.2, random_state=0)
print("%s %s %s %s" % (train_X.shape, test_X.shape, train_y.shape, test_y.shape))
(800, 200) (200, 200) (800,) (200,)
# Tune the SVC's gamma with 5-fold cross-validation on the training split of
# the bag-of-words features; n_jobs=-1 uses all available cores.
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 5, n_jobs=-1,)
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_
{'gamma': 1.0} 0.66125
# Final evaluation of the bag-of-words model: refit with the selected gamma,
# then report accuracy, the per-class metrics and the confusion matrix.  The
# test predictions are computed once and reused.
svc = SVC(**gs.best_params_)
svc.fit(train_X, train_y)
test_pred = svc.predict(test_X)
print(svc.score(test_X, test_y))
print(np.mean(test_pred == test_y))
print(metrics.classification_report(test_y, test_pred))
print(list(enumerate(label_names)))
confusion = metrics.confusion_matrix(test_y, test_pred)
pd.DataFrame(confusion, index=label_names, columns=label_names)
0.645 0.645 precision recall f1-score support 0 0.63 0.57 0.60 21 1 0.76 0.89 0.82 18 2 0.56 0.70 0.62 20 3 0.36 0.53 0.43 15 4 0.59 0.57 0.58 23 5 0.84 0.76 0.80 21 6 0.75 0.91 0.82 23 7 0.93 0.78 0.85 18 8 0.47 0.47 0.47 17 9 0.58 0.29 0.39 24 avg / total 0.65 0.65 0.64 200 [(0, 'blues'), (1, 'classical'), (2, 'country'), (3, 'disco'), (4, 'hiphop'), (5, 'jazz'), (6, 'metal'), (7, 'pop'), (8, 'reggae'), (9, 'rock')]
/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated. warnings.warn(d.msg, DeprecationWarning) /usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated. warnings.warn(d.msg, DeprecationWarning)
blues | classical | country | disco | hiphop | jazz | metal | pop | reggae | rock | |
---|---|---|---|---|---|---|---|---|---|---|
blues | 12 | 1 | 1 | 1 | 0 | 1 | 2 | 0 | 2 | 1 |
classical | 0 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
country | 2 | 1 | 14 | 1 | 0 | 0 | 1 | 1 | 0 | 0 |
disco | 0 | 0 | 2 | 8 | 1 | 0 | 1 | 0 | 1 | 2 |
hiphop | 2 | 1 | 1 | 3 | 13 | 1 | 0 | 0 | 2 | 0 |
jazz | 0 | 2 | 1 | 1 | 0 | 16 | 1 | 0 | 0 | 0 |
metal | 1 | 0 | 0 | 0 | 1 | 0 | 21 | 0 | 0 | 0 |
pop | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 14 | 1 | 0 |
reggae | 0 | 0 | 3 | 1 | 5 | 0 | 0 | 0 | 8 | 0 |
rock | 2 | 0 | 1 | 6 | 2 | 1 | 2 | 0 | 3 | 7 |