Looking at the music¶

In [1]:

from scipy.io import wavfile
import scipy as sp
import pylab as pl
from glob import glob
from os import walk
from os import path
import numpy as np
from sklearn.decomposition import RandomizedPCA
from sklearn.decomposition import FastICA
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.utils import shuffle
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import pandas as pd

%pylab inline --no-import-all

Populating the interactive namespace from numpy and matplotlib

In [4]:

# Connect to an IPython.parallel cluster; the feature-extraction cells below
# farm their work out to its engines.
from IPython.parallel import Client
client = Client()
dv = client[:]  # view on all engines; used further down to push the scaler `ss` to them
lb_view = client.load_balanced_view()  # view used by every lb_view.map(...) call below
print len(client)

In [56]:

# Read one blues track and draw its spectrogram; the printed shapes show
# sx as (frequency bins, time bins) and freqs as (frequency bins,).
sample_rate, x = wavfile.read('wavs/blues/blues.00000.wav')
print sample_rate
print x.shape
# xextent=(0, 30) labels the x axis as 0-30 (the track is ~30 s at this sample rate).
sx, freqs, _, _= pl.specgram(x, Fs=sample_rate, xextent=(0, 30), )
print sx.shape, freqs.shape

22050
(661794,)
(129, 5169) (129,)

In [65]:

# Same inspection for a classical track, this time also keeping the time-bin
# centres (bins) and the image handle (img) returned by specgram.
sample_rate, x = wavfile.read('wavs/classical/classical.00000.wav')
print sample_rate
print x.shape
sx, freqs, bins, img= pl.specgram(x, Fs=sample_rate, xextent=(0, 30), )
# Trailing comma: Python 2 print without a newline.
print sx.shape, freqs.shape, bins.shape,

 22050
(661794,)
(129, 5169) (129,) (5169,)

In [59]:

print sx[0]

[ 283.29317962   10.74862745  281.31660468 ...,   30.92338931   95.98095322
  229.4967336 ]

Load all music¶

In [5]:

wavfiles = []
for base, dirs, files in walk('wavs'):
    print base
    for f in files:
        if f.endswith('wav'):
            wavfiles.append(path.abspath(path.join(base, f)))
labels = [f.rsplit('/', 2)[1] for f in wavfiles]

wavs
wavs/disco
wavs/classical
wavs/rock
wavs/jazz
wavs/country
wavs/blues
wavs/hiphop
wavs/metal
wavs/pop
wavs/reggae

In [6]:

print len(wavfiles)
print len(labels)

1000
1000

In [7]:

def load_wav(fname):
    """Read one WAV file.

    Parameters
    ----------
    fname : str
        Path of the WAV file to read.

    Returns
    -------
    tuple
        ``(sample_rate, samples)`` as produced by ``scipy.io.wavfile.read``.
    """
    # Imported inside the function because it is handed to lb_view.map and
    # therefore cannot rely on this notebook's top-level imports.
    from scipy.io import wavfile
    rate, samples = wavfile.read(fname)
    return rate, samples

In [8]:

result = lb_view.map(load_wav, wavfiles)
sample_rates, wavs = zip(*result)
sample_rates = np.asarray(sample_rates)
wavs = np.asarray(wavs)
print sample_rates.shape
print wavs.shape

(1000,)
(1000,)

In [9]:

# Sorted distinct genre names; labels becomes an array so that boolean masks
# such as `labels == lb` can be used to index `wavs`.
label_names = np.unique(labels)
labels = np.asarray(labels)
# NOTE(review): this picks the smallest distinct sample rate; it assumes every
# file shares a single rate (the output below shows only 22050) — confirm.
sample_rate = np.unique(sample_rates)[0]
print label_names
print sample_rate

['blues' 'classical' 'country' 'disco' 'hiphop' 'jazz' 'metal' 'pop'
 'reggae' 'rock']
22050

Visualize Specgram¶

In [43]:

# Up to three tracks per genre, keyed by genre name.
example_songs = {lb: wavs[labels == lb][:3] for lb in label_names}

In [50]:

# One row per genre, one column per example track: spectrogram of each example.
nrows, ncols = len(label_names), 3
fig, axes = pl.subplots(nrows=nrows, ncols=ncols,
                        figsize=(6 * ncols, 3 * nrows))
for i, label in enumerate(label_names):
    for j in range(ncols):
        panel = axes[i, j]
        _ = panel.specgram(example_songs[label][j], Fs=sample_rate,
                           xextent=(0, 30))
        panel.set_title("%s %i" % (label, j))

Use FFT¶

In [70]:

def extract_fft(wav, n_coeffs=1000):
    """Return the magnitudes of the first ``n_coeffs`` FFT coefficients.

    Parameters
    ----------
    wav : array-like
        1-D audio signal.
    n_coeffs : int, optional
        Number of leading coefficients to keep (default 1000, the value that
        was previously hard-coded).  Fewer are returned when the signal is
        shorter than ``n_coeffs``.

    Returns
    -------
    numpy.ndarray
        ``abs(fft(wav))[:n_coeffs]``.
    """
    # Local import: the function is handed to lb_view.map and cannot rely on
    # this notebook's top-level imports.
    import numpy as np
    # np.fft.fft replaces the top-level ``scipy.fft(...)`` call, which is not
    # callable in current SciPy releases (``scipy.fft`` is a module there).
    return np.abs(np.fft.fft(wav)[:n_coeffs])

In [73]:

fft_X = np.asarray(lb_view.map(extract_fft, [wav for wav in wavs], block=True))
print fft_X.shape

(1000, 1000)

In [103]:

np.save('fft.npy', fft_X)
np.save('labels.npy', labels)

In [104]:

print wavs.shape
# Encode the genre strings as integers; y[i] indexes into label_encoder.classes_,
# which replaces label_names from here on.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
label_names = label_encoder.classes_

(1000,)

In [105]:

fft_X = np.load('fft.npy')
labels = np.load('labels.npy')

In [106]:

ss = StandardScaler()
fft_X = ss.fit_transform(fft_X)
print fft_X.shape

(1000, 1000)

In [107]:

## PCA analysis
# Fit a full randomized PCA only to inspect how the variance is distributed.
# NOTE(review): no random_state is passed, so the fit is not reproducible run to run.
pca = RandomizedPCA()
%time pca.fit(fft_X)

CPU times: user 3.54 s, sys: 1.23 s, total: 4.77 s
Wall time: 2.45 s

Out[107]:

RandomizedPCA(copy=True, iterated_power=3, n_components=None,
       random_state=None, whiten=False)

In [108]:

# Bar chart of the explained-variance ratio of the 50 leading components.
n_shown = 50
component_names = ['pca%i' % k for k in range(n_shown)]
var_analysis = pd.DataFrame(pca.explained_variance_ratio_[:n_shown],
                            index=component_names)
var_analysis.plot(kind='bar', figsize=(16, 4))

Out[108]:

<matplotlib.axes.AxesSubplot at 0x5b9e9990>

In [109]:

pca_fft = RandomizedPCA(n_components=200).fit_transform(fft_X)
print pca_fft.shape

(1000, 200)

In [119]:

data_X, data_y = shuffle(pca_fft, y, random_state = 0)
train_X, test_X, train_y, test_y = train_test_split(data_X, data_y, test_size = 0.2)

svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 5, n_jobs=-1,)
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_

{'gamma': 0.03}
0.3325

In [120]:

svc = SVC(**gs.best_params_)
svc.fit(train_X, train_y)
print svc.score(test_X, test_y)
print np.mean(test_y == svc.predict(test_X))

0.335
0.335

In [121]:

# Per-class precision/recall on the held-out split; the report shows class
# indices, so the index -> genre mapping is printed underneath.
print metrics.classification_report(test_y, svc.predict(test_X))
print list(enumerate(label_names))

             precision    recall  f1-score   support

          0       0.78      0.24      0.37        29
          1       0.74      0.61      0.67        23
          2       0.67      0.15      0.25        26
          3       0.38      0.25      0.30        20
          4       0.00      0.00      0.00        25
          5       0.69      0.60      0.64        15
          6       0.33      0.33      0.33        15
          7       0.16      0.80      0.27        15
          8       0.06      0.09      0.07        11
          9       0.28      0.48      0.35        21

avg / total       0.44      0.34      0.33       200

[(0, 'blues'), (1, 'classical'), (2, 'country'), (3, 'disco'), (4, 'hiphop'), (5, 'jazz'), (6, 'metal'), (7, 'pop'), (8, 'reggae'), (9, 'rock')]

In [122]:

# Confusion matrix on the held-out split: rows are true genres, columns are
# predicted genres.  `labels` pins the matrix to one row/column per genre even
# if some genre is missing from both test_y and the predictions; otherwise the
# matrix would be smaller than label_names and the DataFrame could not be built.
confusion = metrics.confusion_matrix(test_y, svc.predict(test_X),
                                     labels=np.arange(len(label_names)))
pd.DataFrame(confusion, columns=label_names, index=label_names)

/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.

  warnings.warn(d.msg, DeprecationWarning)
/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.

  warnings.warn(d.msg, DeprecationWarning)

Out[122]:

	blues	classical	country	disco	jazz	metal	pop	reggae	rock
blues	7	1	0	0	1	1	15	1	3
classical	0	14	0	1	0	0	6	0	2
country	1	0	4	1	2	0	4	4	10
disco	0	1	2	5	1	4	4	3	0
hiphop	1	0	0	2	0	2	14	3	3
jazz	0	1	0	0	9	0	2	1	2
metal	0	0	0	1	0	5	9	0	0
pop	0	0	0	1	0	1	12	1	0
reggae	0	1	0	1	0	1	1	1	6
rock	0	1	0	1	0	1	6	2	10

MFCC - averaged¶

In [124]:

from scikits.talkbox.features import mfcc

In [172]:

# Inspect the three arrays mfcc returns for two different tracks; as the
# printed shapes show, the number of frames (first axis) differs per track.
ceps, mspec, spec = mfcc(wavs[0], )
print ceps.shape, mspec.shape, spec.shape
ceps, mspec, spec = mfcc(wavs[500], )
print ceps.shape, mspec.shape, spec.shape

(4173, 13) (4173, 40) (4173, 512)
(4135, 13) (4135, 40) (4135, 512)

In [183]:

def extract_mfcc_avg(wav):
    """Summarise a track by the per-coefficient mean of its MFCC frames.

    Parameters
    ----------
    wav : array-like
        Audio signal passed straight to ``scikits.talkbox.features.mfcc``.

    Returns
    -------
    numpy.ndarray
        Mean over axis 0 (the frame axis) of the first array returned by
        ``mfcc``.  Non-finite frame values are not filtered here.
    """
    # Local imports: the function is handed to lb_view.map and cannot rely on
    # this notebook's top-level imports.
    import numpy as np
    from scikits.talkbox.features import mfcc
    cepstra, _mel_spec, _spec = mfcc(wav)
    # A variant averaging only the frames between 10% and 90% of the track
    # exists in the history of this cell but is disabled.
    return np.mean(cepstra, axis=0)

In [184]:

mfcc_avg_feats = np.asarray(lb_view.map(extract_mfcc_avg, 
                                        [wav for wav in wavs], 
                                        block=True))
print mfcc_avg_feats.shape

(1000, 13)

In [188]:

# Replace NaN with 0 and +/-inf with very large finite numbers, then verify no NaN is left.
# NOTE(review): this cell's execution count (In [188]) is higher than that of
# the scaling cell shown after it (In [186]), so the displayed order is not the
# order in which the cells were last run — confirm the intended sequence.
mfcc_avg_feats = np.nan_to_num(mfcc_avg_feats)
np.any(np.isnan(mfcc_avg_feats))

Out[188]:

False

In [186]:

# Standardise each averaged coefficient; overwrites mfcc_avg_feats in place,
# so re-running this cell rescales already-scaled data.
mfcc_avg_feats = StandardScaler().fit_transform(mfcc_avg_feats)

In [189]:

data_X, data_y = shuffle(mfcc_avg_feats, y)
train_X, test_X, train_y, test_y = train_test_split(data_X, data_y, test_size = 0.2)
print train_X.shape, test_X.shape, train_y.shape, test_y.shape

(800, 13) (200, 13) (800,) (200,)

In [190]:

# Select the RBF-SVM gamma by 5-fold cross-validation on the training split;
# n_jobs=-1 uses all local cores.
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 5, n_jobs=-1,)
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_

{'gamma': 0.3}
0.54

In [194]:

svc = SVC(**gs.best_params_)
svc.fit(train_X, train_y)
print svc.score(test_X, test_y)
print np.mean(test_y == svc.predict(test_X))

0.545
0.545

In [193]:

print metrics.classification_report(test_y, svc.predict(test_X))
print list(enumerate(label_names))
confusion = metrics.confusion_matrix(test_y, svc.predict(test_X))
pd.DataFrame(confusion, columns=label_names, index=label_names)

             precision    recall  f1-score   support

          0       0.83      0.62      0.71        24
          1       0.84      0.84      0.84        19
          2       0.52      0.48      0.50        23
          3       0.41      0.53      0.46        17
          4       0.43      0.43      0.43        21
          5       0.41      0.50      0.45        18
          6       0.71      0.81      0.76        21
          7       0.63      0.75      0.69        16
          8       0.27      0.14      0.19        21
          9       0.35      0.40      0.37        20

avg / total       0.55      0.55      0.54       200

[(0, 'blues'), (1, 'classical'), (2, 'country'), (3, 'disco'), (4, 'hiphop'), (5, 'jazz'), (6, 'metal'), (7, 'pop'), (8, 'reggae'), (9, 'rock')]

/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.

  warnings.warn(d.msg, DeprecationWarning)
/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.

  warnings.warn(d.msg, DeprecationWarning)

Out[193]:

	blues	classical	country	disco	hiphop	jazz	metal	pop	reggae	rock
blues	15	0	4	0	0	0	1	0	2	2
classical	0	16	1	0	0	1	1	0	0	0
country	0	0	11	3	0	4	1	1	2	1
disco	0	1	0	9	1	0	1	1	1	3
hiphop	0	0	1	3	9	2	3	1	1	1
jazz	0	2	0	1	3	9	0	2	0	1
metal	2	0	0	1	1	0	17	0	0	0
pop	0	0	1	1	0	0	0	12	1	1
reggae	1	0	1	3	3	2	0	2	3	6
rock	0	0	2	1	4	4	0	0	1	8

MFCC - BOW¶

In [10]:

def extract_mfcc(wav):
    """Return the cleaned MFCC frames of the central 80% of a track.

    Non-finite values are replaced before trimming: ``+inf`` by the largest
    finite value in the matrix, ``-inf`` by the smallest finite value, and
    ``NaN`` by 0.  If the matrix holds no finite value at all, both
    infinities become 0.

    Parameters
    ----------
    wav : array-like
        Audio signal passed straight to ``scikits.talkbox.features.mfcc``.

    Returns
    -------
    numpy.ndarray
        The frames from 10% to 90% of the frame axis, all values finite.
    """
    # Local imports: the function is handed to lb_view.map and cannot rely on
    # this notebook's top-level imports.
    from scikits.talkbox.features import mfcc
    import numpy as np
    ceps, _, _ = mfcc(wav)
    ceps = np.array(ceps, dtype=float)  # private copy that is safe to edit
    # The replacement values must come from the finite entries only:
    # max()/min() over the raw matrix are themselves inf/NaN whenever such
    # entries exist.  +inf and -inf are matched separately because np.isinf
    # is true for both signs.
    finite = np.isfinite(ceps)
    if finite.any():
        finite_max = ceps[finite].max()
        finite_min = ceps[finite].min()
    else:
        finite_max = finite_min = 0.
    ceps[np.isposinf(ceps)] = finite_max
    ceps[np.isneginf(ceps)] = finite_min
    ceps[np.isnan(ceps)] = 0.
    num_ceps = ceps.shape[0]
    return ceps[int(num_ceps*0.1):int(num_ceps*0.9)]

In [11]:

mfcc_feats = lb_view.map(extract_mfcc, [w for w in wavs], block = True)

all_mfcc_feats = np.concatenate(mfcc_feats)
print all_mfcc_feats.shape
print np.any(np.isnan(all_mfcc_feats)), 
print np.any(np.isinf(all_mfcc_feats)), 
print np.any(np.isneginf(all_mfcc_feats))

(3309019, 13)
False False False

In [12]:

np.any(np.isneginf(all_mfcc_feats))

Out[12]:

False

In [13]:

# Fit one scaler on the frames of all tracks and standardise the stacked
# matrix; the fitted scaler `ss` is reused below for the per-track matrices.
ss = StandardScaler()
all_mfcc_feats = ss.fit_transform(all_mfcc_feats)

In [14]:

# Send the fitted scaler to every engine under the name `ss`, so that the
# function below can refer to it when it runs remotely.
dv['ss'] = ss
def normalize_mfcc(mx):
    """Standardise one MFCC frame matrix with the scaler named ``ss``.

    Executed through ``lb_view.map``; presumably ``ss`` resolves to the copy
    pushed with ``dv['ss'] = ss`` — confirm against the cluster setup.
    """
    return ss.transform(mx)
# Overwrites mfcc_feats with the scaled matrices: re-running this cell would
# scale the already-scaled data a second time.
mfcc_feats = lb_view.map(normalize_mfcc, mfcc_feats, block = True)

In [15]:

all_mfcc_feats[:, 0].min()

Out[15]:

-9.1834703543812086

In [17]:

## find bag of words
from sklearn.cluster import MiniBatchKMeans
kmeans = MiniBatchKMeans(n_clusters = 200, batch_size = 5000, max_iter = 30, n_init=1)

In [18]:

%time kmeans.fit(all_mfcc_feats)

CPU times: user 56.2 s, sys: 3.36 s, total: 59.6 s
Wall time: 59.6 s

Out[18]:

MiniBatchKMeans(batch_size=5000, compute_labels=True, init='k-means++',
        init_size=None, max_iter=30, max_no_improvement=10, n_clusters=200,
        n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=0)

In [41]:

## tf translate of signals
mfcc_bow = []
for mx in mfcc_feats:
    clusters = kmeans.predict(mx)
    mfcc_bow.append(np.asarray([np.sum(clusters == c) for c in xrange(kmeans.n_clusters)]))
mfcc_bow = np.asarray(mfcc_bow)
print mfcc_bow.shape

(1000, 200)

In [27]:

# NOTE(review): this cell was last run as In [27], before the cell that builds
# mfcc_bow (In [41]) and the tf-idf cell (In [42]).  The results below therefore
# presumably come from the raw counts, whereas a top-to-bottom run would
# standardise the counts first — confirm which is intended.
mfcc_bow = StandardScaler().fit_transform(mfcc_bow)

/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py:278: UserWarning: StandardScaler assumes floating point values as input, got int64
  "got %s" % (estimator, X.dtype))

In [42]:

# Re-weight the cluster histograms with tf-idf; overwrites mfcc_bow with the
# transformer's output.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(sublinear_tf=False)
mfcc_bow = tfidf.fit_transform(mfcc_bow)

In [43]:

# Re-encode the genre strings as integer targets with a fresh LabelEncoder.
y = LabelEncoder().fit_transform(labels)

In [44]:

data_X, data_y = shuffle(mfcc_bow, y)
train_X, test_X, train_y, test_y = train_test_split(data_X, data_y, test_size = 0.2)
print train_X.shape, test_X.shape, train_y.shape, test_y.shape

(800, 200) (200, 200) (800,) (200,)

In [45]:

# Select the RBF-SVM gamma by 5-fold cross-validation on the training split;
# n_jobs=-1 uses all local cores.
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 5, n_jobs=-1,)
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_

{'gamma': 1.0}
0.66125

In [46]:

svc = SVC(**gs.best_params_)
svc.fit(train_X, train_y)
print svc.score(test_X, test_y)
print np.mean(test_y == svc.predict(test_X))
print metrics.classification_report(test_y, svc.predict(test_X))
print list(enumerate(label_names))
confusion = metrics.confusion_matrix(test_y, svc.predict(test_X))
pd.DataFrame(confusion, columns=label_names, index=label_names)

0.645
0.645
             precision    recall  f1-score   support

          0       0.63      0.57      0.60        21
          1       0.76      0.89      0.82        18
          2       0.56      0.70      0.62        20
          3       0.36      0.53      0.43        15
          4       0.59      0.57      0.58        23
          5       0.84      0.76      0.80        21
          6       0.75      0.91      0.82        23
          7       0.93      0.78      0.85        18
          8       0.47      0.47      0.47        17
          9       0.58      0.29      0.39        24

avg / total       0.65      0.65      0.64       200

[(0, 'blues'), (1, 'classical'), (2, 'country'), (3, 'disco'), (4, 'hiphop'), (5, 'jazz'), (6, 'metal'), (7, 'pop'), (8, 'reggae'), (9, 'rock')]

/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.

  warnings.warn(d.msg, DeprecationWarning)
/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.

  warnings.warn(d.msg, DeprecationWarning)

Out[46]:

	blues	classical	country	disco	hiphop	jazz	metal	pop	reggae	rock
blues	12	1	1	1	0	1	2	0	2	1
classical	0	16	0	0	0	0	0	0	0	2
country	2	1	14	1	0	0	1	1	0	0
disco	0	0	2	8	1	0	1	0	1	2
hiphop	2	1	1	3	13	1	0	0	2	0
jazz	0	2	1	1	0	16	1	0	0	0
metal	1	0	0	0	1	0	21	0	0	0
pop	0	0	2	1	0	0	0	14	1	0
reggae	0	0	3	1	5	0	0	0	8	0
rock	2	0	1	6	2	1	2	0	3	7

In [ ]: