import cPickle
from IPython.parallel import Client
import mahotas as mh
import pylab as pl
import seaborn as sb
import numpy as np
from functools import partial
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import RandomizedPCA
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import normalize
%pylab inline --no-import
sb.set(style = 'nogrid')
Populating the interactive namespace from numpy and matplotlib
def draw_confusion_matrix(mat, label_names):
    """Render confusion matrix *mat* as a heat map, labeling both axes
    with the class names in *label_names*."""
    ticks = range(len(label_names))
    pl.matshow(mat, cmap=pl.cm.jet)
    pl.xticks(ticks, label_names)
    pl.yticks(ticks, label_names)
    pl.colorbar()
# Connect to the running IPython.parallel cluster and report the engine count.
client = Client()
print len(client)
# Direct view over all engines (used to push data/functions to every engine)
# and a load-balanced view (used to farm out per-image feature extraction).
dv = client[:]
lb_view = client.load_balanced_view()
24
## load data
images, labels = cPickle.load(open('../data/subcifa.pkl', 'r'))
print len(images), len(labels)
3979 3979
# Shuffle images and labels together so the later train/test splits are not
# ordered by class.
images, labels = shuffle(images, labels)
images = list(images)
print images[0].shape
image_shape = images[0].shape
# Encode the string class labels as integers 0..n_classes-1; keep the
# original names around for plot labeling.
ll = LabelEncoder()
y = ll.fit_transform(labels)
label_names = ll.classes_
print y.shape
print label_names
(256, 384) (3979,) ['Anims' 'Cars' 'Distras' 'Trans']
np.unique([im.shape for im in images])
array([[214, 343], [253, 384], [255, 384], [256, 383], [256, 384], [258, 384], [260, 384], [262, 384], [384, 256]])
def extract_haralick(img, averaged=True):
    """Haralick texture features for one grayscale image.

    Imports are local so the function can be shipped to IPython.parallel
    engines.  Returns the 13 direction-averaged features when *averaged*
    is true, otherwise all 4 x 13 values flattened (52-dim).
    """
    import mahotas as mh
    import numpy as np
    texture = mh.features.haralick(img)
    return texture.mean(axis=0) if averaged else texture.ravel()
# Push the extractor to every engine, then compute the full (4 x 13) Haralick
# vector for each image in parallel.
dv['extract_haralick'] = extract_haralick
## full haralick
X_hara_full = lb_view.map(partial(extract_haralick, averaged = False),
images, block=True)
X_hara_full = np.asarray(X_hara_full)
print X_hara_full.shape
(3979, 52)
## averaged haralick on 4 directions
# Same as above but averaging over the 4 co-occurrence directions (13 dims).
X_hara_averaged = lb_view.map(partial(extract_haralick, averaged = True),
images, block=True)
X_hara_averaged = np.asarray(X_hara_averaged)
print X_hara_averaged.shape
(3979, 13)
# Standardize each feature column to zero mean / unit variance.
# NOTE(review): fit_transform refits `ss` on each matrix, so reusing one
# scaler object for both matrices is harmless here.
ss = StandardScaler()
X_hara_full = ss.fit_transform(X_hara_full)
X_hara_averaged = ss.fit_transform(X_hara_averaged)
print X_hara_full.shape, X_hara_averaged.shape
(3979, 52) (3979, 13)
Haralick full-feature (52-dim) performance
# Hold out a test set (train_test_split default 25%); fixed seed for
# reproducibility across the experiments below.
X_harafull_train, X_harafull_test, y_train, y_test = train_test_split(X_hara_full, y,
random_state = 0)
print X_harafull_train.shape, X_harafull_test.shape
print y_train.shape, y_test.shape
(2984, 52) (995, 52) (2984,) (995,)
# RBF-SVM: tune gamma over a log-spaced grid with 10-fold CV on the train set.
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 10, n_jobs=-1)
gs.fit(X_harafull_train, y_train)
print gs.best_params_
print gs.best_score_
{'gamma': 0.1} 0.67191689008
# Refit with the best gamma on the full training set; report held-out accuracy
# and the per-class confusion matrix.
svc = SVC(**gs.best_params_)
svc.fit(X_harafull_train, y_train)
print svc.score(X_harafull_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, svc.predict(X_harafull_test)),
label_names)
0.663316582915
Haralick direction-averaged (13-dim) performance
# Same split / gamma-search / evaluation protocol, now on the 13-dim
# direction-averaged Haralick features.
X_haraavg_train, X_haraavg_test, y_train, y_test = train_test_split(X_hara_averaged, y,
random_state = 0)
print X_haraavg_train.shape, X_haraavg_test.shape
print y_train.shape, y_test.shape
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 10, n_jobs=-1)
gs.fit(X_haraavg_train, y_train)
print gs.best_params_
print gs.best_score_
svc = SVC(**gs.best_params_)
svc.fit(X_haraavg_train, y_train)
print svc.score(X_haraavg_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, svc.predict(X_haraavg_test)),
label_names)
(2984, 13) (995, 13) (2984,) (995,) {'gamma': 0.3} 0.645442359249 0.64824120603
def extract_surf_avg(img):
    """Mean of all SURF descriptors of one image: a 64-dim summary vector.

    Imports are local so the function is self-contained when shipped to
    IPython.parallel engines.
    """
    from mahotas.features import surf
    import numpy as np
    descriptors = surf.surf(img, descriptor_only=True)
    return descriptors.mean(axis=0)
# One averaged SURF descriptor per image, computed in parallel.
X_surf_avg = lb_view.map(extract_surf_avg, images, block=True)
X_surf_avg = np.asarray(X_surf_avg)
print X_surf_avg.shape
(3979, 64)
ss = StandardScaler()
X_surf_avg = StandardScaler().fit_transform(X_surf_avg)
X_surfavg_train, X_surfavg_test, y_train, y_test = train_test_split(X_surf_avg, y,
random_state = 0)
print X_surfavg_train.shape, X_surfavg_test.shape
print y_train.shape, y_test.shape
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 10, n_jobs=-1)
gs.fit(X_surfavg_train, y_train)
print gs.best_params_
print gs.best_score_
svc = SVC(**gs.best_params_)
svc.fit(X_surfavg_train, y_train)
print svc.score(X_surfavg_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, svc.predict(X_surfavg_test)),
label_names)
(2984, 64) (995, 64) (2984,) (995,) {'gamma': 0.01} 0.571715817694 0.563819095477
def extract_surf(img):
    """All SURF descriptors of one image as an (n_points, 64) array.

    Imports are local so the function can run on IPython.parallel engines.
    """
    from mahotas.features import surf
    import numpy as np
    return surf.surf(img, descriptor_only=True)
# Per-image descriptor sets, plus one pooled matrix for codebook learning.
surf_feats = lb_view.map(extract_surf, images, block=True)
all_feats = np.concatenate(surf_feats)
print all_feats.shape
(2489070, 64)
from scipy.stats import skew
# NOTE(review): `skew` is imported but never used below -- candidate for removal.
# Spread of per-descriptor max/min/mean: a quick check of whether SURF
# dimensions already share a similar value range before clustering.
print np.std(all_feats.max(axis = 1))
print np.std(all_feats.min(axis = 1))
print np.std(all_feats.mean(axis = 1))
0.102311791226 0.120414854283 0.0189734020146
**It seems SURF features behave quite well in terms of producing similar-range values, so normalization is not necessary before clustering.**
## use a big set of visual words - SVC may have difficulty of handling it directly though (or not?)
# Mini-batch k-means scales to the ~2.5M pooled descriptors; the 1000
# centroids form the visual-word vocabulary.
kmeans = MiniBatchKMeans(n_clusters = 1000, batch_size=10000, random_state=0)
kmeans.fit(all_feats)
MiniBatchKMeans(batch_size=10000, compute_labels=True, init='k-means++', init_size=None, max_iter=100, max_no_improvement=10, n_clusters=1000, n_init=3, random_state=0, reassignment_ratio=0.01, tol=0.0, verbose=0)
## get visual words
dv['kmeans'] = kmeans
def get_bow(feat):
import numpy as np
feat_clusters = kmeans.predict(feat)
bow = [np.sum(feat_clusters==c) for c in xrange(kmeans.n_clusters)]
return bow
X_surf_bow = np.asarray(lb_view.map(get_bow, surf_feats, block = True))
print X_surf_bow.shape
(3979, 1000)
SURF bag-of-words performance -- without dimensionality reduction; SVC is very slow at handling such high-dimensional data
# SVC directly on the raw 1000-dim BoW histograms (slow but feasible).
X_surfbow_train, X_surfbow_test, y_train, y_test = train_test_split(X_surf_bow, y,
random_state = 0)
print X_surfbow_train.shape, X_surfbow_test.shape
print y_train.shape, y_test.shape
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 10, n_jobs=-1)
gs.fit(X_surfbow_train, y_train)
print gs.best_params_
print gs.best_score_
svc = SVC(**gs.best_params_)
svc.fit(X_surfbow_train, y_train)
print svc.score(X_surfbow_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, svc.predict(X_surfbow_test)),
label_names)
(2984, 1000) (995, 1000) (2984,) (995,) {'gamma': 0.0003} 0.649463806971 0.63216080402
surf bow PCA
# Plot the explained-variance spectrum of the BoW matrix to pick a reduced
# dimensionality for the next experiment.
pca = RandomizedPCA(n_components=100)
pca.fit(X_surf_bow)
explained_vars = pd.DataFrame(pca.explained_variance_ratio_,
index = ['pca%i'%i for i in xrange(pca.components_.shape[0])])
explained_vars.plot(kind = 'bar')
<matplotlib.axes.AxesSubplot at 0xd4ad4990>
# Project BoW counts onto the top 50 randomized-PCA components, then the
# usual split / gamma-search / evaluation.
X_surf_pca = RandomizedPCA(n_components=50).fit_transform(X_surf_bow)
X_surfpca_train, X_surfpca_test, y_train, y_test = train_test_split(X_surf_pca, y,
random_state = 0)
print X_surfpca_train.shape, X_surfpca_test.shape
print y_train.shape, y_test.shape
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 10, n_jobs=-1)
gs.fit(X_surfpca_train, y_train)
print gs.best_params_
print gs.best_score_
svc = SVC(**gs.best_params_)
svc.fit(X_surfpca_train, y_train)
print svc.score(X_surfpca_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, svc.predict(X_surfpca_test)),
label_names)
(2984, 50) (995, 50) (2984,) (995,) {'gamma': 0.001} 0.63773458445 0.643216080402
surf with tfidf
# Reweight visual-word counts with TF-IDF (result is a sparse matrix).
tfidf = TfidfTransformer()
X_surf_tfidf = tfidf.fit_transform(X_surf_bow)
print X_surf_tfidf.shape
(3979, 1000)
# SVC on the TF-IDF-weighted BoW features, same protocol as before.
X_surftfidf_train, X_surftfidf_test, y_train, y_test = train_test_split(X_surf_tfidf, y,
random_state = 0)
print X_surftfidf_train.shape, X_surftfidf_test.shape
print y_train.shape, y_test.shape
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 10, n_jobs=-1)
gs.fit(X_surftfidf_train, y_train)
print gs.best_params_
print gs.best_score_
svc = SVC(**gs.best_params_)
svc.fit(X_surftfidf_train, y_train)
print svc.score(X_surftfidf_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, svc.predict(X_surftfidf_test)),
label_names)
(2984, 1000) (995, 1000) (2984,) (995,) {'gamma': 1.0} 0.642426273458 0.63216080402
print X_surftfidf_train.shape, X_surftfidf_test.shape
print y_train.shape, y_test.shape
sgd = SGDClassifier(penalty='elasticnet', )
alphas = [1e-5, 3e-5, 1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(sgd, {'alpha': alphas}, cv = 10, n_jobs=-1)
gs.fit(X_surftfidf_train, y_train)
print gs.best_params_
print gs.best_score_
sgd = SGDClassifier(**gs.best_params_)
sgd.fit(X_surftfidf_train, y_train)
print sgd.score(X_surftfidf_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, sgd.predict(X_surftfidf_test)),
label_names)
(2984, 1000) (995, 1000) (2984,) (995,) {'alpha': 0.0003} 0.612600536193 0.593969849246
## note X_hara_full have been normalized before
# Concatenate the 52 standardized Haralick columns with the 1000 BoW counts.
X_harasurf_full = np.c_[X_hara_full, X_surf_bow]
print X_harasurf_full.shape
(3979, 1052)
# SVC on the combined Haralick + BoW features.
X_train, X_test, y_train, y_test = train_test_split(X_harasurf_full, y,
random_state = 0)
print X_train.shape, X_test.shape
print y_train.shape, y_test.shape
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 10, n_jobs=-1)
gs.fit(X_train, y_train)
print gs.best_params_
print gs.best_score_
svc = SVC(**gs.best_params_)
svc.fit(X_train, y_train)
print svc.score(X_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, svc.predict(X_test)),
label_names)
(2984, 1052) (995, 1052) (2984,) (995,) {'gamma': 0.0003} 0.692694369973 0.674371859296
normalization (not sparse anymore) + SGD. It runs much faster than SVC and achieves a comparable result -- good features go a long way.
ss = StandardScaler()
X_harasurf_norm = ss.fit_transform(X_harasurf_full)
X_train, X_test, y_train, y_test = train_test_split(X_harasurf_norm, y,
random_state = 0)
sgd = SGDClassifier(penalty='elasticnet', )
alphas = [1e-5, 3e-5, 1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(sgd, {'alpha': alphas}, cv = 10, n_jobs=-1)
gs.fit(X_train, y_train)
print gs.best_params_
print gs.best_score_
sgd = SGDClassifier(**gs.best_params_)
sgd.fit(X_train, y_train)
print sgd.score(X_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, sgd.predict(X_test)),
label_names)
{'alpha': 0.01} 0.652479892761 0.676381909548
surf_feats
and all_feats
print len(surf_feats), all_feats.shape
3979 (2489070, 64)
# Sample 1000 random descriptors as soft-coding "seed" words and L2-normalize
# them so the dot products in soft_thresh are cosine similarities; push the
# seeds to every engine.
k_seeds = 1000
seeds = all_feats[shuffle(np.arange(all_feats.shape[0]))][:k_seeds]
seeds = normalize(seeds)
print seeds.shape
dv['seeds'] = seeds
(1000, 64)
def soft_thresh(surf_feat):
    """Soft-threshold coding of one image's SURF descriptors.

    Computes the cosine similarity of every (L2-normalized) descriptor
    against the global `seeds` codebook, zeroes similarities below the
    overall mean, and averages into one per-seed activation vector.
    Local imports keep the function self-contained for parallel engines.
    """
    from sklearn.preprocessing import normalize
    import numpy as np
    sims = np.dot(normalize(surf_feat), seeds.T)
    cutoff = np.mean(sims)
    sims = np.where(sims < cutoff, 0.0, sims)
    return sims.mean(axis=0)
# Soft-threshold activations per image, computed in parallel.
soft_surf_X = lb_view.map(soft_thresh, surf_feats, block=True)
soft_surf_X = np.asarray(soft_surf_X)
print soft_surf_X.shape
(3979, 1000)
# SVC on the soft-threshold-coded features, same protocol as before.
X_train, X_test, y_train, y_test = train_test_split(soft_surf_X, y,
random_state = 0)
print X_train.shape, X_test.shape
print y_train.shape, y_test.shape
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 10, n_jobs=-1)
gs.fit(X_train, y_train)
print gs.best_params_
print gs.best_score_
svc = SVC(**gs.best_params_)
svc.fit(X_train, y_train)
print svc.score(X_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, svc.predict(X_test)),
label_names)
(2984, 1000) (995, 1000) (2984,) (995,) {'gamma': 0.3} 0.595174262735 0.590954773869
from sklearn.decomposition import NMF
nmf = NMF(n_components=200, )
X_surf_bow.shape
nmf.fit(X_surf_bow)
X_surf_nmf = nmf.transform(X_surf_bow)
print X_surf_nmf.shape
(3979, 200)
# SVC on the 200-dim NMF representation, same protocol as before.
X_train, X_test, y_train, y_test = train_test_split(X_surf_nmf, y,
random_state = 0)
print X_train.shape, X_test.shape
print y_train.shape, y_test.shape
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 10, n_jobs=-1)
gs.fit(X_train, y_train)
print gs.best_params_
print gs.best_score_
svc = SVC(**gs.best_params_)
svc.fit(X_train, y_train)
print svc.score(X_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, svc.predict(X_test)),
label_names)
(2984, 200) (995, 200) (2984,) (995,) {'gamma': 0.3} 0.638404825737 0.612060301508
# Visualize the first few images next to their detected SURF interest points.
# The original referenced a module-level `surf` name that was never imported
# at top level (it only existed inside the extractor functions), raising a
# NameError -- import it here before use.
from mahotas.features import surf
for i in xrange(5):
    fig, axes = pl.subplots(nrows = 1, ncols = 2)
    fig.subplots_adjust(wspace = 0, hspace = 0)
    axes[0].imshow(images[i], cmap = pl.cm.gray)
    axes[1].imshow(surf.show_surf(images[i], surf.surf(images[i])))
    fig.suptitle(labels[i])