import cPickle
from functools import partial

from IPython.parallel import Client
import mahotas as mh
import numpy as np
import pandas as pd
import pylab as pl
import seaborn as sb
from sklearn.cluster import MiniBatchKMeans
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import RandomizedPCA
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler, normalize
from sklearn.svm import SVC
from sklearn.utils import shuffle

%pylab --no-import-all inline

sb.set(style = 'nogrid')

def draw_confusion_matrix(mat, label_names):
    pl.matshow(mat, cmap = pl.cm.jet)
    pl.xticks(range(len(label_names)), label_names)
    pl.yticks(range(len(label_names)), label_names)
    pl.colorbar()

## set up the IPython.parallel cluster
client = Client()
print len(client)
dv = client[:]
lb_view = client.load_balanced_view()

## load data
images, labels = cPickle.load(open('../data/subcifa.pkl', 'rb'))
print len(images), len(labels)

images, labels = shuffle(images, labels)
images = list(images)
print images[0].shape
image_shape = images[0].shape

ll = LabelEncoder()
y = ll.fit_transform(labels)
label_names = ll.classes_
print y.shape
print label_names

## sanity check: every image should have the same dimensions
print np.unique([im.shape for im in images])

def extract_haralick(img, averaged=True):
    ## imports live inside the function so it can run on the engines
    import mahotas as mh
    import numpy as np
    feats = mh.features.haralick(img)
    if averaged:
        return feats.mean(axis = 0)
    else:
        return feats.ravel()

dv['extract_haralick'] = extract_haralick

## full haralick: 4 directions x 13 statistics = 52 features per image
X_hara_full = lb_view.map(partial(extract_haralick, averaged = False), images, block=True)
X_hara_full = np.asarray(X_hara_full)
print X_hara_full.shape

## haralick averaged over the 4 directions: 13 features per image
X_hara_averaged = lb_view.map(partial(extract_haralick, averaged = True), images, block=True)
X_hara_averaged = np.asarray(X_hara_averaged)
print X_hara_averaged.shape

## fit_transform refits the scaler, so each matrix is standardized independently
ss = StandardScaler()
X_hara_full = ss.fit_transform(X_hara_full)
X_hara_averaged = ss.fit_transform(X_hara_averaged)
print X_hara_full.shape, X_hara_averaged.shape

## SVC on full haralick features
X_harafull_train, X_harafull_test, y_train, y_test = train_test_split(X_hara_full, y, random_state = 0)
print X_harafull_train.shape, X_harafull_test.shape
print y_train.shape, y_test.shape

svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 10, n_jobs=-1)
gs.fit(X_harafull_train, y_train)
print gs.best_params_
print gs.best_score_

svc = SVC(**gs.best_params_)
svc.fit(X_harafull_train, y_train)
print svc.score(X_harafull_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, svc.predict(X_harafull_test)), label_names)
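## The split / gamma grid search / refit / confusion-matrix sequence above is
## repeated verbatim for every feature set below.  A hypothetical convenience
## wrapper like this one (a sketch only; it relies on the draw_confusion_matrix
## and label_names defined above, and the repeated cells below are left as-is)
## could stand in for each of those blocks:
def run_svc_experiment(X, y,
                       gammas = (1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.)):
    ## split, pick gamma by 10-fold CV on the training set, report the test score
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
    gs = GridSearchCV(SVC(), {'gamma': list(gammas)}, cv = 10, n_jobs = -1)
    gs.fit(X_train, y_train)
    print gs.best_params_, gs.best_score_
    best_svc = SVC(**gs.best_params_)
    best_svc.fit(X_train, y_train)
    print best_svc.score(X_test, y_test)
    draw_confusion_matrix(confusion_matrix(y_test, best_svc.predict(X_test)), label_names)
    return best_svc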
## SVC on averaged haralick features
X_haraavg_train, X_haraavg_test, y_train, y_test = train_test_split(X_hara_averaged, y, random_state = 0)
print X_haraavg_train.shape, X_haraavg_test.shape
print y_train.shape, y_test.shape

svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 10, n_jobs=-1)
gs.fit(X_haraavg_train, y_train)
print gs.best_params_
print gs.best_score_

svc = SVC(**gs.best_params_)
svc.fit(X_haraavg_train, y_train)
print svc.score(X_haraavg_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, svc.predict(X_haraavg_test)), label_names)

## SVC on averaged SURF descriptors
def extract_surf_avg(img):
    from mahotas.features import surf
    import numpy as np
    feats = surf.surf(img, descriptor_only=True)
    return feats.mean(axis = 0)

X_surf_avg = lb_view.map(extract_surf_avg, images, block=True)
X_surf_avg = np.asarray(X_surf_avg)
print X_surf_avg.shape

X_surf_avg = StandardScaler().fit_transform(X_surf_avg)

X_surfavg_train, X_surfavg_test, y_train, y_test = train_test_split(X_surf_avg, y, random_state = 0)
print X_surfavg_train.shape, X_surfavg_test.shape
print y_train.shape, y_test.shape

svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 10, n_jobs=-1)
gs.fit(X_surfavg_train, y_train)
print gs.best_params_
print gs.best_score_

svc = SVC(**gs.best_params_)
svc.fit(X_surfavg_train, y_train)
print svc.score(X_surfavg_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, svc.predict(X_surfavg_test)), label_names)

## raw SURF descriptors for the bag-of-visual-words models
def extract_surf(img):
    from mahotas.features import surf
    import numpy as np
    feats = surf.surf(img, descriptor_only=True)
    return feats

surf_feats = lb_view.map(extract_surf, images, block=True)
all_feats = np.concatenate(surf_feats)
print all_feats.shape

print np.std(all_feats.max(axis = 1))
print np.std(all_feats.min(axis = 1))
print np.std(all_feats.mean(axis = 1))

## use a big set of visual words - SVC may have difficulty handling it directly, though (or not?)
kmeans = MiniBatchKMeans(n_clusters = 1000, batch_size=10000, random_state=0)
kmeans.fit(all_feats)

## get visual words: push the fitted model to the engines, then histogram each
## image's descriptors over the 1000 cluster centers
dv['kmeans'] = kmeans

def get_bow(feat):
    import numpy as np
    feat_clusters = kmeans.predict(feat)
    bow = [np.sum(feat_clusters == c) for c in xrange(kmeans.n_clusters)]
    return bow

X_surf_bow = np.asarray(lb_view.map(get_bow, surf_feats, block = True))
print X_surf_bow.shape
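## get_bow builds, for each image, a 1000-bin histogram of visual-word
## assignments.  np.bincount computes the same histogram in one vectorized
## call; a hypothetical drop-in equivalent (assuming the same engine-side
## kmeans global) would be:
def get_bow_bincount(feat):
    import numpy as np
    ## counts of each cluster label, padded out to all n_clusters bins
    return np.bincount(kmeans.predict(feat), minlength = kmeans.n_clusters)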
## SVC on the bag-of-visual-words counts
X_surfbow_train, X_surfbow_test, y_train, y_test = train_test_split(X_surf_bow, y, random_state = 0)
print X_surfbow_train.shape, X_surfbow_test.shape
print y_train.shape, y_test.shape

svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 10, n_jobs=-1)
gs.fit(X_surfbow_train, y_train)
print gs.best_params_
print gs.best_score_

svc = SVC(**gs.best_params_)
svc.fit(X_surfbow_train, y_train)
print svc.score(X_surfbow_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, svc.predict(X_surfbow_test)), label_names)

## PCA on the BoW counts: inspect the explained-variance profile first
pca = RandomizedPCA(n_components=100)
pca.fit(X_surf_bow)
explained_vars = pd.DataFrame(pca.explained_variance_ratio_,
                              index = ['pca%i' % i for i in xrange(pca.components_.shape[0])])
explained_vars.plot(kind = 'bar')

X_surf_pca = RandomizedPCA(n_components=50).fit_transform(X_surf_bow)

## SVC on the PCA-reduced BoW
X_surfpca_train, X_surfpca_test, y_train, y_test = train_test_split(X_surf_pca, y, random_state = 0)
print X_surfpca_train.shape, X_surfpca_test.shape
print y_train.shape, y_test.shape

svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 10, n_jobs=-1)
gs.fit(X_surfpca_train, y_train)
print gs.best_params_
print gs.best_score_

svc = SVC(**gs.best_params_)
svc.fit(X_surfpca_train, y_train)
print svc.score(X_surfpca_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, svc.predict(X_surfpca_test)), label_names)

## tf-idf weighting of the visual words, treating them like text tokens
tfidf = TfidfTransformer()
X_surf_tfidf = tfidf.fit_transform(X_surf_bow)
print X_surf_tfidf.shape

## SVC on the tf-idf features
X_surftfidf_train, X_surftfidf_test, y_train, y_test = train_test_split(X_surf_tfidf, y, random_state = 0)
print X_surftfidf_train.shape, X_surftfidf_test.shape
print y_train.shape, y_test.shape

svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 10, n_jobs=-1)
gs.fit(X_surftfidf_train, y_train)
print gs.best_params_
print gs.best_score_

svc = SVC(**gs.best_params_)
svc.fit(X_surftfidf_train, y_train)
print svc.score(X_surftfidf_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, svc.predict(X_surftfidf_test)), label_names)

## elastic-net SGD on the tf-idf features
print X_surftfidf_train.shape, X_surftfidf_test.shape
print y_train.shape, y_test.shape

sgd = SGDClassifier(penalty='elasticnet')
alphas = [1e-5, 3e-5, 1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(sgd, {'alpha': alphas}, cv = 10, n_jobs=-1)
gs.fit(X_surftfidf_train, y_train)
print gs.best_params_
print gs.best_score_

## best_params_ only contains alpha, so restate the elasticnet penalty here
sgd = SGDClassifier(penalty='elasticnet', **gs.best_params_)
sgd.fit(X_surftfidf_train, y_train)
print sgd.score(X_surftfidf_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, sgd.predict(X_surftfidf_test)), label_names)

## combine haralick and SURF BoW features
## note: X_hara_full has already been standardized above, X_surf_bow has not
X_harasurf_full = np.c_[X_hara_full, X_surf_bow]
print X_harasurf_full.shape

X_train, X_test, y_train, y_test = train_test_split(X_harasurf_full, y, random_state = 0)
print X_train.shape, X_test.shape
print y_train.shape, y_test.shape

svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 10, n_jobs=-1)
gs.fit(X_train, y_train)
print gs.best_params_
print gs.best_score_

svc = SVC(**gs.best_params_)
svc.fit(X_train, y_train)
print svc.score(X_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, svc.predict(X_test)), label_names)

## elastic-net SGD on the standardized combined features
ss = StandardScaler()
X_harasurf_norm = ss.fit_transform(X_harasurf_full)
X_train, X_test, y_train, y_test = train_test_split(X_harasurf_norm, y, random_state = 0)

sgd = SGDClassifier(penalty='elasticnet')
alphas = [1e-5, 3e-5, 1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(sgd, {'alpha': alphas}, cv = 10, n_jobs=-1)
gs.fit(X_train, y_train)
print gs.best_params_
print gs.best_score_

sgd = SGDClassifier(penalty='elasticnet', **gs.best_params_)
sgd.fit(X_train, y_train)
print sgd.score(X_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, sgd.predict(X_test)), label_names)

print len(surf_feats), all_feats.shape

## pick k_seeds random SURF descriptors as a dictionary and unit-normalize them
k_seeds = 1000
seeds = all_feats[shuffle(np.arange(all_feats.shape[0]))][:k_seeds]
seeds = normalize(seeds)
print seeds.shape
dv['seeds'] = seeds
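## soft_thresh below is a soft-assignment encoding over this random dictionary,
## in the spirit of the soft-threshold coding studied by Coates & Ng (ICML
## 2011): each descriptor is unit-normalized, its cosine similarity to every
## seed is computed, similarities below the mean are zeroed out, and the
## resulting sparse codes are average-pooled over the image's descriptors into
## a single 1000-dimensional vector.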
def soft_thresh(surf_feat):
    from sklearn.preprocessing import normalize
    import numpy as np
    X = normalize(surf_feat)
    ## cosine similarities between the image's descriptors and the seeds
    similarities = np.dot(X, seeds.T)
    ## zero out everything below the mean similarity, then average-pool
    thr = np.mean(similarities)
    similarities[similarities < thr] = 0.0
    return similarities.mean(axis = 0)

soft_surf_X = lb_view.map(soft_thresh, surf_feats, block=True)
soft_surf_X = np.asarray(soft_surf_X)
print soft_surf_X.shape

## SVC on the soft-assignment codes
X_train, X_test, y_train, y_test = train_test_split(soft_surf_X, y, random_state = 0)
print X_train.shape, X_test.shape
print y_train.shape, y_test.shape

svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 10, n_jobs=-1)
gs.fit(X_train, y_train)
print gs.best_params_
print gs.best_score_

svc = SVC(**gs.best_params_)
svc.fit(X_train, y_train)
print svc.score(X_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, svc.predict(X_test)), label_names)

## NMF on the BoW counts
from sklearn.decomposition import NMF

nmf = NMF(n_components=200)
print X_surf_bow.shape
nmf.fit(X_surf_bow)
X_surf_nmf = nmf.transform(X_surf_bow)
print X_surf_nmf.shape

## SVC on the NMF factors
X_train, X_test, y_train, y_test = train_test_split(X_surf_nmf, y, random_state = 0)
print X_train.shape, X_test.shape
print y_train.shape, y_test.shape

svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, cv = 10, n_jobs=-1)
gs.fit(X_train, y_train)
print gs.best_params_
print gs.best_score_

svc = SVC(**gs.best_params_)
svc.fit(X_train, y_train)
print svc.score(X_test, y_test)
draw_confusion_matrix(confusion_matrix(y_test, svc.predict(X_test)), label_names)

## show SURF interest points on a few sample images
from mahotas.features import surf

for i in xrange(5):
    fig, axes = pl.subplots(nrows = 1, ncols = 2)
    fig.subplots_adjust(wspace = 0, hspace = 0)
    axes[0].imshow(images[i], cmap = pl.cm.gray)
    axes[1].imshow(surf.show_surf(images[i], surf.surf(images[i])))
    fig.suptitle(labels[i])