#!/usr/bin/env python
# coding: utf-8

# In[1]:

import numpy as np
import scipy.io
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import manifold
import matplotlib
import matplotlib.pyplot as plt
import numpy.linalg as lin
get_ipython().run_line_magic('matplotlib', 'inline')


# In[2]:

fileidxJW11 = scipy.io.loadmat("/Users/corbinrosset/Dropbox/XRMB/DATA/FILEIDX/fileidxJW11.mat")
fileidxJW13 = scipy.io.loadmat("/Users/corbinrosset/Dropbox/XRMB/DATA/FILEIDX/fileidxJW13.mat")
fileidxJW24 = scipy.io.loadmat("/Users/corbinrosset/Dropbox/XRMB/DATA/FILEIDX/fileidxJW24.mat")
fileidxJW30 = scipy.io.loadmat("/Users/corbinrosset/Dropbox/XRMB/DATA/FILEIDX/fileidxJW30.mat")

JW11 = scipy.io.loadmat("/Users/corbinrosset/Dropbox/XRMB/DATA/MAT/JW11[numfr1=7,numfr2=7].mat")
JW13 = scipy.io.loadmat("/Users/corbinrosset/Dropbox/XRMB/DATA/MAT/JW13[numfr1=7,numfr2=7].mat")
JW24 = scipy.io.loadmat("/Users/corbinrosset/Dropbox/XRMB/DATA/MAT/JW24[numfr1=7,numfr2=7].mat")
JW30 = scipy.io.loadmat("/Users/corbinrosset/Dropbox/XRMB/DATA/MAT/JW30[numfr1=7,numfr2=7].mat")


# In[34]:

# generate manageable 2d data
def generateData(numGaussians=3, numTraining=5000, numTesting=1000):
    # numGaussians is also the number of classes
    N = numTraining    # num training examples per class
    T = numTesting     # num testing examples per class
    gaussians = []
    test = []
    for i in range(numGaussians):
        mean = 10 * np.random.randn(2)
        A = np.random.randn(2, 2)
        cov = np.dot(A, A.T) + np.eye(2)    # a valid (symmetric positive-definite) covariance
        x_train = np.random.multivariate_normal(mean, cov, N).T
        x_test = np.random.multivariate_normal(mean, cov, T).T
        plt.plot(x_train[0], x_train[1], 'x')
        gaussians.append(x_train)    # train data
        test.append(x_test)          # test data

    # format and plot the training data X, and test data X_test
    plt.axis('equal')
    plt.title('Training Data')
    plt.show()

    X_separated = np.array(gaussians)           # shape (numGaussians, 2, N)
    X = np.concatenate(X_separated, axis=1)     # shape (2, numGaussians*N)
    X_test = np.concatenate(test, axis=1)
    X = X.T
    X_test = X_test.T

    # create data labels for training set X and test set X_test
    y, y_test = [], []
    for n in range(numGaussians):
        y = y + [n for i in range(N)]
        y_test = y_test + [n for i in range(T)]
    y = np.array(y)
    y_test = np.array(y_test)
    return X, X_test, y, y_test


# In[103]:

### do LDA computations by hand ###

numGaussians, N, T = 5, 5000, 1000
X, X_test, y, y_test = generateData(numGaussians, N, T)

# recover the per-class training data, shape (numGaussians, 2, N)
X_separated = np.array([X[y == c].T for c in range(numGaussians)])

# compute means of each class (empirical)
means = [np.mean(x, 1) for x in X_separated]    # empirical mean of each class
mean = np.array(np.mean(means, 0))              # overall mean

# compute within-class scatter S_W: the sum of the covariance matrices of each class
covariances = np.array([np.cov(x) for x in X_separated])
S_W = covariances.sum(axis=0)

# compute between-class scatter S_B: sum of outer products of the deviations of the
# class means from the overall mean, weighted by the class size N; shape (2, 2)
S_B = N * sum([np.outer((x - mean), (x - mean)) for x in means])

# To solve the generalized eigenvalue problem S_B*x = lambda*S_W*x, compute the
# eigendecomposition of inv(S_W)*S_B. The eigenvector associated with the greatest
# eigenvalue is the direction of maximum discriminancy.
invS_W = lin.inv(S_W)
Z = np.dot(invS_W, S_B)

# solve the eigenvalue problem, then sort the eigenvectors (columns of vecs) by decreasing eigenvalue
vals, vecs = lin.eig(Z)
order = np.argsort(vals)[::-1]
vals = vals[order]
V = vecs[:, order].T    # rows of V are the sorted eigenvectors; shape (2, 2)

# project the data onto the primary and secondary axes
proj_classes_first = np.array([np.dot(V[0], X_separated[i]) for i in range(numGaussians)])     # top axis
proj_classes_second = np.array([np.dot(V[1], X_separated[i]) for i in range(numGaussians)])    # next axis
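# Quick sanity check (a minimal sketch): scikit-learn's LinearDiscriminantAnalysis with
# solver='eigen' solves the same generalized eigenvalue problem, so the leading column of its
# scalings_ should agree with V[0] up to sign and scale. Assumes X, y, and V from the cell
# above are in scope.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda_check = LinearDiscriminantAnalysis(solver='eigen').fit(X, y)
w_sklearn = lda_check.scalings_[:, 0]    # leading discriminant direction found by sklearn
w_hand = V[0]                            # leading direction computed by hand above
cosine = np.abs(np.dot(w_sklearn, w_hand)) / (lin.norm(w_sklearn) * lin.norm(w_hand))
print('|cos| between hand-computed and sklearn LDA directions: ' + str(cosine))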


# In[104]:

### visually inspect the discriminating capability of the axes found by LDA - can be skipped ###

# plot (unnormalized) Gaussian fits of each class's distribution projected onto the first axis found by LDA
def gaussian(x, mu, sig):
    return np.exp(-np.power(x - mu, 2.) / (2 * np.power(sig, 2.)))

y_1min = min([min(x) for x in proj_classes_first])
y_1max = max([max(x) for x in proj_classes_first])
xs = np.linspace(y_1min, y_1max, 200)
for projection in proj_classes_first:
    mu = np.mean(projection)
    sigma = np.std(projection)
    plt.plot(xs, gaussian(xs, mu, sigma))
plt.show()

# plot projections onto the second axis
y_2min = min([min(x) for x in proj_classes_second])
y_2max = max([max(x) for x in proj_classes_second])
xs = np.linspace(y_2min, y_2max, 200)
for projection in proj_classes_second:
    mu = np.mean(projection)
    sigma = np.std(projection)
    plt.plot(xs, gaussian(xs, mu, sigma))
plt.show()


# In[109]:

### Run classifiers on test data and see results. This cell can be run independently of any others ###
# Of course we could have just used the built-in LDA from scikit-learn (but that's no fun).
# from sklearn import datasets
# LDA seems to perform best when the clusters overlap and their means are mutually distant. If the
# means are too close and the distributions overlap, then KNN seems to perform better.

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree

# generate some data, or if you'd like to visually inspect it, run the above cells first
numGaussians = 5
X, X_test, y, y_test = generateData(numGaussians, 5000, 1000)
# print(X.shape)
# print(X_test.shape)
# print(y.shape)
# print(y_test.shape)
target_names = [i for i in range(numGaussians)]

pca = PCA(n_components=2, whiten=True)
X_pca = pca.fit(X).transform(X)
X_pca_test = pca.transform(X_test)
print('PCA: explained variance ratio: ' + str(pca.explained_variance_ratio_))
# print(' and avg likelihood: ' + str(pca.score(X)))

# LDA can produce at most min(n_features, n_classes - 1) components
lda = LinearDiscriminantAnalysis(n_components=min(numGaussians - 1, X.shape[1]))
X_lda = lda.fit(X, y).transform(X)
X_lda_test = lda.transform(X_test)
lda.predict(X_test)
print('accuracy of built-in LDA on test data: ')
print(str(lda.score(X_test, y_test)))

# classifiers
neigh = KNeighborsClassifier(n_neighbors=2)
print('running k-nearest neighbors on raw test data')
neigh.fit(X, y)
Z = neigh.predict(X_test)
plt.scatter(X_test[:, 0], X_test[:, 1], c=Z)
plt.title('KNN on raw')
plt.show()
print(neigh.score(X_test, y_test))

neigh = KNeighborsClassifier(n_neighbors=2)
print('running k-nearest neighbors on LDA-transformed data')
neigh.fit(X_lda, y)
Z = neigh.predict(X_lda_test)
plt.scatter(X_lda_test[:, 0], X_lda_test[:, 1], c=Z)
plt.title('KNN from LDA')
plt.show()
print(neigh.score(X_lda_test, y_test))

neigh = KNeighborsClassifier(n_neighbors=2)
print('running k-nearest neighbors on PCA-transformed data')
neigh.fit(X_pca, y)
Z = neigh.predict(X_pca_test)
plt.scatter(X_pca_test[:, 0], X_pca_test[:, 1], c=Z)
plt.title('KNN from PCA')
plt.show()
print(neigh.score(X_pca_test, y_test))

print('running decision tree on LDA-transformed data')
tr33 = tree.DecisionTreeClassifier()
tr33 = tr33.fit(X_lda, y)
Z = tr33.predict(X_lda_test)
plt.scatter(X_lda_test[:, 0], X_lda_test[:, 1], c=Z)
plt.title('DT on LDA')
plt.show()
print(tr33.score(X_lda_test, y_test))

print('running decision tree on PCA-transformed data')
tr33 = tree.DecisionTreeClassifier()
tr33 = tr33.fit(X_pca, y)
Z = tr33.predict(X_pca_test)
plt.scatter(X_pca_test[:, 0], X_pca_test[:, 1], c=Z)
plt.title('DT on PCA')
plt.show()
print(tr33.score(X_pca_test, y_test))
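# A minimal sketch to probe the observation at the top of this cell: repeat the built-in LDA
# vs. raw-KNN comparison over a few freshly generated (smaller) datasets and see how consistent
# the ranking is. Assumes generateData and the classifiers imported above are in scope.
lda_scores, knn_scores = [], []
for trial in range(5):
    Xa, Xa_test, ya, ya_test = generateData(5, 1000, 500)
    lda_scores.append(LinearDiscriminantAnalysis().fit(Xa, ya).score(Xa_test, ya_test))
    knn_scores.append(KNeighborsClassifier(n_neighbors=2).fit(Xa, ya).score(Xa_test, ya_test))
print('built-in LDA accuracy over trials: ' + str(lda_scores))
print('raw KNN accuracy over trials: ' + str(knn_scores))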


# In[107]:

#### XRMB DATA - load and preprocess ####
# XRMB contains two views of speech, acoustic (MFCC) and articulatory, each predictive of the labels, which are phonemes

scaler = preprocessing.StandardScaler()
mfcc_features = scaler.fit_transform(preprocessing.normalize(np.transpose(JW11['MFCC'])))
articulatory_features = scaler.fit_transform(preprocessing.normalize(np.transpose(JW11['X']).astype(float)))
phone_labels = np.transpose(JW11['P'][0])

n_samples = mfcc_features.shape[0]
n_mfcc_features = mfcc_features.shape[1]
n_articulatory_features = articulatory_features.shape[1]

# shuffle both views and the labels with the same permutation
permutation = np.random.permutation(n_samples)
X1 = np.asarray([mfcc_features[i] for i in permutation])
X2 = np.asarray([articulatory_features[i] for i in permutation])
# Y = np.asarray([binarized_labels[i] for i in permutation])
Phones = np.asarray([phone_labels[i] for i in permutation])

train, dev, test = 25948, 40948, 50948

# two views of the data, X1 and X2
X1_tr = X1[0:train, :]
X1_dev = X1[train:dev, :]
X1_test = X1[dev:test, :]

X2_tr = X2[0:train, :]
X2_dev = X2[train:dev, :]
X2_test = X2[dev:test, :]

# Y_tr = Y[0:train, :]
# Y_dev = Y[train:dev, :]
# Y_test = Y[dev:test, :]

# labels
Phones_tr = Phones[0:train]
Phones_dev = Phones[train:dev]
Phones_test = Phones[dev:test]

# "core" features of the time-series data
baseline_acoustic_tr = X1_tr[:, 118:157]
baseline_acoustic_dev = X1_dev[:, 118:157]
baseline_acoustic_test = X1_test[:, 118:157]


# In[14]:

# apply dimensionality reduction to the XRMB data
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA

n_comps = 40

# LDA on View 1
lda = LinearDiscriminantAnalysis(n_components=n_comps)
X_lda1 = lda.fit(X1_tr, Phones_tr).transform(X1_tr)
X1_lda_test = lda.transform(X1_test)
lda.predict(X1_test)
print('score of LDA on view 1: ' + str(lda.score(X1_test, Phones_test)))

# LDA on View 2
lda2 = LinearDiscriminantAnalysis(n_components=n_comps)
X_lda2 = lda2.fit(X2_tr, Phones_tr).transform(X2_tr)
lda2.predict(X2_test)
print('score of LDA on view 2: ' + str(lda2.score(X2_test, Phones_test)))

# PCA on View 1
pca = PCA(n_components=n_comps, whiten=True)
X_pca1 = pca.fit(X1_tr).transform(X1_tr)
X1_pca_test = pca.transform(X1_test)
print('pca view 1 explained variance ratio: ' + str(pca.explained_variance_ratio_))
print(' and avg likelihood: ' + str(pca.score(X1_tr)))

# PCA on View 2
pca = PCA(n_components=n_comps, whiten=True)
X_pca2 = pca.fit(X2_tr).transform(X2_tr)
print('pca view 2 explained variance ratio: ' + str(pca.explained_variance_ratio_))
print(' and avg likelihood: ' + str(pca.score(X2_tr)))

# apply both PCA and LDA dimensionality reduction techniques to the data set and then attempt
# classification by KNN, for instance, to see which reduction technique is most effective

# baseline KNN on the raw view 1 features
neigh = KNeighborsClassifier(n_neighbors=2)
print('running k-nearest neighbors on raw data')
neigh.fit(X1_tr, Phones_tr)
neigh.predict(X1_test)
print(neigh.score(X1_test, Phones_test))

# KNN on LDA-transformed data; view 1
neigh2 = KNeighborsClassifier(n_neighbors=2)
print('running k-nearest neighbors on LDA View 1')
neigh2.fit(X_lda1, Phones_tr)
neigh2.predict(X1_lda_test)
print(neigh2.score(X1_lda_test, Phones_test))

# KNN on PCA-transformed data; view 1
neigh2 = KNeighborsClassifier(n_neighbors=2)
print('running k-nearest neighbors on PCA View 1')
neigh2.fit(X_pca1, Phones_tr)
neigh2.predict(X1_pca_test)
print(neigh2.score(X1_pca_test, Phones_test))
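# A minimal sketch: the dev split (X1_dev, Phones_dev) is defined during preprocessing but
# otherwise unused; one natural use is choosing n_neighbors for KNN on the LDA-reduced view 1
# before touching the test set. Assumes `lda` and `X_lda1` from the cell above are in scope.
X1_lda_dev = lda.transform(X1_dev)
for k in [1, 2, 5, 10]:
    knn_k = KNeighborsClassifier(n_neighbors=k).fit(X_lda1, Phones_tr)
    print('k=' + str(k) + ', dev accuracy of KNN on LDA view 1: ' + str(knn_k.score(X1_lda_dev, Phones_dev)))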


# In[ ]:
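# A minimal sketch: the "core" acoustic features sliced out during preprocessing
# (baseline_acoustic_tr / baseline_acoustic_test, columns 118:157 of view 1) are otherwise unused;
# a KNN baseline on just those columns gives a reference point for the full stacked-frame view 1.
neigh_core = KNeighborsClassifier(n_neighbors=2)
neigh_core.fit(baseline_acoustic_tr, Phones_tr)
print('KNN accuracy on core acoustic features: ' + str(neigh_core.score(baseline_acoustic_test, Phones_test)))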