%load_ext watermark %watermark -a 'Sebastian Raschka' -v -d from sklearn.datasets import load_iris from sklearn.preprocessing import StandardScaler import numpy as np import matplotlib.pyplot as plt data = load_iris() X = data.data # convert features in column 1 from cm to inches X[:,0] /= 2.54 # convert features in column 2 from cm to meters X[:,1] /= 100 %matplotlib inline def pca_raw_cov(X): # Compute the covariance matrix cov_mat = np.cov(X.T) # Eigendecomposition of the covariance matrix eig_val_cov, eig_vec_cov = np.linalg.eig(cov_mat) # Make a list of (eigenvalue, eigenvector) tuples # and sort the (eigenvalue, eigenvector) tuples from high to low eig_pairs_cov = [(np.abs(eig_val_cov[i]), eig_vec_cov[:,i]) for i in range(len(eig_val_cov))] eig_pairs_cov.sort() eig_pairs_cov.reverse() # Construct the transformation matrix W from the eigenvalues that correspond to # the k largest eigenvalues (here: k = 2) matrix_w_cov = np.hstack((eig_pairs_cov[0][1].reshape(4,1), eig_pairs_cov[1][1].reshape(4,1))) # Transform the data using matrix W X_raw_transf = matrix_w_cov.T.dot(X.T).T # Plot the data plt.scatter(X_raw_transf[:,0], X_raw_transf[:,1]) plt.title('PCA based on the covariance matrix of the raw data') plt.show() def pca_standardize_cov(X): # Standardize data X_std = StandardScaler().fit_transform(X) # Compute the covariance matrix cov_mat = np.cov(X_std.T) # Eigendecomposition of the covariance matrix eig_val_cov, eig_vec_cov = np.linalg.eig(cov_mat) # Make a list of (eigenvalue, eigenvector) tuples # and sort the (eigenvalue, eigenvector) tuples from high to low eig_pairs_cov = [(np.abs(eig_val_cov[i]), eig_vec_cov[:,i]) for i in range(len(eig_val_cov))] eig_pairs_cov.sort() eig_pairs_cov.reverse() # Construct the transformation matrix W from the eigenvalues that correspond to # the k largest eigenvalues (here: k = 2) matrix_w_cov = np.hstack((eig_pairs_cov[0][1].reshape(4,1), eig_pairs_cov[1][1].reshape(4,1))) # Transform the data using matrix W X_std_transf = matrix_w_cov.T.dot(X_std.T).T # Plot the data plt.scatter(X_std_transf[:,0], X_std_transf[:,1]) plt.title('PCA based on the covariance matrix after standardizing the data') plt.show() def pca_cor(X): # Compute the correlation matrix cor_mat = np.corrcoef(X.T) # Eigendecomposition of the correlation matrix eig_val_cor, eig_vec_cor = np.linalg.eig(cor_mat) # Make a list of (eigenvalue, eigenvector) tuples # and sort the (eigenvalue, eigenvector) tuples from high to low eig_pairs_cor = [(np.abs(eig_val_cor[i]), eig_vec_cor[:,i]) for i in range(len(eig_val_cor))] eig_pairs_cor.sort() eig_pairs_cor.reverse() # Construct the transformation matrix W from the eigenvalues that correspond to # the k largest eigenvalues (here: k = 2) matrix_w_cor = np.hstack((eig_pairs_cor[0][1].reshape(4,1), eig_pairs_cor[1][1].reshape(4,1))) # Transform the data using matrix W X_transf = matrix_w_cor.T.dot(X.T).T # Plot the data plt.scatter(X_transf[:,0], X_transf[:,1]) plt.title('PCA based on the correlation matrix of the raw data') plt.show() pca_raw_cov(X) pca_standardize_cov(X) pca_cor(X)