#!/usr/bin/env python
# coding: utf-8

# In[2]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import gridspec
from sklearn.decomposition import PCA

m = ['o', 's', '^', 'x']
countries = ['Argentina', 'Australia', 'Chile', 'South Africa']


# # PCA in action
#
# Previously, we saw a simple conceptual introduction to PCA: it is a method for finding the vectors that describe the directions of largest spread in the features.
# However, it is not yet clear how it can be used for dimensionality reduction.
# For this, we will use a [real dataset](./example) containing information about 44 different wines (the data file is available [here](https://github.com/arm61/trad_ml_methods/raw/main/wine_data.csv)).
# The features we are investigating are the columns in the data below, while the rows are the different wines.

# In[3]:


data = pd.read_csv('wine_data.csv')


# In[4]:


data.head()


# We are only printing the first 5 rows of the table, but rest assured there are 44 wines in the dataset.

# In[5]:


print(data.shape)


# In traditional analysis, we would investigate how individual parameters vary.
# Below, we show how the ethanol, glycerol, and methanol content varies as a function of the country that the wine is sourced from.
# However, there are no real trends present in the data.

# In[6]:


fig, ax = plt.subplots(1, 3, figsize=(12, 5))
for i in range(4):
    ax[0].plot(data.where(data['Country'] == countries[i])['Ethanol'], m[i])
ax[0].set_xlabel('Sample number')
ax[0].set_ylabel('Ethanol/%')
for i in range(4):
    ax[1].plot(data.where(data['Country'] == countries[i])['Glycerol'], m[i])
ax[1].set_xlabel('Sample number')
ax[1].set_ylabel('Glycerol/%')
for i in range(4):
    ax[2].plot(data.where(data['Country'] == countries[i])['Methanol'], m[i])
ax[2].set_xlabel('Sample number')
ax[2].set_ylabel('Methanol/%')
plt.legend(labels=countries)
plt.tight_layout()
plt.show()


# However, this univariate analysis discards all covariation between the features.
# For example, if we plot the ethanol content against the glycerol content, we can see a potentially interesting covariance.

# In[7]:


for i in range(4):
    plt.plot(data.where(data['Country'] == countries[i])['Ethanol'],
             data.where(data['Country'] == countries[i])['Glycerol'], m[i])
plt.xlabel('Ethanol/%')
plt.ylabel('Glycerol/%')
plt.legend(labels=countries)
plt.show()


# If these two features were completely correlated, it would be possible to describe the dataset completely with a single combined feature.
# Let's investigate performing a single-component PCA on these two covariant features.
# First, it is necessary to normalise the features so that those with large values do not overwhelm the analysis.
# This is achieved with the `sklearn.preprocessing.scale` function; we have also defined an `inverse_scale` helper function that will unnormalise the values.

# In[8]:


from sklearn.preprocessing import scale


# In[9]:


def inverse_scale(s, x):
    # Undo the standardisation applied by sklearn.preprocessing.scale,
    # restoring the original mean and standard deviation of each column.
    sigma = np.array(x.std(ddof=0, axis=0))
    mu = np.array(x.mean(axis=0))
    return (s * sigma.T) + mu.T
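# As a quick sanity check of the helper, `scale` standardises each column to zero mean and unit variance, so `inverse_scale` should recover the original values to within floating-point precision.

# In[ ]:


check = data[['Ethanol', 'Glycerol']]
# Round-trip: standardise, then map back to the original units.
roundtrip = inverse_scale(scale(check), check)
print(np.allclose(roundtrip, check.to_numpy()))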
# We can then reduce the relationship between ethanol and glycerol content to a single dimension.
# We then inverse transform this to visualise the effect.

# In[10]:


pca = PCA(n_components=1)
pca.fit(scale(data[['Ethanol', 'Glycerol']]))
X_pca = pca.transform(scale(data[['Ethanol', 'Glycerol']]))


# In[11]:


X_new = inverse_scale(pca.inverse_transform(X_pca), data[['Ethanol', 'Glycerol']])
for i in range(4):
    plt.plot(data.where(data['Country'] == countries[i])['Ethanol'],
             data.where(data['Country'] == countries[i])['Glycerol'], m[i], alpha=0.5)
plt.plot(X_new[:, 0], X_new[:, 1], 'k.')
plt.xlabel('Ethanol/%')
plt.ylabel('Glycerol/%')
plt.legend(labels=countries)
plt.show()


# Above, we have used PCA with a single component to reduce the description of the covariation of ethanol and glycerol in our wine data.
# Although this reduced description does not completely capture the original data, we have encoded most of the information regarding the relationship between our two features.
#
# In total, there are 14 features in the data (we are not considering the country that the wine is produced in as a feature, as this is essentially our independent variable).
# So, how many principal components should we use to describe the measured data?

# In[12]:


pca = PCA().fit(scale(data.drop('Country', axis=1)))
plt.plot(np.cumsum(pca.explained_variance_ratio_), 'o')
plt.axhline(1, color='k')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.ylim(0, 1.1)
plt.show()


# The plot above quantifies how much of the total data variance is contained within the first *N* components.
# We can see that three components describe more than 50 % of the observed variance, while seven components are required to take this above 90 %.

# In[13]:


print(np.cumsum(pca.explained_variance_ratio_))


# We can use a plot such as that above to determine how many components we should pay attention to.
# For simplicity going forward (so that we can use standard 2-dimensional plotting), we will only consider the first two principal components.

# In[14]:


pca = PCA(n_components=2)
pca.fit(scale(data.drop('Country', axis=1)))


# We can visualise the components as a function of the sample and country of origin, or even the two components together.

# In[15]:


X_new = pca.fit_transform(scale(data.drop('Country', axis=1)))
data['PC1'] = X_new[:, 0]
data['PC2'] = X_new[:, 1]


# In[16]:


fig = plt.figure(figsize=(12, 10))
gs = gridspec.GridSpec(2, 2)
ax0 = plt.subplot(gs[0, 0])
for i in range(4):
    ax0.plot(data.where(data['Country'] == countries[i])['PC1'], m[i])
ax0.set_xlabel('Sample number')
ax0.set_ylabel('Component 1')
ax1 = plt.subplot(gs[1, 0])
for i in range(4):
    ax1.plot(data.where(data['Country'] == countries[i])['PC2'], m[i])
ax1.set_xlabel('Sample number')
ax1.set_ylabel('Component 2')
ax2 = plt.subplot(gs[:, 1])
for i in range(4):
    ax2.plot(data.where(data['Country'] == countries[i])['PC1'],
             data.where(data['Country'] == countries[i])['PC2'], m[i])
ax2.set_xlabel('Component 1')
ax2.set_ylabel('Component 2')
plt.legend(labels=countries)
plt.show()


# From plotting the component scores, we can see that, for example, the Chilean wines typically have negative scores for component 2.
# This means that these wines have less of the features that component 2 represents.
# Additionally, we can start to see clustering of different countries' wines in the cross-component plot.
# Clustering methods could potentially be applied here to automatically classify the country of origin of new wines added to our data, as sketched below.
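# As a minimal illustrative sketch of that idea, we can run k-means on the two component scores.
# Note that k-means is unsupervised, so the four clusters it finds will not necessarily align with the four countries; the cross-tabulation below simply shows how well they happen to coincide.

# In[ ]:


from sklearn.cluster import KMeans

# Cluster the wines in the two-dimensional principal component space.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
cluster_labels = kmeans.fit_predict(data[['PC1', 'PC2']])
# Compare the unsupervised cluster labels with the countries of origin.
print(pd.crosstab(data['Country'], cluster_labels))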
# Above, we mentioned that the Chilean wines appear to have less of the features that contribute to component 2.
# To understand this, it is necessary to investigate which features contribute to a given component.
# This can be achieved for our system by plotting the loadings of each feature on the component.

# In[17]:


# Index the loading matrix with exactly the feature columns the PCA was fitted on.
feature_columns = data.drop(columns=['Country', 'PC1', 'PC2']).columns
loading_matrix = pd.DataFrame(pca.components_.T * np.sqrt(pca.explained_variance_),
                              columns=['PC1', 'PC2'], index=feature_columns)


# In[18]:


ind = np.arange(len(loading_matrix.index))
width = 0.35
fig, ax = plt.subplots()
ax.bar(ind, loading_matrix['PC1'], width,
       label='Component 1 ({:.1f} %)'.format(pca.explained_variance_ratio_[0] * 100))
ax.bar(ind + width, loading_matrix['PC2'], width,
       label='Component 2 ({:.1f} %)'.format(pca.explained_variance_ratio_[1] * 100))
ax.set_xlabel('Features')
ax.set_ylabel('Loading')
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(loading_matrix.index)
ax.tick_params(axis='x', rotation=90)
plt.legend()
plt.show()


# Since the Chilean wines had negative scores for component 2, this implies that they express the opposite of the pattern that component 2 represents.
# This can be interpreted as the Chilean wines varying such that ethanol, total acid, volatile A, and lactic acid are all low, while malic acid is high (a short programmatic version of this reading is sketched at the end of this section).
#
# We will now move on to explore the application of PCA in improving edge detection.
# However, for the interested reader, we recommend the mathematical discussion of all aspects of PCA from Bro and Smilde {cite}`bro_analytical_2014` as well as the detailed introduction to PCA in Python from VanderPlas {cite}`vanderplas_python_2016`.
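# Finally, the promised closing aside: a minimal sketch of reading the loading matrix programmatically, ranking the features by the magnitude of their loadings, with the sign giving the direction of the contribution.

# In[ ]:


for pc in ['PC1', 'PC2']:
    # Order the features by absolute loading, keeping the signed values for display.
    order = loading_matrix[pc].abs().sort_values(ascending=False).index
    print('Strongest contributions to {}:'.format(pc))
    print(loading_matrix[pc].reindex(order).head())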