#!/usr/bin/env python
# coding: utf-8
"""
PCA (EOF) decomposition of ERSST sea-surface-temperature anomalies over the
tropical Pacific (1980-2014) with scikit-learn.

Converted from a Jupyter notebook: the original cell magics
(%load_ext load_style, %load_style talk.css, %matplotlib inline) only affect
notebook display and are omitted in this plain-script form.

The input file (74 Mb) can be downloaded at
ftp://ftp.niwa.co.nz/incoming/fauchereaun/ersst.realtime.nc
"""

import pandas as pd
import numpy as np
from numpy import ma
from matplotlib import pyplot as plt
from mpl_toolkits.basemap import Basemap  # NOTE(review): imported by the original notebook but never used below

# The `xray` package was renamed `xarray`; prefer the new name and fall back
# to the old one for legacy environments.
try:
    import xarray as xray
except ImportError:
    import xray

# `sklearn.externals.joblib` was removed in scikit-learn 0.23; use the
# standalone `joblib` package, falling back for old scikit-learn installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
# `sklearn.decomposition.pca` was a private module (removed in sklearn 0.24);
# the public class is `sklearn.decomposition.PCA`.
from sklearn.decomposition import PCA
from scipy.signal import detrend

# ## load the SST data
dset = xray.open_dataset('../data/ersst.realtime.nc')
print(dset)

# ## select the period 1980 - 2014 and the tropical Pacific domain
dsub = dset.sel(time=slice('1980', '2014'), zlev=0,
                lat=slice(-40, 40), lon=slice(120, 290))

lat = dsub['lat'].values
lon = dsub['lon'].values
sst = dsub['anom'].values

# ## reshape to 2D (time, space); order='F' so the inverse reshape to
# (ipc, len(lat), len(lon)) below restores the original map layout
X = np.reshape(sst, (sst.shape[0], len(lat) * len(lon)), order='F')

# ## mask the land points (NaNs in the anomaly field)
X = ma.masked_array(X, np.isnan(X))
land = X.sum(0).mask
# Invert the land mask with `~` (logical not): unary `-` on boolean arrays
# raises TypeError in modern NumPy.
ocean = ~land

# ## keep only oceanic grid-points
X = X[:, ocean]

# ## standardize SST with a StandardScaler; once the scaler has been
# 'trained' on the data, persist it as a pickle for reuse in operations
scaler = preprocessing.StandardScaler()
scaler_sst = scaler.fit(X)
joblib.dump(scaler_sst, '../data/scaler_sst.pkl', compress=9)
scaler_sst = joblib.load('../data/scaler_sst.pkl')

# ## scale: use the `transform` method of the scaler object,
# then verify that mean ~ 0 and std ~ 1
X = scaler_sst.transform(X)
print(X.mean(), X.std(), X.shape)

# ## EOF decomposition
skpca = PCA()
skpca.fit(X)

# ## save the (fitted) PCA object for reuse in operations
joblib.dump(skpca, '../data/EOF.pkl', compress=9)

from matplotlib import style
style.use('fivethirtyeight')

# scree plot of the leading explained-variance ratios
f, ax = plt.subplots(figsize=(6, 6))
ax.plot(skpca.explained_variance_ratio_[0:10] * 100)
ax.plot(skpca.explained_variance_ratio_[0:10] * 100, 'ro')

# ## keep the number of PCs sufficient to explain 70 % of the variance.
# np.where(...)[0][0] is the *index* of the first PC at which the cumulative
# explained variance reaches 70 %, so index + 1 components are needed
# (the original kept one component too few — zero if PC1 alone exceeded 70 %).
ipc = np.where(skpca.explained_variance_ratio_.cumsum() >= 0.70)[0][0] + 1

# ## the Principal Components (PCs) come from the `transform` method
PCs = skpca.transform(X)
PCs = PCs[:, :ipc]

# ## the Empirical Orthogonal Functions (EOFs) are in `components_`
EOFs = skpca.components_
EOFs = EOFs[:ipc, :]

# ## reconstruct the 2D (lat, lon) maps: fill land with -999, then mask it
EOF_recons = np.ones((ipc, len(lat) * len(lon))) * -999.
for i in range(ipc):  # `xrange` is Python 2 only
    EOF_recons[i, ocean] = EOFs[i, :]
EOF_recons = ma.masked_values(
    np.reshape(EOF_recons, (ipc, len(lat), len(lon)), order='F'), -999.)

plt.figure()
plt.imshow(EOF_recons[0, :, :], origin='lower',
           interpolation='nearest', aspect='auto')
plt.colorbar()

# ## scale the Principal Components and persist that scaler too
scaler_PCs = StandardScaler()
scaler_PCs.fit(PCs)
PCs_std = scaler_PCs.transform(PCs)
joblib.dump(scaler_PCs, '../data/scaler_PCs.pkl')

PCdf = pd.DataFrame(PCs_std,
                    index=dsub['time'].values,
                    columns=["EOF%s" % (x) for x in range(1, PCs_std.shape[1] + 1)])
PCdf.to_csv('../data/EOF_ERSST_PCs.csv')

# ## plot PC1, raw and detrended
f, ax = plt.subplots(figsize=(12, 6))
# `.ix` was removed in pandas 1.0; `.iloc` is the positional equivalent
PCdf.iloc[:, 0].plot(ax=ax, color='k', label='PC1')
ax.plot(PCdf.index, detrend(PCdf.iloc[:, 0].values), 'r',
        label='PC1 (trend removed)')
ax.grid(False)  # the original passed 'off', a truthy string that turns the grid ON
ax.legend(loc=1)

# The original notebook ended with a shell call converting itself to HTML:
#   ipython nbconvert sklearn_EOF_decomposition.ipynb --to html
# (`jupyter nbconvert` on modern installs); omitted in script form.