#!/usr/bin/env python
# coding: utf-8
"""
PCA (EOF) decomposition of ERSST sea-surface-temperature anomalies over the
tropical Pacific (1980-2014) with scikit-learn.

Converted from a Jupyter notebook: the original cell magics
(%load_ext load_style, %load_style talk.css, %matplotlib inline) only affect
notebook display and are omitted in this plain-script form.

The input file (74 Mb) can be downloaded at
ftp://ftp.niwa.co.nz/incoming/fauchereaun/ersst.realtime.nc
"""

import pandas as pd
import numpy as np
from numpy import ma
from matplotlib import pyplot as plt
from mpl_toolkits.basemap import Basemap  # NOTE(review): imported by the original notebook but never used below

# The `xray` package was renamed `xarray`; prefer the new name and fall back
# to the old one for legacy environments.
try:
    import xarray as xray
except ImportError:
    import xray

# `sklearn.externals.joblib` was removed in scikit-learn 0.23; use the
# standalone `joblib` package, falling back for old scikit-learn installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
# `sklearn.decomposition.pca` was a private module (removed in sklearn 0.24);
# the public class is `sklearn.decomposition.PCA`.
from sklearn.decomposition import PCA
from scipy.signal import detrend

# ## load the SST data
dset = xray.open_dataset('../data/ersst.realtime.nc')
print(dset)

# ## select the period 1980 - 2014 and the tropical Pacific domain
dsub = dset.sel(time=slice('1980', '2014'), zlev=0,
                lat=slice(-40, 40), lon=slice(120, 290))

lat = dsub['lat'].values
lon = dsub['lon'].values
sst = dsub['anom'].values

# ## reshape to 2D (time, space); order='F' so the inverse reshape to
# (ipc, len(lat), len(lon)) below restores the original map layout
X = np.reshape(sst, (sst.shape[0], len(lat) * len(lon)), order='F')

# ## mask the land points (NaNs in the anomaly field)
X = ma.masked_array(X, np.isnan(X))
land = X.sum(0).mask
# Invert the land mask with `~` (logical not): unary `-` on boolean arrays
# raises TypeError in modern NumPy.
ocean = ~land

# ## keep only oceanic grid-points
X = X[:, ocean]

# ## standardize SST with a StandardScaler; once the scaler has been
# 'trained' on the data, persist it as a pickle for reuse in operations
scaler = preprocessing.StandardScaler()
scaler_sst = scaler.fit(X)
joblib.dump(scaler_sst, '../data/scaler_sst.pkl', compress=9)
scaler_sst = joblib.load('../data/scaler_sst.pkl')

# ## scale: use the `transform` method of the scaler object,
# then verify that mean ~ 0 and std ~ 1
X = scaler_sst.transform(X)
print(X.mean(), X.std(), X.shape)

# ## EOF decomposition
skpca = PCA()
skpca.fit(X)

# ## save the (fitted) PCA object for reuse in operations
joblib.dump(skpca, '../data/EOF.pkl', compress=9)

from matplotlib import style
style.use('fivethirtyeight')

# scree plot of the leading explained-variance ratios
f, ax = plt.subplots(figsize=(6, 6))
ax.plot(skpca.explained_variance_ratio_[0:10] * 100)
ax.plot(skpca.explained_variance_ratio_[0:10] * 100, 'ro')

# ## keep the number of PCs sufficient to explain 70 % of the variance.
# np.where(...)[0][0] is the *index* of the first PC at which the cumulative
# explained variance reaches 70 %, so index + 1 components are needed
# (the original kept one component too few — zero if PC1 alone exceeded 70 %).
ipc = np.where(skpca.explained_variance_ratio_.cumsum() >= 0.70)[0][0] + 1

# ## the Principal Components (PCs) come from the `transform` method
PCs = skpca.transform(X)
PCs = PCs[:, :ipc]

# ## the Empirical Orthogonal Functions (EOFs) are in `components_`
EOFs = skpca.components_
EOFs = EOFs[:ipc, :]

# ## reconstruct the 2D (lat, lon) maps: fill land with -999, then mask it
EOF_recons = np.ones((ipc, len(lat) * len(lon))) * -999.
for i in range(ipc):  # `xrange` is Python 2 only
    EOF_recons[i, ocean] = EOFs[i, :]
EOF_recons = ma.masked_values(
    np.reshape(EOF_recons, (ipc, len(lat), len(lon)), order='F'), -999.)

plt.figure()
plt.imshow(EOF_recons[0, :, :], origin='lower',
           interpolation='nearest', aspect='auto')
plt.colorbar()

# ## scale the Principal Components and persist that scaler too
scaler_PCs = StandardScaler()
scaler_PCs.fit(PCs)
PCs_std = scaler_PCs.transform(PCs)
joblib.dump(scaler_PCs, '../data/scaler_PCs.pkl')

PCdf = pd.DataFrame(PCs_std,
                    index=dsub['time'].values,
                    columns=["EOF%s" % (x) for x in range(1, PCs_std.shape[1] + 1)])
PCdf.to_csv('../data/EOF_ERSST_PCs.csv')

# ## plot PC1, raw and detrended
f, ax = plt.subplots(figsize=(12, 6))
# `.ix` was removed in pandas 1.0; `.iloc` is the positional equivalent
PCdf.iloc[:, 0].plot(ax=ax, color='k', label='PC1')
ax.plot(PCdf.index, detrend(PCdf.iloc[:, 0].values), 'r',
        label='PC1 (trend removed)')
ax.grid(False)  # the original passed 'off', a truthy string that turns the grid ON
ax.legend(loc=1)

# The original notebook ended with a shell call converting itself to HTML:
#   ipython nbconvert sklearn_EOF_decomposition.ipynb --to html
# (`jupyter nbconvert` on modern installs); omitted in script form.