#!/usr/bin/env python
# coding: utf-8

# # Sea Surface Altimetry Data Analysis
#
# For this example we will use gridded sea-surface altimetry data from the Copernicus Marine Environment Monitoring Service (CMEMS):
#
# http://marine.copernicus.eu/services-portfolio/access-to-products/?option=com_csw&view=details&product_id=SEALEVEL_GLO_PHY_L4_REP_OBSERVATIONS_008_047
#
# This is a widely used dataset in physical oceanography and climate science.
#
# ![globe image](http://marine.copernicus.eu/documents/IMG/SEALEVEL_GLO_SLA_MAP_L4_REP_OBSERVATIONS_008_027.png)
#
# The dataset has already been extracted from Copernicus and stored in Google Cloud Storage in [xarray-zarr](http://xarray.pydata.org/en/latest/io.html#zarr) format.

# In[ ]:

import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import gcsfs

plt.rcParams['figure.figsize'] = (15, 10)
get_ipython().run_line_magic('matplotlib', 'inline')

# ### Initialize Dataset
#
# Here we load the dataset from the zarr store. Note that this very large dataset opens nearly instantly: only the metadata is read at this point, and the data themselves are loaded lazily. We can already see the full list of variables and coordinates.

# In[ ]:

# map the GCS bucket path to a mutable mapping that xarray can use as a zarr store
gcsmap = gcsfs.mapping.GCSMap('pangeo-data/dataset-duacs-rep-global-merged-allsat-phy-l4-v3-alt')
ds = xr.open_zarr(gcsmap)
ds

# In[ ]:

### Alternate method to load the data using Intake,
### where the remote YAML spec points to the data on GCS
# import intake
# ds = intake.Catalog('https://raw.githubusercontent.com/pangeo-data/pangeo/master/gce/catalog.yaml').sea_surface.to_dask()
# ds

# ### Examine Metadata
#
# For those unfamiliar with this dataset, the variable metadata is very helpful for understanding what the variables actually represent.

# In[ ]:

for v in ds.data_vars:
    print('{:>10}: {}'.format(v, ds[v].attrs['long_name']))

# ### Create and Connect to Dask Distributed Cluster

# In[ ]:

from dask.distributed import Client, progress
from dask_kubernetes import KubeCluster

cluster = KubeCluster(n_workers=20)
cluster

# **☝️ Don't forget to click the link above to view the scheduler dashboard!**

# In[ ]:

client = Client(cluster)
client

# ## Visually Examine Some of the Data
#
# Let's do a sanity check that the data looks reasonable:

# In[ ]:

plt.rcParams['figure.figsize'] = (15, 8)
# method='nearest' returns the closest available date if this one is not in the dataset
ds.sla.sel(time='1982-08-07', method='nearest').plot()

# ## Timeseries of Global Mean Sea Level
#
# Here we make a simple yet fundamental calculation: the rate of increase of global mean sea level over the observational period. (Note that for simplicity we take an unweighted mean over latitude and longitude; a true global mean would weight each grid cell by its area.) Below, after the Hovmöller diagram, we also fit a linear trend to put a number on this rate.

# In[ ]:

# the number of GB involved in the reduction
ds.sla.nbytes / 1e9

# In[ ]:

# the computationally intensive step
sla_timeseries = ds.sla.mean(dim=('latitude', 'longitude')).load()

# In[ ]:

sla_timeseries.plot(label='full data')
# 365 daily samples is approximately one year
sla_timeseries.rolling(time=365, center=True).mean().plot(label='rolling annual mean')
plt.ylabel('Sea Level Anomaly [m]')
plt.title('Global Mean Sea Level')
plt.legend()
plt.grid()

# In order to understand how the sea level rise is distributed in latitude, we can make a sort of [Hovmöller diagram](https://en.wikipedia.org/wiki/Hovm%C3%B6ller_diagram).

# In[ ]:

sla_hov = ds.sla.mean(dim='longitude').load()

# In[ ]:

fig, ax = plt.subplots(figsize=(12, 4))
sla_hov.name = 'Sea Level Anomaly [m]'
sla_hov.transpose().plot(vmax=0.2, ax=ax)

# We can see that most sea level rise is actually in the Southern Hemisphere.
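# To put a number on the rate of increase noted above, we can fit a linear trend to the global mean timeseries. This cell is an illustrative sketch, not part of the original notebook: it assumes the `sla_timeseries` computed above and approximates a year as 365 days.

# In[ ]:

# illustrative sketch (not in the original notebook): least-squares linear trend
# of global mean sea level, using the sla_timeseries computed above
years = (sla_timeseries.time - sla_timeseries.time[0]) / np.timedelta64(365, 'D')
slope, intercept = np.polyfit(years.values, sla_timeseries.values, 1)
print('Approximate global mean sea level trend: {:.2f} mm/year'.format(slope * 1000))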
# ## Sea Level Variability
#
# We can examine the natural variability in sea level by looking at its standard deviation in time.

# In[ ]:

sla_std = ds.sla.std(dim='time').load()
sla_std.name = 'Sea Level Variability [m]'

# In[ ]:

ax = sla_std.plot()
_ = plt.title('Sea Level Variability')
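# When the analysis is finished, it is good practice to shut down the cluster so the Kubernetes worker resources are released. A minimal sketch, assuming the `client` and `cluster` objects created above:

# In[ ]:

# release the Dask workers and close the connection to the scheduler
client.close()
cluster.close()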