#!/usr/bin/env python
# coding: utf-8

# This notebook was created using the Python 2.7.12 kernel. Mentioning the Python version matters here because, if this document is executed with the Python 3.5 kernel instead, some parts of the code may turn out not to be compatible.

# # Exploring features available in the Jupyterhub environment

# At the moment, the current Jupyterhub environment features two Python kernels (Python 2.7.12 and Python 3.5.2). A notebook document (an existing one or a new one) is opened with either of these kernels. Irrespective of the choice of kernel, it is a feature of the notebook server that a bash session is also started along with the chosen kernel. This means Unix commands can also be executed in the code cells. To do this, the command needs to be prefixed with the "!" character.

# In[1]:


get_ipython().system('ls')


# In[2]:


get_ipython().system('head README.md')


# One can also use the cell magic "%%bash" to execute Unix commands. This is particularly useful for multi-line commands.

# In[3]:


get_ipython().run_cell_magic('bash', '', '\nmodule list\n')


# Notice that this environment comes preloaded with a few modules like `cdo`, `netcdf` and `texlive`. The `python` version depends on the selected kernel.
#
# Since the `cdo` module is already loaded into the environment, we can execute `cdo` commands in the notebook.
#
# It is important to note that this is a frozen environment: no new modules can be loaded, and the module versions cannot be swapped. I guess for the most part, this minimal environment should be good enough.
#
# Likewise, quite a decent set of `python` packages is available.
#
# Before continuing further, it is probably useful to take a moment and quickly go through the following links, as they cover some of the most common features of a notebook, most of which are also valid in this environment.
#
# - http://quasiben.github.io/dfwmeetup_2014/#/
# - http://arogozhnikov.github.io/2016/09/10/jupyter-features.html
#
# Here, I pick a few from these sources...

# ### Embedding sites
#
# https://jupyter.brynmawr.edu/services/public/dblank/Jupyter%20Notebook%20Users%20Manual.ipynb

# In[4]:


from IPython.display import HTML
# embed the users-manual page linked above (assumed iframe markup; the
# original embed content is not preserved in this copy)
HTML('<iframe src="https://jupyter.brynmawr.edu/services/public/dblank/Jupyter%20Notebook%20Users%20Manual.ipynb" width="100%" height="400"></iframe>')


# ### Inline Audio
#
# Please be warned that this audio may not be pleasant.

# In[5]:


import numpy as np
from IPython.display import Audio

framerate = 44100
t = np.linspace(0, 5, framerate * 5)
# a chirp: the pitch rises with time since the phase grows quadratically
data = np.sin(2 * np.pi * 220 * t**2)
Audio(data, rate=framerate)


# ### Embedding video
#
# The YouTube video shown here is from Jake VanderPlas. He talks about "Reproducible Data Analysis in Jupyter notebook". It is a series of ten short, roughly five-minute videos showing off a typical workflow using notebooks.

# In[6]:


from IPython.display import YouTubeVideo
YouTubeVideo("_ZEWDGpM-vM")
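# As an aside (not part of the original notebook): `YouTubeVideo` also accepts
# optional parameters. `width` and `height` control the player size, and, as I
# understand the IPython implementation, any extra keyword arguments are passed
# through as URL query parameters, so `start` (in seconds) should skip into the
# video. A minimal sketch under that assumption:

# In[ ]:


# assumed behaviour: start=60 becomes the ?start=60 query parameter
YouTubeVideo("_ZEWDGpM-vM", width=800, height=450, start=60)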
# Embedding a series of videos... The first four videos in the same series are worth checking out.

# In[7]:


from IPython.display import display

for name in ("_ZEWDGpM-vM", "yUNBVzQfugg", "J45NJ0pJXWQ", "VdLdfF_uuSQ"):
    display(YouTubeVideo(name))


# ## $\LaTeX$

# In[8]:


get_ipython().run_cell_magic('latex', '', '\n\\begin{align}\n\n\\frac{\\partial u}{\\partial t} + \\nabla \\cdot \\left( \\boldsymbol{v} u - D\\nabla u \\right) = f\n\n\\end{align}\n')


# The same equation can also be rendered in a markdown cell like this: $\frac{\partial u}{\partial t} + \nabla \cdot \left( \boldsymbol{v} u - D\nabla u \right) = f$

# ### Inline plots

# In[9]:


import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

x = np.arange(0, 2 * np.pi, .01)
plt.plot(x, np.sin(x))


# ### Expanding the cell width to fill the screen width

# In[10]:


# widen the notebook cells to the full browser width (the original cell body
# is not preserved in this copy; this is the canonical CSS snippet)
display(HTML("<style>.container { width:100% !important; }</style>"))


# The things covered so far should give you some idea of what can be done in a notebook. Although not everything is needed all the time, it is good to know that these features exist. The main takeaway is that one can take advantage of them to create a rich document that enhances the storytelling.

# # Working with a netCDF file
#
# A more realistic example using these features is to work with real data and show a typical workflow. In this case it is a sample meteogram dataset from the ICON-LEM-DE simulations.
#
# Data URL: https://swift.dkrz.de/v1/dkrz_1e33ba3a-9ecb-452f-93b9-583cf4a66e57/jupyterhub_sample_datasets/Meteogram_sample_dataset.nc

# Downloading the data:

# In[11]:


import requests
import shutil
import os

url = "https://swift.dkrz.de/v1/dkrz_1e33ba3a-9ecb-452f-93b9-583cf4a66e57/jupyterhub_sample_datasets/Meteogram_sample_dataset.nc"
filename = "Meteogram_sample_dataset.nc"

if not os.path.exists(filename):
    r = requests.get(url, stream=True)
    if r.status_code == 200:
        with open(filename, 'wb') as f:
            # stream the raw payload straight to disk
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)


# In[12]:


get_ipython().system('ls -lh')


# The file is 13MB in size. As mentioned earlier, we can use `cdo` to perform some basic operations on this file. Note that the Python variable `filename` is passed to the shell command via the `$` prefix.

# In[13]:


get_ipython().system('cdo showvar $filename')


# In[14]:


get_ipython().system('cdo sinfo $filename')


# So we have four variables in this dataset: measurements for the full day, at a temporal frequency of 9 seconds.
#
# We can also use the `ncdump -h` command to get the header information.

# In[15]:


get_ipython().system('ncdump -h $filename | head -n 50')


# This provides quite a good insight into the dataset.
#
# The dataset contains 3 surface variables ($SHFL$, $LHFL$ and $T2M$) and 1 profile variable (a.k.a. volume variable) $T$ with 150 height levels.
#
# We open this dataset in Python using the [xarray](http://xarray.pydata.org/en/stable/) package and visualize the data.

# In[16]:


import xarray as xr

ds = xr.open_dataset(filename)
ds


# The following line ensures that plots are rendered inline.

# In[17]:


get_ipython().run_line_magic('matplotlib', 'inline')


# Plotting the sensible heat flux:

# In[18]:


ds['SHFL'].plot()


# We can pass some arguments to the plot function to customize it.

# In[19]:


ds['SHFL'].plot(figsize=(12, 4))


# We can convert the data to a pandas Series object and plot it. This produces the same plot, but the formatting of the time labels is altered.

# In[20]:


ds['SHFL'].to_pandas().plot(figsize=(12, 4))


# To visualize the sensible and latent heat fluxes in the same plot, the dataset is converted to a DataFrame object.

# In[21]:


fluxes = ds[['SHFL', 'LHFL']].to_dataframe()
fluxes.plot(figsize=(12, 4))
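# Since `DataFrame.plot` returns a matplotlib axes object, the figure can be
# polished further with plain matplotlib calls. A minimal sketch (the W/m²
# unit is an assumption based on typical heat-flux output, not read from the
# file, so check the variable attributes before relying on it):

# In[ ]:


ax = fluxes.plot(figsize=(12, 4))
ax.axhline(0, color='black', linewidth=0.5)  # mark the zero-flux line
ax.set_ylabel('heat flux (W m$^{-2}$)')      # assumed units
ax.set_title('Surface heat fluxes')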
# Instead of plotting, we can also take a look at the data.
#
# Top five rows:

# In[22]:


fluxes.head()


# And the last five rows:

# In[23]:


fluxes.tail()


# Resampling the data to a 10-minute frequency:

# In[24]:


fluxes_10min = fluxes.resample('10min').mean()
fluxes_10min.plot(figsize=(12, 4))


# In[25]:


fluxes_10min.head()


# Working with $T2M$ (temperature at 2 meters) is similar, so it is skipped here.
#
# Next, we work with the profile variable $T$ (temperature), which is two-dimensional data (time, height_2).

# In[26]:


ds['T']


# In[27]:


ds['T'].plot()


# A better representation is obtained when the data is transposed, so that the axes are swapped.

# In[28]:


options = dict(figsize=(12, 4), cmap='RdYlBu_r')
ds['T'].transpose().plot(**options)


# This 2D plot shows a complete picture of the temperature for the whole day (every time step) and for all height levels. In practice, producing a profile plot at a specific time stamp is the preferred way of visualizing the data.

# In[29]:


# selecting the 300th time step
ds['T'].isel(time=300).plot()


# The above plot looks better if the axes are swapped. Here the technique of transposing the data does not work (the data is one-dimensional), so the height information has to be passed explicitly as the second argument to the plotting function.

# In[30]:


import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(6, 7))
T = ds['T'].isel(time=300)
ax.plot(T, T.height_2)
ax.set_ylabel('height (m)')
ax.set_xlabel('Temperature (K)')


# Resampling to hourly profiles by taking means:

# In[31]:


# note: this is the old xarray resample API; in recent xarray versions the
# equivalent call is ds['T'].resample(time='1H').mean()
T = ds['T'].resample('1H', 'time', how='mean')


# In[32]:


T.time


# In[33]:


f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(10, 7))

for t in T:
    ax1.plot(t, t.height_2)
    ax2.plot(t, t.height_2, alpha=0.1, color='grey')

ax1.set_ylabel('height (m)')
ax1.set_xlabel('Temperature (K)')
ax2.set_xlabel('Temperature (K)')
f.tight_layout()


# The left panel above is an example of over-plotting. Setting a constant color and a low alpha value, as in the right panel, results in a better representation of the data.
#
# ---
#
# [Holoviews](http://holoviews.org/) is a very interesting plotting library. Since this package is available, the profile plot exercise is repeated with it.

# In[34]:


import pandas as pd
import holoviews as hv

# holoviews supports matplotlib, bokeh and plotly backends for visualization
hv.extension('bokeh')


# Creating a profile plot for the first time step:

# In[35]:


get_ipython().run_cell_magic('opts', 'Curve [width=500 height=600 invert_axes=True show_grid=True]', '\nhv.Curve(T[0])\n')


# Notice that we did not change the dataset or pass the height information to get a proper plot, as we did in the earlier example. Setting `invert_axes` to `True` did the magic.

# In[36]:


get_ipython().run_cell_magic('opts', "Curve [width=500 height=600 invert_axes=True show_legend=False show_grid=True] (alpha=0.1 color='gray')", '\n# creating a mapping of curve objects w.r.t time\ncurves = {pd.to_datetime(t.time.values): hv.Curve(t) for t in T}\n\n# creating a holomap object out of curves\nhmap = hv.HoloMap(curves, kdims=[\'Time\']).relabel("Temperature profiles - Hourly")\n\n# overlaying curves\nhv.NdOverlay(hmap)\n')


# `NdOverlay` plots all the curves in a single plot. Directly displaying the `HoloMap` object automatically creates a time-slider widget, as shown below.

# In[37]:


get_ipython().run_cell_magic('opts', 'Curve [width=500 height=600 invert_axes=True show_grid=True]', '\ncurves = {pd.to_datetime(t.time.values): hv.Curve(t) for t in T}\nhmap = hv.HoloMap(curves, kdims=[\'Time\']).relabel("Temperature profiles - Hourly")\nhmap\n')
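# As a closing aside (not part of the original notebook): a HoloMap rendered
# with the bokeh backend can, in the holoviews versions I am familiar with,
# be written out as a standalone HTML file, slider included, so the result can
# be shared outside of Jupyterhub. A minimal sketch; the output basename is
# arbitrary, and the exact renderer behaviour may differ in the frozen
# environment described above:

# In[ ]:


renderer = hv.renderer('bokeh')
# writes temperature_profiles_hourly.html next to the notebook
renderer.save(hmap, 'temperature_profiles_hourly')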