Search and Load CMIP6 Data via ESGF / OPeNDAP¶

This notebook shows how to search and load data via the Earth System Grid Federation (ESGF) infrastructure. This infrastructure works great and is the foundation of the CMIP6 distribution system.

The main technologies used here are the ESGF search API, used to figure out what data we want, and OPeNDAP, a remote data access protocol over HTTP.

In [ ]:

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr

# Show xarray objects with the HTML representation in cell outputs.
xr.set_options(display_style='html')
# Embed matplotlib figures directly in the notebook output.
%matplotlib inline
# Ask the inline backend for 'retina' (high-resolution) figures.
%config InlineBackend.figure_format = 'retina' 

Search using ESGF API¶

In [2]:

#!/usr/bin/env python
from __future__ import print_function
import requests
import xml.etree.ElementTree as ET
import numpy

# Author: Unknown
# I got the original version from a word document published by ESGF
# https://docs.google.com/document/d/1pxz1Kd3JHfFp8vR2JCVBfApbsHmbUQQstifhGNdc6U0/edit?usp=sharing

# API AT: https://github.com/ESGF/esgf.github.io/wiki/ESGF_Search_REST_API#results-pagination

def esgf_search(server="https://esgf-node.llnl.gov/esg-search/search",
                files_type="OPENDAP", local_node=True, project="CMIP6",
                verbose=False, format="application%2Fsolr%2Bjson",
                use_csrf=False, **search):
    """Query an ESGF search endpoint and return the URLs of matching files.

    The search is paginated: requests are repeated with an increasing
    ``offset`` until every record reported by the server has been read (or
    the server stops returning records).

    Parameters
    ----------
    server : str
        Base URL of the search service; the query string is appended to it.
    files_type : str
        Access method to keep (compared case-insensitively against the last
        ``|``-separated field of each entry in a record's ``url`` list).
    local_node : bool
        If true, ``distrib=false`` is added to the query.
    project : str
        Value of the ``project`` query parameter.
    verbose : bool
        If true, print every key/value pair of every record returned.
    format : str
        Value of the ``format`` query parameter. It is inserted verbatim
        into the URL, so it must already be percent-encoded.
    use_csrf : bool
        If true, first GET ``server`` and forward the CSRF cookie as
        ``csrfmiddlewaretoken``.
    **search
        Additional ``key=value`` search constraints, inserted verbatim into
        the URL (no percent-encoding is applied).

    Returns
    -------
    list of str
        Sorted URLs of the requested access type, with everything from the
        first ``.html`` onwards removed.

    Raises
    ------
    requests.HTTPError
        If the server answers a search request with an error status.
    """
    payload = search
    payload["project"] = project
    payload["type"] = "File"
    if local_node:
        payload["distrib"] = "false"

    files_type = files_type.upper()
    all_files = []
    offset = 0
    num_found = None  # unknown until the first response arrives

    # The context manager closes the session (and its pooled connections)
    # even when a request fails.
    with requests.Session() as client:
        if use_csrf:
            client.get(server)
            if 'csrftoken' in client.cookies:
                # Django 1.6 and up
                csrftoken = client.cookies['csrftoken']
            else:
                # older versions
                csrftoken = client.cookies['csrf']
            payload["csrfmiddlewaretoken"] = csrftoken

        payload["format"] = format

        while num_found is None or offset < num_found:
            payload["offset"] = offset
            query = "&".join("{}={}".format(k, v) for k, v in payload.items())
            url = "{}/?{}".format(server, query)
            print(url)
            r = client.get(url)
            r.raise_for_status()
            resp = r.json()["response"]
            num_found = int(resp["numFound"])
            docs = resp["docs"]
            if not docs:
                # The server reported more records than it returns; without
                # this guard the offset would never advance and the loop
                # would request the same page forever.
                break
            offset += len(docs)
            for d in docs:
                if verbose:
                    for k in d:
                        print("{}: {}".format(k, d[k]))
                for f in d["url"]:
                    sp = f.split("|")
                    if sp[-1] == files_type:
                        all_files.append(sp[0].split(".html")[0])
    return sorted(all_files)

In [3]:

# Search constraints that narrow the query to monthly near-surface air
# temperature from a single CESM2 historical ensemble member.
tas_facets = dict(
    activity_id='CMIP',
    table_id='Amon',
    variable_id='tas',
    experiment_id='historical',
    institution_id="NCAR",
    source_id="CESM2",
    member_id="r10i1p1f1",
)
result = esgf_search(**tas_facets)
result

https://esgf-node.llnl.gov/esg-search/search/?activity_id=CMIP&table_id=Amon&variable_id=tas&experiment_id=historical&institution_id=NCAR&source_id=CESM2&member_id=r10i1p1f1&project=CMIP6&type=File&distrib=false&format=application%2Fsolr%2Bjson&offset=0

Out[3]:

['http://aims3.llnl.gov/thredds/dodsC/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r10i1p1f1/Amon/tas/gn/v20190313/tas_Amon_CESM2_historical_r10i1p1f1_gn_185001-189912.nc',
 'http://aims3.llnl.gov/thredds/dodsC/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r10i1p1f1/Amon/tas/gn/v20190313/tas_Amon_CESM2_historical_r10i1p1f1_gn_190001-194912.nc',
 'http://aims3.llnl.gov/thredds/dodsC/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r10i1p1f1/Amon/tas/gn/v20190313/tas_Amon_CESM2_historical_r10i1p1f1_gn_195001-199912.nc',
 'http://aims3.llnl.gov/thredds/dodsC/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r10i1p1f1/Amon/tas/gn/v20190313/tas_Amon_CESM2_historical_r10i1p1f1_gn_200001-201412.nc',
 'http://esgf-data.ucar.edu/thredds/dodsC/esg_dataroot/CMIP6/CMIP/NCAR/CESM2/historical/r10i1p1f1/Amon/tas/gn/v20190313/tas_Amon_CESM2_historical_r10i1p1f1_gn_185001-189912.nc',
 'http://esgf-data.ucar.edu/thredds/dodsC/esg_dataroot/CMIP6/CMIP/NCAR/CESM2/historical/r10i1p1f1/Amon/tas/gn/v20190313/tas_Amon_CESM2_historical_r10i1p1f1_gn_190001-194912.nc',
 'http://esgf-data.ucar.edu/thredds/dodsC/esg_dataroot/CMIP6/CMIP/NCAR/CESM2/historical/r10i1p1f1/Amon/tas/gn/v20190313/tas_Amon_CESM2_historical_r10i1p1f1_gn_195001-199912.nc',
 'http://esgf-data.ucar.edu/thredds/dodsC/esg_dataroot/CMIP6/CMIP/NCAR/CESM2/historical/r10i1p1f1/Amon/tas/gn/v20190313/tas_Amon_CESM2_historical_r10i1p1f1_gn_200001-201412.nc']

Load Data with Xarray¶

These are OPeNDAP endpoints. Xarray, together with the netCDF4 Python library, allows lazy loading.

In [4]:

# There are multiple sources of the same data (one copy per data node), so we
# need to pick one; opening the whole list would read every file twice.
# Keep only the files served by the host of the last sorted URL. Filtering by
# host works however many files the dataset is split into.
data_node = result[-1].split("/")[2]  # "http://<host>/..." -> "<host>"
files_to_open = [url for url in result if url.split("/")[2] == data_node]

ds = xr.open_mfdataset(files_to_open, combine='by_coords')
ds

/Users/rpa/Code/xarray/xarray/conventions.py:494: SerializationWarning: variable 'tas' has multiple fill values {1e+20, 1e+20}, decoding all values to NaN.
  use_cftime=use_cftime,

Out[4]:

xarray.Dataset

Dimensions:
- lat: 192
- lon: 288
- nbnd: 2
- time: 1980

Coordinates: (3)

lat

(lat)

float64

-90.0 -89.06 -88.12 ... 89.06 90.0

axis :: Y
bounds :: lat_bnds
standard_name :: latitude
title :: Latitude
type :: double
units :: degrees_north
valid_max :: 90.0
valid_min :: -90.0
_ChunkSizes :: 192

array([-90.      , -89.057592, -88.115183, -87.172775, -86.230366, -85.287958,
       -84.34555 , -83.403141, -82.460733, -81.518325, -80.575916, -79.633508,
       -78.691099, -77.748691, -76.806283, -75.863874, -74.921466, -73.979058,
       -73.036649, -72.094241, -71.151832, -70.209424, -69.267016, -68.324607,
       -67.382199, -66.439791, -65.497382, -64.554974, -63.612565, -62.670157,
       -61.727749, -60.78534 , -59.842932, -58.900524, -57.958115, -57.015707,
       -56.073298, -55.13089 , -54.188482, -53.246073, -52.303665, -51.361257,
       -50.418848, -49.47644 , -48.534031, -47.591623, -46.649215, -45.706806,
       -44.764398, -43.82199 , -42.879581, -41.937173, -40.994764, -40.052356,
       -39.109948, -38.167539, -37.225131, -36.282723, -35.340314, -34.397906,
       -33.455497, -32.513089, -31.570681, -30.628272, -29.685864, -28.743455,
       -27.801047, -26.858639, -25.91623 , -24.973822, -24.031414, -23.089005,
       -22.146597, -21.204188, -20.26178 , -19.319372, -18.376963, -17.434555,
       -16.492147, -15.549738, -14.60733 , -13.664921, -12.722513, -11.780105,
       -10.837696,  -9.895288,  -8.95288 ,  -8.010471,  -7.068063,  -6.125654,
        -5.183246,  -4.240838,  -3.298429,  -2.356021,  -1.413613,  -0.471204,
         0.471204,   1.413613,   2.356021,   3.298429,   4.240838,   5.183246,
         6.125654,   7.068063,   8.010471,   8.95288 ,   9.895288,  10.837696,
        11.780105,  12.722513,  13.664921,  14.60733 ,  15.549738,  16.492147,
        17.434555,  18.376963,  19.319372,  20.26178 ,  21.204188,  22.146597,
        23.089005,  24.031414,  24.973822,  25.91623 ,  26.858639,  27.801047,
        28.743455,  29.685864,  30.628272,  31.570681,  32.513089,  33.455497,
        34.397906,  35.340314,  36.282723,  37.225131,  38.167539,  39.109948,
        40.052356,  40.994764,  41.937173,  42.879581,  43.82199 ,  44.764398,
        45.706806,  46.649215,  47.591623,  48.534031,  49.47644 ,  50.418848,
        51.361257,  52.303665,  53.246073,  54.188482,  55.13089 ,  56.073298,
        57.015707,  57.958115,  58.900524,  59.842932,  60.78534 ,  61.727749,
        62.670157,  63.612565,  64.554974,  65.497382,  66.439791,  67.382199,
        68.324607,  69.267016,  70.209424,  71.151832,  72.094241,  73.036649,
        73.979058,  74.921466,  75.863874,  76.806283,  77.748691,  78.691099,
        79.633508,  80.575916,  81.518325,  82.460733,  83.403141,  84.34555 ,
        85.287958,  86.230366,  87.172775,  88.115183,  89.057592,  90.      ])

lon
(lon)
float64
0.0 1.25 2.5 ... 356.2 357.5 358.8
axis :
X
bounds :
lon_bnds
standard_name :
longitude
title :
Longitude
type :
double
units :
degrees_east
valid_max :
360.0
valid_min :
0.0
_ChunkSizes :
288
```
array([  0.  ,   1.25,   2.5 , ..., 356.25, 357.5 , 358.75])
```

time

(time)

object

1850-01-15 12:00:00 ... 2014-12-15 12:00:00

axis :: T
bounds :: time_bnds
standard_name :: time
title :: time
type :: double
_ChunkSizes :: 512

array([cftime.DatetimeNoLeap(1850, 1, 15, 12, 0, 0, 0, 2, 15),
       cftime.DatetimeNoLeap(1850, 2, 14, 0, 0, 0, 0, 4, 45),
       cftime.DatetimeNoLeap(1850, 3, 15, 12, 0, 0, 0, 5, 74), ...,
       cftime.DatetimeNoLeap(2014, 10, 15, 12, 0, 0, 0, 5, 288),
       cftime.DatetimeNoLeap(2014, 11, 15, 0, 0, 0, 0, 1, 319),
       cftime.DatetimeNoLeap(2014, 12, 15, 12, 0, 0, 0, 3, 349)], dtype=object)

Data variables: (4)

time_bnds

(time, nbnd)

object

dask.array<chunksize=(600, 2), meta=np.ndarray>

_ChunkSizes :: [1 2]





  
      Array  Chunk 
  
  
     Bytes  31.68 kB   9.60 kB 
     Shape  (1980, 2)   (600, 2) 
     Count  12 Tasks  4 Chunks 
     Type  object  numpy.ndarray

lat_bnds

(time, lat, nbnd)

float64

dask.array<chunksize=(600, 192, 2), meta=np.ndarray>

units :: degrees_north
_ChunkSizes :: [192 2]





  
      Array  Chunk 
  
  
     Bytes  6.08 MB   1.84 MB 
     Shape  (1980, 192, 2)   (600, 192, 2) 
     Count  20 Tasks  4 Chunks 
     Type  float64  numpy.ndarray

lon_bnds

(time, lon, nbnd)

float64

dask.array<chunksize=(600, 288, 2), meta=np.ndarray>

units :: degrees_east
_ChunkSizes :: [288 2]





  
      Array  Chunk 
  
  
     Bytes  9.12 MB   2.76 MB 
     Shape  (1980, 288, 2)   (600, 288, 2) 
     Count  20 Tasks  4 Chunks 
     Type  float64  numpy.ndarray

tas

(time, lat, lon)

float32

dask.array<chunksize=(600, 192, 288), meta=np.ndarray>

cell_measures :: area: areacella
cell_methods :: area: time: mean
comment :: near-surface (usually, 2 meter) air temperature
description :: near-surface (usually, 2 meter) air temperature
frequency :: mon
id :: tas
long_name :: Near-Surface Air Temperature
mipTable :: Amon
out_name :: tas
prov :: Amon ((isd.003))
realm :: atmos
standard_name :: air_temperature
time :: time
time_label :: time-mean
time_title :: Temporal mean
title :: Near-Surface Air Temperature
type :: real
units :: K
variable_id :: tas
_ChunkSizes :: [ 1 192 288]





  
      Array  Chunk 
  
  
     Bytes  437.94 MB   132.71 MB 
     Shape  (1980, 192, 288)   (600, 192, 288) 
     Count  12 Tasks  4 Chunks 
     Type  float32  numpy.ndarray

Attributes: (46)
Conventions :
CF-1.7 CMIP-6.2
activity_id :
CMIP
branch_method :
standard
branch_time_in_child :
674885.0
branch_time_in_parent :
306600.0
case_id :
24
cesm_casename :
b.e21.BHIST.f09_g17.CMIP6-historical.010
contact :
cesm_cmip6@ucar.edu
creation_date :
2019-03-12T06:39:18Z
data_specs_version :
01.00.29
experiment :
Simulation of recent past (1850 to 2014). Impose changing conditions (consistent with observations). Should be initialised from a point early enough in the pre-industrial control run to ensure that the end of all the perturbed runs branching from the end of this historical run end before the end of the control. Only one ensemble member is requested but modelling groups are strongly encouraged to submit at least three ensemble members of their CMIP historical simulation.
experiment_id :
historical
external_variables :
areacella
forcing_index :
1
frequency :
mon
further_info_url :
https://furtherinfo.es-doc.org/CMIP6.NCAR.CESM2.historical.none.r10i1p1f1
grid :
native 0.9x1.25 finite volume grid (192x288 latxlon)
grid_label :
gn
initialization_index :
1
institution :
National Center for Atmospheric Research, Climate and Global Dynamics Laboratory, 1850 Table Mesa Drive, Boulder, CO 80305, USA
institution_id :
NCAR
license :
CMIP6 model data produced by <The National Center for Atmospheric Research> is licensed under a Creative Commons Attribution-[]ShareAlike 4.0 International License (https://creativecommons.org/licenses/). Consult https://pcmdi.llnl.gov/CMIP6/TermsOfUse for terms of use governing CMIP6 output, including citation requirements and proper acknowledgment. Further information about this data, including some limitations, can be found via the further_info_url (recorded as a global attribute in this file)[]. The data producers and data providers make no warranty, either express or implied, including, but not limited to, warranties of merchantability and fitness for a particular purpose. All liabilities arising from the supply of the information (including any liability arising in negligence) are excluded to the fullest extent permitted by law.
mip_era :
CMIP6
model_doi_url :
https://doi.org/10.5065/D67H1H0V
nominal_resolution :
100 km
parent_activity_id :
CMIP
parent_experiment_id :
piControl
parent_mip_era :
CMIP6
parent_source_id :
CESM2
parent_time_units :
days since 0001-01-01 00:00:00
parent_variant_label :
r1i1p1f1
physics_index :
1
product :
model-output
realization_index :
10
realm :
atmos
source :
CESM2 (2017): atmosphere: CAM6 (0.9x1.25 finite volume grid; 288 x 192 longitude/latitude; 32 levels; top level 2.25 mb); ocean: POP2 (320x384 longitude/latitude; 60 levels; top grid cell 0-10 m); sea_ice: CICE5.1 (same grid as ocean); land: CLM5 0.9x1.25 finite volume grid; 288 x 192 longitude/latitude; 32 levels; top level 2.25 mb); aerosol: MAM4 (0.9x1.25 finite volume grid; 288 x 192 longitude/latitude; 32 levels; top level 2.25 mb); atmoschem: MAM4 (0.9x1.25 finite volume grid; 288 x 192 longitude/latitude; 32 levels; top level 2.25 mb); landIce: CISM2.1; ocnBgchem: MARBL (320x384 longitude/latitude; 60 levels; top grid cell 0-10 m)
source_id :
CESM2
source_type :
AOGCM BGC
sub_experiment :
none
sub_experiment_id :
none
table_id :
Amon
tracking_id :
hdl:21.14100/e47b79db-3925-45a7-9c0a-6799c2f1e8ae
variable_id :
tas
variant_info :
CMIP6 20th century experiments (1850-2014) with CAM6, interactive land (CLM5), coupled ocean (POP2) with biogeochemistry (MARBL), interactive sea ice (CICE5.1), and non-evolving land ice (CISM2.1)
variant_label :
r10i1p1f1
DODS_EXTRA.Unlimited_Dimension :
time

Plot a map from a specific date.

In [5]:

# Select the single monthly field for January 1950 and drop the length-1 time axis.
tas_jan_1950 = ds.tas.sel(time='1950-01').squeeze()
tas_jan_1950.plot()

Out[5]:

<matplotlib.collections.QuadMesh at 0x11dd89908>

Create a timeseries of global-average surface air temperature. For this we need the area weighting factor for each gridpoint.

In [6]:

# Look up the grid-cell area variable (areacella) for the same model and experiment.
area_facets = dict(
    variable_id='areacella',
    activity_id='CMIP',
    experiment_id='historical',
    institution_id="NCAR",
    source_id="CESM2",
)
files_area = esgf_search(**area_facets)
# Open the first URL of the sorted search result.
first_area_url = files_area[0]
ds_area = xr.open_dataset(first_area_url)
ds_area

In [7]:

# Area-weighted global mean: sum(tas * area) / sum(area) over the horizontal grid.
cell_area = ds_area.areacella
horizontal_dims = ['lon', 'lat']
total_area = cell_area.sum(dim=horizontal_dims)
weighted_sum = (ds.tas * cell_area).sum(dim=horizontal_dims)
ta_timeseries = weighted_sum / total_area
ta_timeseries

Out[7]:

xarray.DataArray

time: 1980

dask.array<chunksize=(600,), meta=np.ndarray>





  
      Array  Chunk 
  
  
     Bytes  7.92 kB   2.40 kB 
     Shape  (1980,)   (600,) 
     Count  38 Tasks  4 Chunks 
     Type  float32  numpy.ndarray

Coordinates: (1)

time

(time)

object

1850-01-15 12:00:00 ... 2014-12-15 12:00:00

axis :: T
bounds :: time_bnds
standard_name :: time
title :: time
type :: double
_ChunkSizes :: 512

array([cftime.DatetimeNoLeap(1850, 1, 15, 12, 0, 0, 0, 2, 15),
       cftime.DatetimeNoLeap(1850, 2, 14, 0, 0, 0, 0, 4, 45),
       cftime.DatetimeNoLeap(1850, 3, 15, 12, 0, 0, 0, 5, 74), ...,
       cftime.DatetimeNoLeap(2014, 10, 15, 12, 0, 0, 0, 5, 288),
       cftime.DatetimeNoLeap(2014, 11, 15, 0, 0, 0, 0, 1, 319),
       cftime.DatetimeNoLeap(2014, 12, 15, 12, 0, 0, 0, 3, 349)], dtype=object)

Attributes: (0)

By default the data are loaded lazily, as Dask arrays. Here we trigger computation explicitly.

In [8]:

# Trigger the lazy computation now; %time reports the CPU and wall-clock time it takes.
%time ta_timeseries.load()

CPU times: user 4.93 s, sys: 6.73 s, total: 11.7 s
Wall time: 1min 35s

Out[8]:

xarray.DataArray

time: 1980

284.99948 285.23215 285.85364 ... 288.54376 287.61884 287.06284

array([284.99948, 285.23215, 285.85364, ..., 288.54376, 287.61884,
       287.06284], dtype=float32)

Coordinates: (1)

time

(time)

object

1850-01-15 12:00:00 ... 2014-12-15 12:00:00

axis :: T
bounds :: time_bnds
standard_name :: time
title :: time
type :: double
_ChunkSizes :: 512

array([cftime.DatetimeNoLeap(1850, 1, 15, 12, 0, 0, 0, 2, 15),
       cftime.DatetimeNoLeap(1850, 2, 14, 0, 0, 0, 0, 4, 45),
       cftime.DatetimeNoLeap(1850, 3, 15, 12, 0, 0, 0, 5, 74), ...,
       cftime.DatetimeNoLeap(2014, 10, 15, 12, 0, 0, 0, 5, 288),
       cftime.DatetimeNoLeap(2014, 11, 15, 0, 0, 0, 0, 1, 319),
       cftime.DatetimeNoLeap(2014, 12, 15, 12, 0, 0, 0, 3, 349)], dtype=object)

Attributes: (0)

In [ ]:

# Overlay the raw monthly series with its 12-month rolling mean.
rolling_mean = ta_timeseries.rolling(time=12).mean()
ta_timeseries.plot(label='monthly')
rolling_mean.plot(label='12 month rolling mean')
plt.legend()
plt.title('Global Mean Surface Air Temperature')

In [ ]:

# Size of the dataset in MB (nbytes / 1e6), divided by 40.
# NOTE(review): the notebook does not say what the constant 40 stands for;
# confirm its meaning and consider replacing it with a named constant.
ds.nbytes / 1e6 / 40

In [ ]: