This notebooks shows how to search and load data via Earth System Grid Federation infrastructure. This infrastructure works great and is the foundation of the CMIP6 distribution system.
The main technologies used here are the ESGF search API, used to figure out what data we want, and OPeNDAP, a remote data access protocol over HTTP.
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
xr.set_options(display_style='html')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
#!/usr/bin/env python
from __future__ import print_function
import requests
import xml.etree.ElementTree as ET
import numpy
# Author: Unknown
# I got the original version from a word document published by ESGF
# https://docs.google.com/document/d/1pxz1Kd3JHfFp8vR2JCVBfApbsHmbUQQstifhGNdc6U0/edit?usp=sharing
# API AT: https://github.com/ESGF/esgf.github.io/wiki/ESGF_Search_REST_API#results-pagination
def esgf_search(server="https://esgf-node.llnl.gov/esg-search/search",
files_type="OPENDAP", local_node=True, project="CMIP6",
verbose=False, format="application%2Fsolr%2Bjson",
use_csrf=False, **search):
client = requests.session()
payload = search
payload["project"] = project
payload["type"]= "File"
if local_node:
payload["distrib"] = "false"
if use_csrf:
client.get(server)
if 'csrftoken' in client.cookies:
# Django 1.6 and up
csrftoken = client.cookies['csrftoken']
else:
# older versions
csrftoken = client.cookies['csrf']
payload["csrfmiddlewaretoken"] = csrftoken
payload["format"] = format
offset = 0
numFound = 10000
all_files = []
files_type = files_type.upper()
while offset < numFound:
payload["offset"] = offset
url_keys = []
for k in payload:
url_keys += ["{}={}".format(k, payload[k])]
url = "{}/?{}".format(server, "&".join(url_keys))
print(url)
r = client.get(url)
r.raise_for_status()
resp = r.json()["response"]
numFound = int(resp["numFound"])
resp = resp["docs"]
offset += len(resp)
for d in resp:
if verbose:
for k in d:
print("{}: {}".format(k,d[k]))
url = d["url"]
for f in d["url"]:
sp = f.split("|")
if sp[-1] == files_type:
all_files.append(sp[0].split(".html")[0])
return sorted(all_files)
result = esgf_search(activity_id='CMIP', table_id='Amon', variable_id='tas', experiment_id='historical',
institution_id="NCAR", source_id="CESM2", member_id="r10i1p1f1")
result
https://esgf-node.llnl.gov/esg-search/search/?activity_id=CMIP&table_id=Amon&variable_id=tas&experiment_id=historical&institution_id=NCAR&source_id=CESM2&member_id=r10i1p1f1&project=CMIP6&type=File&distrib=false&format=application%2Fsolr%2Bjson&offset=0
['http://aims3.llnl.gov/thredds/dodsC/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r10i1p1f1/Amon/tas/gn/v20190313/tas_Amon_CESM2_historical_r10i1p1f1_gn_185001-189912.nc', 'http://aims3.llnl.gov/thredds/dodsC/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r10i1p1f1/Amon/tas/gn/v20190313/tas_Amon_CESM2_historical_r10i1p1f1_gn_190001-194912.nc', 'http://aims3.llnl.gov/thredds/dodsC/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r10i1p1f1/Amon/tas/gn/v20190313/tas_Amon_CESM2_historical_r10i1p1f1_gn_195001-199912.nc', 'http://aims3.llnl.gov/thredds/dodsC/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r10i1p1f1/Amon/tas/gn/v20190313/tas_Amon_CESM2_historical_r10i1p1f1_gn_200001-201412.nc', 'http://esgf-data.ucar.edu/thredds/dodsC/esg_dataroot/CMIP6/CMIP/NCAR/CESM2/historical/r10i1p1f1/Amon/tas/gn/v20190313/tas_Amon_CESM2_historical_r10i1p1f1_gn_185001-189912.nc', 'http://esgf-data.ucar.edu/thredds/dodsC/esg_dataroot/CMIP6/CMIP/NCAR/CESM2/historical/r10i1p1f1/Amon/tas/gn/v20190313/tas_Amon_CESM2_historical_r10i1p1f1_gn_190001-194912.nc', 'http://esgf-data.ucar.edu/thredds/dodsC/esg_dataroot/CMIP6/CMIP/NCAR/CESM2/historical/r10i1p1f1/Amon/tas/gn/v20190313/tas_Amon_CESM2_historical_r10i1p1f1_gn_195001-199912.nc', 'http://esgf-data.ucar.edu/thredds/dodsC/esg_dataroot/CMIP6/CMIP/NCAR/CESM2/historical/r10i1p1f1/Amon/tas/gn/v20190313/tas_Amon_CESM2_historical_r10i1p1f1_gn_200001-201412.nc']
These are OPeNDAP endpoints. Xarray, together with the netCDF4 python library, allow lazy loading.
# there are mulitple sources of the same data--need to pick one
files_to_open = result[-4:]
ds = xr.open_mfdataset(files_to_open, combine='by_coords')
ds
/Users/rpa/Code/xarray/xarray/conventions.py:494: SerializationWarning: variable 'tas' has multiple fill values {1e+20, 1e+20}, decoding all values to NaN. use_cftime=use_cftime,
array([-90. , -89.057592, -88.115183, -87.172775, -86.230366, -85.287958, -84.34555 , -83.403141, -82.460733, -81.518325, -80.575916, -79.633508, -78.691099, -77.748691, -76.806283, -75.863874, -74.921466, -73.979058, -73.036649, -72.094241, -71.151832, -70.209424, -69.267016, -68.324607, -67.382199, -66.439791, -65.497382, -64.554974, -63.612565, -62.670157, -61.727749, -60.78534 , -59.842932, -58.900524, -57.958115, -57.015707, -56.073298, -55.13089 , -54.188482, -53.246073, -52.303665, -51.361257, -50.418848, -49.47644 , -48.534031, -47.591623, -46.649215, -45.706806, -44.764398, -43.82199 , -42.879581, -41.937173, -40.994764, -40.052356, -39.109948, -38.167539, -37.225131, -36.282723, -35.340314, -34.397906, -33.455497, -32.513089, -31.570681, -30.628272, -29.685864, -28.743455, -27.801047, -26.858639, -25.91623 , -24.973822, -24.031414, -23.089005, -22.146597, -21.204188, -20.26178 , -19.319372, -18.376963, -17.434555, -16.492147, -15.549738, -14.60733 , -13.664921, -12.722513, -11.780105, -10.837696, -9.895288, -8.95288 , -8.010471, -7.068063, -6.125654, -5.183246, -4.240838, -3.298429, -2.356021, -1.413613, -0.471204, 0.471204, 1.413613, 2.356021, 3.298429, 4.240838, 5.183246, 6.125654, 7.068063, 8.010471, 8.95288 , 9.895288, 10.837696, 11.780105, 12.722513, 13.664921, 14.60733 , 15.549738, 16.492147, 17.434555, 18.376963, 19.319372, 20.26178 , 21.204188, 22.146597, 23.089005, 24.031414, 24.973822, 25.91623 , 26.858639, 27.801047, 28.743455, 29.685864, 30.628272, 31.570681, 32.513089, 33.455497, 34.397906, 35.340314, 36.282723, 37.225131, 38.167539, 39.109948, 40.052356, 40.994764, 41.937173, 42.879581, 43.82199 , 44.764398, 45.706806, 46.649215, 47.591623, 48.534031, 49.47644 , 50.418848, 51.361257, 52.303665, 53.246073, 54.188482, 55.13089 , 56.073298, 57.015707, 57.958115, 58.900524, 59.842932, 60.78534 , 61.727749, 62.670157, 63.612565, 64.554974, 65.497382, 66.439791, 67.382199, 68.324607, 69.267016, 70.209424, 71.151832, 72.094241, 73.036649, 73.979058, 74.921466, 75.863874, 76.806283, 77.748691, 78.691099, 79.633508, 80.575916, 81.518325, 82.460733, 83.403141, 84.34555 , 85.287958, 86.230366, 87.172775, 88.115183, 89.057592, 90. ])
array([ 0. , 1.25, 2.5 , ..., 356.25, 357.5 , 358.75])
array([cftime.DatetimeNoLeap(1850, 1, 15, 12, 0, 0, 0, 2, 15), cftime.DatetimeNoLeap(1850, 2, 14, 0, 0, 0, 0, 4, 45), cftime.DatetimeNoLeap(1850, 3, 15, 12, 0, 0, 0, 5, 74), ..., cftime.DatetimeNoLeap(2014, 10, 15, 12, 0, 0, 0, 5, 288), cftime.DatetimeNoLeap(2014, 11, 15, 0, 0, 0, 0, 1, 319), cftime.DatetimeNoLeap(2014, 12, 15, 12, 0, 0, 0, 3, 349)], dtype=object)
|
|
|
|
Plot a map from a specific date.
ds.tas.sel(time='1950-01').squeeze().plot()
<matplotlib.collections.QuadMesh at 0x11dd89908>
Create a timeseries of global-average surface air temperature. For this we need the area weighting factor for each gridpoint.
files_area = esgf_search(variable_id='areacella', activity_id='CMIP',
experiment_id='historical', institution_id="NCAR", source_id="CESM2")
ds_area = xr.open_dataset(files_area[0])
ds_area
https://esgf-node.llnl.gov/esg-search/search/?variable_id=areacella&activity_id=CMIP&experiment_id=historical&institution_id=NCAR&source_id=CESM2&project=CMIP6&type=File&distrib=false&format=application%2Fsolr%2Bjson&offset=0 https://esgf-node.llnl.gov/esg-search/search/?variable_id=areacella&activity_id=CMIP&experiment_id=historical&institution_id=NCAR&source_id=CESM2&project=CMIP6&type=File&distrib=false&format=application%2Fsolr%2Bjson&offset=10 https://esgf-node.llnl.gov/esg-search/search/?variable_id=areacella&activity_id=CMIP&experiment_id=historical&institution_id=NCAR&source_id=CESM2&project=CMIP6&type=File&distrib=false&format=application%2Fsolr%2Bjson&offset=20
/Users/rpa/Code/xarray/xarray/conventions.py:494: SerializationWarning: variable 'areacella' has multiple fill values {1e+20, 1e+20}, decoding all values to NaN. use_cftime=use_cftime,
array([-90. , -89.057592, -88.115183, -87.172775, -86.230366, -85.287958, -84.34555 , -83.403141, -82.460733, -81.518325, -80.575916, -79.633508, -78.691099, -77.748691, -76.806283, -75.863874, -74.921466, -73.979058, -73.036649, -72.094241, -71.151832, -70.209424, -69.267016, -68.324607, -67.382199, -66.439791, -65.497382, -64.554974, -63.612565, -62.670157, -61.727749, -60.78534 , -59.842932, -58.900524, -57.958115, -57.015707, -56.073298, -55.13089 , -54.188482, -53.246073, -52.303665, -51.361257, -50.418848, -49.47644 , -48.534031, -47.591623, -46.649215, -45.706806, -44.764398, -43.82199 , -42.879581, -41.937173, -40.994764, -40.052356, -39.109948, -38.167539, -37.225131, -36.282723, -35.340314, -34.397906, -33.455497, -32.513089, -31.570681, -30.628272, -29.685864, -28.743455, -27.801047, -26.858639, -25.91623 , -24.973822, -24.031414, -23.089005, -22.146597, -21.204188, -20.26178 , -19.319372, -18.376963, -17.434555, -16.492147, -15.549738, -14.60733 , -13.664921, -12.722513, -11.780105, -10.837696, -9.895288, -8.95288 , -8.010471, -7.068063, -6.125654, -5.183246, -4.240838, -3.298429, -2.356021, -1.413613, -0.471204, 0.471204, 1.413613, 2.356021, 3.298429, 4.240838, 5.183246, 6.125654, 7.068063, 8.010471, 8.95288 , 9.895288, 10.837696, 11.780105, 12.722513, 13.664921, 14.60733 , 15.549738, 16.492147, 17.434555, 18.376963, 19.319372, 20.26178 , 21.204188, 22.146597, 23.089005, 24.031414, 24.973822, 25.91623 , 26.858639, 27.801047, 28.743455, 29.685864, 30.628272, 31.570681, 32.513089, 33.455497, 34.397906, 35.340314, 36.282723, 37.225131, 38.167539, 39.109948, 40.052356, 40.994764, 41.937173, 42.879581, 43.82199 , 44.764398, 45.706806, 46.649215, 47.591623, 48.534031, 49.47644 , 50.418848, 51.361257, 52.303665, 53.246073, 54.188482, 55.13089 , 56.073298, 57.015707, 57.958115, 58.900524, 59.842932, 60.78534 , 61.727749, 62.670157, 63.612565, 64.554974, 65.497382, 66.439791, 67.382199, 68.324607, 69.267016, 70.209424, 71.151832, 72.094241, 73.036649, 73.979058, 74.921466, 75.863874, 76.806283, 77.748691, 78.691099, 79.633508, 80.575916, 81.518325, 82.460733, 83.403141, 84.34555 , 85.287958, 86.230366, 87.172775, 88.115183, 89.057592, 90. ])
array([ 0. , 1.25, 2.5 , ..., 356.25, 357.5 , 358.75])
array([[-90. , -89.528796], [-89.528796, -88.586387], [-88.586387, -87.643979], ..., [ 87.643979, 88.586387], [ 88.586387, 89.528796], [ 89.528796, 90. ]])
array([[ -0.625, 0.625], [ 0.625, 1.875], [ 1.875, 3.125], ..., [355.625, 356.875], [356.875, 358.125], [358.125, 359.375]])
array([[2.994837e+07, 2.994837e+07, 2.994837e+07, ..., 2.994837e+07, 2.994837e+07, 2.994837e+07], [2.395748e+08, 2.395748e+08, 2.395748e+08, ..., 2.395748e+08, 2.395748e+08, 2.395748e+08], [4.790848e+08, 4.790848e+08, 4.790848e+08, ..., 4.790848e+08, 4.790848e+08, 4.790848e+08], ..., [4.790848e+08, 4.790848e+08, 4.790848e+08, ..., 4.790848e+08, 4.790848e+08, 4.790848e+08], [2.395748e+08, 2.395748e+08, 2.395748e+08, ..., 2.395748e+08, 2.395748e+08, 2.395748e+08], [2.994837e+07, 2.994837e+07, 2.994837e+07, ..., 2.994837e+07, 2.994837e+07, 2.994837e+07]], dtype=float32)
total_area = ds_area.areacella.sum(dim=['lon', 'lat'])
ta_timeseries = (ds.tas * ds_area.areacella).sum(dim=['lon', 'lat']) / total_area
ta_timeseries
|
array([cftime.DatetimeNoLeap(1850, 1, 15, 12, 0, 0, 0, 2, 15), cftime.DatetimeNoLeap(1850, 2, 14, 0, 0, 0, 0, 4, 45), cftime.DatetimeNoLeap(1850, 3, 15, 12, 0, 0, 0, 5, 74), ..., cftime.DatetimeNoLeap(2014, 10, 15, 12, 0, 0, 0, 5, 288), cftime.DatetimeNoLeap(2014, 11, 15, 0, 0, 0, 0, 1, 319), cftime.DatetimeNoLeap(2014, 12, 15, 12, 0, 0, 0, 3, 349)], dtype=object)
By default the data are loaded lazily, as Dask arrays. Here we trigger computation explicitly.
%time ta_timeseries.load()
CPU times: user 4.93 s, sys: 6.73 s, total: 11.7 s Wall time: 1min 35s
array([284.99948, 285.23215, 285.85364, ..., 288.54376, 287.61884, 287.06284], dtype=float32)
array([cftime.DatetimeNoLeap(1850, 1, 15, 12, 0, 0, 0, 2, 15), cftime.DatetimeNoLeap(1850, 2, 14, 0, 0, 0, 0, 4, 45), cftime.DatetimeNoLeap(1850, 3, 15, 12, 0, 0, 0, 5, 74), ..., cftime.DatetimeNoLeap(2014, 10, 15, 12, 0, 0, 0, 5, 288), cftime.DatetimeNoLeap(2014, 11, 15, 0, 0, 0, 0, 1, 319), cftime.DatetimeNoLeap(2014, 12, 15, 12, 0, 0, 0, 3, 349)], dtype=object)
ta_timeseries.plot(label='monthly')
ta_timeseries.rolling(time=12).mean().plot(label='12 month rolling mean')
plt.legend()
plt.title('Global Mean Surface Air Temperature')
ds.nbytes / 1e6 / 40