Authors: Mackenzie Blanusa, A.Radhakrishnan
from glob import glob
import xarray as xr
import cftime
import nc_time_axis
import numpy as np
import matplotlib.pyplot as plt
import intake, intake_esm
from dask_gateway import Gateway
import pandas as pd
pd.set_option("display.max_colwidth", None)
#!pip install cmip6_preprocessing
%pip install git+https://github.com/jbusecke/cmip6_preprocessing.git
from cmip6_preprocessing.preprocessing import combined_preprocessing
from cmip6_preprocessing.preprocessing import (correct_units,rename_cmip6)
def latest_version(cat):
    """Keep only the latest-version row of every dataset in a catalog.

    The rows of ``cat.df`` are sorted by ``version`` and then ``path``; for
    each combination of the identifying columns listed below, only the last
    row of that ordering is kept.

    Parameters
    ----------
    cat : esmdatastore
        Catalog whose contents are exposed as a pandas DataFrame in ``cat.df``.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated rows. This is a DataFrame, not an esmdatastore, so
        the caller has to wrap it in a datastore again if one is needed.
        ``cat.df`` itself is left unmodified.
    """
    # Columns that together identify one dataset regardless of its version.
    identity_columns = [
        'temporal subset', 'model', 'mip_table', 'institute',
        'variable', 'ensemble_member', 'grid_label', 'experiment_id',
    ]
    # NOTE(review): "latest" relies on the version values sorting in
    # chronological order (plain sort of the column) - confirm for the catalog.
    ordered = cat.df.sort_values(by=['version', 'path'])
    return ordered.drop_duplicates(identity_columns, keep='last')
def fix_time(ds):
    """Return a copy of ``ds`` with its time axis decoded as a noleap calendar.

    A dataset without a ``time`` dimension is returned as an unchanged copy.
    Otherwise the ``calendar`` attribute of ``time`` is set to ``"noleap"``
    unless it already is one of ``"noleap"``, ``"NOLEAP"`` or ``"365_day"``,
    and the dataset is then passed through ``xr.decode_cf``.
    """
    import xarray as xr

    result = ds.copy()
    if "time" in result.dims:
        # A missing calendar and a calendar outside the accepted spellings
        # are handled the same way: both end up as "noleap".
        accepted = ["noleap", "NOLEAP", "365_day"]
        if result["time"].attrs.get("calendar") not in accepted:
            result["time"].attrs["calendar"] = "noleap"
        result = xr.decode_cf(result)
    return result
def fix_units(ds):
    """Convert the ``lev`` coordinate from centimeters to meters if required.

    When ``ds`` has a ``lev`` variable whose ``units`` attribute is ``"cm"``
    or ``"centimeters"``, its values are divided by 100 and its ``units``
    attribute is set to ``"m"``; all other attributes of ``lev`` are kept.
    A dataset without ``lev``, without a ``units`` attribute on ``lev``, or
    with any other unit is returned untouched.

    Note that the conversion assigns to ``ds["lev"]``, i.e. the dataset that
    was passed in is modified and that same object is returned.
    """
    # Datasets without a vertical coordinate have nothing to convert.
    if "lev" not in ds:
        return ds
    lev = ds["lev"]
    if lev.attrs.get("units") in ["cm", "centimeters"]:
        # Carry the existing attributes over and record the new unit; building
        # the replacement without attrs would silently discard all of them.
        converted_attrs = dict(lev.attrs, units="m")
        ds["lev"] = xr.DataArray(
            lev.values / 100., dims=lev.dims, attrs=converted_attrs
        )
    return ds
def pp_thetao(ds):
    """Preprocess a single thetao dataset.

    Applies, in this order, to a copy of ``ds``: ``rename_cmip6``,
    ``fix_time`` and ``correct_units``, and returns the result.
    """
    # Work on a copy of the incoming dataset rather than on the dataset itself.
    renamed = rename_cmip6(ds.copy())
    decoded = fix_time(renamed)
    # fix_units is not part of this pipeline; correct_units is applied instead.
    return correct_units(decoded)
# Location of the JSON description of the "esgf-world" collection.
col_url = "https://cmip6-nc.s3.us-east-2.amazonaws.com/esgf-world.json"
# Open the collection as an ESM datastore; `col` is the unfiltered catalog
# that the searches below start from.
col = intake.open_esm_datastore(col_url)
debug starts
# Search facets for the debug case: monthly ocean potential temperature
# (thetao, Omon table) of one model, one ensemble member, native grid,
# historical experiment.
query = {
    "experiment_id": ["historical"],
    "mip_table": "Omon",
    "ensemble_member": ["r1i1p1f1"],
    "model": ["IPSL-CM6A-LR"],
    "grid_label": ["gn"],
    "variable": ["thetao"],
}
# Restrict the full catalog to the entries matching the query facets.
cat_T = col.search(**query)
WHAT DOES NOT WORK:
The following call is missing olevel_bounds (renamed to lev_bounds by the preprocessing) and other variables after preprocessing. Without preprocessing, the datasets keep the original dimension/variable names found in the source files/objects.
# Load the search results directly from cat_T into a dictionary of datasets,
# running pp_thetao as the preprocess callback.
# Time decoding is switched off in cdf_kwargs (fix_time, called from
# pp_thetao, runs xr.decode_cf itself); chunks are requested along 'time' and
# along the original vertical dimension name 'olevel'.
# storage_options={'anon': True}: presumably anonymous (unauthenticated)
# access to the object store - confirm.
# NOTE(review): this call differs from the working call further down in two
# ways - the catalog (cat_T vs. the latest-version catalog) and the extra
# 'olevel' chunking. Which of the two causes the missing variables is not
# established here.
dset_dict_T_orig = cat_T.to_dataset_dict(cdf_kwargs={'decode_times': False, 'chunks': {'time': 1,'olevel':1}},
preprocess = pp_thetao,storage_options={'anon':True})
--> The keys in the returned dictionary of datasets are constructed as follows: 'project.institute.model.experiment_id.mip_table'
# For every loaded dataset, print its key and, on the next line, the names of
# its dimensions.
for k, ds in dset_dict_T_orig.items():
    print(k, list(ds.dims), sep="\n")
CMIP6.IPSL.IPSL-CM6A-LR.historical.Omon ['ensemble_member', 'lev', 'time', 'x', 'y']
# Display the dataset loaded from the unfiltered search results (notebook
# cell output follows).
dset_dict_T_orig['CMIP6.IPSL.IPSL-CM6A-LR.historical.Omon']
<xarray.Dataset> Dimensions: (ensemble_member: 1, lev: 75, time: 1980, x: 362, y: 332) Coordinates: lat (y, x) float32 dask.array<chunksize=(332, 362), meta=np.ndarray> lon (y, x) float32 dask.array<chunksize=(332, 362), meta=np.ndarray> * lev (lev) float32 0.50576 1.5558553 ... 5698.0605 5902.0576 * time (time) object 1850-01-16 12:00:00 ... 2015-01-25 12:00:00 * ensemble_member (ensemble_member) <U8 'r1i1p1f1' Dimensions without coordinates: x, y Data variables: thetao (ensemble_member, time, lev, y, x) float32 dask.array<chunksize=(1, 1, 1, 332, 362), meta=np.ndarray> Attributes: title: IPSL-CM6A-LR model output prepared for CMIP6 / C... intake_esm_varname: ['thetao'] source: IPSL-CM6A-LR (2017): atmos: LMDZ (NPv6, N96; 14... institution_id: IPSL history: Sat Dec 1 12:16:38 2018: ncatted -O -a realizat... physics_index: [1] parent_variant_label: r1i1p1f1 parent_experiment_id: piControl branch_method: standard grid: native ocean tri-polar grid with 105 k ocean cells realization_index: [1] parent_source_id: IPSL-CM6A-LR sub_experiment_id: none model_version: 6.1.5 variant_label: r1i1p1f1 sub_experiment: none branch_time_in_parent: [21914.] forcing_index: [1] initialization_index: [1] dr2xml_md5sum: f1e40c1fc5d8281f865f72fbf4e38f9d license: CMIP6 model data produced by IPSL is licensed un... EXPID: historical grid_label: gn Conventions: CF-1.7 CMIP-6.2 source_id: IPSL-CM6A-LR description: CMIP6 historical institution: Institut Pierre Simon Laplace, Paris 75252, France experiment: all-forcing simulation of the recent past frequency: mon activity_id: CMIP parent_activity_id: CMIP contact: ipsl-cmip6@listes.ipsl.fr realm: ocean source_type: AOGCM BGC data_specs_version: 01.00.21 further_info_url: https://furtherinfo.es-doc.org/CMIP6.IPSL.IPSL-C... 
dr2xml_version: 1.11 variable_id: thetao parent_time_units: days since 1850-01-01 00:00:00 parent_mip_era: CMIP6 CMIP6_CV_version: cv=6.2.3.5-2-g63b123e product: model-output NCO: "4.6.0" experiment_id: historical branch_time_in_child: [0.] nominal_resolution: 100 km tracking_id: hdl:21.14100/2357970e-3f77-4595-80d8-e3d5c69d0bd... table_id: Omon external_variables: areacello volcello mip_era: CMIP6 name: /ccc/work/cont003/gencmip6/p86caub/IGCM_OUT/IPSL... intake_esm_dataset_key: CMIP6.IPSL.IPSL-CM6A-LR.historical.Omon
|
|
array([5.057600e-01, 1.555855e+00, 2.667682e+00, 3.856280e+00, 5.140361e+00, 6.543034e+00, 8.092519e+00, 9.822750e+00, 1.177368e+01, 1.399104e+01, 1.652532e+01, 1.942980e+01, 2.275762e+01, 2.655830e+01, 3.087456e+01, 3.574020e+01, 4.118002e+01, 4.721189e+01, 5.385064e+01, 6.111284e+01, 6.902168e+01, 7.761116e+01, 8.692943e+01, 9.704131e+01, 1.080303e+02, 1.200000e+02, 1.330758e+02, 1.474062e+02, 1.631645e+02, 1.805499e+02, 1.997900e+02, 2.211412e+02, 2.448906e+02, 2.713564e+02, 3.008875e+02, 3.338628e+02, 3.706885e+02, 4.117939e+02, 4.576256e+02, 5.086399e+02, 5.652923e+02, 6.280260e+02, 6.972587e+02, 7.733683e+02, 8.566790e+02, 9.474479e+02, 1.045854e+03, 1.151991e+03, 1.265861e+03, 1.387377e+03, 1.516364e+03, 1.652568e+03, 1.795671e+03, 1.945296e+03, 2.101027e+03, 2.262422e+03, 2.429025e+03, 2.600380e+03, 2.776039e+03, 2.955570e+03, 3.138565e+03, 3.324641e+03, 3.513446e+03, 3.704657e+03, 3.897982e+03, 4.093159e+03, 4.289953e+03, 4.488155e+03, 4.687581e+03, 4.888070e+03, 5.089479e+03, 5.291683e+03, 5.494575e+03, 5.698061e+03, 5.902058e+03], dtype=float32)
array([cftime.DatetimeNoLeap(1850, 1, 16, 12, 0, 0, 0), cftime.DatetimeNoLeap(1850, 2, 15, 0, 0, 0, 0), cftime.DatetimeNoLeap(1850, 3, 16, 12, 0, 0, 0), ..., cftime.DatetimeNoLeap(2014, 11, 25, 12, 0, 0, 0), cftime.DatetimeNoLeap(2014, 12, 26, 0, 0, 0, 0), cftime.DatetimeNoLeap(2015, 1, 25, 12, 0, 0, 0)], dtype=object)
array(['r1i1p1f1'], dtype='<U8')
|
WHAT WORKS:
For some reason the following works and includes all data variables. latest_version returns a pandas DataFrame, which we then convert back into an ESM datastore.
# Reduce the search results to the latest version of each dataset; the result
# is a pandas DataFrame, not a datastore.
cat_T_gn_latest = latest_version(cat_T)
# Take the collection description from the original catalog so it can be
# reused for the new datastore.
esmcol_data = col.esmcol_data
# Wrap the filtered DataFrame in a new ESM datastore.
cat_T2 = intake.open_esm_datastore(cat_T_gn_latest,esmcol_data=esmcol_data)
# Load the datasets with pp_thetao as preprocess callback; compared with the
# failing call above, the catalog is the latest-version one and chunks are
# requested along 'time' only.
dset_dict_T = cat_T2.to_dataset_dict(cdf_kwargs={'decode_times': False, 'chunks': {'time': 1}},
preprocess = pp_thetao,storage_options={'anon':True})
--> The keys in the returned dictionary of datasets are constructed as follows: 'project.institute.model.experiment_id.mip_table'
# Display the dataset loaded from the latest-version catalog (notebook cell
# output follows).
dset_dict_T['CMIP6.IPSL.IPSL-CM6A-LR.historical.Omon']
<xarray.Dataset> Dimensions: (bnds: 2, ensemble_member: 1, lev: 75, time: 1980, vertex: 4, x: 362, y: 332) Coordinates: lat (y, x) float32 dask.array<chunksize=(332, 362), meta=np.ndarray> lon (y, x) float32 dask.array<chunksize=(332, 362), meta=np.ndarray> * lev (lev) float32 0.50576 1.5558553 ... 5698.0605 5902.0576 * time (time) object 1850-01-16 12:00:00 ... 2015-01-25 12:00:00 * ensemble_member (ensemble_member) <U8 'r1i1p1f1' Dimensions without coordinates: bnds, vertex, x, y Data variables: lon_bounds (y, x, vertex) float32 dask.array<chunksize=(332, 362, 4), meta=np.ndarray> lat_bounds (y, x, vertex) float32 dask.array<chunksize=(332, 362, 4), meta=np.ndarray> area (y, x) float32 dask.array<chunksize=(332, 362), meta=np.ndarray> lev_bounds (lev, bnds) float32 dask.array<chunksize=(75, 2), meta=np.ndarray> time_bounds (time, bnds) object dask.array<chunksize=(1, 2), meta=np.ndarray> thetao (ensemble_member, time, lev, y, x) float32 dask.array<chunksize=(1, 1, 75, 332, 362), meta=np.ndarray> Attributes: title: IPSL-CM6A-LR model output prepared for CMIP6 / C... intake_esm_varname: ['thetao'] source: IPSL-CM6A-LR (2017): atmos: LMDZ (NPv6, N96; 14... institution_id: IPSL history: Sat Dec 1 12:16:38 2018: ncatted -O -a realizat... physics_index: [1] parent_variant_label: r1i1p1f1 parent_experiment_id: piControl branch_method: standard grid: native ocean tri-polar grid with 105 k ocean cells realization_index: [1] parent_source_id: IPSL-CM6A-LR sub_experiment_id: none model_version: 6.1.5 variant_label: r1i1p1f1 sub_experiment: none branch_time_in_parent: [21914.] forcing_index: [1] initialization_index: [1] dr2xml_md5sum: f1e40c1fc5d8281f865f72fbf4e38f9d license: CMIP6 model data produced by IPSL is licensed un... 
EXPID: historical grid_label: gn Conventions: CF-1.7 CMIP-6.2 source_id: IPSL-CM6A-LR description: CMIP6 historical institution: Institut Pierre Simon Laplace, Paris 75252, France experiment: all-forcing simulation of the recent past frequency: mon activity_id: CMIP parent_activity_id: CMIP contact: ipsl-cmip6@listes.ipsl.fr realm: ocean source_type: AOGCM BGC data_specs_version: 01.00.21 further_info_url: https://furtherinfo.es-doc.org/CMIP6.IPSL.IPSL-C... dr2xml_version: 1.11 variable_id: thetao parent_time_units: days since 1850-01-01 00:00:00 parent_mip_era: CMIP6 CMIP6_CV_version: cv=6.2.3.5-2-g63b123e product: model-output NCO: "4.6.0" experiment_id: historical branch_time_in_child: [0.] nominal_resolution: 100 km tracking_id: hdl:21.14100/2357970e-3f77-4595-80d8-e3d5c69d0bd... table_id: Omon external_variables: areacello volcello mip_era: CMIP6 name: /ccc/work/cont003/gencmip6/p86caub/IGCM_OUT/IPSL... intake_esm_dataset_key: CMIP6.IPSL.IPSL-CM6A-LR.historical.Omon
|
|
array([5.057600e-01, 1.555855e+00, 2.667682e+00, 3.856280e+00, 5.140361e+00, 6.543034e+00, 8.092519e+00, 9.822750e+00, 1.177368e+01, 1.399104e+01, 1.652532e+01, 1.942980e+01, 2.275762e+01, 2.655830e+01, 3.087456e+01, 3.574020e+01, 4.118002e+01, 4.721189e+01, 5.385064e+01, 6.111284e+01, 6.902168e+01, 7.761116e+01, 8.692943e+01, 9.704131e+01, 1.080303e+02, 1.200000e+02, 1.330758e+02, 1.474062e+02, 1.631645e+02, 1.805499e+02, 1.997900e+02, 2.211412e+02, 2.448906e+02, 2.713564e+02, 3.008875e+02, 3.338628e+02, 3.706885e+02, 4.117939e+02, 4.576256e+02, 5.086399e+02, 5.652923e+02, 6.280260e+02, 6.972587e+02, 7.733683e+02, 8.566790e+02, 9.474479e+02, 1.045854e+03, 1.151991e+03, 1.265861e+03, 1.387377e+03, 1.516364e+03, 1.652568e+03, 1.795671e+03, 1.945296e+03, 2.101027e+03, 2.262422e+03, 2.429025e+03, 2.600380e+03, 2.776039e+03, 2.955570e+03, 3.138565e+03, 3.324641e+03, 3.513446e+03, 3.704657e+03, 3.897982e+03, 4.093159e+03, 4.289953e+03, 4.488155e+03, 4.687581e+03, 4.888070e+03, 5.089479e+03, 5.291683e+03, 5.494575e+03, 5.698061e+03, 5.902058e+03], dtype=float32)
array([cftime.DatetimeNoLeap(1850, 1, 16, 12, 0, 0, 0), cftime.DatetimeNoLeap(1850, 2, 15, 0, 0, 0, 0), cftime.DatetimeNoLeap(1850, 3, 16, 12, 0, 0, 0), ..., cftime.DatetimeNoLeap(2014, 11, 25, 12, 0, 0, 0), cftime.DatetimeNoLeap(2014, 12, 26, 0, 0, 0, 0), cftime.DatetimeNoLeap(2015, 1, 25, 12, 0, 0, 0)], dtype=object)
array(['r1i1p1f1'], dtype='<U8')
|
|
|
|
|
|
debug ends