Read netCDF files with xarray and write them to HSDS using h5pyd. Reading with xarray is useful because: (1) data too big for memory can be handled by dask; (2) it can separate out coordinate and data variables; and (3) it can read all types of netCDF files, not just netCDF4, and has no problems with netCDF attributes.
import xarray as xr
import numpy as np
import h5pyd as h5py
# Source netCDF file and the destination passed to h5pyd.
infile = '/notebooks/rsignell/data/CFSR/tiny.nc'
outfile = '/home/rsignell/tiny_rechunked.nc'

# Keep the time coordinate as raw numbers: decoding it would produce
# datetime64 values, which are not wanted in the output.
ds = xr.open_dataset(
    infile,
    decode_times=False,
    decode_cf=True,
)

# Open the destination for writing.
f = h5py.File(outfile, mode='w')
# Copy the global (file-level) attributes of the source dataset onto the
# root of the output file.
for key, val in ds.attrs.items():
    if isinstance(val, str):
        # Strings are stored through plain item assignment.
        f.attrs[key] = val
    else:
        # np.asarray supplies a dtype and shape for every non-string value:
        # numpy scalars keep their dtype and scalar shape, while plain
        # Python numbers, lists and array-valued attributes (which have no
        # .dtype or do not fit a forced scalar shape) are handled as well.
        f.attrs.create(key, np.asarray(val))
# Report the current layout of every variable: name, shape and .chunks
# (.chunks is None for a variable that is not chunked).
for name, var in ds.variables.items():
    print(name, var.shape, var.chunks)
TMP_2maboveground (12, 880, 1760) None latitude (880,) None longitude (1760,) None time (12,) None
# Record the desired chunk shape under the 'chunks' key of the variable's
# attrs; do this only for the variables you want to rechunk.
ds['TMP_2maboveground'].attrs.update({'chunks': (4, 220, 440)})
# Write every variable (coordinates and data variables alike) to the output
# file, then copy its attributes onto the newly created dataset.
for key, val in ds.variables.items():
    print('\nVariable: {}'.format(key))

    # A chunk shape requested through attrs['chunks'] must be handed to
    # create_dataset explicitly. val.chunks is None for a variable that is
    # not chunked, so passing it alone would silently drop the request.
    requested = val.attrs.get('chunks')
    chunk_shape = val.chunks if requested is None else tuple(requested)

    dset = f.create_dataset(key, data=val.data, chunks=chunk_shape)

    for k, v in val.attrs.items():
        if k == 'chunks':
            # Storage hint consumed above, not metadata of the variable,
            # so it is not copied to the output as an attribute.
            continue
        print('{} = {}'.format(k, v))
        if isinstance(v, str):
            dset.attrs[k] = v
        else:
            # np.array supplies dtype and shape for non-string values.
            dset.attrs.create(k, np.array(v))
Variable TMP_2maboveground: short_name=TMP_2maboveground long_name=Temperature level=2 m above ground units=K chunks=(4, 220, 440) Variable latitude: units=degrees_north long_name=latitude Variable longitude: units=degrees_east long_name=longitude Variable time: units=seconds since 1970-01-01 00:00:00.0 0:00 long_name=verification time generated by wgrib2 function verftime() reference_time=1483228800.0 reference_time_type=0 reference_date=2017.01.01 00:00:00 UTC reference_time_description=kind of product unclear, reference date is variable, min found reference date is given time_step_setting=auto time_step=3600.0
# Creating dimension scales: turn the dataset named after each dimension
# into a dimension scale.
for key in ds.dims:
    # Look the dataset up explicitly and call create_scale through its own
    # dims manager, rather than through whatever dataset an earlier loop
    # happened to leave bound to a variable.
    scale = f['/{}'.format(key)]
    scale.dims.create_scale(scale)
# Attach the matching dimension scale to each axis of every data variable,
# e.g. f['/TMP_2maboveground'].dims[0].attach_scale(f['/time'])
for var_name, var in ds.data_vars.items():
    var_path = '/' + '{}'.format(var_name)
    for axis, dim_name in enumerate(var.dims):
        scale_path = '/' + '{}'.format(dim_name)
        print(var_path, axis, scale_path)
        target_axis = f[var_path].dims[axis]
        target_axis.attach_scale(f[scale_path])
/TMP_2maboveground 0 /time /TMP_2maboveground 1 /latitude /TMP_2maboveground 2 /longitude
# Release the input dataset first, then the output file.
for handle in (ds, f):
    handle.close()