Convert a NetCDF4 file to HSDS, with custom chunking

Read the NetCDF4 file with xarray so that we can access its variable and global attributes.

In [1]:
import xarray as xr
import numpy as np
import h5pyd as h5py
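
Note that h5pyd is imported under the name h5py: the familiar h5py-style calls below talk to an HSDS server instead of writing a local HDF5 file, and outfile is an HSDS domain path rather than a POSIX filename. h5pyd reads the server endpoint and credentials from ~/.hscfg (or the HS_ENDPOINT, HS_USERNAME and HS_PASSWORD environment variables); they can also be passed explicitly. A sketch with placeholder values, not the settings used here:

In [ ]:
# Placeholder endpoint/credentials -- adjust for your HSDS server
# f = h5py.File(outfile, 'w', endpoint='http://localhost:5101',
#               username='myuser', password='mypass')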

In [2]:
infile = '/notebooks/rsignell/data/CFSR/tmp2m_2months.nc'
outfile = '/home/rsignell/tmp2m_2months_rechunked.nc'
#outfile = '/notebooks/rsignell/data/CFSR/foo.nc'
In [3]:
ds = xr.open_dataset(infile, decode_cf=False)  # keep raw values and CF attributes as stored
In [4]:
ds.variables
Out[4]:
Frozen(OrderedDict([('latitude', <xarray.IndexVariable 'latitude' (latitude: 880)>
array([-89.843515, -89.640798, -89.436886, ...,  89.436886,  89.640798,
        89.843515])
Attributes:
    units:      degrees_north
    long_name:  latitude), ('longitude', <xarray.IndexVariable 'longitude' (longitude: 1760)>
array([  0.000000e+00,   2.045452e-01,   4.090904e-01, ...,   3.593859e+02,
         3.595905e+02,   3.597950e+02])
Attributes:
    units:      degrees_east
    long_name:  longitude), ('time', <xarray.IndexVariable 'time' (time: 1416)>
array([  1.483232e+09,   1.483236e+09,   1.483240e+09, ...,   1.488319e+09,
         1.488323e+09,   1.488326e+09])
Attributes:
    units:                       seconds since 1970-01-01 00:00:00.0 0:00
    long_name:                   verification time generated by wgrib2 functi...
    reference_time:              1483228800.0
    reference_time_type:         0
    reference_date:              2017.01.01 00:00:00 UTC
    reference_time_description:  kind of product unclear, reference date is v...
    time_step_setting:           auto
    time_step:                   3600.0), ('TMP_2maboveground', <xarray.Variable (time: 1416, latitude: 880, longitude: 1760)>
[2193100800 values with dtype=float32]
Attributes:
    _FillValue:  9.999e+20
    short_name:  TMP_2maboveground
    long_name:   Temperature
    level:       2 m above ground
    units:       K)]))
In [5]:
f = h5py.File(outfile, 'w')
In [6]:
# Copy the global (file-level) attributes, preserving numeric dtypes
for key, val in ds.attrs.items():
    if isinstance(val, str):
        f.attrs[key] = val
    else:
        f.attrs.create(key, val, (), dtype=val.dtype)
In [7]:
f.attrs['Conventions']
Out[7]:
'COARDS'
In [8]:
ds['latitude'].data.shape
Out[8]:
(880,)
In [9]:
ds['latitude'].data.dtype
Out[9]:
dtype('float64')
In [10]:
# xarray's .chunks reflects dask chunking, so these all report None
for key, val in ds.variables.items():
    print(key, val.shape, val.chunks)
latitude (880,) None
longitude (1760,) None
time (1416,) None
TMP_2maboveground (1416, 880, 1760) None

The chunks reported above are all None because the data are not dask-backed, so the HDF5 storage layout is ours to choose. Just specify chunk shapes for those variables that need rechunking, collected in a dict keyed by variable name (see the size check below); the rest will get a default layout.

In [ ]:
chunks = {'TMP_2maboveground': (4, 220, 440)}
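
Each (4, 220, 440) chunk of float32 values is about 1.5 MB, within the 1-4 MB range that works well for chunked cloud/HSDS storage (the target range is a rule of thumb, not from the original notebook):

In [ ]:
# bytes per chunk: 4 * 220 * 440 values x 4 bytes per float32
np.prod((4, 220, 440)) * np.dtype('float32').itemsize / 1e6  # ~1.55 MB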
In [ ]:
for key, val in ds.variables.items():
    print(key)
    # use the custom chunk shape where one was specified;
    # chunks=None lets the library pick a default layout
    dset = f.create_dataset(key, data=val.data, chunks=chunks.get(key))
    # copy the variable attributes, preserving numeric types
    for k, v in val.attrs.items():
        print(k, v)
        if isinstance(v, str):
            dset.attrs[k] = v
        else:
            dset.attrs.create(k, np.array(v))
latitude
units degrees_north
long_name latitude
longitude
units degrees_east
long_name longitude
time
units seconds since 1970-01-01 00:00:00.0 0:00
long_name verification time generated by wgrib2 function verftime()
reference_time 1483228800.0
reference_time_type 0
reference_date 2017.01.01 00:00:00 UTC
reference_time_description kind of product unclear, reference date is variable, min found reference date is given
time_step_setting auto
time_step 3600.0
TMP_2maboveground
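
The written dataset should now report the custom layout (a quick check; assumes h5pyd's Dataset exposes .chunks the way h5py does):

In [ ]:
f['TMP_2maboveground'].chunks  # expect (4, 220, 440)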
In [ ]:
# Mark the coordinate variables as dimension scales.
# make_scale is the h5py >= 2.9 API; older h5py/h5pyd releases
# used dset.dims.create_scale(...) instead.
f['/time'].make_scale('time')
f['/latitude'].make_scale('latitude')
f['/longitude'].make_scale('longitude')

# Attach the dimension scales to the data variable /TMP_2maboveground
f['/TMP_2maboveground'].dims[0].attach_scale(f['/time'])
f['/TMP_2maboveground'].dims[1].attach_scale(f['/latitude'])
f['/TMP_2maboveground'].dims[2].attach_scale(f['/longitude'])

f.close()
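
Finally, reopen the domain read-only to confirm the scales are attached (a minimal sketch; assumes h5pyd mirrors h5py's dimension-scale introspection):

In [ ]:
f = h5py.File(outfile, 'r')
for i, dim in enumerate(f['TMP_2maboveground'].dims):
    # each dimension should have exactly one scale attached
    print(i, dim[0].name if len(dim) > 0 else None)
f.close()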