Create ReferenceFileSystem JSON file for a collection of NWM NetCDF files on S3
import os
import fsspec
import ujson # fast json
from fsspec_reference_maker.hdf import SingleHdf5ToZarr
from fsspec_reference_maker.combine import MultiZarrToZarr
import xarray as xr
import dask
import hvplot.xarray
# Anonymous (public-bucket) S3 filesystem; skip_instance_cache forces a fresh
# instance so directory listings are not served from a stale cache.
fs = fsspec.filesystem('s3', anon=True, skip_instance_cache=True)
# Forecast hour and variable to extract from the NWM short-range output.
best_hour='f001'
var = 'channel_rt'
Globbing all the files takes a long time (> 5 minutes), so instead we just read the dates and generate 24 file names for each date — which of course assumes no files are missing.
#%%time
#flist = fs.glob(f'noaa-nwm-pds/nwm.*/short_range/nwm.*.short_range.{var}.{best_hour}.conus.nc')
# List only the daily directories (fast) instead of globbing every file (slow).
days = fs.glob(f'noaa-nwm-pds/nwm.*')
print(days[0])
print(days[-1])
noaa-nwm-pds/nwm.20210626 noaa-nwm-pds/nwm.20210726
# Construct the 24 expected hourly file names for each complete day rather
# than globbing them; days[2:-2] trims partial days at both ends of the
# archive. Assumes no cycle is missing.
flist = [
    f'{day}/short_range/nwm.t{hour:02d}z.short_range.{var}.{best_hour}.conus.nc'
    for day in days[2:-2]
    for hour in range(24)
]
# The newest day is still filling in, so glob whatever actually exists there.
flist.extend(fs.glob(f'{days[-1]}/short_range/nwm.*.short_range.{var}.{best_hour}.conus.nc'))
fs.size(flist[0])/1e6
13.689784
# Open one file to inspect the on-disk chunking/compression of streamflow.
ds = xr.open_dataset(fs.open(flist[0]))
ds.streamflow.encoding
{'chunksizes': (925580,), 'fletcher32': False, 'shuffle': True, 'zlib': True, 'complevel': 2, 'source': '<File-like object S3FileSystem, noaa-nwm-pds/nwm.20210628/short_range/nwm.t00z.short_range.channel_rt.f001.conus.nc>', 'original_shape': (2776738,), 'dtype': dtype('int32'), 'missing_value': array([-999900], dtype=int32), '_FillValue': array([-999900], dtype=int32), 'scale_factor': array([0.01], dtype=float32), 'add_offset': array([0.], dtype=float32)}
ds.nbytes/1e6
144.390393
# Sanity-check the first and last generated paths.
print(flist[0])
print(flist[-1])
noaa-nwm-pds/nwm.20210628/short_range/nwm.t00z.short_range.channel_rt.f001.conus.nc noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f001.conus.nc
# Directory holding the most recent file; os.path.dirname already returns a
# str, so the original f-string wrapper was redundant.
last_dir = os.path.dirname(flist[-1])
last_dir
'noaa-nwm-pds/nwm.20210726/short_range'
# Dot-separated fields of the newest filename:
# ['nwm', 'tHHz', 'short_range', var, 'fNNN', 'conus', 'nc']
last_file = os.path.basename(flist[-1]).split('.')
last_file
# Every forecast hour (f001, f002, ...) issued in that newest cycle.
last_files = fs.glob(f'{last_dir}/{".".join(last_file[:3])}.{var}.*.conus.nc')
last_files
['noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f001.conus.nc', 'noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f002.conus.nc', 'noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f003.conus.nc', 'noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f004.conus.nc', 'noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f005.conus.nc', 'noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f006.conus.nc', 'noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f007.conus.nc', 'noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f008.conus.nc', 'noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f009.conus.nc', 'noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f010.conus.nc', 'noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f011.conus.nc', 'noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f012.conus.nc', 'noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f013.conus.nc', 'noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f014.conus.nc', 'noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f015.conus.nc', 'noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f016.conus.nc', 'noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f017.conus.nc', 'noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f018.conus.nc']
Skip the first of the last_files since it's a duplicate:
# Skip last_files[0]: it is the f001 file for this cycle, already in flist.
flist.extend(last_files[1:])
print(flist[0])
print(flist[-1])
noaa-nwm-pds/nwm.20210628/short_range/nwm.t00z.short_range.channel_rt.f001.conus.nc noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f018.conus.nc
We need to include the "s3://" prefix in the list of files so that fsspec will recognize that these NetCDF files are on S3. There is no separate "storage_options" argument here, so the protocol prefix on each URL is the only way to tell fsspec where the files live.
urls = ["s3://" + f for f in flist]
so = dict(mode='rb', anon=True, default_fill_cache=False, default_cache_type='first')
print(urls[0])
print(urls[-1])
s3://noaa-nwm-pds/nwm.20210628/short_range/nwm.t00z.short_range.channel_rt.f001.conus.nc s3://noaa-nwm-pds/nwm.20210726/short_range/nwm.t15z.short_range.channel_rt.f018.conus.nc
fs.size(urls[10])
13717114
import os
import sys
# ebdpy is a site-local helper library shared on the ESIP qhub deployment.
sys.path.append(os.path.join(os.environ['HOME'],'shared','users','lib'))
import ebdpy as ebd
ebd.set_credentials(profile='esip-qhub')
profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
# Re-set credentials with explicit region/endpoint so the Dask workers
# inherit them via environment variables.
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)
worker_max = 30
# Start (or reattach to) a Dask Gateway cluster with fixed scaling.
client,cluster = ebd.start_dask_cluster(profile=profile,worker_max=worker_max,
region=region, use_existing_cluster=True,
adaptive_scaling=False, wait_for_cluster=False,
environment='pangeo', worker_profile='Pangeo Worker',
propagate_env=True)
/home/conda/store/578732b5a39dadc3cadc71a29a33e58ded03e8a9d5c888edac6151d80eda2868-pangeo/lib/python3.8/site-packages/dask_gateway/client.py:21: FutureWarning: format_bytes is deprecated and will be removed in a future release. Please use dask.utils.format_bytes instead. from distributed.utils import LoopRunner, format_bytes
Existing Dask clusters: Cluster Index c_idx: 0 / Name: dev.ce7242a88a1847958d0b4ac42dad0850 ClusterStatus.RUNNING Using existing cluster [0]. Setting Fixed Scaling workers=30 Reconnect client to clear cache client.dashboard_link (for new browser tab/window or dashboard searchbar in Jupyterhub): https://jupyter.qhub.esipfed.org/gateway/clusters/dev.ce7242a88a1847958d0b4ac42dad0850/status Propagating environment variables to workers
We passed AWS credentials to the Dask workers via environment variables above, and the dask workers don't have the AWS credentials file with profiles defined, so we don't define a profile here, we just set anon=False
and let the workers find the credentials via the environment variables:
fs2 = fsspec.filesystem('s3', anon=False)
If the directory exists, remove it (and all the files)
# Destination for the per-file reference JSONs; remove any previous run's
# output. Catch only FileNotFoundError (directory absent on a fresh run) —
# the original bare `except:` would also have hidden permission errors.
json_dir = 's3://esip-qhub/usgs/nwm_forecast/jsons/'
try:
    fs2.rm(json_dir, recursive=True)
except FileNotFoundError:
    pass
def gen_json(u):
    """Kerchunk one NetCDF file on S3 and write its reference JSON to json_dir.

    Parameters
    ----------
    u : str
        Full "s3://bucket/nwm.YYYYMMDD/short_range/<fname>" URL of the file.

    Side effects: writes "<json_dir><date>.<fname>.json" via fs2 and prints
    the output path.
    """
    with fs.open(u, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        p = u.split('/')
        # Index from the end of the path so the result is independent of the
        # "s3://" prefix (the original hard-coded p[3]/p[5] only worked with
        # exactly one protocol prefix present).
        date = p[-3]
        fname = p[-1]
        outf = f'{json_dir}{date}.{fname}.json'
        print(outf)
        with fs2.open(outf, 'wb') as f:
            f.write(ujson.dumps(h5chunks.translate()).encode())
%%time
# Fan one kerchunk task per file out to the Dask cluster; retries guard
# against transient S3 read failures.
dask.compute(*[dask.delayed(gen_json)(u) for u in urls], retries=10);
CPU times: user 318 ms, sys: 35.6 ms, total: 354 ms Wall time: 1min 7s
(None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
# Collect the reference JSONs just written and convert them back to s3://
# URLs, sorted so concatenation order follows time order.
flist2 = fs2.ls(json_dir)
furls = sorted(f's3://{f}' for f in flist2)
furls[0]
's3://esip-qhub/usgs/nwm_forecast/jsons/nwm.20210628.nwm.t00z.short_range.channel_rt.f001.conus.nc.json'
len(furls)
681
fs.size(flist[0])
13689784
# Combine the most recent 240 hourly reference JSONs (10 days) into a single
# logical Zarr dataset, concatenated along "time".
mzz = MultiZarrToZarr(
    furls[-240:],
    storage_options={'anon': False},  # credentials needed to read the JSONs
    remote_protocol='s3',
    # BUG FIX: was the *string* 'True' (truthy, but the wrong type); the NWM
    # bucket is public, so pass the boolean True, matching the boolean style
    # of storage_options above.
    remote_options={'anon': True},  # NetCDF files
    xarray_open_kwargs={
        # Open the raw on-disk values; decoding is deferred to the consumer.
        'decode_cf': False,
        'mask_and_scale': False,
        'decode_times': False,
        'use_cftime': False,
        'drop_variables': ['reference_time', 'crs'],
        'decode_coords': False
    },
    xarray_concat_args={
        # "data_vars": "minimal",
        # "coords": "minimal",
        # "compat": "override",
        "join": "override",
        "combine_attrs": "override",
        "dim": "time"
    }
)
%%time
#%%prun -D multizarr_profile
# Write the combined reference set to a local JSON file.
mzz.translate('nwm.json')
CPU times: user 15.5 s, sys: 1.69 s, total: 17.2 s Wall time: 53.3 s
# Upload the combined reference JSON to S3.
rpath = 's3://esip-qhub/usgs/forecast/nwm.json'
fs2.put_file(lpath='nwm.json', rpath=rpath)
# Reading the reference JSON from this bucket requires requester-pays;
# the chunks it points at are on the public (anon) NWM bucket.
s_opts = {'requester_pays':True, 'skip_instance_cache':True}
r_opts = {'anon':True}
# NOTE: rebinds `fs` (previously the anonymous S3 filesystem) to the
# reference filesystem for the rest of the notebook.
fs = fsspec.filesystem("reference", fo=rpath, ref_storage_args=s_opts,
remote_protocol='s3', remote_options=r_opts)
m = fs.get_mapper("")
# Open the whole 240-hour collection as one zarr-backed dataset.
ds = xr.open_dataset(m, engine="zarr")
ds
<xarray.Dataset> Dimensions: (feature_id: 2776738, time: 240) Coordinates: * feature_id (feature_id) float64 101.0 179.0 181.0 ... 1.18e+09 1.18e+09 * time (time) datetime64[ns] 2021-07-16T10:00:00 ... 2021-07-27T... Data variables: nudge (time, feature_id) float64 ... qBtmVertRunoff (time, feature_id) float64 ... qBucket (time, feature_id) float64 ... qSfcLatRunoff (time, feature_id) float64 ... streamflow (time, feature_id) float64 ... velocity (time, feature_id) float64 ... Attributes: (12/20) Conventions: CF-1.6 NWM_version_number: v2.1 TITLE: OUTPUT FROM NWM v2.1 _NCProperties: version=2,netcdf=4.7.4,hdf5=1.10.6, cdm_datatype: Station code_version: v5.2.0-beta2 ... ... model_output_type: channel_rt model_output_valid_time: 2021-07-16_10:00:00 model_total_valid_times: 18 proj4: +proj=lcc +units=m +a=6370000.0 +b=6370000.0 ... station_dimension: feature_id stream_order_output: 1
array([1.010000e+02, 1.790000e+02, 1.810000e+02, ..., 1.180002e+09, 1.180002e+09, 1.180002e+09])
array(['2021-07-16T10:00:00.000000000', '2021-07-16T11:00:00.000000000', '2021-07-16T12:00:00.000000000', ..., '2021-07-27T07:00:00.000000000', '2021-07-27T08:00:00.000000000', '2021-07-27T09:00:00.000000000'], dtype='datetime64[ns]')
[666417120 values with dtype=float64]
[666417120 values with dtype=float64]
[666417120 values with dtype=float64]
[666417120 values with dtype=float64]
[666417120 values with dtype=float64]
[666417120 values with dtype=float64]
%%time
# Time series of streamflow at a single reach (feature index 1000).
ds.streamflow[:,1000].hvplot(x='time', grid=True)
# Tear down the Dask cluster and close the client.
cluster.shutdown(); client.close()