Read a collection of GRIB2 files on AWS as a single dataset using the Zarr library, via fsspec's ReferenceFileSystem. This notebook also demonstrates how to generate the JSON file that fsspec uses, speeding up metadata extraction from the GRIB2 files by running Reference File Maker on a Dask cluster.
Requires development version of fsspec_reference_maker
pip install --user git+https://github.com/intake/fsspec-reference-maker
import xarray as xr
import hvplot.xarray
import datetime as dt
import pandas as pd
import dask
import panel as pn
import json
import fsspec
from fsspec_reference_maker.grib2 import scan_grib
from fsspec_reference_maker.combine import MultiZarrToZarr
There is a new HRRR forecast every hour, so use forecast hour 1 from past forecasts and then append the latest forecast
# Anonymous S3 filesystem for the public NOAA HRRR bucket; skip_instance_cache
# makes sure we see newly arrived objects rather than a cached listing.
fs = fsspec.filesystem('s3', anon=True, skip_instance_cache=True)
# Today's date in UTC. datetime.utcnow() is deprecated (Python 3.12+);
# use an aware datetime instead -- strftime output is identical.
today = dt.datetime.now(dt.timezone.utc).strftime('%Y%m%d')
# Most recent forecast-hour-01 file published today. glob results sort
# lexically, which for this naming scheme is also chronological.
# NOTE(review): assumes at least one file exists for today -- otherwise
# the [-1] raises IndexError.
last_file = fs.glob(f's3://noaa-hrrr-bdp-pds/hrrr.{today}/conus/*wrfsfcf01.grib2')[-1]
# Recover the model run time from the object key.
end_time = dt.datetime.strptime(last_file,
    'noaa-hrrr-bdp-pds/hrrr.%Y%m%d/conus/hrrr.t%Hz.wrfsfcf01.grib2')
start_time = end_time - dt.timedelta(days=7)
last_file
'noaa-hrrr-bdp-pds/hrrr.20210903/conus/hrrr.t18z.wrfsfcf01.grib2'
# Glob pattern matching every forecast hour of the most recent model run.
glob_pattern_for_latest_forecast = end_time.strftime(
    's3://noaa-hrrr-bdp-pds/hrrr.%Y%m%d/conus/hrrr.t%Hz.wrfsfcf*.grib2')
print(glob_pattern_for_latest_forecast)
s3://noaa-hrrr-bdp-pds/hrrr.20210903/conus/hrrr.t18z.wrfsfcf*.grib2
# One model run per hour covering the past week, ending at the latest run.
dates = pd.date_range(start=start_time, end=end_time, freq='1h')
# Forecast-hour-01 file from each past run ("best time series" approach).
hrrr_f01_fmt = 's3://noaa-hrrr-bdp-pds/hrrr.%Y%m%d/conus/hrrr.t%Hz.wrfsfcf01.grib2'
files = [d.strftime(hrrr_f01_fmt) for d in dates]
# All forecast hours of the latest run, with the s3:// scheme restored.
latest_files = [f's3://{path}' for path in fs.glob(glob_pattern_for_latest_forecast)]
print(latest_files[-1])
# Skip the first two entries of the latest run: its f01 file is already in
# `files` via the date range above (presumably f00 is dropped for the same
# overlap reason -- confirm against the forecast layout).
files.extend(latest_files[2:])
print(files[0])
print(files[-1])
s3://noaa-hrrr-bdp-pds/hrrr.20210827/conus/hrrr.t18z.wrfsfcf01.grib2 s3://noaa-hrrr-bdp-pds/hrrr.20210903/conus/hrrr.t18z.wrfsfcf31.grib2
This is for the ESIP qhub: you will need to modify to work elsewhere.
import os
import sys
# Make the shared qhub helper library importable.
sys.path.append(os.path.join(os.environ['HOME'], 'shared', 'users', 'lib'))
import ebdpy as ebd

profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
# Single credentials call with the full configuration. (The original called
# set_credentials twice; the earlier profile-only call was redundant because
# this call repeats the same profile with region and endpoint.)
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)

worker_max = 30
# Fixed-size (non-adaptive) cluster; don't block waiting for workers.
client, cluster = ebd.start_dask_cluster(profile=profile, worker_max=worker_max,
                                         region=region, use_existing_cluster=True,
                                         adaptive_scaling=False, wait_for_cluster=False,
                                         environment='pangeo', worker_profile='Pangeo Worker',
                                         propagate_env=True)
/home/conda/store/3d745bdbbc77faf1b06381a24d6e593eeec3375ed3ddf4003aa2fc578a214eab-pangeo/lib/python3.9/site-packages/dask_gateway/client.py:21: FutureWarning: format_bytes is deprecated and will be removed in a future release. Please use dask.utils.format_bytes instead. from distributed.utils import LoopRunner, format_bytes
No Cluster running. Starting new cluster. Setting Fixed Scaling workers=30 Reconnect client to clear cache client.dashboard_link (for new browser tab/window or dashboard searchbar in Jupyterhub): https://jupyter.qhub.esipfed.org/gateway/clusters/dev.ef75dc2aaa3e4fc486dade428b913085/status Propagating environment variables to workers
# cfgrib-style filter: keep only fields 2 m above ground.
afilter = dict(typeOfLevel='heightAboveGround', level=2)
# fsspec storage options for anonymous reads from the public bucket.
so = dict(anon=True, default_cache_type="readahead")
# Coordinate variables shared by every GRIB message.
common = ['time', 'step', 'latitude', 'longitude', 'valid_time']
def gen_json(u):
    """Scan one GRIB2 file at URL *u* and write its reference JSON to `json_dir`.

    Relies on module-level names: `json_dir`, `fs2`, `common`, `so`, `afilter`.
    """
    # Key layout: noaa-hrrr-bdp-pds/hrrr.YYYYMMDD/conus/hrrr.tHHz.wrfsfcfNN.grib2
    parts = u.split('/')
    date = parts[3].split('.')[1]
    run, fhr = parts[5].split('.')[1:3]
    outfname = f'{json_dir}{date}.{run}.{fhr}.json'
    # NOTE: 'inline_threashold' (sic) is the keyword spelling the library
    # expects -- do not "fix" it here.
    out = scan_grib(u, common, so, inline_threashold=100, filter=afilter)
    with fs2.open(outfname, "w") as f:
        f.write(json.dumps(out))
json_dir = 's3://esip-qhub/noaa/hrrr/jsons/'
# Authenticated filesystem for writing the reference JSONs.
fs2 = fsspec.filesystem('s3', anon=False, skip_instance_cache=True)
# Best-effort cleanup of JSONs from a previous run. The original used a bare
# `except:`, which also swallows KeyboardInterrupt/SystemExit; catching
# Exception preserves the best-effort intent without hiding interrupts.
try:
    fs2.rm(json_dir, recursive=True)
except Exception:
    pass
%%time
# Scan every GRIB2 file in parallel on the Dask cluster; retries guard
# against transient S3 read failures. Results are written to S3 by
# gen_json itself, so the computed values are discarded.
_ = dask.compute(*[dask.delayed(gen_json)(u) for u in files], retries=10);
CPU times: user 991 ms, sys: 63 ms, total: 1.05 s Wall time: 15min 50s
Use MultiZarrToZarr() to combine into a single reference
flist2 = fs2.ls(json_dir)
# Restore the s3:// scheme and sort so the files concatenate in time order.
furls = sorted(f's3://{f}' for f in flist2)
print(furls[0])
print(furls[-1])
s3://esip-qhub/noaa/hrrr/jsons/20210827.t18z.wrfsfcf01.json s3://esip-qhub/noaa/hrrr/jsons/20210903.t18z.wrfsfcf31.json
# mzz = MultiZarrToZarr(furls,
# storage_options={'anon':False},
# remote_protocol='s3',
# remote_options={'anon' : 'True'}, #JSON files
# xarray_open_kwargs={
# 'decode_cf' : False,
# 'mask_and_scale' : False,
# 'decode_times' : False,
# 'use_cftime' : False,
# 'drop_variables': ['reference_time', 'crs'],
# 'decode_coords' : False
# },
# xarray_concat_args={
# # "data_vars": "minimal",
# # "coords": "minimal",
# # "compat": "override",
# "join": "override",
# "combine_attrs": "override",
# "dim": "time"
# }
# )
# Combine the per-file references into one logical Zarr dataset,
# concatenating along valid_time.
mzz = MultiZarrToZarr(
    furls,
    storage_options={'anon': False},  # credentials needed to read the JSONs
    remote_protocol='s3',
    remote_options={'anon': True},    # the GRIB2 data itself is public
    xarray_concat_args={'dim': 'valid_time'},
)
%%time
#%%prun -D multizarr_profile
# Write the combined reference set to a local JSON file.
mzz.translate('hrrr_best.json')
CPU times: user 2.9 s, sys: 451 ms, total: 3.35 s Wall time: 19.2 s
# Publish the combined JSON to a public bucket, then open it back up
# through fsspec's ReferenceFileSystem.
rpath = 's3://esip-qhub-public/noaa/hrrr/hrrr_best.json'
fs2.put_file(lpath='hrrr_best.json', rpath=rpath)
# (The original reassigned rpath to the same value here; the redundant
# duplicate assignment is removed.)
s_opts = {'requester_pays': True, 'skip_instance_cache': True}  # for the JSON itself
r_opts = {'anon': True}                                         # for the GRIB2 chunks
fs = fsspec.filesystem("reference", fo=rpath, ref_storage_args=s_opts,
                       remote_protocol='s3', remote_options=r_opts)
m = fs.get_mapper("")
# Reference sets are not consolidated-metadata Zarr stores.
ds2 = xr.open_dataset(m, engine="zarr", backend_kwargs=dict(consolidated=False))
# List the variables exposed through the combined reference dataset.
ds2.data_vars
Data variables: d2m (valid_time, y, x) float32 ... pt (valid_time, y, x) float32 ... r2 (valid_time, y, x) float32 ... sh2 (valid_time, y, x) float32 ... t2m (valid_time, y, x) float32 ... unknown (valid_time, y, x) float32 ...
# Work with 2 m temperature; report the week of data's size in GB.
var='t2m'
ds2[var].nbytes/1e9
1.516492236
# Inspect the variable's coordinates and GRIB attributes.
ds2[var]
<xarray.DataArray 't2m' (valid_time: 199, y: 1059, x: 1799)> [379123059 values with dtype=float32] Coordinates: heightAboveGround float64 2.0 latitude (y, x) float64 ... longitude (y, x) float64 ... step timedelta64[ns] 01:00:00 time (valid_time) datetime64[ns] 2021-08-27T18:00:00 ... NaT * valid_time (valid_time) datetime64[us] 2021-08-27T19:00:00 ... 20... Dimensions without coordinates: y, x Attributes: (12/33) GRIB_DxInMetres: 3000.0 GRIB_DyInMetres: 3000.0 GRIB_LaDInDegrees: 38.5 GRIB_Latin1InDegrees: 38.5 GRIB_Latin2InDegrees: 38.5 GRIB_LoVInDegrees: 262.5 ... ... GRIB_stepUnits: 1 GRIB_typeOfLevel: heightAboveGround GRIB_units: K long_name: 2 metre temperature standard_name: air_temperature units: K
[379123059 values with dtype=float32]
array(2.)
[1905141 values with dtype=float64]
[1905141 values with dtype=float64]
array(3600000000000, dtype='timedelta64[ns]')
array(['2021-08-27T18:00:00.000000000', '2021-08-27T19:00:00.000000000', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT'], dtype='datetime64[ns]')
array(['2021-08-27T19:00:00.000000', '2021-08-27T20:00:00.000000', '2021-08-27T21:00:00.000000', '2021-08-27T22:00:00.000000', '2021-08-27T23:00:00.000000', '2021-08-28T00:00:00.000000', '2021-08-28T01:00:00.000000', '2021-08-28T02:00:00.000000', '2021-08-28T03:00:00.000000', '2021-08-28T04:00:00.000000', '2021-08-28T05:00:00.000000', '2021-08-28T06:00:00.000000', '2021-08-28T07:00:00.000000', '2021-08-28T08:00:00.000000', '2021-08-28T09:00:00.000000', '2021-08-28T10:00:00.000000', '2021-08-28T11:00:00.000000', '2021-08-28T12:00:00.000000', '2021-08-28T13:00:00.000000', '2021-08-28T14:00:00.000000', '2021-08-28T15:00:00.000000', '2021-08-28T16:00:00.000000', '2021-08-28T17:00:00.000000', '2021-08-28T18:00:00.000000', '2021-08-28T19:00:00.000000', '2021-08-28T20:00:00.000000', '2021-08-28T21:00:00.000000', '2021-08-28T22:00:00.000000', '2021-08-28T23:00:00.000000', '2021-08-29T00:00:00.000000', '2021-08-29T01:00:00.000000', '2021-08-29T02:00:00.000000', '2021-08-29T03:00:00.000000', '2021-08-29T04:00:00.000000', '2021-08-29T05:00:00.000000', '2021-08-29T06:00:00.000000', '2021-08-29T07:00:00.000000', '2021-08-29T08:00:00.000000', '2021-08-29T09:00:00.000000', '2021-08-29T10:00:00.000000', '2021-08-29T11:00:00.000000', '2021-08-29T12:00:00.000000', '2021-08-29T13:00:00.000000', '2021-08-29T14:00:00.000000', '2021-08-29T15:00:00.000000', '2021-08-29T16:00:00.000000', '2021-08-29T17:00:00.000000', '2021-08-29T18:00:00.000000', '2021-08-29T19:00:00.000000', '2021-08-29T20:00:00.000000', '2021-08-29T21:00:00.000000', '2021-08-29T22:00:00.000000', '2021-08-29T23:00:00.000000', '2021-08-30T00:00:00.000000', '2021-08-30T01:00:00.000000', '2021-08-30T02:00:00.000000', '2021-08-30T03:00:00.000000', '2021-08-30T04:00:00.000000', '2021-08-30T05:00:00.000000', '2021-08-30T06:00:00.000000', '2021-08-30T07:00:00.000000', '2021-08-30T08:00:00.000000', '2021-08-30T09:00:00.000000', '2021-08-30T10:00:00.000000', '2021-08-30T11:00:00.000000', '2021-08-30T12:00:00.000000', 
'2021-08-30T13:00:00.000000', '2021-08-30T14:00:00.000000', '2021-08-30T15:00:00.000000', '2021-08-30T16:00:00.000000', '2021-08-30T17:00:00.000000', '2021-08-30T18:00:00.000000', '2021-08-30T19:00:00.000000', '2021-08-30T20:00:00.000000', '2021-08-30T21:00:00.000000', '2021-08-30T22:00:00.000000', '2021-08-30T23:00:00.000000', '2021-08-31T00:00:00.000000', '2021-08-31T01:00:00.000000', '2021-08-31T02:00:00.000000', '2021-08-31T03:00:00.000000', '2021-08-31T04:00:00.000000', '2021-08-31T05:00:00.000000', '2021-08-31T06:00:00.000000', '2021-08-31T07:00:00.000000', '2021-08-31T08:00:00.000000', '2021-08-31T09:00:00.000000', '2021-08-31T10:00:00.000000', '2021-08-31T11:00:00.000000', '2021-08-31T12:00:00.000000', '2021-08-31T13:00:00.000000', '2021-08-31T14:00:00.000000', '2021-08-31T15:00:00.000000', '2021-08-31T16:00:00.000000', '2021-08-31T17:00:00.000000', '2021-08-31T18:00:00.000000', '2021-08-31T19:00:00.000000', '2021-08-31T20:00:00.000000', '2021-08-31T21:00:00.000000', '2021-08-31T22:00:00.000000', '2021-08-31T23:00:00.000000', '2021-09-01T00:00:00.000000', '2021-09-01T01:00:00.000000', '2021-09-01T02:00:00.000000', '2021-09-01T03:00:00.000000', '2021-09-01T04:00:00.000000', '2021-09-01T05:00:00.000000', '2021-09-01T06:00:00.000000', '2021-09-01T07:00:00.000000', '2021-09-01T08:00:00.000000', '2021-09-01T09:00:00.000000', '2021-09-01T10:00:00.000000', '2021-09-01T11:00:00.000000', '2021-09-01T12:00:00.000000', '2021-09-01T13:00:00.000000', '2021-09-01T14:00:00.000000', '2021-09-01T15:00:00.000000', '2021-09-01T16:00:00.000000', '2021-09-01T17:00:00.000000', '2021-09-01T18:00:00.000000', '2021-09-01T19:00:00.000000', '2021-09-01T20:00:00.000000', '2021-09-01T21:00:00.000000', '2021-09-01T22:00:00.000000', '2021-09-01T23:00:00.000000', '2021-09-02T00:00:00.000000', '2021-09-02T01:00:00.000000', '2021-09-02T02:00:00.000000', '2021-09-02T03:00:00.000000', '2021-09-02T04:00:00.000000', '2021-09-02T05:00:00.000000', '2021-09-02T06:00:00.000000', 
'2021-09-02T07:00:00.000000', '2021-09-02T08:00:00.000000', '2021-09-02T09:00:00.000000', '2021-09-02T10:00:00.000000', '2021-09-02T11:00:00.000000', '2021-09-02T12:00:00.000000', '2021-09-02T13:00:00.000000', '2021-09-02T14:00:00.000000', '2021-09-02T15:00:00.000000', '2021-09-02T16:00:00.000000', '2021-09-02T17:00:00.000000', '2021-09-02T18:00:00.000000', '2021-09-02T19:00:00.000000', '2021-09-02T20:00:00.000000', '2021-09-02T21:00:00.000000', '2021-09-02T22:00:00.000000', '2021-09-02T23:00:00.000000', '2021-09-03T00:00:00.000000', '2021-09-03T01:00:00.000000', '2021-09-03T02:00:00.000000', '2021-09-03T03:00:00.000000', '2021-09-03T04:00:00.000000', '2021-09-03T05:00:00.000000', '2021-09-03T06:00:00.000000', '2021-09-03T07:00:00.000000', '2021-09-03T08:00:00.000000', '2021-09-03T09:00:00.000000', '2021-09-03T10:00:00.000000', '2021-09-03T11:00:00.000000', '2021-09-03T12:00:00.000000', '2021-09-03T13:00:00.000000', '2021-09-03T14:00:00.000000', '2021-09-03T15:00:00.000000', '2021-09-03T16:00:00.000000', '2021-09-03T17:00:00.000000', '2021-09-03T18:00:00.000000', '2021-09-03T19:00:00.000000', '2021-09-03T20:00:00.000000', '2021-09-03T21:00:00.000000', '2021-09-03T22:00:00.000000', '2021-09-03T23:00:00.000000', '2021-09-04T00:00:00.000000', '2021-09-04T01:00:00.000000', '2021-09-04T02:00:00.000000', '2021-09-04T03:00:00.000000', '2021-09-04T04:00:00.000000', '2021-09-04T05:00:00.000000', '2021-09-04T06:00:00.000000', '2021-09-04T07:00:00.000000', '2021-09-04T08:00:00.000000', '2021-09-04T09:00:00.000000', '2021-09-04T10:00:00.000000', '2021-09-04T11:00:00.000000', '2021-09-04T12:00:00.000000', '2021-09-04T13:00:00.000000', '2021-09-04T14:00:00.000000', '2021-09-04T15:00:00.000000', '2021-09-04T16:00:00.000000', '2021-09-04T17:00:00.000000', '2021-09-04T18:00:00.000000', '2021-09-04T19:00:00.000000', '2021-09-04T20:00:00.000000', '2021-09-04T21:00:00.000000', '2021-09-04T22:00:00.000000', '2021-09-04T23:00:00.000000', '2021-09-05T00:00:00.000000', 
'2021-09-05T01:00:00.000000'], dtype='datetime64[us]')
Hvplot wants lon [-180,180], not [0,360]:
# Wrap longitudes from [0, 360) into [-180, 180) for hvplot.
ds2 = ds2.assign_coords(longitude=(((ds2.longitude + 180) % 360) - 180))
hvplot has a slider for time steps, but we want a dropdown list, so we use Panel
# Every dimension except the trailing spatial pair (y, x) becomes a widget.
extra_dims = list(ds2[var].dims[:-2])
mesh = ds2[var].hvplot.quadmesh(x='longitude', y='latitude', rasterize=True, geo=True, tiles='OSM', cmap='turbo')
# Panel replaces hvplot's default slider with a dropdown per extra dimension.
pn.panel(mesh, widgets={k: pn.widgets.Select for k in extra_dims})
We are reading GRIB2 files, which compress the entire spatial domain as a single chunk. Therefore reading all the time values at a single point actually needs to load and uncompress all the data for that variable. With 30 cores, accessing a weeks worth of data take just under two minutes.
%%time
# Time series at a single grid point. Each GRIB2 message compresses the whole
# spatial field as one chunk, so this reads and uncompresses every 2-D field.
ds2[var][:,500,500].hvplot(x='valid_time', grid=True)
CPU times: user 1min 48s, sys: 9.96 s, total: 1min 58s Wall time: 2min 1s
# Release the Dask client and shut the cluster down.
client.close()
cluster.shutdown()