#!/usr/bin/env python
# coding: utf-8
"""Open NWM 1km dataset as DFReferenceFileSystem.

Open the dataset as an fsspec ``DFReferenceFileSystem`` filesystem by reading
references from a collection of Parquet files: one file containing global
metadata and coordinate variable references, and one file for each of the
data variables.

The big wins here are lazy-loading of the references for each variable, and
the more efficient construction of the virtual fsspec filesystem from the
Parquet files (JSON is slow to decode).
"""

# In[1]:

import time
from contextlib import contextmanager

import fsspec
import numpy as np
import xarray as xr
from fsspec.implementations.reference import DFReferenceFileSystem


@contextmanager
def _timed(label):
    """Print the CPU and wall-clock time spent inside the ``with`` body.

    Stands in for the notebook's ``%%time`` cell magic so that the script
    also runs under a plain Python interpreter, where ``get_ipython`` is
    not defined. Nothing is printed if the body raises.
    """
    wall_start = time.perf_counter()
    cpu_start = time.process_time()
    yield
    cpu_elapsed = time.process_time() - cpu_start
    wall_elapsed = time.perf_counter() - wall_start
    print(f'{label}: CPU time {cpu_elapsed:.2f} s, wall time {wall_elapsed:.2f} s')


# In[2]:

# Endpoint used both for listing the reference files and, via ``t_opts``,
# for reading them.
OSN_ENDPOINT_URL = 'https://ncsa.osn.xsede.org'

fs = fsspec.filesystem('s3', anon=True,
                       client_kwargs={'endpoint_url': OSN_ENDPOINT_URL})


# In[3]:

s3_lazy_refs = 's3://esip/noaa/nwm/lazy_refs'


# In[4]:

lazy_refs_size = [fs.size(f) for f in fs.ls(s3_lazy_refs)]
print(f'Number of Parquet files: {len(lazy_refs_size)}')
print(f'Total size of Parquet references: {np.array(lazy_refs_size).sum()/1e9} GB')


# In[5]:

# r_opts is passed as ``remote_options`` (for the 's3' remote protocol) and
# t_opts as ``target_options`` (for reading the Parquet reference files).
r_opts = {'anon': True}
t_opts = {'anon': True, 'client_kwargs': {'endpoint_url': OSN_ENDPOINT_URL}}


# In[6]:

with _timed('Open dataset'):
    fs2 = DFReferenceFileSystem(s3_lazy_refs, lazy=True, target_options=t_opts,
                                remote_protocol='s3', remote_options=r_opts)
    m = fs2.get_mapper("")
    ds = xr.open_dataset(m, engine="zarr", chunks={},
                         backend_kwargs=dict(consolidated=False))


# In[7]:

print(ds)


# Examine a specific variable:

# In[8]:

print(ds.TRAD)


# Compute the uncompressed size of the whole dataset in TB:

# In[9]:

print(f'Uncompressed dataset size: {ds.nbytes/1e12} TB')


# Load some data at a specific time step. The first time a variable is
# accessed it will take longer as the references need to be loaded.

# In[10]:

with _timed('Load TRAD at 1990-01-01 00:00'):
    da = ds.TRAD.sel(time='1990-01-01 00:00').load()


# Loading data for another time step is much faster as the references are
# already loaded:

# In[11]:

with _timed('Load TRAD at 2015-01-01 00:00'):
    da = ds.TRAD.sel(time='2015-01-01 00:00').load()


# Compute the mean over the domain:

# In[12]:

print(da.mean().data)


# In[13]:

# NOTE(review): outside a notebook this call does not display the figure by
# itself; showing or saving it is left to the caller - confirm intended use.
da.plot()


# In[ ]: