#!/usr/bin/env python
# coding: utf-8

# # Open NWM 1km dataset as DFReferenceFileSystem 
# 
# Open the dataset as an fsspec `DFReferenceFileSystem` filesystem by reading references from a collection of Parquet files: one file containing global metadata and coordinate variable references, and one file for each of the data variables.
# 
# The big wins here are lazy-loading of the references for each variable, and the more efficient construction of the virtual fsspec filesystem from the Parquet files (JSON is slow to decode).

# In[1]:


import fsspec
from fsspec.implementations.reference import DFReferenceFileSystem
import xarray as xr
import numpy as np


# In[2]:


# S3 filesystem created with anon=True and a custom endpoint_url passed
# through client_kwargs.
fs = fsspec.filesystem(
    's3',
    anon=True,
    client_kwargs=dict(endpoint_url='https://ncsa.osn.xsede.org'),
)


# In[3]:


# S3 prefix holding the Parquet reference files; it is listed with `fs.ls`
# and passed to DFReferenceFileSystem in the cells that follow.
s3_lazy_refs = 's3://esip/noaa/nwm/lazy_refs'


# In[4]:


# One fs.size() result per entry that fs.ls() returns for the references
# prefix; the count and the summed size (divided by 1e9) are then printed.
lazy_refs_size = list(map(fs.size, fs.ls(s3_lazy_refs)))
print(f'Number of Parquet files: {len(lazy_refs_size)}')
print(f'Total size of Parquet references: {np.array(lazy_refs_size).sum()/1e9} GB')


# In[5]:


# Option dicts handed to DFReferenceFileSystem in the next cell:
#   r_opts -> remote_options, t_opts -> target_options.
r_opts = dict(anon=True)
t_opts = dict(
    anon=True,
    client_kwargs=dict(endpoint_url='https://ncsa.osn.xsede.org'),
)


# In[6]:


# Runs the cell source below under IPython's `time` cell magic. The cell:
#   1. builds `fs2`, a DFReferenceFileSystem over `s3_lazy_refs` with lazy=True,
#      target_options=t_opts, remote_protocol='s3' and remote_options=r_opts;
#   2. takes a mapper for the root ("") of that filesystem as `m`;
#   3. opens `m` with xarray's zarr engine (chunks={}, consolidated=False) as `ds`.
# `ds` is the dataset that the following cells inspect and load from.
# NOTE(review): DFReferenceFileSystem appears to exist only in some fsspec
# releases (later ones seem to fold it into ReferenceFileSystem) -- confirm
# against the fsspec version this notebook is pinned to.
get_ipython().run_cell_magic('time', '', 'fs2 = DFReferenceFileSystem(s3_lazy_refs, lazy=True, target_options=t_opts,\n                        remote_protocol=\'s3\', remote_options=r_opts)\nm = fs2.get_mapper("")\nds = xr.open_dataset(m, engine="zarr", chunks={}, backend_kwargs=dict(consolidated=False))\n')


# In[7]:


# Bare expression: in a notebook the repr of `ds` is shown as the cell output.
ds


# Examine a specific variable:

# In[8]:


# Attribute-style access to the TRAD variable of `ds`; as a bare expression
# its repr is shown as the cell output in a notebook.
ds.TRAD


# Compute the uncompressed size of the whole dataset in TB:

# In[9]:


# Dividing `ds.nbytes` by 1e12 expresses the size in units of 10**12 bytes.
ds.nbytes/1e12  


# Load data for a single time step. The first access to a variable takes longer because its references have to be loaded first.

# In[10]:


# Timed with IPython's `time` cell magic: the cell source selects TRAD at
# time='1990-01-01 00:00', calls .load() on it and assigns the result to `da`.
get_ipython().run_cell_magic('time', '', "da = ds.TRAD.sel(time='1990-01-01 00:00').load()\n")


# Loading data for another time step is much faster as the references are already loaded:

# In[11]:


# Same timed load as the previous cell, but for time='2015-01-01 00:00'.
# This reassigns `da`, so the cells below operate on this time step.
get_ipython().run_cell_magic('time', '', "da = ds.TRAD.sel(time='2015-01-01 00:00').load()\n")


# Compute the mean over the domain:

# In[12]:


# .mean() is called with no arguments; .data takes the array backing the result.
da.mean().data


# In[13]:


# Plot `da` (the time step loaded most recently above) with plot()'s defaults.
da.plot()


# In[ ]: