import s3fs
import requests
from urllib import request
from http.cookiejar import CookieJar
from getpass import getpass
import netrc
import os
from os.path import join
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import fsspec
import ujson  # fast JSON encoding/decoding
import dask
import hvplot.xarray
from fsspec_reference_maker.hdf import SingleHdf5ToZarr
from fsspec_reference_maker.combine import MultiZarrToZarr
import fsspec_reference_maker
fsspec_reference_maker.__version__
'0.0.3+5.gfda8a53'
######### Setting up Earthdata Login credentials
def setup_earthdata_login_auth(endpoint):
    """
    Set up the request library so that it authenticates against the given Earthdata Login
    endpoint and is able to track cookies between requests. This looks in the .netrc file
    first and, if no credentials are found there, it prompts for them.
    Valid endpoints:
        urs.earthdata.nasa.gov - Earthdata Login production
    """
    try:
        username, _, password = netrc.netrc().authenticators(endpoint)
    except (FileNotFoundError, TypeError):
        # FileNotFoundError = there is no .netrc file
        # TypeError = the endpoint isn't in the .netrc file, so authenticators() returned None
        print(f"No .netrc file found, or {endpoint} is not in it; please enter credentials.")
        username = input("Earthdata Login username: ")
        password = getpass("Earthdata Login password: ")
    manager = request.HTTPPasswordMgrWithDefaultRealm()
    manager.add_password(None, endpoint, username, password)
    auth = request.HTTPBasicAuthHandler(manager)
    jar = CookieJar()
    processor = request.HTTPCookieProcessor(jar)
    opener = request.build_opener(auth, processor)
    request.install_opener(opener)
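For reference, the .netrc entry that the function looks for has the standard single-line form below (the username and password are placeholders for your own Earthdata Login credentials):

machine urs.earthdata.nasa.gov login YOUR_USERNAME password YOUR_PASSWORD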
###############################################################################
edl="urs.earthdata.nasa.gov"
setup_earthdata_login_auth(edl)
def begin_s3_direct_access():
    """Request temporary S3 credentials from PO.DAAC and return an authenticated S3 filesystem."""
    url = "https://archive.podaac.earthdata.nasa.gov/s3credentials"
    response = requests.get(url).json()
    return s3fs.S3FileSystem(key=response['accessKeyId'],
                             secret=response['secretAccessKey'],
                             token=response['sessionToken'],
                             client_kwargs={'region_name': 'us-west-2'})
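The temporary credentials returned by this endpoint expire (typically after about an hour), so the function may need to be re-run during a long session. As a quick sanity check that direct access works (not part of the recipe; fs_check is a throwaway name), one can list a few MUR granules:

fs_check = begin_s3_direct_access()
print(fs_check.ls("podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1")[:3])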
import sys
sys.path.append(os.path.join(os.environ['HOME'], 'shared', 'users', 'lib'))
import ebdpy as ebd

profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)
worker_max = 30
client, cluster = ebd.start_dask_cluster(profile=profile, worker_max=worker_max,
                                         region=region, use_existing_cluster=True,
                                         adaptive_scaling=False, wait_for_cluster=False,
                                         environment='pangeo', worker_profile='Medium Worker',
                                         propagate_env=True)
No Cluster running. Starting new cluster.
Setting Fixed Scaling workers=30
Reconnect client to clear cache
client.dashboard_link (for new browser tab/window or dashboard searchbar in Jupyterhub):
https://jupyter.qhub.esipfed.org/gateway/clusters/dev.cc990959c7554138828671f7b4f48c36/status
Propagating environment variables to workers
fs = begin_s3_direct_access()
flist = []
for lyr in range(2002, 2003):        # use range(2002, 2022) for the full record
    for imon in range(12, 13):       # use range(1, 13) for all twelve months
        fstr = str(lyr) + str(imon).zfill(2) + '*.nc'
        files = fs.glob(join("podaac-ops-cumulus-protected/", "MUR-JPL-L4-GLOB-v4.1", fstr))
        for file in files:
            flist.append(file)
print('total number of individual netcdf files:', len(flist))
urls = ["s3://" + f for f in flist]
so = dict(mode='rb', anon=True, default_fill_cache=False, default_cache_type='first')
fs2 = fsspec.filesystem('s3', anon=False)
json_dir = 's3://esip-qhub/nasa/mur/jsons/'
#If the directory exists, remove it (and all the files)
try:
fs2.rm(json_dir, recursive=True)
except:
pass
def gen_json(u):
    """Create a kerchunk reference JSON for one NetCDF granule and write it to json_dir."""
    with fs.open(u, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        p = u.split('/')
        date = p[4][0:8]   # date prefix of the granule name, e.g. '20021201'
        fname = p[4]       # full granule file name
        outf = f'{json_dir}{date}.{fname}.json'
        print(outf)
        with fs2.open(outf, 'wb') as f:
            f.write(ujson.dumps(h5chunks.translate()).encode())
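Before launching the full parallel run, it can be worth calling the function once on a single granule to confirm the reference JSON lands where expected (a quick sanity check; index 0 is arbitrary, and the batch run below simply overwrites the file):

gen_json(urls[0])
print(fs2.ls(json_dir))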
%%time
_ = dask.compute(*[dask.delayed(gen_json)(u) for u in urls[0:30]], retries=10);
flist2 = fs2.ls(json_dir)
furls = sorted(['s3://'+f for f in flist2])
furls[0]
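To peek inside one of the reference files (purely for inspection; the exact key layout depends on the fsspec_reference_maker version):

with fs2.open(furls[0]) as f:
    refs = ujson.loads(f.read())
print(list(refs)[:10])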
# The combine step doesn't need the big gateway cluster; shut it down and use a local client
client.close(); cluster.shutdown()
from dask.distributed import Client
client = Client()
client
mzz = MultiZarrToZarr(furls,
                      storage_options={'anon': False},
                      remote_protocol='s3',
                      remote_options={'anon': True},  # JSON files
                      xarray_open_kwargs={
                          'decode_cf': False,
                          'mask_and_scale': False,
                          'decode_times': False,
                          'use_cftime': False,
                          'drop_variables': ['reference_time', 'crs'],
                          'decode_coords': False
                      },
                      xarray_concat_args={
                          # "data_vars": "minimal",
                          # "coords": "minimal",
                          # "compat": "override",
                          "join": "override",
                          "combine_attrs": "override",
                          "dim": "time"
                      })
%%time
#%%prun -D multizarr_profile
mzz.translate('mur_consolidated.json')
rpath = 's3://esip-qhub-public/nasa/mur/mur4.1_consolidated.json'
fs2.put_file(lpath='mur_consolidated.json', rpath=rpath)
url="https://archive.podaac.earthdata.nasa.gov/s3credentials"
response = requests.get(url).json()
turl ='s3://esip-qhub/nasa/mur/jsons/20021201.20021201090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc.json'
s_opts = {'requester_pays':True, 'skip_instance_cache':True}
r_opts = {'key':response['accessKeyId'],
'secret':response['secretAccessKey'],
'token':response['sessionToken'],
'client_kwargs':{'region_name':'us-west-2'}}
fs = fsspec.filesystem("reference", fo=turl, ref_storage_args=s_opts,
remote_protocol='s3', remote_options=r_opts)
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr", consolidated=False)
ds
# Now open the full consolidated reference file that was uploaded to S3
r_opts = {'key': response['accessKeyId'],
          'secret': response['secretAccessKey'],
          'token': response['sessionToken'],
          'client_kwargs': {'region_name': 'us-west-2'}}
s_opts = {'requester_pays': True, 'skip_instance_cache': True}
fs = fsspec.filesystem("reference", fo=rpath, ref_storage_args=s_opts,
                       remote_protocol='s3', remote_options=r_opts)
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr", consolidated=False)
ds
<xarray.Dataset>
Dimensions:           (time: 30, lat: 17999, lon: 36000)
Coordinates:
  * lat               (lat) float32 -89.99 -89.98 -89.97 ... 89.97 89.98 89.99
  * lon               (lon) float32 -180.0 -180.0 -180.0 ... 180.0 180.0 180.0
  * time              (time) datetime64[us] 2002-12-01T09:00:00 ... 2002-12-3...
Data variables:
    analysed_sst      (time, lat, lon) float32 ...
    analysis_error    (time, lat, lon) float32 ...
    mask              (time, lat, lon) float32 ...
    sea_ice_fraction  (time, lat, lon) float32 ...
Attributes: (12/47)
    Conventions:             CF-1.5
    Metadata_Conventions:    Unidata Observation Dataset v1.0
    acknowledgment:          Please acknowledge the use of these data with...
    cdm_data_type:           grid
    comment:                 MUR = "Multi-scale Ultra-high Reolution"
    creator_email:           ghrsst@podaac.jpl.nasa.gov
    ...                      ...
    summary:                 A merged, multi-sensor L4 Foundation SST anal...
    time_coverage_end:       20021201T210000Z
    time_coverage_start:     20021130T210000Z
    title:                   Daily MUR SST, Final product
    uuid:                    27665bc0-d5fc-11e1-9b23-0800200c9a66
    westernmost_longitude:   -180.0
import hvplot.xarray
sst = ds['analysed_sst'].sel(time='2002-12-20 12:00', method='nearest').load()
sst.hvplot.quadmesh(x='lon', y='lat', geo=True, rasterize=True, cmap='turbo')
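Since the consolidated references behave like an ordinary Zarr store, a time series at a single point can also be pulled lazily across all 30 time steps. A minimal sketch, with an arbitrary mid-Pacific location chosen purely for illustration:

ts = ds['analysed_sst'].sel(lon=-140.0, lat=0.0, method='nearest').load()
ts.hvplot(x='time', grid=True, title='MUR analysed_sst at (0N, 140W)')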