Data from a 2.5 arc-minute CONUS model covering 1895 to 2020. The provided files were fixed-width ASCII, with year and month in the first two columns, and the data in the remaining columns. The raster data from each time step is written to a single row, with only the non-missing values written. There is one file for each variable. There is also a separate CSV file that contains the lon,lat locations for each column of data.
To parallelize the workflow, we split the original files (tmean.monthly.all.gz, prcp.monthly.all.gz) into many smaller text files using `split`, choosing the number of lines to match the desired number of time steps in each chunked output file.
#!/bin/bash
# Split each gzipped all-years file into 120-line pieces (120 monthly time
# steps = 10 years per piece), written as e.g. prcp/prcp00, prcp/prcp01, ...
# so that each piece matches one zarr time chunk.
set -euo pipefail   # fail fast, including on zcat errors inside the pipe

for var in prcp tmean
do
    mkdir -p "$var"   # -p: do not fail if the directory already exists (safe rerun)
    zcat "$var.monthly.all.gz" | split -l 120 --numeric-suffixes - "$var/$var"
done
import fsspec
import xarray as xr
import pandas as pd
import numpy as np
import datetime as dt
import hvplot.xarray
from dask.distributed import Client
import dask
# fsspec filesystem with an empty protocol string — presumably resolves to the
# local filesystem (paths below are local /scratch paths); confirm if reused elsewhere.
fs = fsspec.filesystem('')
inpath = '/scratch/mike/'          # directory holding the raw inputs
outpath = '/scratch/mike/wbm.zarr' # destination zarr store
fs.ls(inpath)
['/scratch/mike/prcp.monthly.all.gz', '/scratch/mike/do_split.sh', '/scratch/mike/wb.swe.cg.awc.gz', '/scratch/mike/wb.rain.cg.awc.gz', '/scratch/mike/gzfiles', '/scratch/mike/wb.snow.cg.awc.gz', '/scratch/mike/prcp', '/scratch/mike/wb.runoff.cg.awc.gz', '/scratch/mike/wbm.nc', '/scratch/mike/nbm.zarr', '/scratch/mike/LatLongs.csv', '/scratch/mike/tmean.monthly.all.gz', '/scratch/mike/wb.soilstorage.cg.awc.gz', '/scratch/mike/tmean', '/scratch/mike/foo.zarr', '/scratch/mike/wb.pet.cg.awc.gz', '/scratch/mike/wb.aet.cg.awc.gz']
# Lon/lat location of each data column in the fixed-width files;
# columns 'X' (longitude) and 'Y' (latitude) are used below.
df = pd.read_csv(f'{inpath}/LatLongs.csv')
Determine the i,j locations on the grid corresponding to each lon,lat point:
# Map every point onto integer grid indices, assuming a regular
# 2.5-arc-minute (2.5/60 degree) spacing in both directions.
cell_deg = 2.5 / 60
ii = np.round((df['X'] - df['X'].min()) / cell_deg).astype('int')
jj = np.round((df['Y'] - df['Y'].min()) / cell_deg).astype('int')
# Grid dimensions follow from the largest index in each direction.
nx = ii.max() + 1
ny = jj.max() + 1
print(nx, ny)
1385 596
# Regular 1-D coordinate vectors spanning the observed extent.
# NOTE(review): the linspace step equals 2.5/60 only if the extremes are exactly
# (n-1) cells apart — consistent with the rounding above, but worth confirming.
lon = np.linspace(df['X'].min(), df['X'].max(),nx)
lat = np.linspace(df['Y'].min(), df['Y'].max(),ny)
# Monthly time axis for the template store. Use month-START stamps ('MS') so
# the coordinate matches the day-1 datetimes built in write_chunk from each
# file's year/month columns. (The original freq='M' produced month-END stamps,
# which disagreed with the timestamps later written into the store.)
dates = pd.date_range(start='1895-01-01 00:00', end='2020-12-01 00:00', freq='MS')
nt = len(dates)  # 126 years x 12 months = 1512 time steps
print(nt)
1512
# Zarr chunk sizes: 2x2 spatial chunks for the 1385x596 grid, and 120 time
# steps (10 years) per chunk — matching the `split -l 120` line count so one
# split file fills exactly one time chunk.
chunk_lon = 700
chunk_lat = 300
chunk_time = 120
fs.ls(f'{inpath}/gzfiles/')
['/scratch/mike//gzfiles/tmean', '/scratch/mike//gzfiles/snow', '/scratch/mike//gzfiles/aet', '/scratch/mike//gzfiles/runoff', '/scratch/mike//gzfiles/prcp', '/scratch/mike//gzfiles/rain', '/scratch/mike//gzfiles/pet', '/scratch/mike//gzfiles/soilstorage', '/scratch/mike//gzfiles/swe']
# Lazy all-zeros array used only as a template: it is never computed, just
# used to define the shape, dtype and chunking of the store's metadata.
d = dask.array.zeros((nt, ny, nx), chunks=(chunk_time, chunk_lat, chunk_lon), dtype='float32')

template_vars = ['prcp', 'tmean', 'aet', 'pet', 'rain', 'runoff',
                 'snow', 'soilstorage', 'swe']
ds0 = xr.Dataset(
    {name: (['time', 'lat', 'lon'], d) for name in template_vars},
    coords={
        "lon": (["lon"], lon),
        "lat": (["lat"], lat),
        "time": dates,
    },
)

# compute=False writes only the store metadata (and coordinates); the data
# variables are filled in later with region= writes.
ds0.to_zarr(outpath, mode='w', compute=False, consolidated=True)
Delayed('_finalize_store-3916485e-4c2e-4d10-8c6e-46da6e14315b')
def write_chunk(var, f, istart):
    """Read one split text file for `var` and write it into the zarr store.

    Parameters
    ----------
    var : str
        Variable name (the zarr array to write into).
    f : str
        Path to a fixed-width text file whose rows are [year, month, data...].
    istart : int
        Starting index along the time dimension of the zarr region to write.

    Uses module-level globals: ny, nx, ii, jj, lon, lat, chunk_* and outpath.
    """
    a = np.loadtxt(f, dtype='float32')
    year = a[:, 0].astype('int')
    mon = a[:, 1].astype('int')
    # Month-start timestamp for every row.
    t = [np.datetime64(dt.datetime(year[k], mon[k], 1)) for k in range(len(mon))]
    data = a[:, 2:]
    # Local time length renamed from `nt` to avoid shadowing the global `nt`.
    nt_chunk, nr = data.shape
    # Scatter the data columns back onto the 2-D grid; cells with no data stay NaN.
    b = np.full((nt_chunk, ny, nx), np.nan, dtype='float32')
    # Vectorized fancy-index assignment replaces the original per-column Python
    # loop (same semantics: only the first nr index pairs are used).
    b[:, np.asarray(jj)[:nr], np.asarray(ii)[:nr]] = data
    da = xr.DataArray(data=b, dims=['time', 'lat', 'lon'],
                      coords=dict(
                          lon=('lon', lon),
                          lat=('lat', lat),
                          time=('time', t)
                      ))
    ds = da.to_dataset(name=var)
    ds = ds.chunk(chunks={'time': chunk_time, 'lat': chunk_lat, 'lon': chunk_lon})
    # drop_vars: Dataset.drop is deprecated. lon/lat must be excluded from a
    # region write because they do not span the 'time' region being written.
    ds.drop_vars(['lon', 'lat']).to_zarr(outpath, region={'time': slice(istart, istart + nt_chunk)})
# Launch a local dask.distributed cluster with default settings.
client = Client()
%%time
# Build one delayed write task per split file. Files within a variable are
# numerically suffixed (var00, var01, ...), so the glob order gives each file's
# position along the time axis; enumerate replaces the original manual
# `i = 0 ... i = i + 1` counter.
tasks = []
for var in ['tmean', 'prcp', 'aet', 'pet', 'rain', 'runoff', 'snow', 'soilstorage', 'swe']:
    flist = fs.glob(f'/scratch/mike/gzfiles/{var}/{var}??')
    for i, f in enumerate(flist):
        print(f)
        istart = i * chunk_time  # offset of this file's region in the store
        tasks.append(dask.delayed(write_chunk)(var, f, istart))
/scratch/mike/gzfiles/tmean/tmean00 /scratch/mike/gzfiles/tmean/tmean01 /scratch/mike/gzfiles/tmean/tmean02 /scratch/mike/gzfiles/tmean/tmean03 /scratch/mike/gzfiles/tmean/tmean04 /scratch/mike/gzfiles/tmean/tmean05 /scratch/mike/gzfiles/tmean/tmean06 /scratch/mike/gzfiles/tmean/tmean07 /scratch/mike/gzfiles/tmean/tmean08 /scratch/mike/gzfiles/tmean/tmean09 /scratch/mike/gzfiles/tmean/tmean10 /scratch/mike/gzfiles/tmean/tmean11 /scratch/mike/gzfiles/tmean/tmean12 /scratch/mike/gzfiles/prcp/prcp00 /scratch/mike/gzfiles/prcp/prcp01 /scratch/mike/gzfiles/prcp/prcp02 /scratch/mike/gzfiles/prcp/prcp03 /scratch/mike/gzfiles/prcp/prcp04 /scratch/mike/gzfiles/prcp/prcp05 /scratch/mike/gzfiles/prcp/prcp06 /scratch/mike/gzfiles/prcp/prcp07 /scratch/mike/gzfiles/prcp/prcp08 /scratch/mike/gzfiles/prcp/prcp09 /scratch/mike/gzfiles/prcp/prcp10 /scratch/mike/gzfiles/prcp/prcp11 /scratch/mike/gzfiles/prcp/prcp12 /scratch/mike/gzfiles/aet/aet00 /scratch/mike/gzfiles/aet/aet01 /scratch/mike/gzfiles/aet/aet02 /scratch/mike/gzfiles/aet/aet03 /scratch/mike/gzfiles/aet/aet04 /scratch/mike/gzfiles/aet/aet05 /scratch/mike/gzfiles/aet/aet06 /scratch/mike/gzfiles/aet/aet07 /scratch/mike/gzfiles/aet/aet08 /scratch/mike/gzfiles/aet/aet09 /scratch/mike/gzfiles/aet/aet10 /scratch/mike/gzfiles/aet/aet11 /scratch/mike/gzfiles/aet/aet12 /scratch/mike/gzfiles/pet/pet00 /scratch/mike/gzfiles/pet/pet01 /scratch/mike/gzfiles/pet/pet02 /scratch/mike/gzfiles/pet/pet03 /scratch/mike/gzfiles/pet/pet04 /scratch/mike/gzfiles/pet/pet05 /scratch/mike/gzfiles/pet/pet06 /scratch/mike/gzfiles/pet/pet07 /scratch/mike/gzfiles/pet/pet08 /scratch/mike/gzfiles/pet/pet09 /scratch/mike/gzfiles/pet/pet10 /scratch/mike/gzfiles/pet/pet11 /scratch/mike/gzfiles/pet/pet12 /scratch/mike/gzfiles/rain/rain00 /scratch/mike/gzfiles/rain/rain01 /scratch/mike/gzfiles/rain/rain02 /scratch/mike/gzfiles/rain/rain03 /scratch/mike/gzfiles/rain/rain04 /scratch/mike/gzfiles/rain/rain05 /scratch/mike/gzfiles/rain/rain06 
/scratch/mike/gzfiles/rain/rain07 /scratch/mike/gzfiles/rain/rain08 /scratch/mike/gzfiles/rain/rain09 /scratch/mike/gzfiles/rain/rain10 /scratch/mike/gzfiles/rain/rain11 /scratch/mike/gzfiles/rain/rain12 /scratch/mike/gzfiles/runoff/runoff00 /scratch/mike/gzfiles/runoff/runoff01 /scratch/mike/gzfiles/runoff/runoff02 /scratch/mike/gzfiles/runoff/runoff03 /scratch/mike/gzfiles/runoff/runoff04 /scratch/mike/gzfiles/runoff/runoff05 /scratch/mike/gzfiles/runoff/runoff06 /scratch/mike/gzfiles/runoff/runoff07 /scratch/mike/gzfiles/runoff/runoff08 /scratch/mike/gzfiles/runoff/runoff09 /scratch/mike/gzfiles/runoff/runoff10 /scratch/mike/gzfiles/runoff/runoff11 /scratch/mike/gzfiles/runoff/runoff12 /scratch/mike/gzfiles/snow/snow00 /scratch/mike/gzfiles/snow/snow01 /scratch/mike/gzfiles/snow/snow02 /scratch/mike/gzfiles/snow/snow03 /scratch/mike/gzfiles/snow/snow04 /scratch/mike/gzfiles/snow/snow05 /scratch/mike/gzfiles/snow/snow06 /scratch/mike/gzfiles/snow/snow07 /scratch/mike/gzfiles/snow/snow08 /scratch/mike/gzfiles/snow/snow09 /scratch/mike/gzfiles/snow/snow10 /scratch/mike/gzfiles/snow/snow11 /scratch/mike/gzfiles/snow/snow12 /scratch/mike/gzfiles/soilstorage/soilstorage00 /scratch/mike/gzfiles/soilstorage/soilstorage01 /scratch/mike/gzfiles/soilstorage/soilstorage02 /scratch/mike/gzfiles/soilstorage/soilstorage03 /scratch/mike/gzfiles/soilstorage/soilstorage04 /scratch/mike/gzfiles/soilstorage/soilstorage05 /scratch/mike/gzfiles/soilstorage/soilstorage06 /scratch/mike/gzfiles/soilstorage/soilstorage07 /scratch/mike/gzfiles/soilstorage/soilstorage08 /scratch/mike/gzfiles/soilstorage/soilstorage09 /scratch/mike/gzfiles/soilstorage/soilstorage10 /scratch/mike/gzfiles/soilstorage/soilstorage11 /scratch/mike/gzfiles/soilstorage/soilstorage12 /scratch/mike/gzfiles/swe/swe00 /scratch/mike/gzfiles/swe/swe01 /scratch/mike/gzfiles/swe/swe02 /scratch/mike/gzfiles/swe/swe03 /scratch/mike/gzfiles/swe/swe04 /scratch/mike/gzfiles/swe/swe05 /scratch/mike/gzfiles/swe/swe06 
/scratch/mike/gzfiles/swe/swe07 /scratch/mike/gzfiles/swe/swe08 /scratch/mike/gzfiles/swe/swe09 /scratch/mike/gzfiles/swe/swe10 /scratch/mike/gzfiles/swe/swe11 /scratch/mike/gzfiles/swe/swe12 CPU times: user 18.3 ms, sys: 13.1 ms, total: 31.4 ms Wall time: 38.8 ms
%%time
# NOTE(review): scheduler='processes' runs on the local multiprocessing
# scheduler, bypassing the distributed Client created above — confirm
# this is intended (the Client is otherwise unused).
dask.compute(tasks, scheduler='processes', num_workers=4)
CPU times: user 35.5 s, sys: 6.84 s, total: 42.3 s Wall time: 15min 59s
([None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None],)
# Re-open the finished store lazily; chunks={} yields dask-backed arrays
# using the on-disk chunking (visible in the repr below).
ds2 = xr.open_dataset(outpath, engine='zarr', chunks={})
ds2
<xarray.Dataset> Dimensions: (lat: 596, lon: 1385, time: 1512) Coordinates: * lat (lat) float64 24.56 24.6 24.65 24.69 ... 49.27 49.31 49.35 * lon (lon) float64 -124.7 -124.6 -124.6 ... -67.1 -67.06 -67.02 * time (time) datetime64[ns] 1895-01-01 1895-02-01 ... 2020-12-01 Data variables: aet (time, lat, lon) float32 dask.array<chunksize=(120, 300, 700), meta=np.ndarray> pet (time, lat, lon) float32 dask.array<chunksize=(120, 300, 700), meta=np.ndarray> prcp (time, lat, lon) float32 dask.array<chunksize=(120, 300, 700), meta=np.ndarray> rain (time, lat, lon) float32 dask.array<chunksize=(120, 300, 700), meta=np.ndarray> runoff (time, lat, lon) float32 dask.array<chunksize=(120, 300, 700), meta=np.ndarray> snow (time, lat, lon) float32 dask.array<chunksize=(120, 300, 700), meta=np.ndarray> soilstorage (time, lat, lon) float32 dask.array<chunksize=(120, 300, 700), meta=np.ndarray> swe (time, lat, lon) float32 dask.array<chunksize=(120, 300, 700), meta=np.ndarray> tmean (time, lat, lon) float32 dask.array<chunksize=(120, 300, 700), meta=np.ndarray>
array([24.5625 , 24.604167, 24.645833, ..., 49.270865, 49.312532, 49.354199])
array([-124.6875 , -124.645833, -124.604167, ..., -67.10423 , -67.062564, -67.020897])
array(['1895-01-01T00:00:00.000000000', '1895-02-01T00:00:00.000000000', '1895-03-01T00:00:00.000000000', ..., '2020-10-01T00:00:00.000000000', '2020-11-01T00:00:00.000000000', '2020-12-01T00:00:00.000000000'], dtype='datetime64[ns]')
|
|
|
|
|
|
|
|
|
# Interactive map of mean temperature for one month, rasterized over OSM tiles.
ds2.tmean.sel(time='1925-01-01').hvplot.quadmesh(x='lon',y='lat', geo=True, tiles='OSM', cmap='turbo', rasterize=True, alpha=0.7)
# Same map for precipitation with no time selection — presumably hvplot
# renders a time-selection widget for the extra dimension; confirm.
ds2.prcp.hvplot.quadmesh(x='lon',y='lat', geo=True, tiles='OSM', cmap='turbo', rasterize=True, alpha=0.7)
ds2.tmean
<xarray.DataArray 'tmean' (time: 1512, lat: 596, lon: 1385)> dask.array<open_dataset-b974807f46d01968588aea2aeb5fcbf5tmean, shape=(1512, 596, 1385), dtype=float32, chunksize=(120, 300, 700), chunktype=numpy.ndarray> Coordinates: * lat (lat) float64 24.56 24.6 24.65 24.69 ... 49.23 49.27 49.31 49.35 * lon (lon) float64 -124.7 -124.6 -124.6 -124.6 ... -67.1 -67.06 -67.02 * time (time) datetime64[ns] 1895-01-01 1895-02-01 ... 2020-12-01
|
array([24.5625 , 24.604167, 24.645833, ..., 49.270865, 49.312532, 49.354199])
array([-124.6875 , -124.645833, -124.604167, ..., -67.10423 , -67.062564, -67.020897])
array(['1895-01-01T00:00:00.000000000', '1895-02-01T00:00:00.000000000', '1895-03-01T00:00:00.000000000', ..., '2020-10-01T00:00:00.000000000', '2020-11-01T00:00:00.000000000', '2020-12-01T00:00:00.000000000'], dtype='datetime64[ns]')
# Full 1895-2020 monthly time series at the grid cell nearest to (35N, 90W).
ds2.tmean.sel(lon=-90.,lat=35.,method='nearest').plot()
[<matplotlib.lines.Line2D at 0x15076443cca0>]