Here we process a time stack of drone-derived beach topography, using the rasterio backend to xarray to read the cloud-optimized GeoTIFF data. We use xarray's coordinate slicing to extract the same region of interest from each map, and xarray's built-in Dask support to process the data in parallel on a distributed cluster. This preliminary data was collected by Chris Sherwood (csherwood@usgs.gov) and is for analysis and visualization demonstration purposes only: it has not yet been published.
%matplotlib inline
import json
import numpy as np
import pandas as pd
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import xarray as xr
import os
from dask.distributed import Client, progress, LocalCluster
pangeo = 'ESIP-AWS-S3'

if pangeo == 'ESIP-AWS-S3':
    # use direct access from S3 if credentials exist, otherwise use the public HTTPS address
    if os.environ.get('AWS_ACCESS_KEY_ID') and os.environ.get('AWS_SECRET_ACCESS_KEY'):
        fdir = 's3://rsignell/crs_maps'
    else:
        fdir = 'https://rsignell.s3.amazonaws.com/crs_maps'
    from dask_kubernetes import KubeCluster
    cluster = KubeCluster()
    cluster.scale(10);

if pangeo == 'USGS-HPC-YETI':
    fdir = '/lustre/projects/hazards/cmgp/woodshole/rsignell/crs_maps'
    from dask_jobqueue import SLURMCluster
    cluster = SLURMCluster(processes=4, threads=1, memory='16GB',
                           project='woodshole', walltime='01:00:00', queue='normal')
    cluster.start_workers(10)
cluster
client = Client(cluster)
Check that the Jupyter notebook and the Dask workers are using the same package versions. This call will raise an error if there is a mismatch.
out = client.get_versions(check=True)
fnames = (
"2016-01-22_SandwichTNB_PT_DEM_10cm_trimmed.tif",
"2016-01-25_SandwichTNB_DEM_10cm_trimmed.tif",
"2016-02-11_SandwichTNB_DEM_10cm_trimmed.tif",
"2016-03-30_SandwichTNB_AS_DEM_10cm_trimmed.tif",
"2016-09-21_SandwichTNB_DEM_10cm_trimmed.tif",
"2017-01-09_SandwichTNB_DEM_10cm_trimmed.tif",
"2017-01-25_SandwichTNB_DEM_10cm_trimmed.tif",
"2017-02-14_SandwichTNB_DEM_10cm_trimmed.tif",
"2017-03-16_SandwichTNB_DEM_10cm_trimmed.tif",
"2017-04-28_SandwichTNB_DEM_10cm_trimmed.tif",
"2017-05-04_SandwichTNB_DEM_10cm_trimmed.tif",
"2017-09-18_SandwichTNB_DEM_10cm_trimmed.tif",
"2018-01-10_SandwichTNB_DEM_10cm_trimmed.tif",
"DEM_10cm_selfcal_lokicampos_mid_denseNAD83UTM19.tif"
)
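The survey date is encoded at the start of most of these filenames. As an aside, pandas (imported above) could turn them into timestamps if you later want a real time coordinate for the stack; this is just a sketch, and note that the final calibration DEM has no date prefix, so it parses to NaT.
# Hypothetical helper: parse the leading YYYY-MM-DD from each filename.
# errors='coerce' turns the undated last file into NaT instead of raising.
dates = pd.to_datetime([f[:10] for f in fnames], errors='coerce')
dates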
Each geotiff covers a different region, but all files have coordinates in NAD83 UTM Zone 19N meters.
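If you want to verify that, here is a minimal sketch (not part of the workflow) that uses rasterio directly to inspect the CRS, pixel size, and bounds of the first file; it assumes fdir is reachable from this session.
# Optional sanity check with rasterio:
import rasterio
with rasterio.open(f'{fdir}/{fnames[0]}') as src:
    print(src.crs)     # expect NAD83 / UTM zone 19N (EPSG:26919)
    print(src.res)     # expect (0.1, 0.1), i.e. 10 cm pixels
    print(src.bounds)  # extent of this particular survey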
# top left corner of region of interest (UTM Zone 19N meters)
e0 = 376488.; n0 = 4625200.
# size of region in meters
xsize, ysize = 800., 680.
e1 = e0 + xsize
n1 = n0 - ysize
The subsetting operation returns arrays that all have the same size, but with tiny (round-off level) differences in their coordinate values, so below we compute offset coordinates from the first array and assign them to every array in the stack.
%%time
dalist = []
for fname in fnames:
    da = xr.open_rasterio(f'{fdir}/{fname}', chunks={'band': 1, 'y': 6800, 'x': 8000})
    da = da.sel(x=slice(e0, e1), y=slice(n0, n1))
    if len(dalist) == 0:
        xc = da['x'] - da['x'].values.min()
        yc = da['y'] - da['y'].values.min()
    da['x'] = xc
    da['y'] = yc
    dalist.append(da)
CPU times: user 872 ms, sys: 56 ms, total: 928 ms Wall time: 4.21 s
dastack = xr.concat(dalist, dim='band')
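As an aside, xarray versions that support the join argument on concat can do the coordinate override for us, rewriting the x and y indexes of every array to match the first one; a hedged alternative to the manual copying above:
# Alternative, if your xarray supports join on concat:
# dastack = xr.concat(dalist, dim='band', join='override')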
Mask the bad data (where values are -32767).
dastack = dastack.where(dastack!=-32767.)
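Because the reductions below use skipna=False, the mean and standard deviation are only defined where every survey has valid data. As an optional check (not in the original workflow), you could count how many surveys cover each pixel:
# Number of valid (non-NaN) surveys per pixel, plotted at reduced resolution.
nobs = dastack.count(dim='band')
nobs[::10, ::10].plot.imshow();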
zout = dastack.mean(dim='band', skipna=False).persist()
progress(zout)
%%time
zout[::10,::10].plot.imshow(vmin=-2, vmax=8);
CPU times: user 7.53 s, sys: 1.07 s, total: 8.6 s Wall time: 30.7 s
[Figure: mean elevation across all surveys, plotted at every 10th pixel]
zout = dastack.std(dim='band', skipna=False).persist()
progress(zout)
%%time
zout[::10,::10].plot.imshow(vmin=0, vmax=1);
CPU times: user 10.7 s, sys: 1.36 s, total: 12.1 s Wall time: 29.7 s
[Figure: standard deviation of elevation across all surveys, plotted at every 10th pixel]
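To keep a result rather than just view it, here is a minimal sketch (the variable name and output filename are made up) that names the unnamed DataArray and writes it to netCDF:
# Hypothetical save step: the computation happens on write since zout is dask-backed.
zout.rename('z_std').to_netcdf('sandwich_elevation_std.nc')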