import s3fs
import h5py
import numpy as np
import xarray as xr
import dask
import dask.array as dsa
import tqdm
from matplotlib import pyplot as plt
import pickle
This uses spot instances.
If executing from a different environment, replace this with a different type of dask cluster (e.g. LocalCluster
).
import dask_gateway
gateway = dask_gateway.Gateway()
options = gateway.cluster_options()
options.worker_memory = 24
cluster = gateway.new_cluster(options)
cluster
VBox(children=(HTML(value='<h2>GatewayCluster</h2>'), HBox(children=(HTML(value='\n<div>\n<style scoped>\n …
from dask.distributed import Client
cluster.scale(30)
client = Client(cluster)
client
Client
|
Cluster
|
fs = s3fs.S3FileSystem(anon=True)
fs
<s3fs.core.S3FileSystem at 0x7ff1f59aed50>
Listing all the files from s3 takes a few minutes.
# cache the file listing to speed things up
try:
with open('all_files.pkl', 'rb') as f:
all_files = pickle.load(f)
except FileNotFoundError:
base = 'noaa-goes16/ABI-L2-RRQPEF/2020/'
top_level_dirs = fs.ls(base)
all_files = []
for tld in tqdm.tqdm(top_level_dirs):
all_files.append(fs.find(tld))
with open('all_files.pkl', 'wb') as f:
pickle.dump(all_files, f)
flat_list = []
for sublist in all_files:
for item in sublist:
flat_list.append(item)
len(flat_list)
29059
tbounds0 = ds_example.time_bounds.values
example_url = flat_list[1]
f = fs.open(example_url)
ds_example = xr.open_dataset(f, decode_cf=True)
ds_example
<xarray.Dataset> Dimensions: (number_of_LZA_bounds: 2, number_of_SZA_bounds: 2, number_of_image_bounds: 2, number_of_lat_bounds: 2, number_of_rainfall_rate_bounds: 2, number_of_time_bounds: 2, x: 5424, y: 5424) Coordinates: t datetime64[ns] 2020-01-01T00:15:0... * y (y) float32 0.151844 ... -0.151844 * x (x) float32 -0.151844 ... 0.151844 y_image float32 0.0 x_image float32 0.0 retrieval_local_zenith_angle float32 90.0 quantitative_local_zenith_angle float32 70.0 solar_zenith_angle float32 180.0 latitude float32 60.0 accounted_rainfall_rate float32 1.0 Dimensions without coordinates: number_of_LZA_bounds, number_of_SZA_bounds, number_of_image_bounds, number_of_lat_bounds, number_of_rainfall_rate_bounds, number_of_time_bounds Data variables: RRQPE (y, x) float32 ... DQF (y, x) float32 ... time_bounds (number_of_time_bounds) datetime64[ns] ... goes_imager_projection int32 -2147483647 y_image_bounds (number_of_image_bounds) float32 ... x_image_bounds (number_of_image_bounds) float32 ... nominal_satellite_subpoint_lat float64 0.0 nominal_satellite_subpoint_lon float64 -75.2 nominal_satellite_height float64 3.579e+04 geospatial_lat_lon_extent float32 9.96921e+36 total_pixels_with_rain float64 4.075e+05 total_rain_volume float64 2.897e+06 total_pixels_with_successful_retrieval float64 2.028e+07 rainfall_rate_outlier_pixel_count float64 3.642e+03 minimum_rainfall_rate float64 8.64e-05 maximum_rainfall_rate float64 100.0 mean_rainfall_rate float64 6.51 standard_deviation_rainfall_rate float64 9.896 algorithm_dynamic_input_data_container int32 -2147483647 processing_parm_version_container int32 -2147483647 algorithm_product_version_container int32 -2147483647 retrieval_local_zenith_angle_bounds (number_of_LZA_bounds) float32 0.... quantitative_local_zenith_angle_bounds (number_of_LZA_bounds) float32 0.... solar_zenith_angle_bounds (number_of_SZA_bounds) float32 0.... latitude_bounds (number_of_lat_bounds) float32 -6... accounted_rainfall_rate_bounds (number_of_rainfall_rate_bounds) float32 ... percent_uncorrectable_GRB_errors float64 0.0 percent_uncorrectable_L0_errors float64 0.0 Attributes: naming_authority: gov.nesdis.noaa Conventions: CF-1.7 Metadata_Conventions: Unidata Dataset Discovery v1.0 standard_name_vocabulary: CF Standard Name Table (v35, 20 July 2016) institution: DOC/NOAA/NESDIS > U.S. Department of Commerce,... project: GOES production_site: NSOF production_environment: OE spatial_resolution: 2km at nadir orbital_slot: GOES-East platform_ID: G16 instrument_type: GOES R Series Advanced Baseline Imager scene_id: Full Disk instrument_ID: FM1 dataset_name: OR_ABI-L2-RRQPEF-M6_G16_s20200010010216_e20200... iso_series_metadata_id: 3a3268a0-b006-11e1-afa6-0800200c9a66 title: ABI L2 Rainfall Rate - Quantitative Precipitat... summary: The Rainfall Rate - Quantitative Precipitation... keywords: ATMOSPHERE > PRECIPITATION > PRECIPITATION RATE keywords_vocabulary: NASA Global Change Master Directory (GCMD) Ear... license: Unclassified data. Access is restricted to ap... processing_level: National Aeronautics and Space Administration ... date_created: 2020-01-01T00:20:03.5Z cdm_data_type: Image time_coverage_start: 2020-01-01T00:10:21.6Z time_coverage_end: 2020-01-01T00:19:52.4Z timeline_id: ABI Mode 6 production_data_source: Realtime id: 1bf00d5c-8e39-446c-8f9a-b276a80dbe70
array('2020-01-01T00:15:07.050812928', dtype='datetime64[ns]')
array([ 0.151844, 0.151788, 0.151732, ..., -0.151732, -0.151788, -0.151844], dtype=float32)
array([-0.151844, -0.151788, -0.151732, ..., 0.151732, 0.151788, 0.151844], dtype=float32)
array(0., dtype=float32)
array(0., dtype=float32)
array(90., dtype=float32)
array(70., dtype=float32)
array(180., dtype=float32)
array(60., dtype=float32)
array(1., dtype=float32)
[29419776 values with dtype=float32]
[29419776 values with dtype=float32]
array(['2020-01-01T00:10:21.653735936', '2020-01-01T00:19:52.447890048'], dtype='datetime64[ns]')
array(-2147483647, dtype=int32)
array([ 0.151872, -0.151872], dtype=float32)
array([-0.151872, 0.151872], dtype=float32)
array(0.)
array(-75.199997)
array(35786.023438)
array(9.96921e+36, dtype=float32)
array(407487.)
array(2897253.75)
array(20276620.)
array(3642.)
array(8.64e-05)
array(99.995682)
array(6.510106)
array(9.895539)
array(-2147483647, dtype=int32)
array(-2147483647, dtype=int32)
array(-2147483647, dtype=int32)
array([ 0., 90.], dtype=float32)
array([ 0., 70.], dtype=float32)
array([ 0., 180.], dtype=float32)
array([-60., 60.], dtype=float32)
array([ 1., 100.], dtype=float32)
array(0.)
array(0.)
ds_example.RRQPE
<xarray.DataArray 'RRQPE' (y: 5424, x: 5424)> [29419776 values with dtype=float32] Coordinates: t datetime64[ns] 2020-01-01T00:05:07.05408... * y (y) float32 0.151844 0.151788 ... -0.151844 * x (x) float32 -0.151844 ... 0.151844 y_image float32 0.0 x_image float32 0.0 retrieval_local_zenith_angle float32 90.0 quantitative_local_zenith_angle float32 70.0 solar_zenith_angle float32 180.0 latitude float32 60.0 accounted_rainfall_rate float32 1.0 Attributes: long_name: ABI L2+ Rainfall Rate - Quantitative Precipitation ... standard_name: rainfall_rate valid_range: [ 0 -6] units: mm h-1 resolution: y: 0.000056 rad x: 0.000056 rad grid_mapping: goes_imager_projection cell_methods: latitude: point (good quality pixel produced) retri... ancillary_variables: DQF
[29419776 values with dtype=float32]
array('2020-01-01T00:05:07.054081024', dtype='datetime64[ns]')
array([ 0.151844, 0.151788, 0.151732, ..., -0.151732, -0.151788, -0.151844], dtype=float32)
array([-0.151844, -0.151788, -0.151732, ..., 0.151732, 0.151788, 0.151844], dtype=float32)
array(0., dtype=float32)
array(0., dtype=float32)
array(90., dtype=float32)
array(70., dtype=float32)
array(180., dtype=float32)
array(60., dtype=float32)
array(1., dtype=float32)
ds_example.RRQPE.encoding
{'chunksizes': (226, 226), 'fletcher32': False, 'shuffle': False, 'zlib': True, 'complevel': 1, 'source': '<File-like object S3FileSystem, noaa-goes16/ABI-L2-RRQPEF/2020/001/00/OR_ABI-L2-RRQPEF-M6_G16_s20200010000216_e20200010009524_c20200010010034.nc>', 'original_shape': (5424, 5424), 'dtype': dtype('int16'), '_Unsigned': 'true', '_FillValue': array([65535], dtype=uint16), 'scale_factor': array([0.00152602], dtype=float32), 'add_offset': array([0.], dtype=float32), 'coordinates': 'latitude retrieval_local_zenith_angle quantitative_local_zenith_angle solar_zenith_angle t y x'}
def load_data(url):
"""Extract just the data from a single file."""
f = fs.open(url)
# handles decoding of the data automatically, e.g. scaling, units, etc
ds = xr.open_dataset(f)
return ds['RRQPE'].values
data0 = load_data(example_url)
np.nansum(data0)
3618519.0
from matplotlib.colors import LogNorm
fig, ax = plt.subplots(figsize=(12, 12), subplot_kw={'aspect': 'equal'})
pc = ax.pcolormesh(np.ma.masked_invalid(data0)[::4, ::-4].transpose())
plt.colorbar(pc)
<matplotlib.colorbar.Colorbar at 0x7ff1e17f5c10>
This uses dask's delayed syntax to load the data lazily.
%%time
shape = data0.shape
dtype = data0.dtype
all_arrays = []
for url in flat_list:
array = dsa.from_delayed(dask.delayed(load_data)(url),
shape, dtype=dtype)
all_arrays.append(array)
all_data = dsa.stack(all_arrays)
all_data
CPU times: user 5.2 s, sys: 109 ms, total: 5.31 s Wall time: 5.3 s
|
%%time
all_data_sum = dsa.nansum(all_data, axis=0).compute()
CPU times: user 39.1 s, sys: 779 ms, total: 39.9 s Wall time: 11min 17s
cluster.scale(0)
# units are in mm/hr, but at 10 min sample, so we must divide by 6 for hourly measure
total_mm = all_data_sum / 6
from matplotlib.colors import LogNorm
fig, ax = plt.subplots(figsize=(12, 12), subplot_kw={'aspect': 'equal'})
pc = ax.pcolormesh(total_mm[::-4, ::4])
plt.colorbar(pc)
pc.set_clim([0, 10000])
cluster.close()
distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client _GatheringFuture exception was never retrieved future: <_GatheringFuture finished exception=CancelledError()> concurrent.futures._base.CancelledError