#!/usr/bin/env python
# coding: utf-8

# # Getting Started with RasterFrames Notebook

# ## Setup Spark Environment

# In[1]:


import pyrasterframes
from pyrasterframes.utils import create_rf_spark_session
import pyrasterframes.rf_ipython  # enables nicer visualizations of pandas DF
from pyrasterframes.rasterfunctions import *
import pyspark.sql.functions as F


# In[2]:


# Spark session created through the RasterFrames helper; every later cell
# reads rasters through this session.
spark = create_rf_spark_session()


# ### Get a PySpark DataFrame from [open data](https://docs.opendata.aws/modis-pds/readme.html)
#
# Read a single "granule" or scene of MODIS surface reflectance data.

# In[3]:


uri = (
    'https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059'
    '/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF'
)
df = spark.read.raster(uri)


# In[4]:


df.printSchema()


# Do some work with the raster data; add 3 element-wise to the pixel/cell
# values and show some rows of the DataFrame.

# In[5]:


df.select(rf_local_add(df.proj_raster, F.lit(3)))


# The extent struct tells us where in the
# [CRS](https://spatialreference.org/ref/sr-org/6842/) the tile data covers.
# The granule is split into arbitrarily sized chunks. Each row is a different
# chunk. Let's see how many.
#
# Side note: you can configure the default size of these chunks, which are
# called Tiles, by passing a tuple of desired columns and rows as:
# `raster(uri, tile_dimensions=(96, 96))`. The default is `(256, 256)`

# In[6]:


df.count()


# What area does the DataFrame cover?

# In[7]:


# Aggregate down to the first row's CRS and pull out its `crsProj4` field.
crs_column = F.first(rf_crs(df.proj_raster)).crsProj4.alias('crs')
crs = df.agg(crs_column).first()['crs']
print(crs)

# Turn each chunk's extent into a geometry and reproject it from the raster's
# CRS to EPSG:4326 so the footprints can be drawn on a web map.
footprint_column = st_reproject(
    st_geometry(rf_extent(df.proj_raster)),
    rf_mk_crs(crs),
    rf_mk_crs('EPSG:4326'),
).alias('footprint')
coverage_area = df.select(df.proj_raster_path, footprint_column)
coverage_area


# So where in the world is that? We'll generate a little visualization with
# Leaflet in the notebook using Folium.
# In[8]:


import geopandas
import folium


# In[9]:


# Collect the reprojected footprints to the driver as a GeoDataFrame.
# The CRS is given as an authority string: the older `{'init': 'EPSG:4326'}`
# dict form relies on the deprecated PROJ "+init" syntax and emits warnings
# with current pyproj/geopandas releases.
gdf = geopandas.GeoDataFrame(
    coverage_area.select('footprint').toPandas(),
    geometry='footprint',
    crs='EPSG:4326',
)


# In[ ]:


# Draw the footprints as a GeoJSON overlay on a map centred at (5, -65).
folium.Map((5, -65), zoom_start=6) \
    .add_child(folium.GeoJson(gdf.__geo_interface__))


# Look at a sample of the data. You may find it useful to double-click the
# tile image column to see larger or smaller rendering of the image.

# In[11]:


# Look at a sample: the source path, extent, geometry and tile of five rows,
# brought to the driver as a pandas DataFrame.
pandas_df = df.select(
    df.proj_raster_path,
    rf_extent(df.proj_raster).alias('extent'),
    rf_geometry(df.proj_raster).alias('geo'),
    rf_tile(df.proj_raster).alias('tile'),
).limit(5).toPandas()
pandas_df


# In[ ]: