#!/usr/bin/env python # coding: utf-8 # # GeoPandas: Advanced topics # [Emilio Mayorga](https://github.com/emiliom/), 2017-2-6 # ## 1. Introduction # We covered the basics of GeoPandas in the previous episode and notebook. Here, we'll extend that introduction to illustrate additional aspects of GeoPandas and its interactions with other Python libraries, covering fancier mapping, analysis (unitary and binary spatial operators), raster zonal stats and geopandas. # # **Here are the main sections in this episode / notebook:** # - Read HydroBASINS Pacific NW / Western Washington from a shape file. # - Dissolve into larger watersheds, and reproject # - Plot `choropleth` map based on calculated watershed areas # - Choropleth map as an interactive map with folium # - Spatial join, `sjoin`, of polygons on points # - rasterstats: "zonal" statistics from polygons on rasters # ## 2. Set up packages and data file path # We'll use these throughout the rest of the tutorial. # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import os import json import matplotlib.pyplot as plt # The two statemens below are used mainly to set up a plotting # default style that's better than the default from matplotlib import seaborn as sns plt.style.use('bmh') from shapely.geometry import Point import pandas as pd import geopandas as gpd from geopandas import GeoSeries, GeoDataFrame data_pth = "../data" # ## 3. Read HydroBASINS North America / Western Washington # - Read [HydroBASINS](http://hydrosheds.org/page/hydrobasins) "all-levels" (lev00) hierarchical watersheds dataset for the Pacific NW, from a shape file. **Watersheds in the dataset are at the finest (highest resolution) "Pfastetter" hierarchical level, level 12.** # - In the [original UW GeoHack Week tutorial](https://geohackweek.github.io/vector/06-geopandas-advanced/), this exercise involved reading geospatial data from a remote PostGIS source. # In[2]: hydrobas = gpd.read_file(os.path.join(data_pth, "hybas_na_lev00_v1c.shp")) # In[3]: len(hydrobas) # We'll apply a filter to the dataset to select only the Pfastetter level-4 watershed with code 7831: `hydrobas['pfaf_4'] == 7831`. This is **most of Western Washington.** Watershed polygons will still be read at their original level 12 resolution. # In[4]: hydrobas_ww = hydrobas[hydrobas['pfaf_4'] == 7831] # 413 polygon features returned. Let's examine the attributes for the first feature. # In[5]: len(hydrobas_ww) # In[6]: hydrobas_ww.iloc[0] # **Plot a categorical map, with coloring based on the aggregating column `pfaf_7`.** Watershed boundaries are at the high-resolution Pfastetter level 12. # In[7]: hydrobas_ww.plot(column='pfaf_7', cmap='Paired', categorical=True, figsize=(14,6)); # ## 4. Dissolve into larger watersheds, and reproject # ### Dissolve source polygons into larger watersheds based on attribute values # Apply GeoDataFrame [dissolve](http://geopandas.org/aggregation_with_dissolve.html) aggregation method (implemented from lower level `shapely` operators) on level-7 Pfastetter codes (`pfaf_7`) shown in the plot above. Aggregate attributes, retaining only `pfaf_7` and `pfaf_6` (plus `geometry`, of course). Note that this operation results in only 17 polygons, from the original 413. # In[8]: cols = ['pfaf_6', 'pfaf_7', 'geometry'] hydrobas_ww_p7 = hydrobas_ww[cols].dissolve(by='pfaf_7', aggfunc='first', as_index=False) len(hydrobas_ww_p7) # In[9]: hydrobas_ww_p7.crs # The CRS was not retained during the dissolve operation. Apply it by copying it from `hydrobas_ww`. # In[10]: hydrobas_ww_p7.crs = hydrobas_ww.crs hydrobas_ww_p7.crs # Let's examine some of the features. # In[11]: hydrobas_ww_p7.head() # Plot the results. Looks like the previous plot, except the polygon boundaries are now the pfaf_7 watersheds (same as the background colors). # In[12]: hydrobas_ww_p7.plot(column='pfaf_7', cmap='Paired', categorical=True, figsize=(14,6)); # **_NOTE/WATCH:_** # **Beware that `dissolve` may fail if there are "invalid" geometries.** # See this code from the previous, intro notebook. The 6 geometries/points reported are invalid (and are reported by the `is_valid()` method). This dissolve statement does work, though. # ```python # seas_grp = seas[['oceans', 'geometry']] # seas_oceans_diss = seas_grp[seas_grp.geometry.is_valid].dissolve(by='oceans') # # Ring Self-intersection at or near point 10.407218181818182 54.821390909090908 # Self-intersection at or near point -79.365827272727287 76.296645454545455 # Ring Self-intersection at or near point 10.979445510225332 54.380555030408686 # Ring Self-intersection at or near point 133.61550925464189 -4.3005540903175188 # Ring Self-intersection at or near point 121.91067196634913 -5.0593090510592447 # Ring Self-intersection at or near point 115.29553592754269 -7.0082630551828515 # ``` # ### Reproject (transform) to WA State Plane South, epsg:2927 # Partly so we can calculate polygon areas in linear units, not geodetic degrees. But also because that's the projection used by most state and local governments in Washington. # - http://epsg.io/?q=2927 # - http://spatialreference.org/ref/epsg/2927/ # - [Report from http://www.epsg-registry.org](http://www.epsg-registry.org/report.htm?type=selection&entity=urn:ogc:def:crs:EPSG::2927&reportDetail=short&style=urn:uuid:report-style:default-with-code&style_name=OGP%20Default%20With%20Code&title=EPSG:2927) # No need to go to a web site to learn more about what `epsg:2927` is. Use `pyepsg`, which issues queries to http://epsg.io web services. # In[13]: import pyepsg # Extract the epsg code from the string returned by `crs['init']`. # In[14]: hydrobas_ww_p7_epsg_str = hydrobas_ww_p7.crs['init'].split(':')[1] pyepsg.get(hydrobas_ww_p7_epsg_str) # Query epsg `2927`. # In[15]: pyepsg.get('2927') # **Apply the crs transformation (reprojection)** using `to_crs` method. # In[16]: hydrobas_ww_p7_wasp = hydrobas_ww_p7.to_crs(epsg=2927) # #### Plot the reprojected map. # Note that, being in a planar project (not geodetic), the shape looks different compared to the previous map. More "normal". And the axes are now in `feet` relative to some origin. # In[17]: hydrobas_ww_p7_wasp.plot(column='pfaf_7', cmap='Paired', categorical=True, figsize=(14,6)); # ## 5. Plot `choropleth` map based on calculated watershed areas # As the projection is in `feet`, auto-calculated polygon areas will be in feet2. So let's convert to miles2 first (why not!). We'll add a new column to the GeoDataFrame. # In[18]: hydrobas_ww_p7_wasp['area_mi2'] = hydrobas_ww_p7_wasp.geometry.area / 27878400 hydrobas_ww_p7_wasp.head(3) # **_NOTE/FUN:_** # Now you could get the area of a pfaf_6 watershed via simple Pandas DataFrame `groupby` aggregation (sum). # Plot the choloropleth, using `area_mi2`. # # "fisher_jenks" value segmentation `scheme` (using 7 segments, k=7) used is one of the available `pysal.esda.mapclassify.Map_Classifier` classifiers from the powerful [PySAL package](http://pysal.github.io) (Python Spatial Analysis Library); GeoPandas can use these classifiers if PySAL is installed. To get the list of classifiers, use: # ```python # import pysal # print(pysal.esda.mapclassify.Map_Classifier.__doc__) # ``` # In[19]: f, ax = plt.subplots(1, figsize=(6, 4)) hydrobas_ww_p7_wasp.plot(column='area_mi2', scheme='fisher_jenks', k=7, alpha=0.9, cmap=plt.cm.Blues, legend=True, ax=ax) plt.axis('equal') ax.set_title('Watersheds by area ($mi^2$)'); # ## 6. Choropleth map as an interactive map with folium # [Folium](https://github.com/python-visualization/folium) is very cool, specially for use in Jupyter notebooks; or to export into stand-alone HTML. # In[20]: import folium # Folium likes GeoJSON for geometries. GeoPandas makes conversion easy. # In[21]: geo_str = hydrobas_ww_p7.to_json() # `m.choropleth` will do an attribute join between `geo_str` and `hydrobas_ww_p7_wasp` based on the key `pfaf_7`. # In[22]: m = folium.Map(location=[47.8, -122.5], zoom_start=7, tiles="cartodbpositron") m.choropleth(geo_str=geo_str, data=hydrobas_ww_p7_wasp, columns=['pfaf_7', 'area_mi2'], key_on='feature.properties.pfaf_7', fill_color='YlGn', fill_opacity=0.4, legend_name='area') m # This map is interactive, so play with it (zoom and pan). There is a lot more to explore in Folium! This is just a teaser. # ## 7. Spatial join, `sjoin`, of polygons on points # We'll use an old, local snapshot of NANOOS coastal and marine monitoring stations in the Pacific NW, from the [NANOOS Visualization System (NVS) Data Explorer](http://nvs.nanoos.org/Explorer). While many stations are moorings on marine waters, some are inshore or in tidal shores and will overlap the watershed boundaries. # The point file is in the [GeoPackage](http://www.geopackage.org) format, an OGC format implemented in SQLite. # In[23]: nanoosstations_gdf = gpd.read_file(os.path.join(data_pth, "nanoos_nvs.gpkg")) len(nanoosstations_gdf) # In[24]: nanoosstations_gdf.iloc[-1] # Points in the coasts of the Pacific NW (BC, WA, OR) and out in the open ocean. # In[25]: nanoosstations_gdf.plot(markersize=4); # **Apply "inner" spatial join with `sjoin` operator** from the `gpd.tools` package. An inner join will retain only overlapping features. Then plot as a map overlay on top of `hydrobas_ww_p7`, categorizing (coloring) points by the `pfaf_6` watershed they're in. # In[26]: nanoossta_hydrobas_ww_gdf = gpd.tools.sjoin(nanoosstations_gdf, hydrobas_ww_p7, how="inner") len(nanoossta_hydrobas_ww_gdf) # In[27]: f, ax = plt.subplots(1, figsize=(6, 6)) plt.axis('equal') hydrobas_ww_p7.plot(ax=ax) nanoossta_hydrobas_ww_gdf.plot(column='pfaf_6', markersize=6, categorical=True, legend=True, ax=ax); # ## 8. rasterstats: "zonal" statistics from polygons on rasters # We'll end by mixing features from a GeoDataFrame with a raster, applying zonal statistics using the cool and light weight [rasterstats](http://pythonhosted.org/rasterstats/) package. # Monthly Juy long-term climatology precipitation. The original monthly time series data are from the [PRISM Climate Group](http://prism.oregonstate.edu); the monthly climatology and Pacific NW clip were created by your truly and Don Setiawan for the [BiGCZ project](http://bigcz.org). # In[28]: ppt_july_tif_pth = os.path.join(data_pth, 'prism_precipitation_july_climatology.tif') # ### rasterio # Rasterstas uses [rasterio](https://mapbox.github.io/rasterio) to read rasters (and `fiona` to read vector datasets), so we'll first do a quick exploration of rasterio. # In[29]: import rasterio import rasterio.plot as rioplot # In[30]: ppt_july = rasterio.open(ppt_july_tif_pth) ppt_july # Examine the metadata read from the raster file (we can confirm CRS is epsg:4326), then plot the raster. # In[31]: ppt_july.meta # **_NOTE/WATCH:_** # `with_bounds=True` is supposed to display the X & Y coordinate labels (lat & lon) from the metadata. We don't why this isn't working. # In[32]: rioplot.show(ppt_july, with_bounds=True, cmap=plt.cm.Blues); # ### Apply rasterstas `zonal_stats` # Apply `zonal_stats` from `rasterstats` package. Can pass a `GeoDataFrame` directly (instead of the file path to a GIS file) because it implements our old friend, the `__geo_interface__` method. For the raster, we pass its file path. # # `zonal_stats` returns a geojson with the original properties plus the zonal statistics. # In[33]: import rasterstats as rs # In[34]: zonal_ppt_gjson = rs.zonal_stats(hydrobas_ww_p7, ppt_july_tif_pth, prefix='pptjuly_', geojson_out=True) # In[35]: len(zonal_ppt_gjson) # In[36]: zonal_ppt_gdf = GeoDataFrame.from_features(zonal_ppt_gjson) zonal_ppt_gdf.head(2) # #### And finally, a choropleth map of July precipitation by watershed! # In[37]: f, ax = plt.subplots(1, figsize=(6, 5)) zonal_ppt_gdf.plot(column='pptjuly_mean', scheme='Equal_Interval', k=5, alpha=1, cmap=plt.cm.Blues, legend=True, ax=ax) plt.axis('equal') ax.set_title('Mean July precipitation ($mm/month$) by watershed'); # In[ ]: