#!/usr/bin/env python # coding: utf-8 # ## Filtering a Data Observatory dataset using "Who's On First" in CARTOFrames # This notebook illustrates how to use the admin. region geometries from Who's on First (public data) to filter a dataset from CARTO's [Data Observatory](https://carto.com/spatial-data-catalog/) using [CARTOFrames](https://carto.com/cartoframes/) methods. # # The notebook is organized as follows: # 0. Setup account # 1. Access a dataset from a Data Observatory subscription to be filtered # 2. Who's on First for filtering data in cities # # **Documentation** # - CARTO Spatial Data Catalogue - [link](https://carto.com/spatial-data-catalog/browser/) # - CARTOFrames technical documentation - [link](https://carto.com/developers/cartoframes/) # - "Who's on First" GeoJSON data product - [link](https://carto.com/spatial-data-catalog/browser/geography/wof_geojson_4e78587c/data) # ### 0. Setup # In[1]: import geopandas as gpd import pandas as pd from cartoframes.auth import set_default_credentials from cartoframes.data.observatory import * from cartoframes.viz import * pd.set_option('display.max_columns', None) # In[2]: set_default_credentials('creds.json') # ### 1. Access a dataset from a Data Observatory subscription to be filtered # First, we check our data subscriptions from the Data Observatory to select which dataset we want to filter. # In[3]: Catalog().subscriptions().datasets.to_dataframe() # We identify the slug_id from the dataset we want to use. For example this one: # In[4]: SpatialFeatures_esp_qk15 = Dataset.get('cdb_spatial_fea_d23a5c97') # In[5]: SpatialFeatures_esp_qk15.to_dict() # In[6]: SpatialFeatures_esp_qk15.tail() # ### 2. Who's On First GeoJSON for filtering data in cities # CARTO's Data Observatory also provides direct access to a group of public datasets. You can navigate and explore our Spatial Data Catalog from within your Python notebook with the Data Discovery methods in CARTOFrames or using our [Spatial Data Catalog](https://carto.com/spatial-data-catalog/browser/?license=public). # # ["Who's on First"](https://whosonfirst.org/) is a gazetteer (o big list) of places, each with a stable identifier and some number of descriptive properties about that location. # # # We can use the WoF GeoJSON to find the city boundaries to use then for filtering the data from other datasets from the Data Observatory. # In[7]: Catalog().provider('whos_on_first').public().geographies.to_dataframe() # Note that the ID to access the WoF GeoJSON table is 'wof_geojson_4e78587c'. # In[8]: wof_geojson = Geography.get('wof_geojson_4e78587c') # In[9]: wof_geojson.to_dict() # Now we are going to perform a query to the table in order to retrieve the different geometries given a city name and a country ISO Alpha-2 code. As we are looking for city boundaries, we can also limit our search to the placetype = 'locality' if we find that this is the specific type of place for our needs. # In[10]: city_name = "'Madrid'" country_code = "'ES'" placetype = "'locality'" sql_query = f"SELECT * FROM $geography$ WHERE name = {city_name} AND country = {country_code}" wof_geojson_filtered = wof_geojson.to_dataframe(sql_query=sql_query) wof_geojson_filtered # It may happen that we find that there are more than one locality with the same name. In order to select the right polygon, we can build a map with a category widget that will allow us to decide for the specific geometry that we are looking for. # In[11]: wof_geojson_filtered['geoid_str'] = wof_geojson_filtered['geoid'].astype(str) Map( Layer( wof_geojson_filtered, # where the data comes from color_category_style('geoid_str',palette='Vivid',opacity=0.6,stroke_width=0.2), widgets=[category_widget('geoid_str','Select geoid to visualize')], popup_hover=[popup_element('geoid','geoid'), popup_element('name','name'), popup_element('placetype','placetype')], legends=color_category_legend('Geoid'), geom_col='geom', #the name of the column on the query that has a GEOGRAPHY data encode_data = False ) ) # Once we know which geometry (i.e. polygon of city boundaries) is the right one for our tests, we should copy/note its associated geoid. # # As in this example we want to filter the data for Madrid, we will use geoid = ' 101748283'. # In[12]: """Helper function for downloading only the data within the area (geometry) of interest Args: do_dataset: DO Dataset you'd like to download for a specific area of interest do_geom_dataset: DO Dataset containing the geometry you'd like to use as filter (your area of interest) target_geoid: geoid of the geometry you'd like to use as filter (your area of interest) """ def filter_data(do_dataset, do_geom_dataset, target_geoid): do_geom_dataset_id=do_geom_dataset.id sql_query = f"""WITH do_geom AS ( SELECT geom FROM `{do_geom_dataset_id}` WHERE geoid = '{target_geoid}') SELECT do_d.* FROM $dataset$ do_d, do_geom WHERE ST_Intersects(do_d.geom, do_geom.geom)""" filtered_data = do_dataset.to_dataframe(sql_query = sql_query) return filtered_data # In[13]: SpatialFeatures_esp_qk15_madrid = filter_data(SpatialFeatures_esp_qk15,wof_geojson,'101748283') SpatialFeatures_esp_qk15_madrid.head() # In[14]: Map( Layer( SpatialFeatures_esp_qk15_madrid, # where the data comes from color_continuous_style('population', palette=palettes.magenta,opacity=0.75,stroke_width=0.5), widgets=[formula_widget('population','sum','Total Population'),histogram_widget('population','Population by cell')], popup_hover=[popup_element('population','Population')], geom_col='geom', #the name of the column on the query that has a GEOGRAPHY data encode_data = False ) )