#!/usr/bin/env python
# coding: utf-8

# # GIS with ![Python](https://www.python.org/static/community_logos/python-logo-inkscape.svg) and ![IPython](https://ipython.org/_static/IPy_header.png)

# # Getting some data
# There are many sources of GIS data. Here are some useful links:
# 
# * [WorldMap](http://worldmap.harvard.edu/)
# * [FAO's GeoNetwork](http://www.fao.org/geonetwork)
# * [IPUMS USA Boundary files for Censuses](https://usa.ipums.org/usa/volii/boundaries.shtml)
# * [IPUMS International Boundary files for Censuses](https://international.ipums.org/international/gis.shtml)
# * [GADM database of Global Administrative Areas](http://www.gadm.org/)
# * [Global Administrative Unit Layers](http://www.fao.org/geonetwork/srv/en/metadata.show?id=12691)
# * [Natural Earth](http://www.naturalearthdata.com/): All kinds of geographical, cultural and socioeconomic variables
# * [Global Map](http://www.iscgm.org/cgi-bin/fswiki/wiki.cgi?page=Summary)
# * [Digital Chart of the World](http://worldmap.harvard.edu/data/geonode:Digital_Chart_of_the_World)
# * [Sage](http://www.sage.wisc.edu/mapsdatamodels.html) and [Sage Atlas](http://www.sage.wisc.edu/atlas/maps.php)
# * [Caloric Suitability Index CSI](https://ozak.github.io/Caloric-Suitability-Index/): Agricultural suitability data
# * [Ramankutty's Datasets on land use, crops, etc.](http://www.geog.mcgill.ca/~nramankutty/Datasets/Datasets.html)
# * [SEDAC at Columbia University](http://sedac.ciesin.columbia.edu/data/sets/browse): Gridded Population, Hazards, etc.
# * [World Port Index](http://msi.nga.mil/NGAPortal/MSI.portal?_nfpb=true&_pageLabel=msi_portal_page_62&pubCode=0015)
# * [USGS elevation maps](http://eros.usgs.gov/elevation-products)
# * [NOAA's Global Land One-km Base Elevation Project (GLOBE)](http://www.ngdc.noaa.gov/mgg/topo/globe.html)
# * [NOAA Nightlight data](http://ngdc.noaa.gov/eog/download.html): This is the data used by Henderson, Storeygard, and Weil AER 2012 paper.
# * [Other NOAA Data](http://www.ngdc.noaa.gov/ngdcinfo/onlineaccess.html)
# * [GEcon](http://gecon.yale.edu/)
# * [OpenStreetMap](http://openstreetmap.org)
# * [U.S. Census TIGER](http://www.census.gov/geo/maps-data/data/tiger.html)
# * [Geo-referencing of Ethnic Groups](http://www.icr.ethz.ch/data/other/greg)
# 
# See also [Wikipedia links](http://en.wikipedia.org/wiki/List_of_GIS_data_sources)

# # Set-up
# Let's import the packages we will use and set the paths for outputs.

# In[1]:

# Core scientific stack (pylab magic pulls in matplotlib/numpy names like plt)
from __future__ import division
get_ipython().run_line_magic('pylab', '--no-import-all')
get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import numpy as np
import os, sys


# In[2]:

# GIS packages
import geopandas as gpd
from geopandas.tools import overlay
from shapely.geometry import Polygon, Point
import georasters as gr
# Alias for Geopandas
gp = gpd


# In[3]:

# Plotting
import matplotlib as mpl
import seaborn as sns
# Setup seaborn
sns.set()


# In[4]:

# Output locations: data files and exported figures.
# Create each directory the first time the notebook runs.
pathout = './data/'
pathgraphs = './graphs/'
for _path in (pathout, pathgraphs):
    if not os.path.exists(_path):
        os.mkdir(_path)


# # Initial Example -- Natural Earth Country Shapefile
# Let's download a shapefile with all the polygons for countries so we can visualize and analyze some of the data we have downloaded in other notebooks. [Natural Earth](https://www.naturalearthdata.com/downloads/) provides lots of free data so let's use that one.
# 
# For shapefiles and other polygon type data ``geopandas`` is the most useful package. ``geopandas`` is to GIS what ``pandas`` is to other data. Since ``geopandas`` extends the functionality of ``pandas`` to a GIS dataset, all the nice functions and properties of ``pandas`` are also available in ``geopandas``. Of course, ``geopandas`` includes functions and properties unique to GIS data.
# 
# Next we will use it to download the shapefile (which is contained in a zip archive).
# ``geopandas`` extends ``pandas`` for use with GIS data. We can use many functions and properties of the ``GeoDataFrame`` to analyze our data.

# In[5]:

import requests
import io

# Download the Natural Earth admin-0 (countries) shapefile as a zip archive
# and read it directly from memory. A browser-like User-Agent header avoids
# being rejected by some servers.
#headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}
url = 'https://naturalearth.s3.amazonaws.com/10m_cultural/ne_10m_admin_0_countries.zip'
r = requests.get(url, headers=headers)
# FIX: fail fast with a clear HTTPError if the download failed, instead of
# passing a non-zip error page to read_file (which raises a cryptic error).
r.raise_for_status()
countries = gp.read_file(io.BytesIO(r.content))
#countries = gpd.read_file('https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_0_countries.zip')


# Let's look inside this ``GeoDataFrame``

# In[6]:

countries


# Each row contains the information for one country.
# 
# Each column is one property or variable.
# 
# Unlike ``pandas`` ``DataFrame``s, ``geopandas`` always must have a ``geometry`` column.
# 
# Let's plot this data

# In[7]:

get_ipython().run_line_magic('matplotlib', 'inline')
fig, ax = plt.subplots(figsize=(15,10))
countries.plot(ax=ax)
ax.set_title("WGS84 (lat/lon)", fontdict={'fontsize':34})


# We can also get some additional information on this data. For example its projection

# In[8]:

countries.crs


# We can reproject the data from its current WGS84 projection to other ones. Let's do this and plot the results so we can see how different projections distort results.
# In[9]: fig, ax = plt.subplots(figsize=(15,10)) countries_merc = countries.to_crs(epsg=3857) countries_merc.loc[countries_merc.NAME!='Antarctica'].reset_index().plot(ax=ax) ax.set_title("Mercator", fontdict={'fontsize':34}) # In[10]: countries_merc.crs # In[11]: cea = {'datum': 'WGS84', 'lat_ts': 0, 'lon_0': 0, 'no_defs': True, 'over': True, 'proj': 'cea', 'units': 'm', 'x_0': 0, 'y_0': 0} fig, ax = plt.subplots(figsize=(15,10)) countries_cea = countries.to_crs(crs=cea) countries_cea.plot(ax=ax) ax.set_title("Cylindrical Equal Area", fontdict={'fontsize':34}) # Notice that each projection shows the world in a very different manner, distoring areas, distances etc. So you need to take care when doing computations to use the correct projection. An important issue to remember is that you need a projected (not geographical) projection to compute areas and distances. Let's compare these three a bit. Start with the boundaries of each. # In[12]: print('[xmin, ymin, xmax, ymax] in three projections') print(countries.total_bounds) print(countries_merc.total_bounds) print(countries_cea.total_bounds) # Let's describe the areas of these countries in the three projections # In[13]: print('Area distribution in WGS84') print(countries.area.describe(), '\n') # In[14]: print('Area distribution in Mercator') print(countries_merc.area.describe(), '\n') # In[15]: print('Area distribution in CEA') print(countries_cea.area.describe(), '\n') # In[16]: countries['geometry'] # In[17]: Point((0,0)) # In[18]: Polygon(((0,0), (1,2), (3,0))) # Let's compare the area of each country in the two projected projections # In[19]: countries_merc = countries_merc.set_index('ADM0_A3') countries_cea = countries_cea.set_index('ADM0_A3') countries_merc['ratio_area'] = countries_merc.area / countries_cea.area countries_cea['ratio_area'] = countries_merc.area / countries_cea.area sns.set(rc={'figure.figsize':(11.7,8.27)}) sns.set_context("talk") fig, ax = plt.subplots() 
# Finish the figure started in the previous cell: each point is a country,
# x = CEA area, y = Mercator area, both converted from m^2 to km^2 (/1e6).
sns.scatterplot(x=countries_cea.area/1e6, y=countries_merc.area/1e6, ax=ax)
# 45-degree reference line: points above it are inflated by Mercator.
sns.lineplot(x=countries_cea.area/1e6, y=countries_cea.area/1e6, color='r', ax=ax)
ax.set_ylabel('Mercator')
ax.set_xlabel('CEA')
ax.set_title("Areas")


# Now, how do we know what is correct? Let's get some data from WDI to compare the areas of countries in these projections to what the correct area should be (notice that each country usually will use a local projection that ensures areas are correctly computed, so their data should be closer to the truth than any of our global ones).
# 
# Here we use some of what we learned before in [this notebook](./EconomicDataAnalysis.ipynb).

# In[20]:

# Pull land area (indicator AG.LND.TOTL.K2, sq. km) for 2017 from the
# World Bank WDI via pandas_datareader.
from pandas_datareader import data, wb
wbcountries = wb.get_countries()
wbcountries['name'] = wbcountries.name.str.strip()
wdi = wb.download(indicator=['AG.LND.TOTL.K2'], country=wbcountries.iso2c.values, start=2017, end=2017)
wdi.columns = ['WDI_area']
wdi = wdi.reset_index()
# Attach ISO codes by matching on country name.
wdi = wdi.merge(wbcountries[['iso3c', 'iso2c', 'name']], left_on='country', right_on='name')
# Projected areas, converted m^2 -> km^2 to be comparable with WDI.
countries_cea['CEA_area'] = countries_cea.area / 1e6
countries_merc['MERC_area'] = countries_merc.area / 1e6
areas = pd.merge(countries_cea['CEA_area'], countries_merc['MERC_area'], left_index=True, right_index=True)


# Let's merge the WDI data with what we have computed before.

# In[21]:

# Both `areas` frames are indexed by ISO3 (ADM0_A3), so merge on iso3c.
wdi = wdi.merge(areas, left_on='iso3c', right_index=True)
wdi


# How correlated are these measures?

# In[22]:

# NOTE(review): recent pandas raises on non-numeric columns here; may need
# wdi.corr(numeric_only=True) — confirm against the pandas version in use.
wdi.corr()


# Let's change the shape of the data so we can plot it using ``seaborn``.
# In[23]:

# Reshape to long format: one row per (country, area-measure), so seaborn can
# color points by measure.
wdi2 = wdi.melt(id_vars=['iso3c', 'iso2c', 'name', 'country', 'year', 'WDI_area'], value_vars=['CEA_area', 'MERC_area'])
wdi2


# In[24]:

sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_context("talk")
fig, ax = plt.subplots()
sns.scatterplot(x='WDI_area', y='value', data=wdi2, hue='variable', ax=ax)
#sns.scatterplot(x='WDI_area', y='MERC_area', data=wdi, ax=ax)
# 45-degree reference line: perfect agreement with WDI.
sns.lineplot(x='WDI_area', y='WDI_area', data=wdi, color='r', ax=ax)
ax.set_ylabel('Other')
ax.set_xlabel('WDI')
ax.set_title("Areas")
ax.legend()


# We could use other data to compare, e.g. data from the CIA Factbook.

# In[25]:

# Parse the (archived) CIA World Factbook area ranking: plain text, one
# record per line.
cia_area = pd.read_csv('https://web.archive.org/web/20201116182145if_/https://www.cia.gov/LIBRARY/publications/the-world-factbook/rankorder/rawdata_2147.txt', sep='\t', header=None)
# Split each line on runs of 2+ whitespace characters into id / Name / area.
# FIX: the regex is now a raw string — '\s' in a plain string literal is an
# invalid escape sequence (SyntaxWarning on Python 3.12+, slated to become an
# error); the runtime value is unchanged.
cia_area = pd.DataFrame(cia_area[0].str.strip().str.split(r'\s\s+').tolist(), columns=['id', 'Name', 'area'])
# Drop thousands separators and cast area (km^2) to int.
cia_area.area = cia_area.area.str.replace(',', '').astype(int)
cia_area


# In[26]:

print('CEA area for Russia', countries_cea.area.loc['RUS'] / 1e6)
print('MERC area for Russia', countries_merc.area.loc['RUS'] / 1e6)
print('WDI area for Russia', wdi.loc[wdi.iso3c=='RUS', 'WDI_area'])
print('CIA area for Russia', cia_area.loc[cia_area.Name=='Russia', 'area'])


# Again very similar result. ``CEA`` is closest to both ``WDI`` and ``CIA``.

# ## Exercise
# 
# 1. Merge the ``CIA`` data with the wdi data. You need to get correct codes for the countries to allow for the merge or correct the names to ensure they are compatible.
# 2. Change the dataframe as we did with ``wdi2`` and plot the association between these measures

# # Mapping data
# Let's use the ``geoplot`` package to plot data in a map. As usual we can do it in many ways, but ``geoplot`` makes our life very easy. Let's import the various packages we will use.

# In[27]:

import geoplot as gplt
import geoplot.crs as gcrs
import mapclassify as mc
import textwrap


# Let's import some of the data we had downloaded before.
# Specifically, let's import the Penn World Tables data.

# In[28]:

# Load the Penn World Table 9.1: the Stata file holds the data, the Excel
# file holds the variable metadata (names and definitions).
pwt = pd.read_stata(pathout + 'pwt91.dta')
pwt_xls = pd.read_excel(pathout + 'pwt91.xlsx')
pwt


# Let's recreate GDPpc data

# In[29]:

# Get columns with GDP measures: variables whose definition mentions
# "Real GDP" (case-insensitive match).
gdpcols = pwt_xls.loc[pwt_xls['Variable definition'].apply(lambda x: str(x).upper().find('REAL GDP')!=-1), 'Variable name'].tolist()
# Generate GDPpc for each measure
for gdp in gdpcols:
    pwt[gdp + '_pc'] = pwt[gdp] / pwt['pop']
# GDPpc data
gdppccols = [col+'_pc' for col in gdpcols]
pwt[['countrycode', 'country', 'year'] + gdppccols]


# Let's map GDPpc for the year 2010 using ``geoplot``. For this, let's write two functions that will simplify plotting and saving maps. Also, we can reuse it whenever we need to create a new map for the world.

# In[30]:

# Functions for plotting
def center_wrap(text, cwidth=32, **kw):
    '''Center Text (to be used in legend).

    `text` is expected to be a list of strings; each line is centered to
    `cwidth` characters and the lines are joined with newlines.
    '''
    lines = text
    #lines = textwrap.wrap(text, **kw)
    return "\n".join(line.center(cwidth) for line in lines)


def MyChloropleth(mydf=None, myfile='GDPpc2010', myvar='rgdpe_pc',
                  mylegend='GDP per capita 2010', k=5,
                  extent=(-180, -90, 180, 90), bbox_to_anchor=(0.2, 0.5),
                  edgecolor='white', facecolor='lightgray',
                  scheme='FisherJenks', save=True, percent=False, **kwargs):
    '''Plot (and optionally save) a world choropleth of column `myvar`.

    Parameters
    ----------
    mydf : GeoDataFrame, optional
        Data to plot; defaults to ``pwt.loc[pwt.year==2010]``.
        FIX: the original evaluated this DataFrame once at definition time
        (mutable-default-argument anti-pattern); it is now resolved lazily
        at call time, which is backward-compatible for all callers.
    myfile : str
        Stem of the output file names saved under ``pathgraphs``.
    myvar : str
        Column of `mydf` to classify and color.
    mylegend : str
        Legend title.
    k : int
        Number of classes passed to the mapclassify classifier.
    extent : sequence of 4 floats
        [xmin, ymin, xmax, ymax] for the background layer (now an immutable
        tuple default instead of a mutable list).
    bbox_to_anchor, edgecolor, facecolor
        Styling passed through to the legend / background layer.
    scheme : str or classifier
        Name of a mapclassify scheme; unrecognized values are passed through
        unchanged (same as the original if/elif chain).
    save : bool
        If True, save both PDF and PNG at 300 dpi.
    percent : bool
        If True, format legend bounds as percentages.
    '''
    if mydf is None:
        mydf = pwt.loc[pwt.year==2010]
    # Chloropleth
    # Color scheme: dispatch table replaces the long if/elif chain. Every
    # classifier is called with k=k, exactly as before (behavior unchanged,
    # including for classifiers that reject k).
    classifiers = {
        'EqualInterval': mc.EqualInterval,
        'Quantiles': mc.Quantiles,
        'BoxPlot': mc.BoxPlot,
        'FisherJenks': mc.FisherJenks,
        'FisherJenksSampled': mc.FisherJenksSampled,
        'HeadTailBreaks': mc.HeadTailBreaks,
        'JenksCaspall': mc.JenksCaspall,
        'JenksCaspallForced': mc.JenksCaspallForced,
        'JenksCaspallSampled': mc.JenksCaspallSampled,
        'KClassifiers': mc.KClassifiers,
    }
    if isinstance(scheme, str) and scheme in classifiers:
        scheme = classifiers[scheme](mydf[myvar], k=k)
    # Format legend
    upper_bounds = scheme.bins
    # get and format all bounds
    bounds = []
    for index, upper_bound in enumerate(upper_bounds):
        if index == 0:
            lower_bound = mydf[myvar].min()
        else:
            lower_bound = upper_bounds[index-1]
        # format the numerical legend here
        if percent:
            bound = f'{lower_bound:.0%} - {upper_bound:.0%}'
        else:
            bound = f'{float(lower_bound):,.0f} - {float(upper_bound):,.0f}'
        bounds.append(bound)
    legend_labels = bounds
    # Plot the data layer (note: its edgecolor is hard-coded to 'white'; the
    # `edgecolor` parameter only styles the background layer, as originally).
    ax = gplt.choropleth(
        mydf, hue=myvar,
        projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
        edgecolor='white', linewidth=1,
        cmap='Reds', legend=True,
        scheme=scheme,
        legend_kwargs={'bbox_to_anchor': bbox_to_anchor,
                       'frameon': True,
                       'title':mylegend,
                       },
        legend_labels = legend_labels,
        figsize=(24, 16),
        rasterized=True,
    )
    # Background layer with all countries, so places with missing data still
    # appear (filled with `facecolor`).
    gplt.polyplot(
        countries,
        projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
        edgecolor=edgecolor, facecolor=facecolor,
        ax=ax,
        rasterized=True,
        extent=extent,
    )
    if save:
        plt.savefig(pathgraphs + myfile + '_' + myvar +'.pdf', dpi=300, bbox_inches='tight')
        plt.savefig(pathgraphs + myfile + '_' + myvar +'.png', dpi=300, bbox_inches='tight')


# Let's merge the PWT GDPpc data with our shape file.
# In[31]: year = 2010 gdppc = pwt.loc[pwt.year==year].reset_index(drop=True).copy() gdppc = countries.merge(gdppc, left_on='ADM0_A3', right_on='countrycode') gdppc = gdppc.dropna(subset=['rgdpe_pc']) mylegend = center_wrap(["GDP per capita in " + str(year)], cwidth=32, width=32) MyChloropleth(mydf=gdppc, myfile='PWT_GDP_' + str(year), myvar='rgdpe_pc', mylegend=mylegend, k=10, scheme='Quantiles', save=True) # In[32]: year = 2000 gdppc = pwt.loc[pwt.year==year].reset_index(drop=True).copy() gdppc = countries.merge(gdppc, left_on='ADM0_A3', right_on='countrycode') gdppc = gdppc.dropna(subset=['rgdpe_pc']) mylegend = center_wrap(["GDP per capita in " + str(year)], cwidth=32, width=32) MyChloropleth(mydf=gdppc, myfile='PWT_GDP_' + str(year), myvar='rgdpe_pc', mylegend=mylegend, k=10, scheme='Quantiles', save=True) # In[33]: year = 2000 gdppc = pwt.loc[pwt.year==year].reset_index(drop=True).copy() gdppc = countries.merge(gdppc, left_on='ADM0_A3', right_on='countrycode') gdppc = gdppc.dropna(subset=['pop']) mylegend = center_wrap(["Population in " + str(year)], cwidth=32, width=32) MyChloropleth(mydf=gdppc, myfile='PWT_POP_' + str(year), myvar='pop', mylegend=mylegend, k=10, scheme='Quantiles', save=True) # # GIS operations, functions and properties # Let's explore the data with some of the functions of ``geopandas``. # # Let's start by finding the centroid of every country and plot it. # In[34]: centroids = countries.copy() centroids.geometry = centroids.centroid ax = gplt.pointplot( centroids, projection=gcrs.PlateCarree(central_longitude=0.0, globe=None), figsize=(24, 16), rasterized=True, ) gplt.polyplot(countries.geometry, projection=gcrs.PlateCarree(central_longitude=0.0, globe=None), edgecolor='white', facecolor='lightgray', extent=[-180, -90, 180, 90], ax=ax) # In[35]: centroids.to_file(pathout + 'centroids.shp') # In[36]: centroids.loc[centroids.SOVEREIGNT=='Southern Patagonian Ice Field'] # Let's compute distances between the centroids. 
# For this we will use the ``geopy`` package.

# In[37]:

from geopy.distance import geodesic, great_circle
import itertools
# geopy expects (lat, lon) pairs, hence [y, x].
centroids['xy'] = centroids.geometry.apply(lambda x: [x.y, x.x])


# In[38]:

# Build all ordered country pairs (Cartesian product of ISO3 codes)...
mypairs = pd.DataFrame(index = pd.MultiIndex.from_arrays(
    np.array([x for x in itertools.product(centroids['ADM0_A3'].tolist(), repeat=2)]).T,
    names = ['country_1','country_2'])).reset_index()
# ...and attach each side's centroid coordinates (columns xy_1 and xy_2).
mypairs = mypairs.merge(centroids[['ADM0_A3', 'xy']], left_on='country_1', right_on='ADM0_A3')
mypairs = mypairs.merge(centroids[['ADM0_A3', 'xy']], left_on='country_2', right_on='ADM0_A3', suffixes=['_1', '_2'])
mypairs


# In[39]:

# Geodesic (ellipsoidal) and great-circle (spherical) distances in km.
mypairs['geodesic_dist'] = mypairs.apply(lambda x: geodesic(x.xy_1, x.xy_2).km, axis=1)
mypairs['great_circle_dist'] = mypairs.apply(lambda x: great_circle(x.xy_1, x.xy_2).km, axis=1)
mypairs


# In[40]:

mypairs.corr()


# Let's now use the cylindrical equal area projection and geopandas distance function to compute the distance between centroids.

# In[41]:

# Same pair construction, now with centroids taken in the projected CEA CRS.
centroids_cea = countries_cea.copy()
centroids_cea.reset_index(inplace=True)
centroids_cea.geometry = centroids_cea.centroid
centroids_cea['xy'] = centroids_cea.geometry.apply(lambda x: [x.y, x.x])
mypairs_cea = pd.DataFrame(index = pd.MultiIndex.from_arrays(
    np.array([x for x in itertools.product(centroids_cea['ADM0_A3'].tolist(), repeat=2)]).T,
    names = ['country_1','country_2'])).reset_index()
mypairs_cea = mypairs_cea.merge(centroids_cea[['ADM0_A3', 'geometry', 'xy']], left_on='country_1', right_on='ADM0_A3')
mypairs_cea = mypairs_cea.merge(centroids_cea[['ADM0_A3', 'geometry', 'xy']], left_on='country_2', right_on='ADM0_A3', suffixes=['_1', '_2'])


# In[42]:

mypairs_cea


# In[43]:

# Planar (Euclidean) distance in the CEA projection, converted m -> km.
mypairs_cea['CEA_dist'] = mypairs_cea.apply(lambda x: x.geometry_1.distance(x.geometry_2)/1e3, axis=1)
mypairs_cea


# Let's merge the three distance measures and see how similar they are.
# In[44]: dists = mypairs[['country_1', 'country_2', 'geodesic_dist', 'great_circle_dist']].copy() dists = dists.merge(mypairs_cea[['country_1', 'country_2', 'CEA_dist']]) dists # In[45]: dists.corr() # In[46]: centroids_merc = countries_merc.copy() centroids_merc.reset_index(inplace=True) centroids_merc.geometry = centroids_merc.centroid centroids_merc['xy'] = centroids_merc.geometry.apply(lambda x: [x.y, x.x]) mypairs_merc = pd.DataFrame(index = pd.MultiIndex.from_arrays( np.array([x for x in itertools.product(centroids_merc['ADM0_A3'].tolist(), repeat=2)]).T, names = ['country_1','country_2'])).reset_index() mypairs_merc = mypairs_merc.merge(centroids_merc[['ADM0_A3', 'geometry', 'xy']], left_on='country_1', right_on='ADM0_A3') mypairs_merc = mypairs_merc.merge(centroids_merc[['ADM0_A3', 'geometry', 'xy']], left_on='country_2', right_on='ADM0_A3', suffixes=['_1', '_2']) # In[47]: mypairs_merc['MERC_dist'] = mypairs_merc.apply(lambda x: x.geometry_1.distance(x.geometry_2)/1e3, axis=1) mypairs_merc # In[48]: dists = dists.merge(mypairs_merc[['country_1', 'country_2', 'MERC_dist']]) dists # In[49]: dists.corr()