#!/usr/bin/env python # coding: utf-8 # #Data2go Spatial Analysis # This code calculates the spatial autocorrelation of the different measures in the data2go.nyc dataset. # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib.pyplot as plt import numpy as np import pandas as pd import shapefile from scipy import spatial import pysal from shapely.geometry import Polygon as Shp_poly from bokeh.plotting import figure, output_notebook, show output_notebook() # ###Load Data # Each column is a different measure in the data set. Each row is a New York City community district. The first column, 'GEO_ID', is number which identifies the community district and is the same identification used in community district shapefile # One column is removed because of bad values # In[2]: d = pd.read_csv("cd_data_reduced.csv",low_memory=False) d = d.drop('hiv_test_cd', 1) d # ###Edit Shapefile # Get the NYC community district shapefile from here: http://www.nyc.gov/html/dcp/html/bytes/districts_download_metadata.shtml # # Some of the community districts have no associated data in the data2go dataset, most likely because they are parks or nateral areas. (For example, Central Park is one of the community districts.) So this block of code removes those community districts from the shapefile and saves a new one. # In[3]: sf = shapefile.Reader("./nycd_15d/nycd.shp") iD = [] area = [] measures = [] index = 0 del_cnt = 0 e = shapefile.Editor(shapefile="./nycd_15d/nycd.shp") for rec in sf.iterRecords(): iD.append(rec[0]) area.append(rec[1]) row = d.loc[d['GEO_ID'] == rec[0]] if row.empty: e.delete(index - del_cnt) del_cnt = del_cnt + 1 else: measures.append(row.iloc[:,2].values[0]) index = index + 1 e.save("./nycd_15d/nycd_reduced") # ###Spatial Analysis # This performs the spatial autocorrelation (The Gamma Index and Moran's I) of each measure in 'd'. # In[4]: num_measures = len(d.columns[2:]) fill = {'Gamma mag' : np.zeros(num_measures)} measure_correlation = pd.DataFrame(fill, index = d.columns[2:] ) measure_correlation['Gamma mag'] = np.NAN measure_correlation['Gamma stdmag'] = np.NAN measure_correlation['Gamma pval'] = np.NAN measure_correlation['MoranI I'] = np.NAN measure_correlation['MoranI EI'] = np.NAN measure_correlation['MoranI p_norm'] = np.NAN measure_correlation['MoranI p_rand'] = np.NAN sf = shapefile.Reader("./nycd_15d/nycd.shp") w = pysal.rook_from_shapefile("./nycd_15d/nycd_reduced.shp") for metric in measure_correlation.index: measures = [] for rec in sf.iterRecords(): row = d.loc[d['GEO_ID'] == rec[0]] if row.empty: continue else: measures.append(row[metric].values[0]) measures = np.array(measures) g = pysal.Gamma(measures,w) measure_correlation.ix[metric,'Gamma mag'] = g.g measure_correlation.ix[metric,'Gamma stdmag'] = g.g_z measure_correlation.ix[metric,'Gamma pval'] = g.p_sim_g mi = pysal.Moran(measures, w, permutations = 999) measure_correlation.ix[metric,'MoranI I'] = mi.I measure_correlation.ix[metric,'MoranI EI'] = mi.EI measure_correlation.ix[metric,'MoranI p_norm'] = mi.p_norm measure_correlation.ix[metric,'MoranI p_rand'] = mi.p_rand # Plot some of the calculated autocorrelation values. 'Gamma stdmag' is the standardized Gamma Index and 'Gamma pval' is a measure of certainty. For more details: # https://pysal.readthedocs.org/en/latest/users/tutorials/autocorrelation.html#gamma-index-of-spatial-autocorrelation # # Not surprisingly, low p values are only observed for large values of the Gamma Index # In[5]: measure_correlation.plot(kind='scatter',x='Gamma stdmag',y='MoranI I') # In[6]: measure_correlation.plot(kind='scatter',x='Gamma stdmag',y='Gamma pval') # ###Save # Save the data for later use. # In[7]: measure_correlation.to_csv('spatial_correlations.csv') # In[ ]: