# --- Imports --- import sys,os import pandas as pd import numpy as np import matplotlib import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline d_crime=pd.read_csv('/Users/lmartin/Desktop/Misc/Blog_Archive/GIS/Map__Crime_Incidents_-_from_1_Jan_2003.csv') print d_crime.shape d_crime.head(1) date=pd.to_datetime(d_crime['Date']) print date.min() print date.max() t_delta=(date-date.min()).astype('timedelta64[D]') d_crime['days']=t_delta d_crime.head(1) def plotdat(data,cat): l=data.groupby(cat).size() l.sort() fig=plt.figure(figsize=(10,5)) plt.yticks(fontsize=8) l.plot(kind='bar',fontsize=12,color='k') plt.xlabel('') plt.ylabel('Number of reports',fontsize=10) plotdat(d_crime,'Category') l=d_crime.groupby('Descript').size() l.sort() print l.shape def types_districts(d_crime,per): # Group by crime type and district hoods_per_type=d_crime.groupby('Descript').PdDistrict.value_counts(sort=True) t=hoods_per_type.unstack().fillna(0) # Sort by hood sum hood_sum=t.sum(axis=0) hood_sum.sort(ascending=False) t=t[hood_sum.index] # Filter by crime per district crime_sum=t.sum(axis=1) crime_sum.sort() # Large number, so let's slice the data. p=np.percentile(crime_sum,per) ix=crime_sum[crime_sum>p] t=t.loc[ix.index] return t t=types_districts(d_crime,96) sns.clustermap(t) sns.clustermap(t,standard_scale=1) sns.clustermap(t,standard_scale=0) plotdat(d_crime,'Category') # Let's drill down onto one cat=d_crime[d_crime['Category']=='DRUG/NARCOTIC'] c=cat['Descript'].value_counts() c.sort(ascending=False) c.head(10) t=types_districts(cat,70) sns.clustermap(t) sns.clustermap(t,standard_scale=1) sns.clustermap(t,standard_scale=0) # Let's drill down onto one cat=d_crime[d_crime['Category']=='DRUG/NARCOTIC'] # Bin crime by 30 day window cat['Month']=np.floor(cat['days']/30) # Approximate month (30 day window) # Default district='All' def timeseries(dat,per): ''' Category grouped by month ''' # Group by crime type and district cat_per_time=dat.groupby('Month').Descript.value_counts(sort=True) t=cat_per_time.unstack().fillna(0) # Filter by crime per district crime_sum=t.sum(axis=0) crime_sum.sort() # Large number, so let's slice the data. p=np.percentile(crime_sum,per) ix=crime_sum[crime_sum>p] t=t[ix.index] return t t_all=timeseries(cat,0) barituate_features=['SALE OF BARBITUATES', 'POSSESSION OF BARBITUATES FOR SALES', 'ENCOURAGE MINOR TO USE BARBITUATES', 'POSSESSION OF BARBITUATES'] coke_features=['ENCOURAGING MINOR TO USE COCAINE', 'SALES COCAINE BASE/SCHOOLYARD TRAFFICKING ACT VIO', 'TRANSPORTATION OF COCAINE', 'SALE OF COCAINE', 'POSSESSION OF COCAINE FOR SALES', 'POSSESSION OF COCAINE'] weed_features=['ENCOURAGING MINOR TO USE MARIJUANA', 'FURNISHING MARIJUANA', 'PLANTING/CULTIVATING MARIJUANA', 'TRANSPORTATION OF MARIJUANA', 'SALE OF MARIJUANA', 'POSSESSION OF MARIJUANA FOR SALES', 'POSSESSION OF MARIJUANA'] metadone_features=['TRANSPORTATION OF METHADONE', 'SALE OF METHADONE', 'POSSESSION OF METHADONE FOR SALES', 'POSSESSION OF METHADONE'] hallu_features=['TRANSPORTATION OF OPIATES', 'SALE OF HALLUCINOGENIC', 'POSSESSION OF OPIUM', 'POSSESSION OF OPIUM DERIVATIVE', 'POSSESSION OF OPIUM', 'SALE OF OPIUM', 'SALE OF OPIUM DERIVATIVE', 'TRANSPORTATION OF OPIATES', 'POSSESSION OF OPIUM FOR SALES', 'TRANSPORTATION OF HALLUCINOGENIC', 'POSSESSION OF OPIUM DERIVATIVE FOR SALES', 'SALE OF OPIATES', 'SALE OF HALLUCINOGENIC', 'POSSESSION OF OPIUM DERIVATIVE', 'POSSESSION OF OPIUM', 'POSSESSION OF OPIATES FOR SALES', 'POSSESSION OF HALLUCINOGENIC FOR SALES', 'POSSESSION OF OPIATES', 'POSSESSION OF HALLUCINOGENIC'] meth_features=['TRANSPORTATION OF AMPHETAMINE', 'SALE OF AMPHETAMINE', 'POSSESSION OF AMPHETAMINE', 'SALE OF METH-AMPHETAMINE', 'TRANSPORTATION OF METH-AMPHETAMINE', 'POSSESSION OF AMPHETAMINE FOR SALES', 'POSSESSION OF METH-AMPHETAMINE FOR SALE', 'POSSESSION OF METH-AMPHETAMINE'] heroin_features=['SALE OF HEROIN', 'POSSESSION OF HEROIN', 'POSSESSION OF HEROIN FOR SALES', 'TRANSPORTATION OF HEROIN', 'SALE OF HEROIN', 'POSSESSION OF HEROIN FOR SALES', 'POSSESSION OF HEROIN'] crack_features=['POSSESSION OF BASE/ROCK COCAINE FOR SALE', 'SALE OF BASE/ROCK COCAINE', 'POSSESSION OF BASE/ROCK COCAINE'] # Lets use real dates for plotting days_from_start=pd.Series(t_all.index*30).astype('timedelta64[D]') dates_for_plot=date.min()+days_from_start time_labels=dates_for_plot.map(lambda x: str(x.year)+'-'+str(x.month)) def drug_analysis(t,district,plot): t['BARBITUATES']=t[map(lambda s: s.strip(), barituate_features)].sum(axis=1) t['HEROIN']=t[map(lambda s: s.strip(), heroin_features)].sum(axis=1) t['HALLUCINOGENIC']=t[map(lambda s: s.strip(), hallu_features)].sum(axis=1) t['METH']=t[map(lambda s: s.strip(), meth_features)].sum(axis=1) t['WEED']=t[map(lambda s: s.strip(), weed_features)].sum(axis=1) t['COKE']=t[map(lambda s: s.strip(), coke_features)].sum(axis=1) t['METHADONE']=t[map(lambda s: s.strip(), metadone_features)].sum(axis=1) t['CRACK']=t[map(lambda s: s.strip(), crack_features)].sum(axis=1) drugs=t[['HEROIN','HALLUCINOGENIC','METH','WEED','COKE','CRACK']] if plot: drugs.index=[int(i) for i in drugs.index] colors = plt.cm.jet(np.linspace(0, 1, drugs.shape[1])) drugs.plot(kind='bar', stacked=True, figsize=(20,5), color=colors, width=1, title=district,fontsize=6) return drugs drug_df_all=drug_analysis(t_all,district,True) def drug_analysis_rescale(t,district,plot): t['BARBITUATES']=t[map(lambda s: s.strip(), barituate_features)].sum(axis=1) t['HEROIN']=t[map(lambda s: s.strip(), heroin_features)].sum(axis=1) t['HALLUCINOGENIC']=t[map(lambda s: s.strip(), hallu_features)].sum(axis=1) t['METH']=t[map(lambda s: s.strip(), meth_features)].sum(axis=1) t['WEED']=t[map(lambda s: s.strip(), weed_features)].sum(axis=1) t['COKE']=t[map(lambda s: s.strip(), coke_features)].sum(axis=1) t['METHADONE']=t[map(lambda s: s.strip(), metadone_features)].sum(axis=1) t['CRACK']=t[map(lambda s: s.strip(), crack_features)].sum(axis=1) drugs=t[['HEROIN','HALLUCINOGENIC','METH','WEED','COKE','CRACK']] if plot: drugs=drugs.div(drugs.sum(axis=1),axis=0) drugs.index=[int(i) for i in drugs.index] colors = plt.cm.GnBu(np.linspace(0, 1, drugs.shape[1])) colors = plt.cm.jet(np.linspace(0, 1, drugs.shape[1])) drugs.plot(kind='bar', stacked=True, figsize=(20,5), color=colors, width=1, title=district, legend=False) plt.ylim([0,1]) return drugs drug_df_all=drug_analysis_rescale(t_all,district,True) dates_for_plot.index=dates_for_plot sns.set_context(rc={"figure.figsize": (12.5,5.5)}) for d,c in zip(['METH','CRACK','HEROIN','WEED'],['b','r','c','g']): plt.plot(dates_for_plot.index,drug_df_all[d],'o-',color=c,ms=6,mew=1.5,mec='white',linewidth=0.5,label=d,alpha=0.75) plt.legend(loc='upper left',scatterpoints=1,prop={'size':8}) stor=[] stor_time=[] for d in set(d_crime['PdDistrict']): # Specify district and group by time dist_dat=cat[cat['PdDistrict']==d] t=timeseries(dist_dat,0) # Merge to ensure all categories are preserved! t_merge=pd.DataFrame(columns=t_all.columns) m=pd.concat([t_merge,t],axis=0).fillna(0) m.reset_index(inplace=True) # Plot drug_df=drug_analysis(m,d,True) plt.show() s=drug_df.sum(axis=0) stor=stor+[s] drug_df.columns=cols=[c+"_%s"%d for c in drug_df.columns] stor_time=stor_time+[drug_df] drug_dat_time=pd.concat(stor_time,axis=1) drug_dat=pd.concat(stor,axis=1) drug_dat.columns=[list(set(d_crime['PdDistrict']))] sns.set_context(rc={"figure.figsize": (20,20)}) sns.corrplot(drug_dat_time, annot=False, sig_stars=False,diag_names=False) drug_dat_time.index=dates_for_plot sns.set_context(rc={"figure.figsize": (7.5,5)}) for d,c in zip(['METH_MISSION','CRACK_MISSION'],['b','r']): plt.plot(drug_dat_time.index,drug_dat_time[d],'o-',color=c,ms=6,mew=1,mec='white',linewidth=0.5,label=d,alpha=0.75) plt.legend(loc='upper left',scatterpoints=1,prop={'size':8}) sns.set_context(rc={"figure.figsize": (7.5,5)}) for d,c in zip(['CRACK_MISSION','CRACK_TENDERLOIN','CRACK_BAYVIEW','CRACK_SOUTHERN'],['b','r','g','c']): plt.plot(drug_dat_time.index,drug_dat_time[d],'o-',color=c,ms=6,mew=1,mec='white',linewidth=0.5,label=d,alpha=0.75) plt.legend(loc='upper left',scatterpoints=1,prop={'size':8}) stor=[] stor_time=[] for d in set(d_crime['PdDistrict']): # Specify district and group by time dist_dat=cat[cat['PdDistrict']==d] t=timeseries(dist_dat,0) # Merge to ensure all categories are preserved! t_merge=pd.DataFrame(columns=t_all.columns) m=pd.concat([t_merge,t],axis=0).fillna(0) m.reset_index(inplace=True) # Plot drug_df=drug_analysis_rescale(m,d,True) plt.show() s=drug_df.sum(axis=0) stor=stor+[s] drug_df.columns=cols=[c+"_%s"%d for c in drug_df.columns] stor_time=stor_time+[drug_df] drug_dat_time_rescale=pd.concat(stor_time,axis=1) drug_dat_rescale=pd.concat(stor,axis=1) drug_dat.columns=[list(set(d_crime['PdDistrict']))] sns.clustermap(drug_dat,standard_scale=0) sns.clustermap(drug_dat,standard_scale=1) tmp=cat.copy() tmp.set_index('Descript',inplace=True) crack_dat=tmp.loc[crack_features] crack_pts=crack_dat[['X','Y','Month']] d=pd.DataFrame(crack_pts.groupby('Month').size()) d.index=dates_for_plot d.columns=['Count'] diff=len(d.index)-120 plt.plot(d.index,d['Count'],'o-',color='k',ms=6,mew=1,mec='white',linewidth=0.5,label=d,alpha=0.75) plt.axvspan(d.index[40-diff],d.index[40],color='cyan',alpha=0.5) plt.axvspan(d.index[80-diff],d.index[80],color='red',alpha=0.5) plt.axvspan(d.index[120],d.index[-1],color='green',alpha=0.5) oldest_crack_sums=d.loc[(d.index>d.index[40-diff]) & (d.indexd.index[80-diff]) & (d.indexd.index[120]] old_crack_sums['Count'].mean()/float(new_crack_sums['Count'].mean()) oldest_crack=crack_pts[(crack_pts['Month']>(40-diff)) & (crack_pts['Month']<40)] oldest_crack.columns=['longitude','latitude','time'] old_crack=crack_pts[(crack_pts['Month']>(80-diff)) & (crack_pts['Month']<80)] old_crack.columns=['longitude','latitude','time'] new_crack=crack_pts[crack_pts['Month']>120] new_crack.columns=['longitude','latitude','time'] from mpl_toolkits.basemap import Basemap import fiona from shapely.geometry import shape, mapping from pyproj import Proj, transform from fiona.crs import from_epsg from itertools import chain shapefile="/Users/lmartin/Desktop/Misc/Blog_Archive/GIS/sffind_neighborhoods/sffind_neighborhoods.shp" shp = fiona.open(shapefile) bds = shp.bounds print (bds) shp.close() extra = 0.01 ll = (bds[0], bds[1]) ur = (bds[2], bds[3]) coords = list(chain(ll, ur)) w, h = coords[2] - coords[0], coords[3] - coords[1] m = Basemap( projection='tmerc', lon_0=-122., lat_0=37.7, ellps = 'WGS84', llcrnrlon=coords[0] - extra * w, llcrnrlat=coords[1] - extra + 0.01 * h, urcrnrlon=coords[2] + extra * w, urcrnrlat=coords[3] + extra + 0.01 * h, lat_ts=0, resolution='i', suppress_ticks=True) m.readshapefile( '/Users/lmartin/Desktop/Misc/Blog_Archive/GIS/sffind_neighborhoods/sffind_neighborhoods', 'SF', color='none', zorder=2) from descartes import PolygonPatch from shapely.geometry import Point, Polygon, MultiPoint, MultiPolygon from shapely.prepared import prep from matplotlib.collections import PatchCollection # Set up a map dataframe df_map = pd.DataFrame({ 'poly': [Polygon(xy) for xy in m.SF], 'ward_name': [ward['name'] for ward in m.SF_info]}) df_map['area_m'] = df_map['poly'].map(lambda x: x.area) df_map['area_km'] = df_map['area_m'] / 100000 def makePoints(dat): # Create Point objects in map coordinates from dataframe lon and lat values map_points = pd.Series([Point(m(mapped_x,mapped_y)) for mapped_x, mapped_y in zip(dat['longitude'],dat['latitude'])]) plt_points = MultiPoint(list(map_points.values)) hoods_polygon = prep(MultiPolygon(list(df_map['poly'].values))) pts = filter(hoods_polygon.contains,plt_points) return pts sf_oldest_crack_points_todraw=makePoints(oldest_crack) sf_old_crack_points_todraw=makePoints(old_crack) sf_new_crack_points_todraw=makePoints(new_crack) # Draw neighborhoods withpolygons df_map['patches'] = df_map['poly'].map(lambda x: PolygonPatch( x, fc='#000000', ec='#ffffff', lw=.5, alpha=1, zorder=4)) plt.clf() fig = plt.figure() ax = fig.add_subplot(111, axisbg='w', frame_on=False) # Now, we can overlay our points dev = m.scatter( [geom.x for geom in sf_oldest_crack_points_todraw], [geom.y for geom in sf_oldest_crack_points_todraw], 10, marker='o', lw=.25, facecolor='cyan', edgecolor='cyan', alpha=0.75, antialiased=True, label='New crack', zorder=3) dev = m.scatter( [geom.x for geom in sf_old_crack_points_todraw], [geom.y for geom in sf_old_crack_points_todraw], 10, marker='o', lw=.25, facecolor='red', edgecolor='red', alpha=0.75, antialiased=True, label='Old crack', zorder=3) dev = m.scatter( [geom.x for geom in sf_new_crack_points_todraw], [geom.y for geom in sf_new_crack_points_todraw], 10, marker='o', lw=.25, facecolor='green', edgecolor='green', alpha=0.75, antialiased=True, label='New crack', zorder=3) ax.add_collection(PatchCollection(df_map['patches'].values, match_original=True)) plt.tight_layout() fig.set_size_inches(15,15)