#!/usr/bin/env python
# coding: utf-8
"""Exploratory analysis of 2017 Chicago crime reports (exported notebook).

Loads the crimes parquet data set into a dask DataFrame and charts crime
counts by primary type, location, time and community area.
"""

# In[1]:

import calendar
import datetime

import dask.dataframe as dd
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import SVG, display


# In[2]:

get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'svg'")

# set neat seaborn whitegrid styles for matplotlib charts
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and
# later releases dropped the old names, so fall back when 'seaborn' is unknown
try:
    plt.style.use('seaborn')
except OSError:
    plt.style.use('seaborn-v0_8')
sns.set_style('whitegrid')
#plt.style.available


# In[3]:

# timed cell: reads the parquet data into the dask DataFrame `crimes`
# (indexed by 'Date') and persists it in memory
get_ipython().run_cell_magic('time', '', "# set data file path\nparquet_data_folder = '../data/crimes-2017.snappy.parq'\nprint('Loading crime data from: {}'.format(parquet_data_folder))\n\n# load crimes parquet data into dask df\ncrimes = dd.read_parquet(parquet_data_folder, index='Date')\n\n# load all data into memory\ncrimes = crimes.persist()\nprint('Crime data loaded into memory.')\n")


# In[4]:

# timed cell: prints the record count, partition count and DataFrame size
get_ipython().run_cell_magic('time', '', "# log records count and data partitions\nprint('Crime data stats:')\nprint('---------------------------------------')\nprint('{:,} total records in {} partitions'.format(len(crimes), crimes.npartitions))\nprint('DataFrame size: {:,}'.format(crimes.size.compute()))\n")


# In[5]:

# check data frame structure
crimes


# In[6]:

# preview crimes data with Date index
crimes.head()


# In[7]:

# get crime types and counts by primary type
crime_types = crimes[['PrimaryType']]
crime_type_total = crime_types.groupby('PrimaryType').size().compute()

# print crime stats
print(crime_type_total.sort_values(ascending=False))
print("...\nTotal Primary Crime Types: {:,}".format(crime_type_total.size))


# In[8]:

# get arrests and domestic counts for each primary crime type
crime_type_counts = (
    crimes[['PrimaryType', 'Arrest', 'Domestic']]
    .groupby('PrimaryType')
    .sum()
    .compute()
)

# add crime type totals column (aligned on the PrimaryType index)
crime_type_counts['Total'] = crime_type_total

# print all crime stats (arrests, domestic + total)
print(crime_type_counts.sort_values(by='Total', ascending=False))


# In[9]:

# plot crimes by primary type counts
all_crime_types = (
    crime_type_counts
    .sort_values(by='Total', ascending=True)
    .drop(['Arrest', 'Domestic'], axis=1)  # denotes column
)
all_crime_types.plot(kind='barh', figsize=(8,6), color='#cc0000')
plt.title('2017 Chicago Crimes by Type')
plt.xlabel('Number of Crime reports')
plt.ylabel('Crime Type')
plt.tight_layout()
plt.show()


# In[10]:

# plot crimes by arrests
(
    crime_type_counts
    .sort_values(by='Arrest', ascending=True)
    .plot(kind='barh', figsize=(8,10))
)
plt.title('2017 Chicago Crime reports by Arrests')
plt.ylabel('Crime Type')
plt.xlabel('Number of Crimes')
plt.tight_layout()
plt.show()


# In[11]:

# plot crimes by domestic incident reports
(
    crime_type_counts
    .sort_values(by='Domestic', ascending=True)
    .plot(kind='barh', figsize=(8,10))
)
plt.title('2017 Chicago Crime reports by Domestic incidents')
plt.ylabel('Crime Type')
plt.xlabel('Number of Crimes')
plt.tight_layout()
plt.show()


# In[12]:

# plot high crimes (>= 1,000 reports)
(
    crime_type_counts[crime_type_counts['Total'] >= 1000]
    .sort_values(by='Total', ascending=True)
    .plot(kind='barh', figsize=(6,6))
)
plt.ylabel('Crime Type')
plt.xlabel('Number of Crimes')
plt.title('High 2017 Chicago Crimes (>= 1,000 reports)')
plt.show()


# In[13]:

# plot less than 1000 crime reports types
(
    crime_type_counts[crime_type_counts['Total'] < 1000]
    .sort_values(by='Total', ascending=True)
    .plot(kind='barh', figsize=(6,6))
)
plt.ylabel('Crime Type')
plt.xlabel('Number of Crimes')
plt.title('Low 2017 Chicago Crimes (<1,000 reports)')
plt.show()


# In[14]:

# plot less than 100 crime reports types
(
    crime_type_counts[crime_type_counts['Total'] < 100]
    .sort_values(by='Total', ascending=True)
    .plot(kind='barh', figsize=(6,4))
)
plt.ylabel('Crime Type')
plt.xlabel('Number of Crimes')
plt.title('Least 2017 Chicago Crimes (<100 reports)')
plt.show()


# In[15]:

# get crime location counts, largest first
crime_locations = crimes.groupby('LocationDescription').size().compute()
crime_locations = crime_locations.sort_values(ascending=False).rename('Total')
#.reset_index()

# print crime location stats
print(crime_locations.head())
print("...\nTotal Locations: {:,}".format(crime_locations.size))


# In[16]:

# plot top 30 crime locations
# (re-sorted ascending so the largest bar is drawn at the top of the barh chart)
(
    crime_locations[:30]
    .sort_values(ascending=True)
    .plot(kind='barh', figsize=(6,8))
)
plt.ylabel('Location')
plt.xlabel('Number of Crimes')
plt.title('2017 Chicago Crime Top 30 Locations')
plt.show()


# In[17]:

# get arrests and domestic counts for each location description
crime_location_counts = (
    crimes[['LocationDescription', 'Arrest', 'Domestic']]
    .groupby('LocationDescription')
    .sum()
    .compute()
)

# add crime location totals column (aligned on the LocationDescription index)
crime_location_counts['Total'] = crime_locations
crime_location_counts.head()


# In[18]:

# plot top crime locations with arrests and domestic crime counts
(
    crime_location_counts[crime_location_counts['Total'] >= 2500]
    .sort_values(by='Total', ascending=True)
    .plot(kind='barh', figsize=(6,6))
)
plt.ylabel('Location')
plt.xlabel('Number of Crimes')
plt.title('2017 Chicago Crime Top Locations (>=2,500 Crime Reports)')
plt.show()


# In[19]:

# plot next 20 top crime locations
# The filtered frame gets its own name so the full `crime_location_counts`
# table survives and the previous cell can be re-run safely.
# NOTE(review): the filter is `<= 3000` while the title says '<3,000', and it
# overlaps the '>= 2500' plot above -- confirm the intended threshold.
next_crime_locations = (
    crime_location_counts[crime_location_counts['Total'] <= 3000]
    .sort_values(by='Total', ascending=False)
)
(
    next_crime_locations[:20]
    .sort_values(by='Total', ascending=True)
    .plot(kind='barh', figsize=(6,6))
)
plt.ylabel('Location')
plt.xlabel('Number of Crimes')
plt.title('2017 Chicago Crime Next Top 20 Crime Locations (<3,000 Crime Reports)')
plt.show()


# In[20]:

# check for datetime64 index to plot crime over time
crimes.index


# In[21]:

# get arrests (only the rows flagged as an arrest, keeping the Date index)
arrests = crimes[crimes['Arrest'] == True]['Arrest']

# print arrests stats
print(arrests.head())
print("...\nTotal Arrests: {:,}".format(arrests.size.compute()))


# In[22]:

# plot monthly arrests
monthly_arrests = arrests.resample('M').sum().compute()
monthly_arrests.plot(kind='bar', figsize=(6,3))
def _set_month_xticks(data):
    """Label each bar of the current monthly bar chart with its month name.

    `data` is the monthly Series/DataFrame that was just plotted with
    ``kind='bar'``; it must have a DatetimeIndex. One tick is placed per row,
    so the number of ticks always matches the number of labels regardless of
    how many months the data set covers.
    """
    plt.xticks(
        range(len(data)),
        [calendar.month_name[month] for month in data.index.month],
        rotation=0,
    )


# (continues In[22]: monthly arrests bar chart)
_set_month_xticks(monthly_arrests)
plt.xlabel('2017 Month')
plt.title('2017 Chicago Crime Monthly Arrests')
plt.show()


# In[23]:

# plot weekly arrests
weekly_arrests = arrests.resample('W').sum().compute()
weekly_arrests.plot(kind='bar')
# TODO: format weekly xticks to only list week start date
plt.xlabel('Week Of 2017')
plt.title('2017 Chicago Crime Weekly Arrests')
plt.show()


# In[24]:

# plot daily arrests
daily_arrests = arrests.resample('D').sum().compute()
daily_arrests.plot()
plt.title('2017 Chicago Crime Daily Arrests')
plt.show()


# In[25]:

# get domestic crimes (only the rows flagged as domestic, keeping the Date index)
domestic = crimes[crimes['Domestic'] == True]['Domestic']

# print domestic crime stats
print(domestic.head())
print("...\nTotal Domestic: {:,}".format(domestic.size.compute()))


# In[26]:

# plot daily domestic crimes
daily_domestic = domestic.resample('D').sum().compute()
daily_domestic.plot(color='g')
plt.title('2017 Chicago Crime Daily Domestic reports')
plt.show()


# In[27]:

# get daily total crime counts
daily_crime = crime_types.resample('D').count().compute()

# print daily total crime stats
print(daily_crime.head())
print("...\nTotal Days: {:,}".format(daily_crime.size))


# In[28]:

# plot daily crime counts, arrests, and domestic incidents reports
fig, ax = plt.subplots()
ax.plot(daily_crime.index, daily_crime, '--', label='Total', color='r', zorder=10)
# labelled so the arrests line shows up in the legend alongside the others
ax.plot(daily_arrests.index, daily_arrests, label='Arrests', color='#3399ff', zorder=10)
ax.fill_between(daily_domestic.index, daily_domestic, label='Domestic', color='c')
ax.set_ylabel('Number of Crimes')
ax.set_xlabel('Month')
ax.legend(loc='right')
plt.title('2017 Daily Chicago Crime reports, Arrests, and Domestic incidents')
plt.show()


# In[29]:

# get select human endangerment crimes
violent_crime_types = [
    'CRIM SEXUAL ASSAULT',
    'HOMICIDE',
    'HUMAN TRAFFICKING',
    'KIDNAPPING',
    'OFFENSE INVOLVING CHILDREN',
]
violent_crimes = crime_types[crime_types['PrimaryType'].isin(violent_crime_types)]

# print violent crimes stats
print(violent_crimes.head(10))
print('...............................................')
print(violent_crimes.PrimaryType.value_counts().head(5))


# In[30]:

# categorize by primary type
violent_crimes = violent_crimes.categorize(columns='PrimaryType')
print(violent_crimes)

# print selected violent crimes primary type categories
print('\nSelected Primary Type categories:')
print('----------------------------------------------------------------------')
print(violent_crimes.PrimaryType.cat.categories)


# In[31]:

# group violent crimes by month and crime type
# (pd.Grouper(freq=...) replaces pd.TimeGrouper, which pandas has removed)
violent_crimes_groupby = violent_crimes.groupby([pd.Grouper(freq='M'), 'PrimaryType'])
violent_crime_data = violent_crimes_groupby['PrimaryType'].count().compute().rename('Count')
print(violent_crime_data.head(10))
print('...')

# unstack violent crime type group for series plotting
violent_crime_data = violent_crime_data.unstack()
print(violent_crime_data.head(12))
print("...\nTotal Monthly/Type records: {:,}".format(violent_crime_data.size))


# In[32]:

# plot violent crime data
violent_crime_data.plot(figsize=(6,6), kind='bar')
_set_month_xticks(violent_crime_data)
plt.legend(loc='upper right', frameon=True)
plt.xlabel('2017 Month')
plt.title('Human Endangerment 2017 Chicago Crimes')
plt.tight_layout()
plt.show()


# In[33]:

# get top 5 crimes
top_5_crime_types = [
    'THEFT',
    'BATTERY',
    'CRIMINAL DAMAGE',
    'ASSAULT',
    'OTHER OFFENSE',
]
top_5_crimes = crime_types[crime_types['PrimaryType'].isin(top_5_crime_types)]
top_5_crimes.head(10)


# In[34]:

# print top 5 crime counts
top_5_crimes.PrimaryType.value_counts().head()


# In[35]:

print('Monthly Top 5 Crime Data:')
print('---------------------------------------------------------------------')

# group top 5 crimes by month and crime type
crimes_groupby = top_5_crimes.groupby([pd.Grouper(freq='M'), 'PrimaryType'])
top_5_crime_data = crimes_groupby['PrimaryType'].count().compute().rename('Count')
print(top_5_crime_data.head(10))
print('...')

# unstack top 5 crimes type group for series plotting
top_5_crime_data = top_5_crime_data.unstack()
print(top_5_crime_data.head(10))


# In[36]:

# plot top 5 crimes data
top_5_crime_data.plot(figsize=(6,6), kind='bar')
_set_month_xticks(top_5_crime_data)
plt.legend(loc='upper right', frameon=True)
plt.xlabel('2017 Month')
plt.title('Top 5 2017 Chicago Crimes by Month')
plt.tight_layout()
plt.show()


# In[37]:

print('Weekly Top 5 Crime Data:')
print('---------------------------------------------------------------------')

# group top 5 crimes by week and crime type
crimes_groupby = top_5_crimes.groupby([pd.Grouper(freq='W'), 'PrimaryType'])
top_5_crime_data = crimes_groupby['PrimaryType'].count().compute().rename('Count')
print(top_5_crime_data.head(10))
print('...')

# unstack top 5 crimes type group for series plotting
top_5_crime_data = top_5_crime_data.unstack()
print(top_5_crime_data.head(10))
print('...')


# In[38]:

# add weekly arrests data for comparison of crime fighting efforts
#top_5_crime_data['Arrests'] = weekly_arrests

# plot Weekly top 5 crimes data
top_5_crime_data.plot(figsize=(6,6))
plt.legend(loc='upper right', frameon=True)
plt.title('Weekly Top 5 2017 Chicago Crimes')
plt.tight_layout()
plt.show()


# In[39]:

print('Daily Top 5 Crime Data:')
print('---------------------------------------------------------------------')

# group top 5 crimes by day and crime type
crimes_groupby = top_5_crimes.groupby([pd.Grouper(freq='D'), 'PrimaryType'])
top_5_crime_data = crimes_groupby['PrimaryType'].count().compute().rename('Count')
print(top_5_crime_data.head(10))
print('...')

# unstack top 5 crimes type group for series plotting
top_5_crime_data = top_5_crime_data.unstack()
print(top_5_crime_data.head(10))
print('...')


# In[40]:

# add daily arrests data for comparison of crime fighting efforts
#top_5_crime_data['Arrests'] = daily_arrests

# plot top 5 daily crimes grid (one subplot per crime type, 3 per row)
top_5_crime_data.plot(
    figsize=(9, 6),
    subplots=True,
    layout=(-1, 3),
    cmap='tab10',
    sharex=False,
    sharey=False,
)
plt.show()


# In[41]:

# load Chicago community areas with pandas
# for plotting crime by Chicago 'sides'
community_areas = pd.read_csv('../data/chicago-community-areas.csv')
#, index_col='CommunityName')
community_areas.head()


# In[42]:

# get community crime stats
# The per-area counts are indexed by CommunityArea number, while
# community_areas has a positional RangeIndex, so look the totals up through
# the CommunityArea column instead of relying on index alignment.
community_totals = crimes.groupby('CommunityArea').size().compute().rename('Total')
community_areas['Total'] = community_areas['CommunityArea'].map(community_totals)
community_crime = community_areas.sort_values(by='Total', ascending=False).dropna()

# print community crime stats
print(community_crime.head())
print("...\nTotal Communities: {:,}".format(community_crime.Total.count()))


# In[43]:

# drop unused columns and reindex
community_crime = community_crime.drop(['CommunityArea', 'Side'], axis=1)  # denotes column
community_crime = community_crime.set_index('CommunityName')
community_crime.head()


# In[44]:

# plot 20 high crime communities
(
    community_crime.head(20)
    .sort_values(by='Total', ascending=True)
    .plot(kind='barh', color='#cc0000')
)
plt.ylabel('Community')
plt.xlabel('Number of Crimes')
plt.title('2017 Chicago High Crime Communities')
plt.show()


# In[45]:

# plot 20 low crime communities
(
    community_crime.tail(20)
    .sort_values(by='Total', ascending=False)
    .plot(kind='barh')
)
plt.ylabel('Community')
plt.xlabel('Number of Crimes')
plt.title('2017 Chicago Low Crime Communities')
plt.show()


# In[46]:

community_areas.head()


# In[47]:

# group crime totals by Chicago 'sides'
# Only 'Total' is summed: summing every column would also add up the
# CommunityArea ids and, on recent pandas, concatenate the CommunityName strings.
crime_by_side = community_areas.groupby('Side')[['Total']].sum()
crime_by_side


# In[48]:

# plot crime by Chicago community sides
(
    crime_by_side
    .sort_values(by='Total', ascending=True)
    .plot(kind='barh', figsize=(6,4), color='#cc0000')
)
plt.ylabel('Chicago Side')
plt.xlabel('Number of Crimes')
plt.title('2017 Chicago Crime by Side')
plt.show()


# In[49]:

# show Chicago sides svg for reference
# (a local file path must be passed as `filename`; `url` is for fetchable URLs)
display(SVG(filename='../data/chicago-community-areas.svg'))


# In[50]:

# timed cell: collects coordinates for homicides, kidnappings, criminal sexual
# assaults and thefts, then draws them as a PNG scatter "map"
# NOTE(review): latitude is plotted on the x axis and longitude on the y axis,
# so the picture is rotated relative to a conventional map -- confirm intent.
get_ipython().run_cell_magic('time', '', "\n# get crime geo data\ncrime_geo = crimes[['PrimaryType', 'Latitude', 'Longitude']].dropna()\n\n# get homicides\nhomicides = crime_geo[(crime_geo['PrimaryType']=='HOMICIDE')]\\\n.compute()\n\n# get kidnappings\nkidnappings = crime_geo[(crime_geo['PrimaryType']=='KIDNAPPING')]\\\n.compute()\n\n# get criminal sexual assaults\nsexual_assaults = crime_geo[(crime_geo['PrimaryType']=='CRIM SEXUAL ASSAULT')]\\\n.compute()\n\n# get thefts\nthefts = crime_geo[(crime_geo['PrimaryType']=='THEFT')]\\\n.compute()\n\n# create scatter map plot\n%config InlineBackend.figure_format = 'png'\nfig,ax= plt.subplots(1, 1, figsize=(10,12))\nplt.xlim(41.65, 42.02) # (minLat, maxLat)\nplt.ylim(-87.78, -87.53) # (minLong, maxLong)\n#ax.set_facecolor('black')\n\n# plot thefts, homicides, kidnappings and sexual assaults\nax.scatter(thefts['Latitude'], thefts['Longitude'],\\\n s=1, alpha=0.8, color='#66ff99', label='Theft')\nax.scatter(homicides['Latitude'], homicides['Longitude'],\\\n s=10, alpha=0.8, color='#ff0000', label='Homicide')\nax.scatter(kidnappings['Latitude'], kidnappings['Longitude'],\\\n s=10, alpha=0.8, color='#3333ff', label='Kidnapping')\nax.scatter(sexual_assaults['Latitude'], sexual_assaults['Longitude'],\\\n s=8, alpha=0.8, color='#ff9933', label='Criminal Sexual Assault')\nplt.xlabel('Latitude')\nplt.ylabel('Longitude')\nplt.title('2017 Chicago Crime Map')\nplt.legend(loc='upper right', frameon=True)\nplt.show()\n")


# In[ ]: