import dask.dataframe as dd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
# Render charts inline in the notebook, then apply the seaborn look:
# the matplotlib 'seaborn' style sheet first, seaborn's whitegrid theme second.
# NOTE(review): newer matplotlib releases are understood to ship this style
# sheet under the name 'seaborn-v0_8' - confirm against the installed version.
plt.style.use('seaborn')
sns.set_style('whitegrid')
%%time
# set parquet data folder path
parquet_data_folder = '../data/crimes-2001-to-present.snappy.parq'
print('Loading crime data from: {}'.format(parquet_data_folder))
# load crimes parquet data into dask df
crimes = dd.read_parquet(parquet_data_folder, index='Date')
# load all data into memory
crimes = crimes.persist()
print('Crime data loaded into memory.')
Loading crime data from: ../data/crimes-2001-to-present.snappy.parq Crime data loaded into memory. Wall time: 10.2 s
%%time
# log records count and data frame/partitions stats
print('Chicago Crime data stats:')
print('---------------------------------------')
print('{:,} total records in {} partitions'.format(len(crimes), crimes.npartitions))
print('DataFrame size: {:,}'.format(crimes.size.compute()))
Chicago Crime data stats: --------------------------------------- 6,401,616 total records in 1 partitions DataFrame size: 89,622,624 Wall time: 0 ns
# check data frame structure: evaluating the dask dataframe as the last
# expression of the cell displays its columns, dtypes and partition count
crimes
Block | PrimaryType | FBICode | Description | LocationDescription | CommunityArea | Beat | District | Ward | Arrest | Domestic | Latitude | Longitude | Year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
npartitions=1 | ||||||||||||||
2001-01-01 00:00:00 | object | category[unknown] | category[unknown] | category[unknown] | category[unknown] | category[unknown] | category[unknown] | category[unknown] | category[unknown] | bool | bool | float64 | float64 | category[unknown] |
2017-08-25 23:59:00 | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
%%time
# preview the first 5 crime records; rows are indexed by Date
crimes.head(5)
Wall time: 0 ns
Block | PrimaryType | FBICode | Description | LocationDescription | CommunityArea | Beat | District | Ward | Arrest | Domestic | Latitude | Longitude | Year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Date | ||||||||||||||
2001-01-01 | 064XX S CAMPBELL AVE | OFFENSE INVOLVING CHILDREN | 20 | CRIM SEX ABUSE BY FAM MEMBER | RESIDENCE | 66.0 | 825 | 8.0 | 15.0 | False | True | 41.777017 | -87.686172 | 2001 |
2001-01-01 | 063XX S CENTRAL AV | OTHER OFFENSE | 26 | TELEPHONE THREAT | OTHER | NaN | 813 | 8.0 | NaN | False | False | 41.777538 | -87.761552 | 2001 |
2001-01-01 | 031XX W DOUGLAS BLVD | SEX OFFENSE | 17 | AGG CRIMINAL SEXUAL ABUSE | CHURCH/SYNAGOGUE/PLACE OF WORSHIP | 29.0 | 1022 | 10.0 | 24.0 | False | False | NaN | NaN | 2001 |
2001-01-01 | 007XX N MICHIGAN AVE | THEFT | 06 | FINANCIAL ID THEFT: OVER $300 | RESIDENCE | 8.0 | 1833 | 18.0 | 42.0 | False | False | 41.896660 | -87.624110 | 2001 |
2001-01-01 | 076XX S EMERALD AVE | THEFT | 06 | AGG: FINANCIAL ID THEFT | RESIDENCE | 71.0 | 621 | 6.0 | 17.0 | False | False | 41.756087 | -87.642812 | 2001 |
%%time
# get crime types
crime_types = crimes[['PrimaryType']]
# get crime counts by primary type
crime_type_total = crime_types.groupby('PrimaryType').size().compute()
# get arrests and domestic counts for each primary crime type
crime_type_counts = crimes[['PrimaryType', 'Arrest', 'Domestic']]\
.groupby('PrimaryType').sum().compute()
# add crime type totals column
crime_type_counts['Total'] = crime_type_total
# print all crime stats (arrests, domestic, and totals)
print(crime_type_counts.sort_values(by='Total', ascending=False))
Arrest Domestic Total PrimaryType THEFT 161206.0 31004.0 1336422 BATTERY 268091.0 493984.0 1170982 CRIMINAL DAMAGE 52256.0 55455.0 735055 NARCOTICS 683386.0 245.0 687684 OTHER OFFENSE 70194.0 115557.0 397375 ASSAULT 92630.0 85388.0 393800 BURGLARY 21364.0 2467.0 372293 MOTOR VEHICLE THEFT 27796.0 1141.0 301173 ROBBERY 23657.0 3902.0 242663 DECEPTIVE PRACTICE 42641.0 2771.0 236939 CRIMINAL TRESPASS 136845.0 5843.0 184854 PROSTITUTION 66102.0 6.0 66362 WEAPONS VIOLATION 51657.0 227.0 64252 PUBLIC PEACE VIOLATION 29623.0 2144.0 46090 OFFENSE INVOLVING CHILDREN 9113.0 21896.0 42164 CRIM SEXUAL ASSAULT 4008.0 3260.0 24869 SEX OFFENSE 7424.0 1834.0 23539 GAMBLING 14057.0 3.0 14160 LIQUOR LAW VIOLATION 13624.0 8.0 13749 INTERFERENCE WITH PUBLIC OFFICER 12432.0 68.0 13607 ARSON 1408.0 555.0 10690 HOMICIDE 4006.0 563.0 8507 KIDNAPPING 724.0 1781.0 6451 INTIMIDATION 676.0 481.0 3733 STALKING 514.0 1335.0 3144 OBSCENITY 377.0 58.0 455 PUBLIC INDECENCY 145.0 1.0 146 CONCEALED CARRY LICENSE VIOLATION 112.0 0.0 125 OTHER NARCOTIC VIOLATION 84.0 0.0 116 NON-CRIMINAL 8.0 1.0 115 NON - CRIMINAL 6.0 0.0 38 HUMAN TRAFFICKING 4.0 3.0 34 RITUALISM 3.0 5.0 23 NON-CRIMINAL (SUBJECT SPECIFIED) 2.0 5.0 6 DOMESTIC VIOLENCE 1.0 1.0 1 Wall time: 281 ms
# horizontal bar chart of report totals for every primary crime type
all_crime_types = (
    crime_type_counts
    .sort_values(by='Total', ascending=True)
    .drop(['Arrest', 'Domestic'], axis=1)  # axis=1 drops columns
)
ax = all_crime_types.plot(kind='barh', figsize=(10,8), color='#cc0000')
ax.set_title('All Chicago Crimes by Type (2001-2017)')
ax.set_xlabel('Number of Crime reports')
ax.set_ylabel('Crime Type')
plt.tight_layout()
# thousands separators on the count axis
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
# horizontal bar chart of the top crime types (>= 200,000 reports);
# the Arrest, Domestic and Total columns are all drawn
ax = (
    crime_type_counts[crime_type_counts['Total'] >= 200000]
    .sort_values(by='Total', ascending=True)
    .plot(kind='barh', figsize=(10,8))
)
ax.set_title('Top Chicago Crimes (>= 200K reports) - (2001-2017)')
ax.set_xlabel('Number of Crimes')
ax.set_ylabel('Crime Type')
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
# get narcotics crimes
narcotics = crimes[crimes['PrimaryType']=='NARCOTICS']
# count reports per narcotics crime description, largest first
narcotics_crimes = (
    narcotics[['PrimaryType', 'Description']]
    .groupby('Description')
    .count()
    .compute()
    .sort_values(by='PrimaryType', ascending=False)
)
# 'Description' is a categorical column, so the grouped result carries a row
# for every known description, including ones that never occur for narcotics
# (the unfiltered total came out identical for every primary type).
# Keep only descriptions that actually have reports so the total is meaningful.
narcotics_crimes = narcotics_crimes[narcotics_crimes['PrimaryType'] > 0]
print(narcotics_crimes.head(10))
print('...')
print('Total Narcotics Crime Descriptions: {}'.format(len(narcotics_crimes)))
PrimaryType Description POSS: CANNABIS 30GMS OR LESS 274724 POSS: CRACK 117403 POSS: HEROIN(WHITE) 88928 SOLICIT NARCOTICS ON PUBLICWAY 26144 POSS: COCAINE 23964 MANU/DELIVER:CRACK 23615 MANU/DELIVER: HEROIN (WHITE) 23531 POSS: CANNABIS MORE THAN 30GMS 16443 ATTEMPT POSSESSION NARCOTICS 16035 FOUND SUSPECT NARCOTICS 14976 ... Total Narcotics Crime Descriptions: 379
# horizontal bar chart of the 20 most reported narcotics descriptions,
# re-sorted ascending so the largest bar is drawn at the top
ax = (
    narcotics_crimes.head(20)
    .sort_values(by='PrimaryType', ascending=True)
    .plot(kind='barh', figsize=(6,6), color='#cc0000')
)
ax.set_title('Top Chicago Narcotics Crimes (2001-2017)')
ax.set_xlabel('Number of Crimes')
ax.set_ylabel('Narcotics Crime')
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
# get other offenses crimes
other_offenses = crimes[crimes['PrimaryType']=='OTHER OFFENSE']
# count reports per other-offense crime description, largest first
other_offense_crimes = (
    other_offenses[['PrimaryType', 'Description']]
    .groupby('Description')
    .count()
    .compute()
    .sort_values(by='PrimaryType', ascending=False)
)
# 'Description' is a categorical column, so the grouped result carries a row
# for every known description, including ones that never occur for this
# primary type (the unfiltered total came out identical for every type).
# Keep only descriptions that actually have reports so the total is meaningful.
other_offense_crimes = other_offense_crimes[other_offense_crimes['PrimaryType'] > 0]
print(other_offense_crimes.head(10))
print('...')
print('Total Other Offense Crime Descriptions: {}'.format(len(other_offense_crimes)))
PrimaryType Description TELEPHONE THREAT 129722 HARASSMENT BY TELEPHONE 97048 OTHER VEHICLE OFFENSE 34483 VIOLATE ORDER OF PROTECTION 31547 HARASSMENT BY ELECTRONIC MEANS 20470 FALSE/STOLEN/ALTERED TRP 19577 OTHER CRIME INVOLVING PROPERTY 11170 OTHER WEAPONS VIOLATION 7101 VEHICLE TITLE/REG OFFENSE 6668 LICENSE VIOLATION 6360 ... Total Other Offense Crime Descriptions: 379
# horizontal bar chart of the 20 most reported other-offense descriptions,
# re-sorted ascending so the largest bar is drawn at the top
ax = (
    other_offense_crimes.head(20)
    .sort_values(by='PrimaryType', ascending=True)
    .plot(kind='barh', figsize=(6,6), color='#cc0000')
)
ax.set_title('Top Chicago Other Offense Crimes (2001-2017)')
ax.set_xlabel('Number of Crimes')
ax.set_ylabel('Other Offense Crime')
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
# crimes per day of the week: group on the Date index's dayofweek value
# and label the seven bar positions with the names in `days`
days = ['Monday','Tuesday','Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
ax = (
    crimes.groupby([crimes.index.dayofweek])
    .size()
    .compute()
    .plot(kind='barh', figsize=(6,3), color='#cc0000')
)
ax.set_title('All Chicago Crimes by Day of the Week (2001-2017)')
ax.set_xlabel('Number of Crimes')
ax.set_ylabel('Week Day')
plt.yticks(np.arange(7), days)
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
# crimes per month of the year: same approach, grouped on the index month
ax = (
    crimes.groupby([crimes.index.month])
    .size()
    .compute()
    .plot(kind='barh', figsize=(8,5), color='#cc0000')
)
ax.set_title('All Chicago Crimes by Month of the Year (2001-2017)')
ax.set_xlabel('Number of crimes')
ax.set_ylabel('Month of the year')
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
%%time
print('\nParsing All Crimes...')
# get monthly total crime counts
monthly_crime = crime_types.resample('M').count().compute()
# print monthly total crime stats
print('---------------------------------------')
print(monthly_crime.head())
print('...\nTotal Crime records: {:,}'.format(len(crimes)))
print('Total Months: {:,}'.format(monthly_crime.size))
# dataframe stats print util function
def print_dataframe_stats(df, title):
    """Print a titled preview of ``df`` followed by its total element count.

    Args:
        df: object exposing ``head()`` and a ``size`` attribute whose
            ``compute()`` returns a number (dask collections in this notebook).
        title: label used for the section header and for the total line.
    """
    print('\n{}\n---------------------------------------'.format(title))
    print(df.head())
    # size is lazy here, so it has to be computed before formatting
    print('...\nTotal {}: {:,}'.format(title, df.size.compute()))
# 'Arrest' flags of the records where an arrest was made
arrests = crimes['Arrest'][crimes['Arrest'] == True]
print_dataframe_stats(arrests, 'Arrests')
# 'Domestic' flags of the records marked as domestic incidents
domestic = crimes['Domestic'][crimes['Domestic'] == True]
print_dataframe_stats(domestic, 'Domestic')
Parsing All Crimes... --------------------------------------- PrimaryType 2001-01-31 38009 2001-02-28 33711 2001-03-31 40418 2001-04-30 40030 2001-05-31 41734 ... Total Crime records: 6,401,616 Total Months: 200 Arrests --------------------------------------- Date 2001-01-01 True 2001-01-01 True 2001-01-01 True 2001-01-01 True 2001-01-01 True Name: Arrest, dtype: bool ... Total Arrests: 1,796,176 Domestic --------------------------------------- Date 2001-01-01 True 2001-01-01 True 2001-01-01 True 2001-01-01 True 2001-01-01 True Name: Domestic, dtype: bool ... Total Domestic: 831,992 Wall time: 1.62 s
# monthly arrest totals: every kept flag is True, so the sum is a count
monthly_arrests = arrests.resample('M').sum().compute()
ax = monthly_arrests.plot(figsize=(10,6), color='#3399ff')
ax.set_title('Monthly Chicago Arrests (2001-2017)')
ax.set_xlabel('Months')
ax.set_ylabel('Arrests')
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
# get monthly domestic crimes
monthly_domestic = domestic.resample('M').sum().compute()
# plot monthly crime counts, arrests, and domestic incidents reports
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(monthly_crime.index, monthly_crime, '--', label='Total', color='#cc0000')
# label the arrests line as well, otherwise it is left out of the legend
ax.plot(monthly_arrests.index, monthly_arrests, label='Arrests',
        color='#3399ff', zorder=10)
ax.fill_between(monthly_domestic.index, monthly_domestic,
                label='Domestic', color='c')
ax.set_xlabel('Months')
ax.set_ylabel('Number of Crimes')
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
ax.legend(loc='upper right')
plt.title('Monthly Chicago Crime reports, Arrests, and Domestic incidents (2001-2017)')
plt.show()
%%time
# get crimes rolling sum for a smoother crime decline chart display
crimes_rolling_sum = crime_types.resample('D').count().rolling(365).sum().compute()
crimes_rolling_sum.plot(figsize=(10,6), color='#cc0000')
plt.title('Rolling sum of all Chicago crimes over 365 days period (2001-2017)')
plt.xlabel('Days')
plt.ylabel('Number of Crimes')
ax = plt.gca()
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
Wall time: 1.3 s
# crime types with at least 200,000 reports, largest total first
top_crime_types = (
    crime_type_counts[crime_type_counts['Total'] >= 200000]
    .sort_values(by='Total', ascending=False)
)
top_crime_types.head(10)
Arrest | Domestic | Total | |
---|---|---|---|
PrimaryType | |||
THEFT | 161206.0 | 31004.0 | 1336422 |
BATTERY | 268091.0 | 493984.0 | 1170982 |
CRIMINAL DAMAGE | 52256.0 | 55455.0 | 735055 |
NARCOTICS | 683386.0 | 245.0 | 687684 |
OTHER OFFENSE | 70194.0 | 115557.0 | 397375 |
ASSAULT | 92630.0 | 85388.0 | 393800 |
BURGLARY | 21364.0 | 2467.0 | 372293 |
MOTOR VEHICLE THEFT | 27796.0 | 1141.0 | 301173 |
ROBBERY | 23657.0 | 3902.0 | 242663 |
DECEPTIVE PRACTICE | 42641.0 | 2771.0 | 236939 |
%%time
print('Getting Top Crimes by Month...')
# get top crimes stats
# TODO: use isin and index by primary type to speed up this query?
top_crimes = crime_types[(crime_types['PrimaryType']=='THEFT') | \
(crime_types['PrimaryType']=='BATTERY') | \
(crime_types['PrimaryType']=='CRIMINAL DAMAGE') | \
(crime_types['PrimaryType']=='NARCOTICS') | \
(crime_types['PrimaryType']=='OTHER OFFENSE') | \
(crime_types['PrimaryType']=='ASSAULT') | \
(crime_types['PrimaryType']=='BURGLARY') | \
(crime_types['PrimaryType']=='MOTOR VEHICLE THEFT') | \
(crime_types['PrimaryType']=='ROBBERY') | \
(crime_types['PrimaryType']=='DECEPTIVE PRACTICE')]
#print(top_crimes.head(10))
# group top crimes by month and crime type
crimes_groupby = top_crimes.groupby([pd.TimeGrouper('M'), 'PrimaryType'])
top_crime_data = crimes_groupby['PrimaryType'].count().compute().rename('Count')
print(top_crime_data.head(10))
print('...')
# unstack top crimes type group for series plotting
top_crime_data = top_crime_data.unstack()
print(top_crime_data.head(2))
print('...')
Getting Top Crimes by Month... Date PrimaryType 2001-01-31 OTHER OFFENSE 2797 THEFT 7852 DECEPTIVE PRACTICE 1383 CRIMINAL DAMAGE 3960 ASSAULT 2122 BATTERY 6520 BURGLARY 1932 MOTOR VEHICLE THEFT 2092 ROBBERY 1396 NARCOTICS 4708 Name: Count, dtype: int64 ... PrimaryType OTHER OFFENSE THEFT DECEPTIVE PRACTICE CRIMINAL DAMAGE \ Date 2001-01-31 2797 7852 1383 3960 2001-02-28 2485 6660 1155 3656 PrimaryType ASSAULT BATTERY BURGLARY MOTOR VEHICLE THEFT ROBBERY \ Date 2001-01-31 2122 6520 1932 2092 1396 2001-02-28 2028 6040 1664 1779 1159 PrimaryType NARCOTICS Date 2001-01-31 4708 2001-02-28 4344 ... Wall time: 2.42 s
# one line per top crime type on a shared monthly axis
ax = top_crime_data.plot(figsize=(10,8), cmap='tab10')
ax.legend(loc='upper right', frameon=True)
ax.set_title('Top Chicago Crimes by Month')
plt.tight_layout()
ax.set_xlabel('Months')
ax.set_ylabel('Number of Crimes')
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
# the same series as a three-column grid of independent charts
top_crime_data.plot(
    figsize=(12, 12),
    subplots=True,
    layout=(-1, 3),
    cmap='tab10',
    sharex=False,
    sharey=False,
)
plt.show()
%%time
print('Getting Property Crimes by Month...')
# get top crimes stats
property_crimes = crime_types[(crime_types['PrimaryType']=='ARSON') | \
(crime_types['PrimaryType']=='BURGLARY') | \
(crime_types['PrimaryType']=='MOTOR VEHICLE THEFT') | \
(crime_types['PrimaryType']=='THEFT')]
#print(property_crimes.head(10))
# group property crimes by month and crime type
crimes_groupby = property_crimes.groupby([pd.TimeGrouper('M'), 'PrimaryType'])
property_crime_data = crimes_groupby['PrimaryType'].count().compute().rename('Count')
print(property_crime_data.head(10))
print('...')
# unstack top crimes type group for series plotting
property_crime_data = property_crime_data.unstack()
print(property_crime_data.head(2))
print('...')
Getting Property Crimes by Month... Date PrimaryType 2001-01-31 THEFT 7852 BURGLARY 1932 MOTOR VEHICLE THEFT 2092 ARSON 67 2001-02-28 THEFT 6660 BURGLARY 1664 MOTOR VEHICLE THEFT 1779 ARSON 57 2001-03-31 THEFT 7751 BURGLARY 1825 Name: Count, dtype: int64 ... PrimaryType THEFT BURGLARY MOTOR VEHICLE THEFT ARSON Date 2001-01-31 7852 1932 2092 67 2001-02-28 6660 1664 1779 57 ... Wall time: 875 ms
# one line per property crime type on a shared monthly axis
ax = property_crime_data.plot(figsize=(10,8), cmap='tab10')
ax.legend(loc='upper right', frameon=True)
ax.set_title('Chicago Property Crimes by Month')
plt.tight_layout()
ax.set_xlabel('Months')
ax.set_ylabel('Number of Crimes')
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
# the same series as a three-column grid of independent charts
property_crime_data.plot(
    figsize=(12, 6),
    subplots=True,
    layout=(-1, 3),
    cmap='tab10',
    sharex=False,
    sharey=False,
)
plt.show()
%%time
print('Getting Quality of Life Crimes by Month...')
# get top crimes stats
q_of_life_crimes = crime_types[(crime_types['PrimaryType']=='CRIMINAL DAMAGE') | \
(crime_types['PrimaryType']=='NARCOTICS') | \
(crime_types['PrimaryType']=='PROSTITUTION')]
#print(q_of_life_crimes.head(10))
# group quality of life crimes by month and crime type
crimes_groupby = q_of_life_crimes.groupby([pd.TimeGrouper('M'), 'PrimaryType'])
q_of_life_crime_data = crimes_groupby['PrimaryType'].count().compute().rename('Count')
print(q_of_life_crime_data.head(10))
print('...')
# unstack top crimes type group for series plotting
q_of_life_crime_data = q_of_life_crime_data.unstack()
print(q_of_life_crime_data.head(2))
print('...')
Getting Quality of Life Crimes by Month... Date PrimaryType 2001-01-31 CRIMINAL DAMAGE 3960 NARCOTICS 4708 PROSTITUTION 560 2001-02-28 CRIMINAL DAMAGE 3656 NARCOTICS 4344 PROSTITUTION 421 2001-03-31 CRIMINAL DAMAGE 4602 NARCOTICS 4797 PROSTITUTION 540 2001-04-30 CRIMINAL DAMAGE 4910 Name: Count, dtype: int64 ... PrimaryType CRIMINAL DAMAGE NARCOTICS PROSTITUTION Date 2001-01-31 3960 4708 560 2001-02-28 3656 4344 421 ... Wall time: 656 ms
# one line per quality of life crime type on a shared monthly axis
ax = q_of_life_crime_data.plot(figsize=(10,8), cmap='tab10')
ax.legend(loc='upper right', frameon=True)
ax.set_title('Chicago Quality of Life Crimes by Month')
plt.tight_layout()
ax.set_xlabel('Months')
ax.set_ylabel('Number of Crimes')
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
# the same series as a three-column grid of independent charts
q_of_life_crime_data.plot(
    figsize=(12, 3),
    subplots=True,
    layout=(-1, 3),
    cmap='tab10',
    sharex=False,
    sharey=False,
)
plt.show()
%%time
# get all crime data by month and primary type
crimes_groupby = crime_types.groupby([pd.TimeGrouper('M'), 'PrimaryType'])
all_crime_data = crimes_groupby['PrimaryType'].count().compute().rename('Count')
print(all_crime_data.head(10))
print('...')
# unstack all crimes data for series plotting
all_crime_data = all_crime_data.unstack()
print(all_crime_data.head(2))
Date PrimaryType 2001-01-31 OFFENSE INVOLVING CHILDREN 239 OTHER OFFENSE 2797 SEX OFFENSE 185 THEFT 7852 DECEPTIVE PRACTICE 1383 CRIM SEXUAL ASSAULT 162 CRIMINAL DAMAGE 3960 ASSAULT 2122 BATTERY 6520 BURGLARY 1932 Name: Count, dtype: int64 ... PrimaryType OFFENSE INVOLVING CHILDREN OTHER OFFENSE SEX OFFENSE THEFT \ Date 2001-01-31 239.0 2797.0 185.0 7852.0 2001-02-28 151.0 2485.0 149.0 6660.0 PrimaryType DECEPTIVE PRACTICE CRIM SEXUAL ASSAULT CRIMINAL DAMAGE \ Date 2001-01-31 1383.0 162.0 3960.0 2001-02-28 1155.0 135.0 3656.0 PrimaryType ASSAULT BATTERY BURGLARY ... \ Date ... 2001-01-31 2122.0 6520.0 1932.0 ... 2001-02-28 2028.0 6040.0 1664.0 ... PrimaryType RITUALISM DOMESTIC VIOLENCE OBSCENITY \ Date 2001-01-31 2.0 1.0 2.0 2001-02-28 NaN NaN 1.0 PrimaryType OTHER NARCOTIC VIOLATION PUBLIC INDECENCY NON-CRIMINAL \ Date 2001-01-31 1.0 NaN NaN 2001-02-28 NaN 1.0 NaN PrimaryType NON-CRIMINAL (SUBJECT SPECIFIED) HUMAN TRAFFICKING \ Date 2001-01-31 NaN NaN 2001-02-28 NaN NaN PrimaryType NON - CRIMINAL CONCEALED CARRY LICENSE VIOLATION Date 2001-01-31 NaN NaN 2001-02-28 NaN NaN [2 rows x 35 columns] Wall time: 2.17 s
%%time
# plot all crimes over time individually
all_crime_data.plot(figsize=(12, 36), subplots=True, layout=(-1, 3),\
sharex=False, sharey=False)
plt.show()
Wall time: 9.17 s
# report counts per location description, largest first, named 'Total'
crime_locations = (
    crimes.groupby('LocationDescription')
    .size()
    .compute()
    .sort_values(ascending=False)
    .rename('Total')
)
# print crime location stats
print(crime_locations.head())
print("...\nTotal Locations: {:,}".format(crime_locations.size))
LocationDescription STREET 1691235 RESIDENCE 1085715 APARTMENT 654736 SIDEWALK 637895 OTHER 242088 Name: Total, dtype: int64 ... Total Locations: 173
# horizontal bar chart of the 30 most frequent crime locations,
# re-sorted ascending so the largest bar is drawn at the top
ax = (
    crime_locations.head(30)
    .sort_values(ascending=True)
    .plot(kind='barh', figsize=(10,8), color='#cc0000')
)
ax.set_title('Top 30 Chicago Crime Locations (2001-2017)')
ax.set_xlabel('Number of Crimes')
ax.set_ylabel('Location')
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
# load Chicago community areas with pandas
# for plotting crime by Chicago 'sides'
community_areas = pd.read_csv('../data/chicago-community-areas.csv') #, index_col='CommunityName')
community_areas.head()
# get community crime stats: attach the per-area report counts as 'Total'
# NOTE(review): the counts are indexed by CommunityArea while community_areas
# keeps the row labels read_csv gave it, so this assignment aligns area
# numbers against row labels - presumably the CSV is ordered so that the row
# label equals the CommunityArea number; verify against the CSV.
community_areas['Total'] = crimes.groupby('CommunityArea').size().compute().rename('Total')
# rank communities by total, dropping rows that contain any missing value
community_crime = community_areas.sort_values(by='Total', ascending=False).dropna()
# print community crime stats
print(community_crime.head())
print("...\nTotal Communities: {:,}".format(community_crime.Total.count()))
CommunityArea CommunityName Side Total 25 25 Austin West Side 370680 8 8 Near North Side Central 192403 43 43 South Shore South Side 189126 23 23 Humboldt Park West Side 183219 24 24 West Town West Side 171884 ... Total Communities: 77
D:\tools\dev\python\Anaconda3-4.2.0\lib\site-packages\pandas\core\indexes\category.py:138: RuntimeWarning: Values and categories have different dtypes. Did you mean to use 'Categorical.from_codes(codes, categories)'? data = Categorical(data, categories=categories, ordered=ordered)
# keep only the totals and key the rows by community name
community_crime = (
    community_crime
    .drop(['CommunityArea', 'Side'], axis=1)  # axis=1 drops columns
    .set_index('CommunityName')
)
community_crime.head()
Total | |
---|---|
CommunityName | |
Austin | 370680 |
Near North Side | 192403 |
South Shore | 189126 |
Humboldt Park | 183219 |
West Town | 171884 |
# horizontal bar chart of the first 20 rows of community_crime
# (it is sorted by total, largest first), largest bar drawn at the top
ax = (
    community_crime.head(20)
    .sort_values(by='Total', ascending=True)
    .plot(kind='barh', figsize=(10,8), color='#cc0000')
)
ax.set_title('High Crime Chicago Communities (2001-2017)')
ax.set_xlabel('Number of Crimes')
ax.set_ylabel('Community')
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
# narrow the data to community area 25 (Austin) to see why its total is so high
Austin_crime = (
    crimes[crimes['CommunityArea']==25]
    .drop(['CommunityArea', 'Year'], axis=1)  # axis=1 drops columns
)
# report counts per Austin block, busiest block first
Austin_crime_blocks = (
    Austin_crime[['Block']]
    .groupby('Block')
    .size()
    .compute()
    .sort_values(ascending=False)
)
print('Top Austin community crime blocks:')
print(Austin_crime_blocks.head(20))
print('...\nTotal blocks: {}'.format(len(Austin_crime_blocks)))
Top Austin community crime blocks: Block 046XX W NORTH AVE 2883 051XX W MADISON ST 2563 003XX N CENTRAL AVE 2459 048XX W NORTH AVE 2199 052XX W MADISON ST 2028 002XX N PINE AVE 1935 051XX W CHICAGO AVE 1847 055XX W NORTH AVE 1773 003XX S CICERO AVE 1743 055XX W WASHINGTON BLVD 1621 051XX W NORTH AVE 1609 050XX W WASHINGTON BLVD 1531 049XX W NORTH AVE 1525 001XX N PINE AVE 1504 056XX W MADISON ST 1443 055XX W CONGRESS PKWY 1438 050XX W MADISON ST 1433 001XX N LOTUS AVE 1422 053XX W MADISON ST 1380 054XX W WASHINGTON BLVD 1365 dtype: int64 ... Total blocks: 1227
# get Austin crimes by type, most reported first
Austin_crime_by_type = (
    Austin_crime[['PrimaryType']]
    .groupby('PrimaryType')
    .size()
    .compute()
    .rename('Total')
    .sort_values(ascending=False)
)
print('Austin community crimes by type:')
print(Austin_crime_by_type)
print('\nTop Austin community crimes:')
print(Austin_crime_by_type.index[:24])
# group the 24 most reported Austin crime types by month and crime type;
# pd.Grouper(freq='M') is the supported spelling of the deprecated/removed
# pd.TimeGrouper('M')
crimes_groupby = (
    Austin_crime[Austin_crime['PrimaryType'].isin(Austin_crime_by_type.index[:24])]
    .groupby([pd.Grouper(freq='M'), 'PrimaryType'])
)
Austin_crime_data = crimes_groupby['PrimaryType'].count().compute().rename('Count')
print('\nAustin community crime by month:')
print(Austin_crime_data.head(10))
print('...')
# unstack top crimes type group for series plotting (one column per type)
Austin_crime_data = Austin_crime_data.unstack()
Austin_crime_data.tail()
Austin community crimes by type: PrimaryType NARCOTICS 87280 BATTERY 73638 THEFT 47963 CRIMINAL DAMAGE 32492 OTHER OFFENSE 22181 ASSAULT 21924 ROBBERY 15618 BURGLARY 14830 MOTOR VEHICLE THEFT 13960 DECEPTIVE PRACTICE 8719 PROSTITUTION 7322 CRIMINAL TRESPASS 6947 WEAPONS VIOLATION 4647 GAMBLING 2376 PUBLIC PEACE VIOLATION 2318 OFFENSE INVOLVING CHILDREN 2247 CRIM SEXUAL ASSAULT 1513 INTERFERENCE WITH PUBLIC OFFICER 1359 SEX OFFENSE 949 HOMICIDE 696 ARSON 598 LIQUOR LAW VIOLATION 545 KIDNAPPING 285 INTIMIDATION 136 STALKING 95 OBSCENITY 17 CONCEALED CARRY LICENSE VIOLATION 9 PUBLIC INDECENCY 6 HUMAN TRAFFICKING 3 OTHER NARCOTIC VIOLATION 3 NON-CRIMINAL 2 NON - CRIMINAL 1 RITUALISM 1 NON-CRIMINAL (SUBJECT SPECIFIED) 0 DOMESTIC VIOLENCE 0 Name: Total, dtype: int64 Top Austin community crimes: CategoricalIndex(['NARCOTICS', 'BATTERY', 'THEFT', 'CRIMINAL DAMAGE', 'OTHER OFFENSE', 'ASSAULT', 'ROBBERY', 'BURGLARY', 'MOTOR VEHICLE THEFT', 'DECEPTIVE PRACTICE', 'PROSTITUTION', 'CRIMINAL TRESPASS', 'WEAPONS VIOLATION', 'GAMBLING', 'PUBLIC PEACE VIOLATION', 'OFFENSE INVOLVING CHILDREN', 'CRIM SEXUAL ASSAULT', 'INTERFERENCE WITH PUBLIC OFFICER', 'SEX OFFENSE', 'HOMICIDE', 'ARSON', 'LIQUOR LAW VIOLATION', 'KIDNAPPING', 'INTIMIDATION'], categories=['OFFENSE INVOLVING CHILDREN', 'OTHER OFFENSE', 'SEX OFFENSE', 'THEFT', 'DECEPTIVE PRACTICE', 'CRIM SEXUAL ASSAULT', 'CRIMINAL DAMAGE', 'ASSAULT', ...], ordered=False, name='PrimaryType', dtype='category') Austin community crime by month: Date PrimaryType 2001-01-31 OFFENSE INVOLVING CHILDREN 5 THEFT 13 DECEPTIVE PRACTICE 2 CRIM SEXUAL ASSAULT 1 MOTOR VEHICLE THEFT 2 NARCOTICS 4 HOMICIDE 5 2001-02-28 OTHER OFFENSE 1 SEX OFFENSE 1 BATTERY 1 Name: Count, dtype: int64 ...
PrimaryType | OFFENSE INVOLVING CHILDREN | OTHER OFFENSE | SEX OFFENSE | THEFT | DECEPTIVE PRACTICE | CRIM SEXUAL ASSAULT | CRIMINAL DAMAGE | ASSAULT | BATTERY | BURGLARY | ... | NARCOTICS | LIQUOR LAW VIOLATION | PUBLIC PEACE VIOLATION | INTERFERENCE WITH PUBLIC OFFICER | HOMICIDE | GAMBLING | KIDNAPPING | PROSTITUTION | INTIMIDATION | ARSON |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Date | |||||||||||||||||||||
2017-04-30 | 8.0 | 99.0 | 5.0 | 155.0 | 68.0 | 7.0 | 168.0 | 99.0 | 284.0 | 39.0 | ... | 124.0 | 1.0 | 13.0 | 6.0 | 5.0 | 5.0 | 2.0 | 23.0 | NaN | 1.0 |
2017-05-31 | 6.0 | 107.0 | 4.0 | 167.0 | 71.0 | 6.0 | 141.0 | 106.0 | 306.0 | 56.0 | ... | 140.0 | 2.0 | 6.0 | 11.0 | 8.0 | 2.0 | 2.0 | 25.0 | NaN | 4.0 |
2017-06-30 | 9.0 | 94.0 | 5.0 | 167.0 | 59.0 | 6.0 | 159.0 | 106.0 | 342.0 | 59.0 | ... | 115.0 | NaN | 18.0 | 9.0 | 14.0 | 5.0 | 1.0 | 6.0 | NaN | 5.0 |
2017-07-31 | 9.0 | 101.0 | 5.0 | 195.0 | 64.0 | 10.0 | 150.0 | 111.0 | 332.0 | 39.0 | ... | 126.0 | 3.0 | 15.0 | 9.0 | 4.0 | 5.0 | 1.0 | 10.0 | 3.0 | 1.0 |
2017-08-31 | 6.0 | 77.0 | 1.0 | 177.0 | 40.0 | 4.0 | 127.0 | 89.0 | 258.0 | 34.0 | ... | 87.0 | NaN | 6.0 | 5.0 | 4.0 | NaN | NaN | 11.0 | NaN | 2.0 |
5 rows × 24 columns
# every top Austin crime type over time, each in its own chart,
# laid out three per row with independent axes
Austin_crime_data.plot(
    figsize=(12, 24),
    subplots=True,
    layout=(-1, 3),
    sharex=False,
    sharey=False,
)
plt.show()
# the three Austin crime types singled out as being on the rise
ax = (
    Austin_crime_data[['HOMICIDE', 'WEAPONS VIOLATION', 'DECEPTIVE PRACTICE']]
    .plot(figsize=(10,8), cmap='tab10')
)
ax.legend(loc='upper right', frameon=True)
ax.set_title('Austin Chicago community Crimes on the Rise')
plt.tight_layout()
ax.set_xlabel('Months')
ax.set_ylabel('Number of Crimes')
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
%%time
# get Austin crime geo data
Austin_crime_geo = Austin_crime[['PrimaryType', 'Latitude', 'Longitude']].dropna()
s1 = Austin_crime_geo[(Austin_crime_geo['PrimaryType']=='HOMICIDE')].compute()
s2 = Austin_crime_geo[(Austin_crime_geo['PrimaryType']=='WEAPONS VIOLATION')].compute()
s3 = Austin_crime_geo[(Austin_crime_geo['PrimaryType']=='KIDNAPPING')].compute()
s4 = Austin_crime_geo[(Austin_crime_geo['PrimaryType']=='THEFT')].compute()
# create Austin crime scatter map plot
fig,ax= plt.subplots(1, 1, figsize=(10,12))
plt.xlim(41.86, 41.925) # (minLat, maxLat)
plt.ylim(-87.808, -87.738) # (minLong, maxLong)
#ax.set_facecolor('black')
# plot thefts, homicides, kidnappings and sexual assaults
ax.scatter(s4['Latitude'], s4['Longitude'],\
s=50, alpha=0.8, color='#66ff99', label='Theft')
ax.scatter(s2['Latitude'], s2['Longitude'],\
s=20, alpha=0.8, color='#ff9933', label='Weapons Violation')
ax.scatter(s3['Latitude'], s3['Longitude'],\
s=20, alpha=0.8, color='#3333ff', label='Kidnapping')
ax.scatter(s1['Latitude'], s1['Longitude'],\
s=20, alpha=0.8, color='#ff0000', label='Homicide')
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.title('Austin Chicago Crime Map (2001-2017)')
plt.legend(loc='upper right', frameon=True)
plt.grid(linestyle = '--', linewidth = 0.05)
plt.show()
Wall time: 2.33 s
# horizontal bar chart of the last 20 rows of community_crime
# (it is sorted by total, largest first), i.e. the lowest totals
ax = (
    community_crime.tail(20)
    .sort_values(by='Total', ascending=False)
    .plot(kind='barh', figsize=(10,8), color='#3399ff')
)
ax.set_title('Low Crime Chicago Communities (2001-2017)')
ax.set_xlabel('Number of Crimes')
ax.set_ylabel('Community')
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
# group crime totals by Chicago 'sides'; select 'Total' explicitly so the
# sum never touches the other columns (the text CommunityName column and
# the CommunityArea id), whatever the pandas version does with them
crime_by_side = community_areas.groupby('Side')[['Total']].sum()
crime_by_side.sort_values(by='Total', ascending=False)
Total | |
---|---|
Side | |
West Side | 1428846 |
Southwest Side | 888977 |
South Side | 795101 |
Far Southeast Side | 680550 |
Far North Side | 509413 |
North Side | 417278 |
Central | 367343 |
Far Southwest Side | 366448 |
Northwest Side | 333145 |
# horizontal bar chart of crime totals per Chicago side,
# sorted ascending so the largest bar is drawn at the top
ax = (
    crime_by_side
    .sort_values(by='Total', ascending=True)
    .plot(kind='barh', figsize=(6,4), color='#cc0000')
)
ax.set_title('Chicago Crime by Sides (2001-2017)')
ax.set_xlabel('Number of Crimes')
ax.set_ylabel('Chicago Side')
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
%%time
# get min/max lat and long
#minLat = crimes['Latitude'].min().compute()
#maxLat = crimes['Latitude'].max().compute()
#minLong = crimes['Longitude'].min().compute()
#maxLong = crimes['Longitude'].max().compute()
# get crime geo data
crime_geo = crimes[['PrimaryType', 'Latitude', 'Longitude']].dropna()
# get homicides
homicides = crime_geo[(crime_geo['PrimaryType']=='HOMICIDE')]\
.compute()
# get kidnappings
kidnappings = crime_geo[(crime_geo['PrimaryType']=='KIDNAPPING')]\
.compute()
# get criminal sexual assaults
sexual_assaults = crime_geo[(crime_geo['PrimaryType']=='CRIM SEXUAL ASSAULT')]\
.compute()
# get thefts
thefts = crime_geo[(crime_geo['PrimaryType']=='THEFT')]\
.compute()
# create scatter map plot
fig,ax= plt.subplots(1, 1, figsize=(10,12))
plt.xlim(41.65, 42.02) # (minLat, maxLat)
plt.ylim(-87.78, -87.53) # (minLong, maxLong)
#ax.set_facecolor('black')
# plot thefts, homicides, kidnappings and sexual assaults
ax.scatter(thefts['Latitude'], thefts['Longitude'],\
s=1, alpha=0.8, color='#66ff99', label='Theft')
ax.scatter(sexual_assaults['Latitude'], sexual_assaults['Longitude'],\
s=3, alpha=0.8, color='#ff9933', label='Criminal Sexual Assault')
ax.scatter(kidnappings['Latitude'], kidnappings['Longitude'],\
s=3, alpha=0.8, color='#3333ff', label='Kidnapping')
ax.scatter(homicides['Latitude'], homicides['Longitude'],\
s=3, alpha=0.8, color='#ff0000', label='Homicide')
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.title('Chicago Crime Map (2001-2017)')
plt.legend(loc='upper right', frameon=True)
plt.grid(linestyle = '--', linewidth = 0.05)
plt.show()
Wall time: 30.2 s