In [1]:
import calendar 
import datetime
import dask.dataframe as dd
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from IPython.display import SVG, display
In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# set neat seaborn whitegrid styles for matplotlib charts
plt.style.use('seaborn')
sns.set_style('whitegrid')
#plt.style.available
In [3]:
%%time
# set data file path
parquet_data_folder = '../data/crimes-2017.snappy.parq'
print('Loading crime data from: {}'.format(parquet_data_folder))

# load crimes parquet data into dask df
crimes = dd.read_parquet(parquet_data_folder, index='Date')

# load all data into memory
crimes = crimes.persist()
print('Crime data loaded into memory.')
Loading crime data from: ../data/crimes-2017.snappy.parq
Crime data loaded into memory.
Wall time: 3.07 s
In [4]:
%%time
# log records count and data partitions
print('Crime data stats:')
print('---------------------------------------')
print('{:,} total records in {} partitions'.format(len(crimes), crimes.npartitions))
print('DataFrame size: {:,}'.format(crimes.size.compute()))
Crime data stats:
---------------------------------------
159,919 total records in 1 partitions
DataFrame size: 1,599,190
Wall time: 0 ns
In [5]:
# check data frame structure: evaluating the lazy dask dataframe displays
# its column names, dtypes and partition layout rather than the rows
crimes
Out[5]:
Dask DataFrame Structure:
Block PrimaryType Description LocationDescription CommunityArea Arrest Domestic Latitude Longitude Year
npartitions=1
2017-01-01 00:00:00 object category[unknown] category[unknown] category[unknown] category[unknown] bool bool float64 float64 category[unknown]
2017-08-10 23:55:00 ... ... ... ... ... ... ... ... ... ...
Dask Name: read-parquet, 1 tasks
In [6]:
# preview the first rows of the crimes data, which is indexed by Date
crimes.head()
Out[6]:
Block PrimaryType Description LocationDescription CommunityArea Arrest Domestic Latitude Longitude Year
Date
2017-01-01 087XX S SAGINAW AVE OFFENSE INVOLVING CHILDREN SEX ASSLT OF CHILD BY FAM MBR RESIDENCE 46.0 False True NaN NaN 2017
2017-01-01 003XX N ELIZABETH ST DECEPTIVE PRACTICE FRAUD OR CONFIDENCE GAME OTHER 28.0 False False NaN NaN 2017
2017-01-01 058XX S SANGAMON ST OFFENSE INVOLVING CHILDREN AGG SEX ASSLT OF CHILD FAM MBR RESIDENCE 68.0 False False NaN NaN 2017
2017-01-01 035XX S GILES AVE CRIM SEXUAL ASSAULT NON-AGGRAVATED SCHOOL, PUBLIC, BUILDING 35.0 False False NaN NaN 2017
2017-01-01 052XX S KENWOOD AVE THEFT OVER $500 VEHICLE NON-COMMERCIAL 41.0 False False 41.800292 -87.593087 2017
In [7]:
# isolate the primary type column and count the reports for each crime type
crime_types = crimes[['PrimaryType']]
crime_type_total = crime_types.groupby('PrimaryType').size().compute()

# list crime types from most to least reported, then the number of distinct types
ranked_crime_types = crime_type_total.sort_values(ascending=False)
print(ranked_crime_types)
print("...\nTotal Primary Crime Types: {:,}".format(crime_type_total.size))
PrimaryType
THEFT                                37563
BATTERY                              30170
CRIMINAL DAMAGE                      17806
ASSAULT                              11855
OTHER OFFENSE                        10923
DECEPTIVE PRACTICE                   10318
BURGLARY                              7804
MOTOR VEHICLE THEFT                   6848
ROBBERY                               6750
NARCOTICS                             6501
CRIMINAL TRESPASS                     4247
WEAPONS VIOLATION                     2843
OFFENSE INVOLVING CHILDREN            1277
PUBLIC PEACE VIOLATION                 945
CRIM SEXUAL ASSAULT                    907
INTERFERENCE WITH PUBLIC OFFICER       679
SEX OFFENSE                            556
PROSTITUTION                           541
HOMICIDE                               414
ARSON                                  279
LIQUOR LAW VIOLATION                   133
STALKING                               128
GAMBLING                               115
KIDNAPPING                             114
INTIMIDATION                            84
OBSCENITY                               39
CONCEALED CARRY LICENSE VIOLATION       37
NON-CRIMINAL                            26
PUBLIC INDECENCY                         6
HUMAN TRAFFICKING                        6
OTHER NARCOTIC VIOLATION                 3
NON-CRIMINAL (SUBJECT SPECIFIED)         2
dtype: int64
...
Total Primary Crime Types: 32
In [8]:
# sum the Arrest and Domestic flags per primary crime type
arrest_domestic_flags = crimes[['PrimaryType', 'Arrest', 'Domestic']]
crime_type_counts = arrest_domestic_flags.groupby('PrimaryType').sum().compute()

# show the per-type counts ordered by number of arrests, highest first
arrests_ranked = crime_type_counts.sort_values(by='Arrest', ascending=False)
print(arrests_ranked)
                                   Arrest  Domestic
PrimaryType                                        
NARCOTICS                          6488.0       5.0
BATTERY                            5999.0   14716.0
THEFT                              3753.0    1430.0
CRIMINAL TRESPASS                  2560.0     206.0
OTHER OFFENSE                      2450.0    3377.0
WEAPONS VIOLATION                  2272.0      11.0
ASSAULT                            2109.0    3054.0
CRIMINAL DAMAGE                    1053.0    2068.0
INTERFERENCE WITH PUBLIC OFFICER    642.0       3.0
PUBLIC PEACE VIOLATION              635.0      32.0
MOTOR VEHICLE THEFT                 547.0      67.0
PROSTITUTION                        541.0       0.0
ROBBERY                             414.0     127.0
DECEPTIVE PRACTICE                  386.0      95.0
BURGLARY                            275.0     106.0
OFFENSE INVOLVING CHILDREN          152.0     636.0
LIQUOR LAW VIOLATION                133.0       1.0
GAMBLING                            115.0       0.0
SEX OFFENSE                          80.0      46.0
HOMICIDE                             59.0      20.0
CONCEALED CARRY LICENSE VIOLATION    36.0       0.0
CRIM SEXUAL ASSAULT                  34.0     124.0
OBSCENITY                            31.0       9.0
ARSON                                20.0       9.0
STALKING                             10.0      69.0
KIDNAPPING                            9.0      39.0
INTIMIDATION                          6.0       7.0
PUBLIC INDECENCY                      6.0       0.0
OTHER NARCOTIC VIOLATION              2.0       0.0
NON-CRIMINAL                          1.0       0.0
NON-CRIMINAL (SUBJECT SPECIFIED)      1.0       2.0
HUMAN TRAFFICKING                     0.0       2.0
In [9]:
# attach the per-type report totals as a 'Total' column (aligned on PrimaryType)
crime_type_counts = crime_type_counts.assign(Total=crime_type_total)

# show arrests, domestic and total counts ordered by total reports, highest first
totals_ranked = crime_type_counts.sort_values(by='Total', ascending=False)
print(totals_ranked)
                                   Arrest  Domestic  Total
PrimaryType                                               
THEFT                              3753.0    1430.0  37563
BATTERY                            5999.0   14716.0  30170
CRIMINAL DAMAGE                    1053.0    2068.0  17806
ASSAULT                            2109.0    3054.0  11855
OTHER OFFENSE                      2450.0    3377.0  10923
DECEPTIVE PRACTICE                  386.0      95.0  10318
BURGLARY                            275.0     106.0   7804
MOTOR VEHICLE THEFT                 547.0      67.0   6848
ROBBERY                             414.0     127.0   6750
NARCOTICS                          6488.0       5.0   6501
CRIMINAL TRESPASS                  2560.0     206.0   4247
WEAPONS VIOLATION                  2272.0      11.0   2843
OFFENSE INVOLVING CHILDREN          152.0     636.0   1277
PUBLIC PEACE VIOLATION              635.0      32.0    945
CRIM SEXUAL ASSAULT                  34.0     124.0    907
INTERFERENCE WITH PUBLIC OFFICER    642.0       3.0    679
SEX OFFENSE                          80.0      46.0    556
PROSTITUTION                        541.0       0.0    541
HOMICIDE                             59.0      20.0    414
ARSON                                20.0       9.0    279
LIQUOR LAW VIOLATION                133.0       1.0    133
STALKING                             10.0      69.0    128
GAMBLING                            115.0       0.0    115
KIDNAPPING                            9.0      39.0    114
INTIMIDATION                          6.0       7.0     84
OBSCENITY                            31.0       9.0     39
CONCEALED CARRY LICENSE VIOLATION    36.0       0.0     37
NON-CRIMINAL                          1.0       0.0     26
PUBLIC INDECENCY                      6.0       0.0      6
HUMAN TRAFFICKING                     0.0       2.0      6
OTHER NARCOTIC VIOLATION              2.0       0.0      3
NON-CRIMINAL (SUBJECT SPECIFIED)      1.0       2.0      2
In [10]:
# horizontal bar chart of total reports per primary crime type
# (sorted ascending so the most reported type ends up at the top)
all_crime_types = (
    crime_type_counts
    .sort_values(by='Total')
    .drop(columns=['Arrest', 'Domestic'])
)
all_crime_types.plot.barh(figsize=(8, 6), color='#cc0000')
plt.title('All Chicago Crimes by Type')
plt.ylabel('Crime Type')
plt.xlabel('Number of Crime reports')
plt.tight_layout()
plt.show()
In [11]:
# horizontal bar chart of all count columns, ordered by number of arrests
arrest_sorted_counts = crime_type_counts.sort_values(by='Arrest')
arrest_sorted_counts.plot.barh(figsize=(8, 10))
plt.title('2017 Chicago Crime reports by Arrests')
plt.xlabel('Number of Crimes')
plt.ylabel('Crime Type')
plt.tight_layout()
plt.show()
In [12]:
# horizontal bar chart of all count columns, ordered by domestic incident reports
domestic_sorted_counts = crime_type_counts.sort_values(by='Domestic')
domestic_sorted_counts.plot.barh(figsize=(8, 10))
plt.title('2017 Chicago Crime reports by Domestic incidents')
plt.xlabel('Number of Crimes')
plt.ylabel('Crime Type')
plt.tight_layout()
plt.show()
In [13]:
# plot high crimes (>= 1,000 reports), ordered by total reports
high_crime_mask = crime_type_counts['Total'] >= 1000
high_crimes = crime_type_counts[high_crime_mask].sort_values(by='Total')
high_crimes.plot.barh(figsize=(6, 6))
plt.title('High 2017 Chicago Crimes (>= 1,000 reports)')
plt.xlabel('Number of Crimes')
plt.ylabel('Crime Type')
plt.show()
In [14]:
# plot crime types with fewer than 1,000 reports, ordered by total reports
low_crime_mask = crime_type_counts['Total'] < 1000
low_crimes = crime_type_counts[low_crime_mask].sort_values(by='Total')
low_crimes.plot.barh(figsize=(6, 6))
plt.title('Low 2017 Chicago Crimes (<1,000 reports)')
plt.xlabel('Number of Crimes')
plt.ylabel('Crime Type')
plt.show()
In [15]:
# plot crime types with fewer than 100 reports, ordered by total reports
least_crime_mask = crime_type_counts['Total'] < 100
least_crimes = crime_type_counts[least_crime_mask].sort_values(by='Total')
least_crimes.plot.barh(figsize=(6, 4))
plt.title('Least 2017 Chicago Crimes (<100 reports)')
plt.xlabel('Number of Crimes')
plt.ylabel('Crime Type')
plt.show()
In [16]:
# count reports per location description, most frequent first,
# and name the resulting series 'Total'
crime_locations = (
    crimes.groupby('LocationDescription')
    .size()
    .compute()
    .sort_values(ascending=False)
    .rename('Total')
)

# show the busiest locations and the number of distinct locations
top_locations_preview = crime_locations.head()
print(top_locations_preview)
print("...\nTotal Locations: {:,}".format(crime_locations.size))
LocationDescription
STREET       36104
RESIDENCE    27299
APARTMENT    20173
SIDEWALK     12643
OTHER         6622
Name: Total, dtype: int64
...
Total Locations: 121
In [17]:
# horizontal bar chart of the 30 locations with the most reports
# (crime_locations is already sorted descending, so the first 30 are the top 30)
top_30_locations = crime_locations[:30]
top_30_locations.sort_values().plot.barh(figsize=(6, 8))
plt.title('2017 Chicago Crime Top 30 Locations')
plt.xlabel('Number of Crimes')
plt.ylabel('Location')
plt.show()
In [18]:
# sum the Arrest and Domestic flags per location description
location_flags = crimes[['LocationDescription', 'Arrest', 'Domestic']]
crime_location_counts = location_flags.groupby('LocationDescription').sum().compute()

# attach the per-location report totals and preview the result
crime_location_counts['Total'] = crime_locations
crime_location_counts.head()
Out[18]:
Arrest Domestic Total
LocationDescription
RESIDENCE 2896.0 9246.0 27299
OTHER 623.0 503.0 6622
SCHOOL, PUBLIC, BUILDING 523.0 45.0 2113
VEHICLE NON-COMMERCIAL 722.0 393.0 2865
APARTMENT 2958.0 8734.0 20173
In [19]:
# horizontal bar chart of arrests, domestic and total counts
# for locations with at least 2,500 reports
top_location_mask = crime_location_counts['Total'] >= 2500
top_location_counts = crime_location_counts[top_location_mask].sort_values(by='Total')
top_location_counts.plot.barh(figsize=(6, 6))
plt.title('2017 Chicago Crime Top Locations (>=2,500 Crime Reports)')
plt.xlabel('Number of Crimes')
plt.ylabel('Location')
plt.show()
In [20]:
# plot next 20 top crime locations (fewer than 3,000 reports).
# The filtered subset gets its own name so crime_location_counts keeps every
# location: this cell can then be re-run, and the earlier location cells still
# see the full frame. The strict '<' matches the '<3,000' stated in the title.
next_location_counts = crime_location_counts[crime_location_counts['Total'] < 3000]\
.sort_values(by='Total', ascending=False)
next_location_counts[:20].sort_values(by='Total', ascending=True)\
.plot(kind='barh', figsize=(6,6))
plt.ylabel('Location')
plt.xlabel('Number of Crimes')
plt.title('2017 Chicago Crime Next Top 20 Crime Locations (<3,000 Crime Reports)')
plt.show()
In [21]:
# check for datetime64 index to plot crime over time
# (the resample() calls in later cells operate on this Date index)
crimes.index
Out[21]:
Dask Index Structure:
npartitions=1
2017-01-01 00:00:00    datetime64[ns]
2017-08-10 23:55:00               ...
Name: Date, dtype: datetime64[ns]
Dask Name: read-parquet, 2 tasks
In [22]:
# keep only the Arrest column of the reports flagged as arrests
arrest_mask = crimes['Arrest'] == True
arrests = crimes[arrest_mask]['Arrest']

# show the first few arrests followed by the overall arrest count
print(arrests.head())
total_arrests = arrests.size.compute()
print("...\nTotal Arrests: {:,}".format(total_arrests))
Date
2017-01-01 00:00:00    True
2017-01-01 00:00:00    True
2017-01-01 00:00:00    True
2017-01-01 00:01:00    True
2017-01-01 00:05:00    True
Name: Arrest, dtype: bool
...
Total Arrests: 30,819
In [23]:
# plot monthly arrests
monthly_arrests = arrests.resample('M').sum().compute()
monthly_arrests.plot(kind='bar', figsize=(6,3))

# Build one tick and one month-name label per resampled month, so the tick
# count always matches the label count whatever date range the data covers
# (Matplotlib rejects mismatched tick/label lengths).
month_labels = [calendar.month_name[month] for month in monthly_arrests.index.month]
plt.xticks(list(range(len(month_labels))), month_labels, rotation=0)
plt.xlabel('2017 Month')
plt.title('2017 Chicago Crime Monthly Arrests')
plt.show()
In [24]:
# bar chart of arrests summed per week
weekly_arrests = arrests.resample('W').sum().compute()
weekly_arrests.plot.bar()
# TODO: format weekly xticks to only list week start date
plt.title('2017 Chicago Crime Weekly Arrests')
plt.xlabel('Week Of 2017')
plt.show()
In [25]:
# line chart of arrests summed per day
daily_arrests = arrests.resample('D').sum().compute()
daily_arrests_ax = daily_arrests.plot()
daily_arrests_ax.set_title('2017 Chicago Crime Daily Arrests')
plt.show()
In [26]:
# keep only the Domestic column of the reports flagged as domestic incidents
domestic_mask = crimes['Domestic'] == True
domestic = crimes[domestic_mask]['Domestic']

# show the first few domestic reports followed by the overall domestic count
print(domestic.head())
total_domestic = domestic.size.compute()
print("...\nTotal Domestic: {:,}".format(total_domestic))
Date
2017-01-01    True
2017-01-01    True
2017-01-01    True
2017-01-01    True
2017-01-01    True
Name: Domestic, dtype: bool
...
Total Domestic: 26,261
In [27]:
# green line chart of domestic incident reports summed per day
daily_domestic = domestic.resample('D').sum().compute()
daily_domestic_ax = daily_domestic.plot(color='g')
daily_domestic_ax.set_title('2017 Chicago Crime Daily Domestic reports')
plt.show()
In [28]:
# get daily total crime counts
daily_crime = crime_types.resample('D').count().compute()

# print daily total crime stats.
# len() counts rows (days); DataFrame.size is rows * columns, which only
# equals the day count while the frame has a single column.
print(daily_crime.head())
print("...\nTotal Days: {:,}".format(len(daily_crime)))
            PrimaryType
2017-01-01          933
2017-01-02          601
2017-01-03          704
2017-01-04          610
2017-01-05          601
...
Total Days: 222
In [29]:
# plot daily crime counts, arrests, and domestic incidents reports
fig, ax = plt.subplots()
ax.plot(daily_crime.index, daily_crime, '--', label='Total', color='r', zorder=10)
# label the arrests line so it shows up in the legend next to Total and Domestic
ax.plot(daily_arrests.index, daily_arrests, label='Arrests', color='#3399ff', zorder=10)
ax.fill_between(daily_domestic.index, daily_domestic, label='Domestic', color='c')
ax.set_ylabel('Number of Crimes')
ax.set_xlabel('Month')
ax.legend(loc='right')
plt.title('2017 Daily Chicago Crime reports, Arrests, and Domestic incidents')
plt.show()