In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns

from mpl_toolkits.basemap import Basemap
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import SCORERS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, RidgeCV, LassoCV
from scipy import sparse
from sklearn.metrics import mean_squared_error, mean_absolute_error
import random
PATH = './data/'
In [2]:
# Cosmetic only: style matplotlib figures with the jupyterthemes 'onedork' theme.
# NOTE(review): this import lives outside the main import cell; the analysis
# itself does not appear to depend on it.
from jupyterthemes import jtplot
jtplot.style(theme='onedork')
In [3]:
# Load the crime records; the file is read with the ISO-8859-1 (Latin-1) codec.
file_name_cr = 'crime.csv'
df_crime = pd.read_csv(
    PATH + file_name_cr,
    encoding='ISO-8859-1',
)

Part 1. Dataset and features description

In [4]:
# Preview the first rows of the crime table.
df_crime.head()
Out[4]:
INCIDENT_NUMBER OFFENSE_CODE OFFENSE_CODE_GROUP OFFENSE_DESCRIPTION DISTRICT REPORTING_AREA SHOOTING OCCURRED_ON_DATE YEAR MONTH DAY_OF_WEEK HOUR UCR_PART STREET Lat Long Location
0 I182080058 2403 Disorderly Conduct DISTURBING THE PEACE E18 495 NaN 2018-10-03 20:13:00 2018 10 Wednesday 20 Part Two ARLINGTON ST 42.262608 -71.121186 (42.26260773, -71.12118637)
1 I182080053 3201 Property Lost PROPERTY - LOST D14 795 NaN 2018-08-30 20:00:00 2018 8 Thursday 20 Part Three ALLSTON ST 42.352111 -71.135311 (42.35211146, -71.13531147)
2 I182080052 2647 Other THREATS TO DO BODILY HARM B2 329 NaN 2018-10-03 19:20:00 2018 10 Wednesday 19 Part Two DEVON ST 42.308126 -71.076930 (42.30812619, -71.07692974)
3 I182080051 413 Aggravated Assault ASSAULT - AGGRAVATED - BATTERY A1 92 NaN 2018-10-03 20:00:00 2018 10 Wednesday 20 Part One CAMBRIDGE ST 42.359454 -71.059648 (42.35945371, -71.05964817)
4 I182080050 3122 Aircraft AIRCRAFT INCIDENTS A7 36 NaN 2018-10-03 20:49:00 2018 10 Wednesday 20 Part Three PRESCOTT ST 42.375258 -71.024663 (42.37525782, -71.02466343)
"crime.csv" contains records of crimes reported in the city of Boston for the years 2015-2018. The dataset has the following columns: 'INCIDENT_NUMBER' - incident number; 'OFFENSE_CODE' - offense code; 'OFFENSE_CODE_GROUP' - offense group (category); 'OFFENSE_DESCRIPTION' - brief description of the incident; 'DISTRICT' - police district; 'REPORTING_AREA' - reporting area; 'SHOOTING' - whether a shooting was involved; 'OCCURRED_ON_DATE' - date and time of the offense; 'YEAR' - year; 'MONTH' - month; 'DAY_OF_WEEK' - day of the week; 'HOUR' - hour; 'UCR_PART' - Uniform Crime Reports category; 'STREET' - street; 'Lat' - latitude; 'Long' - longitude; 'Location' - coordinates as a "(latitude, longitude)" string. The target variables that we will predict are the location of the crime, i.e. its coordinates - longitude and latitude. We will also try to predict the number of crimes per day. The task is relevant because it could help prevent crimes and reduce the number of victims.
In [5]:
# Load the daily weather table; same ISO-8859-1 codec as the crime file.
file_name_weather = 'Boston weather_clean.csv'
df_weather = pd.read_csv(
    PATH + file_name_weather,
    encoding='ISO-8859-1',
)
In [6]:
# Show the last rows of the weather table (its most recent dates).
df_weather.tail()
Out[6]:
Year Month Day High Temp (F) Avg Temp (F) Low Temp (F) High Dew Point (F) Avg Dew Point (F) Low Dew Point (F) High Humidity (%) ... Low Sea Level Press (in) High Visibility (mi) Avg Visibility (mi) Low Visibility (mi) High Wind (mph) Avg Wind (mph) High Wind Gust (mph) Snowfall (in) Precip (in) Events
3744 2018 4 4 58 49 39 56 42 19 100 ... 29.29 10 4 0 31 13 42 0.00 0.20 Rain
3745 2018 4 5 43 37 30 21 9 4 48 ... 29.65 10 10 10 35 22 44 0.00 0.00 None
3746 2018 4 6 43 36 29 38 27 9 100 ... 29.71 10 8 0 22 11 26 0.21 0.21 Both
3747 2018 4 7 47 41 35 38 26 16 92 ... 29.69 10 10 10 20 12 27 0.00 0.01 Rain
3748 2018 4 8 42 37 32 21 17 11 52 ... 29.76 10 10 10 22 13 29 0.00 0.00 None

5 rows × 24 columns

"Boston weather_clean.csv" contains daily weather conditions for the city of Boston that overlap in time with the main file "crime.csv". Columns: 'Year' - year; 'Month' - month; 'Day' - day; 'High Temp (F)' - maximum temperature; 'Avg Temp (F)' - average temperature; 'Low Temp (F)' - minimum temperature; 'High Dew Point (F)' - maximum dew point; 'Avg Dew Point (F)' - average dew point; 'Low Dew Point (F)' - minimum dew point; 'High Humidity (%)', 'Avg Humidity (%)', 'Low Humidity (%)' - maximum, average and minimum humidity; 'High Sea Level Press (in)', 'Avg Sea Level Press (in)', 'Low Sea Level Press (in)' - maximum, average and minimum sea-level pressure; 'High Visibility (mi)', 'Avg Visibility (mi)', 'Low Visibility (mi)' - maximum, average and minimum visibility; 'High Wind (mph)', 'Avg Wind (mph)' - maximum and average wind speed; 'High Wind Gust (mph)' - maximum wind gust; 'Snowfall (in)' - snowfall; 'Precip (in)' - precipitation; 'Events' - weather events (e.g. Rain, Snow, Both, None). Hopefully, the weather information will help build a more accurate model for the target variable.

Part 2. Exploratory data analysis

In [7]:
# Number of missing values in each crime column.
df_crime.isna().sum()
Out[7]:
INCIDENT_NUMBER             0
OFFENSE_CODE                0
OFFENSE_CODE_GROUP          0
OFFENSE_DESCRIPTION         0
DISTRICT                 1774
REPORTING_AREA              0
SHOOTING               326765
OCCURRED_ON_DATE            0
YEAR                        0
MONTH                       0
DAY_OF_WEEK                 0
HOUR                        0
UCR_PART                   93
STREET                  10977
Lat                     20632
Long                    20632
Location                    0
dtype: int64

Let's see which districts are present in the data

In [8]:
# Incidents per police district, most frequent first.
df_crime['DISTRICT'].value_counts()
Out[8]:
B2     51288
C11    43817
D4     43338
A1     36735
B3     36400
C6     24190
D14    20632
E13    17981
E18    17825
A7     13634
E5     13543
A15     6663
Name: DISTRICT, dtype: int64
In [9]:
# Missing districts get the explicit placeholder label 'NON'.
df_crime['DISTRICT'] = df_crime['DISTRICT'].fillna('NON')
In [10]:
# Missing SHOOTING values get the placeholder 'NON'; later cells select
# shootings with SHOOTING != 'NON'.
df_crime['SHOOTING'] = df_crime['SHOOTING'].fillna('NON')
In [11]:
# Incidents per UCR category.
df_crime['UCR_PART'].value_counts()
Out[11]:
Part Three    162928
Part Two      100283
Part One       63231
Other           1285
Name: UCR_PART, dtype: int64
In [12]:
# Fold missing UCR categories into the already existing 'Other' label.
df_crime['UCR_PART'] = df_crime['UCR_PART'].fillna('Other')
In [13]:
# Number of distinct street values; dropna=False keeps NaN counted as one
# value, exactly like len(Series.unique()).
df_crime['STREET'].nunique(dropna=False)
Out[13]:
4685
In [14]:
# Missing street names are replaced by the label 'Other'.
df_crime['STREET'] = df_crime['STREET'].fillna('Other')
In [15]:
# Which 'Location' strings accompany rows whose latitude is missing?
missing_lat = df_crime['Lat'].isna()
df_crime.loc[missing_lat, 'Location'].unique()
Out[15]:
array(['(0.00000000, 0.00000000)'], dtype=object)
In [16]:
# Keep only incidents that have a latitude. The .copy() makes the result an
# independent frame, so the column assignments in later cells
# (OCCURRED_ON_DATE, test_one) do not write to a slice of the original frame
# (pandas' SettingWithCopy situation, hidden here by the warnings filter).
df_crime = df_crime[df_crime['Lat'].notna()].copy()
Check the missing values again: none are left.
In [17]:
# Re-check the per-column missing-value counts after filling and filtering.
df_crime.isna().sum()
Out[17]:
INCIDENT_NUMBER        0
OFFENSE_CODE           0
OFFENSE_CODE_GROUP     0
OFFENSE_DESCRIPTION    0
DISTRICT               0
REPORTING_AREA         0
SHOOTING               0
OCCURRED_ON_DATE       0
YEAR                   0
MONTH                  0
DAY_OF_WEEK            0
HOUR                   0
UCR_PART               0
STREET                 0
Lat                    0
Long                   0
Location               0
dtype: int64
In [18]:
# Missing values per column in the weather table.
df_weather.isna().sum()
Out[18]:
Year                         0
Month                        0
Day                          0
High Temp (F)                0
Avg Temp (F)                 0
Low Temp (F)                 0
High Dew Point (F)           0
Avg Dew Point (F)            0
Low Dew Point (F)            0
High Humidity (%)            0
Avg Humidity (%)             0
Low Humidity (%)             0
High Sea Level Press (in)    0
Avg Sea Level Press (in)     0
Low Sea Level Press (in)     0
High Visibility (mi)         0
Avg Visibility (mi)          0
Low Visibility (mi)          0
High Wind (mph)              0
Avg Wind (mph)               0
High Wind Gust (mph)         0
Snowfall (in)                0
Precip (in)                  0
Events                       0
dtype: int64
Let's look at the data in tabular form, starting with the most frequent crimes.
In [19]:
# Incidents per offense group, most frequent first.
df_crime['OFFENSE_CODE_GROUP'].value_counts()
Out[19]:
Motor Vehicle Accident Response              31874
Larceny                                      25900
Medical Assistance                           23158
Investigate Person                           18471
Other                                        17615
Simple Assault                               15356
Vandalism                                    15292
Drug Violation                               15081
Verbal Disputes                              13338
Towed                                        11120
Investigate Property                         10982
Larceny From Motor Vehicle                   10555
Property Lost                                 9690
Warrant Arrests                               7793
Aggravated Assault                            7470
Fraud                                         5922
Residential Burglary                          5701
Violations                                    5192
Missing Person Located                        5047
Auto Theft                                    4636
Robbery                                       4329
Harassment                                    4078
Missing Person Reported                       3798
Property Found                                3783
Confidence Games                              3165
Police Service Incidents                      2671
Disorderly Conduct                            2439
Fire Related Reports                          1909
License Violation                             1689
Restraining Order Violations                  1634
                                             ...  
Ballistics                                     966
Search Warrants                                950
Assembly or Gathering Violations               936
Property Related Damage                        887
Firearm Discovery                              686
License Plate Related Incidents                560
Offenses Against Child / Family                533
Operating Under the Influence                  479
Other Burglary                                 460
Evading Fare                                   401
Embezzlement                                   308
Service                                        267
Prisoner Related Incidents                     243
Prostitution                                   202
Homicide                                       157
Criminal Harassment                            135
Harbor Related Incidents                       133
Arson                                           92
HOME INVASION                                   80
Bomb Hoax                                       72
Phone Call Complaints                           33
Aircraft                                        32
Explosives                                      25
Manslaughter                                     8
Gambling                                         8
HUMAN TRAFFICKING                                7
INVESTIGATE PERSON                               4
HUMAN TRAFFICKING - INVOLUNTARY SERVITUDE        2
Biological Threat                                2
Burglary - No Property Taken                     2
Name: OFFENSE_CODE_GROUP, Length: 67, dtype: int64
In [20]:
# Incidents per numeric offense code, most frequent first.
df_crime['OFFENSE_CODE'].value_counts()
Out[20]:
3006    18481
3115    18475
1402    15033
3831    14956
802     14340
3301    13338
3410    11120
3114    10982
617      9135
2647     9109
3201     8803
614      8632
613      8006
3125     7793
619      5886
3802     5553
413      4654
1102     4502
3502     4448
2629     4078
3803     3983
3501     3769
3207     3542
724      3424
2610     3197
1106     3165
423      2807
301      2789
520      2622
2900     2544
        ...  
770         2
123         2
2910        2
1002        2
2672        2
1866        2
623         2
629         2
1620        2
637         1
530         1
2606        1
639         1
402         1
404         1
432         1
714         1
527         1
624         1
627         1
2609        1
112         1
349         1
1105        1
547         1
335         1
1864        1
1863        1
315         1
511         1
Name: OFFENSE_CODE, Length: 219, dtype: int64
Now the coordinates. We can see that some values were filled in incorrectly, because the placeholder value -1 appears.
In [21]:
# Latitude range. The vectorised Series methods avoid iterating the column
# element by element in Python and skip NaN instead of giving an
# order-dependent result.
df_crime['Lat'].min(), df_crime['Lat'].max()
Out[21]:
(-1.0, 42.39504158)
In [22]:
# Longitude range. The vectorised Series methods avoid iterating the column
# element by element in Python and skip NaN instead of giving an
# order-dependent result.
df_crime['Long'].min(), df_crime['Long'].max()
Out[22]:
(-71.17867378, -1.0)

There are outliers in the coordinates.

Part 3. Visual analysis of the features

In [23]:
# Parse the timestamp strings in a single vectorised call; mapping
# pd.to_datetime over the column parses one value at a time and is much
# slower on a column of this size.
df_crime['OCCURRED_ON_DATE'] = pd.to_datetime(df_crime['OCCURRED_ON_DATE'])
In [24]:
# Helper column of ones: summing it within a groupby yields incident counts.
df_crime['test_one'] = 1
In [25]:
# Incident counts per (year, month). The grouping keys are given as a list:
# a tuple here only worked through a deprecated pandas fallback and is
# interpreted as ONE key (raising KeyError) by later pandas versions.
g = df_crime.groupby(['YEAR', 'MONTH'])['test_one'].sum()
fig = plt.figure(1, (12, 8))

# Top panel: one bar per month, one colour per year.
ax1 = fig.add_subplot(211)
g.unstack(level=0).plot(kind='bar', grid=True, ax=ax1);

# Bottom panel: the same counts as a single chronological line.
ax2 = fig.add_subplot(212)
g.plot(grid=True, ax=ax2);
ax2.set_xticks(range(len(g)));
# Label every tick as "YYYY-MM" from the (year, month) index tuples.
ax2.set_xticklabels(["%s-%02d" % item for item in g.index.tolist()], rotation=90);

Most crimes are committed in the summer and the fewest in the winter. The average number of offenses remains roughly constant.

In [26]:
# Incident counts per hour of the day (test_one is a column of ones).
(
    df_crime
    .groupby('HOUR')['test_one']
    .sum()
    .plot.bar(figsize=(15, 6), grid=True)
);

Most crimes are committed in the afternoon; the fewest are committed late at night.

In [27]:
# Weekdays in calendar order; reused by later cells to order per-weekday results.
order = 'Monday Tuesday Wednesday Thursday Friday Saturday Sunday'.split()
# Share of incidents per weekday, as percentages on a pie chart.
(
    df_crime
    .groupby('DAY_OF_WEEK')['test_one']
    .sum()
    .loc[order]
    .plot.pie(figsize=(7, 7), autopct='%.2f')
);

The number of crimes decreases towards the weekend.

In [28]:
# Incidents flagged as shootings ('NON' is the placeholder written for
# missing SHOOTING values). Filter once and reuse it for all three panels.
df_shooting = df_crime[df_crime['SHOOTING'] != 'NON']

# Shootings per (year, month). Keys are a list: a tuple only worked through
# a deprecated pandas fallback and is treated as a single key by later
# pandas versions.
g = df_shooting.groupby(['YEAR', 'MONTH'])['test_one'].sum()
fig = plt.figure(1, (12, 12))

# Panel 1: monthly bars, stacked by year.
ax1 = fig.add_subplot(311)
g.unstack(level=0).plot(kind='bar', grid=True, ax=ax1, stacked=True);

# Panel 2: the same counts as a chronological line with "YYYY-MM" tick labels.
ax2 = fig.add_subplot(312)
g.plot(grid=True, ax=ax2);
ax2.set_xticks(range(len(g)));
ax2.set_xticklabels(["%s-%02d" % item for item in g.index.tolist()], rotation=90);

# Panel 3: shootings per weekday in calendar order (`order` comes from the
# weekday cell above). reindex tolerates a weekday with no shootings, where
# .loc with a list of labels would raise KeyError.
g = df_shooting.groupby('DAY_OF_WEEK')['test_one'].sum().reindex(order, fill_value=0)
ax3 = fig.add_subplot(313)
g.plot(kind='bar', grid=True, ax=ax3, stacked=True);
In [29]:
# The 30 streets with the most incidents, as a bar chart.
g = (
    df_crime
    .groupby('STREET')['test_one']
    .sum()
    .sort_values(ascending=False)
    .head(30)
)
fig = plt.figure(1, (12, 8))
g.plot.bar(grid=True);
Relationship between street and number of crimes
In [30]:
# The 20 most frequent offense groups, as a horizontal bar chart.
g = (
    df_crime
    .groupby('OFFENSE_CODE_GROUP')['test_one']
    .sum()
    .sort_values(ascending=False)
    .head(20)
)
fig = plt.figure(1, (12, 8))
g.plot.barh(grid=True);
Distribution by district
In [31]:
# Incident counts per district (at most the 20 largest), as a pie chart.
g = (
    df_crime
    .groupby('DISTRICT')['test_one']
    .sum()
    .sort_values(ascending=False)
    .head(20)
)
fig = plt.figure(1, (12, 8))
g.plot.pie(grid=True);
Remove outliers in the coordinates
In [32]:
# Drop rows whose coordinates hold the -1 placeholder seen in the min/max
# check: keep latitude > 20 and longitude < -20. The .copy() makes
# df_crime_pre an independent frame, because a later cell adds a 'DAY'
# column to it and that must not write to a slice of df_crime.
has_valid_coords = (df_crime['Lat'] > 20) & (df_crime['Long'] < -20)
df_crime_pre = df_crime[has_valid_coords].copy()
In [33]:
# Sum and median of Lat, Long and test_one per offense group, ordered by
# incident count. The columns are selected with a list ([[...]]): selecting
# several columns from a groupby with a bare tuple is deprecated and later
# removed in pandas.
g = (
    df_crime_pre
    .groupby('OFFENSE_CODE_GROUP')[['Lat', 'Long', 'test_one']]
    .agg(['sum', 'median'])
    .sort_values(by=('test_one', 'sum'), ascending=False)
)

fig = plt.figure(1, (8, 8))
# One marker per offense group at its median location. Longitude goes on the
# x-axis and latitude on the y-axis, matching the district scatter plot in
# the next cell; the axes are labelled so the orientation is explicit.
plt.scatter(x=g[('Long', 'median')].values, y=g[('Lat', 'median')].values, marker='*')
plt.xlabel('Long (median)')
plt.ylabel('Lat (median)');
The graph shows the centers of gravity of the crimes. It can be seen that various crimes are most often committed in different parts of the city.
In [34]:
# Every incident at its coordinates, coloured by police district.
fig = plt.figure(1, (8, 8))
sns.scatterplot(
    data=df_crime_pre,
    x='Long',
    y='Lat',
    hue='DISTRICT',
)
# Place the legend outside the axes, to the right of the plot.
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1));
Distribution of crimes by coordinates
In [35]:
# Kernel density estimate of the incident latitudes.
df_crime_pre['Lat'].plot.kde();
In [36]:
# Kernel density estimate of the incident longitudes.
df_crime_pre['Long'].plot.kde();
In [37]:
# Day of month taken from the parsed timestamp; together with YEAR and MONTH
# it forms the key for joining the daily weather table.
occurred_on = df_crime_pre['OCCURRED_ON_DATE']
df_crime_pre['DAY'] = occurred_on.dt.day
Now merge in the weather data.
In [38]:
# Attach the weather of the day to every incident. The join is an inner join
# (merge's default, spelled out here), so incidents on dates that are absent
# from df_weather are dropped.
# NOTE(review): the weather tail shown above ends at 2018-04-08 while the
# crime head shows 2018-10 rows, so later incidents are presumably lost here.
df_total = df_crime_pre.merge(
    df_weather,
    how='inner',
    left_on=['YEAR', 'MONTH', 'DAY'],
    right_on=['Year', 'Month', 'Day'],
)
In [ ]:
 
Crime rate and temperature
In [39]:
# Daily low and high temperatures, one entry per incident, overlaid as
# semi-transparent histograms.
plt.figure()
for temp_col in ('Low Temp (F)', 'High Temp (F)'):
    df_total[temp_col].hist(alpha=0.5, bins=30)
In [164]:
# For comparison: the actual temperature distribution, one entry per day of
# the weather table rather than one per incident.
plt.figure()
for temp_col in ('Low Temp (F)', 'High Temp (F)'):
    df_weather[temp_col].hist(alpha=0.5, bins=30)
In [41]:
# Which weather events (precipitation type) were recorded on the day of each
# incident: share of incidents per 'Events' value.
events_share = df_total.groupby('Events')['test_one'].sum()
events_share.plot.pie();

Calculate the correlation between the variables

In [ ]:
 
In [42]:
def plot_corr(df, size=10):
    """Draw the correlation matrix of ``df`` as a colour-coded image.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose pairwise column correlations (``df.corr()``) are plotted.
    size : int or float, default 10
        Width and height passed to ``figsize`` for the square figure.

    Returns
    -------
    None
        A new matplotlib figure is created as a side effect.
    """
    corr = df.corr()
    labels = corr.columns
    positions = range(len(labels))

    fig, ax = plt.subplots(figsize=(size, size))
    image = ax.matshow(corr, interpolation='none', cmap='plasma')

    # Label both axes with the column names through the Axes object itself,
    # so the function does not depend on pyplot's "current axes" state.
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=90)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)

    fig.colorbar(image, ax=ax, shrink=0.7)
In [43]:
# One-hot encode the categorical columns so they can enter the correlation matrix.
df_dummies_OFFENSE_CODE_GROUP = pd.get_dummies(df_total['OFFENSE_CODE_GROUP'])
df_dummies_Events = pd.get_dummies(df_total['Events'])
df_dummies_DISTRICT = pd.get_dummies(df_total['DISTRICT'])
#df_dummies_SHOOTING = pd.get_dummies(df_total['SHOOTING'])
df_dummies_UCR_PART = pd.get_dummies(df_total['UCR_PART'])
# NOTE(review): the STREET dummies are not included in df_new below. With
# several thousand distinct streets this is a very wide frame; confirm that
# a later cell needs it, otherwise it only costs memory.
df_dummies_STREET = pd.get_dummies(df_total['STREET'])

# Numeric weather columns, selected by label instead of by position. With the
# column order shown above, 'Avg Temp (F)' through 'Precip (in)' is exactly
# the former df_total.iloc[:, 23:42], but it no longer breaks silently when a
# column is added or removed upstream.
# NOTE(review): 'High Temp (F)' is left out, as in the original slice -
# confirm that this is intentional.
df_weather_numeric = df_total.loc[:, 'Avg Temp (F)':'Precip (in)']

df_new = pd.concat(
    [
        df_dummies_OFFENSE_CODE_GROUP,
        df_dummies_Events,
        df_dummies_DISTRICT,
        df_dummies_UCR_PART,
        df_total[['Lat', 'Long']],
        df_weather_numeric,
    ],
    axis=1,
)
In [44]:
# Correlation heatmap of all encoded features, on a 25 x 25 figure.
plot_corr(df_new, size=25)