#!/usr/bin/env python
# coding: utf-8

# # Assessing building fire risk with Random Forest
# *Combining different governmental datasets to prioritize fire department resources*
#
# ## Background
#
# Fire departments across the United States spend considerable resources on educational and prevention campaigns to stop building fires before they happen. Moving from a shotgun approach to a targeted one that considers the likelihood of fire incidents could significantly improve the efficacy of these interventions. Machine learning algorithms trained on data from a wide variety of sources could vastly outperform random selection at identifying high-risk properties.
#
# Using data provided by the city of Sioux Falls, this notebook walks through the steps of predicting fire risk for every address in the city with a random forest classifier written in Python and trained on data from several county and municipal departments.
#
# ## Import libraries

# In[1]:

# Data analysis and visualization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Interactive maps
import folium
from folium.plugins import HeatMap

# Machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline


# ## Load and describe the data

# In[2]:

# Load the source datasets into pandas
fires_df = pd.read_csv('data/fires2.csv', parse_dates=[6])
parcels_df = pd.read_excel('data/parcels2.xlsx', parse_dates=True)
foreclosures_df = pd.read_excel('data/foreclosures.xlsx', parse_dates=True)
rent_reg_df = pd.read_excel('data/rental_registration_permits.xlsx', parse_dates=True)
utility_disconnects_df = pd.read_excel('data/utility_disconnects.xlsx', parse_dates=True)
code_cases_df = pd.read_excel('data/code_cases.xlsx', parse_dates=True)
minnehaha_tax_df = pd.read_excel('data/minnehaha_tax.xlsx', parse_dates=True)
lincoln_tax_df = pd.read_excel('data/lincoln_tax.xlsx', parse_dates=True)
crime_df = pd.read_excel('data/crime.xlsx', parse_dates=True)


# In[3]:

fires_df.type.value_counts()


# In[4]:

# Work on copies of the DataFrames so a mistake downstream doesn't require reloading everything;
# keep only building fire incidents (code 111)
building_fires = fires_df[fires_df['id2'] == 111].copy()
parcels = parcels_df.copy()
foreclosures = foreclosures_df.copy()
rent_reg = rent_reg_df.copy()
utility_disconnects = utility_disconnects_df.copy()
code_cases = code_cases_df.copy()
minnehaha_tax = minnehaha_tax_df.copy()
lincoln_tax = lincoln_tax_df.copy()
crime = crime_df.copy()


# In[5]:

parcels.columns


# In[6]:

foreclosures.describe()


# In[7]:

building_fires.info()


# In[8]:

# Visualize the fire incidents on a map
fire_map = folium.Map(location=[43.54, -96.72], zoom_start=14, tiles='Stamen Terrain')

heat_df = building_fires[['lat', 'lon']]
heat_df = heat_df.dropna(axis=0, subset=['lat', 'lon'])
heat_data = heat_df.values.tolist()

HeatMap(heat_data).add_to(fire_map)
fire_map
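# (Optional step, not in the original notebook: the interactive map only renders inline in Jupyter. folium can also write it to a standalone HTML file for sharing outside the notebook; the filename here is arbitrary.)

# In[ ]:

fire_map.save('fire_incident_heatmap.html')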
# ## Data wrangling

# In[9]:

def cleanActivity(x):
    """Recode the numeric land-use activity codes into broad categories."""
    if x in (11, 12, 13):
        x = 'SINGLE OR TWO RESIDENTIAL'
    elif x in (21, 22, 23, 24, 25):
        x = 'MULTIFAMILY'
    elif x in range(31, 39):
        x = 'OFFICE AND PUBLIC SERVICE'
    elif x in range(40, 50):
        x = 'INSTITUTIONAL'
    elif x in range(51, 54):
        x = 'COMMERCIAL'
    elif x == 61 or x in range(63, 70):
        x = 'INDUSTRIAL'
    elif x == 62:
        x = 'AIRPORT'
    elif x in range(70, 98) or x in range(0, 9):
        x = 'OPEN SPACES/NA'
    return x

parcels['ACTIVITY'] = parcels.ACTIVITY.apply(cleanActivity)

# Parcels dataset
df = pd.concat([parcels,
                pd.get_dummies(parcels.ACTIVITY, prefix='ACT_'),
                pd.get_dummies(parcels.PARPR),
                pd.get_dummies(parcels.PARCELTYPE),
                pd.get_dummies(parcels.PARTYPE),
                pd.get_dummies(parcels.COUNTY)], axis=1)
df.PARHOUSE.fillna(method='bfill', inplace=True)
df = df[df.ACTIVITY != 'AIRPORT']
df = df[df.ACTIVITY != 'OPEN SPACES/NA']
parcels.drop(columns=['BlockDesig', 'FORM_PRIMA', 'PARCELTYPE', 'FORM_COMME', 'PARCEL_TRA',
                      'PARCEL_LOT', 'created_us', 'TAG', 'ADDITION', 'ADDITIONPR', 'COUNTYID',
                      'OWNNAME1', 'OWNZIP', 'OWNZIP2', 'OWNNAME2', 'OWNCITY', 'OWNSTATE',
                      'OWNADDRESS', 'LEGAL', 'ADDITIONNU', 'PARHALF', 'PARPD', 'UNITNUM',
                      'GlobalID', 'last_edite', 'Shape_Leng', 'Shape_Area', 'DEPARTMENT',
                      'FORM_ACCES', 'FORM_SIGNE', 'FORM_DATE', 'created_da', 'last_edi_1',
                      'PARPR', 'PARTYPE', 'COUNTY', 'PARSTREET', 'LegalStart', 'ACTIVITY'],
             inplace=True)
df.drop(columns=['BlockDesig', 'ACREAGE', 'FORM_PRIMA', 'PARCELTYPE', 'FORM_COMME', 'PARCEL_TRA',
                 'PARCEL_LOT', 'created_us', 'TAG', 'ADDITION', 'ADDITIONPR', 'COUNTYID',
                 'OWNNAME1', 'OWNZIP', 'OWNZIP2', 'OWNNAME2', 'OWNCITY', 'OWNSTATE',
                 'OWNADDRESS', 'LEGAL', 'ADDITIONNU', 'PARHALF', 'PARPD', 'UNITNUM',
                 'GlobalID', 'last_edite', 'Shape_Leng', 'Shape_Area', 'DEPARTMENT',
                 'FORM_ACCES', 'FORM_SIGNE', 'FORM_DATE', 'created_da', 'last_edi_1',
                 'PARPR', 'PARTYPE', 'COUNTY', 'PARSTREET', 'LegalStart', 'ACTIVITY'],
        inplace=True)

# Fire dataset
building_fires.drop(columns=['FID', 'Join_Count', 'TARGET_FID', 'id', 'type', 'lat', 'lon', 'id2'],
                    inplace=True)
building_fires = building_fires.groupby('ADDRESS').max()
building_fires['INCIDENT'] = 1
df = pd.merge(df, building_fires, how='left', on='ADDRESS')
df.INCIDENT.fillna(0, inplace=True)
df.date.fillna(pd.to_datetime('1/1/2008'), inplace=True)

# Foreclosure dataset
foreclosure_df = foreclosures.groupby('ADDRESS').count()
foreclosure_df.rename(columns={'NAME': 'FORECLOSED'}, inplace=True)
foreclosure_df.drop(columns=['AUCTIONDATE', 'ASSESSEDVALUE', 'YEAR'], inplace=True)
df = pd.merge(df, foreclosure_df, how='left', on='ADDRESS')
df.FORECLOSED.fillna(0, inplace=True)

# Rent registry dataset
rent_reg.drop(columns=['City_1', 'State_1', 'Permit_Num', 'Contact_Ty', 'Last_Name',
                       'First_Name', 'Middle_Ini', 'Business_N', 'Contact_Pr', 'Business_P',
                       'Mobile_Pho', 'Home_Phone', 'Email', 'Contact_Ad', 'Issue_Date'],
              inplace=True)
rent_reg.Units.fillna(value=rent_reg.Units.median(), inplace=True)
rent_df = rent_reg.groupby('Address').mean()
rent_df.rename(columns={'Units': 'RENT_REG_UNITS', 'YEAR': 'RENT_REG_YEAR'}, inplace=True)
rent_df.rename_axis('ADDRESS', inplace=True)
rent_df['RENT_REG'] = 1
df = pd.merge(df, rent_df, how='left', on='ADDRESS')
df.RENT_REG.fillna(0, inplace=True)
df.RENT_REG_UNITS.fillna(0, inplace=True)
df.RENT_REG_YEAR.fillna(0, inplace=True)

# Utility disconnects dataset
utility_disconnects.rename(columns={'Address': 'ADDRESS', 'Year': 'UTILITY_DISCONNECTS'},
                           inplace=True)
uti_disc_df = utility_disconnects.groupby('ADDRESS').count()
uti_disc_df['ANY_DISCONNECT'] = 1
df = pd.merge(df, uti_disc_df, how='left', on='ADDRESS')
df.UTILITY_DISCONNECTS.fillna(0, inplace=True)
df.ANY_DISCONNECT.fillna(0, inplace=True)

# Code cases dataset
case_types = pd.get_dummies(code_cases.CaseType)
code_cases = pd.concat([code_cases, case_types], axis=1)
code_cases.drop(columns=['Year'], inplace=True)
# Use a distinct name so the raw code_cases_df loaded above is not overwritten
code_cases_agg = code_cases.groupby('ADDRESS').sum()
code_cases_agg['TOTAL_VIOLATIONS'] = code_cases_agg.sum(axis=1)
code_cases_agg['ANY_VIOLATIONS'] = 1
df = pd.merge(df, code_cases_agg, how='left', on='ADDRESS')
df.ANY_VIOLATIONS.fillna(0, inplace=True)
df.TOTAL_VIOLATIONS.fillna(0, inplace=True)
# Addresses without code cases get zero in every one-hot case-type column
# (column names, including the 'Water Purfication' misspelling, come straight from the source data)
for case_type in case_types.columns:
    df[case_type].fillna(0, inplace=True)

# Crime dataset
crime.rename(columns={'Offense': 'CRIME_INCIDENT'}, inplace=True)
# Each row is one offense, so counting rows per address gives CRIME_INCIDENT;
# again use a distinct name to keep the raw crime_df intact
crime_counts = crime.groupby('ADDRESS').count()[['CRIME_INCIDENT']]
crime_counts['ANY_CRIME'] = 1
df = pd.merge(df, crime_counts, how='left', on='ADDRESS')
df.CRIME_INCIDENT.fillna(0, inplace=True)
df.ANY_CRIME.fillna(0, inplace=True)

# Tax assessment datasets
minnehaha_tax.drop(columns=['TAG', 'COUNTYID', 'ACREAGE', 'SQFT', 'FRONTFOOT', 'PARHOUSE',
                            'PARHALF', 'PARPR', 'PARSTREET', 'PARTYPE', 'PARPD', 'LOT',
                            'BLOCK', 'TRACT', 'SUBDIVNO', 'ADDITION', 'LEGAL', 'COUNTY',
                            'MRTNSP', 'MRSCHD', 'SCHOOLDESC', 'MRZON1', 'NUMUNITS',
                            'MRLYAP', 'MAP_ID'], inplace=True)
lincoln_tax['ADDRESS'] = lincoln_tax.Address
lincoln_tax.drop(columns=['FID', 'OBJECTID', 'PID', 'Plat', 'SchoolDist', 'Township', 'STR',
                          'Address', 'Name', 'Add1', 'Add2', 'Add3', 'Zip', 'Legal1', 'Legal2',
                          'Legal3', 'Legal4', 'Class1_1', 'Class2_1', 'Class3_1', 'Class4_1',
                          'Class5_1'], inplace=True)
combined_tax = pd.merge(lincoln_tax, minnehaha_tax, how='outer', on='ADDRESS')
combined_tax.fillna(0, inplace=True)
combined_tax['LANDVALUE'] = combined_tax.MRLNVC + combined_tax.Value1_1
combined_tax['BUILDVALUE'] = combined_tax.MRBDVC + combined_tax.Value12_1
combined_tax['TOTALVALUE'] = combined_tax[['MRTOTC', 'Value1_1', 'Value12_1', 'Value13_1',
                                           'Value14_1', 'Value15_1']].sum(axis=1)
combined_tax.drop(columns=['Value1_1', 'Value12_1', 'Value13_1', 'Value14_1', 'Value15_1',
                           'MRLNVC', 'MRBDVC', 'MRTOTC'], inplace=True)
df = pd.merge(df, combined_tax, how='left', on='ADDRESS')
df = df[df.ADDRESS != '0']
df = df.dropna(subset=['ADDRESS'])
df.LANDVALUE.fillna(0, inplace=True)
df.BUILDVALUE.fillna(0, inplace=True)
df.TOTALVALUE.fillna(0, inplace=True)
df.set_index('ADDRESS', inplace=True)
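# (Added sanity check, not in the original notebook: most feature tables are grouped by ADDRESS before their left merge, but the combined tax table is not, so a one-to-many join could silently multiply parcel rows. Counting duplicate index entries makes that easy to catch.)

# In[ ]:

print('Parcel rows after merging:', len(df))
print('Duplicate addresses in index:', df.index.duplicated().sum())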
# In[10]:

# Move the INCIDENT column to the last position
df1 = df.pop('INCIDENT')
df['INCIDENT'] = df1


# In[11]:

# Count the addresses with at least one fire incident
df.INCIDENT.value_counts()


# In[12]:

## Hide the 2018 data

# First create a new column flagging parcels with incidents in 2018
df['INCIDENT_2018'] = (df.date > pd.to_datetime('2018')).astype(int)

# Now turn all 2018 fire incidents off
df.loc[(df.date > '2018') & (df.INCIDENT == 1), 'INCIDENT'] = 0


# In[13]:

# Count the addresses with at least one fire incident, excluding 2018 fires
df.INCIDENT.value_counts()


# In[14]:

# Split predictor and target variables
X = df.drop(columns=['INCIDENT', 'date'])
y = df.INCIDENT
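# (Added context, not in the original: building fires are rare events, so the classes are heavily imbalanced. Checking the positive rate up front helps interpret the accuracy scores reported later; if the imbalance turns out to hurt recall, RandomForestClassifier also accepts class_weight='balanced'.)

# In[ ]:

print(y.value_counts(normalize=True))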
# ## Apply random forest algorithm

# In[15]:

# Optimizing the algorithm for the best possible results

# Create a pipeline with a feature selector and a random forest classifier
pipe = Pipeline([
    ('feature_selection', SelectKBest(f_classif)),
    ('clf', RandomForestClassifier(random_state=2))])

# Create a parameter grid to test
params = {
    'feature_selection__k': [3, 5, 10, 20, 50],
    'clf__n_estimators': [2, 5, 10, 100],
    'clf__max_depth': [3, 5, 10, 20, 50]}

# Initialize the grid search object
grid_search = GridSearchCV(pipe, param_grid=params)

# Fit it to the data and print the best parameter combination
print(grid_search.fit(X, y).best_params_)


# In[16]:

# Select the best features based on the optimized parameters
sk = SelectKBest(f_classif, k=grid_search.best_params_['feature_selection__k'])
which_selected = sk.fit(X, y).get_support()
X = X[X.columns[which_selected]]

# Fit the classifier with the optimized parameters
random_forest = RandomForestClassifier(n_estimators=grid_search.best_params_['clf__n_estimators'],
                                       max_depth=grid_search.best_params_['clf__max_depth'])
random_forest.fit(X, y)


# In[17]:

# Correlation heat map of the predictive variables
comb = pd.concat([X, y], axis=1)
corr = comb.corr()
plt.figure(figsize=(10, 10))
dropSelf = np.zeros_like(corr, dtype=bool)
dropSelf[np.triu_indices_from(dropSelf)] = True
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
    annot=True,
    annot_kws={"size": 7},
    mask=dropSelf
)
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
)
plt.savefig('Results - Correlation matrix.pdf', bbox_inches='tight', pad_inches=2.5)


# ## Cross validation and 2018 fire prediction accuracy

# In[18]:

# Cross-validation score for the random forest
scores = cross_val_score(random_forest, X, y, cv=3)
round(np.mean(scores) * 100, ndigits=4)
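# (Added caveat, not in the original: with so few positive cases, plain accuracy is dominated by the majority class; a model that always predicts "no fire" would score almost as high. A ranking metric such as ROC AUC is more informative here. This sketch repeats the cross-validation with the scoring swapped.)

# In[ ]:

auc_scores = cross_val_score(random_forest, X, y, cv=3, scoring='roc_auc')
round(np.mean(auc_scores), ndigits=4)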
# In[19]:

# Assign a fire probability score to each address
df['PREDICTION'] = random_forest.predict_proba(X)[:, 1]
df.PREDICTION.describe()


# In[20]:

# Compare the fires found in 2018 in the 2000 highest-risk properties versus random selection
np.random.seed(0)
df['RANDOM'] = np.random.rand(df.shape[0])
df_random = df.nlargest(2000, columns='RANDOM')
rf_results_search = df.nlargest(2000, columns='PREDICTION')
print('Total 2018 predicted fires (2000 interventions): ' + str(rf_results_search.INCIDENT_2018.sum()))
print('Total random predicted fires (2000 interventions): ' + str(df_random.INCIDENT_2018.sum()))


# ## Describe and export the results

# In[21]:

# Visualize how fire risk is distributed among parcel addresses
df.PREDICTION.plot.hist()


# In[22]:

# Plot the comparison between random selection and sorting by highest risk
random_results = df_random.INCIDENT_2018.sum()
rf_results = rf_results_search.INCIDENT_2018.sum()
df_results = pd.DataFrame({'Model': ['Random search', 'RF - Highest risk'],
                           'results': [random_results, rf_results]})
ax = df_results.plot.bar(x='Model', y='results', color=['orange', 'b'])
plt.title('Results based on 2000 addresses in a year')
plt.ylabel('Fires caught')
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
)
plt.savefig('Results - Model comparison.pdf', bbox_inches='tight', pad_inches=0.5)


# In[23]:

# Compare the predictive model to random selection across intervention budgets
model_comparison = []
for searches in range(0, 55000, 100):
    prediction = df.nlargest(searches, columns='PREDICTION').INCIDENT_2018.sum()
    random = df.nlargest(searches, columns='RANDOM').INCIDENT_2018.sum()
    model_comparison.append([prediction, random])

model_comparison_df = pd.DataFrame(model_comparison, columns=['PREDICTION', 'RANDOM'])
model_comparison_df.plot()
plt.xlabel('Properties inspected or intervened')
plt.ylabel('Fire occurrence in properties inspected')
plt.xticks([])
plt.savefig('Model comparison - properties intervened.pdf', bbox_inches='tight', pad_inches=2.5)


# In[24]:

# ROC curve
y_test = df.INCIDENT_2018
preds = df.PREDICTION
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.savefig('AUC ROC curve.pdf', bbox_inches='tight', pad_inches=2.5)


# In[25]:

# Feature importances from the random forest classifier
feature_importances = pd.DataFrame(random_forest.feature_importances_,
                                   index=X.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)
feature_importances


# In[26]:

# Visualize the feature importances in a graph
importances = random_forest.feature_importances_
indices = np.argsort(importances)
ax = plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [X.columns[i] for i in indices])
plt.xlabel('Relative Importance')
plt.savefig('Results - Feature importance.pdf', bbox_inches='tight', pad_inches=0.5)


# In[27]:

# Export the results to CSV
df['PREDICTION'].to_csv('Results - risk predictions.csv', header=True)


# ## Have any questions? Email me at nsdiaz@uc.cl