#!/usr/bin/env python
# coding: utf-8
#
#
# Frogtown DataLoad Workbook, 05/06/18
#
# By Frogtown Crusader (Abu Nayeem)
# This is a cleanly coded process covering the upload steps, assuming all preliminary steps after the decoder are complete
#
#
# ## Table of contents
# * [Data Setup](#setup)
# * [Create Variables](#create)
# * [Intersection Table: Preparation](#intersection)
# * [Address Table: Preparation](#address)
# * [Preliminary Data Cleaning](#pre)
#
# ### Data Setup
# In[1]:
# --- Data setup: pull the St. Paul crime-incident dataset from the Socrata open-data API ---
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib as mpl
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')  # notebook-only magic: render plots inline
from IPython.display import HTML
from IPython.display import display
import requests # library to handle requests
#Load Data (legacy CSV path; superseded by the Socrata API pull below)
#df_crime = pd.read_csv('Datasets/Crime_Incident_Report_-_Dataset.csv')
#rename columns
#cols= ['Case','Date','Time','Code','IncType','Incident','Grid','NNum','Neighborhood','Block','CallDispCode','CallDisposition', 'Count']
#df_crime.columns= cols
#selection for Frogtown and nearby area
#df=df_crime.query('Grid in [66,67, 68, 86, 87,88,89, 90, 91, 92,106,107,108,109,110]')
#Socrata upload Method
from sodapy import Socrata
#New Upload Method: get information from the Socrata API.
# Second arg None = no app token (public, throttled access).
client = Socrata("information.stpaul.gov", None)
#Bulk download: large limit pulls the full dataset in one request
results = client.get("gppb-g9cg", limit=1000000)
df = pd.DataFrame.from_records(results)
# Find Max Date Value
# NOTE(review): this just takes the first record the API returns; it is only the
# most recent date if the API's default sort is newest-first -- confirm. Also,
# r_max is not used anywhere later in this file (likely inspected interactively).
results = client.get("gppb-g9cg", limit=1)
r_max = pd.DataFrame.from_records(results)
#rename columns [Note: the order of columns in the API feed differs from the legacy CSV]
cols= ['Block','CallDispCode','CallDisposition','Case','Code', 'Count','Date','Grid','Incident','IncType','Neighborhood','NNum','Time']
df.columns= cols
df=df.dropna()  # drop any record with a missing field before the numeric casts below
df = df.astype({"Case": int, "Code": int, "Grid":float, "NNum":int,"Count":int})
#select the police-grid cells covering Frogtown and the nearby area
df=df.query("Grid in (66,67, 68, 86, 87,88,89, 90, 91, 92,106,107,108,109,110)")
df.head(4)
# ### Create New Variables
# In[2]:
#Add time-derived variables to the working dataframe
df= df[df.Case != 18254093] #drop one record with a corrupted time value
#Convert Date to Datetime
from datetime import datetime
df['DateTime']= pd.to_datetime(df['Date']) # parsed timestamp column
df['Year']= df['DateTime'].dt.year #calendar year
df['DayofWeek']=df['DateTime'].dt.dayofweek #0=Monday ... 6=Sunday
df['Weekend'] = df['DayofWeek'].apply(lambda x: 1 if (x>4) else 0) #1 = Saturday or Sunday
df['Month'] = df['DateTime'].dt.month # calendar month
df['Day'] = df['DateTime'].dt.day #day of the current month
df['DayYear'] = df['DateTime'].dt.dayofyear #day of the year (1-366)
# Most recent day-of-year present in the data.
# FIX: the original used df.iloc[0, -1] (the first row's DayYear), which is only
# correct if the feed happens to arrive sorted newest-first; .max() gives the
# up-to-date day regardless of sort order.
df['Day_Max'] = df['DayYear'].max()
#Hour data
df['TimeHour']= pd.to_datetime(df['Time'])
df['Hour'] = df['TimeHour'].dt.hour.astype(int) #hour of day (0-23)
df['LateNight'] = df['Hour'].apply(lambda x: 1 if (x>21 or x<5) else 0) #late-night flag: 10 PM through 4:59 AM
#Create the intersection flag. Note: the Block column holds the raw address text
df.Block = df.Block.astype(str) #ensure string dtype before string ops
df['Block']= df['Block'].str.lower() #lowercase for uniform matching
#While scanning the data, all intersection addresses contained "&"
df['Intersection'] = df['Block'].apply(lambda x: 1 if '&' in x else 0) #1 = intersection-style address
df.head(5)
# ### Prepare Intersection table
# In[3]:
#Load the cleaned intersection -> coordinates lookup key
df_Intersection_key = pd.read_csv('Datasets/Intersection_key_clean.csv')
# New dataframe holding only the intersection records from the primary dataset
dfI=df.query('Intersection ==1')
print('The intersection table dimension are ' + str(dfI.shape))
#print(dfI.Block.head(10))
#Split "street1 & street2" into its two component streets
new=dfI['Block'].str.split("& ", n = 1, expand = True)
dfI['Inter2']= new[1] #street after the "&"
new=dfI['Block'].str.split(" ", n = 1, expand = True) #split at the first space only
dfI['Inter1']=new[0] #street before the "&"
#Build the lookup key; the IntersectionKey file was prepared to cover either street order
dfI['IndexKey']= dfI['Inter1']+ '_' + dfI['Inter2']
# FIX: removed the original bare `dfI.reset_index()` call here -- its return
# value was discarded (reset_index is not in-place), so it was a no-op, and the
# merge below produces a fresh index anyway.
dfI=pd.merge(dfI, df_Intersection_key, on='IndexKey', how='left')
# In[5]:
#Drop rows whose intersection was not found in the key (no coordinates)
dfI=dfI[dfI['Coordinates'].notnull()]
# Split the "lat,lon" string into separate numeric columns
new=dfI['Coordinates'].str.split(",", n = 1, expand = True)
dfI['Latitude']= pd.to_numeric(new[0]) #convert to float
dfI['Longitude']= pd.to_numeric(new[1])
#Replace the raw address text with the canonical intersection name from the key
dfI['Block']=dfI['OutputKey']
#Drop the helper columns used for the lookup
Drop_col=['Inter2','Inter1', 'IndexKey', 'Coordinates', 'OutputKey']
dfI_Final=dfI.drop(Drop_col, axis=1)
dfI_Final.head(5)
# ### Prepare Address Decoder Table
#
# It is remarkably short
# In[5]:
#Non-intersection (street address) records
dfW=df.query('Intersection==0')
#Load the complete address -> coordinates decoder key
df_C= pd.read_csv('Datasets/SemiKey.csv')
df_C= df_C[['Block','Latitude','Longitude']]
# Merge coordinates onto the address records, then drop rows the key could not decode.
dC=pd.merge(dfW, df_C, on='Block', how='left')
# FIX: the original filled every NaN in the whole frame with the string 'Mi' and
# then filtered on Latitude != "Mi", which stringified missing values in
# unrelated columns as a side effect. Filtering on notnull() keeps the same rows
# (df was dropna()'d upstream, so only the merged columns can be NaN) and
# matches how the intersection table drops its unmatched rows.
dC=dC[dC['Latitude'].notnull()]
#Bring the intersection and address records back together.
# NOTE(review): this concatenates dfI (which still carries the Inter1/Inter2/
# IndexKey helper columns), not dfI_Final -- confirm that is intentional.
# FIX: DataFrame.append is deprecated and removed in pandas 2.0; pd.concat is
# the supported equivalent.
fg= pd.concat([dfI, dC], ignore_index=True)
# ### Preliminary Pre-Cleaning steps
#
# Doing the data pre-cleaning here reduces clutter in the working notebook. The edits rename some values, cluster crimes together, and create dummy columns. The result is saved to a CSV, which is used for execution.
# In[6]:
#Normalize a few value labels.
# FIX: rewritten from chained indexing (fg.Col.loc[mask] = val), which assigns
# through an intermediate Series and can silently fail to write back (and raises
# SettingWithCopyWarning); fg.loc[mask, col] assigns on the frame directly.
fg.loc[fg['CallDisposition'] == 'G - Gone on Arrival', 'CallDisposition'] = 'Gone on Arrival'
fg.loc[fg['CallDisposition'] == 'A - Advised', 'CallDisposition'] = 'Advised'
fg.loc[fg['CallDisposition'] == 'RR - Report Written', 'CallDisposition'] = 'Report Written'
#'Simple Asasult Dom.' is a typo in the source data, kept verbatim as the match key
fg.loc[fg['Incident'] == 'Simple Asasult Dom.', 'Incident'] = 'Simple Assault Dom.'
fg.loc[fg['Incident'] == 'Graffiti', 'Incident'] = 'Vandalism'
#Cluster related crime types into broader categories
fg.loc[fg["Incident"].isin([ "Rape","Agg. Assault",'Homicide']), "Incident"] = 'Violent'
fg.loc[fg["Incident"].isin(["Simple Assault Dom.","Agg. Assault Dom."]), "Incident"] = 'Domestic Assault'
#Create a 0/1 dummy column for each crime category and each call disposition
fg= pd.concat([fg,pd.get_dummies(fg['Incident'])], axis=1)
fg= pd.concat([fg,pd.get_dummies(fg['CallDisposition'])], axis=1)
#Persist the cleaned dataset for the analysis notebooks
fg.to_csv('Datasets/FGCrime_Final.csv', encoding='utf-8', index=False)
# In[8]:
fg.query('Year==2021') #notebook display: spot-check current-year records
# In[7]:
# FIX: this cell was syntactically broken (the query call was missing its
# closing parenthesis, making the entire file a SyntaxError) and it references
# `Ward1`, which is never defined anywhere in this notebook -- it appears to be
# a stray paste from a separate voting-data workbook. Kept here commented out
# so the rest of the script can run; restore it in the notebook that defines
# Ward1 if it is still needed.
#Ward1.query('ElectionDescription in ("11/08/2016 - STATE GENERAL", "08/09/2016 - STATE PRIMARY","11/07/2017 - MUNICIPAL GENERAL","11/07/2017 - SCHOOL DISTRICT GENERAL" ,"11/08/2016 - STATE GENERAL","08/09/2016 - STATE PRIMARY","11/03/2015 - MUNICIPAL GENERAL","11/03/2015 - SCHOOL DISTRICT GENERAL","11/08/2011 - MUNICIPAL GENERAL","11/06/2018 - STATE GENERAL","08/14/2018 - STATE PRIMARY")')
# In[ ]: