#!/usr/bin/env python
# coding: utf-8
#
#
# Frogtown DataLoad Workbook, 05/06/18
#
# By Frogtown Crusader (Abu Nayeem)
# This is a cleanly coded process covering the upload steps, assuming all preliminary steps after the decoder are complete
#
#
# ## Table of contents
# * [Data Setup](#setup)
# * [Create Variables](#create)
# * [Intersection Table: Preparation](#intersection)
# * [Address Table: Preparation](#address)
# * [Preliminary Data Cleaning](#pre)
#
# ### Data Setup
# In[1]:
# --- Data setup: pull the St. Paul crime-incident dataset from the Socrata open-data API ---
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib as mpl
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')  # notebook-only magic: render plots inline
from IPython.display import HTML
from IPython.display import display
import requests # library to handle requests
#Load Data (legacy CSV path; superseded by the Socrata API pull below)
#df_crime = pd.read_csv('Datasets/Crime_Incident_Report_-_Dataset.csv')
#rename columns
#cols= ['Case','Date','Time','Code','IncType','Incident','Grid','NNum','Neighborhood','Block','CallDispCode','CallDisposition', 'Count']
#df_crime.columns= cols
#selection for Frogtown and nearby area
#df=df_crime.query('Grid in [66,67, 68, 86, 87,88,89, 90, 91, 92,106,107,108,109,110]')
#Socrata upload Method
from sodapy import Socrata
#New Upload Method: get information from the Socrata API.
# Second arg None = no app token (public, throttled access).
client = Socrata("information.stpaul.gov", None)
#Bulk download: large limit pulls the full dataset in one request
results = client.get("gppb-g9cg", limit=1000000)
df = pd.DataFrame.from_records(results)
# Find Max Date Value
# NOTE(review): this just takes the first record the API returns; it is only the
# most recent date if the API's default sort is newest-first -- confirm. Also,
# r_max is not used anywhere later in this file (likely inspected interactively).
results = client.get("gppb-g9cg", limit=1)
r_max = pd.DataFrame.from_records(results)
#rename columns [Note: the order of columns in the API feed differs from the legacy CSV]
cols= ['Block','CallDispCode','CallDisposition','Case','Code', 'Count','Date','Grid','Incident','IncType','Neighborhood','NNum','Time']
df.columns= cols
df=df.dropna()  # drop any record with a missing field before the numeric casts below
df = df.astype({"Case": int, "Code": int, "Grid":float, "NNum":int,"Count":int})
#select the police-grid cells covering Frogtown and the nearby area
df=df.query("Grid in (66,67, 68, 86, 87,88,89, 90, 91, 92,106,107,108,109,110)")
df.head(4)
# ### Create New Variables
# In[2]:
#Add time-derived variables to the working dataframe
df= df[df.Case != 18254093] #drop one record with a corrupted time value
#Convert Date to Datetime
from datetime import datetime
df['DateTime']= pd.to_datetime(df['Date']) # parsed timestamp column
df['Year']= df['DateTime'].dt.year #calendar year
df['DayofWeek']=df['DateTime'].dt.dayofweek #0=Monday ... 6=Sunday
df['Weekend'] = df['DayofWeek'].apply(lambda x: 1 if (x>4) else 0) #1 = Saturday or Sunday
df['Month'] = df['DateTime'].dt.month # calendar month
df['Day'] = df['DateTime'].dt.day #day of the current month
df['DayYear'] = df['DateTime'].dt.dayofyear #day of the year (1-366)
# Most recent day-of-year present in the data.
# FIX: the original used df.iloc[0, -1] (the first row's DayYear), which is only
# correct if the feed happens to arrive sorted newest-first; .max() gives the
# up-to-date day regardless of sort order.
df['Day_Max'] = df['DayYear'].max()
#Hour data
df['TimeHour']= pd.to_datetime(df['Time'])
df['Hour'] = df['TimeHour'].dt.hour.astype(int) #hour of day (0-23)
df['LateNight'] = df['Hour'].apply(lambda x: 1 if (x>21 or x<5) else 0) #late-night flag: 10 PM through 4:59 AM
#Create the intersection flag. Note: the Block column holds the raw address text
df.Block = df.Block.astype(str) #ensure string dtype before string ops
df['Block']= df['Block'].str.lower() #lowercase for uniform matching
#While scanning the data, all intersection addresses contained "&"
df['Intersection'] = df['Block'].apply(lambda x: 1 if '&' in x else 0) #1 = intersection-style address
df.head(5)
# ### Prepare Intersection table
# In[3]:
#Load the cleaned intersection -> coordinates lookup key
df_Intersection_key = pd.read_csv('Datasets/Intersection_key_clean.csv')
# New dataframe holding only the intersection records from the primary dataset
dfI=df.query('Intersection ==1')
print('The intersection table dimension are ' + str(dfI.shape))
#print(dfI.Block.head(10))
#Split "street1 & street2" into its two component streets
new=dfI['Block'].str.split("& ", n = 1, expand = True)
dfI['Inter2']= new[1] #street after the "&"
new=dfI['Block'].str.split(" ", n = 1, expand = True) #split at the first space only
dfI['Inter1']=new[0] #street before the "&"
#Build the lookup key; the IntersectionKey file was prepared to cover either street order
dfI['IndexKey']= dfI['Inter1']+ '_' + dfI['Inter2']
# FIX: removed the original bare `dfI.reset_index()` call here -- its return
# value was discarded (reset_index is not in-place), so it was a no-op, and the
# merge below produces a fresh index anyway.
dfI=pd.merge(dfI, df_Intersection_key, on='IndexKey', how='left')
# In[5]:
#Drop rows whose intersection was not found in the key (no coordinates)
dfI=dfI[dfI['Coordinates'].notnull()]
# Split the "lat,lon" string into separate numeric columns
new=dfI['Coordinates'].str.split(",", n = 1, expand = True)
dfI['Latitude']= pd.to_numeric(new[0]) #convert to float
dfI['Longitude']= pd.to_numeric(new[1])
#Replace the raw address text with the canonical intersection name from the key
dfI['Block']=dfI['OutputKey']
#Drop the helper columns used for the lookup
Drop_col=['Inter2','Inter1', 'IndexKey', 'Coordinates', 'OutputKey']
dfI_Final=dfI.drop(Drop_col, axis=1)
dfI_Final.head(5)
# ### Prepare Address Decoder Table
#
# It is remarkably short
# In[5]:
#Non-intersection (street address) records
dfW=df.query('Intersection==0')
#Load the complete address -> coordinates decoder key
df_C= pd.read_csv('Datasets/SemiKey.csv')
df_C= df_C[['Block','Latitude','Longitude']]
# Merge coordinates onto the address records, then drop rows the key could not decode.
dC=pd.merge(dfW, df_C, on='Block', how='left')
# FIX: the original filled every NaN in the whole frame with the string 'Mi' and
# then filtered on Latitude != "Mi", which stringified missing values in
# unrelated columns as a side effect. Filtering on notnull() keeps the same rows
# (df was dropna()'d upstream, so only the merged columns can be NaN) and
# matches how the intersection table drops its unmatched rows.
dC=dC[dC['Latitude'].notnull()]
#Bring the intersection and address records back together.
# NOTE(review): this concatenates dfI (which still carries the Inter1/Inter2/
# IndexKey helper columns), not dfI_Final -- confirm that is intentional.
# FIX: DataFrame.append is deprecated and removed in pandas 2.0; pd.concat is
# the supported equivalent.
fg= pd.concat([dfI, dC], ignore_index=True)
# ### Preliminary Pre-Cleaning steps
#
# Doing the data pre-cleaning here reduces clutter in the working notebook. The edits rename some values, cluster crimes together, and create dummy columns. The result is saved to a CSV, which is used for execution.
# In[6]:
#Normalize a few value labels.
# FIX: rewritten from chained indexing (fg.Col.loc[mask] = val), which assigns
# through an intermediate Series and can silently fail to write back (and raises
# SettingWithCopyWarning); fg.loc[mask, col] assigns on the frame directly.
fg.loc[fg['CallDisposition'] == 'G - Gone on Arrival', 'CallDisposition'] = 'Gone on Arrival'
fg.loc[fg['CallDisposition'] == 'A - Advised', 'CallDisposition'] = 'Advised'
fg.loc[fg['CallDisposition'] == 'RR - Report Written', 'CallDisposition'] = 'Report Written'
#'Simple Asasult Dom.' is a typo in the source data, kept verbatim as the match key
fg.loc[fg['Incident'] == 'Simple Asasult Dom.', 'Incident'] = 'Simple Assault Dom.'
fg.loc[fg['Incident'] == 'Graffiti', 'Incident'] = 'Vandalism'
#Cluster related crime types into broader categories
fg.loc[fg["Incident"].isin([ "Rape","Agg. Assault",'Homicide']), "Incident"] = 'Violent'
fg.loc[fg["Incident"].isin(["Simple Assault Dom.","Agg. Assault Dom."]), "Incident"] = 'Domestic Assault'
#Create a 0/1 dummy column for each crime category and each call disposition
fg= pd.concat([fg,pd.get_dummies(fg['Incident'])], axis=1)
fg= pd.concat([fg,pd.get_dummies(fg['CallDisposition'])], axis=1)
#Persist the cleaned dataset for the analysis notebooks
fg.to_csv('Datasets/FGCrime_Final.csv', encoding='utf-8', index=False)
# In[8]:
fg.query('Year==2021') #notebook display: spot-check current-year records
# In[7]:
# FIX: this cell was syntactically broken (the query call was missing its
# closing parenthesis, making the entire file a SyntaxError) and it references
# `Ward1`, which is never defined anywhere in this notebook -- it appears to be
# a stray paste from a separate voting-data workbook. Kept here commented out
# so the rest of the script can run; restore it in the notebook that defines
# Ward1 if it is still needed.
#Ward1.query('ElectionDescription in ("11/08/2016 - STATE GENERAL", "08/09/2016 - STATE PRIMARY","11/07/2017 - MUNICIPAL GENERAL","11/07/2017 - SCHOOL DISTRICT GENERAL" ,"11/08/2016 - STATE GENERAL","08/09/2016 - STATE PRIMARY","11/03/2015 - MUNICIPAL GENERAL","11/03/2015 - SCHOOL DISTRICT GENERAL","11/08/2011 - MUNICIPAL GENERAL","11/06/2018 - STATE GENERAL","08/14/2018 - STATE PRIMARY")')
# In[ ]: