#!/usr/bin/env python # coding: utf-8 #

Data Visualization of NYC High Schools College Enrollment for the 2013-14 School Year, by Marlon Pimentel

#

Quick Summary

#

Tools: Python (with pandas and folium) & Google's Geocoding API

#

Using the public data available at NYC OpenData, I downloaded two data sets, one that only had the DBN codes associated with the schools along with the statistics of college enrollment, and another which was a directory of high schools (name, street address, city..etc). I combined what I needed from the two and then converted each address into coordinates using Google's Geocoding API. I then mapped each school into a city map based on their coordinates and color coded them based on their college enrollment, thus painting an image out of the data.

#

Keep in Mind:

# # In[320]: import folium # map visuals import pandas as pd # data manipulation import requests from time import sleep # In[322]: schoolPerformance = pd.read_csv(r"C:\Users\Marlon Pimentel\Desktop\2013_-_2014_DOE_HS_Performance.csv") schoolInfo = pd.read_csv(r"C:\Users\Marlon Pimentel\Desktop\DOE_High_School_Directory_2013-2014.csv") # In[323]: # extracts only necessary columns off data sets schoolStat = schoolPerformance["college enroll 2011-12"] schoolInfoSub = schoolInfo[["DBN", "Printed_Name", "Primary_Address_Line_1",\ "City", "State_Code", "neighborhood", "Boro"]] # merging the two subsets into one, adding new columns for coords and renaming fullProfiles = pd.concat([schoolInfoSub, schoolStat], axis=1, sort=False) fullProfiles["Latitude"] = "" fullProfiles["Longitude"] = "" fullProfiles["hasStat"] = "True" fullProfiles.rename(columns = {"college enroll 2011-12": "collegeStat"}, inplace = True) pd.set_option('max_rows', 8) fullProfiles # In[324]: # getting rid of schools from set that have no stat # tracking where these schools are located to include in final report unavailableStats = {"bx": 0, "bk": 0, "manh": 0, "queens": 0, "si": 0} for idx, row in fullProfiles.iterrows(): if type(row["collegeStat"]) == float: if row["Boro"] == "M": unavailableStats["manh"] += 1 elif row["Boro"] == "X": unavailableStats["bx"] += 1 elif row["Boro"] == "K": unavailableStats["bk"] += 1 elif row["Boro"] == "Q": unavailableStats["queens"] += 1 elif row["Boro"] == "R": unavailableStats["si"] += 1 fullProfiles.loc[idx, "hasStat"] = "False" print(unavailableStats) totalUnvailable = 0 for borough in unavailableStats.values(): totalUnvailable += borough print("Total Unavailable: {}".format(totalUnvailable)) # new data set with schools that had no stat removed (108 removed out of 422, down to 314) fullProfilesv2 = fullProfiles[fullProfiles.hasStat != "False"] fullProfilesv2 #

Geocoding

#

At this point, the data set is almost how I want it to be. I then began converting the school's full address into coordinates of latitude and longitude to add into the empty columns seen above.

# In[325]: # 1. Add coords into new columns # first run will give us some with coords and some without due to being missed by the requests # then we will run a similar func until all have coords for idx, row in fullProfilesv2.iterrows(): try: # sleep is to avoid sending too many requests quickly schoolAddress = row["Primary_Address_Line_1"] + ", " + row["City"] \ + ", " + row["State_Code"] schoolAddressEdit = schoolAddress.replace(" ", "+") r = requests.get("https://maps.googleapis.com/maps/api/geocode/json?address={}key=AIzaSyBU828rT3AJLuU4LtZksyB2iWH63oBLCsU".format(schoolAddressEdit)) jsonInfo = r.json() locationDict = jsonInfo['results'][0]['geometry']['location'] row["Latitude"] = locationDict['lat'] row["Longitude"] = locationDict['lng'] sleep(2) except IndexError: row["Latitude"] = "missed" row["Longitude"] = "missed" # In[387]: # runs until all have coordinates, then we can finally map by retrieving them # have to do it this way to meet API's limit (cus im not paying to get more requests, im broke) for idx, row in fullProfilesv2.iterrows(): try: if row["Latitude"] == "missed": schoolAddress = row["Primary_Address_Line_1"] + ", " + row["City"] \ + ", " + row["State_Code"] schoolAddressEdit = schoolAddress.replace(" ", "+") r = requests.get("https://maps.googleapis.com/maps/api/geocode/json?address={}key=AIzaSyBU828rT3AJLuU4LtZksyB2iWH63oBLCsU".format(schoolAddressEdit)) jsonInfo = r.json() locationDict = jsonInfo['results'][0]['geometry']['location'] row["Latitude"] = locationDict['lat'] row["Longitude"] = locationDict['lng'] sleep(2) except IndexError: pass print(fullProfilesv2.Latitude[fullProfilesv2.Latitude == "missed"].count()) # In[388]: print(fullProfilesv2.Latitude[fullProfilesv2.Latitude == "missed"].count()) print(any(fullProfilesv2.Latitude == "missed")) fullProfilesv2 #

Plotting onto Folium Map

#

Here, I had gotten all of the necessary parts that I needed. Thus, I began to create a map through folium, a nice library for interactive maps. With each cell in my data set, I evaluated its statistic using a criteria I made, and placed it on the map with a specific color.

# In[392]: fullProfilesv2.to_csv("schoolCompleteData.csv", encoding='utf-8') # In[401]: city_map = folium.Map(location=[40.738, -73.98], zoom_start=11, tiles="CartoDB dark_matter") # this fixes our problem of the requests missing schools thus not having all coordinates to map at once # maps each school onto map with a color based on their stat boroughsShown = {"BX": 0, "BK": 0, "Manhattan": 0, "Queens": 0, "SI": 0} boroughsShown["BX"] = fullProfilesv2.Boro[fullProfilesv2.Boro == "X"].count() boroughsShown["BK"] = fullProfilesv2.Boro[fullProfilesv2.Boro == "K"].count() boroughsShown["SI"] = fullProfilesv2.Boro[fullProfilesv2.Boro == "R"].count() boroughsShown["Manhattan"] = fullProfilesv2.Boro[fullProfilesv2.Boro == "M"].count() boroughsShown["Queens"] = fullProfilesv2.Boro[fullProfilesv2.Boro == "Q"].count() for boro, count in boroughsShown.items(): print("{} represented by {} schools.".format(boro, count)) for idx, row in fullProfilesv2.iterrows(): if row["Latitude"] != "missed": stat = float(row["collegeStat"].replace("%", "")) if stat >= 90: color = "#0000A0" # dark blue elif stat >= 75 and stat < 90: color = "#736AFF" # light blue elif stat >= 65 and stat < 75: color = "FFFFFF" # white elif stat >= 50 and stat < 65: color = "#FF0099" # pink elif stat < 50: color = "#FF0000" # red # popup_text = "{}
Neighborhood: {}
College Enrollment: {}
" # popup_text = popup_text.format(row["Printed_Name"], # row["neighborhood"], stat) folium.CircleMarker(location=(row["Latitude"], row["Longitude"]), radius=2, color=color, fill=True).add_to(city_map) city_map #

Key

#