#!/usr/bin/env python
# coding: utf-8

# # Read in the data

# In[1]:

import pandas as pd
import numpy
import re

data_files = [
    "ap_2010.csv",
    "class_size.csv",
    "demographics.csv",
    "graduation.csv",
    "hs_directory.csv",
    "sat_results.csv",
]

# Load every dataset into one dict keyed by the file's base name
# ("ap_2010", "class_size", ...).
data = {}
for f in data_files:
    d = pd.read_csv("schools/{0}".format(f))
    data[f.replace(".csv", "")] = d

# # Read in the surveys

# In[2]:

all_survey = pd.read_csv("schools/survey_all.txt", delimiter="\t",
                         encoding='windows-1252')
d75_survey = pd.read_csv("schools/survey_d75.txt", delimiter="\t",
                         encoding='windows-1252')
survey = pd.concat([all_survey, d75_survey], axis=0)

# Copy the lowercase "dbn" into "DBN" so the surveys share the merge key
# used by every other dataset.
survey["DBN"] = survey["dbn"]

# Keep only the identifier plus the response-rate (rr_*/N_*) and the
# safety/communication/engagement/academic-expectation score columns.
survey_fields = [
    "DBN",
    "rr_s",
    "rr_t",
    "rr_p",
    "N_s",
    "N_t",
    "N_p",
    "saf_p_11",
    "com_p_11",
    "eng_p_11",
    "aca_p_11",
    "saf_t_11",
    "com_t_11",
    "eng_t_11",
    "aca_t_11",
    "saf_s_11",
    "com_s_11",
    "eng_s_11",
    "aca_s_11",
    "saf_tot_11",
    "com_tot_11",
    "eng_tot_11",
    "aca_tot_11",
]
survey = survey.loc[:, survey_fields]
data["survey"] = survey

# # Add DBN columns

# In[3]:

data["hs_directory"]["DBN"] = data["hs_directory"]["dbn"]


def pad_csd(num):
    """Return the community school district as a two-character, zero-padded string."""
    # zfill(2) pads single-digit districts with a leading "0" and leaves
    # longer strings untouched -- same behaviour as the old manual branch.
    return str(num).zfill(2)


# DBN = zero-padded district number + school code.
data["class_size"]["padded_csd"] = data["class_size"]["CSD"].apply(pad_csd)
data["class_size"]["DBN"] = (
    data["class_size"]["padded_csd"] + data["class_size"]["SCHOOL CODE"]
)

# # Convert columns to numeric

# In[4]:

cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score',
        'SAT Writing Avg. Score']
for c in cols:
    data["sat_results"][c] = pd.to_numeric(data["sat_results"][c],
                                           errors="coerce")

# "+" (rather than .sum(axis=1)) deliberately propagates NaN: a school
# missing any section has no meaningful combined score.
data['sat_results']['sat_score'] = (
    data['sat_results'][cols[0]]
    + data['sat_results'][cols[1]]
    + data['sat_results'][cols[2]]
)


def find_lat(loc):
    """Extract the latitude from a "Location 1" string ending in "(lat, lon)".

    Assumes every row contains a parenthesised coordinate pair -- raises
    IndexError otherwise (TODO confirm no rows are missing coordinates).
    """
    # Raw string: "\(" in a plain literal is an invalid escape sequence
    # (SyntaxWarning from Python 3.12 onward).
    coords = re.findall(r"\(.+, .+\)", loc)
    lat = coords[0].split(",")[0].replace("(", "")
    return lat


def find_lon(loc):
    """Extract the longitude from a "Location 1" string ending in "(lat, lon)"."""
    coords = re.findall(r"\(.+, .+\)", loc)
    lon = coords[0].split(",")[1].replace(")", "").strip()
    return lon


data["hs_directory"]["lat"] = data["hs_directory"]["Location 1"].apply(find_lat)
data["hs_directory"]["lon"] = data["hs_directory"]["Location 1"].apply(find_lon)

data["hs_directory"]["lat"] = pd.to_numeric(data["hs_directory"]["lat"],
                                            errors="coerce")
data["hs_directory"]["lon"] = pd.to_numeric(data["hs_directory"]["lon"],
                                            errors="coerce")

# # Condense datasets

# In[5]:

class_size = data["class_size"]
class_size = class_size[class_size["GRADE "] == "09-12"]
class_size = class_size[class_size["PROGRAM TYPE"] == "GEN ED"]

# Average the numeric columns per school. numeric_only=True reproduces the
# old .agg(numpy.mean) behaviour (silently dropping non-numeric columns),
# which pandas >= 2.0 no longer does implicitly.
class_size = class_size.groupby("DBN").mean(numeric_only=True)
class_size.reset_index(inplace=True)
data["class_size"] = class_size

# Keep one row per school: the 2011-2012 school year and the 2006 full cohort.
data["demographics"] = data["demographics"][
    data["demographics"]["schoolyear"] == 20112012
]

data["graduation"] = data["graduation"][data["graduation"]["Cohort"] == "2006"]
data["graduation"] = data["graduation"][
    data["graduation"]["Demographic"] == "Total Cohort"
]

# # Convert AP scores to numeric

# In[6]:

cols = ['AP Test Takers ', 'Total Exams Taken',
        'Number of Exams with scores 3 4 or 5']
for col in cols:
    data["ap_2010"][col] = pd.to_numeric(data["ap_2010"][col], errors="coerce")

# # Combine the datasets

# In[7]:

combined = data["sat_results"]

# Left joins keep every school with SAT results even when AP/graduation
# data is missing; the remaining datasets are required, so inner joins.
combined = combined.merge(data["ap_2010"], on="DBN", how="left")
combined = combined.merge(data["graduation"], on="DBN", how="left")

to_merge = ["class_size", "demographics", "survey", "hs_directory"]
for m in to_merge:
    combined = combined.merge(data[m], on="DBN", how="inner")

# Fill numeric gaps with each column's mean. numeric_only=True is required
# in pandas >= 2.0, where DataFrame.mean() raises on mixed-dtype frames.
combined = combined.fillna(combined.mean(numeric_only=True))
combined = combined.fillna(0)

# # Add a school district column for mapping

# In[8]:


def get_first_two_chars(dbn):
    """Return the first two characters of a DBN: the school-district code."""
    return dbn[0:2]


combined["school_dist"] = combined["DBN"].apply(get_first_two_chars)

# # Find correlations

# In[9]:

# numeric_only=True keeps the pre-2.0 pandas behaviour of ignoring
# non-numeric columns instead of raising a TypeError.
correlations = combined.corr(numeric_only=True)
correlations = correlations["sat_score"]
print(correlations)

# # Plotting survey correlations

# In[10]:

# Remove DBN since it's a unique identifier, not a useful numerical value for correlation.
survey_fields.remove("DBN")

# In[11]:

get_ipython().run_line_magic('matplotlib', 'inline')
combined.corr(numeric_only=True)["sat_score"][survey_fields].plot.bar()

# As expected, any of the columns (*N_s*, *N_t*, *N_p*) associated with
# *total_enrollment* show a strong correlation with *sat_score*. This is
# expected given the high correlation between *total_enrollment* and
# *sat_score*.
#
# The response rate of the students (*rr_s*) is higher than the other
# response rates. This could be skewed by better performing students,
# because they are more likely to fill out surveys when presented with them.
#
# Safety correlates with SAT scores fairly well, specifically with teachers
# and students (*saf_t*, *saf_s*). On the surface these might seem
# unrelated, but it is hard to teach or learn when one does not feel safe
# trying to do either.
#
# It is also worth noting that students' perception of academic standards
# (*aca_s*) correlates better to SAT scores than the perceptions of
# teachers or parents (*aca_t*, *aca_p*).

# # Looking at Safety

# In[12]:

combined.plot.scatter('saf_s_11', 'sat_score')

# The correlation between safety and SAT scores is not as strong as the bar
# graph above indicates. Looking at the scatter plot, the correlation falls
# off as the safety score increases. This is interesting because it points
# to the fact that schools can be perceived as safe by students and still
# be low performing. The most useful information to take from this plot is
# that a safety score below 6.5 to 7 appears to cap SAT scores around 1500,
# because there are no SAT scores higher than 1500 on the low-safety-score
# side of the plot.

# # Safety by District

# In[13]:

import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap

# Average every numeric column per district; numeric_only=True replaces the
# .agg(numpy.mean) call that pandas >= 2.0 rejects on mixed-dtype frames.
districts = combined.groupby("school_dist").mean(numeric_only=True)
districts.reset_index(inplace=True)

# Mercator projection bounded to the five boroughs of New York City.
m = Basemap(
    projection='merc',
    llcrnrlat=40.496044,
    urcrnrlat=40.915256,
    llcrnrlon=-74.255735,
    urcrnrlon=-73.700272,
    resolution='i'
)

m.drawmapboundary(fill_color='#85A6D9')
m.drawcoastlines(color='#6D5F47', linewidth=.4)
m.drawrivers(color='#6D5F47', linewidth=.4)

longitudes = districts['lon'].tolist()
latitudes = districts['lat'].tolist()
m.scatter(longitudes, latitudes, s=50, zorder=2, latlon=True,
          c=districts['saf_s_11'], cmap='summer')

# Parts of Manhattan, Queens, and the Bronx have the highest safety scores.
# Brooklyn tends to have lower scores in comparison to the other boroughs.

# # SAT Scores by Racial Identity

# In[14]:

race = ['white_per', 'asian_per', 'black_per', 'hispanic_per']
combined.corr(numeric_only=True)['sat_score'][race].plot.bar()

# White and Asian student percentages correlate positively with SAT scores,
# while black and Hispanic percentages correlate negatively. This could be
# a funding issue with certain areas and boroughs. The lowest correlation
# being among Hispanic students could also point to a language barrier.

# In[15]:

combined.plot.scatter('hispanic_per', 'sat_score')

# There is a definite correlation in the data between Hispanic percentage
# and SAT scores: as this percentage increases the scores decrease. While
# funding in predominantly Hispanic school districts could be an issue,
# this could also be a function of classes being taught in English to
# non-English speakers.

# In[16]:

print(combined[combined['hispanic_per'] > 95]['SCHOOL NAME'])

# The majority of these schools serve students that have just immigrated to
# the United States. These students are attempting to learn English as a
# second language along with their normal school studies, which would
# explain lower SAT scores.

# In[17]:

print(combined[(combined['hispanic_per'] < 10)
               & (combined['sat_score'] > 1800)]['SCHOOL NAME'])

# The majority of these schools are STEM focused, which means they receive
# extra funding and focus on math, science, and technology. Many of these
# schools also require an entrance exam. This explains the higher SAT
# scores, as these are students that have already passed certain
# standardized tests just to be able to attend. While it does not
# completely explain the low percentage of Hispanic students, it could
# point back to language being a barrier keeping otherwise capable students
# from comprehending certain portions of the entrance exams.

# # SAT Scores by Gender

# In[18]:

gender = ['male_per', 'female_per']
combined.corr(numeric_only=True)['sat_score'][gender].plot.bar()

# While neither correlation is especially strong, it appears that schools
# with a higher female population correlate to higher SAT scores.

# In[19]:

combined.plot.scatter('female_per', 'sat_score')

# Nothing in the above scatter plot stands out as to why SAT scores and
# female populations have a positive correlation; the majority of the
# cluster sits right along the naturally expected 50%. There is, however, a
# group of high-scoring, high-female-population (60%-80%) schools worth
# further investigation.

# In[20]:

print(combined[(combined['female_per'] > 60)
               & (combined['sat_score'] > 1700)]['SCHOOL NAME'])

# These schools are all highly rated liberal-arts-focused high schools,
# which also happen to be extremely selective.

# # AP Scores vs. SAT Scores

# In[21]:

# Raw AP test-taker counts scale with school size, so normalise by total
# enrollment before comparing against SAT scores.
combined['ap_per'] = combined['AP Test Takers '] / combined['total_enrollment']
combined.plot.scatter('ap_per', 'sat_score')

# This relationship is not very strong, but there is a correlation between
# the proportion of AP test participants in a school and the school's
# overall SAT score average.