import pandas as pd
import numpy
import re
data_files = [
"ap_2010.csv",
"class_size.csv",
"demographics.csv",
"graduation.csv",
"hs_directory.csv",
"sat_results.csv"
]
data = {}
for f in data_files:
d = pd.read_csv("schools/{0}".format(f))
data[f.replace(".csv", "")] = d
all_survey = pd.read_csv("schools/survey_all.txt", delimiter="\t", encoding='windows-1252')
d75_survey = pd.read_csv("schools/survey_d75.txt", delimiter="\t", encoding='windows-1252')
survey = pd.concat([all_survey, d75_survey], axis=0)
survey["DBN"] = survey["dbn"]
survey_fields = [
"DBN",
"rr_s",
"rr_t",
"rr_p",
"N_s",
"N_t",
"N_p",
"saf_p_11",
"com_p_11",
"eng_p_11",
"aca_p_11",
"saf_t_11",
"com_t_11",
"eng_t_11",
"aca_t_11",
"saf_s_11",
"com_s_11",
"eng_s_11",
"aca_s_11",
"saf_tot_11",
"com_tot_11",
"eng_tot_11",
"aca_tot_11",
]
survey = survey.loc[:,survey_fields]
data["survey"] = survey
data["hs_directory"]["DBN"] = data["hs_directory"]["dbn"]
def pad_csd(num):
string_representation = str(num)
if len(string_representation) > 1:
return string_representation
else:
return "0" + string_representation
data["class_size"]["padded_csd"] = data["class_size"]["CSD"].apply(pad_csd)
data["class_size"]["DBN"] = data["class_size"]["padded_csd"] + data["class_size"]["SCHOOL CODE"]
cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']
for c in cols:
data["sat_results"][c] = pd.to_numeric(data["sat_results"][c], errors="coerce")
data['sat_results']['sat_score'] = data['sat_results'][cols[0]] + data['sat_results'][cols[1]] + data['sat_results'][cols[2]]
def find_lat(loc):
coords = re.findall("\(.+, .+\)", loc)
lat = coords[0].split(",")[0].replace("(", "")
return lat
def find_lon(loc):
coords = re.findall("\(.+, .+\)", loc)
lon = coords[0].split(",")[1].replace(")", "").strip()
return lon
data["hs_directory"]["lat"] = data["hs_directory"]["Location 1"].apply(find_lat)
data["hs_directory"]["lon"] = data["hs_directory"]["Location 1"].apply(find_lon)
data["hs_directory"]["lat"] = pd.to_numeric(data["hs_directory"]["lat"], errors="coerce")
data["hs_directory"]["lon"] = pd.to_numeric(data["hs_directory"]["lon"], errors="coerce")
class_size = data["class_size"]
class_size = class_size[class_size["GRADE "] == "09-12"]
class_size = class_size[class_size["PROGRAM TYPE"] == "GEN ED"]
class_size = class_size.groupby("DBN").agg(numpy.mean)
class_size.reset_index(inplace=True)
data["class_size"] = class_size
data["demographics"] = data["demographics"][data["demographics"]["schoolyear"] == 20112012]
data["graduation"] = data["graduation"][data["graduation"]["Cohort"] == "2006"]
data["graduation"] = data["graduation"][data["graduation"]["Demographic"] == "Total Cohort"]
cols = ['AP Test Takers ', 'Total Exams Taken', 'Number of Exams with scores 3 4 or 5']
for col in cols:
data["ap_2010"][col] = pd.to_numeric(data["ap_2010"][col], errors="coerce")
combined = data["sat_results"]
combined = combined.merge(data["ap_2010"], on="DBN", how="left")
combined = combined.merge(data["graduation"], on="DBN", how="left")
to_merge = ["class_size", "demographics", "survey", "hs_directory"]
for m in to_merge:
combined = combined.merge(data[m], on="DBN", how="inner")
combined = combined.fillna(combined.mean())
combined = combined.fillna(0)
def get_first_two_chars(dbn):
return dbn[0:2]
combined["school_dist"] = combined["DBN"].apply(get_first_two_chars)
correlations = combined.corr()
correlations = correlations["sat_score"]
print(correlations)
SAT Critical Reading Avg. Score 0.986820 SAT Math Avg. Score 0.972643 SAT Writing Avg. Score 0.987771 sat_score 1.000000 AP Test Takers 0.523140 Total Exams Taken 0.514333 Number of Exams with scores 3 4 or 5 0.463245 Total Cohort 0.325144 CSD 0.042948 NUMBER OF STUDENTS / SEATS FILLED 0.394626 NUMBER OF SECTIONS 0.362673 AVERAGE CLASS SIZE 0.381014 SIZE OF SMALLEST CLASS 0.249949 SIZE OF LARGEST CLASS 0.314434 SCHOOLWIDE PUPIL-TEACHER RATIO NaN schoolyear NaN fl_percent NaN frl_percent -0.722225 total_enrollment 0.367857 ell_num -0.153778 ell_percent -0.398750 sped_num 0.034933 sped_percent -0.448170 asian_num 0.475445 asian_per 0.570730 black_num 0.027979 black_per -0.284139 hispanic_num 0.025744 hispanic_per -0.396985 white_num 0.449559 ... rr_p 0.047925 N_s 0.423463 N_t 0.291463 N_p 0.421530 saf_p_11 0.122913 com_p_11 -0.115073 eng_p_11 0.020254 aca_p_11 0.035155 saf_t_11 0.313810 com_t_11 0.082419 eng_t_11 0.036906 aca_t_11 0.132348 saf_s_11 0.337639 com_s_11 0.187370 eng_s_11 0.213822 aca_s_11 0.339435 saf_tot_11 0.318753 com_tot_11 0.077310 eng_tot_11 0.100102 aca_tot_11 0.190966 grade_span_max NaN expgrade_span_max NaN zip -0.063977 total_students 0.407827 number_programs 0.117012 priority08 NaN priority09 NaN priority10 NaN lat -0.121029 lon -0.132222 Name: sat_score, Length: 67, dtype: float64
# Remove DBN since it's a unique identifier, not a useful numerical value for correlation.
survey_fields.remove("DBN")
#survey_fields
import numpy as np
correlations = combined.corr()
correlations = correlations.loc[survey_fields]
# print(correlations.dtypes)
import matplotlib.pyplot as plt
%matplotlib inline
correlations.reset_index(inplace=True)
# correlations.head()
correlations.plot.bar(x='index',y='sat_score')
# combined.corr()['sat_score'][survey_fields]
<matplotlib.axes._subplots.AxesSubplot at 0x7f5bec27ba58>
Above, On the last screen (means above plot) you may have noticed that saf_t_11 and saf_s_11, which measure how teachers and students perceive safety at school, correlated highly with sat_score. On this screen, we'll dig into this relationship a bit more, and try to figure out which schools have low safety scores.
# Making scatterplot for saf_s_11 vs sat_score
combined.plot.scatter(x='saf_s_11',y='sat_score')
<matplotlib.axes._subplots.AxesSubplot at 0x7f5bec0ded68>
We can clearly see a strong possitive correlation in above plot this means that with increase in the 'sat_score' the values of 'safety score from students' is also increasing.
# The Borough is present in the column "boro"
import numpy as np
combined.groupby(by="boro").agg(np.mean)['saf_s_11']
boro Bronx 6.606577 Brooklyn 6.370755 Manhattan 6.831370 Queens 6.721875 Staten Island 6.530000 Name: saf_s_11, dtype: float64
From the above cell we can conclude,
race_list = ['white_per','asian_per','black_per','hispanic_per']
combined.corr()['sat_score'][race_list].plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x7f5bec1aa390>
From the above plot we can observe that,
# Making scatterplot for hispanic_per vs sat_score
combined.plot.scatter(x='hispanic_per',y='sat_score')
<matplotlib.axes._subplots.AxesSubplot at 0x7f5bebdf4dd8>
from the above plot we can observe that, Yes, there is a moderate negative correlation which indicated that yes, hispanic person tends to score less score in sat.
combined[combined['hispanic_per']>95]['SCHOOL NAME']
44 MANHATTAN BRIDGES HIGH SCHOOL 82 WASHINGTON HEIGHTS EXPEDITIONARY LEARNING SCHOOL 89 GREGORIO LUPERON HIGH SCHOOL FOR SCIENCE AND M... 125 ACADEMY FOR LANGUAGE AND TECHNOLOGY 141 INTERNATIONAL SCHOOL FOR LIBERAL ARTS 176 PAN AMERICAN INTERNATIONAL HIGH SCHOOL AT MONROE 253 MULTICULTURAL HIGH SCHOOL 286 PAN AMERICAN INTERNATIONAL HIGH SCHOOL Name: SCHOOL NAME, dtype: object
The schools listed above appear to primarily be geared towards recent immigrants to the US. These schools have a lot of students who are learning English, which would explain the lower SAT scores.
print(combined[(combined['hispanic_per']<10) & (combined['sat_score']>1800)]['SCHOOL NAME'])
37 STUYVESANT HIGH SCHOOL 151 BRONX HIGH SCHOOL OF SCIENCE 187 BROOKLYN TECHNICAL HIGH SCHOOL 327 QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK CO... 356 STATEN ISLAND TECHNICAL HIGH SCHOOL Name: SCHOOL NAME, dtype: object
Many of the schools above appear to be specialized science and technology schools that receive extra funding, and only admit students who pass an entrance exam. This doesn't explain the low hispanic_per, but it does explain why their students tend to do better on the SAT -- they are students from all over New York City who did well on a standardized test.
# Correlatio among male_per & female_per
gender_per_list = ['male_per','female_per']
combined.corr()['sat_score'][gender_per_list].plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x7f5bebd40978>
The above plot clearly indicates that males tend to have lesser SAT scores than Female. But, neither correlation is a significant strong one.
# Making scatterplot
combined.plot.scatter(x='female_per',y='sat_score')
<matplotlib.axes._subplots.AxesSubplot at 0x7f5bebcfdcf8>
From the above scatterplot we can observe a cluster of schools with high percentage of females in them with percentage between (40-60).
print(combined[(combined['female_per']>60) & (combined['sat_score']>1700)]['SCHOOL NAME'])
5 BARD HIGH SCHOOL EARLY COLLEGE 26 ELEANOR ROOSEVELT HIGH SCHOOL 60 BEACON HIGH SCHOOL 61 FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & A... 302 TOWNSEND HARRIS HIGH SCHOOL Name: SCHOOL NAME, dtype: object
From result in above cell we can cnclude that,
combined['ap_per'] = combined['AP Test Takers '] / combined['total_enrollment']
# Making the Scatter Plot
combined.plot.scatter(x='ap_per',y='sat_score')
<matplotlib.axes._subplots.AxesSubplot at 0x7f5bebbbc438>
listt=['ap_per']
combined.corr()['sat_score'][listt].plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x7f5bebb6f1d0>
So, based on the scatterplot and bar plot above we can say that there is almost no correlation among the 'ap_per' and 'sat_score'. Even if we consider this as a correlation the it's not to be considered as a strong correlation.