import pandas as pd
import numpy
import re
data_files = [
"ap_2010.csv",
"class_size.csv",
"demographics.csv",
"graduation.csv",
"hs_directory.csv",
"sat_results.csv"
]
data = {}
for f in data_files:
d = pd.read_csv("schools/{0}".format(f))
data[f.replace(".csv", "")] = d
all_survey = pd.read_csv("schools/survey_all.txt", delimiter="\t", encoding='windows-1252')
d75_survey = pd.read_csv("schools/survey_d75.txt", delimiter="\t", encoding='windows-1252')
survey = pd.concat([all_survey, d75_survey], axis=0)
survey["DBN"] = survey["dbn"]
survey_fields = [
"DBN",
"rr_s",
"rr_t",
"rr_p",
"N_s",
"N_t",
"N_p",
"saf_p_11",
"com_p_11",
"eng_p_11",
"aca_p_11",
"saf_t_11",
"com_t_11",
"eng_t_11",
"aca_t_11",
"saf_s_11",
"com_s_11",
"eng_s_11",
"aca_s_11",
"saf_tot_11",
"com_tot_11",
"eng_tot_11",
"aca_tot_11",
]
survey = survey.loc[:,survey_fields]
data["survey"] = survey
data["hs_directory"]["DBN"] = data["hs_directory"]["dbn"]
def pad_csd(num):
string_representation = str(num)
if len(string_representation) > 1:
return string_representation
else:
return "0" + string_representation
data["class_size"]["padded_csd"] = data["class_size"]["CSD"].apply(pad_csd)
data["class_size"]["DBN"] = data["class_size"]["padded_csd"] + data["class_size"]["SCHOOL CODE"]
cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']
for c in cols:
data["sat_results"][c] = pd.to_numeric(data["sat_results"][c], errors="coerce")
data['sat_results']['sat_score'] = data['sat_results'][cols[0]] + data['sat_results'][cols[1]] + data['sat_results'][cols[2]]
def find_lat(loc):
coords = re.findall("\(.+, .+\)", loc)
lat = coords[0].split(",")[0].replace("(", "")
return lat
def find_lon(loc):
coords = re.findall("\(.+, .+\)", loc)
lon = coords[0].split(",")[1].replace(")", "").strip()
return lon
data["hs_directory"]["lat"] = data["hs_directory"]["Location 1"].apply(find_lat)
data["hs_directory"]["lon"] = data["hs_directory"]["Location 1"].apply(find_lon)
data["hs_directory"]["lat"] = pd.to_numeric(data["hs_directory"]["lat"], errors="coerce")
data["hs_directory"]["lon"] = pd.to_numeric(data["hs_directory"]["lon"], errors="coerce")
class_size = data["class_size"]
class_size = class_size[class_size["GRADE "] == "09-12"]
class_size = class_size[class_size["PROGRAM TYPE"] == "GEN ED"]
class_size = class_size.groupby("DBN").agg(numpy.mean)
class_size.reset_index(inplace=True)
data["class_size"] = class_size
data["demographics"] = data["demographics"][data["demographics"]["schoolyear"] == 20112012]
data["graduation"] = data["graduation"][data["graduation"]["Cohort"] == "2006"]
data["graduation"] = data["graduation"][data["graduation"]["Demographic"] == "Total Cohort"]
cols = ['AP Test Takers ', 'Total Exams Taken', 'Number of Exams with scores 3 4 or 5']
for col in cols:
data["ap_2010"][col] = pd.to_numeric(data["ap_2010"][col], errors="coerce")
combined = data["sat_results"]
combined = combined.merge(data["ap_2010"], on="DBN", how="left")
combined = combined.merge(data["graduation"], on="DBN", how="left")
to_merge = ["class_size", "demographics", "survey", "hs_directory"]
for m in to_merge:
combined = combined.merge(data[m], on="DBN", how="inner")
combined = combined.fillna(combined.mean())
combined = combined.fillna(0)
def get_first_two_chars(dbn):
return dbn[0:2]
combined["school_dist"] = combined["DBN"].apply(get_first_two_chars)
correlations = combined.corr()
correlations = correlations["sat_score"]
print(correlations)
SAT Critical Reading Avg. Score 0.986820 SAT Math Avg. Score 0.972643 SAT Writing Avg. Score 0.987771 sat_score 1.000000 AP Test Takers 0.523140 Total Exams Taken 0.514333 Number of Exams with scores 3 4 or 5 0.463245 Total Cohort 0.325144 CSD 0.042948 NUMBER OF STUDENTS / SEATS FILLED 0.394626 NUMBER OF SECTIONS 0.362673 AVERAGE CLASS SIZE 0.381014 SIZE OF SMALLEST CLASS 0.249949 SIZE OF LARGEST CLASS 0.314434 SCHOOLWIDE PUPIL-TEACHER RATIO NaN schoolyear NaN fl_percent NaN frl_percent -0.722225 total_enrollment 0.367857 ell_num -0.153778 ell_percent -0.398750 sped_num 0.034933 sped_percent -0.448170 asian_num 0.475445 asian_per 0.570730 black_num 0.027979 black_per -0.284139 hispanic_num 0.025744 hispanic_per -0.396985 white_num 0.449559 ... rr_p 0.047925 N_s 0.423463 N_t 0.291463 N_p 0.421530 saf_p_11 0.122913 com_p_11 -0.115073 eng_p_11 0.020254 aca_p_11 0.035155 saf_t_11 0.313810 com_t_11 0.082419 eng_t_11 0.036906 aca_t_11 0.132348 saf_s_11 0.337639 com_s_11 0.187370 eng_s_11 0.213822 aca_s_11 0.339435 saf_tot_11 0.318753 com_tot_11 0.077310 eng_tot_11 0.100102 aca_tot_11 0.190966 grade_span_max NaN expgrade_span_max NaN zip -0.063977 total_students 0.407827 number_programs 0.117012 priority08 NaN priority09 NaN priority10 NaN lat -0.121029 lon -0.132222 Name: sat_score, Length: 67, dtype: float64
# Remove DBN since it's a unique identifier, not a useful numerical value for correlation.
survey_fields.remove("DBN")
#setting up matplotlib
%matplotlib inline
combined.corr()["sat_score"][survey_fields].plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x7fc7b53b3ef0>
Conclussion: High correlation is observed between N_s, N_t, N_p (which are the survey's number of respondents for student, teacher and parents) and sat_score as these are related to total enrollment.
Interesting correlation is also seen between student response rate(rr_s) and sat_scores as good academic result is likely to get responded to in a survey.
# Making a scatter plot of the saf_s_11 column vs. the sat_score
# in combined
combined.plot.scatter(x="saf_s_11", y="sat_score")
<matplotlib.axes._subplots.AxesSubplot at 0x7fc7b53575c0>
The scatter plot above is used to explore and explain the relationship between the student perception of school safety and the Sat Scores. Where the safety score is 7 or less, the average score achieved is around 1400 or less. And where the safety score is 7.5 or more, higher average score of 1800 and over is being achieved.
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
districts = combined.groupby('school_dist').agg(np.mean)
districts.reset_index(inplace=True)
m = Basemap(
projection='merc',
llcrnrlat=40.496044,
urcrnrlat=40.915256,
llcrnrlon=-74.255735,
urcrnrlon=-73.700272,
resolution='i'
)
m.drawmapboundary(fill_color='#85A6D9')
m.drawcoastlines(color='#6D5F47', linewidth=.4)
m.drawrivers(color='#6D5F47', linewidth=.4)
longitudes = districts["lon"].tolist()
latitudes = districts["lat"].tolist()
m.scatter(longitudes, latitudes, s=50, zorder=2, latlon=True, c=districts["saf_s_11"], cmap="summer")
plt.show()
race_fields = ["white_per","asian_per","black_per","hispanic_per"]
combined.corr()["sat_score"][race_fields].plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x7fc7b5293470>
Plots shows that both White and Asian studesnts correalte positively with the sat results while Black and Hispanic students correlate negativle probably due to the social, housing and family disadvantages including funding for the minority communities
# Using scatter plot to explore the relationship between low SAT Scores
# and high concentration of Hispanics in combined dataset
combined.plot.scatter(x="hispanic_per", y="sat_score")
<matplotlib.axes._subplots.AxesSubplot at 0x7fc7b4fd66d8>
high_hispanic = combined[combined["hispanic_per"] > 95]
high_hispanic["School Name"].head()
44 MANHATTAN BRIDGES HIGH SCHOOL 82 0 89 GREGORIO LUPERON HIGH SCHOOL FOR SCIE 125 0 141 INTERNATIONAL SCHOOL FOR LIBERAL ARTS Name: School Name, dtype: object
low_hispanic = combined[combined["hispanic_per"] < 10]
low_hispanic = combined[combined["sat_score"] > 1800]
low_hispanic["School Name"]
5 BARD HIGH SCHOOL EARLY COLLEGE 37 STUYVESANT HIGH SCHOOL 79 HIGH SCHOOL FOR MATHEMATICS SCIENCE 151 BRONX HIGH SCHOOL OF SCIENCE 155 HIGH SCHOOL OF AMERICAN STUDIES AT LE 187 BROOKLYN TECHNICAL HIGH SCHOOL 302 TOWNSEND HARRIS HIGH SCHOOL 327 QUEENS HIGH SCHOOL FOR THE SCIENCES A 356 STATEN ISLAND TECHNICAL HIGH SCHOOL Name: School Name, dtype: object
gender_fields = ["male_per","female_per"]
combined.corr()["sat_score"][gender_fields].plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x7fc7b4fd6438>
A high percentage of female students correllates positively with the sat score while male students correlates negatively
combined.plot.scatter("female_per", "sat_score")
<matplotlib.axes._subplots.AxesSubplot at 0x7fc7b5014cc0>
The scatter plot shows that around half of the percentage of the females achieved less than 1400 in SAT score
combined[(combined["female_per"] > 60 ) & (combined["sat_score"] > 1700)]["SCHOOL NAME"]
5 BARD HIGH SCHOOL EARLY COLLEGE 26 ELEANOR ROOSEVELT HIGH SCHOOL 60 BEACON HIGH SCHOOL 61 FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & A... 302 TOWNSEND HARRIS HIGH SCHOOL Name: SCHOOL NAME, dtype: object
Above shows unique sets of predominantly female students at schools specializing Arts and/or music schools and with high Sat scores or good academic achievement.
combined["ap_per"] = combined["AP Test Takers "] / combined["total_enrollment"]
combined.plot.scatter(x="ap_per", y="sat_score")
<matplotlib.axes._subplots.AxesSubplot at 0x7fc7b5017fd0>
The scatter plot above shows a not too strong correlation and/or relationship between the percentage of students in a school who take the AP exam, and their average SAT scores.
correlations = combined.corr()
correlations = correlations["AVERAGE CLASS SIZE"]
print(correlations)
SAT Critical Reading Avg. Score 0.376117 SAT Math Avg. Score 0.376290 SAT Writing Avg. Score 0.369994 sat_score 0.381014 AP Test Takers 0.331098 Total Exams Taken 0.307422 Number of Exams with scores 3 4 or 5 0.220873 Total Cohort 0.489342 CSD 0.177610 NUMBER OF STUDENTS / SEATS FILLED 0.554340 NUMBER OF SECTIONS 0.485068 AVERAGE CLASS SIZE 1.000000 SIZE OF SMALLEST CLASS 0.737460 SIZE OF LARGEST CLASS 0.922846 SCHOOLWIDE PUPIL-TEACHER RATIO NaN schoolyear NaN fl_percent NaN frl_percent -0.400651 total_enrollment 0.538782 ell_num 0.127532 ell_percent -0.279265 sped_num 0.411212 sped_percent -0.130716 asian_num 0.417055 asian_per 0.234196 black_num 0.354672 black_per -0.092176 hispanic_num 0.342446 hispanic_per -0.225512 white_num 0.400543 ... N_s 0.546125 N_t 0.493508 N_p 0.469885 saf_p_11 -0.244755 com_p_11 -0.306949 eng_p_11 -0.177926 aca_p_11 -0.262945 saf_t_11 0.068197 com_t_11 0.139299 eng_t_11 0.039632 aca_t_11 0.052221 saf_s_11 -0.113110 com_s_11 -0.237042 eng_s_11 -0.130713 aca_s_11 -0.116830 saf_tot_11 -0.087799 com_tot_11 -0.077659 eng_tot_11 -0.074314 aca_tot_11 -0.091287 grade_span_max NaN expgrade_span_max NaN zip 0.103041 total_students 0.547814 number_programs 0.386975 priority08 NaN priority09 NaN priority10 NaN lat -0.150483 lon -0.006868 ap_per -0.330042 Name: AVERAGE CLASS SIZE, Length: 68, dtype: float64
# Making a scatter plot of AVERAGE CLASS SIZE column vs. the SAT Score
combined.plot.scatter(x="AVERAGE CLASS SIZE", y="sat_score")
<matplotlib.axes._subplots.AxesSubplot at 0x7fc7b52d92e8>
There appears to be no direct relationship or correlation between SAT Scores and the Average Class Size.