import pandas as pd
import numpy
import re
# CSV files, all located under "schools/", that make up the raw inputs.
data_files = [
    "ap_2010.csv",
    "class_size.csv",
    "demographics.csv",
    "graduation.csv",
    "hs_directory.csv",
    "sat_results.csv",
]

# Load each CSV into a DataFrame, keyed by its file name without ".csv".
data = {}
for f in data_files:
    d = pd.read_csv("schools/{0}".format(f))
    data[f.replace(".csv", "")] = d
# Survey columns carried forward into the analysis, grouped by name pattern.
survey_fields = [
    "DBN",
    "rr_s", "rr_t", "rr_p",
    "N_s", "N_t", "N_p",
    "saf_p_11", "com_p_11", "eng_p_11", "aca_p_11",
    "saf_t_11", "com_t_11", "eng_t_11", "aca_t_11",
    "saf_s_11", "com_s_11", "eng_s_11", "aca_s_11",
    "saf_tot_11", "com_tot_11", "eng_tot_11", "aca_tot_11",
]

# Both survey exports are tab-separated and windows-1252 encoded; stack them
# row-wise into a single frame.
all_survey = pd.read_csv(
    "schools/survey_all.txt", delimiter="\t", encoding="windows-1252"
)
d75_survey = pd.read_csv(
    "schools/survey_d75.txt", delimiter="\t", encoding="windows-1252"
)
survey = pd.concat([all_survey, d75_survey], axis=0)

# Expose the lower-case "dbn" column under the upper-case "DBN" name, then
# keep only the selected fields.
survey["DBN"] = survey["dbn"]
survey = survey.loc[:, survey_fields]
data["survey"] = survey

# The high-school directory also stores its identifier as "dbn".
data["hs_directory"]["DBN"] = data["hs_directory"]["dbn"]
def pad_csd(num):
    """Return ``num`` as a string, left-padded with "0" to two characters.

    Values whose string form already has two or more characters are
    returned unchanged (as a string).
    """
    return str(num).zfill(2)
# Build the class-size DBN: zero-padded CSD followed by the school code.
data["class_size"]["padded_csd"] = data["class_size"]["CSD"].apply(pad_csd)
data["class_size"]["DBN"] = data["class_size"]["padded_csd"] + data["class_size"]["SCHOOL CODE"]

# Convert the three SAT section averages to numbers; values that cannot be
# parsed become NaN (errors="coerce").
cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']
for c in cols:
    data["sat_results"][c] = pd.to_numeric(data["sat_results"][c], errors="coerce")

# Total SAT score is the sum of the three sections; a NaN in any section
# makes the total NaN.
data['sat_results']['sat_score'] = data['sat_results'][cols[0]] + data['sat_results'][cols[1]] + data['sat_results'][cols[2]]
# Matches a parenthesised "(<something>, <something>)" pair within one line
# of the location text ("." does not match newlines).
_COORDS_PATTERN = re.compile(r"\(.+, .+\)")


def _find_coords(loc):
    """Return the first "(lat, lon)" substring of ``loc``, or None.

    None is returned when ``loc`` is not a string (for example a NaN from a
    missing value) or contains no coordinate pair, instead of raising.
    """
    if not isinstance(loc, str):
        return None
    match = _COORDS_PATTERN.search(loc)
    return match.group(0) if match else None


def find_lat(loc):
    """Extract the latitude text from a location string containing "(lat, lon)".

    Returns the text between "(" and the first comma as a string, or None
    when no coordinate pair can be found.
    """
    coords = _find_coords(loc)
    if coords is None:
        return None
    return coords.split(",")[0].replace("(", "")


def find_lon(loc):
    """Extract the longitude text from a location string containing "(lat, lon)".

    Returns the text after the first comma with ")" removed and surrounding
    whitespace stripped, or None when no coordinate pair can be found.
    """
    coords = _find_coords(loc)
    if coords is None:
        return None
    return coords.split(",")[1].replace(")", "").strip()
# Derive numeric "lat" and "lon" columns from the "Location 1" text; values
# that cannot be converted to a number become NaN (errors="coerce").
for coord_col, extractor in (("lat", find_lat), ("lon", find_lon)):
    data["hs_directory"][coord_col] = pd.to_numeric(
        data["hs_directory"]["Location 1"].apply(extractor),
        errors="coerce",
    )
# Keep only the rows for grades 09-12 in the "GEN ED" program type.
class_size = data["class_size"]
class_size = class_size[
    (class_size["GRADE "] == "09-12") & (class_size["PROGRAM TYPE"] == "GEN ED")
]

# Collapse to one row per DBN by averaging the numeric columns.
# numeric_only=True is required on pandas >= 2.0, where averaging the
# remaining text columns raises instead of silently dropping them.
class_size = class_size.groupby("DBN").mean(numeric_only=True).reset_index()
data["class_size"] = class_size
# Keep only the demographics rows whose schoolyear equals 20112012.
data["demographics"] = data["demographics"][data["demographics"]["schoolyear"] == 20112012]

# Keep only graduation rows for the "2006" cohort and the "Total Cohort"
# demographic.
data["graduation"] = data["graduation"][data["graduation"]["Cohort"] == "2006"]
data["graduation"] = data["graduation"][data["graduation"]["Demographic"] == "Total Cohort"]

# Convert the AP count columns to numbers; unparseable values become NaN.
cols = ['AP Test Takers ', 'Total Exams Taken', 'Number of Exams with scores 3 4 or 5']
for col in cols:
    data["ap_2010"][col] = pd.to_numeric(data["ap_2010"][col], errors="coerce")
# Start from the SAT results and attach AP and graduation data with left
# joins, so every SAT row is kept even without a match.
combined = data["sat_results"]
combined = combined.merge(data["ap_2010"], on="DBN", how="left")
combined = combined.merge(data["graduation"], on="DBN", how="left")

# Inner joins: keep only the DBNs present in each of these datasets.
to_merge = ["class_size", "demographics", "survey", "hs_directory"]
for m in to_merge:
    combined = combined.merge(data[m], on="DBN", how="inner")

# Fill missing numeric values with the column mean, then anything still
# missing with 0. numeric_only=True keeps this working on pandas >= 2.0,
# where mean() over text columns raises TypeError.
combined = combined.fillna(combined.mean(numeric_only=True))
combined = combined.fillna(0)
def get_first_two_chars(dbn):
    """Return the first two characters of ``dbn``."""
    return dbn[:2]
# The school district is the first two characters of the DBN.
combined["school_dist"] = combined["DBN"].apply(get_first_two_chars)

# Correlate every numeric/boolean column with every other one, then keep the
# correlations against sat_score. Text columns (such as school_dist) are
# excluded explicitly because corr() raises on them in pandas >= 2.0.
correlations = combined.select_dtypes(include=["number", "bool"]).corr()
correlations = correlations["sat_score"]
print(correlations)
SAT Critical Reading Avg. Score 0.986820 SAT Math Avg. Score 0.972643 SAT Writing Avg. Score 0.987771 sat_score 1.000000 AP Test Takers 0.523140 Total Exams Taken 0.514333 Number of Exams with scores 3 4 or 5 0.463245 Total Cohort 0.325144 CSD 0.042948 NUMBER OF STUDENTS / SEATS FILLED 0.394626 NUMBER OF SECTIONS 0.362673 AVERAGE CLASS SIZE 0.381014 SIZE OF SMALLEST CLASS 0.249949 SIZE OF LARGEST CLASS 0.314434 SCHOOLWIDE PUPIL-TEACHER RATIO NaN schoolyear NaN fl_percent NaN frl_percent -0.722225 total_enrollment 0.367857 ell_num -0.153778 ell_percent -0.398750 sped_num 0.034933 sped_percent -0.448170 asian_num 0.475445 asian_per 0.570730 black_num 0.027979 black_per -0.284139 hispanic_num 0.025744 hispanic_per -0.396985 white_num 0.449559 ... rr_p 0.047925 N_s 0.423463 N_t 0.291463 N_p 0.421530 saf_p_11 0.122913 com_p_11 -0.115073 eng_p_11 0.020254 aca_p_11 0.035155 saf_t_11 0.313810 com_t_11 0.082419 eng_t_11 0.036906 aca_t_11 0.132348 saf_s_11 0.337639 com_s_11 0.187370 eng_s_11 0.213822 aca_s_11 0.339435 saf_tot_11 0.318753 com_tot_11 0.077310 eng_tot_11 0.100102 aca_tot_11 0.190966 grade_span_max NaN expgrade_span_max NaN zip -0.063977 total_students 0.407827 number_programs 0.117012 priority08 NaN priority09 NaN priority10 NaN lat -0.121029 lon -0.132222 Name: sat_score, Length: 67, dtype: float64
# Remove DBN since it's a unique identifier, not a useful numerical value for correlation.
# Guarded so that re-running this cell does not raise ValueError once "DBN" is gone.
if "DBN" in survey_fields:
    survey_fields.remove("DBN")
import matplotlib.pyplot as plt
%matplotlib inline
# Notebook display: show the column labels of the merged frame.
combined.columns
Index(['DBN', 'SCHOOL NAME', 'Num of SAT Test Takers', 'SAT Critical Reading Avg. Score', 'SAT Math Avg. Score', 'SAT Writing Avg. Score', 'sat_score', 'SchoolName', 'AP Test Takers ', 'Total Exams Taken', ... 'priority05', 'priority06', 'priority07', 'priority08', 'priority09', 'priority10', 'Location 1', 'lat', 'lon', 'school_dist'], dtype='object', length=160)
# Full correlation matrix over the numeric/boolean columns only; corr()
# raises on text columns in pandas >= 2.0.
correlations = combined.select_dtypes(include=["number", "bool"]).corr()

# Bar chart of each survey field's correlation with sat_score.
correlations["sat_score"][survey_fields].plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x7f2748b11048>
N_s, N_t and N_p, the numbers of student, teacher and parent survey responses, are highly correlated with the SAT score. This makes sense because they are highly correlated with total enrollment. saf_t_11 and saf_s_11, the safety perceptions of teachers and students, are also highly correlated with the SAT score; in reality it is simply hard to teach and learn in unsafe places. aca_t_11 and aca_p_11 (teachers' and parents' perceptions of the tests) are not highly correlated with the score, while aca_s_11 (students' perception) has a very high correlation because the students are the ones who sit for the test.
# Scatter plot of SAT score against the saf_s_11 survey column.
combined.plot(kind="scatter", x="saf_s_11", y="sat_score")
plt.show()
SAT scores are generally between 1000 and 1400 at schools where students' perception of safety is low. Scores rise for a few schools with safety levels above 7. Overall, the influence of student safety perception on the SAT score is not very high.
# Average every numeric column per school district. numeric_only=True is
# needed on pandas >= 2.0, where averaging the text columns raises.
districts = combined.groupby("school_dist").mean(numeric_only=True)
districts
SAT Critical Reading Avg. Score | SAT Math Avg. Score | SAT Writing Avg. Score | sat_score | AP Test Takers | Total Exams Taken | Number of Exams with scores 3 4 or 5 | Total Cohort | CSD | NUMBER OF STUDENTS / SEATS FILLED | ... | grade_span_max | expgrade_span_max | zip | total_students | number_programs | priority08 | priority09 | priority10 | lat | lon | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
school_dist | |||||||||||||||||||||
01 | 441.833333 | 473.333333 | 439.333333 | 1354.500000 | 116.681090 | 173.019231 | 135.800000 | 93.500000 | 1.0 | 115.244241 | ... | 12.0 | 12.0 | 10003.166667 | 659.500000 | 1.333333 | 0.0 | 0.0 | 0.0 | 40.719022 | -73.982377 |
02 | 426.619092 | 444.186256 | 424.832836 | 1295.638184 | 128.908454 | 201.516827 | 157.495833 | 158.647849 | 2.0 | 149.818949 | ... | 12.0 | 12.0 | 10023.770833 | 621.395833 | 1.416667 | 0.0 | 0.0 | 0.0 | 40.739699 | -73.991386 |
03 | 428.529851 | 437.997512 | 426.915672 | 1293.443035 | 156.183494 | 244.522436 | 193.087500 | 183.384409 | 3.0 | 156.005994 | ... | 12.0 | 12.0 | 10023.750000 | 717.916667 | 2.000000 | 0.0 | 0.0 | 0.0 | 40.781574 | -73.977370 |
04 | 402.142857 | 416.285714 | 405.714286 | 1224.142857 | 129.016484 | 183.879121 | 151.035714 | 113.857143 | 4.0 | 132.362265 | ... | 12.0 | 12.0 | 10029.857143 | 580.857143 | 1.142857 | 0.0 | 0.0 | 0.0 | 40.793449 | -73.943215 |
05 | 427.159915 | 438.236674 | 419.666098 | 1285.062687 | 85.722527 | 115.725275 | 142.464286 | 143.677419 | 5.0 | 120.623901 | ... | 12.0 | 12.0 | 10030.142857 | 609.857143 | 1.142857 | 0.0 | 0.0 | 0.0 | 40.817077 | -73.949251 |
06 | 382.011940 | 400.565672 | 382.066269 | 1164.643881 | 108.711538 | 159.715385 | 105.425000 | 180.848387 | 6.0 | 139.041709 | ... | 12.0 | 12.0 | 10036.200000 | 628.900000 | 1.300000 | 0.0 | 0.0 | 0.0 | 40.848970 | -73.932502 |
07 | 376.461538 | 380.461538 | 371.923077 | 1128.846154 | 73.703402 | 112.476331 | 105.276923 | 105.605459 | 7.0 | 97.597416 | ... | 12.0 | 12.0 | 10452.692308 | 465.846154 | 1.461538 | 0.0 | 0.0 | 0.0 | 40.816815 | -73.919971 |
08 | 386.214383 | 395.542741 | 377.908005 | 1159.665129 | 118.379371 | 168.020979 | 144.731818 | 215.510264 | 8.0 | 129.765099 | ... | 12.0 | 12.0 | 10467.000000 | 547.636364 | 1.272727 | 0.0 | 0.0 | 0.0 | 40.823803 | -73.866087 |
09 | 373.755970 | 383.582836 | 374.633134 | 1131.971940 | 71.411538 | 104.265385 | 98.470000 | 113.330645 | 9.0 | 100.118588 | ... | 12.0 | 12.0 | 10456.100000 | 449.700000 | 1.150000 | 0.0 | 0.0 | 0.0 | 40.836349 | -73.906240 |
10 | 403.363636 | 418.000000 | 400.863636 | 1222.227273 | 132.231206 | 226.914336 | 191.618182 | 161.318182 | 10.0 | 168.876526 | ... | 12.0 | 12.0 | 10463.181818 | 757.863636 | 1.500000 | 0.0 | 0.0 | 0.0 | 40.870345 | -73.898360 |
11 | 389.866667 | 394.533333 | 380.600000 | 1165.000000 | 83.813462 | 122.484615 | 108.833333 | 122.866667 | 11.0 | 129.031031 | ... | 12.0 | 12.0 | 10467.933333 | 563.666667 | 1.533333 | 0.0 | 0.0 | 0.0 | 40.873138 | -73.856120 |
12 | 364.769900 | 379.109453 | 357.943781 | 1101.823134 | 93.102564 | 139.442308 | 153.450000 | 110.467742 | 12.0 | 91.684504 | ... | 12.0 | 12.0 | 10463.166667 | 409.000000 | 1.083333 | 0.0 | 0.0 | 0.0 | 40.831412 | -73.886946 |
13 | 409.393800 | 424.127440 | 403.666361 | 1237.187600 | 232.931953 | 382.704142 | 320.773077 | 224.595533 | 13.0 | 218.306055 | ... | 12.0 | 12.0 | 11207.153846 | 895.153846 | 2.076923 | 0.0 | 0.0 | 0.0 | 40.692865 | -73.977016 |
14 | 395.937100 | 398.189765 | 385.333049 | 1179.459915 | 77.798077 | 114.873626 | 123.282143 | 112.347926 | 14.0 | 123.643728 | ... | 12.0 | 12.0 | 11210.785714 | 545.357143 | 2.000000 | 0.0 | 0.0 | 0.0 | 40.711599 | -73.948360 |
15 | 395.679934 | 404.628524 | 390.295854 | 1190.604312 | 94.574786 | 141.581197 | 153.450000 | 104.207885 | 15.0 | 135.707319 | ... | 12.0 | 12.0 | 11214.222222 | 573.111111 | 1.666667 | 0.0 | 0.0 | 0.0 | 40.675972 | -73.989255 |
16 | 371.529851 | 379.164179 | 369.415672 | 1120.109701 | 82.264423 | 126.519231 | 153.450000 | 247.185484 | 16.0 | 177.501282 | ... | 12.0 | 12.0 | 11219.000000 | 440.250000 | 1.750000 | 0.0 | 0.0 | 0.0 | 40.688008 | -73.929686 |
17 | 386.571429 | 394.071429 | 380.785714 | 1161.428571 | 105.583791 | 163.087912 | 111.360714 | 121.357143 | 17.0 | 130.246192 | ... | 12.0 | 12.0 | 11220.642857 | 547.071429 | 1.642857 | 0.0 | 0.0 | 0.0 | 40.660313 | -73.955636 |
18 | 373.454545 | 373.090909 | 371.454545 | 1118.000000 | 129.028846 | 197.038462 | 153.450000 | 72.771261 | 18.0 | 72.209438 | ... | 12.0 | 12.0 | 11224.000000 | 344.000000 | 1.090909 | 0.0 | 0.0 | 0.0 | 40.641863 | -73.914726 |
19 | 367.083333 | 377.583333 | 359.166667 | 1103.833333 | 88.097756 | 124.769231 | 120.670833 | 114.322581 | 19.0 | 105.752625 | ... | 12.0 | 12.0 | 11207.500000 | 440.416667 | 1.916667 | 0.0 | 0.0 | 0.0 | 40.676547 | -73.882158 |
20 | 406.223881 | 465.731343 | 401.732537 | 1273.687761 | 227.805769 | 359.407692 | 177.690000 | 591.374194 | 20.0 | 420.029766 | ... | 12.0 | 12.0 | 11210.200000 | 2521.400000 | 3.800000 | 0.0 | 0.0 | 0.0 | 40.626751 | -74.006191 |
21 | 395.283582 | 421.786974 | 389.242062 | 1206.312619 | 135.467657 | 203.835664 | 142.377273 | 275.351906 | 21.0 | 224.702989 | ... | 12.0 | 12.0 | 11221.000000 | 1098.272727 | 3.272727 | 0.0 | 0.0 | 0.0 | 40.593596 | -73.978465 |
22 | 473.500000 | 502.750000 | 474.250000 | 1450.500000 | 391.007212 | 614.509615 | 370.362500 | 580.250000 | 22.0 | 495.279369 | ... | 12.0 | 12.0 | 11223.000000 | 2149.000000 | 2.250000 | 0.0 | 0.0 | 0.0 | 40.618285 | -73.952288 |
23 | 380.666667 | 398.666667 | 378.000000 | 1157.333333 | 29.000000 | 31.000000 | 153.450000 | 87.000000 | 23.0 | 120.113095 | ... | 12.0 | 12.0 | 11219.000000 | 391.000000 | 1.333333 | 0.0 | 0.0 | 0.0 | 40.668586 | -73.912298 |
24 | 405.846154 | 434.000000 | 402.153846 | 1242.000000 | 126.474852 | 179.094675 | 115.165385 | 234.682382 | 24.0 | 213.471903 | ... | 12.0 | 12.0 | 11206.153846 | 962.461538 | 2.230769 | 0.0 | 0.0 | 0.0 | 40.740621 | -73.911518 |
25 | 437.250000 | 483.500000 | 436.250000 | 1357.000000 | 205.260817 | 279.889423 | 174.793750 | 268.733871 | 25.0 | 280.576007 | ... | 12.0 | 12.0 | 11361.000000 | 1288.875000 | 1.875000 | 0.0 | 0.0 | 0.0 | 40.745414 | -73.815558 |
26 | 445.200000 | 487.600000 | 444.800000 | 1377.600000 | 410.605769 | 632.407692 | 392.090000 | 825.600000 | 26.0 | 595.953216 | ... | 12.0 | 12.0 | 11388.600000 | 2837.400000 | 4.600000 | 0.0 | 0.0 | 0.0 | 40.748507 | -73.759176 |
27 | 407.800000 | 422.200000 | 394.300000 | 1224.300000 | 100.611538 | 145.315385 | 95.125000 | 288.961290 | 27.0 | 249.324536 | ... | 12.0 | 12.0 | 11556.300000 | 1072.000000 | 2.500000 | 0.0 | 0.0 | 0.0 | 40.638828 | -73.807823 |
28 | 445.941655 | 465.997286 | 435.908005 | 1347.846947 | 182.010490 | 273.559441 | 175.336364 | 351.214076 | 28.0 | 255.381164 | ... | 12.0 | 12.0 | 11422.000000 | 1304.272727 | 2.545455 | 0.0 | 0.0 | 0.0 | 40.709344 | -73.806367 |
29 | 395.764925 | 399.457090 | 386.707836 | 1181.929851 | 63.385817 | 96.514423 | 135.268750 | 98.108871 | 29.0 | 88.372155 | ... | 12.0 | 12.0 | 11413.625000 | 474.125000 | 1.250000 | 0.0 | 0.0 | 0.0 | 40.685276 | -73.752740 |
30 | 430.679934 | 465.961857 | 429.740299 | 1326.382090 | 157.231838 | 252.123932 | 115.150000 | 310.526882 | 30.0 | 251.803744 | ... | 12.0 | 12.0 | 11103.000000 | 1123.333333 | 2.555556 | 0.0 | 0.0 | 0.0 | 40.755398 | -73.932306 |
31 | 457.500000 | 472.500000 | 452.500000 | 1382.500000 | 228.908654 | 355.111538 | 194.435000 | 450.787097 | 31.0 | 380.528319 | ... | 12.0 | 12.0 | 10307.100000 | 1847.500000 | 5.000000 | 0.0 | 0.0 | 0.0 | 40.595680 | -74.125726 |
32 | 371.500000 | 385.833333 | 362.166667 | 1119.500000 | 70.342949 | 100.179487 | 83.558333 | 105.333333 | 32.0 | 100.525613 | ... | 12.0 | 12.0 | 11231.666667 | 381.500000 | 1.000000 | 0.0 | 0.0 | 0.0 | 40.696295 | -73.917124 |
32 rows × 67 columns
# Notebook display: show the column labels of the per-district averages.
districts.columns
Index(['SAT Critical Reading Avg. Score', 'SAT Math Avg. Score', 'SAT Writing Avg. Score', 'sat_score', 'AP Test Takers ', 'Total Exams Taken', 'Number of Exams with scores 3 4 or 5', 'Total Cohort', 'CSD', 'NUMBER OF STUDENTS / SEATS FILLED', 'NUMBER OF SECTIONS', 'AVERAGE CLASS SIZE', 'SIZE OF SMALLEST CLASS', 'SIZE OF LARGEST CLASS', 'SCHOOLWIDE PUPIL-TEACHER RATIO', 'schoolyear', 'fl_percent', 'frl_percent', 'total_enrollment', 'ell_num', 'ell_percent', 'sped_num', 'sped_percent', 'asian_num', 'asian_per', 'black_num', 'black_per', 'hispanic_num', 'hispanic_per', 'white_num', 'white_per', 'male_num', 'male_per', 'female_num', 'female_per', 'rr_s', 'rr_t', 'rr_p', 'N_s', 'N_t', 'N_p', 'saf_p_11', 'com_p_11', 'eng_p_11', 'aca_p_11', 'saf_t_11', 'com_t_11', 'eng_t_11', 'aca_t_11', 'saf_s_11', 'com_s_11', 'eng_s_11', 'aca_s_11', 'saf_tot_11', 'com_tot_11', 'eng_tot_11', 'aca_tot_11', 'grade_span_max', 'expgrade_span_max', 'zip', 'total_students', 'number_programs', 'priority08', 'priority09', 'priority10', 'lat', 'lon'], dtype='object')
from mpl_toolkits.basemap import Basemap
# Mercator-projection map limited to a fixed latitude/longitude box given by
# its lower-left and upper-right corners.
m = Basemap(
    projection="merc",
    llcrnrlat=40.496044,
    urcrnrlat=40.915256,
    llcrnrlon=-74.255735,
    urcrnrlon=-73.700272,
    resolution="i",
)

# Background fill, then coastlines and rivers drawn in the same thin line style.
m.drawmapboundary(fill_color="#85A6D9")
for draw_feature in (m.drawcoastlines, m.drawrivers):
    draw_feature(color="#6D5F47", linewidth=.4)

# One point per district at its mean coordinates, coloured by the district's
# mean saf_tot_11 value.
longitudes = districts["lon"].tolist()
latitudes = districts["lat"].tolist()
m.scatter(
    longitudes,
    latitudes,
    s=20,
    zorder=2,
    latlon=True,
    c=districts["saf_tot_11"],
    cmap="summer",
)
plt.show()
It looks like Upper Manhattan and parts of Queens and the Bronx tend to have higher safety scores, whereas Brooklyn has low safety scores.
# Bar chart of the correlation between sat_score and each race-percentage
# column.
races = [
    "white_per",
    "asian_per",
    "black_per",
    "hispanic_per",
]
correlations["sat_score"][races].plot(kind="bar")
<matplotlib.axes._subplots.AxesSubplot at 0x7f2741f20cc0>
The percentages of white and Asian students correlate positively with SAT scores, while the percentages of Black and Hispanic students correlate negatively.
# Scatter plot of SAT score against the hispanic_per column.
combined.plot(kind="scatter", x="hispanic_per", y="sat_score")
plt.show()
The average SAT score at schools with a large share of Hispanic students is between 1000 and 1250, which is below a good college-entry average (https://blog.prepscholar.com/sat-scores-for-colleges). The scatter plot also shows that SAT scores are high at schools with a low percentage of Hispanic students and drop as that percentage increases.
# Notebook display: rows whose hispanic_per is greater than 95.
combined[combined["hispanic_per"] > 95]
DBN | SCHOOL NAME | Num of SAT Test Takers | SAT Critical Reading Avg. Score | SAT Math Avg. Score | SAT Writing Avg. Score | sat_score | SchoolName | AP Test Takers | Total Exams Taken | ... | priority05 | priority06 | priority07 | priority08 | priority09 | priority10 | Location 1 | lat | lon | school_dist | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
44 | 02M542 | MANHATTAN BRIDGES HIGH SCHOOL | 66 | 336.0 | 378.0 | 344.0 | 1058.0 | Manhattan Bridges High School | 67.000000 | 102.000000 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 525 West 50Th Street\nNew York, NY 10019\n(40.... | 40.765027 | -73.992517 | 02 |
82 | 06M348 | WASHINGTON HEIGHTS EXPEDITIONARY LEARNING SCHOOL | 70 | 380.0 | 395.0 | 399.0 | 1174.0 | 0 | 129.028846 | 197.038462 | ... | Then to New York City residents | 0 | 0 | 0 | 0 | 0 | 511 West 182Nd Street\nNew York, NY 10033\n(40... | 40.848879 | -73.930807 | 06 |
89 | 06M552 | GREGORIO LUPERON HIGH SCHOOL FOR SCIENCE AND M... | 56 | 339.0 | 349.0 | 326.0 | 1014.0 | GREGORIO LUPERON HS SCI & MATH | 88.000000 | 138.000000 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 501 West 165Th\nNew York, NY 10032\n(40.838032... | 40.838032 | -73.938371 | 06 |
125 | 09X365 | ACADEMY FOR LANGUAGE AND TECHNOLOGY | 54 | 315.0 | 339.0 | 297.0 | 951.0 | Academy for Language and Technology | 20.000000 | 20.000000 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1700 Macombs Road\nBronx, NY 10453\n(40.849102... | 40.849102 | -73.916088 | 09 |
141 | 10X342 | INTERNATIONAL SCHOOL FOR LIBERAL ARTS | 49 | 300.0 | 333.0 | 301.0 | 934.0 | International School for Liberal Arts | 55.000000 | 73.000000 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 2780 Reservoir Avenue\nBronx, NY 10468\n(40.87... | 40.870377 | -73.898163 | 10 |
176 | 12X388 | PAN AMERICAN INTERNATIONAL HIGH SCHOOL AT MONROE | 30 | 321.0 | 351.0 | 298.0 | 970.0 | 0 | 129.028846 | 197.038462 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1300 Boynton Avenue\nBronx, NY 10472\n(40.8313... | 40.831366 | -73.878823 | 12 |
253 | 19K583 | MULTICULTURAL HIGH SCHOOL | 29 | 279.0 | 322.0 | 286.0 | 887.0 | Multicultural High School | 44.000000 | 44.000000 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 999 Jamaica Avenue\nBrooklyn, NY 11208\n(40.69... | 40.691144 | -73.868426 | 19 |
286 | 24Q296 | PAN AMERICAN INTERNATIONAL HIGH SCHOOL | 55 | 317.0 | 323.0 | 311.0 | 951.0 | 0 | 129.028846 | 197.038462 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 45-10 94Th Street\nElmhurst, NY 11373\n(40.743... | 40.743303 | -73.870575 | 24 |
8 rows × 160 columns
These schools have been set up for people who have just arrived in the United States and are still learning English. This could explain the low SAT scores.
# Notebook display: rows with hispanic_per below 10 and sat_score above 1800.
combined[(combined["hispanic_per"] < 10) & (combined["sat_score"] > 1800)]
DBN | SCHOOL NAME | Num of SAT Test Takers | SAT Critical Reading Avg. Score | SAT Math Avg. Score | SAT Writing Avg. Score | sat_score | SchoolName | AP Test Takers | Total Exams Taken | ... | priority05 | priority06 | priority07 | priority08 | priority09 | priority10 | Location 1 | lat | lon | school_dist | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
37 | 02M475 | STUYVESANT HIGH SCHOOL | 832 | 679.0 | 735.0 | 682.0 | 2096.0 | STUYVESANT HS | 1510.0 | 2819.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 345 Chambers Street\nNew York, NY 10282\n(40.7... | 40.717746 | -74.014049 | 02 |
151 | 10X445 | BRONX HIGH SCHOOL OF SCIENCE | 731 | 632.0 | 688.0 | 649.0 | 1969.0 | BRONX HS OF SCIENCE | 1190.0 | 2435.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 75 West 205 Street\nBronx, NY 10468\n(40.87705... | 40.877056 | -73.889780 | 10 |
187 | 13K430 | BROOKLYN TECHNICAL HIGH SCHOOL | 1277 | 587.0 | 659.0 | 587.0 | 1833.0 | BROOKLYN TECHNICAL HS | 2117.0 | 3692.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 29 Ft Greene Place\nBrooklyn, NY 11217\n(40.68... | 40.688107 | -73.976745 | 13 |
327 | 28Q687 | QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK CO... | 121 | 612.0 | 660.0 | 596.0 | 1868.0 | Queens HS for Science York Colllege | 215.0 | 338.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 94-50 159 Street\nJamaica, NY 11433\n(40.70099... | 40.700999 | -73.798154 | 28 |
356 | 31R605 | STATEN ISLAND TECHNICAL HIGH SCHOOL | 227 | 635.0 | 682.0 | 636.0 | 1953.0 | STATEN ISLAND TECHNICAL HS | 528.0 | 905.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 485 Clawson Street\nStaten Island, NY 10306\n(... | 40.567913 | -74.115362 | 31 |
5 rows × 160 columns
These schools specialise in science and technology and receive special funding to enrol students who have performed very well, even if they cannot afford the school.
# Bar chart of the correlation between sat_score and each gender-percentage
# column.
genders = [
    "male_per",
    "female_per",
]
correlations["sat_score"][genders].plot(kind="bar")
<matplotlib.axes._subplots.AxesSubplot at 0x7f2741e94240>
The female percentage correlates positively with the SAT score, and the male percentage correlates negatively.
# Scatter plot of SAT score against the female_per column.
combined.plot(kind="scatter", x="female_per", y="sat_score")
plt.show()
Based on the scatterplot, there doesn't seem to be any real correlation between sat_score and female_per. However, there is a cluster of schools with a high percentage of females (40 to 80), and high SAT scores.
# Notebook display: names of schools with female_per above 50 and sat_score
# above 1700.
combined[(combined["female_per"] > 50) & (combined["sat_score"] > 1700)]["SCHOOL NAME"]
5 BARD HIGH SCHOOL EARLY COLLEGE 26 ELEANOR ROOSEVELT HIGH SCHOOL 60 BEACON HIGH SCHOOL 61 FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & A... 302 TOWNSEND HARRIS HIGH SCHOOL Name: SCHOOL NAME, dtype: object
These schools appear to be very selective liberal arts schools that have high academic standards.
# ap_per is the ratio of AP test takers to total enrollment (a fraction, not
# a value scaled to 100).
combined["ap_per"] = combined["AP Test Takers "] / combined["total_enrollment"]

# Scatter plot of SAT score against that ratio.
combined.plot(kind="scatter", x="ap_per", y="sat_score")
plt.show()
It looks like there is a relationship between the percentage of students in a school who take the AP exam, and their average SAT scores. It's not an extremely strong correlation, though.
# Bar chart of the correlation between sat_score and each class-size column.
class_size_1 = [
    "NUMBER OF STUDENTS / SEATS FILLED",
    "NUMBER OF SECTIONS",
    "AVERAGE CLASS SIZE",
    "SIZE OF SMALLEST CLASS",
    "SIZE OF LARGEST CLASS",
    "SCHOOLWIDE PUPIL-TEACHER RATIO",
]
correlations["sat_score"][class_size_1].plot(kind="bar")
<matplotlib.axes._subplots.AxesSubplot at 0x7f2741d00278>
The school-size parameters (the number of seats filled and the class sizes) are all positively correlated with the SAT score.
# Scatter plot of SAT score against the number of students / seats filled.
combined.plot(kind="scatter", x="NUMBER OF STUDENTS / SEATS FILLED", y="sat_score")
plt.show()
There is a steady rise in the SAT score as the number of students increases. Most of the schools have few students.
# Scatter plot of SAT score against the average class size.
combined.plot(kind="scatter", x="AVERAGE CLASS SIZE", y="sat_score")
plt.show()
As in the previous analysis, the average class size is low, but there is a cluster of schools with scores above 1600 whose class sizes are as ordinary as the others'. Let me extract them for closer analysis.
# Notebook display: names of schools with an average class size above 20 and
# sat_score above 1700.
combined[(combined["AVERAGE CLASS SIZE"] > 20) & (combined["sat_score"] > 1700)]["SCHOOL NAME"]
5 BARD HIGH SCHOOL EARLY COLLEGE 26 ELEANOR ROOSEVELT HIGH SCHOOL 37 STUYVESANT HIGH SCHOOL 60 BEACON HIGH SCHOOL 61 FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & A... 79 HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGIN... 151 BRONX HIGH SCHOOL OF SCIENCE 155 HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE 187 BROOKLYN TECHNICAL HIGH SCHOOL 198 BROOKLYN LATIN SCHOOL, THE 302 TOWNSEND HARRIS HIGH SCHOOL 327 QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK CO... 356 STATEN ISLAND TECHNICAL HIGH SCHOOL Name: SCHOOL NAME, dtype: object
These schools either focus on science and technology or are liberal arts schools. The science schools have extra financial support and admit only strong performers, while the liberal arts schools maintain high standards.
We will obtain the student-teacher ratio by dividing the number of students who responded to the survey (N_s) by the number of teachers who responded to the survey (N_t). I would have used the total number of students from the class-size data, but it gives unrealistic values (above 20, which is far above the NY State average of 13.1 (https://www.publicschoolreview.com/student-teacher-ratio-stats/new-york)), which could be because not all teachers participated in the survey. We then look at its relationship with the SAT scores.
# Overwrite the pupil-teacher ratio column with N_s divided by N_t.
combined["SCHOOLWIDE PUPIL-TEACHER RATIO"] = combined["N_s"] / combined["N_t"]

# Scatter plot of SAT score against that ratio.
combined.plot(kind="scatter", x="SCHOOLWIDE PUPIL-TEACHER RATIO", y="sat_score")
plt.show()
There is a relationship: most of the schools are close to the state average. However, there are again some schools that achieve exceptional results with the same ratios.
# Notebook display: names of schools with a pupil-teacher ratio above 10 and
# sat_score above 1700.
combined[(combined["SCHOOLWIDE PUPIL-TEACHER RATIO"] > 10) & (combined["sat_score"] > 1700)]["SCHOOL NAME"]
5 BARD HIGH SCHOOL EARLY COLLEGE 26 ELEANOR ROOSEVELT HIGH SCHOOL 37 STUYVESANT HIGH SCHOOL 60 BEACON HIGH SCHOOL 61 FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & A... 79 HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGIN... 151 BRONX HIGH SCHOOL OF SCIENCE 155 HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE 187 BROOKLYN TECHNICAL HIGH SCHOOL 198 BROOKLYN LATIN SCHOOL, THE 302 TOWNSEND HARRIS HIGH SCHOOL 327 QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK CO... 356 STATEN ISLAND TECHNICAL HIGH SCHOOL Name: SCHOOL NAME, dtype: object
It's still the same special schools from the previous results. They might have the same class ratio but have external funding to train scientists or to develop high-performing students in the liberal arts.