import pandas as pd
import numpy as np
import numpy
import re
%matplotlib inline
data_files = [
"ap_2010.csv",
"class_size.csv",
"demographics.csv",
"graduation.csv",
"hs_directory.csv",
"sat_results.csv"
]
data = {}
for f in data_files:
d = pd.read_csv(r"C:\Users\Alima\OneDrive\Рабочий стол\PYTHON\Projects\SAT Schools Project\{0}".format(f))
data[f.replace(".csv", "")] = d
all_survey = pd.read_csv(r"C:\Users\Alima\OneDrive\Рабочий стол\PYTHON\Projects\SAT Schools Project\survey_all.txt", delimiter="\t", encoding='windows-1252')
d75_survey = pd.read_csv(r"C:\Users\Alima\OneDrive\Рабочий стол\PYTHON\Projects\SAT Schools Project\survey_d75.txt", delimiter="\t", encoding='windows-1252')
survey = pd.concat([all_survey, d75_survey], axis=0)
survey["DBN"] = survey["dbn"]
survey_fields = [
"DBN",
"rr_s",
"rr_t",
"rr_p",
"N_s",
"N_t",
"N_p",
"saf_p_11",
"com_p_11",
"eng_p_11",
"aca_p_11",
"saf_t_11",
"com_t_11",
"eng_t_11",
"aca_t_11",
"saf_s_11",
"com_s_11",
"eng_s_11",
"aca_s_11",
"saf_tot_11",
"com_tot_11",
"eng_tot_11",
"aca_tot_11",
]
survey = survey.loc[:,survey_fields]
data["survey"] = survey
data["hs_directory"]["DBN"] = data["hs_directory"]["dbn"]
def pad_csd(num):
string_representation = str(num)
if len(string_representation) > 1:
return string_representation
else:
return "0" + string_representation
data["class_size"]["padded_csd"] = data["class_size"]["CSD"].apply(pad_csd)
data["class_size"]["DBN"] = data["class_size"]["padded_csd"] + data["class_size"]["SCHOOL CODE"]
cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']
for c in cols:
data["sat_results"][c] = pd.to_numeric(data["sat_results"][c], errors="coerce")
data['sat_results']['sat_score'] = data['sat_results'][cols[0]] + data['sat_results'][cols[1]] + data['sat_results'][cols[2]]
def find_lat(loc):
coords = re.findall("\(.+, .+\)", loc)
lat = coords[0].split(",")[0].replace("(", "")
return lat
def find_lon(loc):
coords = re.findall("\(.+, .+\)", loc)
lon = coords[0].split(",")[1].replace(")", "").strip()
return lon
data["hs_directory"]["lat"] = data["hs_directory"]["Location 1"].apply(find_lat)
data["hs_directory"]["lon"] = data["hs_directory"]["Location 1"].apply(find_lon)
data["hs_directory"]["lat"] = pd.to_numeric(data["hs_directory"]["lat"], errors="coerce")
data["hs_directory"]["lon"] = pd.to_numeric(data["hs_directory"]["lon"], errors="coerce")
class_size = data["class_size"]
class_size = class_size[class_size["GRADE "] == "09-12"]
class_size = class_size[class_size["PROGRAM TYPE"] == "GEN ED"]
class_size = class_size.groupby("DBN").agg(numpy.mean)
class_size.reset_index(inplace=True)
data["class_size"] = class_size
data["demographics"] = data["demographics"][data["demographics"]["schoolyear"] == 20112012]
data["graduation"] = data["graduation"][data["graduation"]["Cohort"] == "2006"]
data["graduation"] = data["graduation"][data["graduation"]["Demographic"] == "Total Cohort"]
cols = ['AP Test Takers ', 'Total Exams Taken', 'Number of Exams with scores 3 4 or 5']
for col in cols:
data["ap_2010"][col] = pd.to_numeric(data["ap_2010"][col], errors="coerce")
combined = data["sat_results"]
combined = combined.merge(data["ap_2010"], on="DBN", how="left")
combined = combined.merge(data["graduation"], on="DBN", how="left")
to_merge = ["class_size", "demographics", "survey", "hs_directory"]
for m in to_merge:
combined = combined.merge(data[m], on="DBN", how="inner")
combined = combined.fillna(combined.mean())
combined = combined.fillna(0)
def get_first_two_chars(dbn):
return dbn[0:2]
combined["school_dist"] = combined["DBN"].apply(get_first_two_chars)
correlations = combined.corr()
correlations = correlations["sat_score"]
print(correlations)
SAT Critical Reading Avg. Score 0.986820 SAT Math Avg. Score 0.972643 SAT Writing Avg. Score 0.987771 sat_score 1.000000 AP Test Takers 0.523140 ... Census Tract 0.048737 BIN 0.052232 BBL 0.044427 lat -0.121029 lon -0.132222 Name: sat_score, Length: 85, dtype: float64
# Remove DBN since it's a unique identifier, not a useful numerical value for correlation.
survey_fields.remove("DBN")
combined.corr()['sat_score'][survey_fields].plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x286597c5748>
Looking at the bar plot above we can consider fields correlating highly with sat_score
if R value is above 0.23. The following fields seem to have the highest correlation: N-s, N-t, N_p, saf_t_11, saf_s_11, aca_s_11, saf_tot_11
.
When the correlation of the number of respondents (students, teachers, and parents) with sat_score
is not that surprising, there are three fields with high correlation that relate to safety and respect score at school. That means the relationship between safety and respect at school and SAT scores is something worth looking into.
combined.plot.scatter(x='saf_s_11', y='sat_score')
<matplotlib.axes._subplots.AxesSubplot at 0x2865bb9cfc8>
Above we plotted the saf_s_11
column vs. sat_score
and we can see that the correlation is not that strong since the dots are not aligned and we can see that even when the safety score is high at some schools the SAT score is still low.
Now, we will map out safety scores for each district by grouping combined
by school_dist
and compute the average values using agg
function and numpy.mean
function.
districts = combined.groupby('school_dist').agg(numpy.mean)
districts.reset_index(inplace=True)
districts
school_dist | SAT Critical Reading Avg. Score | SAT Math Avg. Score | SAT Writing Avg. Score | sat_score | AP Test Takers | Total Exams Taken | Number of Exams with scores 3 4 or 5 | Total Cohort | Total Grads - % of cohort | ... | priority08 | priority09 | priority10 | Community Board | Council District | Census Tract | BIN | BBL | lat | lon | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 01 | 441.833333 | 473.333333 | 439.333333 | 1354.500000 | 116.681090 | 173.019231 | 135.800000 | 93.500000 | 71.333333 | ... | 0.0 | 0.0 | 0.0 | 3.000000 | 1.500000 | 1106.833333 | 1.004144e+06 | 1.003302e+09 | 40.719022 | -73.982377 |
1 | 02 | 426.619092 | 444.186256 | 424.832836 | 1295.638184 | 128.908454 | 201.516827 | 157.495833 | 158.647849 | 75.453574 | ... | 0.0 | 0.0 | 0.0 | 4.204005 | 2.963296 | 1132.479744 | 1.103862e+06 | 1.070438e+09 | 40.739699 | -73.991386 |
2 | 03 | 428.529851 | 437.997512 | 426.915672 | 1293.443035 | 156.183494 | 244.522436 | 193.087500 | 183.384409 | 74.029937 | ... | 0.0 | 0.0 | 0.0 | 7.500000 | 6.500000 | 166.666667 | 1.034931e+06 | 1.012833e+09 | 40.781574 | -73.977370 |
3 | 04 | 402.142857 | 416.285714 | 405.714286 | 1224.142857 | 129.016484 | 183.879121 | 151.035714 | 113.857143 | 76.257143 | ... | 0.0 | 0.0 | 0.0 | 11.000000 | 8.000000 | 2637.000000 | 1.055874e+06 | 1.016681e+09 | 40.793449 | -73.943215 |
4 | 05 | 427.159915 | 438.236674 | 419.666098 | 1285.062687 | 85.722527 | 115.725275 | 142.464286 | 143.677419 | 75.200881 | ... | 0.0 | 0.0 | 0.0 | 9.428571 | 8.142857 | 9372.571429 | 1.063080e+06 | 1.019721e+09 | 40.817077 | -73.949251 |
5 | 06 | 382.011940 | 400.565672 | 382.066269 | 1164.643881 | 108.711538 | 159.715385 | 105.425000 | 180.848387 | 69.811233 | ... | 0.0 | 0.0 | 0.0 | 11.179224 | 10.823823 | 2765.702770 | 1.226037e+06 | 1.170781e+09 | 40.848970 | -73.932502 |
6 | 07 | 376.461538 | 380.461538 | 371.923077 | 1128.846154 | 73.703402 | 112.476331 | 105.276923 | 105.605459 | 67.146628 | ... | 0.0 | 0.0 | 0.0 | 1.692308 | 12.846154 | 58.538462 | 2.032562e+06 | 2.024050e+09 | 40.816815 | -73.919971 |
7 | 08 | 386.214383 | 395.542741 | 377.908005 | 1159.665129 | 118.379371 | 168.020979 | 144.731818 | 215.510264 | 59.514477 | ... | 0.0 | 0.0 | 0.0 | 7.272727 | 16.727273 | 67.909091 | 2.037043e+06 | 2.036723e+09 | 40.823803 | -73.866087 |
8 | 09 | 373.755970 | 383.582836 | 374.633134 | 1131.971940 | 71.411538 | 104.265385 | 98.470000 | 113.330645 | 67.607962 | ... | 0.0 | 0.0 | 0.0 | 3.550000 | 15.750000 | 5243.750000 | 2.022457e+06 | 2.027913e+09 | 40.836349 | -73.906240 |
9 | 10 | 403.363636 | 418.000000 | 400.863636 | 1222.227273 | 132.231206 | 226.914336 | 191.618182 | 161.318182 | 68.656958 | ... | 0.0 | 0.0 | 0.0 | 7.000000 | 12.227273 | 1686.045455 | 1.856268e+06 | 1.850558e+09 | 40.870345 | -73.898360 |
10 | 11 | 389.866667 | 394.533333 | 380.600000 | 1165.000000 | 83.813462 | 122.484615 | 108.833333 | 122.866667 | 70.713333 | ... | 0.0 | 0.0 | 0.0 | 11.466667 | 12.266667 | 6469.733333 | 2.064514e+06 | 2.046691e+09 | 40.873138 | -73.856120 |
11 | 12 | 364.769900 | 379.109453 | 357.943781 | 1101.823134 | 93.102564 | 139.442308 | 153.450000 | 110.467742 | 65.459361 | ... | 0.0 | 0.0 | 0.0 | 5.083333 | 17.000000 | 120.083333 | 2.024516e+06 | 2.032603e+09 | 40.831412 | -73.886946 |
12 | 13 | 409.393800 | 424.127440 | 403.666361 | 1237.187600 | 232.931953 | 382.704142 | 320.773077 | 224.595533 | 74.685090 | ... | 0.0 | 0.0 | 0.0 | 2.615385 | 33.846154 | 100.846154 | 3.149543e+06 | 3.009073e+09 | 40.692865 | -73.977016 |
13 | 14 | 395.937100 | 398.189765 | 385.333049 | 1179.459915 | 77.798077 | 114.873626 | 123.282143 | 112.347926 | 69.436155 | ... | 0.0 | 0.0 | 0.0 | 1.142857 | 33.642857 | 491.142857 | 3.142769e+06 | 3.026164e+09 | 40.711599 | -73.948360 |
14 | 15 | 395.679934 | 404.628524 | 390.295854 | 1190.604312 | 94.574786 | 141.581197 | 153.450000 | 104.207885 | 64.505898 | ... | 0.0 | 0.0 | 0.0 | 5.666667 | 36.222222 | 95.333333 | 3.091899e+06 | 3.006329e+09 | 40.675972 | -73.989255 |
15 | 16 | 371.529851 | 379.164179 | 369.415672 | 1120.109701 | 82.264423 | 126.519231 | 153.450000 | 247.185484 | 64.326541 | ... | 0.0 | 0.0 | 0.0 | 3.000000 | 37.250000 | 307.000000 | 3.131325e+06 | 3.016038e+09 | 40.688008 | -73.929686 |
16 | 17 | 386.571429 | 394.071429 | 380.785714 | 1161.428571 | 105.583791 | 163.087912 | 111.360714 | 121.357143 | 68.564286 | ... | 0.0 | 0.0 | 0.0 | 10.785714 | 37.857143 | 556.142857 | 3.178521e+06 | 3.033695e+09 | 40.660313 | -73.955636 |
17 | 18 | 373.454545 | 373.090909 | 371.454545 | 1118.000000 | 129.028846 | 197.038462 | 153.450000 | 72.771261 | 70.453082 | ... | 0.0 | 0.0 | 0.0 | 17.636364 | 45.272727 | 950.727273 | 3.182085e+06 | 3.068627e+09 | 40.641863 | -73.914726 |
18 | 19 | 367.083333 | 377.583333 | 359.166667 | 1103.833333 | 88.097756 | 124.769231 | 120.670833 | 114.322581 | 61.209361 | ... | 0.0 | 0.0 | 0.0 | 5.000000 | 39.500000 | 1158.833333 | 3.108754e+06 | 3.039994e+09 | 40.676547 | -73.882158 |
19 | 20 | 406.223881 | 465.731343 | 401.732537 | 1273.687761 | 227.805769 | 359.407692 | 177.690000 | 591.374194 | 65.830616 | ... | 0.0 | 0.0 | 0.0 | 11.000000 | 43.400000 | 199.200000 | 3.143345e+06 | 3.058004e+09 | 40.626751 | -74.006191 |
20 | 21 | 395.283582 | 421.786974 | 389.242062 | 1206.312619 | 135.467657 | 203.835664 | 142.377273 | 275.351906 | 62.987204 | ... | 0.0 | 0.0 | 0.0 | 12.909091 | 46.818182 | 360.727273 | 3.187467e+06 | 3.069480e+09 | 40.593596 | -73.978465 |
21 | 22 | 473.500000 | 502.750000 | 474.250000 | 1450.500000 | 391.007212 | 614.509615 | 370.362500 | 580.250000 | 86.325000 | ... | 0.0 | 0.0 | 0.0 | 12.750000 | 45.000000 | 633.500000 | 3.260954e+06 | 3.071058e+09 | 40.618285 | -73.952288 |
22 | 23 | 380.666667 | 398.666667 | 378.000000 | 1157.333333 | 29.000000 | 31.000000 | 153.450000 | 87.000000 | 68.766667 | ... | 0.0 | 0.0 | 0.0 | 16.000000 | 41.000000 | 12783.000000 | 3.067259e+06 | 3.028433e+09 | 40.668586 | -73.912298 |
23 | 24 | 405.846154 | 434.000000 | 402.153846 | 1242.000000 | 126.474852 | 179.094675 | 115.165385 | 234.682382 | 69.546628 | ... | 0.0 | 0.0 | 0.0 | 2.846154 | 25.692308 | 3626.538462 | 4.116109e+06 | 4.009562e+09 | 40.740621 | -73.911518 |
24 | 25 | 437.250000 | 483.500000 | 436.250000 | 1357.000000 | 205.260817 | 279.889423 | 174.793750 | 268.733871 | 71.356635 | ... | 0.0 | 0.0 | 0.0 | 8.000000 | 21.875000 | 10598.875000 | 4.217178e+06 | 4.058888e+09 | 40.745414 | -73.815558 |
25 | 26 | 445.200000 | 487.600000 | 444.800000 | 1377.600000 | 410.605769 | 632.407692 | 392.090000 | 825.600000 | 76.180000 | ... | 0.0 | 0.0 | 0.0 | 11.800000 | 21.600000 | 28776.600000 | 4.228215e+06 | 4.072510e+09 | 40.748507 | -73.759176 |
26 | 27 | 407.800000 | 422.200000 | 394.300000 | 1224.300000 | 100.611538 | 145.315385 | 95.125000 | 288.961290 | 69.831233 | ... | 0.0 | 0.0 | 0.0 | 12.000000 | 30.500000 | 39899.900000 | 4.285880e+06 | 4.131440e+09 | 40.638828 | -73.807823 |
27 | 28 | 445.941655 | 465.997286 | 435.908005 | 1347.846947 | 182.010490 | 273.559441 | 175.336364 | 351.214076 | 77.315037 | ... | 0.0 | 0.0 | 0.0 | 8.727273 | 25.818182 | 498.272727 | 4.336396e+06 | 4.086008e+09 | 40.709344 | -73.806367 |
28 | 29 | 395.764925 | 399.457090 | 386.707836 | 1181.929851 | 63.385817 | 96.514423 | 135.268750 | 98.108871 | 73.044135 | ... | 0.0 | 0.0 | 0.0 | 12.250000 | 28.500000 | 13638.750000 | 4.256972e+06 | 4.119641e+09 | 40.685276 | -73.752740 |
29 | 30 | 430.679934 | 465.961857 | 429.740299 | 1326.382090 | 157.231838 | 252.123932 | 115.150000 | 310.526882 | 73.611796 | ... | 0.0 | 0.0 | 0.0 | 1.222222 | 25.111111 | 52.777778 | 4.166018e+06 | 4.004830e+09 | 40.755398 | -73.932306 |
30 | 31 | 457.500000 | 472.500000 | 452.500000 | 1382.500000 | 228.908654 | 355.111538 | 194.435000 | 450.787097 | 77.345308 | ... | 0.0 | 0.0 | 0.0 | 1.800000 | 50.000000 | 6752.500000 | 5.093752e+06 | 5.022487e+09 | 40.595680 | -74.125726 |
31 | 32 | 371.500000 | 385.833333 | 362.166667 | 1119.500000 | 70.342949 | 100.179487 | 83.558333 | 105.333333 | 65.466667 | ... | 0.0 | 0.0 | 0.0 | 4.000000 | 36.000000 | 420.666667 | 3.131944e+06 | 3.033185e+09 | 40.696295 | -73.917124 |
32 rows × 86 columns
Now, we will plot the map of NYC and then we can map out the safety scores by districts on top of it using scatter plot which would help us explore the school districts with high/low safety scores.
# Plotting the map of NYC
import os
os.environ['PROJ_LIB'] = r'C:\Users\Alima\anaconda3\pkgs\proj4-5.2.0-ha925a31_1\Library\share'
from mpl_toolkits.basemap import Basemap
m = Basemap(
projection='merc',
llcrnrlat=40.496044,
urcrnrlat=40.915256,
llcrnrlon=-74.255735,
urcrnrlon=-73.700272,
resolution='i'
)
m.drawmapboundary(fill_color='#85A6D9')
m.drawcoastlines(color='#6D5F47', linewidth=.4)
m.drawrivers(color='#6D5F47', linewidth=.4)
longitudes = districts['lon'].tolist()
latitudes = districts['lat'].tolist()
m.scatter(longitudes, latitudes, s=50, zorder=2, latlon=True, c=districts['saf_s_11'], cmap='summer')
C:\Users\Alima\anaconda3\lib\site-packages\ipykernel_launcher.py:13: MatplotlibDeprecationWarning: The dedent function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use inspect.cleandoc instead. del sys.path[0] C:\Users\Alima\anaconda3\lib\site-packages\ipykernel_launcher.py:18: MatplotlibDeprecationWarning: The dedent function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use inspect.cleandoc instead.
<matplotlib.collections.PathCollection at 0x2865bb6c748>
If we take a look at our map, we can observe districts where schools with high safety score shaded in yellow. Most of them are located in Manhattan, Bronx and very little are in Queens and Brooklyn. Whereas the majority of schools with low safety score are saturated in Brooklyn, and very little in Bronx. Safety level of schools in Staten Island can be considered medium.
race = ['white_per', 'asian_per', 'black_per', 'hispanic_per']
combined.corr()['sat_score'][race].plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x2865bb66908>
The bar plot demonstrates the racial inequality in SAT performance. The high percentage of white and asian students correlate positively with high SAT scores which means the students of these racial groups consistently perfrom better on the SAT, whereas the percentage of black and hispanic students correlate negatively. This inequality serves as an evidence that the SAT might be unfair towards certain racial groups.
Next step is to explore schools with low SAT scores and high values for hispanic_per
.
combined.plot.scatter(x='hispanic_per', y='sat_score')
<matplotlib.axes._subplots.AxesSubplot at 0x2865bb45648>
The scatter plot above indicates that schools with high concentration of hispanic students (more than 50%) do not score higher than 1400 points on SAT. Only small fraction of schools with under 20% of hispanic students reach higher SAT scores.
We will research schools with a hispanic_per
greater than 95%:
# Filtering the dataframe
hispanic_high = combined[combined['hispanic_per'] > 95]
# Identifying school names
hispanic_high['SCHOOL NAME']
44 MANHATTAN BRIDGES HIGH SCHOOL 82 WASHINGTON HEIGHTS EXPEDITIONARY LEARNING SCHOOL 89 GREGORIO LUPERON HIGH SCHOOL FOR SCIENCE AND M... 125 ACADEMY FOR LANGUAGE AND TECHNOLOGY 141 INTERNATIONAL SCHOOL FOR LIBERAL ARTS 176 PAN AMERICAN INTERNATIONAL HIGH SCHOOL AT MONROE 253 MULTICULTURAL HIGH SCHOOL 286 PAN AMERICAN INTERNATIONAL HIGH SCHOOL Name: SCHOOL NAME, dtype: object
First, we will create a new dataframe using the data from "School Performance Dashboard" from https://www.schools.nyc.gov for the schools mentioned above.
We selected the following criteria that can aid us in getting some insights about the performance of the schools with high hispanic_per
:
Economic Need Index
Student Achievement Score
College Readiness Rate
Graduation Rate
# Creating a new dataframe
hispanic_high_perf = {'School': ['MANHATTAN BRIDGES HIGH SCHOOL','WASHINGTON HEIGHTS EXPEDITIONARY LEARNING SCHOOL','GREGORIO LUPERON HIGH SCHOOL FOR SCIENCE AND M...','ACADEMY FOR LANGUAGE AND TECHNOLOGY', 'INTERNATIONAL SCHOOL FOR LIBERAL ARTS', 'PAN AMERICAN INTERNATIONAL HIGH SCHOOL AT MONROE', 'MULTICULTURAL HIGH SCHOOL', 'PAN AMERICAN INTERNATIONAL HIGH SCHOOL'],
'Economic Need Index(%)': [91, 82, 99, 99, 98, 98, 99, 95],
'Student Achievement Score(/5)': [4.51, 4.51, 3.47, 4.75, 2.50, 4.89, 3.34, 4.66],
'College Readiness Rate(%)': [77, np.NaN, 48, 40, 32, 48, 25, 33],
'Graduation Rate(%)': [95, np.NaN, 87, 99, 65, 84, 81, 85]
}
performance_high = pd.DataFrame(hispanic_high_perf, columns = ['School', 'Economic Need Index(%)', 'Student Achievement Score(/5)', 'College Readiness Rate(%)', 'Graduation Rate(%)'])
performance_high
School | Economic Need Index(%) | Student Achievement Score(/5) | College Readiness Rate(%) | Graduation Rate(%) | |
---|---|---|---|---|---|
0 | MANHATTAN BRIDGES HIGH SCHOOL | 91 | 4.51 | 77.0 | 95.0 |
1 | WASHINGTON HEIGHTS EXPEDITIONARY LEARNING SCHOOL | 82 | 4.51 | NaN | NaN |
2 | GREGORIO LUPERON HIGH SCHOOL FOR SCIENCE AND M... | 99 | 3.47 | 48.0 | 87.0 |
3 | ACADEMY FOR LANGUAGE AND TECHNOLOGY | 99 | 4.75 | 40.0 | 99.0 |
4 | INTERNATIONAL SCHOOL FOR LIBERAL ARTS | 98 | 2.50 | 32.0 | 65.0 |
5 | PAN AMERICAN INTERNATIONAL HIGH SCHOOL AT MONROE | 98 | 4.89 | 48.0 | 84.0 |
6 | MULTICULTURAL HIGH SCHOOL | 99 | 3.34 | 25.0 | 81.0 |
7 | PAN AMERICAN INTERNATIONAL HIGH SCHOOL | 95 | 4.66 | 33.0 | 85.0 |
# Calculate the mean for College Readiness column
mean_col_read = performance_high['College Readiness Rate(%)'].mean()
# Fill in the missing values with the mean
performance_high['College Readiness Rate(%)'].fillna(mean_col_read)
0 77.000000 1 43.285714 2 48.000000 3 40.000000 4 32.000000 5 48.000000 6 25.000000 7 33.000000 Name: College Readiness Rate(%), dtype: float64
# Calculate the mean for Graduation Rate column
mean_grad = performance_high['Graduation Rate(%)'].mean()
# Fill in the missing values with the mean
performance_high['Graduation Rate(%)'].fillna(mean_grad)
0 95.000000 1 85.142857 2 87.000000 3 99.000000 4 65.000000 5 84.000000 6 81.000000 7 85.000000 Name: Graduation Rate(%), dtype: float64
Now that our dataframe is ready, we can perform some manipulations and make observations.
# Plotting Economic Need Index for each school on horizontal bar plot
performance_high.plot.barh(x='School', y='Economic Need Index(%)', legend=False)
<matplotlib.axes._subplots.AxesSubplot at 0x2865bb239c8>
performance_high['Economic Need Index(%)'].describe()
count 8.000000 mean 95.125000 std 5.986592 min 82.000000 25% 94.000000 50% 98.000000 75% 99.000000 max 99.000000 Name: Economic Need Index(%), dtype: float64
All 8 schools have an extremely high Economic Need Index with 95% on average. Economic Need Index is an estimated percentage of students at school facing economic hardship, based on temporary housing, eligibility for public assistance, and Census tract poverty rates.
# Plotting Student Achievement Score for each school on horizontal bar plot
performance_high.plot.barh(x='School', y='Student Achievement Score(/5)', legend=False)
<matplotlib.axes._subplots.AxesSubplot at 0x2865bb05f08>
As the bar plot shows, the Student Achievement Scores
vary depending on the school with an average value of 4 out of 5 points. The poorest performance was demonstrated at INTERNATIONAL SCHOOL FOR LIBERAL ARTS
. This particular school is 100% hispanic students.
performance_high['College Readiness Rate(%)'].describe()
count 7.000000 mean 43.285714 std 17.124196 min 25.000000 25% 32.500000 50% 40.000000 75% 48.000000 max 77.000000 Name: College Readiness Rate(%), dtype: float64
performance_high['Graduation Rate(%)'].describe()
count 7.000000 mean 85.142857 std 10.930516 min 65.000000 25% 82.500000 50% 85.000000 75% 91.000000 max 99.000000 Name: Graduation Rate(%), dtype: float64
The Graduation rate
among the schools with high percentage of hispanic students is about 85% on average. On the downside, the College Readiness Rate
is very low, only about 43% of students are ready to go to college which makes it unlikely for them to get high scores on SAT.
hispanic_per
and high SAT score¶Now we will reaserch the schools with hispanic_per
lower than 10%, and SAT scores greater than 1800.
# Filtering the dataframe
hispanic_low = combined[combined['hispanic_per'] < 10]
hispanic_low = combined[combined['sat_score'] > 1800]
# Identifying school names
hispanic_low['SCHOOL NAME']
5 BARD HIGH SCHOOL EARLY COLLEGE 37 STUYVESANT HIGH SCHOOL 79 HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGIN... 151 BRONX HIGH SCHOOL OF SCIENCE 155 HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE 187 BROOKLYN TECHNICAL HIGH SCHOOL 302 TOWNSEND HARRIS HIGH SCHOOL 327 QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK CO... 356 STATEN ISLAND TECHNICAL HIGH SCHOOL Name: SCHOOL NAME, dtype: object
We will create a new dataframe using the data from "School Performance Dashboard" from https://www.schools.nyc.gov for the schools mentioned below. Then, we will manipulate this dataframe to gain some insights about schools where hispanic representation is low and SAT scores are high.
hispanic_low_perf = {'School': ['BARD HIGH SCHOOL EARLY COLLEGE','STUYVESANT HIGH SCHOOL',' HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGINERING','BRONX HIGH SCHOOL OF SCIENCE', 'HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE', 'BROOKLYN TECHNICAL HIGH SCHOOL', ' TOWNSEND HARRIS HIGH SCHOOL', ' QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK COLLEGE', ' STATEN ISLAND TECHNICAL HIGH SCHOOL' ],
'Economic Need Index(%)': [40, 40, 40, 39, 26, 52, 39, 49, 35],
'Student Achievement Score(/5)': [4.49, 4.83, 4.65, 4.90, 4.86, 4.58, 4.90, 4.80, 4.77],
'College Readiness Rate(%)': [98, 99, 99, 100, 100, 97, 100, 100, 100],
'Graduation Rate(%)': [99, 99, 99, 100, 100, 97, 100, 100, 100]
}
performance_low = pd.DataFrame(hispanic_low_perf, columns = ['School', 'Economic Need Index(%)', 'Student Achievement Score(/5)', 'College Readiness Rate(%)', 'Graduation Rate(%)'])
performance_low
School | Economic Need Index(%) | Student Achievement Score(/5) | College Readiness Rate(%) | Graduation Rate(%) | |
---|---|---|---|---|---|
0 | BARD HIGH SCHOOL EARLY COLLEGE | 40 | 4.49 | 98 | 99 |
1 | STUYVESANT HIGH SCHOOL | 40 | 4.83 | 99 | 99 |
2 | HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGI... | 40 | 4.65 | 99 | 99 |
3 | BRONX HIGH SCHOOL OF SCIENCE | 39 | 4.90 | 100 | 100 |
4 | HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE | 26 | 4.86 | 100 | 100 |
5 | BROOKLYN TECHNICAL HIGH SCHOOL | 52 | 4.58 | 97 | 97 |
6 | TOWNSEND HARRIS HIGH SCHOOL | 39 | 4.90 | 100 | 100 |
7 | QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK C... | 49 | 4.80 | 100 | 100 |
8 | STATEN ISLAND TECHNICAL HIGH SCHOOL | 35 | 4.77 | 100 | 100 |
# Plotting Economic Need Index for each school on horizontal bar plot
performance_low.plot.barh(x='School', y='Economic Need Index(%)', legend=False)
<matplotlib.axes._subplots.AxesSubplot at 0x2865bae7a08>
The bar plor above shows that on average only 40% of students fall into economically challenged category among all 9 schools unlike all hispanic schools we researched earlier.
performance_low['Student Achievement Score(/5)'].describe()
count 9.000000 mean 4.753333 std 0.146969 min 4.490000 25% 4.650000 50% 4.800000 75% 4.860000 max 4.900000 Name: Student Achievement Score(/5), dtype: float64
The average of Student Achievement Score is 4.75 points out of 5 which means students from these schools perform well academically.
performance_low['College Readiness Rate(%)'].describe()
count 9.000000 mean 99.222222 std 1.092906 min 97.000000 25% 99.000000 50% 100.000000 75% 100.000000 max 100.000000 Name: College Readiness Rate(%), dtype: float64
All 9 schools indicate high levels of College Readiness Rate
, 99.22% on average to be precise, the same can be observed for Graduation Rate
since the values are almost identical. That means students from these schools will definitely graduate and they are better prepared to take college entry exams like SAT.
We will plot out the correlations between the percentages of each gender (male_per
and female_per
) and sat_score
.
gender = ['male_per', 'female_per']
combined.corr()['sat_score'][gender].plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x286599f4ec8>
The bar plot above shows that the number of female students correlate positively with the SAT score, whereas the number of male students tend to correlate negatively. Could that mean girls in general do better on SAT? To find out, we would have to research more deeply the relationship between female_per
and sat_score
.
# Making a scatter plot of female_per vs. sat_score
combined.plot.scatter(x='female_per', y='sat_score')
<matplotlib.axes._subplots.AxesSubplot at 0x28659d02248>
The values seem to be plotted out sporadically and do not follow any particular pattern, therefore the correlation is rather weak.
# Filtering the dataframe
female_high = combined[combined['female_per'] > 60]
female_high = combined[combined['sat_score'] > 1700]
# Identifying school names
female_high['SCHOOL NAME']
5 BARD HIGH SCHOOL EARLY COLLEGE 26 ELEANOR ROOSEVELT HIGH SCHOOL 37 STUYVESANT HIGH SCHOOL 60 BEACON HIGH SCHOOL 61 FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & A... 79 HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGIN... 151 BRONX HIGH SCHOOL OF SCIENCE 155 HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE 187 BROOKLYN TECHNICAL HIGH SCHOOL 198 BROOKLYN LATIN SCHOOL, THE 302 TOWNSEND HARRIS HIGH SCHOOL 327 QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK CO... 356 STATEN ISLAND TECHNICAL HIGH SCHOOL Name: SCHOOL NAME, dtype: object
We will follow the same approach as before by creating a new dataframe using the data from "School Performance Dashboard" from https://www.schools.nyc.gov.
# Creating a new dataframe
female_high_perf = {'School': ['BARD HIGH SCHOOL EARLY COLLEGE', 'ELEANOR ROOSEVELT HIGH SCHOOL', 'STUYVESANT HIGH SCHOOL', 'BEACON HIGH SCHOOL', 'FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & ARTS', 'HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGINERING','BRONX HIGH SCHOOL OF SCIENCE', 'HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE', 'BROOKLYN TECHNICAL HIGH SCHOOL', 'BROOKLYN LATIN SCHOOL, THE', 'TOWNSEND HARRIS HIGH SCHOOL', ' QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK COLLEGE', ' STATEN ISLAND TECHNICAL HIGH SCHOOL' ],
'Economic Need Index(%)': [40, 20, 40, 30, 30, 40, 39, 26, 52, 55, 39, 49, 35],
'Student Achievement Score(/5)': [4.49, 4.79, 4.83, 4.33, 4.69, 4.65, 4.90, 4.86, 4.58, 4.64, 4.90, 4.80, 4.77],
'College Readiness Rate(%)': [98, 100, 99, 97, 99, 99, 100, 100, 97, 97, 100, 100, 100],
'Graduation Rate(%)': [99, 100, 99, 98, 99, 99, 100, 100, 97, 97, 100, 100, 100]
}
female_perf = pd.DataFrame(female_high_perf, columns = ['School', 'Economic Need Index(%)', 'Student Achievement Score(/5)', 'College Readiness Rate(%)', 'Graduation Rate(%)'])
female_perf
School | Economic Need Index(%) | Student Achievement Score(/5) | College Readiness Rate(%) | Graduation Rate(%) | |
---|---|---|---|---|---|
0 | BARD HIGH SCHOOL EARLY COLLEGE | 40 | 4.49 | 98 | 99 |
1 | ELEANOR ROOSEVELT HIGH SCHOOL | 20 | 4.79 | 100 | 100 |
2 | STUYVESANT HIGH SCHOOL | 40 | 4.83 | 99 | 99 |
3 | BEACON HIGH SCHOOL | 30 | 4.33 | 97 | 98 |
4 | FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & ARTS | 30 | 4.69 | 99 | 99 |
5 | HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGIN... | 40 | 4.65 | 99 | 99 |
6 | BRONX HIGH SCHOOL OF SCIENCE | 39 | 4.90 | 100 | 100 |
7 | HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE | 26 | 4.86 | 100 | 100 |
8 | BROOKLYN TECHNICAL HIGH SCHOOL | 52 | 4.58 | 97 | 97 |
9 | BROOKLYN LATIN SCHOOL, THE | 55 | 4.64 | 97 | 97 |
10 | TOWNSEND HARRIS HIGH SCHOOL | 39 | 4.90 | 100 | 100 |
11 | QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK C... | 49 | 4.80 | 100 | 100 |
12 | STATEN ISLAND TECHNICAL HIGH SCHOOL | 35 | 4.77 | 100 | 100 |
female_perf['Economic Need Index(%)'].describe()
count 13.000000 mean 38.076923 std 10.103312 min 20.000000 25% 30.000000 50% 39.000000 75% 40.000000 max 55.000000 Name: Economic Need Index(%), dtype: float64
female_perf['Student Achievement Score(/5)'].describe()
count 13.000000 mean 4.710000 std 0.169755 min 4.330000 25% 4.640000 50% 4.770000 75% 4.830000 max 4.900000 Name: Student Achievement Score(/5), dtype: float64
female_perf['College Readiness Rate(%)'].describe()
count 13.000000 mean 98.923077 std 1.255756 min 97.000000 25% 98.000000 50% 99.000000 75% 100.000000 max 100.000000 Name: College Readiness Rate(%), dtype: float64
female_perf['Graduation Rate(%)'].describe()
count 13.000000 mean 99.076923 std 1.115164 min 97.000000 25% 99.000000 50% 99.000000 75% 100.000000 max 100.000000 Name: Graduation Rate(%), dtype: float64
After researching the data on schools where the majority of students are female, and SAT scores are high we can make the following observations:
The Economic Need Index
is relatively low, only 38% on average require fundingStudent Achievement Rate
These metrics indicate that these schools have good chances to get high scores on SAT.
First step is to calculate the percentage of students in each school that took an AP exam by dividing the AP Test Takers
column by the total_enrollment
column.
combined['ap_per'] = combined['AP Test Takers '] / combined['total_enrollment']
combined['ap_per']
0 0.305756 1 0.098985 2 0.031773 3 0.351577 4 0.158091 ... 358 0.077558 359 0.297301 360 0.190955 361 0.026616 362 0.073593 Name: ap_per, Length: 363, dtype: float64
Next step is to investigate the relationship between AP Scores and SAT Scores by making a scatter plot.
combined.plot.scatter(x= 'ap_per', y='sat_score')
<matplotlib.axes._subplots.AxesSubplot at 0x2865bacc248>
According to the scatter plot the correlation between sat_score
and ap_per
doesn't seem to be so strong because schools with the higher percentage of AP test takers do not perform better on SAT.
class_size_cols = ['AVERAGE CLASS SIZE', 'NUMBER OF STUDENTS / SEATS FILLED' ]
combined.corr()['sat_score'][class_size_cols].plot.barh()
<matplotlib.axes._subplots.AxesSubplot at 0x2865bab89c8>
The bar plot shows that both average class size and the number of seats filled correlate positively with SAT scores, but to understand these relationships deeper we would plot each of the columns on a scatter plot.
combined.plot.scatter(x= 'NUMBER OF STUDENTS / SEATS FILLED', y='sat_score')
<matplotlib.axes._subplots.AxesSubplot at 0x2865ba97688>
combined.plot.scatter(x= 'AVERAGE CLASS SIZE', y='sat_score')
<matplotlib.axes._subplots.AxesSubplot at 0x2865ba7b108>
Both scatter plots do not indicate very strong relationship between class size and SAT scores, so we will reseach how small and large classes perform academically and compare.
# Researching schools with small classes and high SAT scores
small_class = combined[combined['AVERAGE CLASS SIZE'] < 20 ]
small_class = combined[combined['sat_score'] > 1700]
small_class['SCHOOL NAME']
5 BARD HIGH SCHOOL EARLY COLLEGE 26 ELEANOR ROOSEVELT HIGH SCHOOL 37 STUYVESANT HIGH SCHOOL 60 BEACON HIGH SCHOOL 61 FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & A... 79 HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGIN... 151 BRONX HIGH SCHOOL OF SCIENCE 155 HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE 187 BROOKLYN TECHNICAL HIGH SCHOOL 198 BROOKLYN LATIN SCHOOL, THE 302 TOWNSEND HARRIS HIGH SCHOOL 327 QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK CO... 356 STATEN ISLAND TECHNICAL HIGH SCHOOL Name: SCHOOL NAME, dtype: object
# Researching schools with large classes and high SAT scores
large_class = combined[combined['AVERAGE CLASS SIZE'] > 35 ]
large_class = combined[combined['sat_score'] > 1700]
large_class['SCHOOL NAME']
5 BARD HIGH SCHOOL EARLY COLLEGE 26 ELEANOR ROOSEVELT HIGH SCHOOL 37 STUYVESANT HIGH SCHOOL 60 BEACON HIGH SCHOOL 61 FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & A... 79 HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGIN... 151 BRONX HIGH SCHOOL OF SCIENCE 155 HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE 187 BROOKLYN TECHNICAL HIGH SCHOOL 198 BROOKLYN LATIN SCHOOL, THE 302 TOWNSEND HARRIS HIGH SCHOOL 327 QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK CO... 356 STATEN ISLAND TECHNICAL HIGH SCHOOL Name: SCHOOL NAME, dtype: object
Notice that the aforementioned schools are the exact same schools that have high percentage of females and high SAT scores. We've taken a look at schools with both small and large classes that get high SAT scores, and the results are identical. That means the class size doesn't really affect the performance on SAT.