Notebook

Guided Project of Analyzing NYC High School Data¶

Read in the data¶

In [51]:

import pandas as pd
import numpy
import re

data_files = [
    "ap_2010.csv",
    "class_size.csv",
    "demographics.csv",
    "graduation.csv",
    "hs_directory.csv",
    "sat_results.csv"
]

data = {}

for f in data_files:
    d = pd.read_csv("schools/{0}".format(f))
    data[f.replace(".csv", "")] = d

Read in the surveys¶

In [52]:

all_survey = pd.read_csv("schools/survey_all.txt", delimiter="\t", encoding='windows-1252')
d75_survey = pd.read_csv("schools/survey_d75.txt", delimiter="\t", encoding='windows-1252')
survey = pd.concat([all_survey, d75_survey], axis=0)

survey["DBN"] = survey["dbn"]

survey_fields = [
    "DBN", 
    "rr_s", 
    "rr_t", 
    "rr_p", 
    "N_s", 
    "N_t", 
    "N_p", 
    "saf_p_11", 
    "com_p_11", 
    "eng_p_11", 
    "aca_p_11", 
    "saf_t_11", 
    "com_t_11", 
    "eng_t_11", 
    "aca_t_11", 
    "saf_s_11", 
    "com_s_11", 
    "eng_s_11", 
    "aca_s_11", 
    "saf_tot_11", 
    "com_tot_11", 
    "eng_tot_11", 
    "aca_tot_11",
]
survey = survey.loc[:,survey_fields]
data["survey"] = survey

Add DBN columns¶

In [53]:

data["hs_directory"]["DBN"] = data["hs_directory"]["dbn"]

def pad_csd(num):
    string_representation = str(num)
    if len(string_representation) > 1:
        return string_representation
    else:
        return "0" + string_representation
    
data["class_size"]["padded_csd"] = data["class_size"]["CSD"].apply(pad_csd)
data["class_size"]["DBN"] = data["class_size"]["padded_csd"] + data["class_size"]["SCHOOL CODE"]

Convert columns to numeric¶

In [54]:

cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']
for c in cols:
    data["sat_results"][c] = pd.to_numeric(data["sat_results"][c], errors="coerce")

data['sat_results']['sat_score'] = data['sat_results'][cols[0]] + data['sat_results'][cols[1]] + data['sat_results'][cols[2]]

def find_lat(loc):
    coords = re.findall("\(.+, .+\)", loc)
    lat = coords[0].split(",")[0].replace("(", "")
    return lat

def find_lon(loc):
    coords = re.findall("\(.+, .+\)", loc)
    lon = coords[0].split(",")[1].replace(")", "").strip()
    return lon

data["hs_directory"]["lat"] = data["hs_directory"]["Location 1"].apply(find_lat)
data["hs_directory"]["lon"] = data["hs_directory"]["Location 1"].apply(find_lon)

data["hs_directory"]["lat"] = pd.to_numeric(data["hs_directory"]["lat"], errors="coerce")
data["hs_directory"]["lon"] = pd.to_numeric(data["hs_directory"]["lon"], errors="coerce")

Condense datasets¶

In [55]:

class_size = data["class_size"]
class_size = class_size[class_size["GRADE "] == "09-12"]
class_size = class_size[class_size["PROGRAM TYPE"] == "GEN ED"]

class_size = class_size.groupby("DBN").agg(numpy.mean)
class_size.reset_index(inplace=True)
data["class_size"] = class_size

data["demographics"] = data["demographics"][data["demographics"]["schoolyear"] == 20112012]

data["graduation"] = data["graduation"][data["graduation"]["Cohort"] == "2006"]
data["graduation"] = data["graduation"][data["graduation"]["Demographic"] == "Total Cohort"]

Convert AP scores to numeric¶

In [56]:

cols = ['AP Test Takers ', 'Total Exams Taken', 'Number of Exams with scores 3 4 or 5']

for col in cols:
    data["ap_2010"][col] = pd.to_numeric(data["ap_2010"][col], errors="coerce")

Combine the datasets¶

In [57]:

combined = data["sat_results"]

combined = combined.merge(data["ap_2010"], on="DBN", how="left")
combined = combined.merge(data["graduation"], on="DBN", how="left")

to_merge = ["class_size", "demographics", "survey", "hs_directory"]

for m in to_merge:
    combined = combined.merge(data[m], on="DBN", how="inner")

combined = combined.fillna(combined.mean())
combined = combined.fillna(0)

Add a school district column for mapping¶

In [58]:

def get_first_two_chars(dbn):
    return dbn[0:2]

combined["school_dist"] = combined["DBN"].apply(get_first_two_chars)

Find correlations¶

In [59]:

correlations = combined.corr()
correlations = correlations["sat_score"]
print(correlations)

SAT Critical Reading Avg. Score         0.986820
SAT Math Avg. Score                     0.972643
SAT Writing Avg. Score                  0.987771
sat_score                               1.000000
AP Test Takers                          0.523140
Total Exams Taken                       0.514333
Number of Exams with scores 3 4 or 5    0.463245
Total Cohort                            0.325144
CSD                                     0.042948
NUMBER OF STUDENTS / SEATS FILLED       0.394626
NUMBER OF SECTIONS                      0.362673
AVERAGE CLASS SIZE                      0.381014
SIZE OF SMALLEST CLASS                  0.249949
SIZE OF LARGEST CLASS                   0.314434
SCHOOLWIDE PUPIL-TEACHER RATIO               NaN
schoolyear                                   NaN
fl_percent                                   NaN
frl_percent                            -0.722225
total_enrollment                        0.367857
ell_num                                -0.153778
ell_percent                            -0.398750
sped_num                                0.034933
sped_percent                           -0.448170
asian_num                               0.475445
asian_per                               0.570730
black_num                               0.027979
black_per                              -0.284139
hispanic_num                            0.025744
hispanic_per                           -0.396985
white_num                               0.449559
                                          ...   
rr_p                                    0.047925
N_s                                     0.423463
N_t                                     0.291463
N_p                                     0.421530
saf_p_11                                0.122913
com_p_11                               -0.115073
eng_p_11                                0.020254
aca_p_11                                0.035155
saf_t_11                                0.313810
com_t_11                                0.082419
eng_t_11                                0.036906
aca_t_11                                0.132348
saf_s_11                                0.337639
com_s_11                                0.187370
eng_s_11                                0.213822
aca_s_11                                0.339435
saf_tot_11                              0.318753
com_tot_11                              0.077310
eng_tot_11                              0.100102
aca_tot_11                              0.190966
grade_span_max                               NaN
expgrade_span_max                            NaN
zip                                    -0.063977
total_students                          0.407827
number_programs                         0.117012
priority08                                   NaN
priority09                                   NaN
priority10                                   NaN
lat                                    -0.121029
lon                                    -0.132222
Name: sat_score, Length: 67, dtype: float64

Plotting survey correlations¶

In [60]:

# Remove DBN since it's a unique identifier, not a useful numerical value for correlation.
survey_fields.remove("DBN")

In [61]:

#setting up matplotlib 
%matplotlib inline
combined.corr()["sat_score"][survey_fields].plot.bar()

Out[61]:

<matplotlib.axes._subplots.AxesSubplot at 0x7fc7b53b3ef0>

Conclussion: High correlation is observed between N_s, N_t, N_p (which are the survey's number of respondents for student, teacher and parents) and sat_score as these are related to total enrollment.

Interesting correlation is also seen between student response rate(rr_s) and sat_scores as good academic result is likely to get responded to in a survey.

Scatter plot of safety and SAT scores¶

In [62]:

# Making a scatter plot of the saf_s_11 column vs. the sat_score 
# in combined
combined.plot.scatter(x="saf_s_11", y="sat_score")

Out[62]:

<matplotlib.axes._subplots.AxesSubplot at 0x7fc7b53575c0>

The scatter plot above is used to explore and explain the relationship between the student perception of school safety and the Sat Scores. Where the safety score is 7 or less, the average score achieved is around 1400 or less. And where the safety score is 7.5 or more, higher average score of 1800 and over is being achieved.

Map of safety scores by Districts¶

In [63]:

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap

districts = combined.groupby('school_dist').agg(np.mean)
districts.reset_index(inplace=True)



m = Basemap(
    projection='merc', 
    llcrnrlat=40.496044, 
    urcrnrlat=40.915256, 
    llcrnrlon=-74.255735, 
    urcrnrlon=-73.700272,
    resolution='i'
)

m.drawmapboundary(fill_color='#85A6D9')
m.drawcoastlines(color='#6D5F47', linewidth=.4)
m.drawrivers(color='#6D5F47', linewidth=.4)

longitudes = districts["lon"].tolist()
latitudes = districts["lat"].tolist()
m.scatter(longitudes, latitudes, s=50, zorder=2, latlon=True, c=districts["saf_s_11"], cmap="summer")
plt.show()

Plotting Racial Groupings against SAT scores¶

In [64]:

race_fields = ["white_per","asian_per","black_per","hispanic_per"]

In [65]:

combined.corr()["sat_score"][race_fields].plot.bar()

Out[65]:

<matplotlib.axes._subplots.AxesSubplot at 0x7fc7b5293470>

Plots shows that both White and Asian studesnts correalte positively with the sat results while Black and Hispanic students correlate negativle probably due to the social, housing and family disadvantages including funding for the minority communities

Exploring SAT scores of the Hispanics¶

In [66]:

# Using scatter plot to explore the relationship between low SAT Scores 
# and high concentration of Hispanics in combined dataset
combined.plot.scatter(x="hispanic_per", y="sat_score")

Out[66]:

<matplotlib.axes._subplots.AxesSubplot at 0x7fc7b4fd66d8>

In [67]:

high_hispanic = combined[combined["hispanic_per"] > 95]  
high_hispanic["School Name"].head()

Out[67]:

44             MANHATTAN BRIDGES HIGH SCHOOL
82                                         0
89     GREGORIO LUPERON HIGH SCHOOL FOR SCIE
125                                        0
141    INTERNATIONAL SCHOOL FOR LIBERAL ARTS
Name: School Name, dtype: object

In [68]:

low_hispanic = combined[combined["hispanic_per"] < 10]  
low_hispanic = combined[combined["sat_score"] > 1800]
low_hispanic["School Name"]

Out[68]:

5             BARD HIGH SCHOOL EARLY COLLEGE
37                    STUYVESANT HIGH SCHOOL
79      HIGH SCHOOL FOR MATHEMATICS  SCIENCE
151             BRONX HIGH SCHOOL OF SCIENCE
155    HIGH SCHOOL OF AMERICAN STUDIES AT LE
187           BROOKLYN TECHNICAL HIGH SCHOOL
302              TOWNSEND HARRIS HIGH SCHOOL
327    QUEENS HIGH SCHOOL FOR THE SCIENCES A
356      STATEN ISLAND TECHNICAL HIGH SCHOOL
Name: School Name, dtype: object

Exploring SAT scores and Genders¶

In [69]:

gender_fields = ["male_per","female_per"]
combined.corr()["sat_score"][gender_fields].plot.bar()

Out[69]:

<matplotlib.axes._subplots.AxesSubplot at 0x7fc7b4fd6438>

A high percentage of female students correllates positively with the sat score while male students correlates negatively

Scatter plot distribution of female percentage and SAT scores¶

In [70]:

combined.plot.scatter("female_per", "sat_score")

Out[70]:

<matplotlib.axes._subplots.AxesSubplot at 0x7fc7b5014cc0>

The scatter plot shows that around half of the percentage of the females achieved less than 1400 in SAT score

Predominantly Female schools with close to average SAT scores¶

In [71]:

combined[(combined["female_per"] > 60 ) & (combined["sat_score"] > 1700)]["SCHOOL NAME"]

Out[71]:

5                         BARD HIGH SCHOOL EARLY COLLEGE
26                         ELEANOR ROOSEVELT HIGH SCHOOL
60                                    BEACON HIGH SCHOOL
61     FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & A...
302                          TOWNSEND HARRIS HIGH SCHOOL
Name: SCHOOL NAME, dtype: object

Above shows unique sets of predominantly female students at schools specializing Arts and/or music schools and with high Sat scores or good academic achievement.

Correlation between SAT scores and AP Exam¶

In [72]:

combined["ap_per"] = combined["AP Test Takers "] / combined["total_enrollment"]
combined.plot.scatter(x="ap_per", y="sat_score")

Out[72]:

<matplotlib.axes._subplots.AxesSubplot at 0x7fc7b5017fd0>

The scatter plot above shows a not too strong correlation and/or relationship between the percentage of students in a school who take the AP exam, and their average SAT scores.

In [73]:

correlations = combined.corr()
correlations = correlations["AVERAGE CLASS SIZE"]
print(correlations)

SAT Critical Reading Avg. Score         0.376117
SAT Math Avg. Score                     0.376290
SAT Writing Avg. Score                  0.369994
sat_score                               0.381014
AP Test Takers                          0.331098
Total Exams Taken                       0.307422
Number of Exams with scores 3 4 or 5    0.220873
Total Cohort                            0.489342
CSD                                     0.177610
NUMBER OF STUDENTS / SEATS FILLED       0.554340
NUMBER OF SECTIONS                      0.485068
AVERAGE CLASS SIZE                      1.000000
SIZE OF SMALLEST CLASS                  0.737460
SIZE OF LARGEST CLASS                   0.922846
SCHOOLWIDE PUPIL-TEACHER RATIO               NaN
schoolyear                                   NaN
fl_percent                                   NaN
frl_percent                            -0.400651
total_enrollment                        0.538782
ell_num                                 0.127532
ell_percent                            -0.279265
sped_num                                0.411212
sped_percent                           -0.130716
asian_num                               0.417055
asian_per                               0.234196
black_num                               0.354672
black_per                              -0.092176
hispanic_num                            0.342446
hispanic_per                           -0.225512
white_num                               0.400543
                                          ...   
N_s                                     0.546125
N_t                                     0.493508
N_p                                     0.469885
saf_p_11                               -0.244755
com_p_11                               -0.306949
eng_p_11                               -0.177926
aca_p_11                               -0.262945
saf_t_11                                0.068197
com_t_11                                0.139299
eng_t_11                                0.039632
aca_t_11                                0.052221
saf_s_11                               -0.113110
com_s_11                               -0.237042
eng_s_11                               -0.130713
aca_s_11                               -0.116830
saf_tot_11                             -0.087799
com_tot_11                             -0.077659
eng_tot_11                             -0.074314
aca_tot_11                             -0.091287
grade_span_max                               NaN
expgrade_span_max                            NaN
zip                                     0.103041
total_students                          0.547814
number_programs                         0.386975
priority08                                   NaN
priority09                                   NaN
priority10                                   NaN
lat                                    -0.150483
lon                                    -0.006868
ap_per                                 -0.330042
Name: AVERAGE CLASS SIZE, Length: 68, dtype: float64

Scatter plot view of AVERAGE CLASS SIZE and SAT scores¶

In [74]:

# Making a scatter plot of AVERAGE CLASS SIZE column vs. the SAT Score 
combined.plot.scatter(x="AVERAGE CLASS SIZE", y="sat_score")

Out[74]:

<matplotlib.axes._subplots.AxesSubplot at 0x7fc7b52d92e8>

There appears to be no direct relationship or correlation between SAT Scores and the Average Class Size.

In [ ]: