#!/usr/bin/env python # coding: utf-8 # # Guided Project: Analyzing NYC High School Data # # The [SAT](https://en.wikipedia.org/wiki/SAT), or Scholastic Aptitude Test, is a test that high school seniors in the U.S. take every year. The SAT has three sections, each of which is worth a maximum of 800 points. Colleges use the SAT to determine which students to admit. High average SAT scores are usually indicative of a good school. # # New York City has published data on [student SAT scores](https://data.cityofnewyork.us/Education/2012-SAT-Results/f9bf-2cp4) by high school, along with additional demographic data sets. The datasets available to us for this project are: # # - [SAT scores by school](https://data.cityofnewyork.us/Education/SAT-Results/f9bf-2cp4) - SAT scores for each high school in New York City # - [School attendance](https://data.cityofnewyork.us/Education/2010-2011-School-Attendance-and-Enrollment-Statist/7z8d-msnt) - Attendance information for each school in New York City # - [Class size](https://data.cityofnewyork.us/Education/2010-2011-Class-Size-School-level-detail/urz7-pzb3) - Information on class size for each school # - [AP test results](https://data.cityofnewyork.us/Education/2010-AP-College-Board-School-Level-Results/itfs-ms3e) - Advanced Placement (AP) exam results for each high school (passing an optional AP exam in a particular subject can earn a student college credit in that subject) # - [Graduation outcomes](https://data.cityofnewyork.us/Education/2005-2010-Graduation-Outcomes-School-Level/vh2h-md7a) - The percentage of students who graduated, and other outcome information # - [Demographics](https://data.cityofnewyork.us/Education/School-Demographics-and-Accountability-Snapshot-20/ihfw-zy9j) - Demographic information for each school # - [School survey](https://data.cityofnewyork.us/Education/NYC-School-Survey-2011/mnz3-dyi8) - Surveys of parents, teachers, and students at each school # # New York City has a significant immigrant 
# population and is very diverse, so comparing demographic factors such as race, income, and
# gender with SAT scores is a good way to determine whether the SAT is a fair test. For example,
# if certain racial groups consistently perform better on the SAT, we would have some evidence
# that the SAT is unfair.
#
# First, let's start by reading in all the datasets, doing some clean up and creating a
# `combined` dataframe containing data from all sets.
#
# ## Read in the data

# In[1]:

import pandas as pd
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# In[2]:

data_files = [
    "ap_2010.csv",
    "class_size.csv",
    "demographics.csv",
    "graduation.csv",
    "hs_directory.csv",
    "sat_results.csv",
]

# Read every CSV into a dict keyed by the file's base name (extension stripped).
data = {fname.replace('.csv', ''): pd.read_csv(f'schools/{fname}') for fname in data_files}

# The two survey files are tab-separated and use a Windows code page.
all_survey = pd.read_csv('schools/survey_all.txt', delimiter='\t', encoding="windows-1252")
d75_survey = pd.read_csv('schools/survey_d75.txt', delimiter='\t', encoding="windows-1252")

# Let's also print the first five rows of the datasets in the data dictionary:
for key in data:
    print(data[key].head())

# In[3]:

# Now, let's merge the all_survey and d75_survey dataframes:
survey = pd.concat([all_survey, d75_survey])

# Let's see how many rows and columns the dataset has as well as the first five rows:
print(survey.shape)
survey.head()

# - There are over 2000 columns, nearly all of which we don't need. We'll have to filter the
#   data to remove the unnecessary ones. Working with fewer columns will make it easier to
#   print the dataframe out and find correlations within it.
# - The survey data has a dbn column that we'll want to convert to uppercase (DBN). The
#   conversion will make the column name consistent with the other data sets.
#
# First, we'll need to filter the columns to remove the ones we don't need.
# Luckily, there's a data dictionary at the original data download
# [location](https://data.cityofnewyork.us/Education/2011-NYC-School-Survey/mnz3-dyi8).
# The dictionary tells us what each column represents. Based on our knowledge of the problem
# and the analysis we're trying to do, we can use the data dictionary to determine which
# columns to use.

# In[4]:

cols_to_keep = ["dbn", "rr_s", "rr_t", "rr_p", "N_s", "N_t", "N_p",
                "saf_p_11", "com_p_11", "eng_p_11", "aca_p_11",
                "saf_t_11", "com_t_11", "eng_t_11", "aca_t_11",
                "saf_s_11", "com_s_11", "eng_s_11", "aca_s_11",
                "saf_tot_11", "com_tot_11", "eng_tot_11", "aca_tot_11"]
survey = survey[cols_to_keep]
# Duplicate dbn under the uppercase name used by the other data sets.
survey['DBN'] = survey['dbn']

# Let's get the updated number of rows and columns:
survey.shape

# In[5]:

# Let's add the survey dataframe to the data dictionary:
data['survey'] = survey

# ## Create `DBN` columns
#
# When we explored all of the datasets, we noticed that some of them, like `class_size` and
# `hs_directory`, don't have a DBN column. `hs_directory` does have a `dbn` column, though,
# so we can just rename it.

# In[6]:

data['hs_directory']['DBN'] = data['hs_directory']['dbn']
data['hs_directory']['DBN'].head()

# The DBN in the `sat_results` data is just a combination of the `CSD` and `SCHOOL CODE`
# columns in the `class_size` data. The main difference is that the `DBN` is padded, so that
# the `CSD` portion of it always consists of two digits. That means we'll need to add a
# leading 0 to the `CSD` if the `CSD` is less than two digits long.

# In[7]:

def padding(x):
    """Return ``x`` as a string zero-padded on the left to at least two characters.

    ``str.zfill`` already leaves strings of length >= 2 unchanged, so the
    original explicit length check was redundant.
    """
    return str(x).zfill(2)

data['class_size']['padded_csd'] = data['class_size']['CSD'].apply(padding)
data['class_size']['DBN'] = data['class_size']['padded_csd'] + data['class_size']['SCHOOL CODE']

# In[8]:

data['class_size']['DBN'].head()

# ## Combine the SAT scores
#
# The `SAT Math Avg. Score`, `SAT Critical Reading Avg. Score`, and `SAT Writing Avg.
# Score` columns in the `sat_results` dataset contain scores on the different components of
# the SAT test. We would like to combine them all in one column. Before we can do that, we
# need to convert them to numeric values as currently they are stored as strings.

# In[9]:

cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']

for col in cols:
    # errors='coerce' turns unparseable entries (e.g. 's') into NaN.
    data['sat_results'][col] = pd.to_numeric(data['sat_results'][col], errors='coerce')

data['sat_results']['sat_score'] = (data['sat_results'][cols[0]]
                                    + data['sat_results'][cols[1]]
                                    + data['sat_results'][cols[2]])
data['sat_results'].head()

# ## Parsing Geographic Coordinates for Schools
#
# Next, we'll want to parse the latitude and longitude coordinates for each school. This will
# enable us to map the schools and uncover any geographic patterns in the data. The
# coordinates are currently in the text field `Location 1` in the `hs_directory` data set.
# Let's extract them and create two new columns that will store them.

# In[10]:

import re

def _coords(x):
    """Split the '(lat, lon)' substring of ``x`` into a [lat, lon] pair of strings.

    Assumes every `Location 1` value contains one parenthesized coordinate
    pair -- TODO confirm against the raw data.
    """
    # Raw string: "\(" in a non-raw literal is an invalid escape sequence
    # and raises a DeprecationWarning on modern Python.
    pair = re.findall(r"\(.+\)", x)[0]
    return pair.strip('()').split(',')

def latitude(x):
    """Extract the latitude portion of a `Location 1` field."""
    return _coords(x)[0]

data['hs_directory']['lat'] = data['hs_directory']['Location 1'].apply(latitude)

# In[11]:

def longitude(x):  # renamed from the misspelled `longtitude`
    """Extract the longitude portion of a `Location 1` field."""
    return _coords(x)[1]

data['hs_directory']['lon'] = data['hs_directory']['Location 1'].apply(longitude)

# In[12]:

# Since the data is stored in strings, we need to convert it to numeric:
data['hs_directory']['lat'] = pd.to_numeric(data['hs_directory']['lat'], errors='coerce')
data['hs_directory']['lon'] = pd.to_numeric(data['hs_directory']['lon'], errors='coerce')

data['hs_directory'].head()

# ## Filtering the data
#
# We are only interested in data for highschools, i.e. grades 9 through 12. The `class_size`
# dataset contains data for lower grades than that. Let's filter them out.
#
# Each school can have multiple program types.
# Because `GEN ED` is the largest category by far, let's only select rows where
# `PROGRAM TYPE` is `GEN ED`.

# In[13]:

class_size = data['class_size']
# Note the trailing space in the 'GRADE ' column name -- it is present in the raw data.
high_school_rows = class_size['GRADE '] == '09-12'
class_size = class_size[high_school_rows]
class_size['GRADE '].unique()

# In[14]:

gen_ed_rows = class_size['PROGRAM TYPE'] == 'GEN ED'
class_size = class_size[gen_ed_rows]
class_size['PROGRAM TYPE'].unique()
class_size.head()

# ## Computing Average Class Sizes
#
# Once we are done cleaning up the data, we will combine the datasets into one. We will use
# the `DBN` column to combine them on. However, currently, the `DBN` in the `class_size`
# dataset isn't completely unique. This is due to the `CORE COURSE (MS CORE and 9-12 ONLY)`
# and `CORE SUBJECT (MS CORE and 9-12 ONLY)` columns.

# In[15]:

import numpy as np

# Collapse duplicate DBNs by averaging the numeric columns per school.
class_size = class_size.groupby('DBN').agg(np.mean)
class_size.reset_index(inplace=True)
data['class_size'] = class_size
data['class_size'].head()

# Next, let's condense `demographics`. In this case, the only column that prevents a given
# DBN from being unique is `schoolyear`. We only want to select rows where schoolyear is
# 20112012. This will give us the most recent year of data, and also match our SAT results
# data

# In[16]:

most_recent_year = data['demographics']['schoolyear'] == 20112012
data['demographics'] = data['demographics'][most_recent_year]
data['demographics'].head()

# Finally, we'll need to condense the `graduation` data set. The `Demographic` and `Cohort`
# columns are what prevent DBN from being unique in the graduation data. A `Cohort` appears
# to refer to the year the data represents, and the `Demographic` appears to refer to a
# specific demographic group. In this case, we want to pick data from the most recent
# `Cohort` available, which is 2006. We also want data from the full cohort, so we'll only
# pick rows where `Demographic` is `Total Cohort`.
# In[17]:

# Keep only the most recent cohort (2006) and the rows covering the whole student body.
graduation = data['graduation']
is_2006_cohort = graduation['Cohort'] == '2006'
is_total_cohort = graduation['Demographic'] == 'Total Cohort'
data['graduation'] = graduation[is_2006_cohort & is_total_cohort]
data['graduation'].head()

# ## Converting AP Test Scores
#
# We're almost ready to combine all of the data sets. The only remaining thing to do is
# convert the [Advanced Placement (AP)](https://en.wikipedia.org/wiki/Advanced_Placement_exams)
# test scores from strings to numeric values. High school students take the AP exams before
# applying to college. There are several AP exams, each corresponding to a school subject.
# High school students who earn high scores may receive college credit.
#
# AP exams have a `1` to `5` scale; `3` or higher is a passing score. Many high school
# students take AP exams -- particularly those who attend academically challenging
# institutions. AP exams are much more rare in schools that lack funding or academic rigor.
#
# It will be interesting to find out whether AP exam scores are correlated with SAT scores
# across high schools. To determine this, we'll need to convert the AP exam scores in the
# `ap_2010` data set to numeric values first.
#
# There are three columns we'll need to convert:
#
# - `AP Test Takers`
# - `Total Exams Taken`
# - `Number of Exams with scores 3 4 or 5`

# In[18]:

# Note the trailing space in 'AP Test Takers ' -- it is present in the raw data.
cols = ['AP Test Takers ', 'Total Exams Taken', 'Number of Exams with scores 3 4 or 5']

for col in cols:
    data['ap_2010'][col] = pd.to_numeric(data['ap_2010'][col], errors='coerce')
    print(data['ap_2010'][col].dtype)

# ## Merging the data
#
# We will finally merge the datasets into one - `combined`.
# In[19]:

combined = data['sat_results']
# Left joins keep every school with SAT results, even if AP or graduation data is missing.
combined = combined.merge(data['ap_2010'], on='DBN', how='left')
combined = combined.merge(data['graduation'], on='DBN', how='left')
combined.head()

# In[20]:

combined.shape

# In[21]:

# These data sets have good DBN coverage, so inner joins keep the combined data clean.
for name in ['class_size', 'demographics', 'survey', 'hs_directory']:
    combined = combined.merge(data[name], on='DBN', how='inner')

combined.head()

# In[22]:

combined.shape

# We now have many columns with null (`NaN`) values. This is because we chose to do left
# joins, where some columns may not have had data. The data set also had some missing values
# to begin with. We will fill in the missing values in each column with the mean of the
# column.

# In[23]:

combined = combined.fillna(combined.mean())
# Non-numeric columns have no mean; fall back to 0 for anything still missing.
combined = combined.fillna(0)
combined.head()

# ## Adding a School District Column for Mapping
#
# The school district is just the first two characters of the `DBN` column. Let's extract
# them and store them in a new column - `school_dist`.

# In[24]:

def school_district(x):
    """Return the school district: the first two characters of a DBN string."""
    return x[:2]

combined['school_dist'] = combined['DBN'].apply(school_district)
combined['school_dist'].head()

# Now that we have cleaned and combined our data, we are ready to analyze it.
#
# ## Finding correlations
#
# There are several fields in `combined` that originally came from a
# [survey of parents, teachers, and students](https://data.cityofnewyork.us/Education/2011-NYC-School-Survey/mnz3-dyi8).
# Let's make a bar plot of the correlations between these fields and sat_score.
# In[25]:

survey_cols = ['rr_s', 'rr_t', 'rr_p', 'N_s', 'N_t', 'N_p',
               'saf_p_11', 'com_p_11', 'eng_p_11', 'aca_p_11',
               'saf_t_11', 'com_t_11', 'eng_t_11', 'aca_t_11',
               'saf_s_11', 'com_s_11', 'eng_s_11', 'aca_s_11',
               'saf_tot_11', 'com_tot_11', 'eng_tot_11', 'aca_tot_11']

correlations = combined.corr()
correlations = correlations['sat_score']
print(correlations)

# In[58]:

plt.style.use('ggplot')
# Reuse the correlation series computed above instead of recomputing the whole
# correlation matrix a second time.
correlations[survey_cols].plot.bar(figsize=(10, 10))
plt.title('Correlation between SAT score and Survey fields', y=1.03)  # fixed "Suvey" typo
plt.axhline()

# It looks like there is a strong positive correlation between the sat scores and:
#
# - N_s - Number of student respondents
# - N_t - Number of teacher respondents
# - N_p - Number of parent respondents
# - saf_s_11 - Safety and Respect score based on student responses
# - saf_t_11 - Safety and Respect score based on teacher responses
# - aca_s_11 - Academic expectations score based on student responses
# - saf_tot_11 - Safety and Respect total score
#
# The correlation between the way students and teachers perceive school safety and the sat
# results is understandable - it's difficult to teach and study in an unsafe environment.
# Let's dig into this relationship a bit further.
#
# ## Exploring Safety and SAT Scores

# In[87]:

combined.plot.scatter(x='saf_s_11', y='sat_score', figsize=(8, 5))
plt.title('Safety score vs SAT score', y=1.03)
plt.xlim(4, 10)

# The scatter plot above visualizes the positive correlation between the safety score of the
# schools and the SAT scores.
#
# ## Map out safety scores

# In[28]:

district_means = combined.groupby('school_dist').agg(np.mean)
district_means.reset_index(inplace=True)

# In[51]:

from mpl_toolkits.basemap import Basemap

# Bounding box covering the five boroughs of New York City.
m = Basemap(projection='merc',
            llcrnrlat=40.496044, urcrnrlat=40.915256,
            llcrnrlon=-74.255735, urcrnrlon=-73.700272,
            resolution='i')
m.drawmapboundary(fill_color='#85A6D9')
m.drawcoastlines(color='#6D5F47', linewidth=.4)
m.drawrivers(color='#6D5F47', linewidth=.4)
m.fillcontinents(color='white', lake_color='#85A6D9')

# latlon=True lets scatter take plain lon/lat lists instead of projected coordinates.
m.scatter(district_means['lon'].tolist(), district_means['lat'].tolist(),
          s=50, zorder=2, latlon=True,
          c=district_means["saf_s_11"], cmap="summer")
plt.colorbar()
plt.title('School safety scores by District', y=1.03)
plt.show()

# ## SAT scores by race
#
# There are a few columns that indicate the percentage of each race at a given school:
#
# - `white_per`
# - `asian_per`
# - `black_per`
# - `hispanic_per`
#
# By plotting out the correlations between these columns and `sat_score`, we can determine
# whether there are any racial differences in SAT performance.

# In[75]:

race = ['white_per', 'asian_per', 'black_per', 'hispanic_per']
combined.corr()['sat_score'][race].plot.barh(figsize=(8, 5))
plt.title('Correlation between race and SAT scores', y=1.03)
plt.xlim(-0.5, 0.8)
plt.axvline()

# It looks like there is a strong positive correlation between the SAT scores and the high
# percentage white and asian students. There is a negative correlation between the SAT
# scores and the high percentage black and hispanic students.

# In[76]:

combined.plot.scatter(x='hispanic_per', y='sat_score', figsize=(8, 5))
plt.title('% hispanic students vs SAT scores', y=1.03)
plt.xlim(0, 103)

# It seems that the schools whose students achieve greater SAT scores, have less than 20%
# hispanic students.
#
# Let's isolate the schools which have over 95% hispanic students.
# In[32]:

mostly_hispanic = combined['hispanic_per'] > 95
hispanic_schools = combined[mostly_hispanic]
hispanic_schools['SCHOOL NAME'].unique()

# - **Manhattan Bridges High School** is a public school in Manhattan, NY, with a
#   student-teacher ratio of 18 to 1*. The school caters to immigrants recently arrived from
#   Spanish-speaking countries.
# - **Washington Heights Expeditionary Learning School** is a public school in Manhattan, NY,
#   with a student-teacher ratio of 13 to 1. The school's students are predominantly Spanish
#   speaking Dominican American and/or born in the Dominican Republic.
# - **Gregorio Luperon High School for Science and Mathematics** is a public school in
#   Manhattan, NY, with a student-teacher ratio of 19 to 1. The school serves
#   Spanish-speaking students who are new to this country
# - **Academy for Language and Technology** is a public school in the Bronx, NY, with a
#   student-teacher ratio of 13 to 1. The school offers new immigrants a safe, orderly
#   environment and the chance to learn computer programming and maintenance.
# - **International School for Liberal Arts** is a public school in the Bronx, NY, with a
#   student-teacher ratio of 13 to 1. It is designed to offer Spanish-speaking teens a gentle
#   transition to English.
# - **Pan American International High School at Monroe** is a public school in the Bronx, NY,
#   with a student-teacher ratio of 15 to 1. It offers new immigrants -- including many
#   unaccompanied minors -- the chance to learn English and complete high school.
# - **Multicultural High School** is a public school located in Brooklyn, NY, with a
#   student-teacher ratio of 10 to 1. It serves Spanish-speaking students who have been in
#   the United States for less than three years.
# - **Pan American International High School** is a public school located in Elmhurst, NY,
#   with a student-teacher ratio of 11 to 1. Students there all speak Spanish and have been
#   in the United States for less than four years.
#
# All in all, all eight schools have predominantly immigrant students.
# The fact that English is not the students' first language might have an effect on their
# performance on the SAT.
# _________
# *The national average public school student:teacher ratio is approximately 16:1, Source:
# [Public School Review](https://www.publicschoolreview.com/average-student-teacher-ratio-stats/national-data)
#
# -------
# The above school information was taken from the following websites:
#
# - [Inside Schools](https://insideschools.org/)
# - [Niche](https://www.niche.com/?ref=k12)
#
# ## Exploring Gender and SAT Scores
#
# There are two columns that indicate the percentage of each gender at a school:
#
# - `male_per`
# - `female_per`
#
# We can plot out the correlations between each percentage and `sat_score`.

# In[63]:

gender = ['male_per', 'female_per']
combined.corr()['sat_score'][gender].plot.bar(figsize=(8, 5), rot=360)
plt.axhline()
plt.title('Gender vs SAT scores', y=1.03)

# There is a positive correlation between the high percent female students and SAT scores
# and a negative correlation between high percent male students and SAT scores.

# In[86]:

combined.plot.scatter(x='female_per', y='sat_score', figsize=(8, 5))
plt.title('% Female students vs SAT score', y=1.03)
plt.xlim(-1, 103)

# It is interesting to see that the scatter plot **does not** confirm the correlation.
#
# Let's isolate the schools with over 60% female students and high SAT scores:

# In[35]:

mostly_female = combined['female_per'] > 60
high_sat = combined['sat_score'] > 1700
female_high = combined[mostly_female & high_sat]
female_high['SCHOOL NAME'].unique()

# - **Bard High School Early College** is a public school in NY with a student-teacher ratio
#   of 17 to 1. It offers small class sizes, excellent teaching, and the promise of two years
#   of college credit for free.
# - **Eleanor Roosevelt High School** is a public school in NY with a student-teacher ratio
#   of 17 to 1. It offers a traditional college-prep curriculum.
# - **Beacon High School** is a public school in Manhattan, NY with a student-teacher ratio
#   of 23 to 1.
# It is a selective college-preparatory school.
# - **Fiorello H. Laguardia High School of Music & Art and Performing Arts** is a public
#   school located in New York, NY with a student-teacher ratio of 20 to 1. It was founded as
#   a way to give aspiring actors, dancers, and singers the technical skills they need to
#   become professional performers and the academic skills they need in whatever career they
#   choose. Students receive first-rate training from professional studios and companies,
#   including the Ailey School for dance, Rosie's Theater Kids for musical theater, the
#   National Chorale for voice and Waterwell for drama.
# - **Townsend Harris High School** is a public school in Queens, NY, with a student-teacher
#   ratio of 20 to 1. It combines rigorous academics with a well-rounded high school
#   experience.
#
# All in all, the above schools have an above average students to teacher ratio. All are
# focused on preparing students for college which might be the reason why the SAT scores of
# their students are higher.
#
# ## Exploring AP Scores vs. SAT Scores
#
# In the U.S., high school students take
# [Advanced Placement (AP)](https://en.wikipedia.org/wiki/Advanced_Placement_exams) exams to
# earn college credit. There are AP exams for many different subjects.
#
# It makes sense that the number of students at a school who took AP exams would be highly
# correlated with the school's SAT scores. Let's explore this relationship. We'll look at the
# percentage of students in each school who took at least one AP exam.

# In[65]:

# Fraction of enrolled students who took at least one AP exam.
combined['ap_per'] = combined['AP Test Takers '] / combined['total_enrollment']

combined.plot.scatter(x='ap_per', y='sat_score')
plt.xlim(-0.1, 1)
plt.title('% AP test takers vs SAT score', y=1.03)

# It seems that the correlation between the percent students who take the AP test and the
# SAT score isn't that high.
#
# ## Class size vs SAT score
#
# Let's see whether there's a correlation between class size and SAT scores.
# In[88]:

combined.plot.scatter(x='AVERAGE CLASS SIZE', y='sat_score', figsize=(8, 5))
plt.title('Class size vs SAT score', y=1.03)

# It seems that there is a slight correlation between the average class size and the SAT
# score. It is interesting to see that the best SAT scores were achieved in average class
# sizes of 20-35. It's also interesting to see that lower class size does not necessarily
# mean better SAT results.
#
# ## Neighborhoods with best schools

# In[67]:

best = combined[combined['sat_score'] > 1700]
best['boro'].value_counts(normalize=True)

# It looks like 46% of the schools whose students achieve best results on the SAT are
# located in Manhattan.
#
# ## School scores based on SAT scores
#
# Based on the [College Board Review](https://reports.collegeboard.org/pdf/2019-total-group-sat-suite-assessments-annual-report.pdf),
# we will split the schools in three categories based on the average SAT score of their
# students:
#
# - Below average: SAT score below 1000
# - Average: SAT score between 1000 - 1190
# - Above average: SAT score between 1190 - 1400
# - Overachievers: SAT score over 1400

# In[73]:

def sat_scores(x):
    """Map an average SAT score to the performance band it falls in.

    Bands (per the College Board categories listed above):
      x < 1000          -> "Below average"
      1000 <= x <= 1190 -> "Average"
      1190 < x <= 1400  -> "Above average"
      x > 1400          -> "Overachievers"

    Returns None for values matching no band (e.g. NaN), as before.
    """
    # Fixed boundary: the original used `x <= 1000`, which put a score of
    # exactly 1000 in "Below average" even though the stated bands say
    # scores *below* 1000 are below average and 1000-1190 is average.
    if x < 1000:
        return "Below average"
    elif 1000 <= x <= 1190:
        return "Average"
    elif 1190 < x <= 1400:
        return "Above average"
    elif x > 1400:
        return "Overachievers"

# In[74]:

combined['school_score'] = combined['sat_score'].apply(sat_scores)
combined['school_score'].value_counts(normalize=True) * 100

# It looks like only 3% of the schools perform below average at the SAT and about 11.5%
# exceed in their average scores.
#
# ## Parents vs Teachers vs Students
#
# Let's explore how the parents, teachers and students perceive the school environment.
# We will focus on those columns:
#
# - Safety score - `saf_p_11`, `saf_t_11`, `saf_s_11`
# - Communication score - `com_p_11`, `com_t_11`, `com_s_11`
# - Engagement score - `eng_p_11`, `eng_t_11`, `eng_s_11`
# - Academic expectation score - `aca_p_11`, `aca_t_11`, `aca_s_11`
#
# We will calculate the mean for each category for each group and then plot them to see if
# there are any major differences in the way each group perceives the school environment.

# In[124]:

parents = ['saf_p_11', 'com_p_11', 'eng_p_11', 'aca_p_11']
teachers = ['saf_t_11', 'com_t_11', 'eng_t_11', 'aca_t_11']
students = ['saf_s_11', 'com_s_11', 'eng_s_11', 'aca_s_11']

# Mean rating per category (safety, communication, engagement, academics) for each group.
parents_means = [combined[col].mean() for col in parents]
teachers_means = [combined[col].mean() for col in teachers]
students_means = [combined[col].mean() for col in students]

# In[162]:

# NOTE: the original bound this array to the name `data`, silently clobbering
# the dict of datasets loaded at the top of the script; use a distinct name.
group_means = np.array([parents_means, teachers_means, students_means])
length = len(group_means)
width = 0.2
x = np.arange(length)
x_labels = ['Parents', 'Teachers', 'Students']

fig, ax = plt.subplots(figsize=(10, 6))
# One bar cluster per respondent group; one bar per survey category.
ax.bar(x - width, group_means[:, 0], width, color=(162/255, 200/255, 236/255), label='Safety')
ax.bar(x, group_means[:, 1], width, color=(200/255, 82/255, 0/255), label='Communication')
ax.bar(x + width, group_means[:, 2], width, color=(255/255, 181/255, 121/255), label='Engagement')
ax.bar(x + (2 * width), group_means[:, 3], width, color=(137/255, 137/255, 137/255),
       label='Academic Expectations')

plt.legend(loc='upper center')
ax.set_xticks(x + width)
ax.set_xticklabels(x_labels)
plt.ylim(0, 12)
plt.title('Parents vs Teachers vs Students', y=1.03)
plt.show()

# We see that on average, parents rate the school environment higher than students and
# teachers do, whereas Students and Teachers have similar ratings. This might be due to the
# fact that parents do not have first hand experience of the schools.
#
# ## Conclusion
#
# In this project we analyzed data on the SAT scores of students in NYC schools.
# We found out that the more safe students and teachers perceive the school to be, the better
# the SAT scores. We also found out that schools where the majority of students do not have
# English as a first language and/or have recently moved to the US tend to achieve lower scores.

# In[ ]:

# In[ ]: