import pandas as pd
import numpy
import re
data_files = [
"ap_2010.csv",
"class_size.csv",
"demographics.csv",
"graduation.csv",
"hs_directory.csv",
"sat_results.csv"
]
data = {}
for f in data_files:
d = pd.read_csv("{0}".format(f))
data[f.replace(".csv", "")] = d
all_survey = pd.read_csv("survey_all.txt", delimiter="\t", encoding='windows-1252')
d75_survey = pd.read_csv("survey_d75.txt", delimiter="\t", encoding='windows-1252')
survey = pd.concat([all_survey, d75_survey], axis=0)
survey["DBN"] = survey["dbn"]
survey_fields = [
"DBN",
"rr_s",
"rr_t",
"rr_p",
"N_s",
"N_t",
"N_p",
"saf_p_11",
"com_p_11",
"eng_p_11",
"aca_p_11",
"saf_t_11",
"com_t_11",
"eng_t_11",
"aca_t_11",
"saf_s_11",
"com_s_11",
"eng_s_11",
"aca_s_11",
"saf_tot_11",
"com_tot_11",
"eng_tot_11",
"aca_tot_11",
]
survey = survey.loc[:,survey_fields]
data["survey"] = survey
data["hs_directory"]["DBN"] = data["hs_directory"]["dbn"]
def pad_csd(num):
string_representation = str(num)
if len(string_representation) > 1:
return string_representation
else:
return "0" + string_representation
data["class_size"]["padded_csd"] = data["class_size"]["CSD"].apply(pad_csd)
data["class_size"]["DBN"] = data["class_size"]["padded_csd"] + data["class_size"]["SCHOOL CODE"]
cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']
for c in cols:
data["sat_results"][c] = pd.to_numeric(data["sat_results"][c], errors="coerce")
data['sat_results']['sat_score'] = data['sat_results'][cols[0]] + data['sat_results'][cols[1]] + data['sat_results'][cols[2]]
def find_lat(loc):
coords = re.findall("\(.+, .+\)", loc)
lat = coords[0].split(",")[0].replace("(", "")
return lat
def find_lon(loc):
coords = re.findall("\(.+, .+\)", loc)
lon = coords[0].split(",")[1].replace(")", "").strip()
return lon
data["hs_directory"]["lat"] = data["hs_directory"]["Location 1"].apply(find_lat)
data["hs_directory"]["lon"] = data["hs_directory"]["Location 1"].apply(find_lon)
data["hs_directory"]["lat"] = pd.to_numeric(data["hs_directory"]["lat"], errors="coerce")
data["hs_directory"]["lon"] = pd.to_numeric(data["hs_directory"]["lon"], errors="coerce")
class_size = data["class_size"]
class_size = class_size[class_size["GRADE "] == "09-12"]
class_size = class_size[class_size["PROGRAM TYPE"] == "GEN ED"]
class_size = class_size.groupby("DBN").agg(numpy.mean)
class_size.reset_index(inplace=True)
data["class_size"] = class_size
data["demographics"] = data["demographics"][data["demographics"]["schoolyear"] == 20112012]
data["graduation"] = data["graduation"][data["graduation"]["Cohort"] == "2006"]
data["graduation"] = data["graduation"][data["graduation"]["Demographic"] == "Total Cohort"]
cols = ['AP Test Takers ', 'Total Exams Taken', 'Number of Exams with scores 3 4 or 5']
for col in cols:
data["ap_2010"][col] = pd.to_numeric(data["ap_2010"][col], errors="coerce")
combined = data["sat_results"]
combined = combined.merge(data["ap_2010"], on="DBN", how="left")
combined = combined.merge(data["graduation"], on="DBN", how="left")
to_merge = ["class_size", "demographics", "survey", "hs_directory"]
for m in to_merge:
combined = combined.merge(data[m], on="DBN", how="inner")
combined = combined.fillna(combined.mean())
combined = combined.fillna(0)
def get_first_two_chars(dbn):
return dbn[0:2]
combined["school_dist"] = combined["DBN"].apply(get_first_two_chars)
correlations = combined.corr()
correlations = correlations["sat_score"]
print(correlations)
SAT Critical Reading Avg. Score 0.986820 SAT Math Avg. Score 0.972643 SAT Writing Avg. Score 0.987771 sat_score 1.000000 AP Test Takers 0.523140 ... priority08 NaN priority09 NaN priority10 NaN lat -0.121029 lon -0.132222 Name: sat_score, Length: 67, dtype: float64
# Remove DBN since it's a unique identifier, not a useful numerical value for correlation.
survey_fields.remove("DBN")
import matplotlib.pyplot as plt
%matplotlib inline
## plotting correlations among sat score and survey columns
combined.corr()['sat_score'][survey_fields].plot(kind='bar')
<AxesSubplot:>
So we now analized eventual correlations among the sat score and the columns from the survey of parents, teachers and students. Organizing the results in a bar graph, it is clear that we have to clarify what every column of the survey really means; for this reason, we are going to check further informations from this link
We can see that there are some strong positive correlations between the sat score and the safety and respect score based both on teacher and student response. Another positive correlations is between the aforementioned sat score and the academic expactations based on student response.
We're going to dig into this relationships a bit more.
## scatter plot of saf_s_11 and sat_score
plt.scatter(combined['saf_s_11'], combined['sat_score'])
<matplotlib.collections.PathCollection at 0x1c75c5fe490>
## scatter plot of aca_s_11 and sat_score
plt.scatter(combined['aca_s_11'], combined['sat_score'])
<matplotlib.collections.PathCollection at 0x1c75c645c70>
In the first graph we can see that there is a cluster for values of safety lower than 8.0 and for sat scores lower than 1600. The rest of the graph seems to represent more granular, less correlated results. For this reason, the high positive correlation could be inficiated by this specific values.
We're going to plot again and see what happens
## create a new df with specific values
lower_safety = combined[combined['saf_s_11'] < 8.0]
lower_safety = combined[combined['sat_score'] < 1600]
## scatter plot of saf_s_11 and sat_score
plt.scatter(lower_safety['saf_s_11'], lower_safety['sat_score'])
<matplotlib.collections.PathCollection at 0x1c75c6aca60>
lower_safety.corr()['saf_s_11']['sat_score']
0.13493741434206447
Quite a low r value indeed. We can conclude that among those two values there is a positive correlation (except for the cluster we spoke before). The more the safety score is high (safety of the environment, school and neighboor), the more the sat scores are high. There are a few amount of schools that share both high values on safety score and on sat score.
In general, we can affirm that no school with a value of safety lower than 6.5 has a sat score higher than 1500.
## create a df indexed by borough and calculate avg safety score
boros = combined.groupby('boro').agg(numpy.mean)['saf_s_11']
print(boros)
boro Bronx 6.606577 Brooklyn 6.370755 Manhattan 6.831370 Queens 6.721875 Staten Island 6.530000 Name: saf_s_11, dtype: float64
The average safety score for the different boroughs is in the same range; however it seems that Manhattan and Queens has the highest values while Brooklyn has the lowest.
In this dataset there are also few columns that indicate the percentage of each race at given school. We now are going to check if there's any correlation between these columns and sat score
race_perc = ['white_per', 'asian_per', 'black_per', 'hispanic_per']
## plotting correlations among sat_score and race cols
combined.corr()['sat_score'][race_perc].plot(kind='bar')
<AxesSubplot:>
By analizing these correlations, we can see now that there is an extremely strong positive correlation between high sat scores and high percentage of white students in the classes and a negative correlation between the same sat scores and the presence of students belonging to hispanic ethnicity (we mean that the lowest is the sat score, the higheste is the percentage of hispanic students).
We are going to visualize this correlation with a scatter plot to make things clearer.
## plotting a scatter chart for sat_score and hispanic_per
plt.scatter(combined['hispanic_per'], combined['sat_score'])
<matplotlib.collections.PathCollection at 0x1c75c771e20>
By visualizing the scatter plot we can see that there is only one school that scored a sat score greater than 2000. That same school show a very small percentage of hispanic students. In general we can see that all the schools with a percentage of hispanic students super to 30% register sat scores not superior to 1400/1500; in the schools where that percentage is the highest are registered the lowest scores for SAT tests.
At this point we're going to check the specific schools where the percentage is greater than 95% and the schools where the percentage is less than 10% and an average SAT score greater than 1800
## checking schools with hispanic_per > 95%
combined[combined['hispanic_per'] > 95]['SCHOOL NAME']
44 MANHATTAN BRIDGES HIGH SCHOOL 82 WASHINGTON HEIGHTS EXPEDITIONARY LEARNING SCHOOL 89 GREGORIO LUPERON HIGH SCHOOL FOR SCIENCE AND M... 125 ACADEMY FOR LANGUAGE AND TECHNOLOGY 141 INTERNATIONAL SCHOOL FOR LIBERAL ARTS 176 PAN AMERICAN INTERNATIONAL HIGH SCHOOL AT MONROE 253 MULTICULTURAL HIGH SCHOOL 286 PAN AMERICAN INTERNATIONAL HIGH SCHOOL Name: SCHOOL NAME, dtype: object
By doing our search using Google, we noticed that all these schools above are specialized and focus their core into taking care of students coming from hispanophone countries. Most of these schools approach their students as english-learners students. That's the explanation for the extremely high percentage of hispanic students.
For what we've said before we'll now check for the school that has the highest scores in the SAT and that has less than 10% of hispanic students.
## create a df that has less than 10% hispanic students and a SAT score > 1800
hisp_10_perc = combined[combined['hispanic_per'] < 10]
hisp_10_perc = hisp_10_perc[hisp_10_perc['sat_score'] > 1800]
hisp_10_perc['SCHOOL NAME']
37 STUYVESANT HIGH SCHOOL 151 BRONX HIGH SCHOOL OF SCIENCE 187 BROOKLYN TECHNICAL HIGH SCHOOL 327 QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK CO... 356 STATEN ISLAND TECHNICAL HIGH SCHOOL Name: SCHOOL NAME, dtype: object
By doing some search on Google we find that the schools above are among the most exclusive and elitist public high schools in the entire country. Particularly fonded into scientific subjects, some of them had, among their alumni, even Nobel Prize winners. These schools are mostly attended by asian students who has to pass an entrance exam.
gender_cols = ['male_per', 'female_per']
## bar plot of the correlations between gender and sat_score
combined.corr()['sat_score'][gender_cols].plot(kind='bar')
<AxesSubplot:>
Not very strong correlations here. The r factor shows that gender doesn't affect, both positively or negatively, the results of the SAT tests that much. We could however suggest that a school with an higher percentage of female students is more likely to score higher results in the SAT tests.
We're going to make a scatter plot to show this correlation more clearly.
## make a scatter plot
plt.scatter(combined['female_per'], combined['sat_score'])
<matplotlib.collections.PathCollection at 0x1c75c82c190>
As we said there isn't any specifical correlation among the percentage of female students and the result in the tests. Still we can see a cluster of schools that has an high percentage of female students (60/80%) and high scores in the tests.
## find the name of those specific schools
combined[(combined['female_per'] > 60) & (combined['sat_score'] > 1700)]['SCHOOL NAME']
5 BARD HIGH SCHOOL EARLY COLLEGE 26 ELEANOR ROOSEVELT HIGH SCHOOL 60 BEACON HIGH SCHOOL 61 FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & A... 302 TOWNSEND HARRIS HIGH SCHOOL Name: SCHOOL NAME, dtype: object
These schools are focused on liberal arts and a particularly free approach to the curriculum with a strong attention to art and music subjects.
## calculate the percentage of AP Test Takers among the total students enrolled
combined['ap_per'] = round((combined['AP Test Takers '] / combined['total_enrollment']) * 100, 2)
combined['ap_per']
0 30.58 1 9.90 2 3.18 3 35.16 4 15.81 ... 358 7.76 359 29.73 360 19.10 361 2.66 362 7.36 Name: ap_per, Length: 363, dtype: float64
## make a scatter plot between ap_per and sat_score
plt.scatter(combined['ap_per'], combined['sat_score'])
<matplotlib.collections.PathCollection at 0x1c75c883460>
From the scatter plot we can say that there is not a clear strong correlation between percentage of students taking AP exams and results in the SAT tests. We can see that there is a quite big cluster of schools that has a small percentage of AP takers and low SAT scores, otherwise those results are shared by school where the percentage of AP takers is much higher. However, just to be sure, we'll calculate the exact r factor
## plotting the r factor into a bar plot
combined.corr()['sat_score']['ap_per']
0.05717292460813363
Indeed an r factor almost equal to 0.
## dropping double columns
combined.drop(columns=['SchoolName'],inplace=True)
plt.scatter(combined['AVERAGE CLASS SIZE'], combined['sat_score'])
<matplotlib.collections.PathCollection at 0x1c75c8d0940>
combined.corr()['AVERAGE CLASS SIZE']['sat_score']
0.3810143308095523
As we could imagine, the bigger is the class size the higher is the possibility to score high scores in the SAT test. According to the graph, there are no schools in wich the average size of the classes is less than 20 mq that had scores higher than 1400.
What we're going to do now is to check which is the best neighborhood according to the average score in the SAT test.
## grouping the df by 'boro' column
combined.groupby('boro').agg(numpy.mean)['sat_score']
boro Bronx 1157.598203 Brooklyn 1181.364461 Manhattan 1278.331410 Queens 1286.753032 Staten Island 1382.500000 Name: sat_score, dtype: float64
The neighborhood where we can find the best schools is Staten Island.
Now we're going to establish a ranking for each school based on the results of the safety survey and the SAT scores.
## calculating mean for safety surveys and applying to a new column
combined['SAFETY MEAN'] = round((combined['saf_t_11'] + combined['saf_p_11'] + combined['saf_s_11']) / 3, 2)
combined['SAFETY MEAN'].value_counts(ascending=True)
8.60 1 5.90 1 8.20 1 6.10 1 8.30 1 .. 7.33 9 7.40 10 7.47 10 7.37 10 7.23 17 Name: SAFETY MEAN, Length: 82, dtype: int64
## calculating a coefficient for ranking
combined['K'] = round((combined['SAFETY MEAN'] * combined['sat_score']) / 100, 2)
combined['K'].value_counts(bins=5)
(81.172, 103.014] 179 (59.22, 81.172] 127 (103.014, 124.856] 35 (124.856, 146.698] 15 (146.698, 168.54] 7 Name: K, dtype: int64
## create a function returning a ranking
def ranking(x):
if x >= 59.23 and x <= 81.172:
return 'BB'
elif x >= 81.173 and x <= 103.013:
return 'BBB'
elif x >= 103.014 and x <= 124.856:
return 'A'
elif x >= 124.857 and x <= 146.697:
return 'AA'
else:
return 'AAA'
##applying function and creating new column RANKING
combined['RANKING'] = combined['K'].apply(lambda x: ranking(x))
combined['RANKING']
0 BB 1 BB 2 BBB 3 BBB 4 AA ... 358 BB 359 BB 360 BB 361 A 362 BB Name: RANKING, Length: 363, dtype: object
At this point, by adding a new dataframe, we'll analyze which borough is the best according to the quality of the schools and the cost of the apartments. We'll use a dataframe found here.
ny_sales = pd.read_csv('nyc-rolling-sales.csv')
ny_sales.head()
Unnamed: 0 | BOROUGH | NEIGHBORHOOD | BUILDING CLASS CATEGORY | TAX CLASS AT PRESENT | BLOCK | LOT | EASE-MENT | BUILDING CLASS AT PRESENT | ADDRESS | ... | RESIDENTIAL UNITS | COMMERCIAL UNITS | TOTAL UNITS | LAND SQUARE FEET | GROSS SQUARE FEET | YEAR BUILT | TAX CLASS AT TIME OF SALE | BUILDING CLASS AT TIME OF SALE | SALE PRICE | SALE DATE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4 | 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2A | 392 | 6 | C2 | 153 AVENUE B | ... | 5 | 0 | 5 | 1633 | 6440 | 1900 | 2 | C2 | 6625000 | 2017-07-19 00:00:00 | |
1 | 5 | 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2 | 399 | 26 | C7 | 234 EAST 4TH STREET | ... | 28 | 3 | 31 | 4616 | 18690 | 1900 | 2 | C7 | - | 2016-12-14 00:00:00 | |
2 | 6 | 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2 | 399 | 39 | C7 | 197 EAST 3RD STREET | ... | 16 | 1 | 17 | 2212 | 7803 | 1900 | 2 | C7 | - | 2016-12-09 00:00:00 | |
3 | 7 | 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2B | 402 | 21 | C4 | 154 EAST 7TH STREET | ... | 10 | 0 | 10 | 2272 | 6794 | 1913 | 2 | C4 | 3936272 | 2016-09-23 00:00:00 | |
4 | 8 | 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2A | 404 | 55 | C2 | 301 EAST 10TH STREET | ... | 6 | 0 | 6 | 2369 | 4615 | 1900 | 2 | C2 | 8000000 | 2016-11-17 00:00:00 |
5 rows × 22 columns
ny_sales.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 84548 entries, 0 to 84547 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 84548 non-null int64 1 BOROUGH 84548 non-null int64 2 NEIGHBORHOOD 84548 non-null object 3 BUILDING CLASS CATEGORY 84548 non-null object 4 TAX CLASS AT PRESENT 84548 non-null object 5 BLOCK 84548 non-null int64 6 LOT 84548 non-null int64 7 EASE-MENT 84548 non-null object 8 BUILDING CLASS AT PRESENT 84548 non-null object 9 ADDRESS 84548 non-null object 10 APARTMENT NUMBER 84548 non-null object 11 ZIP CODE 84548 non-null int64 12 RESIDENTIAL UNITS 84548 non-null int64 13 COMMERCIAL UNITS 84548 non-null int64 14 TOTAL UNITS 84548 non-null int64 15 LAND SQUARE FEET 84548 non-null object 16 GROSS SQUARE FEET 84548 non-null object 17 YEAR BUILT 84548 non-null int64 18 TAX CLASS AT TIME OF SALE 84548 non-null int64 19 BUILDING CLASS AT TIME OF SALE 84548 non-null object 20 SALE PRICE 84548 non-null object 21 SALE DATE 84548 non-null object dtypes: int64(10), object(12) memory usage: 14.2+ MB
First thing first, we'll rename the values of the BOROUGH column. From digits to the values present in the combined data frame.
num_to_name = {1:'Manhattan',2:'Bronx',3:'Brooklyn',4:'Queens',5:'Staten Island'}
ny_sales['BOROUGH'] = ny_sales['BOROUGH'].map(num_to_name)
ny_sales['SALE PRICE'].value_counts()
- 14561 0 10228 10 766 450000 427 550000 416 ... 3701338 1 366500 1 41033 1 207072 1 726059 1 Name: SALE PRICE, Length: 10008, dtype: int64
## replace '-' values with '0'
ny_sales['SALE PRICE'] = ny_sales['SALE PRICE'].astype(str).str.replace('-','0').astype('int64')
ny_sales[ny_sales['SALE PRICE'] <= 5000]['SALE PRICE'].value_counts()
0 24789 10 766 1 134 100 90 1000 77 500 67 5000 32 2000 14 3000 9 4000 7 2500 5 20 4 3500 4 2 3 3 2 2800 1 501 1 2400 1 2352 1 210 1 19 1 2096 1 5 1 373 1 1200 1 3774 1 3012 1 4500 1 4063 1 750 1 1500 1 300 1 1275 1 315 1 1162 1 250 1 2200 1 3001 1 600 1 200 1 8 1 3750 1 1175 1 1110 1 2510 1 4900 1 Name: SALE PRICE, dtype: int64
By analizing the SALE PRICE column, we notice that many sales occur with a nonsensically small dollar amount: $0 most commonly. These sales are actually transfers of deeds between parties: for example, parents transferring ownership to their home to a child after moving out for retirement.
For the fact that sale price is a key value for our search and also because there is a huge amount of values that holds these unrealistical prices, we can't simply drop those kind of rows in which are contained. We'll, instead, replace those values with the mean.
We'll replace all the values lower than 5000 $
## calculating mean price for values higher than 5000
mean_price = round(ny_sales[ny_sales['SALE PRICE'] >= 5000]['SALE PRICE'].mean(),0)
## replace values <= 5000
ny_sales['SALE PRICE'] = ny_sales['SALE PRICE'].mask(ny_sales['SALE PRICE'] <= 5000, mean_price)
pd.pivot_table(ny_sales, values='SALE PRICE', index='BOROUGH').plot(kind='barh')
plt.title('Mean prices for borough')
Text(0.5, 1.0, 'Mean prices for borough')
At this point we know which is the most expensive borough in New York (Manhattan). The other four share more or less the same range of price. What we'll do now is to find which borough has the best schools and is the most economic for buying flats. For doing this, we're going to reduce the first dataframe to the columns that are significant for our analysis and add to it a new column MEAN PRICE.
## reducing first df
combined_final = combined[['DBN','SCHOOL NAME','boro','Num of SAT Test Takers',
'SAT Critical Reading Avg. Score', 'SAT Math Avg. Score',
'SAT Writing Avg. Score', 'sat_score','bus', 'subway','Location 1', 'lat',
'lon', 'SAFETY MEAN','RANKING']].copy()
## rename boro column to BOROUGH
combined_final.rename(columns={'boro':'BOROUGH'},inplace=True)
## create a function to calculate mean price
def mean_price(b):
if b == 'Manhattan':
return round(ny_sales[ny_sales['BOROUGH'] == 'Manhattan']['SALE PRICE'].mean(), 0)
elif b == 'Bronx':
return round(ny_sales[ny_sales['BOROUGH'] == 'Bronx']['SALE PRICE'].mean(), 0)
elif b == 'Brooklyn':
return round(ny_sales[ny_sales['BOROUGH'] == 'Brooklyn']['SALE PRICE'].mean(), 0)
elif b == 'Queens':
return round(ny_sales[ny_sales['BOROUGH'] == 'Queens']['SALE PRICE'].mean(), 0)
else:
return round(ny_sales[ny_sales['BOROUGH'] == 'Staten Island']['SALE PRICE'].mean(), 0)
combined_final['MEAN PRICE'] = combined_final['BOROUGH'].apply(lambda x: mean_price(x))
manhattan = combined_final[combined_final['BOROUGH'] == 'Manhattan']['RANKING'].value_counts()
bronx = combined_final[combined_final['BOROUGH'] == 'Bronx']['RANKING'].value_counts()
brooklyn = combined_final[combined_final['BOROUGH'] == 'Brooklyn']['RANKING'].value_counts()
queens = combined_final[combined_final['BOROUGH'] == 'Queens']['RANKING'].value_counts()
staten_island = combined_final[combined_final['BOROUGH'] == 'Staten Island']['RANKING'].value_counts()
fig, (ax1,ax2,ax3,ax4,ax5) = plt.subplots(ncols=5,figsize=(16,4))
axes = [ax1,ax2,ax3,ax4]
for ax in axes:
ax.set_xticks([0,10,20,30,40,50])
manhattan.plot(kind='barh',ax=ax1,'tab:orange')
ax1.set_title('Manhattan')
bronx.plot(kind='barh',ax=ax2)
ax2.set_title('Bronx')
brooklyn.plot(kind='barh',ax=ax3)
ax3.set_title('Brooklyn')
queens.plot(kind='barh',ax=ax4)
ax4.set_title('Queens')
staten_island.plot(kind='barh',ax=ax5)
ax5.set_title('Staten Island')
File "<ipython-input-145-824e03a44d67>", line 7 manhattan.plot(kind='barh',ax=ax1,'tab:orange') ^ SyntaxError: positional argument follows keyword argument