Read in the data¶

In [1]:

import pandas as pd
import numpy
import re

data_files = [
    "ap_2010.csv",
    "class_size.csv",
    "demographics.csv",
    "graduation.csv",
    "hs_directory.csv",
    "sat_results.csv"
]

data = {}

for f in data_files:
    d = pd.read_csv("{0}".format(f))
    data[f.replace(".csv", "")] = d

Read in the surveys¶

In [2]:

all_survey = pd.read_csv("survey_all.txt", delimiter="\t", encoding='windows-1252')
d75_survey = pd.read_csv("survey_d75.txt", delimiter="\t", encoding='windows-1252')
survey = pd.concat([all_survey, d75_survey], axis=0)

survey["DBN"] = survey["dbn"]

survey_fields = [
    "DBN", 
    "rr_s", 
    "rr_t", 
    "rr_p", 
    "N_s", 
    "N_t", 
    "N_p", 
    "saf_p_11", 
    "com_p_11", 
    "eng_p_11", 
    "aca_p_11", 
    "saf_t_11", 
    "com_t_11", 
    "eng_t_11", 
    "aca_t_11", 
    "saf_s_11", 
    "com_s_11", 
    "eng_s_11", 
    "aca_s_11", 
    "saf_tot_11", 
    "com_tot_11", 
    "eng_tot_11", 
    "aca_tot_11",
]
survey = survey.loc[:,survey_fields]
data["survey"] = survey

Add DBN columns¶

In [3]:

data["hs_directory"]["DBN"] = data["hs_directory"]["dbn"]

def pad_csd(num):
    string_representation = str(num)
    if len(string_representation) > 1:
        return string_representation
    else:
        return "0" + string_representation
    
data["class_size"]["padded_csd"] = data["class_size"]["CSD"].apply(pad_csd)
data["class_size"]["DBN"] = data["class_size"]["padded_csd"] + data["class_size"]["SCHOOL CODE"]

Convert columns to numeric¶

In [4]:

cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']
for c in cols:
    data["sat_results"][c] = pd.to_numeric(data["sat_results"][c], errors="coerce")

data['sat_results']['sat_score'] = data['sat_results'][cols[0]] + data['sat_results'][cols[1]] + data['sat_results'][cols[2]]

def find_lat(loc):
    coords = re.findall("\(.+, .+\)", loc)
    lat = coords[0].split(",")[0].replace("(", "")
    return lat

def find_lon(loc):
    coords = re.findall("\(.+, .+\)", loc)
    lon = coords[0].split(",")[1].replace(")", "").strip()
    return lon

data["hs_directory"]["lat"] = data["hs_directory"]["Location 1"].apply(find_lat)
data["hs_directory"]["lon"] = data["hs_directory"]["Location 1"].apply(find_lon)

data["hs_directory"]["lat"] = pd.to_numeric(data["hs_directory"]["lat"], errors="coerce")
data["hs_directory"]["lon"] = pd.to_numeric(data["hs_directory"]["lon"], errors="coerce")

Condense datasets¶

In [5]:

class_size = data["class_size"]
class_size = class_size[class_size["GRADE "] == "09-12"]
class_size = class_size[class_size["PROGRAM TYPE"] == "GEN ED"]

class_size = class_size.groupby("DBN").agg(numpy.mean)
class_size.reset_index(inplace=True)
data["class_size"] = class_size

data["demographics"] = data["demographics"][data["demographics"]["schoolyear"] == 20112012]

data["graduation"] = data["graduation"][data["graduation"]["Cohort"] == "2006"]
data["graduation"] = data["graduation"][data["graduation"]["Demographic"] == "Total Cohort"]

Convert AP scores to numeric¶

In [6]:

cols = ['AP Test Takers ', 'Total Exams Taken', 'Number of Exams with scores 3 4 or 5']

for col in cols:
    data["ap_2010"][col] = pd.to_numeric(data["ap_2010"][col], errors="coerce")

Combine the datasets¶

In [7]:

combined = data["sat_results"]

combined = combined.merge(data["ap_2010"], on="DBN", how="left")
combined = combined.merge(data["graduation"], on="DBN", how="left")

to_merge = ["class_size", "demographics", "survey", "hs_directory"]

for m in to_merge:
    combined = combined.merge(data[m], on="DBN", how="inner")

combined = combined.fillna(combined.mean())
combined = combined.fillna(0)

Add a school district column for mapping¶

In [8]:

def get_first_two_chars(dbn):
    return dbn[0:2]

combined["school_dist"] = combined["DBN"].apply(get_first_two_chars)

Find correlations¶

In [9]:

correlations = combined.corr()
correlations = correlations["sat_score"]
print(correlations)

SAT Critical Reading Avg. Score    0.986820
SAT Math Avg. Score                0.972643
SAT Writing Avg. Score             0.987771
sat_score                          1.000000
AP Test Takers                     0.523140
                                     ...   
priority08                              NaN
priority09                              NaN
priority10                              NaN
lat                               -0.121029
lon                               -0.132222
Name: sat_score, Length: 67, dtype: float64

Plotting survey correlations¶

In [10]:

# Remove DBN since it's a unique identifier, not a useful numerical value for correlation.
survey_fields.remove("DBN")

In [11]:

import matplotlib.pyplot as plt
%matplotlib inline

## plotting correlations among sat score and survey columns
combined.corr()['sat_score'][survey_fields].plot(kind='bar')

Out[11]:

<AxesSubplot:>

So we now analized eventual correlations among the sat score and the columns from the survey of parents, teachers and students. Organizing the results in a bar graph, it is clear that we have to clarify what every column of the survey really means; for this reason, we are going to check further informations from this link

Student Response Rate
Teacher Response Rate
Parent Response Rate
Number of student respondents
Number of teacher respondents
Number of parent respondents
Number of eligible students
Number of eligible teachers
Number of eligible parents
Safety and Respect score based on parent responses
Communication score based on parent responses
Engagement score based on parent responses
Academic expectations score based on parent responses
Safety and Respect score based on teacher responses
Communication score based on teacher responses
Engagement score based on teacher responses
Academic expectations score based on teacher responses
Safety and Respect score based on student responses
Communication score based on student responses
Engagement score based on student responses
Academic expectations score based on student responses

We can see that there are some strong positive correlations between the sat score and the safety and respect score based both on teacher and student response. Another positive correlations is between the aforementioned sat score and the academic expactations based on student response.

We're going to dig into this relationships a bit more.

In [12]:

## scatter plot of saf_s_11 and sat_score

plt.scatter(combined['saf_s_11'], combined['sat_score'])

Out[12]:

<matplotlib.collections.PathCollection at 0x1c75c5fe490>

In [13]:

## scatter plot of aca_s_11 and sat_score

plt.scatter(combined['aca_s_11'], combined['sat_score'])

Out[13]:

<matplotlib.collections.PathCollection at 0x1c75c645c70>

In the first graph we can see that there is a cluster for values of safety lower than 8.0 and for sat scores lower than 1600. The rest of the graph seems to represent more granular, less correlated results. For this reason, the high positive correlation could be inficiated by this specific values.

We're going to plot again and see what happens

In [14]:

## create a new df with specific values

lower_safety = combined[combined['saf_s_11'] < 8.0]
lower_safety = combined[combined['sat_score'] < 1600]

## scatter plot of saf_s_11 and sat_score

plt.scatter(lower_safety['saf_s_11'], lower_safety['sat_score'])

Out[14]:

<matplotlib.collections.PathCollection at 0x1c75c6aca60>

In [15]:

lower_safety.corr()['saf_s_11']['sat_score']

Out[15]:

0.13493741434206447

Quite a low r value indeed. We can conclude that among those two values there is a positive correlation (except for the cluster we spoke before). The more the safety score is high (safety of the environment, school and neighboor), the more the sat scores are high. There are a few amount of schools that share both high values on safety score and on sat score.

In general, we can affirm that no school with a value of safety lower than 6.5 has a sat score higher than 1500.

In [16]:

## create a df indexed by borough and calculate avg safety score

boros = combined.groupby('boro').agg(numpy.mean)['saf_s_11']

print(boros)

boro
Bronx            6.606577
Brooklyn         6.370755
Manhattan        6.831370
Queens           6.721875
Staten Island    6.530000
Name: saf_s_11, dtype: float64

The average safety score for the different boroughs is in the same range; however it seems that Manhattan and Queens has the highest values while Brooklyn has the lowest.

In this dataset there are also few columns that indicate the percentage of each race at given school. We now are going to check if there's any correlation between these columns and sat score

Plotting race correlations¶

Find SAT score differences among the races¶

In [17]:

race_perc = ['white_per', 'asian_per', 'black_per', 'hispanic_per']

## plotting correlations among sat_score and race cols

combined.corr()['sat_score'][race_perc].plot(kind='bar')

Out[17]:

<AxesSubplot:>

By analizing these correlations, we can see now that there is an extremely strong positive correlation between high sat scores and high percentage of white students in the classes and a negative correlation between the same sat scores and the presence of students belonging to hispanic ethnicity (we mean that the lowest is the sat score, the higheste is the percentage of hispanic students).

We are going to visualize this correlation with a scatter plot to make things clearer.

In [18]:

## plotting a scatter chart for sat_score and hispanic_per

plt.scatter(combined['hispanic_per'], combined['sat_score'])

Out[18]:

<matplotlib.collections.PathCollection at 0x1c75c771e20>

By visualizing the scatter plot we can see that there is only one school that scored a sat score greater than 2000. That same school show a very small percentage of hispanic students. In general we can see that all the schools with a percentage of hispanic students super to 30% register sat scores not superior to 1400/1500; in the schools where that percentage is the highest are registered the lowest scores for SAT tests.

At this point we're going to check the specific schools where the percentage is greater than 95% and the schools where the percentage is less than 10% and an average SAT score greater than 1800

In [19]:

## checking schools with hispanic_per > 95%

combined[combined['hispanic_per'] > 95]['SCHOOL NAME']

Out[19]:

44                         MANHATTAN BRIDGES HIGH SCHOOL
82      WASHINGTON HEIGHTS EXPEDITIONARY LEARNING SCHOOL
89     GREGORIO LUPERON HIGH SCHOOL FOR SCIENCE AND M...
125                  ACADEMY FOR LANGUAGE AND TECHNOLOGY
141                INTERNATIONAL SCHOOL FOR LIBERAL ARTS
176     PAN AMERICAN INTERNATIONAL HIGH SCHOOL AT MONROE
253                            MULTICULTURAL HIGH SCHOOL
286               PAN AMERICAN INTERNATIONAL HIGH SCHOOL
Name: SCHOOL NAME, dtype: object

By doing our search using Google, we noticed that all these schools above are specialized and focus their core into taking care of students coming from hispanophone countries. Most of these schools approach their students as english-learners students. That's the explanation for the extremely high percentage of hispanic students.

For what we've said before we'll now check for the school that has the highest scores in the SAT and that has less than 10% of hispanic students.

In [20]:

## create a df that has less than 10% hispanic students and a SAT score > 1800

hisp_10_perc = combined[combined['hispanic_per'] < 10]
hisp_10_perc = hisp_10_perc[hisp_10_perc['sat_score'] > 1800]

hisp_10_perc['SCHOOL NAME']

Out[20]:

37                                STUYVESANT HIGH SCHOOL
151                         BRONX HIGH SCHOOL OF SCIENCE
187                       BROOKLYN TECHNICAL HIGH SCHOOL
327    QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK CO...
356                  STATEN ISLAND TECHNICAL HIGH SCHOOL
Name: SCHOOL NAME, dtype: object

By doing some search on Google we find that the schools above are among the most exclusive and elitist public high schools in the entire country. Particularly fonded into scientific subjects, some of them had, among their alumni, even Nobel Prize winners. These schools are mostly attended by asian students who has to pass an entrance exam.

Plotting gender correlation¶

In [21]:

gender_cols = ['male_per', 'female_per']

## bar plot of the correlations between gender and sat_score

combined.corr()['sat_score'][gender_cols].plot(kind='bar')

Out[21]:

<AxesSubplot:>

Not very strong correlations here. The r factor shows that gender doesn't affect, both positively or negatively, the results of the SAT tests that much. We could however suggest that a school with an higher percentage of female students is more likely to score higher results in the SAT tests.

We're going to make a scatter plot to show this correlation more clearly.

In [22]:

## make a scatter plot

plt.scatter(combined['female_per'], combined['sat_score'])

Out[22]:

<matplotlib.collections.PathCollection at 0x1c75c82c190>

As we said there isn't any specifical correlation among the percentage of female students and the result in the tests. Still we can see a cluster of schools that has an high percentage of female students (60/80%) and high scores in the tests.

In [23]:

## find the name of those specific schools

combined[(combined['female_per'] > 60) & (combined['sat_score'] > 1700)]['SCHOOL NAME']

Out[23]:

5                         BARD HIGH SCHOOL EARLY COLLEGE
26                         ELEANOR ROOSEVELT HIGH SCHOOL
60                                    BEACON HIGH SCHOOL
61     FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & A...
302                          TOWNSEND HARRIS HIGH SCHOOL
Name: SCHOOL NAME, dtype: object

These schools are focused on liberal arts and a particularly free approach to the curriculum with a strong attention to art and music subjects.

Finding correlation among AP Test Takers and SAT results¶

In [24]:

## calculate the percentage of AP Test Takers among the total students enrolled

combined['ap_per'] = round((combined['AP Test Takers '] / combined['total_enrollment']) * 100, 2)

combined['ap_per']

Out[24]:

0      30.58
1       9.90
2       3.18
3      35.16
4      15.81
       ...  
358     7.76
359    29.73
360    19.10
361     2.66
362     7.36
Name: ap_per, Length: 363, dtype: float64

In [25]:

## make a scatter plot between ap_per and sat_score

plt.scatter(combined['ap_per'], combined['sat_score'])

Out[25]:

<matplotlib.collections.PathCollection at 0x1c75c883460>

From the scatter plot we can say that there is not a clear strong correlation between percentage of students taking AP exams and results in the SAT tests. We can see that there is a quite big cluster of schools that has a small percentage of AP takers and low SAT scores, otherwise those results are shared by school where the percentage of AP takers is much higher. However, just to be sure, we'll calculate the exact r factor

In [26]:

## plotting the r factor into a bar plot

combined.corr()['sat_score']['ap_per']

Out[26]:

0.05717292460813363

Indeed an r factor almost equal to 0.

In [27]:

## dropping double columns

combined.drop(columns=['SchoolName'],inplace=True)

Correlation between class size and SAT scores¶

In [28]:

plt.scatter(combined['AVERAGE CLASS SIZE'], combined['sat_score'])

Out[28]:

<matplotlib.collections.PathCollection at 0x1c75c8d0940>

In [29]:

combined.corr()['AVERAGE CLASS SIZE']['sat_score']

Out[29]:

0.3810143308095523

As we could imagine, the bigger is the class size the higher is the possibility to score high scores in the SAT test. According to the graph, there are no schools in wich the average size of the classes is less than 20 mq that had scores higher than 1400.

What we're going to do now is to check which is the best neighborhood according to the average score in the SAT test.

In [30]:

## grouping the df by 'boro' column

combined.groupby('boro').agg(numpy.mean)['sat_score']

Out[30]:

boro
Bronx            1157.598203
Brooklyn         1181.364461
Manhattan        1278.331410
Queens           1286.753032
Staten Island    1382.500000
Name: sat_score, dtype: float64

The neighborhood where we can find the best schools is Staten Island.

Now we're going to establish a ranking for each school based on the results of the safety survey and the SAT scores.

In [31]:

## calculating mean for safety surveys and applying to a new column

combined['SAFETY MEAN'] = round((combined['saf_t_11'] + combined['saf_p_11'] + combined['saf_s_11']) / 3, 2)

combined['SAFETY MEAN'].value_counts(ascending=True)

Out[31]:

8.60     1
5.90     1
8.20     1
6.10     1
8.30     1
        ..
7.33     9
7.40    10
7.47    10
7.37    10
7.23    17
Name: SAFETY MEAN, Length: 82, dtype: int64

In [32]:

## calculating a coefficient for ranking 

combined['K'] = round((combined['SAFETY MEAN'] * combined['sat_score']) / 100, 2)

In [33]:

combined['K'].value_counts(bins=5)

Out[33]:

(81.172, 103.014]     179
(59.22, 81.172]       127
(103.014, 124.856]     35
(124.856, 146.698]     15
(146.698, 168.54]       7
Name: K, dtype: int64

In [34]:

## create a function returning a ranking

def ranking(x):
    if x >= 59.23 and x <= 81.172:
        return 'BB'
    elif x >= 81.173 and x <= 103.013:
        return 'BBB'
    elif x >= 103.014 and x <= 124.856:
        return 'A'
    elif x >= 124.857 and x <= 146.697:
        return 'AA'
    else:
        return 'AAA'

In [35]:

##applying function and creating new column RANKING

combined['RANKING'] = combined['K'].apply(lambda x: ranking(x))

combined['RANKING']

Out[35]:

0       BB
1       BB
2      BBB
3      BBB
4       AA
      ... 
358     BB
359     BB
360     BB
361      A
362     BB
Name: RANKING, Length: 363, dtype: object

At this point, by adding a new dataframe, we'll analyze which borough is the best according to the quality of the schools and the cost of the apartments. We'll use a dataframe found here.

In [36]:

ny_sales = pd.read_csv('nyc-rolling-sales.csv')
ny_sales.head()

Out[36]:

	Unnamed: 0	BOROUGH	NEIGHBORHOOD	BUILDING CLASS CATEGORY	TAX CLASS AT PRESENT	BLOCK	LOT	BUILDING CLASS AT PRESENT	ADDRESS	...	RESIDENTIAL UNITS	COMMERCIAL UNITS	TOTAL UNITS	LAND SQUARE FEET	GROSS SQUARE FEET	YEAR BUILT	TAX CLASS AT TIME OF SALE	BUILDING CLASS AT TIME OF SALE	SALE PRICE	SALE DATE
0	4	1	ALPHABET CITY	07 RENTALS - WALKUP APARTMENTS	2A	392	6	C2	153 AVENUE B	...	5	0	5	1633	6440	1900	2	C2	6625000	2017-07-19 00:00:00
1	5	1	ALPHABET CITY	07 RENTALS - WALKUP APARTMENTS	2	399	26	C7	234 EAST 4TH STREET	...	28	3	31	4616	18690	1900	2	C7	-	2016-12-14 00:00:00
2	6	1	ALPHABET CITY	07 RENTALS - WALKUP APARTMENTS	2	399	39	C7	197 EAST 3RD STREET	...	16	1	17	2212	7803	1900	2	C7	-	2016-12-09 00:00:00
3	7	1	ALPHABET CITY	07 RENTALS - WALKUP APARTMENTS	2B	402	21	C4	154 EAST 7TH STREET	...	10	0	10	2272	6794	1913	2	C4	3936272	2016-09-23 00:00:00
4	8	1	ALPHABET CITY	07 RENTALS - WALKUP APARTMENTS	2A	404	55	C2	301 EAST 10TH STREET	...	6	0	6	2369	4615	1900	2	C2	8000000	2016-11-17 00:00:00

5 rows × 22 columns

In [37]:

ny_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84548 entries, 0 to 84547
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Unnamed: 0                      84548 non-null  int64 
 1   BOROUGH                         84548 non-null  int64 
 2   NEIGHBORHOOD                    84548 non-null  object
 3   BUILDING CLASS CATEGORY         84548 non-null  object
 4   TAX CLASS AT PRESENT            84548 non-null  object
 5   BLOCK                           84548 non-null  int64 
 6   LOT                             84548 non-null  int64 
 7   EASE-MENT                       84548 non-null  object
 8   BUILDING CLASS AT PRESENT       84548 non-null  object
 9   ADDRESS                         84548 non-null  object
 10  APARTMENT NUMBER                84548 non-null  object
 11  ZIP CODE                        84548 non-null  int64 
 12  RESIDENTIAL UNITS               84548 non-null  int64 
 13  COMMERCIAL UNITS                84548 non-null  int64 
 14  TOTAL UNITS                     84548 non-null  int64 
 15  LAND SQUARE FEET                84548 non-null  object
 16  GROSS SQUARE FEET               84548 non-null  object
 17  YEAR BUILT                      84548 non-null  int64 
 18  TAX CLASS AT TIME OF SALE       84548 non-null  int64 
 19  BUILDING CLASS AT TIME OF SALE  84548 non-null  object
 20  SALE PRICE                      84548 non-null  object
 21  SALE DATE                       84548 non-null  object
dtypes: int64(10), object(12)
memory usage: 14.2+ MB

First thing first, we'll rename the values of the BOROUGH column. From digits to the values present in the combined data frame.

In [38]:

num_to_name = {1:'Manhattan',2:'Bronx',3:'Brooklyn',4:'Queens',5:'Staten Island'}

ny_sales['BOROUGH'] = ny_sales['BOROUGH'].map(num_to_name)

In [39]:

ny_sales['SALE PRICE'].value_counts()

Out[39]:

 -         14561
0          10228
10           766
450000       427
550000       416
           ...  
3701338        1
366500         1
41033          1
207072         1
726059         1
Name: SALE PRICE, Length: 10008, dtype: int64

In [40]:

## replace '-' values with '0'

ny_sales['SALE PRICE'] = ny_sales['SALE PRICE'].astype(str).str.replace('-','0').astype('int64')

ny_sales[ny_sales['SALE PRICE'] <= 5000]['SALE PRICE'].value_counts()

Out[40]:

0       24789
10        766
1         134
100        90
1000       77
500        67
5000       32
2000       14
3000        9
4000        7
2500        5
20          4
3500        4
2           3
3           2
2800        1
501         1
2400        1
2352        1
210         1
19          1
2096        1
5           1
373         1
1200        1
3774        1
3012        1
4500        1
4063        1
750         1
1500        1
300         1
1275        1
315         1
1162        1
250         1
2200        1
3001        1
600         1
200         1
8           1
3750        1
1175        1
1110        1
2510        1
4900        1
Name: SALE PRICE, dtype: int64

By analizing the SALE PRICE column, we notice that many sales occur with a nonsensically small dollar amount: $0 most commonly. These sales are actually transfers of deeds between parties: for example, parents transferring ownership to their home to a child after moving out for retirement.

For the fact that sale price is a key value for our search and also because there is a huge amount of values that holds these unrealistical prices, we can't simply drop those kind of rows in which are contained. We'll, instead, replace those values with the mean.

We'll replace all the values lower than 5000 $

In [41]:

## calculating mean price for values higher than 5000

mean_price = round(ny_sales[ny_sales['SALE PRICE'] >= 5000]['SALE PRICE'].mean(),0)

## replace values <= 5000

ny_sales['SALE PRICE'] = ny_sales['SALE PRICE'].mask(ny_sales['SALE PRICE'] <= 5000, mean_price)

In [42]:

pd.pivot_table(ny_sales, values='SALE PRICE', index='BOROUGH').plot(kind='barh')
plt.title('Mean prices for borough')

Out[42]:

Text(0.5, 1.0, 'Mean prices for borough')

At this point we know which is the most expensive borough in New York (Manhattan). The other four share more or less the same range of price. What we'll do now is to find which borough has the best schools and is the most economic for buying flats. For doing this, we're going to reduce the first dataframe to the columns that are significant for our analysis and add to it a new column MEAN PRICE.

In [43]:

## reducing first df

combined_final = combined[['DBN','SCHOOL NAME','boro','Num of SAT Test Takers',
       'SAT Critical Reading Avg. Score', 'SAT Math Avg. Score',
       'SAT Writing Avg. Score', 'sat_score','bus', 'subway','Location 1', 'lat',
       'lon', 'SAFETY MEAN','RANKING']].copy()

## rename boro column to BOROUGH

combined_final.rename(columns={'boro':'BOROUGH'},inplace=True)

## create a function to calculate mean price 

def mean_price(b):
        if b == 'Manhattan':
            return round(ny_sales[ny_sales['BOROUGH'] == 'Manhattan']['SALE PRICE'].mean(), 0)
        elif b == 'Bronx':
            return round(ny_sales[ny_sales['BOROUGH'] == 'Bronx']['SALE PRICE'].mean(), 0)
        elif b == 'Brooklyn':
            return round(ny_sales[ny_sales['BOROUGH'] == 'Brooklyn']['SALE PRICE'].mean(), 0)
        elif b == 'Queens':
            return round(ny_sales[ny_sales['BOROUGH'] == 'Queens']['SALE PRICE'].mean(), 0)
        else:
            return round(ny_sales[ny_sales['BOROUGH'] == 'Staten Island']['SALE PRICE'].mean(), 0)

In [44]:

combined_final['MEAN PRICE'] = combined_final['BOROUGH'].apply(lambda x: mean_price(x))

In [83]:

manhattan = combined_final[combined_final['BOROUGH'] == 'Manhattan']['RANKING'].value_counts()
bronx = combined_final[combined_final['BOROUGH'] == 'Bronx']['RANKING'].value_counts()
brooklyn = combined_final[combined_final['BOROUGH'] == 'Brooklyn']['RANKING'].value_counts()
queens = combined_final[combined_final['BOROUGH'] == 'Queens']['RANKING'].value_counts()
staten_island = combined_final[combined_final['BOROUGH'] == 'Staten Island']['RANKING'].value_counts()

In [145]:

fig, (ax1,ax2,ax3,ax4,ax5) = plt.subplots(ncols=5,figsize=(16,4))
axes = [ax1,ax2,ax3,ax4]

for ax in axes:
    ax.set_xticks([0,10,20,30,40,50])

manhattan.plot(kind='barh',ax=ax1,'tab:orange')
ax1.set_title('Manhattan')

bronx.plot(kind='barh',ax=ax2)
ax2.set_title('Bronx')

brooklyn.plot(kind='barh',ax=ax3)
ax3.set_title('Brooklyn')

queens.plot(kind='barh',ax=ax4)
ax4.set_title('Queens')

staten_island.plot(kind='barh',ax=ax5)
ax5.set_title('Staten Island')

  File "<ipython-input-145-824e03a44d67>", line 7
    manhattan.plot(kind='barh',ax=ax1,'tab:orange')
                                      ^
SyntaxError: positional argument follows keyword argument