Author: Matthew Huh
To put it in Airbnb's own words, "Airbnb is built around the idea that everyone should be able to take the perfect trip, including where they stay, what they do, and who they meet. To that end, we empower millions of people around the world to use their spaces, passions, and talents to become entrepreneurs."
It is a platform that lets people offer their homes as temporary living quarters for tourists, for people who need a place away from home, or for short-term housing. In certain ways the platform resembles hotels or apartment rentals, but it comes with its own advantages and disadvantages.
Airbnb regularly uploads datasets on its site, allowing individuals and organizations to play around with the listings that have been disclosed. For this project, I have chosen to take a closer look at a city quite familiar to me: Seattle, WA, home to over 700k people and a thriving tech and tourism hub in the Pacific Northwest.
# Necessary imports
import os
import time
import timeit
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Modelling packages
from sklearn import ensemble
from sklearn.feature_selection import chi2, f_classif, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import adjusted_rand_score, classification_report, confusion_matrix, silhouette_score
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import normalize
# Plotly packages
import cufflinks as cf
import ipywidgets as widgets
import plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly import tools
from scipy import special
py.offline.init_notebook_mode(connected=True)
# Load the listings table, the neighbourhood lookup table and the
# neighbourhood boundaries (GeoJSON) from the working directory
listings = pd.read_csv('listings.csv')
neighborhoods = pd.read_csv('neighbourhoods.csv')
neighborhood_map = pd.read_json('neighbourhoods.geojson')

# Discard both identifier columns ('id' and 'host_id') from the listings
identifier_columns = ['id', 'host_id']
listings.drop(identifier_columns, axis=1, inplace=True)
# Show the first five rows of the listings table
listings.head(n=5)
name | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Casa Madrona - Urban Oasis, 1 block from the P... | Megan | Central Area | Madrona | 47.610819 | -122.290816 | Entire home/apt | 296 | 4 | 20 | 2018-09-18 | 0.16 | 3 | 345 |
1 | Sunrise in Seattle Master Suite | Jess & Joey | Other neighborhoods | Roosevelt | 47.687801 | -122.313427 | Private room | 82 | 2 | 63 | 2018-10-06 | 0.96 | 5 | 365 |
2 | Cozy Studio, min. to downtown -WiFi | Maddy | Delridge | South Delridge | 47.523980 | -122.359891 | Entire home/apt | 48 | 3 | 461 | 2018-11-02 | 4.37 | 1 | 24 |
3 | Fab, private seattle urban cottage! | Joyce | Other neighborhoods | Wallingford | 47.654109 | -122.337605 | Entire home/apt | 90 | 2 | 134 | 2018-10-21 | 1.18 | 3 | 287 |
4 | Glorious sun room w/ memory foambed | Angielena | Other neighborhoods | Georgetown | 47.550620 | -122.320135 | Private room | 65 | 2 | 130 | 2018-09-09 | 1.29 | 5 | 336 |
# Report how many rows (listings) and columns (features) the dataframe has
n_listings, n_features = listings.shape
print('Size of the dataframe: {} listings x {} features'.format(n_listings, n_features))
Size of the dataframe: 8740 listings x 14 features
# Count the missing (NaN) entries in every column
missing_per_column = listings.isna().sum()
print('Missing values: \n', missing_per_column)
Missing values: name 0 host_name 0 neighbourhood_group 0 neighbourhood 0 latitude 0 longitude 0 room_type 0 price 0 minimum_nights 0 number_of_reviews 0 last_review 1048 reviews_per_month 1049 calculated_host_listings_count 0 availability_365 0 dtype: int64
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns
summary_stats = listings.describe()
summary_stats
latitude | longitude | price | minimum_nights | number_of_reviews | reviews_per_month | calculated_host_listings_count | availability_365 | |
---|---|---|---|---|---|---|---|---|
count | 8740.000000 | 8740.000000 | 8740.000000 | 8740.000000 | 8740.000000 | 7691.000000 | 8740.000000 | 8740.000000 |
mean | 47.626403 | -122.333693 | 152.228375 | 3.691419 | 42.772769 | 2.462089 | 22.570252 | 138.483295 |
std | 0.045696 | 0.031548 | 141.676868 | 13.235492 | 63.760075 | 2.427051 | 62.737811 | 130.909168 |
min | 47.496037 | -122.419637 | 0.000000 | 1.000000 | 0.000000 | 0.010000 | 1.000000 | 0.000000 |
25% | 47.606366 | -122.354144 | 80.000000 | 1.000000 | 3.000000 | 0.565000 | 1.000000 | 14.000000 |
50% | 47.621423 | -122.332098 | 119.000000 | 2.000000 | 17.000000 | 1.690000 | 1.000000 | 89.000000 |
75% | 47.661255 | -122.312031 | 189.000000 | 2.000000 | 55.000000 | 3.820000 | 5.000000 | 263.000000 |
max | 47.736128 | -122.234026 | 5400.000000 | 400.000000 | 717.000000 | 46.410000 | 311.000000 | 365.000000 |
# Share of each room type among the listings, drawn as a Plotly pie chart

# Default matplotlib figure size for subsequent matplotlib charts
plt.rcParams['figure.figsize'] = [5, 5]

# Count the listings per room type once and reuse the result for both the
# slice labels and the slice sizes
room_type_counts = listings['room_type'].value_counts()
trace = go.Pie(labels=room_type_counts.index, values=room_type_counts)

# Fixed-size layout with a title
layout = go.Layout(
    title='Room Types on Airbnb',
    height=600,
    width=800,
    autosize=False,
)

# Assemble the figure and render it inline
fig = go.Figure(data=[trace], layout=layout)
py.offline.iplot(fig, filename='cufflinks/simple')
# Plot the minimum number of nights required for units
plt.rcParams['figure.figsize'] = [10, 5]

# Collapse every requirement longer than 7 nights into a single "8" bucket so
# the long tail does not stretch the chart. The bucketed values live in a
# standalone Series: the original cell added a temporary column and then
# called `listings.drop(...)` without `inplace=True`, which returns a new
# frame and leaves the helper column in `listings`, where it leaks into the
# later tables and per-neighbourhood averages.
capped_nights = pd.Series(
    np.where(listings['minimum_nights'] > 7, 8, listings['minimum_nights'])
)
night_counts = capped_nights.value_counts()

plt.title('Minimum number of Nights')
plt.xlabel('Minimum number of nights')
sns.barplot(x=night_counts.index, y=night_counts)
# Label the y-axis explicitly now that the counts no longer carry the
# temporary column's name
plt.ylabel('Number of listings')

# Empty print so the cell shows the chart without an Axes repr
print()
# Distribution of nightly prices across every listing
nightly_prices = listings['price']
plt.title('Airbnb Rental Prices (USD/day)')
sns.distplot(nightly_prices)
<matplotlib.axes._subplots.AxesSubplot at 0x17540dcbef0>
So, that's a nice visual, eh? All that we can really see is that most listings are below 1000 USD / day, and that at least one listing is a little over 5000 USD. Let's take a look at the prices again, and focus on what 99% of us will be looking at by cutting the chart off near the upper outlier fence, Q3 + 1.5 × (Q3 − Q1). With Q1 = 80 and Q3 = 189 from the summary table above, that fence is 352.5 USD.
# Same price distribution, zoomed in on 0-370 USD.
# No rows are removed: the x-axis is limited so the expensive outliers fall
# outside the visible range, and 500 bins keep the visible part detailed.
visible_price_range = (0, 370)
plt.xlim(visible_price_range)
plt.title('Airbnb Rental Prices (USD/day)')
sns.distplot(listings['price'], bins=500)
<matplotlib.axes._subplots.AxesSubplot at 0x17540e80ef0>
# The ten highest-priced listings in Seattle
listings_by_price = listings.sort_values(by='price', ascending=False)
listings_by_price.head(10)
name | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | minimum_nights_temp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7845 | Light &Airy home--convenient Seattle location | Deanna | Central Area | Harrison/Denny-Blaine | 47.621206 | -122.293840 | Entire home/apt | 5400 | 7 | 0 | NaN | NaN | 1 | 143 | 7 |
8107 | We are true 12's! | Chris | Queen Anne | West Queen Anne | 47.630456 | -122.367444 | Entire home/apt | 4000 | 1 | 0 | NaN | NaN | 1 | 0 | 1 |
1041 | Neat & Roomy Photography Studio | Steve | Central Area | Minor | 47.602292 | -122.308681 | Entire home/apt | 3999 | 29 | 58 | 2016-09-21 | 1.29 | 5 | 0 | 8 |
5405 | Spacious Family-Friendly Tri-Plex with Views i... | Kimberly | Queen Anne | West Queen Anne | 47.630824 | -122.368964 | Entire home/apt | 1650 | 2 | 3 | 2018-08-07 | 0.22 | 5 | 222 | 2 |
3908 | Seattle Executive Home for Large Gatherings | Andrew | Seward Park | Seward Park | 47.547348 | -122.266204 | Entire home/apt | 1395 | 7 | 2 | 2018-08-19 | 0.10 | 1 | 358 | 7 |
6963 | Summer Apart (girls only) | Happy | University District | University District | 47.663574 | -122.315006 | Private room | 1250 | 7 | 0 | NaN | NaN | 1 | 365 | 7 |
3892 | Beautiful French Style Home with Panoramic Views. | Amy | Queen Anne | Lower Queen Anne | 47.629652 | -122.360114 | Entire home/apt | 1200 | 4 | 2 | 2017-08-06 | 0.10 | 1 | 8 | 4 |
6982 | LARGE HOUSE ON LAKE WASHINGTON WITH HOT TUB!!!! | Jason | Central Area | Leschi | 47.591348 | -122.287627 | Entire home/apt | 1129 | 30 | 9 | 2018-08-09 | 1.57 | 12 | 365 | 8 |
4485 | Family luxury-steps to Columbia City & light r... | Joel And Amanda | Rainier Valley | Columbia City | 47.558111 | -122.289506 | Entire home/apt | 1079 | 2 | 20 | 2018-10-07 | 1.15 | 1 | 362 | 2 |
904 | Private Townhome 2 bedroom 1.5 bath | Natalie | Rainier Valley | Mount Baker | 47.564861 | -122.285446 | Entire home/apt | 1000 | 2 | 23 | 2018-06-29 | 0.48 | 1 | 0 | 2 |
# Horizontal bar chart of the number of listings in each neighbourhood
plt.rcParams['figure.figsize'] = [10, 20]

# Count once; value_counts() already orders neighbourhoods by frequency
listings_per_neighbourhood = listings['neighbourhood'].value_counts()

plt.title('# of Listings')
plt.xticks(rotation=60)
sns.barplot(x=listings_per_neighbourhood,
            y=listings_per_neighbourhood.index)
<matplotlib.axes._subplots.AxesSubplot at 0x17540e4d940>
# Create new dataframe for the statistics on each neighbourhood:
# one row per neighbourhood holding the mean of every numeric column.
# numeric_only=True is passed explicitly because `listings` also contains
# text columns (name, host_name, room_type, ...); recent pandas releases
# raise a TypeError when asked to average those instead of skipping them.
neighbourhood_averages = listings.groupby(['neighbourhood']).mean(numeric_only=True)
neighbourhood_averages
latitude | longitude | price | minimum_nights | number_of_reviews | reviews_per_month | calculated_host_listings_count | availability_365 | minimum_nights_temp | |
---|---|---|---|---|---|---|---|---|---|
neighbourhood | |||||||||
Adams | 47.671825 | -122.386322 | 138.147887 | 4.281690 | 54.521127 | 2.728203 | 2.669014 | 142.795775 | 2.542254 |
Alki | 47.575216 | -122.406858 | 180.870000 | 2.670000 | 40.900000 | 2.930353 | 3.060000 | 167.050000 | 2.180000 |
Arbor Heights | 47.510825 | -122.379924 | 131.173913 | 2.086957 | 56.130435 | 3.297143 | 1.260870 | 110.260870 | 2.086957 |
Atlantic | 47.595275 | -122.304397 | 117.609756 | 2.390244 | 56.317073 | 3.162609 | 3.243902 | 80.813008 | 2.056911 |
Belltown | 47.615074 | -122.346078 | 195.643911 | 4.326568 | 40.566421 | 2.599138 | 72.691882 | 142.612546 | 2.232472 |
Bitter Lake | 47.720124 | -122.351726 | 88.585366 | 6.634146 | 26.463415 | 2.067500 | 1.609756 | 154.268293 | 2.243902 |
Briarcliff | 47.643675 | -122.405051 | 212.478261 | 2.608696 | 23.478261 | 1.286000 | 3.652174 | 139.347826 | 2.608696 |
Brighton | 47.539739 | -122.277059 | 103.245614 | 1.491228 | 37.473684 | 2.596078 | 8.561404 | 108.964912 | 1.491228 |
Broadview | 47.717846 | -122.360142 | 110.450000 | 2.275000 | 44.975000 | 2.404571 | 1.225000 | 195.825000 | 2.275000 |
Broadway | 47.620953 | -122.320762 | 138.194704 | 4.303738 | 57.289720 | 2.943924 | 22.806854 | 121.584112 | 2.302181 |
Bryant | 47.670175 | -122.286529 | 119.272727 | 2.727273 | 32.090909 | 1.923191 | 3.727273 | 143.709091 | 2.490909 |
Cedar Park | 47.722855 | -122.286721 | 179.727273 | 6.848485 | 20.212121 | 1.718966 | 3.212121 | 130.848485 | 2.363636 |
Central Business District | 47.608374 | -122.336328 | 250.503676 | 4.797794 | 20.772059 | 1.498141 | 95.702206 | 188.132353 | 2.503676 |
Columbia City | 47.559685 | -122.286081 | 133.904459 | 2.292994 | 51.878981 | 2.759789 | 1.834395 | 133.394904 | 2.178344 |
Crown Hill | 47.696858 | -122.371444 | 107.023810 | 2.023810 | 49.523810 | 2.506944 | 1.904762 | 163.642857 | 1.952381 |
Dunlap | 47.526687 | -122.273131 | 87.368421 | 2.710526 | 38.894737 | 2.599697 | 2.868421 | 164.605263 | 2.026316 |
East Queen Anne | 47.636095 | -122.347965 | 179.634483 | 3.710345 | 49.848276 | 2.655682 | 2.765517 | 123.296552 | 2.262069 |
Eastlake | 47.641088 | -122.325416 | 147.564356 | 3.673267 | 32.099010 | 2.164157 | 3.287129 | 113.217822 | 2.425743 |
Fairmount Park | 47.555840 | -122.380859 | 132.956522 | 2.869565 | 47.869565 | 2.394750 | 3.478261 | 150.478261 | 2.391304 |
Fauntleroy | 47.521155 | -122.387104 | 181.062500 | 2.062500 | 39.062500 | 1.889355 | 1.406250 | 172.093750 | 2.062500 |
First Hill | 47.610932 | -122.326804 | 182.413249 | 7.492114 | 33.110410 | 1.830117 | 120.189274 | 152.198738 | 3.236593 |
Fremont | 47.656894 | -122.351429 | 144.244966 | 4.567114 | 54.516779 | 2.510265 | 3.204698 | 127.224832 | 2.167785 |
Gatewood | 47.539604 | -122.385536 | 118.339286 | 3.303571 | 41.267857 | 2.092128 | 2.553571 | 154.017857 | 2.517857 |
Genesee | 47.565657 | -122.386001 | 133.848485 | 3.318182 | 41.787879 | 2.405968 | 2.090909 | 127.772727 | 2.227273 |
Georgetown | 47.546030 | -122.320669 | 117.826087 | 1.782609 | 68.739130 | 4.117000 | 3.695652 | 170.652174 | 1.782609 |
Green Lake | 47.680656 | -122.332132 | 160.625000 | 2.536765 | 42.323529 | 2.257984 | 3.713235 | 151.632353 | 2.205882 |
Greenwood | 47.694075 | -122.354016 | 109.920398 | 2.507463 | 39.845771 | 2.252833 | 2.114428 | 122.781095 | 2.169154 |
Haller Lake | 47.721091 | -122.334585 | 102.254545 | 2.745455 | 40.636364 | 1.916400 | 4.618182 | 114.636364 | 1.945455 |
Harrison/Denny-Blaine | 47.622597 | -122.291258 | 330.027778 | 8.416667 | 50.361111 | 2.482069 | 2.305556 | 148.416667 | 2.638889 |
High Point | 47.545940 | -122.371315 | 93.878049 | 1.682927 | 34.975610 | 2.608889 | 1.585366 | 122.195122 | 1.682927 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Pike-Market | 47.609928 | -122.341437 | 236.748768 | 2.990148 | 27.216749 | 2.381268 | 107.556650 | 201.251232 | 2.167488 |
Pinehurst | 47.717593 | -122.319125 | 90.735294 | 1.735294 | 20.647059 | 1.615357 | 3.235294 | 206.088235 | 1.735294 |
Pioneer Square | 47.600442 | -122.333421 | 193.537313 | 1.686567 | 54.656716 | 2.781207 | 67.791045 | 141.910448 | 1.686567 |
Portage Bay | 47.647829 | -122.320068 | 228.533333 | 8.700000 | 50.366667 | 2.111481 | 1.533333 | 185.333333 | 2.766667 |
Rainier Beach | 47.513075 | -122.260340 | 97.142857 | 1.946429 | 37.250000 | 2.515000 | 1.625000 | 151.678571 | 1.946429 |
Rainier View | 47.502174 | -122.263298 | 76.437500 | 1.500000 | 43.250000 | 4.755714 | 1.250000 | 145.312500 | 1.500000 |
Ravenna | 47.675711 | -122.301541 | 116.925926 | 5.296296 | 41.472222 | 2.018980 | 5.268519 | 151.129630 | 2.777778 |
Riverview | 47.545298 | -122.355146 | 96.104167 | 3.375000 | 48.895833 | 2.717333 | 4.895833 | 124.041667 | 2.041667 |
Roosevelt | 47.680683 | -122.315589 | 115.035714 | 2.261905 | 33.892857 | 2.263699 | 2.761905 | 137.940476 | 2.190476 |
Roxhill | 47.531221 | -122.370670 | 117.866667 | 1.800000 | 52.466667 | 3.406154 | 1.000000 | 107.600000 | 1.800000 |
Seaview | 47.551599 | -122.393578 | 128.781250 | 2.125000 | 42.406250 | 2.507667 | 2.437500 | 119.125000 | 2.125000 |
Seward Park | 47.553667 | -122.268696 | 152.831325 | 2.951807 | 47.192771 | 2.167922 | 5.554217 | 165.132530 | 2.433735 |
South Beacon Hill | 47.527429 | -122.286680 | 86.222222 | 2.083333 | 40.750000 | 3.472647 | 2.444444 | 149.027778 | 1.888889 |
South Delridge | 47.525493 | -122.359797 | 81.613636 | 2.250000 | 46.954545 | 3.362308 | 1.500000 | 85.500000 | 2.022727 |
South Lake Union | 47.621913 | -122.335947 | 245.850000 | 4.041667 | 16.541667 | 1.601048 | 99.625000 | 146.275000 | 2.308333 |
South Park | 47.526048 | -122.322801 | 87.666667 | 1.416667 | 55.500000 | 3.705000 | 2.083333 | 166.583333 | 1.416667 |
Southeast Magnolia | 47.643995 | -122.390031 | 145.389831 | 2.355932 | 38.474576 | 2.492727 | 2.084746 | 168.152542 | 2.271186 |
Stevens | 47.623838 | -122.306376 | 146.151639 | 5.118852 | 35.827869 | 2.493486 | 6.946721 | 123.168033 | 2.655738 |
Sunset Hill | 47.677553 | -122.398790 | 189.290909 | 2.527273 | 39.509091 | 2.461400 | 2.509091 | 144.454545 | 2.127273 |
University District | 47.663182 | -122.313397 | 113.242915 | 2.842105 | 25.259109 | 1.493932 | 21.991903 | 164.574899 | 2.299595 |
Victory Heights | 47.710331 | -122.306357 | 117.743590 | 1.948718 | 36.743590 | 1.978684 | 1.897436 | 127.564103 | 1.948718 |
View Ridge | 47.679692 | -122.275277 | 124.272727 | 5.909091 | 45.454545 | 2.243889 | 1.772727 | 96.681818 | 2.909091 |
Wallingford | 47.659748 | -122.332586 | 139.724458 | 2.897833 | 46.421053 | 2.248215 | 7.696594 | 147.467492 | 2.436533 |
Wedgwood | 47.690439 | -122.292792 | 100.342105 | 4.921053 | 25.657895 | 2.066786 | 2.026316 | 115.184211 | 2.789474 |
West Queen Anne | 47.635634 | -122.367560 | 244.689394 | 3.462121 | 44.818182 | 2.397699 | 4.303030 | 172.750000 | 2.159091 |
West Woodland | 47.668591 | -122.366707 | 133.821429 | 5.900000 | 53.850000 | 2.306107 | 2.157143 | 117.764286 | 2.392857 |
Westlake | 47.632247 | -122.341561 | 133.596154 | 2.048077 | 26.644231 | 1.766354 | 55.875000 | 74.557692 | 1.980769 |
Whittier Heights | 47.682033 | -122.371607 | 130.000000 | 2.701493 | 66.253731 | 2.509516 | 1.910448 | 114.253731 | 2.313433 |
Windermere | 47.671436 | -122.268761 | 157.545455 | 3.318182 | 25.318182 | 1.605789 | 1.318182 | 116.318182 | 3.000000 |
Yesler Terrace | 47.604683 | -122.319851 | 114.650000 | 1.500000 | 22.575000 | 3.074000 | 4.500000 | 102.950000 | 1.500000 |
88 rows × 9 columns
# Horizontal bar chart of the average listing price per neighbourhood,
# most expensive neighbourhood first
plt.rcParams['figure.figsize'] = [10, 20]

# Reorder the per-neighbourhood averages by descending price
neighbourhood_averages = neighbourhood_averages.sort_values('price', ascending=False)

plt.title('Average price')
plt.xticks(rotation=60)
sns.barplot(x=neighbourhood_averages['price'],
            y=neighbourhood_averages.index)
<matplotlib.axes._subplots.AxesSubplot at 0x17541ad15f8>
# Mapbox API credentials.
# A MAPBOX_ACCESS_TOKEN environment variable, when set, takes precedence so
# the token can be replaced without editing the notebook; otherwise the
# token originally embedded in this cell is used.
mapbox_access_token = os.environ.get(
    'MAPBOX_ACCESS_TOKEN',
    'pk.eyJ1IjoibWh1aDIyIiwiYSI6IkpGdEhkXzgifQ.Af4NVoaKw7lvlqVyak8Ydw',
)

# Import the data for the map: one marker per neighbourhood, positioned at
# the mean latitude/longitude of its listings, with the average price as the
# hover text. A plain list of traces is used because plotly.graph_objs.Data
# is deprecated in favour of a list or tuple of trace instances.
data = [
    go.Scattermapbox(
        lat=neighbourhood_averages['latitude'],
        lon=neighbourhood_averages['longitude'],
        mode='markers',
        marker=go.scattermapbox.Marker(
            size=9
        ),
        text=neighbourhood_averages.price,
    )
]

# Format the layout of the map
layout = go.Layout(
    title='Airbnb Units in Seattle',
    height=600,
    autosize=True,
    hovermode='closest',
    mapbox=dict(
        # Filled overlay built from the neighbourhood boundaries file.
        # NOTE(review): 'source' is given as a relative path; confirm the
        # notebook front-end can resolve it when rendering the layer.
        layers=[
            dict(
                sourcetype='geojson',
                source='neighbourhoods.geojson',
                type='fill',
                color='rgba(40,0,113,0.8)'
            )
        ],
        accesstoken=mapbox_access_token,
        bearing=0,
        # Initial view centred on the coordinates below
        center=dict(
            lat=47.6062,
            lon=-122.3321
        ),
        pitch=0,
        zoom=9.5,
        style='light',
    ),
)

# Render the map inline
fig = dict(data=data, layout=layout)
py.offline.iplot(fig, filename='county-level-choropleths-python')
C:\Users\mhuh22\Anaconda3\lib\site-packages\plotly\graph_objs\_deprecations.py:39: DeprecationWarning: plotly.graph_objs.Data is deprecated. Please replace it with a list or tuple of instances of the following types - plotly.graph_objs.Scatter - plotly.graph_objs.Bar - plotly.graph_objs.Area - plotly.graph_objs.Histogram - etc.
# Run an %%html cell magic whose body is a <script> tag loading the external
# code_toggle.js helper (presumably a show/hide-code toggle; verify)
code_toggle_html = '<script src="https://cdn.rawgit.com/parente/4c3e6936d0d7a46fd071/raw/65b816fb9bdd3c28b4ddf3af602bfd6015486383/code_toggle.js"></script>'
get_ipython().run_cell_magic('html', '', code_toggle_html)