#!/usr/bin/env python
# coding: utf-8

# ## Airbnb Homes in Vancouver - Data Cleaning
# #### Arash Tavassoli - April 2019
# ---
# ### Introduction
#
# In less than 10 years since its official launch, airbnb has now found its place in many of our lives and continues to prove itself as an affordable and convenient alternative to conventional hotel services. This notebook cleans and prepares the data before importing it into a Tableau workbook that will visualize airbnb's presence in the City of Vancouver, based on publicly available data from the airbnb-focused website, Inside Airbnb.
#
# The Tableau visualization is available [here](https://public.tableau.com/views/AirbnbHomesinVancouver/Dashboard?:embed=y&:display_count=yes).
#
# ### Data Source
#
# The data was downloaded from Inside Airbnb on April 29, 2019 and reflects the detailed listing data, calendar data and listing reviews for 4,929 uniquely identified homes in Vancouver, BC, that were active on March 8, 2019.
#
# The data is downloaded from the [Inside Airbnb website](http://insideairbnb.com/get-the-data.html) where information is available under a Creative Commons CC0 1.0 Universal (CC0 1.0) "Public Domain Dedication" license.
# The datasets are specific to the City of Vancouver and are provided in 6 separate CSV and 1 GeoJSON files:
#
# |File Name | Description
# |:---------|:---------
# |`calendar.csv` | Detailed Calendar Data for listings in Vancouver
# |`listings.csv` | Summary information and metrics for listings in Vancouver
# |`listings-2.csv` | Detailed Listings data for Vancouver
# |`reviews.csv` | Summary Review data and Listing ID
# |`reviews-2.csv` | Detailed Review Data for listings in Vancouver
# |`neighbourhoods.csv` | Neighbourhood list for geo filter (sourced from city or open source GIS files)
# |`neighbourhoods.geojson` | GeoJSON file of neighbourhoods of the city

# In[1]:

# Libraries used throughout the notebook:
import pandas as pd
import numpy as np

# In[2]:

# Raw datasets, as downloaded from Inside Airbnb:
df_cal = pd.read_csv('Data/calendar.csv')
df_listings = pd.read_csv('Data/listings.csv')
df_listings_more = pd.read_csv('Data/listings-2.csv')
df_neighb = pd.read_csv('Data/neighbourhoods.csv')
df_reviews = pd.read_csv('Data/reviews.csv')
df_reviews_more = pd.read_csv('Data/reviews-2.csv')

# ### Cleaning the `df_listings` and `df_listings_more` datasets:
#
# We start by looking at the two listing datasets: `df_listings` and `df_listings_more`

# In[3]:

print('Shape of df_listings:\t\t', df_listings.shape)
print('Shape of df_listings_more:\t', df_listings_more.shape)

# In[4]:

# Transposed preview of the summary listings data (notebook display only):
df_listings.head(3).T

# We drop the columns that we do not need in the analysis (not required or repeated on other datasets):

# In[5]:

unneeded_columns = ['neighbourhood_group', 'last_review', 'reviews_per_month',
                    'availability_365', 'neighbourhood']
df_listings.drop(unneeded_columns, axis=1, inplace=True)

# In[6]:

# Checking for NaNs:
df_listings.isna().sum()

# We'll deal with that one missing value later.
# In[7]:

# Widen pandas' display limits so the transposed previews show every column:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

df_listings_more.head(2).T

# Not all columns will be required for visualization so let's do some clean-up:

# In[8]:

columns_to_keep = ['id', 'summary', 'space', 'description',
                   'host_listings_count', 'host_total_listings_count',
                   'neighbourhood_cleansed', 'is_location_exact',
                   'property_type', 'accommodates', 'bedrooms', 'bathrooms',
                   'beds', 'amenities', 'availability_30', 'availability_60',
                   'availability_90', 'availability_365']
df_listings_more = df_listings_more[columns_to_keep]

# In[9]:

df_listings_more.dtypes

# In[10]:

df_listings_more.isna().sum()

# In case of missing data in `summary`, `space` and `description` columns we will replace NaN's with empty strings:

# In[11]:

for text_column in ('summary', 'space', 'description'):
    df_listings_more[text_column] = df_listings_more[text_column].fillna('')

# In[12]:

df_listings_more.isna().sum()

# We will deal with the remaining NaN's in the following sections.

# We now merge the two datasets, `df_listings` and `df_listings_more` into one single dataset and call it `df_listings_clean`:

# In[13]:

df_listings_clean = pd.merge(df_listings, df_listings_more, on='id')

# In[14]:

df_listings_clean.isna().sum()

# We can now drop the 9 remaining rows with missing values in `name`, `bedrooms` and `beds` columns.
# In[15]:

# Drop the handful of rows still missing values ('name', 'bedrooms', 'beds'):
df_listings_clean.dropna(axis=0, inplace=True)

# In[16]:

df_listings_clean.isna().sum().any()

# In[17]:

df_listings_clean.dtypes

# As an optional help for visualization down the road let's change data type in some columns to appropriate integer type:

# In[18]:

# NOTE(review): the int cast truncates fractional bathrooms (e.g. 1.5 -> 1);
# acceptable here since these columns only feed coarse Tableau filters.
for count_col in ('bedrooms', 'bathrooms', 'beds'):
    df_listings_clean[count_col] = df_listings_clean[count_col].astype('int64')

# ### EDA on different neighbourhoods:
#
# Although Tableau will be used down the road to visualize different measures in different neighbourhoods we can also have a quick look at different features across neighbourhoods in here to get an understanding that will help us choose the best visualizations in Tableau:

# In[19]:

# One row per neighbourhood, in order of first appearance; pandas index
# alignment on assignment maps each groupby result onto the right row:
df_neighbourhood = pd.DataFrame(index=df_listings_clean['neighbourhood_cleansed'].unique())

# Group once and reuse, instead of re-grouping for every measure:
by_neighbourhood = df_listings_clean.groupby(by='neighbourhood_cleansed')

# Average nightly price in different neighbourhoods:
df_neighbourhood['Avg_price'] = by_neighbourhood['price'].mean().round()

# Total number of listings in different neighbourhoods:
df_neighbourhood['Listing_count'] = by_neighbourhood['id'].count()

# Average availability over the next 30/60/90/365 days in different neighbourhoods:
for window in (30, 60, 90, 365):
    df_neighbourhood[f'Avg_{window}_availability'] = (
        by_neighbourhood[f'availability_{window}'].mean().round().astype(int)
    )

# In[20]:

df_neighbourhood

# ### Finding the most common words in Reviews and listing Summaries:
#
# With the aim of making word clouds of most common words in property summary (description) and reviews we go through all reviews and listing summaries in each neighbourhood and will feed Tableau with tables that list the top 50 words used in reviews and listing summaries, in each neighbourhood:

# In[21]:

from collections import Counter
from operator import itemgetter  # no longer used below; kept so nothing outside this cell breaks

# Words that carry no signal for a word cloud and are therefore not counted.
# A frozenset gives O(1) membership tests in the per-word loop (the original
# list was scanned linearly for every word). The empty string covers words
# reduced to nothing by symbol stripping.
STOPWORDS = frozenset([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
    "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself",
    "she", "her", "hers", "herself", "it", "its", "itself", "they", "them",
    "their", "theirs", "themselves", "what", "which", "who", "whom", "this",
    "that", "these", "those", "am", "is", "are", "was", "were", "be", "been",
    "being", "have", "has", "had", "having", "do", "does", "did", "doing",
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until",
    "while", "of", "at", "by", "for", "with", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below", "to",
    "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
    "further", "then", "once", "here", "there", "when", "where", "why",
    "how", "all", "any", "both", "each", "few", "more", "most", "other",
    "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than",
    "too", "very", "s", "t", "can", "will", "just", "don", "should", "now",
    "",
])

# Single-character symbols removed from every word before counting; one
# str.translate pass replaces the original chain of per-symbol .replace calls:
_SYMBOL_TABLE = str.maketrans('', '', '.,:;"/!?“‘*)(|><{}&^~+')


def top_word_counter(df, filter_col, filter_match, lookup_col):
    """Return the 50 most common words of `lookup_col` for one filter value.

    Rows of `df` are restricted to those where `filter_col` equals
    `filter_match`; the text in `lookup_col` is lower-cased, stripped of
    punctuation symbols, split on whitespace, and stop words are ignored.

    Returns a list of up to 50 ``(filter_match, word, count)`` tuples in
    descending count order (Counter.most_common is documented as equivalent
    to ``sorted(...)[:n]``, so tie ordering matches the original code).
    """
    entries = df.loc[df[filter_col] == filter_match, lookup_col].astype(str)
    text = ' '.join(entry.lower().strip() for entry in entries)
    word_counts = Counter(
        cleaned
        for cleaned in (word.translate(_SYMBOL_TABLE) for word in text.split())
        if cleaned not in STOPWORDS
    )
    return [(filter_match, word, count) for word, count in word_counts.most_common(50)]


# In[22]:

# Finding the top 50 words in summaries for each neighbourhood:
summary_words = []
for neighb in df_listings_clean['neighbourhood_cleansed'].unique():
    summary_words += top_word_counter(df_listings_clean, 'neighbourhood_cleansed', neighb, 'summary')
summary_words = pd.DataFrame(summary_words)
summary_words.columns = ['neighbourhood_cleansed', 'Word', 'Word_Count']

# In[23]:

# Cleaning the reviews dataframe before checking for most used words:
reviews_clean = df_reviews_more.drop(['id', 'date', 'reviewer_id', 'reviewer_name'], axis=1)\
                               .rename(columns={'listing_id': 'id', 'comments': 'review'})\
                               .merge(df_listings_more.loc[:, ['id', 'neighbourhood_cleansed']], on='id')

# Finding the top 50 words in reviews for each neighbourhood:
review_words = []
for neighb in reviews_clean['neighbourhood_cleansed'].unique():
    review_words += top_word_counter(reviews_clean, 'neighbourhood_cleansed', neighb, 'review')
review_words = pd.DataFrame(review_words)
review_words.columns = ['neighbourhood_cleansed', 'Word', 'Word_Count']

# In[24]:

# Exporting new, clean dataframes as csv files that will feed into Tableau:
df_listings_clean.to_csv('Data/clean_data.csv')
review_words.to_csv('Data/review_words.csv')
summary_words.to_csv('Data/summary_words.csv')

# Data visualization to continue on Tableau.