# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Stats Models functions for Time Series Analysis
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
# Plotly Imports
import plotly.offline as plotly
import plotly.graph_objs as go
from plotly import tools
plotly.init_notebook_mode(connected=False)
# Translate DataFrame into Plotly Figure
def plot_df(dataframe, title):
    """Build a plotly figure dict with one Scatter trace per column.

    The DataFrame index supplies the x-axis values; each column becomes
    a named trace, and the legend is laid out horizontally.
    """
    traces = []
    for column in dataframe.columns:
        traces.append(go.Scatter(x=dataframe.index,
                                 y=dataframe[column],
                                 name=column))
    layout = dict(title=title, legend={'orientation': 'h'})
    return dict(data=traces, layout=layout)
# Helper to convert the dollars to numbers
def dollars_to_int(dollar_string):
    """Convert a Redfin dollar string such as '$1,200K' to a float.

    Strips a leading '$', a trailing 'K' and comma separators, then
    parses the remainder as a float.  Note the 'K' multiplier is NOT
    applied here -- downstream code rescales by 1000 where needed.

    Returns ``float('nan')`` when the cleaned text is not numeric, so
    the resulting pandas column stays a numeric dtype (the previous
    version returned the *string* 'NaN' from a bare ``except``, which
    silently forced an object-dtype column).
    """
    cleaned = dollar_string.rstrip('K').lstrip('$').replace(',', '')
    try:
        return float(cleaned)
    except ValueError:
        # Empty cell, dash, or other placeholder -> real NaN
        return float('nan')
# Helper to convert the percentage points to numbers between 0 and 1
def pct_to_int(pct_string):
    """Turn a percentage string such as '12.5%' into the fraction 0.125."""
    without_sign = pct_string.rstrip('%')
    return float(without_sign) / 100
# Normalize Numpy Series Values between 0 and 1
def normalize_series(series):
    """Rescale a pandas Series linearly onto [0, 1] (min -> 0, max -> 1).

    Returns a new Series; the input is not modified.
    """
    low = series.min()
    high = series.max()
    # Same element-wise arithmetic as the original apply()-based loop
    return (series - low) / (high - low)
# Constants shared by every read_data() call below.
# Price columns arrive as strings like '$1,200K'; convert to floats.
converters = {
'Median Sale Price': dollars_to_int,
'Median List Price': dollars_to_int,
}
# Columns to parse as datetimes
parse_dates = ['Period Begin', 'Period End']
# Rows are indexed by a (City, Period End) MultiIndex
index_col = ['City', 'Period End']
# Source CSVs, one per property type (the combined "all homes" file is
# read separately below)
files = {
'condos': 'redfin-bayarea-condos.csv',
'single_family': 'redfin-bayarea-single-family.csv',
'townhouse': 'redfin-bayarea-townhouse.csv',
'multi_family': 'redfin-bayarea-multi-fam.csv'
}
# East Bay cities analysed throughout the notebook
cities = ['Oakland', 'Berkeley', 'San Ramon', 'Dublin', 'Fremont', 'Alameda']
# Function for reading and sorting CSV data
def read_data(file, **kw):
    """Read a Redfin CSV into a DataFrame indexed by (City, Period End).

    Uses the module-level ``parse_dates`` / ``converters`` /
    ``index_col`` constants.  Extra keyword arguments are forwarded to
    ``pd.read_csv`` (previously ``**kw`` was accepted but silently
    ignored).  The MultiIndex is sorted so slicing by city is efficient.
    """
    df = pd.read_csv(file, parse_dates=parse_dates, converters=converters,
                     index_col=index_col, **kw)
    df.sort_index(inplace=True)
    return df
# Read Redfin data into pandas dataframes, one per property type.
# Use the `files` mapping defined above instead of repeating the
# filenames (the mapping was previously defined but never used).
condos = read_data(files['condos'])
single_family = read_data(files['single_family'])
townhouse = read_data(files['townhouse'])
multi_family = read_data(files['multi_family'])
# The combined "all homes" file is not part of the `files` mapping
all_homes = read_data('redfin-bayarea-all.csv')
# So we can see all the data we have
all_homes.info()
<class 'pandas.core.frame.DataFrame'> MultiIndex: 468 entries, (Alameda, 2012-01-31 00:00:00) to (San Ramon, 2018-06-30 00:00:00) Data columns (total 47 columns): Worksheet Filter 468 non-null object Measure Display 0 non-null float64 Number of Records 468 non-null int64 Avg Sale To List 468 non-null float64 Avg Sale To List Mom 468 non-null object Avg Sale To List Yoy 468 non-null object Homes Sold 468 non-null int64 Homes Sold Mom 468 non-null object Homes Sold Yoy 468 non-null object Inventory 468 non-null int64 Inventory Mom 468 non-null object Inventory Yoy 468 non-null object Median Dom 468 non-null int64 Median Dom Mom 468 non-null int64 Median Dom Yoy 468 non-null int64 Median List Ppsf 468 non-null float64 Median List Ppsf Mom 468 non-null float64 Median List Ppsf Yoy 468 non-null float64 Median List Price 468 non-null float64 Median List Price Mom 468 non-null float64 Median List Price Yoy 468 non-null float64 Median Ppsf 468 non-null float64 Median Ppsf Mom 468 non-null float64 Median Ppsf Yoy 468 non-null float64 Median Sale Price 468 non-null float64 Median Sale Price Mom 468 non-null object Median Sale Price Yoy 468 non-null object months_of_supply 468 non-null float64 months_of_supply_mom 468 non-null float64 months_of_supply_yoy 468 non-null float64 New Listings 468 non-null int64 New Listings Mom 468 non-null object New Listings Yoy 468 non-null object Period Begin 468 non-null datetime64[ns] Period Duration 468 non-null int64 Price Drops 467 non-null float64 Price Drops Mom 466 non-null float64 Price Drops Yoy 467 non-null float64 Property Type 468 non-null object Region 468 non-null object Region Type 468 non-null object Sold Above List 468 non-null float64 Sold Above List Mom 468 non-null float64 Sold Above List Yoy 468 non-null float64 State 468 non-null object State Code 468 non-null object Table Id 468 non-null int64 dtypes: datetime64[ns](1), float64(21), int64(9), object(16) memory usage: 173.5+ KB
# Construct dataframe for correlation: drop metadata columns and the
# MoM/YoY delta columns so only the level indicators remain.
_corr_drop_cols = [
    'Worksheet Filter', 'Measure Display', 'Number of Records',
    'Period Begin', 'Period Duration',
    'Avg Sale To List Mom', 'Avg Sale To List Yoy',
    'Inventory Mom', 'Inventory Yoy',
    'Median Dom Mom', 'Median Dom Yoy',
    'Median List Price Mom', 'Median List Price Yoy',
    'Median Ppsf Mom', 'Median Ppsf Yoy',
    'Median Sale Price Mom', 'Median Sale Price Yoy',
    'months_of_supply_mom', 'months_of_supply_yoy',
    'New Listings Mom', 'New Listings Yoy',
    'Price Drops Mom', 'Price Drops Yoy',
    'Property Type', 'Region', 'Region Type',
    'Sold Above List Mom', 'Sold Above List Yoy',
    'State', 'State Code', 'Table Id',
    'Homes Sold Mom', 'Homes Sold Yoy',
    'Median List Ppsf Mom', 'Median List Ppsf Yoy',
]
all_homes_corr = all_homes.drop(_corr_drop_cols, axis=1)
# Pearson Correlation Coefficients for Oakland, shaded by sign/strength
oakland_pearson = all_homes_corr.loc['Oakland'].corr()
oakland_pearson.style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)
Avg Sale To List | Homes Sold | Inventory | Median Dom | Median List Ppsf | Median List Price | Median Ppsf | Median Sale Price | months_of_supply | New Listings | Price Drops | Sold Above List | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Avg Sale To List | 1.0 | 0.25 | -0.52 | -0.83 | 0.81 | 0.81 | 0.87 | 0.86 | -0.66 | 0.19 | -0.43 | 0.96 |
Homes Sold | 0.25 | 1.0 | 0.35 | -0.32 | -0.11 | -0.12 | 0.018 | 0.023 | -0.33 | 0.4 | 0.4 | 0.25 |
Inventory | -0.52 | 0.35 | 1.0 | 0.5 | -0.63 | -0.62 | -0.64 | -0.64 | 0.75 | 0.6 | 0.71 | -0.56 |
Median Dom | -0.83 | -0.32 | 0.5 | 1.0 | -0.6 | -0.6 | -0.65 | -0.65 | 0.72 | -0.13 | 0.44 | -0.88 |
Median List Ppsf | 0.81 | -0.11 | -0.63 | -0.6 | 1.0 | 0.99 | 0.97 | 0.96 | -0.53 | -0.06 | -0.4 | 0.8 |
Median List Price | 0.81 | -0.12 | -0.62 | -0.6 | 0.99 | 1.0 | 0.96 | 0.95 | -0.51 | -0.031 | -0.4 | 0.79 |
Median Ppsf | 0.87 | 0.018 | -0.64 | -0.65 | 0.97 | 0.96 | 1.0 | 0.98 | -0.62 | -0.094 | -0.36 | 0.86 |
Median Sale Price | 0.86 | 0.023 | -0.64 | -0.65 | 0.96 | 0.95 | 0.98 | 1.0 | -0.63 | -0.11 | -0.36 | 0.85 |
months_of_supply | -0.66 | -0.33 | 0.75 | 0.72 | -0.53 | -0.51 | -0.62 | -0.63 | 1.0 | 0.32 | 0.42 | -0.71 |
New Listings | 0.19 | 0.4 | 0.6 | -0.13 | -0.06 | -0.031 | -0.094 | -0.11 | 0.32 | 1.0 | 0.24 | 0.11 |
Price Drops | -0.43 | 0.4 | 0.71 | 0.44 | -0.4 | -0.4 | -0.36 | -0.36 | 0.42 | 0.24 | 1.0 | -0.44 |
Sold Above List | 0.96 | 0.25 | -0.56 | -0.88 | 0.8 | 0.79 | 0.86 | 0.85 | -0.71 | 0.11 | -0.44 | 1.0 |
# Spearman (rank-based) Correlation Coefficients for Oakland
oakland_spearman = all_homes_corr.loc['Oakland'].corr(method='spearman')
oakland_spearman.style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)
Avg Sale To List | Homes Sold | Inventory | Median Dom | Median List Ppsf | Median List Price | Median Ppsf | Median Sale Price | months_of_supply | New Listings | Price Drops | Sold Above List | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Avg Sale To List | 1.0 | 0.22 | -0.35 | -0.91 | 0.78 | 0.79 | 0.83 | 0.82 | -0.59 | 0.23 | -0.33 | 0.97 |
Homes Sold | 0.22 | 1.0 | 0.48 | -0.31 | -0.14 | -0.14 | -0.033 | -0.019 | -0.28 | 0.46 | 0.38 | 0.26 |
Inventory | -0.35 | 0.48 | 1.0 | 0.19 | -0.58 | -0.56 | -0.58 | -0.57 | 0.61 | 0.65 | 0.61 | -0.31 |
Median Dom | -0.91 | -0.31 | 0.19 | 1.0 | -0.64 | -0.64 | -0.69 | -0.68 | 0.52 | -0.34 | 0.24 | -0.87 |
Median List Ppsf | 0.78 | -0.14 | -0.58 | -0.64 | 1.0 | 0.99 | 0.96 | 0.96 | -0.5 | -0.11 | -0.32 | 0.76 |
Median List Price | 0.79 | -0.14 | -0.56 | -0.64 | 0.99 | 1.0 | 0.95 | 0.95 | -0.47 | -0.071 | -0.32 | 0.76 |
Median Ppsf | 0.83 | -0.033 | -0.58 | -0.69 | 0.96 | 0.95 | 1.0 | 0.98 | -0.61 | -0.12 | -0.28 | 0.83 |
Median Sale Price | 0.82 | -0.019 | -0.57 | -0.68 | 0.96 | 0.95 | 0.98 | 1.0 | -0.61 | -0.13 | -0.26 | 0.81 |
months_of_supply | -0.59 | -0.28 | 0.61 | 0.52 | -0.5 | -0.47 | -0.61 | -0.61 | 1.0 | 0.27 | 0.22 | -0.6 |
New Listings | 0.23 | 0.46 | 0.65 | -0.34 | -0.11 | -0.071 | -0.12 | -0.13 | 0.27 | 1.0 | 0.18 | 0.23 |
Price Drops | -0.33 | 0.38 | 0.61 | 0.24 | -0.32 | -0.32 | -0.28 | -0.26 | 0.22 | 0.18 | 1.0 | -0.27 |
Sold Above List | 0.97 | 0.26 | -0.31 | -0.87 | 0.76 | 0.76 | 0.83 | 0.81 | -0.6 | 0.23 | -0.27 | 1.0 |
# Use matplotlib to graph the percentage of inventory over time by city:
# each property type's inventory divided by the all-homes total.
f, axs = plt.subplots(3, 2, figsize=(15, 15), dpi=100)
_inventory_frames = [
    (condos, 'Condos'),
    (townhouse, 'Townhouse'),
    (single_family, 'Single Family'),
    (multi_family, 'Multi Family'),
]
for index, city in enumerate(cities):
    plt.subplot(3, 2, index + 1)
    total_inventory = all_homes.loc[city]['Inventory']
    for frame, label in _inventory_frames:
        frame.loc[city]['Inventory'].divide(total_inventory).plot(label=label)
    plt.legend()
    plt.ylabel('Percentage of Listings')
    plt.xlabel('Date')
    plt.title(f'{city} Inventory')
plt.suptitle('% Inventory By City', fontsize=30)
plt.show()
# Inventory percentage by housing type, average for 2017
data = {
    'Condos': [],
    'Townhouse': [],
    'Single Family': [],
    'Multi Family': []
}
_type_frames = {
    'Condos': condos,
    'Townhouse': townhouse,
    'Single Family': single_family,
    'Multi Family': multi_family,
}
for city in cities:
    # Yearly mean inventory across all home types (the denominator)
    all_homes_average = all_homes.fillna(0).loc[city]['Inventory'].resample('Y').mean()
    for label, frame in _type_frames.items():
        yearly_share = frame.fillna(0).loc[city]['Inventory'].resample('Y').mean().divide(all_homes_average)
        # Keep only the 2017 annual value
        data[label].append(yearly_share['2017-12-31'])
# One stacked bar series per housing type, grouped by city
traces = [go.Bar(x=cities, y=values, name=label)
          for label, values in data.items()]
layout = go.Layout(
    barmode='stack',
    title='2017 Average - Distribution of Home Inventory by Location',
    yaxis={'title': 'Percentage'},
    xaxis={'title': 'Location'}
)
fig = go.Figure(data=traces, layout=layout)
plotly.iplot(fig, filename='Inventory Percentage Bar Chart', show_link=False)
# plotly.plot(fig, image='svg', image_filename='2017-Inventory-Averages')
# Sum the total homes sold by type for each city
# Plot on plotly bar chart
data = {
    'Condos': [],
    'Townhouse': [],
    'Single Family': [],
    'Multi Family': []
}
_sold_frames = {
    'Condos': condos,
    'Townhouse': townhouse,
    'Single Family': single_family,
    'Multi Family': multi_family,
}
for city in cities:
    for label, frame in _sold_frames.items():
        # Annual total, keeping only the 2017 figure
        yearly_sold = frame.fillna(0).loc[city]['Homes Sold'].resample('Y').sum()
        data[label].append(yearly_sold['2017-12-31'])
# Stacked bars with the counts printed on each segment
traces = [
    go.Bar(x=cities, y=values, name=label, text=values, textposition='auto')
    for label, values in data.items()
]
layout = go.Layout(
    barmode='stack',
    title='2017 Total Homes Sold by City',
    yaxis={'title': 'Number of Homes'}
)
fig = go.Figure(data=traces, layout=layout)
plotly.iplot(fig, filename='Inventory Percentage Bar Chart', show_link=False)
plotly.plot(fig, image='svg', image_filename='2017-Total-Homes-Sold')
'file:///notebooks/Redfin Plots/temp-plot.html'
# Quickly compare median price per square foot by city
# Use matplotlib to plot each city separately
fig, axes = plt.subplots(3, 2, figsize=(15, 15), dpi=120)
indicator = 'Median Ppsf'
_ppsf_frames = [
    (condos, 'Condos'),
    (townhouse, 'Townhouse'),
    (single_family, 'Single Family'),
    (multi_family, 'Multi Family'),
    (all_homes, 'All Homes'),
]
for index, city in enumerate(cities):
    plt.subplot(3, 2, index + 1)
    for frame, label in _ppsf_frames:
        frame.loc[city][indicator].dropna().plot(label=label)
    plt.ylabel('$ / Sq Ft')
    plt.xlabel('Date')
    plt.legend()
    plt.title(f'{city} {indicator}')
plt.suptitle('Median Sale Price Per Sq Ft', fontsize=16)
plt.show()
# Use statsmodels seasonal_decompose() on Oakland Median Sale Price
# Using additive model, could also use multiplicative
decomposed = seasonal_decompose(all_homes.loc['Oakland']['Median Sale Price'], model="additive")
# Use Plotly to plot observed, trend, seasonal, and residuals --
# one component per stacked subplot row.
_components = [
    ('Observed', decomposed.observed),
    ('Seasonal', decomposed.seasonal),
    ('Trend', decomposed.trend),
    ('Residual', decomposed.resid),
]
fig = tools.make_subplots(rows=4, cols=1, shared_xaxes=True)
for row, (label, series) in enumerate(_components, start=1):
    fig.append_trace(go.Scatter(x=series.index, y=series, name=label), row, 1)
fig['layout'].update(
    title='Oakland Median Sale Price / SqFt Seasonal Decomposition',
    yaxis={'title': '$ / sqft'},
    yaxis2={'title': '$ / sqft'},
    yaxis3={'title': '$ / sqft'},
    yaxis4={'title': '$ / sqft'},
    legend={'orientation': 'h'}
)
plotly.iplot(fig, filename='ETS Decomp', show_link=False)
# plotly.plot(fig, image='svg', image_filename='msp-all-oakland-seasonal', image_width=1080, image_height=960)
This is the format of your plot grid: [ (1,1) x1,y1 ] [ (2,1) x1,y2 ] [ (3,1) x1,y3 ] [ (4,1) x1,y4 ]
# Normalize seasonal decomposition data by city
# Input: cities array; output: dict of decomposition results keyed by city
def normalize_city_data(cities):
    """Decompose each city's Median Sale Price after rescaling to [0, 1].

    Normalizing first makes the seasonal amplitudes comparable across
    cities.  Returns ``{city: DecomposeResult}``.
    """
    return {
        city: seasonal_decompose(
            normalize_series(all_homes.loc[city]['Median Sale Price']),
            model="additive")
        for city in cities
    }
# Function to help us map the data to plotly traces
# 2 sets of traces:
# - Data: the seasonal component curve per city
# - Peaks: red cross markers at the seasonal max/min, labelled with the
#   month number where each occurs
def get_traces(df_dict):
    """Build (data_traces, peak_traces) from {name: DecomposeResult}."""
    traces = []
    peaks = []
    for name, result in df_dict.items():
        seasonal = result.seasonal
        traces.append(go.Scatter(x=seasonal.index, y=seasonal, name=name))
        high = seasonal.idxmax()
        low = seasonal.idxmin()
        peaks.append(go.Scatter(
            showlegend=False,
            x=[high, low],
            y=[seasonal.max(), seasonal.min()],
            mode='markers+text',
            marker=dict(
                size=4,
                color='rgb(255,0,0)',
                symbol='cross'
            ),
            text=[high.month, low.month],
            textposition='middle right',
            textfont={'size': 14},
            name='Peak'
        ))
    return (traces, peaks)
data = normalize_city_data(cities)
# Plotting the traces from the previous function: one subplot row per
# city, the seasonal curve plus its peak markers.
traces, peaks = get_traces(data)
fig = tools.make_subplots(rows=6, cols=1, shared_xaxes=True, vertical_spacing=0.01)
for row, trace in enumerate(traces, start=1):
    fig.append_trace(trace, row, 1)
for row, peak in enumerate(peaks, start=1):
    fig.append_trace(peak, row, 1)
fig['layout'].update(
    # Fixed chart-title typo: 'Seasonalily' -> 'Seasonality'
    title='Seasonality Across Cities - Median Price',
    legend={'orientation': 'h'}
)
plotly.iplot(fig, filename='Seasonal Trend Analysis', show_link=False)
# plotly.plot(fig, image='svg', image_filename='seasonal-trace-all-cities', image_width=1080, image_height=960)
This is the format of your plot grid: [ (1,1) x1,y1 ] [ (2,1) x1,y2 ] [ (3,1) x1,y3 ] [ (4,1) x1,y4 ] [ (5,1) x1,y5 ] [ (6,1) x1,y6 ]
We will base this analysis on Zillow rental data and Redfin sales data.
# Zillow median rent data; annualize the monthly figures (x12) so they
# are comparable with yearly sale prices.
eastbay_rental_prices = pd.read_csv('east-bay-mrpah.csv', parse_dates=['Date'], index_col=['Date'])
eastbay_rental_prices.dropna(inplace=True)
eastbay_rental_prices.drop(['Emeryville, CA'], axis=1, inplace=True)
eastbay_rental_prices = eastbay_rental_prices * 12
eastbay_rental_prices.tail()
Berkeley, CA | Oakland, CA | Dublin, CA | San Ramon, CA | Fremont, CA | |
---|---|---|---|---|---|
Date | |||||
2017-12-31 | 40470.0 | 35394.0 | 39600.0 | 40800.0 | 36600.0 |
2018-01-31 | 42000.0 | 34800.0 | 39540.0 | 38700.0 | 36000.0 |
2018-02-28 | 41994.0 | 33600.0 | 39270.0 | 37500.0 | 38400.0 |
2018-03-31 | 43800.0 | 33600.0 | 40800.0 | 40770.0 | 38400.0 |
2018-04-30 | 42600.0 | 34800.0 | 40800.0 | 41940.0 | 39600.0 |
indicator = 'Median Sale Price'
# One column per city, renamed to match the Zillow "City, CA" headers
city_series = [
    all_homes.loc[city][indicator].dropna().rename(f'{city}, CA')
    for city in cities
]
msp_df = pd.concat(city_series, axis=1)
msp_df.drop(['Alameda, CA'], axis=1, inplace=True)
msp_df = msp_df['2017-01-31':]
# Prices are in thousands; rescale to whole dollars
msp_df = msp_df * 1000
msp_df.tail()
Oakland, CA | Berkeley, CA | San Ramon, CA | Dublin, CA | Fremont, CA | |
---|---|---|---|---|---|
Period End | |||||
2018-02-28 | 655000.0 | 1110000.0 | 921000.0 | 852000.0 | 1003000.0 |
2018-03-31 | 720000.0 | 1226000.0 | 966000.0 | 967000.0 | 1120000.0 |
2018-04-30 | 780000.0 | 1265000.0 | 1030000.0 | 880000.0 | 1167000.0 |
2018-05-31 | 775000.0 | 1308000.0 | 1040000.0 | 981000.0 | 1180000.0 |
2018-06-30 | 824000.0 | 1350000.0 | 1075000.0 | 900000.0 | 1171000.0 |
# Price-to-rent ratio: sale price over annualized rent, aligned by column
prr_df = msp_df.divide(eastbay_rental_prices, axis=1)
prr_df.dropna(inplace=True)
fig = plot_df(prr_df, 'Our Calculated Price to Rent Ratio')
plotly.iplot(fig, filename='Seasonal Trend Analysis', show_link=False)
plotly.plot(fig, filename='calculated-prr.html', image='svg', image_filename='calculated-prr', image_width=1280, image_height=720)
'file:///notebooks/Redfin Plots/calculated-prr.html'
# Zillow's published price-to-rent ratio, trimmed to the same window as
# our calculated frame and with the comparison columns selected/ordered.
zillow_prr = pd.read_csv('zillow-prr.csv', parse_dates=['Date'], index_col=['Date'])
zillow_prr.dropna(inplace=True)
zillow_prr.drop(['Emeryville, CA'], axis=1, inplace=True)
zillow_prr = zillow_prr['2017-01-31':'2018-04-30']
zillow_prr = zillow_prr[['Berkeley, CA', 'Dublin, CA', 'Fremont, CA', 'Oakland, CA', 'San Ramon, CA']]
fig = plot_df(zillow_prr, 'Zillow\'s Price to Rent Ratio')
plotly.iplot(fig, filename='Zillow PRR', show_link=False)
plotly.plot(fig, filename='zillow-prr.html', image='svg', image_filename='zillow-prr', image_width=1280, image_height=720)
'file:///notebooks/Redfin Plots/zillow-prr.html'