# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Stats Models functions for Time Series Analysis
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
# Plotly Imports
import plotly.offline as plotly
import plotly.graph_objs as go
from plotly import tools
plotly.init_notebook_mode(connected=False)
# Translate DataFrame into Plotly Figure
def plot_df(dataframe, title):
    """Build a plotly figure dict with one Scatter trace per column.

    The DataFrame index supplies the x-axis values; each column becomes
    a named trace, and the legend is laid out horizontally.
    """
    traces = []
    for column in dataframe.columns:
        traces.append(go.Scatter(x=dataframe.index,
                                 y=dataframe[column],
                                 name=column))
    layout = dict(title=title, legend={'orientation': 'h'})
    return dict(data=traces, layout=layout)
# Helper to convert the dollars to numbers
def dollars_to_int(dollar_string):
    """Convert a Redfin dollar string such as '$1,200K' to a float.

    Strips a leading '$', a trailing 'K' and comma separators, then
    parses the remainder as a float.  Note the 'K' multiplier is NOT
    applied here -- downstream code rescales by 1000 where needed.

    Returns ``float('nan')`` when the cleaned text is not numeric, so
    the resulting pandas column stays a numeric dtype (the previous
    version returned the *string* 'NaN' from a bare ``except``, which
    silently forced an object-dtype column).
    """
    cleaned = dollar_string.rstrip('K').lstrip('$').replace(',', '')
    try:
        return float(cleaned)
    except ValueError:
        # Empty cell, dash, or other placeholder -> real NaN
        return float('nan')
# Helper to convert the percentage points to numbers between 0 and 1
def pct_to_int(pct_string):
    """Turn a percentage string such as '12.5%' into the fraction 0.125."""
    without_sign = pct_string.rstrip('%')
    return float(without_sign) / 100
# Normalize Numpy Series Values between 0 and 1
def normalize_series(series):
    """Rescale a pandas Series linearly onto [0, 1] (min -> 0, max -> 1).

    Returns a new Series; the input is not modified.
    """
    low = series.min()
    high = series.max()
    # Same element-wise arithmetic as the original apply()-based loop
    return (series - low) / (high - low)
# Constants shared by every read_data() call below.
# Price columns arrive as strings like '$1,200K'; convert to floats.
converters = {
'Median Sale Price': dollars_to_int,
'Median List Price': dollars_to_int,
}
# Columns to parse as datetimes
parse_dates = ['Period Begin', 'Period End']
# Rows are indexed by a (City, Period End) MultiIndex
index_col = ['City', 'Period End']
# Source CSVs, one per property type (the combined "all homes" file is
# read separately below)
files = {
'condos': 'redfin-bayarea-condos.csv',
'single_family': 'redfin-bayarea-single-family.csv',
'townhouse': 'redfin-bayarea-townhouse.csv',
'multi_family': 'redfin-bayarea-multi-fam.csv'
}
# East Bay cities analysed throughout the notebook
cities = ['Oakland', 'Berkeley', 'San Ramon', 'Dublin', 'Fremont', 'Alameda']
# Function for reading and sorting CSV data
def read_data(file, **kw):
    """Read a Redfin CSV into a DataFrame indexed by (City, Period End).

    Uses the module-level ``parse_dates`` / ``converters`` /
    ``index_col`` constants.  Extra keyword arguments are forwarded to
    ``pd.read_csv`` (previously ``**kw`` was accepted but silently
    ignored).  The MultiIndex is sorted so slicing by city is efficient.
    """
    df = pd.read_csv(file, parse_dates=parse_dates, converters=converters,
                     index_col=index_col, **kw)
    df.sort_index(inplace=True)
    return df
# Read Redfin data into pandas dataframes, one per property type.
# Use the `files` mapping defined above instead of repeating the
# filenames (the mapping was previously defined but never used).
condos = read_data(files['condos'])
single_family = read_data(files['single_family'])
townhouse = read_data(files['townhouse'])
multi_family = read_data(files['multi_family'])
# The combined "all homes" file is not part of the `files` mapping
all_homes = read_data('redfin-bayarea-all.csv')
# So we can see all the data we have
all_homes.info()
<class 'pandas.core.frame.DataFrame'> MultiIndex: 468 entries, (Alameda, 2012-01-31 00:00:00) to (San Ramon, 2018-06-30 00:00:00) Data columns (total 47 columns): Worksheet Filter 468 non-null object Measure Display 0 non-null float64 Number of Records 468 non-null int64 Avg Sale To List 468 non-null float64 Avg Sale To List Mom 468 non-null object Avg Sale To List Yoy 468 non-null object Homes Sold 468 non-null int64 Homes Sold Mom 468 non-null object Homes Sold Yoy 468 non-null object Inventory 468 non-null int64 Inventory Mom 468 non-null object Inventory Yoy 468 non-null object Median Dom 468 non-null int64 Median Dom Mom 468 non-null int64 Median Dom Yoy 468 non-null int64 Median List Ppsf 468 non-null float64 Median List Ppsf Mom 468 non-null float64 Median List Ppsf Yoy 468 non-null float64 Median List Price 468 non-null float64 Median List Price Mom 468 non-null float64 Median List Price Yoy 468 non-null float64 Median Ppsf 468 non-null float64 Median Ppsf Mom 468 non-null float64 Median Ppsf Yoy 468 non-null float64 Median Sale Price 468 non-null float64 Median Sale Price Mom 468 non-null object Median Sale Price Yoy 468 non-null object months_of_supply 468 non-null float64 months_of_supply_mom 468 non-null float64 months_of_supply_yoy 468 non-null float64 New Listings 468 non-null int64 New Listings Mom 468 non-null object New Listings Yoy 468 non-null object Period Begin 468 non-null datetime64[ns] Period Duration 468 non-null int64 Price Drops 467 non-null float64 Price Drops Mom 466 non-null float64 Price Drops Yoy 467 non-null float64 Property Type 468 non-null object Region 468 non-null object Region Type 468 non-null object Sold Above List 468 non-null float64 Sold Above List Mom 468 non-null float64 Sold Above List Yoy 468 non-null float64 State 468 non-null object State Code 468 non-null object Table Id 468 non-null int64 dtypes: datetime64[ns](1), float64(21), int64(9), object(16) memory usage: 173.5+ KB
# Construct dataframe for correlation: drop metadata columns and the
# MoM/YoY delta columns so only the level indicators remain.
_corr_drop_cols = [
    'Worksheet Filter', 'Measure Display', 'Number of Records',
    'Period Begin', 'Period Duration',
    'Avg Sale To List Mom', 'Avg Sale To List Yoy',
    'Inventory Mom', 'Inventory Yoy',
    'Median Dom Mom', 'Median Dom Yoy',
    'Median List Price Mom', 'Median List Price Yoy',
    'Median Ppsf Mom', 'Median Ppsf Yoy',
    'Median Sale Price Mom', 'Median Sale Price Yoy',
    'months_of_supply_mom', 'months_of_supply_yoy',
    'New Listings Mom', 'New Listings Yoy',
    'Price Drops Mom', 'Price Drops Yoy',
    'Property Type', 'Region', 'Region Type',
    'Sold Above List Mom', 'Sold Above List Yoy',
    'State', 'State Code', 'Table Id',
    'Homes Sold Mom', 'Homes Sold Yoy',
    'Median List Ppsf Mom', 'Median List Ppsf Yoy',
]
all_homes_corr = all_homes.drop(_corr_drop_cols, axis=1)
# Pearson Correlation Coefficients for Oakland, shaded by sign/strength
oakland_pearson = all_homes_corr.loc['Oakland'].corr()
oakland_pearson.style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)
Avg Sale To List | Homes Sold | Inventory | Median Dom | Median List Ppsf | Median List Price | Median Ppsf | Median Sale Price | months_of_supply | New Listings | Price Drops | Sold Above List | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Avg Sale To List | 1.0 | 0.25 | -0.52 | -0.83 | 0.81 | 0.81 | 0.87 | 0.86 | -0.66 | 0.19 | -0.43 | 0.96 |
Homes Sold | 0.25 | 1.0 | 0.35 | -0.32 | -0.11 | -0.12 | 0.018 | 0.023 | -0.33 | 0.4 | 0.4 | 0.25 |
Inventory | -0.52 | 0.35 | 1.0 | 0.5 | -0.63 | -0.62 | -0.64 | -0.64 | 0.75 | 0.6 | 0.71 | -0.56 |
Median Dom | -0.83 | -0.32 | 0.5 | 1.0 | -0.6 | -0.6 | -0.65 | -0.65 | 0.72 | -0.13 | 0.44 | -0.88 |
Median List Ppsf | 0.81 | -0.11 | -0.63 | -0.6 | 1.0 | 0.99 | 0.97 | 0.96 | -0.53 | -0.06 | -0.4 | 0.8 |
Median List Price | 0.81 | -0.12 | -0.62 | -0.6 | 0.99 | 1.0 | 0.96 | 0.95 | -0.51 | -0.031 | -0.4 | 0.79 |
Median Ppsf | 0.87 | 0.018 | -0.64 | -0.65 | 0.97 | 0.96 | 1.0 | 0.98 | -0.62 | -0.094 | -0.36 | 0.86 |
Median Sale Price | 0.86 | 0.023 | -0.64 | -0.65 | 0.96 | 0.95 | 0.98 | 1.0 | -0.63 | -0.11 | -0.36 | 0.85 |
months_of_supply | -0.66 | -0.33 | 0.75 | 0.72 | -0.53 | -0.51 | -0.62 | -0.63 | 1.0 | 0.32 | 0.42 | -0.71 |
New Listings | 0.19 | 0.4 | 0.6 | -0.13 | -0.06 | -0.031 | -0.094 | -0.11 | 0.32 | 1.0 | 0.24 | 0.11 |
Price Drops | -0.43 | 0.4 | 0.71 | 0.44 | -0.4 | -0.4 | -0.36 | -0.36 | 0.42 | 0.24 | 1.0 | -0.44 |
Sold Above List | 0.96 | 0.25 | -0.56 | -0.88 | 0.8 | 0.79 | 0.86 | 0.85 | -0.71 | 0.11 | -0.44 | 1.0 |
# Spearman (rank-based) Correlation Coefficients for Oakland
oakland_spearman = all_homes_corr.loc['Oakland'].corr(method='spearman')
oakland_spearman.style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)
Avg Sale To List | Homes Sold | Inventory | Median Dom | Median List Ppsf | Median List Price | Median Ppsf | Median Sale Price | months_of_supply | New Listings | Price Drops | Sold Above List | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Avg Sale To List | 1.0 | 0.22 | -0.35 | -0.91 | 0.78 | 0.79 | 0.83 | 0.82 | -0.59 | 0.23 | -0.33 | 0.97 |
Homes Sold | 0.22 | 1.0 | 0.48 | -0.31 | -0.14 | -0.14 | -0.033 | -0.019 | -0.28 | 0.46 | 0.38 | 0.26 |
Inventory | -0.35 | 0.48 | 1.0 | 0.19 | -0.58 | -0.56 | -0.58 | -0.57 | 0.61 | 0.65 | 0.61 | -0.31 |
Median Dom | -0.91 | -0.31 | 0.19 | 1.0 | -0.64 | -0.64 | -0.69 | -0.68 | 0.52 | -0.34 | 0.24 | -0.87 |
Median List Ppsf | 0.78 | -0.14 | -0.58 | -0.64 | 1.0 | 0.99 | 0.96 | 0.96 | -0.5 | -0.11 | -0.32 | 0.76 |
Median List Price | 0.79 | -0.14 | -0.56 | -0.64 | 0.99 | 1.0 | 0.95 | 0.95 | -0.47 | -0.071 | -0.32 | 0.76 |
Median Ppsf | 0.83 | -0.033 | -0.58 | -0.69 | 0.96 | 0.95 | 1.0 | 0.98 | -0.61 | -0.12 | -0.28 | 0.83 |
Median Sale Price | 0.82 | -0.019 | -0.57 | -0.68 | 0.96 | 0.95 | 0.98 | 1.0 | -0.61 | -0.13 | -0.26 | 0.81 |
months_of_supply | -0.59 | -0.28 | 0.61 | 0.52 | -0.5 | -0.47 | -0.61 | -0.61 | 1.0 | 0.27 | 0.22 | -0.6 |
New Listings | 0.23 | 0.46 | 0.65 | -0.34 | -0.11 | -0.071 | -0.12 | -0.13 | 0.27 | 1.0 | 0.18 | 0.23 |
Price Drops | -0.33 | 0.38 | 0.61 | 0.24 | -0.32 | -0.32 | -0.28 | -0.26 | 0.22 | 0.18 | 1.0 | -0.27 |
Sold Above List | 0.97 | 0.26 | -0.31 | -0.87 | 0.76 | 0.76 | 0.83 | 0.81 | -0.6 | 0.23 | -0.27 | 1.0 |
# Use matplotlib to graph the percentage of inventory over time by city:
# each property type's inventory divided by the all-homes total.
f, axs = plt.subplots(3, 2, figsize=(15, 15), dpi=100)
_inventory_frames = [
    (condos, 'Condos'),
    (townhouse, 'Townhouse'),
    (single_family, 'Single Family'),
    (multi_family, 'Multi Family'),
]
for index, city in enumerate(cities):
    plt.subplot(3, 2, index + 1)
    total_inventory = all_homes.loc[city]['Inventory']
    for frame, label in _inventory_frames:
        frame.loc[city]['Inventory'].divide(total_inventory).plot(label=label)
    plt.legend()
    plt.ylabel('Percentage of Listings')
    plt.xlabel('Date')
    plt.title(f'{city} Inventory')
plt.suptitle('% Inventory By City', fontsize=30)
plt.show()
# Inventory percentage by housing type, average for 2017
data = {
    'Condos': [],
    'Townhouse': [],
    'Single Family': [],
    'Multi Family': []
}
_type_frames = {
    'Condos': condos,
    'Townhouse': townhouse,
    'Single Family': single_family,
    'Multi Family': multi_family,
}
for city in cities:
    # Yearly mean inventory across all home types (the denominator)
    all_homes_average = all_homes.fillna(0).loc[city]['Inventory'].resample('Y').mean()
    for label, frame in _type_frames.items():
        yearly_share = frame.fillna(0).loc[city]['Inventory'].resample('Y').mean().divide(all_homes_average)
        # Keep only the 2017 annual value
        data[label].append(yearly_share['2017-12-31'])
# One stacked bar series per housing type, grouped by city
traces = [go.Bar(x=cities, y=values, name=label)
          for label, values in data.items()]
layout = go.Layout(
    barmode='stack',
    title='2017 Average - Distribution of Home Inventory by Location',
    yaxis={'title': 'Percentage'},
    xaxis={'title': 'Location'}
)
fig = go.Figure(data=traces, layout=layout)
plotly.iplot(fig, filename='Inventory Percentage Bar Chart', show_link=False)
# plotly.plot(fig, image='svg', image_filename='2017-Inventory-Averages')
# Sum the total homes sold by type for each city
# Plot on plotly bar chart
data = {
    'Condos': [],
    'Townhouse': [],
    'Single Family': [],
    'Multi Family': []
}
_sold_frames = {
    'Condos': condos,
    'Townhouse': townhouse,
    'Single Family': single_family,
    'Multi Family': multi_family,
}
for city in cities:
    for label, frame in _sold_frames.items():
        # Annual total, keeping only the 2017 figure
        yearly_sold = frame.fillna(0).loc[city]['Homes Sold'].resample('Y').sum()
        data[label].append(yearly_sold['2017-12-31'])
# Stacked bars with the counts printed on each segment
traces = [
    go.Bar(x=cities, y=values, name=label, text=values, textposition='auto')
    for label, values in data.items()
]
layout = go.Layout(
    barmode='stack',
    title='2017 Total Homes Sold by City',
    yaxis={'title': 'Number of Homes'}
)
fig = go.Figure(data=traces, layout=layout)
plotly.iplot(fig, filename='Inventory Percentage Bar Chart', show_link=False)
plotly.plot(fig, image='svg', image_filename='2017-Total-Homes-Sold')
'file:///notebooks/Redfin Plots/temp-plot.html'
# Quickly compare median price per square foot by city
# Use matplotlib to plot each city separately
fig, axes = plt.subplots(3, 2, figsize=(15, 15), dpi=120)
indicator = 'Median Ppsf'
_ppsf_frames = [
    (condos, 'Condos'),
    (townhouse, 'Townhouse'),
    (single_family, 'Single Family'),
    (multi_family, 'Multi Family'),
    (all_homes, 'All Homes'),
]
for index, city in enumerate(cities):
    plt.subplot(3, 2, index + 1)
    for frame, label in _ppsf_frames:
        frame.loc[city][indicator].dropna().plot(label=label)
    plt.ylabel('$ / Sq Ft')
    plt.xlabel('Date')
    plt.legend()
    plt.title(f'{city} {indicator}')
plt.suptitle('Median Sale Price Per Sq Ft', fontsize=16)
plt.show()
# Use statsmodels seasonal_decompose() on Oakland Median Sale Price
# Using additive model, could also use multiplicative
decomposed = seasonal_decompose(all_homes.loc['Oakland']['Median Sale Price'], model="additive")
# Use Plotly to plot observed, trend, seasonal, and residuals --
# one component per stacked subplot row.
_components = [
    ('Observed', decomposed.observed),
    ('Seasonal', decomposed.seasonal),
    ('Trend', decomposed.trend),
    ('Residual', decomposed.resid),
]
fig = tools.make_subplots(rows=4, cols=1, shared_xaxes=True)
for row, (label, series) in enumerate(_components, start=1):
    fig.append_trace(go.Scatter(x=series.index, y=series, name=label), row, 1)
fig['layout'].update(
    title='Oakland Median Sale Price / SqFt Seasonal Decomposition',
    yaxis={'title': '$ / sqft'},
    yaxis2={'title': '$ / sqft'},
    yaxis3={'title': '$ / sqft'},
    yaxis4={'title': '$ / sqft'},
    legend={'orientation': 'h'}
)
plotly.iplot(fig, filename='ETS Decomp', show_link=False)
# plotly.plot(fig, image='svg', image_filename='msp-all-oakland-seasonal', image_width=1080, image_height=960)
This is the format of your plot grid: [ (1,1) x1,y1 ] [ (2,1) x1,y2 ] [ (3,1) x1,y3 ] [ (4,1) x1,y4 ]
# Normalize seasonal decomposition data by city
# Input: cities array; output: dict of decomposition results keyed by city
def normalize_city_data(cities):
    """Decompose each city's Median Sale Price after rescaling to [0, 1].

    Normalizing first makes the seasonal amplitudes comparable across
    cities.  Returns ``{city: DecomposeResult}``.
    """
    return {
        city: seasonal_decompose(
            normalize_series(all_homes.loc[city]['Median Sale Price']),
            model="additive")
        for city in cities
    }
# Function to help us map the data to plotly traces
# 2 sets of traces:
# - Data: the seasonal component curve per city
# - Peaks: red cross markers at the seasonal max/min, labelled with the
#   month number where each occurs
def get_traces(df_dict):
    """Build (data_traces, peak_traces) from {name: DecomposeResult}."""
    traces = []
    peaks = []
    for name, result in df_dict.items():
        seasonal = result.seasonal
        traces.append(go.Scatter(x=seasonal.index, y=seasonal, name=name))
        high = seasonal.idxmax()
        low = seasonal.idxmin()
        peaks.append(go.Scatter(
            showlegend=False,
            x=[high, low],
            y=[seasonal.max(), seasonal.min()],
            mode='markers+text',
            marker=dict(
                size=4,
                color='rgb(255,0,0)',
                symbol='cross'
            ),
            text=[high.month, low.month],
            textposition='middle right',
            textfont={'size': 14},
            name='Peak'
        ))
    return (traces, peaks)
data = normalize_city_data(cities)
# Plotting the traces from the previous function: one subplot row per
# city, the seasonal curve plus its peak markers.
traces, peaks = get_traces(data)
fig = tools.make_subplots(rows=6, cols=1, shared_xaxes=True, vertical_spacing=0.01)
for row, trace in enumerate(traces, start=1):
    fig.append_trace(trace, row, 1)
for row, peak in enumerate(peaks, start=1):
    fig.append_trace(peak, row, 1)
fig['layout'].update(
    # Fixed chart-title typo: 'Seasonalily' -> 'Seasonality'
    title='Seasonality Across Cities - Median Price',
    legend={'orientation': 'h'}
)
plotly.iplot(fig, filename='Seasonal Trend Analysis', show_link=False)
# plotly.plot(fig, image='svg', image_filename='seasonal-trace-all-cities', image_width=1080, image_height=960)
This is the format of your plot grid: [ (1,1) x1,y1 ] [ (2,1) x1,y2 ] [ (3,1) x1,y3 ] [ (4,1) x1,y4 ] [ (5,1) x1,y5 ] [ (6,1) x1,y6 ]
We will base this analysis on Zillow rental data and Redfin sales data.
# Zillow median rent data; annualize the monthly figures (x12) so they
# are comparable with yearly sale prices.
eastbay_rental_prices = pd.read_csv('east-bay-mrpah.csv', parse_dates=['Date'], index_col=['Date'])
eastbay_rental_prices.dropna(inplace=True)
eastbay_rental_prices.drop(['Emeryville, CA'], axis=1, inplace=True)
eastbay_rental_prices = eastbay_rental_prices * 12
eastbay_rental_prices.tail()
Berkeley, CA | Oakland, CA | Dublin, CA | San Ramon, CA | Fremont, CA | |
---|---|---|---|---|---|
Date | |||||
2017-12-31 | 40470.0 | 35394.0 | 39600.0 | 40800.0 | 36600.0 |
2018-01-31 | 42000.0 | 34800.0 | 39540.0 | 38700.0 | 36000.0 |
2018-02-28 | 41994.0 | 33600.0 | 39270.0 | 37500.0 | 38400.0 |
2018-03-31 | 43800.0 | 33600.0 | 40800.0 | 40770.0 | 38400.0 |
2018-04-30 | 42600.0 | 34800.0 | 40800.0 | 41940.0 | 39600.0 |
indicator = 'Median Sale Price'
# One column per city, renamed to match the Zillow "City, CA" headers
city_series = [
    all_homes.loc[city][indicator].dropna().rename(f'{city}, CA')
    for city in cities
]
msp_df = pd.concat(city_series, axis=1)
msp_df.drop(['Alameda, CA'], axis=1, inplace=True)
msp_df = msp_df['2017-01-31':]
# Prices are in thousands; rescale to whole dollars
msp_df = msp_df * 1000
msp_df.tail()
Oakland, CA | Berkeley, CA | San Ramon, CA | Dublin, CA | Fremont, CA | |
---|---|---|---|---|---|
Period End | |||||
2018-02-28 | 655000.0 | 1110000.0 | 921000.0 | 852000.0 | 1003000.0 |
2018-03-31 | 720000.0 | 1226000.0 | 966000.0 | 967000.0 | 1120000.0 |
2018-04-30 | 780000.0 | 1265000.0 | 1030000.0 | 880000.0 | 1167000.0 |
2018-05-31 | 775000.0 | 1308000.0 | 1040000.0 | 981000.0 | 1180000.0 |
2018-06-30 | 824000.0 | 1350000.0 | 1075000.0 | 900000.0 | 1171000.0 |
# Price-to-rent ratio: sale price over annualized rent, aligned by column
prr_df = msp_df.divide(eastbay_rental_prices, axis=1)
prr_df.dropna(inplace=True)
fig = plot_df(prr_df, 'Our Calculated Price to Rent Ratio')
plotly.iplot(fig, filename='Seasonal Trend Analysis', show_link=False)
plotly.plot(fig, filename='calculated-prr.html', image='svg', image_filename='calculated-prr', image_width=1280, image_height=720)
'file:///notebooks/Redfin Plots/calculated-prr.html'
# Zillow's published price-to-rent ratio, trimmed to the same window as
# our calculated frame and with the comparison columns selected/ordered.
zillow_prr = pd.read_csv('zillow-prr.csv', parse_dates=['Date'], index_col=['Date'])
zillow_prr.dropna(inplace=True)
zillow_prr.drop(['Emeryville, CA'], axis=1, inplace=True)
zillow_prr = zillow_prr['2017-01-31':'2018-04-30']
zillow_prr = zillow_prr[['Berkeley, CA', 'Dublin, CA', 'Fremont, CA', 'Oakland, CA', 'San Ramon, CA']]
fig = plot_df(zillow_prr, 'Zillow\'s Price to Rent Ratio')
plotly.iplot(fig, filename='Zillow PRR', show_link=False)
plotly.plot(fig, filename='zillow-prr.html', image='svg', image_filename='zillow-prr', image_width=1280, image_height=720)
'file:///notebooks/Redfin Plots/zillow-prr.html'