In [1]:

import os
import pandas as pd
import numpy as np
import orca
from urbansim.models import RegressionModel
from urbansim.utils import misc

Set the location of the HDFStore as an injectable called "store"¶

In [2]:

orca.add_injectable("store", pd.HDFStore(os.path.join(misc.data_dir(), "sanfran_public.h5"), mode="r"))

Specify table sources and broadcasts that will be used later¶

In [3]:

@orca.table('buildings')
def buildings(store):
    df = store['buildings']
    return df

@orca.table('zones')
def zones(store):
    df = store['zones']
    return df

@orca.table('households')
def households(store):
    df = store['households']
    return df

@orca.table('parcels')
def parcels(store):
    df = store['parcels']
    return df

orca.broadcast('zones', 'buildings', cast_index=True, onto_on='zone_id')

Specify the computed columns¶

In [4]:

@orca.column('households', 'income_quartile', cache=True)
def income_quartile(households):
    return pd.Series(pd.qcut(households.income, 4).labels,
                     index=households.index)

@orca.column('households', 'zone_id', cache=True)
def zone_id(households, buildings):
    return misc.reindex(buildings.zone_id, households.building_id)

@orca.column('zones', 'ave_unit_sqft')
def ave_unit_sqft(buildings, zones):
    s = buildings.unit_sqft[buildings.general_type == "Residential"]\
        .groupby(buildings.zone_id).quantile().apply(np.log1p)
    return s.reindex(zones.index).fillna(s.quantile())

@orca.column('zones', 'ave_lot_sqft')
def ave_lot_sqft(buildings, zones):
    s = buildings.unit_lot_size.groupby(buildings.zone_id).quantile().apply(np.log1p)
    return s.reindex(zones.index).fillna(s.quantile())

@orca.column('zones', 'sum_residential_units')
def sum_residential_units(buildings):
    return buildings.residential_units.groupby(buildings.zone_id).sum().apply(np.log1p)

@orca.column('zones', 'ave_income')
def ave_income(households, zones):
    s = households.income.groupby(households.zone_id).quantile().apply(np.log1p)
    return s.reindex(zones.index).fillna(s.quantile())

orca.add_injectable("building_type_map", {
    1: "Residential",
    2: "Residential",
    3: "Residential",
    4: "Office",
    5: "Hotel",
    6: "School",
    7: "Industrial",
    8: "Industrial",
    9: "Industrial",
    10: "Retail",
    11: "Retail",
    12: "Residential",
    13: "Retail",
    14: "Office"
})

@orca.column('buildings', 'zone_id', cache=True)
def zone_id(buildings, parcels):
    return misc.reindex(parcels.zone_id, buildings.parcel_id)

@orca.column('buildings', 'general_type', cache=True)
def general_type(buildings, building_type_map):
    return buildings.building_type_id.map(building_type_map)

@orca.column('buildings', 'unit_sqft', cache=True)
def unit_sqft(buildings):
    return buildings.building_sqft / buildings.residential_units.replace(0, 1)

@orca.column('buildings', 'unit_lot_size', cache=True)
def unit_lot_size(buildings, parcels):
    return misc.reindex(parcels.parcel_size, buildings.parcel_id) / \
        buildings.residential_units.replace(0, 1)
    
@orca.column('parcels', 'parcel_size', cache=True)
def parcel_size(parcels):
    return parcels.shape_area * 10.764

Configure the model¶

In [5]:

rm = RegressionModel(
    fit_filters=[
        'unit_lot_size > 0',
        'year_built > 1000',
        'year_built < 2020',
        'unit_sqft > 100',
        'unit_sqft < 20000'
    ],
    predict_filters=[
        "general_type == 'Residential'"
    ],
    model_expression='np.log1p(residential_sales_price) ~ I(year_built < 1940)'
        '+ I(year_built > 2005) + np.log1p(unit_sqft) + np.log1p(unit_lot_size)'
        '+ sum_residential_units + ave_lot_sqft + ave_unit_sqft + ave_income',
    ytransform = np.exp
)     

Get the data - merge buildings and zones (notice UrbanSum does the merge based on the broadcast)¶

In [6]:

merged_df = orca.merge_tables(target="buildings", tables=["buildings", "zones"], columns=rm.columns_used()) 

Fill nans - UrbanSim wants you to take care of nans¶

In [7]:

import utils
merged_df["year_built"] = merged_df.year_built.fillna(merged_df.year_built.quantile())
merged_df["residential_sales_price"] = merged_df.residential_sales_price.fillna(0)
merged_df["general_type"] = merged_df.general_type.fillna(merged_df.general_type.value_counts().idxmax())
_ = utils.deal_with_nas(merged_df)

Fit and report¶

In [8]:

rm.fit(merged_df).summary()

Out[8]:

OLS Regression Results
Dep. Variable:	np.log1p(residential_sales_price)	R-squared:	0.399
Model:	OLS	Adj. R-squared:	0.399
Method:	Least Squares	F-statistic:	1.240e+04
Date:	Tue, 28 Apr 2015	Prob (F-statistic):	0.00
Time:	10:38:22	Log-Likelihood:	-2.5244e+05
No. Observations:	149409	AIC:	5.049e+05
Df Residuals:	149400	BIC:	5.050e+05
Df Model:	8
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
Intercept	-3.8664	0.170	-22.761	0.000	-4.199 -3.533
I(year_built < 1940)[T.True]	0.0011	0.007	0.144	0.885	-0.013 0.016
I(year_built > 2005)[T.True]	-0.0632	0.049	-1.298	0.194	-0.159 0.032
np.log1p(unit_sqft)	-1.4830	0.007	-210.907	0.000	-1.497 -1.469
np.log1p(unit_lot_size)	-0.1476	0.006	-23.219	0.000	-0.160 -0.135
sum_residential_units	0.0986	0.008	11.972	0.000	0.082 0.115
ave_lot_sqft	-0.2179	0.010	-21.293	0.000	-0.238 -0.198
ave_unit_sqft	0.9376	0.020	45.828	0.000	0.898 0.978
ave_income	1.4192	0.015	92.812	0.000	1.389 1.449

Omnibus:	89673.667	Durbin-Watson:	1.798
Prob(Omnibus):	0.000	Jarque-Bera (JB):	819859.564
Skew:	-2.841	Prob(JB):	0.00
Kurtosis:	12.970	Cond. No.	1.01e+03

Predict and report¶

In [9]:

rm.predict(merged_df).describe()

Out[9]:

count      140782.000000
mean         1157.915780
std         41536.026751
min             0.001362
25%           252.965573
50%           380.341584
75%           554.749517
max      12517261.710714
dtype: float64

In [9]: