import os
import pandas as pd
import numpy as np
import orca
from urbansim.models import RegressionModel
from urbansim.utils import misc
orca.add_injectable("store", pd.HDFStore(os.path.join(misc.data_dir(), "sanfran_public.h5"), mode="r"))
@orca.table('buildings')
def buildings(store):
df = store['buildings']
return df
@orca.table('zones')
def zones(store):
df = store['zones']
return df
@orca.table('households')
def households(store):
df = store['households']
return df
@orca.table('parcels')
def parcels(store):
df = store['parcels']
return df
orca.broadcast('zones', 'buildings', cast_index=True, onto_on='zone_id')
@orca.column('households', 'income_quartile', cache=True)
def income_quartile(households):
return pd.Series(pd.qcut(households.income, 4).labels,
index=households.index)
@orca.column('households', 'zone_id', cache=True)
def zone_id(households, buildings):
return misc.reindex(buildings.zone_id, households.building_id)
@orca.column('zones', 'ave_unit_sqft')
def ave_unit_sqft(buildings, zones):
s = buildings.unit_sqft[buildings.general_type == "Residential"]\
.groupby(buildings.zone_id).quantile().apply(np.log1p)
return s.reindex(zones.index).fillna(s.quantile())
@orca.column('zones', 'ave_lot_sqft')
def ave_lot_sqft(buildings, zones):
s = buildings.unit_lot_size.groupby(buildings.zone_id).quantile().apply(np.log1p)
return s.reindex(zones.index).fillna(s.quantile())
@orca.column('zones', 'sum_residential_units')
def sum_residential_units(buildings):
return buildings.residential_units.groupby(buildings.zone_id).sum().apply(np.log1p)
@orca.column('zones', 'ave_income')
def ave_income(households, zones):
s = households.income.groupby(households.zone_id).quantile().apply(np.log1p)
return s.reindex(zones.index).fillna(s.quantile())
orca.add_injectable("building_type_map", {
1: "Residential",
2: "Residential",
3: "Residential",
4: "Office",
5: "Hotel",
6: "School",
7: "Industrial",
8: "Industrial",
9: "Industrial",
10: "Retail",
11: "Retail",
12: "Residential",
13: "Retail",
14: "Office"
})
@orca.column('buildings', 'zone_id', cache=True)
def zone_id(buildings, parcels):
return misc.reindex(parcels.zone_id, buildings.parcel_id)
@orca.column('buildings', 'general_type', cache=True)
def general_type(buildings, building_type_map):
return buildings.building_type_id.map(building_type_map)
@orca.column('buildings', 'unit_sqft', cache=True)
def unit_sqft(buildings):
return buildings.building_sqft / buildings.residential_units.replace(0, 1)
@orca.column('buildings', 'unit_lot_size', cache=True)
def unit_lot_size(buildings, parcels):
return misc.reindex(parcels.parcel_size, buildings.parcel_id) / \
buildings.residential_units.replace(0, 1)
@orca.column('parcels', 'parcel_size', cache=True)
def parcel_size(parcels):
return parcels.shape_area * 10.764
rm = RegressionModel(
fit_filters=[
'unit_lot_size > 0',
'year_built > 1000',
'year_built < 2020',
'unit_sqft > 100',
'unit_sqft < 20000'
],
predict_filters=[
"general_type == 'Residential'"
],
model_expression='np.log1p(residential_sales_price) ~ I(year_built < 1940)'
'+ I(year_built > 2005) + np.log1p(unit_sqft) + np.log1p(unit_lot_size)'
'+ sum_residential_units + ave_lot_sqft + ave_unit_sqft + ave_income',
ytransform = np.exp
)
merged_df = orca.merge_tables(target="buildings", tables=["buildings", "zones"], columns=rm.columns_used())
import utils
merged_df["year_built"] = merged_df.year_built.fillna(merged_df.year_built.quantile())
merged_df["residential_sales_price"] = merged_df.residential_sales_price.fillna(0)
merged_df["general_type"] = merged_df.general_type.fillna(merged_df.general_type.value_counts().idxmax())
_ = utils.deal_with_nas(merged_df)
rm.fit(merged_df).summary()
Dep. Variable: | np.log1p(residential_sales_price) | R-squared: | 0.399 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.399 |
Method: | Least Squares | F-statistic: | 1.240e+04 |
Date: | Tue, 28 Apr 2015 | Prob (F-statistic): | 0.00 |
Time: | 10:38:22 | Log-Likelihood: | -2.5244e+05 |
No. Observations: | 149409 | AIC: | 5.049e+05 |
Df Residuals: | 149400 | BIC: | 5.050e+05 |
Df Model: | 8 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
Intercept | -3.8664 | 0.170 | -22.761 | 0.000 | -4.199 -3.533 |
I(year_built < 1940)[T.True] | 0.0011 | 0.007 | 0.144 | 0.885 | -0.013 0.016 |
I(year_built > 2005)[T.True] | -0.0632 | 0.049 | -1.298 | 0.194 | -0.159 0.032 |
np.log1p(unit_sqft) | -1.4830 | 0.007 | -210.907 | 0.000 | -1.497 -1.469 |
np.log1p(unit_lot_size) | -0.1476 | 0.006 | -23.219 | 0.000 | -0.160 -0.135 |
sum_residential_units | 0.0986 | 0.008 | 11.972 | 0.000 | 0.082 0.115 |
ave_lot_sqft | -0.2179 | 0.010 | -21.293 | 0.000 | -0.238 -0.198 |
ave_unit_sqft | 0.9376 | 0.020 | 45.828 | 0.000 | 0.898 0.978 |
ave_income | 1.4192 | 0.015 | 92.812 | 0.000 | 1.389 1.449 |
Omnibus: | 89673.667 | Durbin-Watson: | 1.798 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 819859.564 |
Skew: | -2.841 | Prob(JB): | 0.00 |
Kurtosis: | 12.970 | Cond. No. | 1.01e+03 |
rm.predict(merged_df).describe()
count 140782.000000 mean 1157.915780 std 41536.026751 min 0.001362 25% 252.965573 50% 380.341584 75% 554.749517 max 12517261.710714 dtype: float64