NOTE: work in progress
This document presents a comparison between the Python library PySAL
and different packages of the R
statistical environment relating to spatial econometric functionality.
The following functionality is tested:
This is an IPython notebook hosted on github as a gist. You can see an online rendered version on this link or downloaded on this link. Feel free to fork it and have a go at playing with it. Any change proposal can be issued through a PR following standard Github procedures.
%pylab inline
Populating the interactive namespace from numpy and matplotlib
import time, seaborn
import pandas as pd
import numpy as np
import pysal as ps
import matplotlib.pyplot as plt
from scipy.sparse import eye
from scipy.sparse.linalg import inv as spinv
from scipy.linalg import inv
from pysal.spreg.utils import spdot
%load_ext rpy2.ipython
%%R
library(spdep)
library(sphet)
print(sessionInfo())
Loading required package: sp Loading required package: Matrix R version 3.0.2 (2013-09-25) Platform: x86_64-pc-linux-gnu (64-bit) locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 [7] LC_PAPER=en_US.UTF-8 LC_NAME=C [9] LC_ADDRESS=C LC_TELEPHONE=C [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C attached base packages: [1] tools stats graphics grDevices utils datasets methods [8] base other attached packages: [1] sphet_1.5 spdep_0.5-77 Matrix_1.1-2 sp_1.0-13 loaded via a namespace (and not attached): [1] boot_1.3-9 coda_0.16-1 deldir_0.0-22 grid_3.0.2 [5] lattice_0.20-24 LearnBayes_2.12 MASS_7.3-29 nlme_3.1-113 [9] parallel_3.0.2 splines_3.0.2
%%R
# Utilities to run tests on the R sides
buildW <- function(n){
t0 = Sys.time()
w = cell2nb(sqrt(n), sqrt(n))
we = nb2listw(w)
t1 = Sys.time()
out = list('we'=we, 'tim'=t1-t0)
return(out)
}
runOLS <- function(y, x){
db <- cbind(data.frame(y=y), data.frame(x))
t0 = Sys.time()
m <- lm('y ~ V1 + V2 + V3', db)
t1 = Sys.time()
return(t1-t0)
}
runGMERR <- function(y, x, w){
db <- cbind(data.frame(y=y), data.frame(x))
t0 = Sys.time()
m <- spreg(y ~ V1 + V2 + V3, db, listw=w)
t1 = Sys.time()
return(t1-t0)
}
runMLERR <- function(y, x, w){
db <- cbind(data.frame(y=y), data.frame(x))
t0 = Sys.time()
m <- errorsarlm(y ~ V1 + V2 + V3, db, listw=w)
t1 = Sys.time()
return(t1-t0)
}
runGMLAG <- function(y, x, w){
db <- cbind(data.frame(y=y), data.frame(x))
t0 = Sys.time()
m <- spreg(y ~ V1 + V2 + V3, db, listw=w, model = "lag")
t1 = Sys.time()
return(t1-t0)
}
runMLLAG <- function(y, x, w){
db <- cbind(data.frame(y=y), data.frame(x))
t0 = Sys.time()
m <- lagsarlm(y ~ V1 + V2 + V3, db, listw=w)
t1 = Sys.time()
return(t1-t0)
}
np.random.seed(1234)
def sim(sizes, rho, lamb, ml=False):
timings = pd.DataFrame()
%R timingsR = data.frame()
for n in sizes:
### Data
print "Working on Size %i"%n
timing = {}
# Python
t0 = time.time()
w = ps.lat2W(int(np.sqrt(n)), int(np.sqrt(n)))
t1 = time.time()
w.transform = 'R'
timing['W'] = t1 - t0
x = np.random.random((n, 3))
X = np.hstack((np.ones((n, 1)), x))
eps = np.random.normal(size=(n, 1))
xb =np.dot(X, betas)
spxb_r = ps.spreg.utils.power_expansion(w, xb, rho)
speps_r = ps.spreg.utils.power_expansion(w, eps, rho)
speps_l = ps.spreg.utils.power_expansion(w, eps, lamb)
y = xb + eps
y_lag = spxb_r + speps_r
y_err = xb + speps_l
# R
%Rpush n x y y_lag y_err
%R x = as.data.frame(x)
%R wtw = buildW(n)
%R res = data.frame(W=wtw$tim)
%R w = wtw$we
### OLS
t0 = time.time()
m = ps.spreg.OLS(y, x)
t1 = time.time()
timing['OLS'] = t1 - t0
%R t = runOLS(y, x)
%R res = cbind(res, data.frame(OLS=t))
### Spatial Error
t0 = time.time()
m = ps.spreg.GM_Error_Het(y_err, x, w=w)
t1 = time.time()
timing['Err_gm'] = t1 - t0
%R t = runGMERR(y_err, x, w)
%R res = cbind(res, data.frame(Err_gm=t))
if ml:
t0 = time.time()
m = ps.spreg.ML_Error(y, x, w=w)
t1 = time.time()
timing['Err_ml'] = t1 - t0
%R t = runMLERR(y_err, x, w)
%R res = cbind(res, data.frame(Err_ml=t))
# Spatial lag
t0 = time.time()
m = ps.spreg.GM_Lag(y_lag, x, w=w)
t1 = time.time()
timing['Lag_gm'] = t1 - t0
%R t = runGMLAG(y_lag, x, w)
%R res = cbind(res, data.frame(Lag_gm=t))
if ml:
t0 = time.time()
m = ps.spreg.ML_Lag(y_lag, x, w=w)
t1 = time.time()
timing['Lag_ml'] = t1 - t0
%R t = runMLLAG(y_lag, x, w)
%R res = cbind(res, data.frame(Lag_ml=t))
timings['n%i'%n] = pd.Series(timing)
%R row.names(res) = paste('n', n, sep='')
%R timingsR = rbind(timingsR, res)
timingsPY = timings.T
timingsPY.index = pd.MultiIndex.from_arrays([['Py']*timingsPY.shape[0], \
timingsPY.index])
%Rpull timingsR
ri = %R rownames(timingsR)
timingsR.index = pd.MultiIndex.from_arrays([['R']*ri.shape[0], ri])
timings = timingsPY.T.join(timingsR.T).T
return timings
The basic model we are going to test against is a linear regression with four explanatory variables to which, depending on the model, we will be appending terms:
$$ y = (\rho W y) + \alpha + \beta_1 X_1 + \beta_2 X_2 + \beta_3 X_3 + \beta_4 X_4 + (\lambda W u) + \epsilon $$We will record the running time of these models for the following number of observations:
sizes = [100, 625, 900, 3025, 4900, 10000, 25600, 50625, 99225]
betas = np.array([[1, 1, 1, 1]]).T
rho = 0.5
lamb = 0.5
%%time
ssizes = sizes[:5]
stimings = sim(ssizes, rho, lamb, ml=True)
/home/dani/code/pysal_darribas/pysal/spreg/user_output.py:349: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future. if i == None: /home/dani/anaconda/envs/pydata/lib/python2.7/site-packages/scipy/optimize/_minimize.py:570: RuntimeWarning: Method 'bounded' does not support relative tolerance in x; defaulting to absolute tolerance. "defaulting to absolute tolerance.", RuntimeWarning)
need more than 0 values to unpack Working on Size 100 Working on Size 625 Working on Size 900 Working on Size 3025 Working on Size 4900 CPU times: user 45min 17s, sys: 1min 46s, total: 47min 3s Wall time: 32min 38s
/home/dani/code/pysal_darribas/pysal/spreg/twosls.py:139: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future. if q == None and h == None:
stimings
Err_gm | Err_ml | Lag_gm | Lag_ml | OLS | W | ||
---|---|---|---|---|---|---|---|
Py | n100 | 0.022200 | 0.015826 | 0.003856 | 0.018613 | 0.002736 | 0.000991 |
n625 | 0.025810 | 0.594324 | 0.003643 | 0.635988 | 0.002734 | 0.063488 | |
n900 | 0.027929 | 1.651768 | 0.003526 | 1.440586 | 0.002547 | 0.003898 | |
n3025 | 0.040959 | 54.081272 | 0.004570 | 43.543330 | 0.003509 | 0.014238 | |
n4900 | 0.054737 | 1309.297623 | 0.007065 | 198.539012 | 0.004447 | 0.026224 | |
R | n100 | 0.516554 | 0.312385 | 0.006613 | 0.228411 | 0.008700 | 0.099601 |
n625 | 1.223979 | 1.473421 | 0.008109 | 0.891190 | 0.003769 | 0.755120 | |
n900 | 1.265054 | 1.841072 | 0.008744 | 1.767377 | 0.003879 | 0.812916 | |
n3025 | 3.664639 | 32.921955 | 0.014100 | 32.429866 | 0.006280 | 2.649967 | |
n4900 | 5.406586 | 2.101724 | 0.020745 | 2.145562 | 0.008604 | 4.350219 |
for col in stimings:
f, ax = plt.subplots(1)
stimings[col].groupby(level=0).plot(ax=ax)
plt.xlabel('N. Observations')
plt.ylabel('Seconds')
plt.legend()
plt.title(col)
%%time
timings = sim(sizes, rho, lamb)
need more than 0 values to unpack Working on Size 100 Working on Size 625 Working on Size 900 Working on Size 3025 Working on Size 4900 Working on Size 10000 Working on Size 25600 Error: cannot allocate vector of size 4.9 Gb In addition: There were 50 or more warnings (use warnings() to see the first 50) Error: cannot allocate vector of size 4.9 Gb Working on Size 50625 Error: cannot allocate vector of size 19.1 Gb In addition: Warning messages: 1: In if (!(model %in% c("ols", "ols.end"))) { : the condition has length > 1 and only the first element will be used 2: In if (model %in% c("sarar", "lag", "ivhac")) { : the condition has length > 1 and only the first element will be used 3: In if (model == "error") { : the condition has length > 1 and only the first element will be used 4: In if (model == "ols.end") { : the condition has length > 1 and only the first element will be used 5: In if (model %in% c("sarar", "error")) { : the condition has length > 1 and only the first element will be used Error: cannot allocate vector of size 19.1 Gb Working on Size 99225 Error: cannot allocate vector of size 73.4 Gb In addition: Warning messages: 1: In if (!(model %in% c("ols", "ols.end"))) { : the condition has length > 1 and only the first element will be used 2: In if (model %in% c("sarar", "lag", "ivhac")) { : the condition has length > 1 and only the first element will be used 3: In if (model == "error") { : the condition has length > 1 and only the first element will be used 4: In if (model == "ols.end") { : the condition has length > 1 and only the first element will be used 5: In if (model %in% c("sarar", "error")) { : the condition has length > 1 and only the first element will be used Error: cannot allocate vector of size 73.4 Gb CPU times: user 3min 45s, sys: 30 s, total: 4min 15s Wall time: 22min 13s
timings
Err_gm | Lag_gm | OLS | W | ||
---|---|---|---|---|---|
Py | n100 | 0.020799 | 0.003664 | 0.002098 | 0.000670 |
n625 | 0.026594 | 0.002983 | 0.002443 | 0.005336 | |
n900 | 0.027249 | 0.003719 | 0.002479 | 0.006735 | |
n3025 | 0.041921 | 0.005710 | 0.003510 | 0.029185 | |
n4900 | 0.054417 | 0.006932 | 0.004522 | 0.045991 | |
n10000 | 0.088830 | 0.205651 | 0.006430 | 0.084250 | |
n25600 | 0.194143 | 6.968898 | 0.136538 | 0.785413 | |
n50625 | 0.491060 | 0.061421 | 0.348023 | 49.088749 | |
n99225 | 0.687504 | 0.123573 | 0.056800 | 0.789296 | |
R | n100 | 0.372393 | 0.006577 | 0.002660 | 0.087507 |
n625 | 0.356954 | 0.007866 | 0.003204 | 0.546080 | |
n900 | 0.413155 | 0.008901 | 0.003610 | 0.772779 | |
n3025 | 1.610812 | 0.014108 | 0.006154 | 2.663630 | |
n4900 | 3.627312 | 0.019419 | 0.008348 | 4.076419 | |
n10000 | 20.597504 | 0.076775 | 0.015285 | 8.635177 | |
n25600 | 0.056522 | 2.104784 | 0.056522 | 22.532605 | |
n50625 | 0.161837 | 0.203868 | 0.161837 | 47.339219 | |
n99225 | 0.393457 | 0.421947 | 0.393457 | 1.564577 |
for col in timings:
f, ax = plt.subplots(1)
timings[col].groupby(level=0).plot(ax=ax)
plt.xlabel('N. Observations')
plt.ylabel('Seconds')
plt.legend()
plt.title(col)