%matplotlib inline
import time
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm
import dask.dataframe as dk
Settings:
def reg(s):
    """Fit the OLS model ``y ~ x1 + x2`` on ``s`` and return its parameters.

    ``s`` is the data handed to the formula API; it must provide the
    columns ``y``, ``x1`` and ``x2`` named in the formula.  The return
    value is the ``params`` attribute of the fitted result.
    """
    model = sm.ols('y ~ x1 + x2', s)
    fitted = model.fit()
    return fitted.params
# Number of rows in the simulated dataset.
n = 1000000
# Controls the range of the integer group label: `l+1` is passed as the upper
# bound when the group column 'l' is drawn.
l = 100
Set up a dataset to run a model by groups:
# Simulated regressors: two uniform [0, 1) columns, a constant column 'c'
# and a uniform noise term 'u'.
x = pd.DataFrame(np.random.random((n, 2)), columns=['x1', 'x2'])
x['c'] = 1
x['u'] = np.random.random(n)
# Group label drawn from 1..l.  `randint` excludes its upper bound, so this
# yields exactly l distinct groups; the deprecated `random_integers` includes
# the upper bound and would yield l+1 groups for the same arguments.
x['l'] = np.random.randint(1, l + 1, n)
# Within each group every coefficient equals the group label:
#     y = l*c + l*x1 + l*x2 + u
# One vectorized expression builds the whole column, instead of looping over
# the groups and writing each slice back through x.loc.
x['y'] = x['l'] * (x['c'] + x['x1'] + x['x2']) + x['u']
`pandas` implementation:
# Time a single pandas pass: group x by the label column 'l' and apply `reg`
# to every group; the result is kept in b_pd.
%time b_pd = x.groupby('l').apply(reg)
CPU times: user 654 ms, sys: 117 ms, total: 770 ms Wall time: 801 ms
`dask` implementation:
# Wrap the pandas frame in a dask dataframe.  The second positional argument
# (10) is presumably `npartitions` -- confirm against the installed dask version.
xd = dk.from_pandas(x, 10)
# Time the same groupby/apply through dask.  Only this statement is timed, so
# the from_pandas conversion above is not part of the measurement; .compute()
# is what returns the concrete result stored in b_dk.
%time b_dk = xd.groupby('l').apply(reg).compute()
CPU times: user 1.26 s, sys: 286 ms, total: 1.55 s Wall time: 1.45 s
Comparison over dataset sizes:
%%time
ns = [1000, 1000000, 10000000]
t_pd = {}
t_dk = {}
for n in ns:
x = pd.DataFrame(np.random.random((n, 2)), columns=['x1', 'x2'])
x['c'] = 1
x['u'] = np.random.random(n)
x['l'] = np.random.random_integers(1, l+1, n)
g = x.groupby('l')
for id, sub in g:
test = sub[['c', 'x1', 'x2']].dot(np.array([sub['l'].iloc[0]]*3)) + sub['u']
x.loc[test.index, 'y'] = test
t0 = time.time()
b_pd = x.groupby('l').apply(reg)
t1 = time.time()
t_pd[n] = t1-t0
t0 = time.time()
xd = dk.from_pandas(x, 10)
b_dk = xd.groupby('l').apply(reg).compute()
t1 = time.time()
t_dk[n] = t1-t0
CPU times: user 1min 12s, sys: 23.3 s, total: 1min 36s Wall time: 2min 15s
# Collect the timings into one table: one row per dataset size (the dict
# keys of t_pd / t_dk), one column per backend.
res = pd.DataFrame(dict(pandas=t_pd, dask=t_dk))
# Bare expression so the notebook renders the table as the cell output.
res
| | dask | pandas |
|---|---|---|
| 1000 | 9.358188 | 1.158390 |
| 1000000 | 21.577656 | 3.056699 |
| 10000000 | 34.014051 | 19.967284 |
# Plot wall-clock seconds against dataset size.  The sizes span four orders
# of magnitude (1e3 to 1e7), so the x axis is logarithmic, and markers show
# the three measured points rather than only the connecting lines.
res.plot(logx=True, marker='o')
<matplotlib.axes._subplots.AxesSubplot at 0x10c3eead0>