In [10]:
%matplotlib inline

import time
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm
import dask.dataframe as dk

Settings:

In [2]:
def reg(s):
    """Fit the OLS model ``y ~ x1 + x2`` on one group of rows.

    Parameters
    ----------
    s : data passed to ``sm.ols`` as the ``data`` argument; it must expose
        the variables ``y``, ``x1`` and ``x2`` named in the formula.

    Returns
    -------
    The ``params`` attribute of the fitted results (the estimated
    coefficients of the model).
    """
    fitted = sm.ols(formula='y ~ x1 + x2', data=s).fit()
    return fitted.params
In [4]:
# n: number of simulated observations (rows) in the dataset.
n = 10 ** 6
# l: bound used when drawing the integer group labels stored in column 'l'.
l = 100

Simulate a dataset on which the same regression model is fitted separately for each group:

In [7]:
# Simulate n observations with two uniform regressors (x1, x2), a constant
# column c, uniform noise u and an integer group label l.
np.random.seed(0)  # fixed seed so a fresh "Restart & Run All" rebuilds the same data
x = pd.DataFrame(np.random.random((n, 2)), columns=['x1', 'x2'])
x['c'] = 1
x['u'] = np.random.random(n)
# randint excludes the upper bound, so labels are exactly 1..l (l groups).
# The deprecated random_integers(1, l+1, n) included it and yielded l+1 groups.
x['l'] = np.random.randint(1, l + 1, n)
# Within each group every coefficient (intercept, x1, x2) equals the group
# label, so y = l*(c + x1 + x2) + u. Computed in one vectorised pass instead
# of looping over the groups and assigning through .loc.
x['y'] = x['l'] * (x['c'] + x['x1'] + x['x2']) + x['u']

pandas implementation:

In [61]:
# Fit `reg` once per value of the group label 'l' using pandas groupby-apply; %time reports the cost of this single statement.
%time b_pd = x.groupby('l').apply(reg)
CPU times: user 654 ms, sys: 117 ms, total: 770 ms
Wall time: 801 ms

dask implementation:

In [51]:
# Wrap the pandas frame in a dask DataFrame split into 10 partitions.
xd = dk.from_pandas(x, npartitions=10)
In [62]:
# Same per-group fit through dask; .compute() is inside the timed statement, so the measurement covers evaluating the dask result.
%time b_dk = xd.groupby('l').apply(reg).compute()
CPU times: user 1.26 s, sys: 286 ms, total: 1.55 s
Wall time: 1.45 s

Comparison over dataset sizes:

In [7]:
%%time
# Benchmark the per-group regression with pandas and with dask for several
# dataset sizes. t_pd / t_dk map dataset size -> elapsed seconds.
ns = [1000, 1000000, 10000000]
t_pd = {}
t_dk = {}
np.random.seed(0)  # fixed seed so the simulated data are reproducible
# The loop uses its own names (size, x_n, xd_n) so that the settings `n` and
# the frames `x` / `xd` used by the earlier cells are not overwritten.
for size in ns:
    x_n = pd.DataFrame(np.random.random((size, 2)), columns=['x1', 'x2'])
    x_n['c'] = 1
    x_n['u'] = np.random.random(size)
    # randint excludes the upper bound: labels are exactly 1..l (l groups).
    # The deprecated random_integers(1, l+1, size) produced l+1 groups.
    x_n['l'] = np.random.randint(1, l + 1, size)
    # y = l*(c + x1 + x2) + u, computed in one vectorised pass rather than
    # group by group.
    x_n['y'] = x_n['l'] * (x_n['c'] + x_n['x1'] + x_n['x2']) + x_n['u']

    # pandas: time only the groupby-apply.
    t0 = time.time()
    x_n.groupby('l').apply(reg)
    t_pd[size] = time.time() - t0

    # dask: the timed region includes the from_pandas conversion as well as
    # the groupby-apply and compute().
    t0 = time.time()
    xd_n = dk.from_pandas(x_n, 10)
    xd_n.groupby('l').apply(reg).compute()
    t_dk[size] = time.time() - t0
CPU times: user 1min 12s, sys: 23.3 s, total: 1min 36s
Wall time: 2min 15s
In [8]:
# Collect the timings into one table: one row per dataset size, one column
# per implementation (values are elapsed seconds).
res = pd.DataFrame(
    data={'pandas': t_pd, 'dask': t_dk},
)
res
Out[8]:
dask pandas
1000 9.358188 1.158390
1000000 21.577656 3.056699
10000000 34.014051 19.967284
In [11]:
# Plot elapsed time against dataset size, one line per implementation.
# Title and axis labels are set so the figure can be read on its own; the
# Axes object stays the last expression so the cell still displays it.
ax = res.plot(title='Per-group OLS via groupby-apply: pandas vs dask')
ax.set_xlabel('number of rows (n)')
ax.set_ylabel('elapsed time (seconds)')
ax
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x10c3eead0>