In [10]:

%matplotlib inline

import time
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm
import dask.dataframe as dk

Regression helper and settings:

In [2]:

def reg(s):
    """Fit an OLS regression of ``y`` on ``x1`` and ``x2`` and return its coefficients.

    Parameters
    ----------
    s : data container accepted by the formula API
        Must expose the columns ``y``, ``x1`` and ``x2`` (in this notebook it
        is the sub-frame of a single group).

    Returns
    -------
    The ``params`` attribute of the fitted model.
    """
    fitted = sm.ols(formula='y ~ x1 + x2', data=s).fit()
    return fitted.params

In [4]:

# Size of the single-run benchmark dataset (rows).
n = 10 ** 6
# Controls the range of integer group labels drawn when the dataset is built.
l = 100

Set up a dataset to run a model by groups:

In [7]:

# Synthetic data: two uniform regressors, a constant column, uniform noise
# and an integer group label for every row.
x = pd.DataFrame(np.random.random((n, 2)), columns=['x1', 'x2'])
x['c'] = 1
x['u'] = np.random.random(n)
# randint excludes its upper bound, so labels are 1..l, i.e. exactly `l` groups.
# The deprecated random_integers(1, l+1, n) included the upper bound and
# therefore produced l+1 groups.
x['l'] = np.random.randint(1, l + 1, n)
# Within a group with label k: y = k*c + k*x1 + k*x2 + u, so every true
# coefficient equals the group's label. Computed in one vectorized step
# instead of a per-group loop with repeated .loc assignments.
x['y'] = x[['c', 'x1', 'x2']].mul(x['l'], axis=0).sum(axis=1) + x['u']
# Kept for compatibility with any cell that still expects the grouped view.
g = x.groupby('l')

pandas implementation:

In [61]:

# Time the per-group regression in plain pandas: `reg` is applied to each
# group of `x` keyed by column 'l'.
%time b_pd = x.groupby('l').apply(reg)

CPU times: user 654 ms, sys: 117 ms, total: 770 ms
Wall time: 801 ms

dask implementation:

In [51]:

# Wrap the pandas frame in a dask dataframe split into 10 partitions.
xd = dk.from_pandas(x, npartitions=10)

In [62]:

# Time the same per-group regression through dask; the timed statement
# includes the final .compute() call.
%time b_dk = xd.groupby('l').apply(reg).compute()

CPU times: user 1.26 s, sys: 286 ms, total: 1.55 s
Wall time: 1.45 s

Comparison over dataset sizes:

In [7]:

%%time
ns = [1000, 1000000, 10000000]
t_pd = {}
t_dk = {}
for n in ns:
    x = pd.DataFrame(np.random.random((n, 2)), columns=['x1', 'x2'])
    x['c'] = 1
    x['u'] = np.random.random(n)
    x['l'] = np.random.random_integers(1, l+1, n)
    g = x.groupby('l')
    for id, sub in g:
        test = sub[['c', 'x1', 'x2']].dot(np.array([sub['l'].iloc[0]]*3)) + sub['u']
        x.loc[test.index, 'y'] = test
    t0 = time.time()
    b_pd = x.groupby('l').apply(reg)
    t1 = time.time()
    t_pd[n] = t1-t0
    t0 = time.time()
    xd = dk.from_pandas(x, 10)
    b_dk = xd.groupby('l').apply(reg).compute()
    t1 = time.time()
    t_dk[n] = t1-t0

CPU times: user 1min 12s, sys: 23.3 s, total: 1min 36s
Wall time: 2min 15s

In [8]:

# Timing table: one row per dataset size, one column per implementation
# (values are the elapsed seconds recorded in t_pd / t_dk).
res = pd.DataFrame(dict(pandas=t_pd, dask=t_dk))
res

Out[8]:

	dask	pandas
1000	9.358188	1.158390
1000000	21.577656	3.056699
10000000	34.014051	19.967284

In [11]:

# The dataset sizes in the index span several orders of magnitude, so a
# log-scaled x axis keeps the smaller sizes from collapsing onto each other.
ax = res.plot(logx=True, marker='o', title='Per-group OLS: pandas vs dask')
ax.set_xlabel('number of rows')
ax.set_ylabel('wall time (s)')
ax

Out[11]:

<matplotlib.axes._subplots.AxesSubplot at 0x10c3eead0>