#!/usr/bin/env python
# coding: utf-8
"""Benchmark of group-wise OLS regressions: pandas versus dask.

Exported from a Jupyter notebook.  The script builds a random dataset with
a group label, fits ``y ~ x1 + x2`` separately within every group using
``groupby(...).apply`` in pandas and in dask, and compares wall-clock
times over several dataset sizes.
"""

# In[10]:

import time

import dask.dataframe as dk
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm

try:
    # ``get_ipython`` only exists inside IPython/Jupyter; skipping the magic
    # lets the file also run under a plain ``python`` interpreter.
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass


# Settings:

# In[2]:

def reg(s):
    """Fit ``y ~ x1 + x2`` by OLS on the frame ``s``.

    Returns the fitted parameters (``lm.params``) of the statsmodels fit.
    """
    lm = sm.ols('y ~ x1 + x2', s).fit()
    return lm.params


def make_data(n, groups):
    """Build a random dataset with ``n`` rows for a regression by groups.

    Columns, in order: ``x1`` and ``x2`` (uniform draws from
    ``np.random.random``), ``c`` (constant 1), ``u`` (uniform noise), ``l``
    (integer group label) and ``y = l * (c + x1 + x2) + u``.
    """
    x = pd.DataFrame(np.random.random((n, 2)), columns=['x1', 'x2'])
    x['c'] = 1
    x['u'] = np.random.random(n)
    # Replaces the deprecated ``np.random.random_integers(1, groups + 1, n)``.
    # That call drew from the closed interval [1, groups + 1]; ``randint``
    # has an exclusive upper bound, hence ``groups + 2``.
    # NOTE(review): this yields groups + 1 distinct labels -- confirm that
    # is intended rather than exactly ``groups`` labels.
    x['l'] = np.random.randint(1, groups + 2, n)
    # Vectorised form of the original per-group loop, which computed
    # sub[['c', 'x1', 'x2']].dot([l, l, l]) + sub['u'] for each group and
    # wrote it back with ``.loc``.
    x['y'] = x['l'] * (x['c'] + x['x1'] + x['x2']) + x['u']
    return x


def timed(func):
    """Call ``func()`` and return ``(result, elapsed_seconds)``."""
    start = time.perf_counter()
    result = func()
    return result, time.perf_counter() - start


# In[4]:

n = 1000000
l = 100


# Set up a dataset to run a model by groups:

# In[7]:

x = make_data(n, l)


# `pandas` implementation:

# In[61]:

b_pd, elapsed = timed(lambda: x.groupby('l').apply(reg))
print('pandas: {:.2f} s'.format(elapsed))


# `dask` implementation:

# In[51]:

xd = dk.from_pandas(x, npartitions=10)


# In[62]:

b_dk, elapsed = timed(lambda: xd.groupby('l').apply(reg).compute())
print('dask: {:.2f} s'.format(elapsed))


# Comparison over dataset sizes:

# In[7]:

ns = [1000, 1000000, 10000000]
t_pd = {}
t_dk = {}
total_start = time.perf_counter()
# Loop-local names are used so that the benchmark does not overwrite the
# ``n``, ``x``, ``xd``, ``b_pd`` and ``b_dk`` objects created above.
for size in ns:
    data = make_data(size, l)

    _, t_pd[size] = timed(lambda: data.groupby('l').apply(reg))

    # As in the original, the dask timing includes the conversion from a
    # pandas frame to a dask frame.
    _, t_dk[size] = timed(
        lambda: dk.from_pandas(data, npartitions=10)
        .groupby('l').apply(reg).compute()
    )
print('comparison total: {:.2f} s'.format(time.perf_counter() - total_start))


# In[8]:

res = pd.DataFrame({'pandas': t_pd, 'dask': t_dk})
print(res)


# In[11]:

res.plot()