#!/usr/bin/env python
# coding: utf-8

# In[10]:


get_ipython().run_line_magic('matplotlib', 'inline')

import time
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm
import dask.dataframe as dk


# Settings:

# In[2]:


def reg(s):
    """Fit the OLS model ``y ~ x1 + x2`` on ``s`` and return its coefficients.

    ``s`` is the data handed to the formula interface; it must expose the
    columns ``y``, ``x1`` and ``x2``. The return value is the ``params``
    attribute of the fitted results object.
    """
    model = sm.ols(formula='y ~ x1 + x2', data=s)
    fitted = model.fit()
    return fitted.params


# In[4]:


# Benchmark settings: `n` is the number of rows in the synthetic dataset and
# `l` feeds the upper bound of the integer group labels.
n, l = 1000000, 100


# Set up a dataset to run a model by groups:

# In[7]:


# Synthetic dataset: two uniform regressors, a constant column, a uniform
# noise term and an integer group label.
x = pd.DataFrame(np.random.random((n, 2)), columns=['x1', 'x2'])
x['c'] = 1
x['u'] = np.random.random(n)
# `randint` excludes its upper bound, so labels run 1..l and there are exactly
# `l` groups. (The deprecated `random_integers` is inclusive and would draw
# l + 1 distinct labels from the same arguments.)
x['l'] = np.random.randint(1, l + 1, n)
g = x.groupby('l')
# Within a group labelled k every coefficient equals k:
#     y = k*c + k*x1 + k*x2 + u
# Computed for all rows at once rather than filling `y` group by group.
x['y'] = x['l'] * (x['c'] + x['x1'] + x['x2']) + x['u']


# `pandas` implementation:

# In[61]:


# Time the plain pandas run: apply `reg` to every 'l' group of `x` and keep
# the result in `b_pd`.
get_ipython().run_line_magic(
    "time",
    "b_pd = x.groupby('l').apply(reg)",
)


# `dask` implementation:

# In[51]:


# Wrap the pandas frame in a dask dataframe split into 10 partitions.
xd = dk.from_pandas(x, npartitions=10)


# In[62]:


# Time the dask run: same group-wise `reg` on `xd`, with `.compute()` forcing
# the evaluation; the result is kept in `b_dk`.
get_ipython().run_line_magic(
    "time",
    "b_dk = xd.groupby('l').apply(reg).compute()",
)


# Comparison over dataset sizes:

# In[7]:


# Benchmark both implementations over several dataset sizes. The cell fills
# `t_pd` / `t_dk` with elapsed seconds keyed by the row count `n`.
# Notes:
#   * the loop variable `n` overwrites the module-level `n`;
#   * the dask timing includes the `from_pandas` conversion, the pandas timing
#     covers only the groupby/apply;
#   * `randint` (exclusive upper bound) replaces the deprecated, inclusive
#     `random_integers`, so there are exactly `l` groups;
#   * `time.perf_counter()` is used because it is a monotonic clock meant for
#     measuring intervals.
get_ipython().run_cell_magic('time', '', """ns = [1000, 1000000, 10000000]
t_pd = {}
t_dk = {}
for n in ns:
    x = pd.DataFrame(np.random.random((n, 2)), columns=['x1', 'x2'])
    x['c'] = 1
    x['u'] = np.random.random(n)
    x['l'] = np.random.randint(1, l+1, n)
    g = x.groupby('l')
    x['y'] = x['l'] * (x['c'] + x['x1'] + x['x2']) + x['u']
    t0 = time.perf_counter()
    b_pd = x.groupby('l').apply(reg)
    t1 = time.perf_counter()
    t_pd[n] = t1-t0
    t0 = time.perf_counter()
    xd = dk.from_pandas(x, 10)
    b_dk = xd.groupby('l').apply(reg).compute()
    t1 = time.perf_counter()
    t_dk[n] = t1-t0
""")


# In[8]:


# Gather the timings into one table with a column per implementation; the
# rows come from the keys of the `t_pd` / `t_dk` dicts.
res = pd.DataFrame(dict(pandas=t_pd, dask=t_dk))
res


# In[11]:


# Line plot of the timing table, one line per column.
res.plot(kind='line')