#!/usr/bin/env python
# coding: utf-8

# Comparison of `dask_glm` and scikit-learn on the [SUSY dataset](https://archive.ics.uci.edu/ml/datasets/SUSY).

# In[3]:


import numpy as np
import pandas as pd
import dask
from distributed import Client
import dask.array as da
from sklearn import linear_model
from dask_glm.estimators import LogisticRegression


# In[4]:


df = pd.read_csv("SUSY.csv.gz", header=None)
df.head()


# In[5]:


len(df)


# We have 5,000,000 rows of all-numeric data, so we'll skip feature engineering and preprocessing.

# In[6]:


y = df[0].values
X = df.drop(0, axis=1).values


# In[29]:


C = 10      # inverse regularization strength, for scikit-learn
λ = 1 / C   # the equivalent regularization strength for dask-glm (passed explicitly in the appendix sketch below)


# ## Scikit-learn
#
# First, we run scikit-learn's `LogisticRegression` on the full dataset.

# In[27]:


get_ipython().run_cell_magic('time', '', "lm = linear_model.LogisticRegression(penalty='l1', C=C)\nlm.fit(X, y)\n")


# In[28]:


get_ipython().run_cell_magic('time', '', 'lm.score(X, y)\n')


# ## Dask GLM
#
# Now for the dask-glm version.

# In[9]:


client = Client()

K = 100000  # rows per chunk of the dask arrays
dX = da.from_array(X, chunks=(K, X.shape[-1]))
dy = da.from_array(y, chunks=(K,))
dX, dy = dask.persist(dX, dy)   # keep the blocks in distributed memory
client.rebalance()              # spread the persisted blocks evenly across workers


# In[25]:


get_ipython().run_cell_magic('time', '', 'dk = LogisticRegression()\ndk.fit(dX, dy)\n')


# In[26]:


get_ipython().run_cell_magic('time', '', 'dk.score(dX, dy)\n')


# | Library      | Training time (mm:ss) | Score |
# | ------------ | --------------------- | ----- |
# | dask-glm     | 9:25                  | 0.788 |
# | scikit-learn | 25:01                 | 0.788 |
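# ## Appendix: hedged sketches
#
# `Client()` above starts a `LocalCluster` with default settings. A minimal sketch of
# configuring the local cluster explicitly, for a fresh session; the worker count,
# thread count, and memory limit are illustrative assumptions, not the settings behind
# the timings above.

# In[ ]:


from distributed import Client, LocalCluster

cluster = LocalCluster(n_workers=4, threads_per_worker=1,
                       memory_limit='8GB')  # hypothetical sizing; adjust to your machine
client = Client(cluster)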
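# The cells above need `SUSY.csv.gz` on disk. Here is a self-contained sketch of the
# same chunk → persist → fit → score pipeline on synthetic data, with the
# regularization passed explicitly. It assumes dask-glm's `regularizer`/`lamduh`
# estimator keywords; the sample count and chunk size are illustrative, not the SUSY
# setup.

# In[ ]:


import dask
import dask.array as da
from dask_glm.estimators import LogisticRegression
from sklearn.datasets import make_classification

# Synthetic stand-in for SUSY: 100,000 rows, 18 numeric features.
X, y = make_classification(n_samples=100000, n_features=18, random_state=0)

K = 10000  # rows per chunk
dX = da.from_array(X, chunks=(K, X.shape[-1]))
dy = da.from_array(y, chunks=(K,))
dX, dy = dask.persist(dX, dy)

# lamduh = 1 / C = 0.1 mirrors the scikit-learn run above (assumed keyword names).
dk = LogisticRegression(regularizer='l1', lamduh=0.1)
dk.fit(dX, dy)
print(dk.score(dX, dy))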
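# The `%%time` cells export as `get_ipython().run_cell_magic(...)` calls, which only
# run under IPython. A sketch of timing the same two steps from a plain Python script,
# using only the standard library:

# In[ ]:


import time

start = time.perf_counter()
dk.fit(dX, dy)
print("fit:   {:.1f}s".format(time.perf_counter() - start))

start = time.perf_counter()
acc = float(dk.score(dX, dy))  # float() forces computation if a dask scalar is returned
print("score: {:.1f}s (accuracy {:.3f})".format(time.perf_counter() - start, acc))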