#!/usr/bin/env python
# coding: utf-8

# Comparison of `dask_glm` and scikit-learn on the [SUSY dataset](https://archive.ics.uci.edu/ml/datasets/SUSY).

# In[3]:


import numpy as np
import pandas as pd
import dask
from distributed import Client
import dask.array as da
from sklearn import linear_model
from dask_glm.estimators import LogisticRegression


# In[4]:


df = pd.read_csv("SUSY.csv.gz", header=None)
df.head()


# In[5]:


len(df)


# We have 5,000,000 rows of all-numeric data, so we'll skip feature engineering and preprocessing.

# In[6]:


y = df[0].values
X = df.drop(0, axis=1).values


# In[29]:


C = 10      # inverse regularization strength, for scikit-learn
λ = 1 / C   # the equivalent regularization strength for dask-glm (passed explicitly in the appendix sketch below)


# ## Scikit-learn
#
# First, we run scikit-learn's `LogisticRegression` on the full dataset.

# In[27]:


get_ipython().run_cell_magic('time', '', "lm = linear_model.LogisticRegression(penalty='l1', C=C)\nlm.fit(X, y)\n")


# In[28]:


get_ipython().run_cell_magic('time', '', 'lm.score(X, y)\n')


# ## Dask GLM
#
# Now for the dask-glm version.

# In[9]:


client = Client()

K = 100000  # rows per chunk of the dask arrays
dX = da.from_array(X, chunks=(K, X.shape[-1]))
dy = da.from_array(y, chunks=(K,))
dX, dy = dask.persist(dX, dy)   # keep the blocks in distributed memory
client.rebalance()              # spread the persisted blocks evenly across workers


# In[25]:


get_ipython().run_cell_magic('time', '', 'dk = LogisticRegression()\ndk.fit(dX, dy)\n')


# In[26]:


get_ipython().run_cell_magic('time', '', 'dk.score(dX, dy)\n')


# | Library      | Training time (mm:ss) | Score |
# | ------------ | --------------------- | ----- |
# | dask-glm     | 9:25                  | 0.788 |
# | scikit-learn | 25:01                 | 0.788 |
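# ## Appendix: hedged sketches
#
# `Client()` above starts a `LocalCluster` with default settings. A minimal sketch of
# configuring the local cluster explicitly, for a fresh session; the worker count,
# thread count, and memory limit are illustrative assumptions, not the settings behind
# the timings above.

# In[ ]:


from distributed import Client, LocalCluster

cluster = LocalCluster(n_workers=4, threads_per_worker=1,
                       memory_limit='8GB')  # hypothetical sizing; adjust to your machine
client = Client(cluster)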
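# The cells above need `SUSY.csv.gz` on disk. Here is a self-contained sketch of the
# same chunk → persist → fit → score pipeline on synthetic data, with the
# regularization passed explicitly. It assumes dask-glm's `regularizer`/`lamduh`
# estimator keywords; the sample count and chunk size are illustrative, not the SUSY
# setup.

# In[ ]:


import dask
import dask.array as da
from dask_glm.estimators import LogisticRegression
from sklearn.datasets import make_classification

# Synthetic stand-in for SUSY: 100,000 rows, 18 numeric features.
X, y = make_classification(n_samples=100000, n_features=18, random_state=0)

K = 10000  # rows per chunk
dX = da.from_array(X, chunks=(K, X.shape[-1]))
dy = da.from_array(y, chunks=(K,))
dX, dy = dask.persist(dX, dy)

# lamduh = 1 / C = 0.1 mirrors the scikit-learn run above (assumed keyword names).
dk = LogisticRegression(regularizer='l1', lamduh=0.1)
dk.fit(dX, dy)
print(dk.score(dX, dy))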
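# The `%%time` cells export as `get_ipython().run_cell_magic(...)` calls, which only
# run under IPython. A sketch of timing the same two steps from a plain Python script,
# using only the standard library:

# In[ ]:


import time

start = time.perf_counter()
dk.fit(dX, dy)
print("fit:   {:.1f}s".format(time.perf_counter() - start))

start = time.perf_counter()
acc = float(dk.score(dX, dy))  # float() forces computation if a dask scalar is returned
print("score: {:.1f}s (accuracy {:.3f})".format(time.perf_counter() - start, acc))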