#!/usr/bin/env python
# coding: utf-8

# # H2O.ai XGBoost GPU Benchmarks

# In this notebook, we benchmark the latest version of [XGBoost](https://github.com/h2oai/xgboost), the well-known Kaggle-winning gradient boosting algorithm, and in particular, the [XGBoost GPU plugin](https://github.com/h2oai/xgboost/blob/master/plugin/updater_gpu/README.md). We also showcase the integration of XGBoost (incl. the GPU version) into H2O.

# In[1]:


## For comparison between 1 GPU and 1 CPU, we use only 1 CPU:
#numactl -C 0 -N 0 -m 0 jupyter notebook

## This will ensure that we only use the first CPU on multi-CPU systems


# ![1CPU](./1CPUonNUMA.png)

# In[2]:


## First time only: install xgboost and H2O, and restart the kernel afterwards
if False:
    ## Build XGBoost from source and install its Python module
    import os
    os.system("mkdir -p tmp && cd tmp && git clone https://github.com/h2oai/xgboost --recursive && cd xgboost && mkdir build && cd build && cmake .. -DPLUGIN_UPDATER_GPU=ON -DCUB_DIRECTORY=../cub -DCUDA_NVCC_FLAGS=\"--expt-extended-lambda -arch=sm_30\" && make -j; make; cd ../python-package && python3.6 setup.py install")

    ## Download and install H2O and its Python module
    os.system("cd tmp && wget http://h2o-release.s3.amazonaws.com/h2o/rel-vajda/1/h2o-3.10.5.1.zip && unzip h2o-3.10.5.1.zip")
    os.system("python3.6 -m pip install h2o-3.10.5.1/python/h2o-3.10.5.1-py2.py3-none-any.whl --upgrade")
    
    ## restart the kernel!


# In[3]:


get_ipython().run_line_magic('matplotlib', 'inline')
import xgboost as xgb
import pandas as pd
import numpy as np
import scipy as sp
import os
import time
from sklearn import metrics


# In[4]:


path = "/opt/higgs_head_2M.csv"
if not os.path.exists(path):
    os.system("cd /opt/ && wget https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/higgs_head_2M.csv")
num_class = 2
num_round = 100
learn_rate = 0.02
max_depth = 10

## Parse data into a Pandas Frame
df = pd.read_csv(path, header=None)


# In[5]:


df_target = df.iloc[:,0]
df.drop(df.iloc[:,0], axis=1, inplace=True)
cols = df.columns.values
df.shape


# In[6]:


train = df


# In[7]:


train_target = df_target


# In[8]:


print(train.shape)


# In[9]:


get_ipython().system('lscpu')


# In[10]:


get_ipython().system('cat /proc/meminfo | grep MemTotal')


# In[11]:


get_ipython().system('nvidia-smi -L')


# In[12]:


def runXGBoost(param):
    have_updater = "updater" in param.keys()
    label = "XGBoost " \
        + ("GPU hist" if have_updater and param["updater"]=="grow_gpu_hist" else "GPU exact" if have_updater and param["updater"]=="grow_gpu" else "CPU") \
        + " " + (param["tree_method"] if "updater" not in param.keys() else "")
    print(label)
    print("=====================")
    for k, v in param.items():
        print(k, v)
    print("=====================")
    
    t_start = time.time()
    dtrain = xgb.DMatrix(train.values, label = train_target.values, feature_names=[str(c) for c in cols])
    tt = time.time() - t_start
    print("Time to create DMatrix (sec): ", tt)
    dmatrix_times.append(tt)
    
    t_start = time.time()
    bst = xgb.train(param, dtrain, num_round)
    tt = time.time() - t_start
    print("Time to train (sec): ", tt)
    train_times.append(tt)

    t_start = time.time()
    preds = bst.predict(dtrain)
    tt = time.time() - t_start
    print("Time to predict (sec): ", tt)
    score_times.append(tt)

    labels = dtrain.get_label()
    auc = metrics.roc_auc_score(labels, preds)
    print("Training AUC:", auc)
    valid_aucs.append(auc)
    plot_labels.append(label)
    
    fs = bst.get_fscore()
    
    # Optional: Uncomment to show variable importance
    #varimp = pd.DataFrame({'Importance': list(fs.values()), 'Feature': list(fs.keys())})
    #varimp.sort_values(by = 'Importance', inplace = True, ascending = False)
    #varimp.head(10).plot(label='importance',kind="barh",x="Feature",y="Importance").invert_yaxis()


# In[13]:


valid_aucs = []
dmatrix_times = []
train_times = []
score_times = []
plot_labels = []


# In[14]:


param = {
    "objective":('reg:logistic' if num_class>1 else 'reg:linear')
    , "max_depth":max_depth
    , "eta":learn_rate
    , "tree_method":"exact"
    , "subsample":0.7
    , "colsample_bytree":0.9
    , "min_child_weight":5
    , "seed":12345
}
runXGBoost(param)


# In[15]:


param = {
    "objective":('reg:logistic' if num_class>1 else 'reg:linear')
    , "max_depth":max_depth
    , "eta":learn_rate
    , "tree_method":"approx"
    , "subsample":0.7
    , "colsample_bytree":0.9
    , "min_child_weight":5
    , "seed":12345
}
runXGBoost(param)


# In[16]:


param = {
    "objective":('reg:logistic' if num_class>1 else 'reg:linear')
    , "max_depth":max_depth
    , "eta":learn_rate
    , "tree_method":"hist"
    , "subsample":0.7
    , "colsample_bytree":0.9
    , "min_child_weight":5
    , "seed":12345
}
runXGBoost(param)


# In[17]:


param = {
    "objective":('reg:logistic' if num_class>1 else 'reg:linear')
    , "max_depth":max_depth
    , "eta":learn_rate
    , "tree_method":"exact"
    , "updater":"grow_gpu"
    , "subsample":0.7
    , "colsample_bytree":0.9
    , "min_child_weight":5
    , "seed":12345
}
runXGBoost(param)


# In[18]:


param = {
    "objective":('reg:logistic' if num_class>1 else 'reg:linear')
    , "max_depth":max_depth
    , "eta":learn_rate
    , "tree_method":"exact"
    , "updater":"grow_gpu_hist"
    , "n_gpus":1
    , "subsample":0.7
    , "colsample_bytree":0.9
    , "min_child_weight":5
    , "seed":12345
}
runXGBoost(param)


# In[19]:


data = pd.DataFrame({'algorithm'  :plot_labels,
                     'dmatrix time':dmatrix_times,
                     'training time':train_times,
                     'scoring time':score_times,
                     'training AUC' :valid_aucs}).sort_values(by="training time")
data


# In[20]:


data.plot(label="training time",kind='barh',x='algorithm',y='training time')
data.plot(title="training AUC",kind='barh',x='algorithm',y='training AUC',legend=False)


# ## Now call XGBoost from H2O

# In[35]:


import h2o
from h2o.estimators import H2OXGBoostEstimator
h2o.init()

t_start = time.time()
df_hex = h2o.import_file(path)
print("Time to parse by H2O (sec): ", time.time() - t_start)

trainhex = df_hex
trainhex[0] = (trainhex[0]).asfactor()


# In[23]:


def runH2OXGBoost(param):
    label = "H2O XGBoost " \
    + ("GPU" if "backend" in param.keys() and "gpu"==param["backend"] else "CPU") \
    + (" " + param["tree_method"] if "tree_method" in param.keys() else "")
    print(label)
    print("=====================")
    for k, v in param.items():
        print(k, v)
    print("=====================")
        
    t_start = time.time()
    model = H2OXGBoostEstimator(**param)
    model.train(x = list(range(1,trainhex.shape[1])), y = 0, training_frame = trainhex)
    tt = time.time() - t_start
    print("Time to train (sec): ", tt)
    h2o_train_times.append(tt)

    t_start = time.time()
    preds = model.predict(trainhex)[:,2]
    tt = time.time() - t_start
    print("Time to predict (sec): ", tt)
    h2o_score_times.append(tt)

    preds = h2o.as_list(preds)
    labels = train_target.values
    auc = metrics.roc_auc_score(labels, preds)
    print("Training AUC:", auc)

    h2o_valid_aucs.append(auc)
    h2o_plot_labels.append(label)
    
    #pd.DataFrame(model.varimp(),columns=["Feature","","Importance",""]).head(10).plot(label='importance',kind="barh",x="Feature",y="Importance").invert_yaxis()


# In[24]:


h2o_valid_aucs = []
h2o_train_times = []
h2o_score_times = []
h2o_plot_labels = []


# In[25]:


param = {
    "ntrees":num_round
    , "max_depth":max_depth
    , "eta":learn_rate
    , "subsample":0.7
    , "colsample_bytree":0.9
    , "min_child_weight":5
    , "seed":12345
    , "score_tree_interval":num_round
    , "backend":"cpu"
    , "tree_method":"exact"
}
runH2OXGBoost(param)


# In[26]:


param = {
    "ntrees":num_round
    , "max_depth":max_depth
    , "eta":learn_rate
    , "subsample":0.7
    , "colsample_bytree":0.9
    , "min_child_weight":5
    , "seed":12345
    , "score_tree_interval":num_round
    , "backend":"cpu"
    , "tree_method":"approx"
}
runH2OXGBoost(param)


# In[27]:


param = {
    "ntrees":num_round
    , "max_depth":max_depth
    , "eta":learn_rate
    , "subsample":0.7
    , "colsample_bytree":0.9
    , "min_child_weight":5
    , "seed":12345
    , "score_tree_interval":num_round
    , "backend":"cpu"
    , "tree_method":"hist"
}
runH2OXGBoost(param)


# In[28]:


param = {
    "ntrees":num_round
    , "max_depth":max_depth
    , "learn_rate":learn_rate
    , "sample_rate":0.7
    , "col_sample_rate_per_tree":0.9
    , "min_rows":5
    , "seed":12345
    , "score_tree_interval":num_round
    , "backend":"gpu"
    , "tree_method":"exact"
}
runH2OXGBoost(param)


# In[29]:


param = {
    "ntrees":num_round
    , "max_depth":max_depth
    , "learn_rate":learn_rate
    , "sample_rate":0.7
    , "col_sample_rate_per_tree":0.9
    , "min_rows":5
    , "seed":12345
    , "score_tree_interval":num_round
    , "backend":"gpu"
    , "tree_method":"hist"
}
runH2OXGBoost(param)


# ## H2O GBM (CPU)

# In[30]:


from h2o.estimators.gbm import H2OGradientBoostingEstimator
param = {
      "ntrees":num_round
    , "max_depth":max_depth
    , "learn_rate":learn_rate
    , "sample_rate":0.7
    , "col_sample_rate_per_tree":0.9
    , "min_rows":5
    , "seed":12345
    , "score_tree_interval":num_round
}

t_start = time.time()
model = H2OGradientBoostingEstimator(**param)
model.train(x = list(range(1,trainhex.shape[1])), y = 0, training_frame = trainhex)
tt = time.time() - t_start
print("Time to train (sec): ", tt)
h2o_train_times.append(tt)

t_start = time.time()
preds = model.predict(trainhex)[:,2]
tt = time.time() - t_start
print("Time to predict (sec): ", tt)
h2o_score_times.append(tt)

preds = h2o.as_list(preds)
labels = train_target.values
auc = metrics.roc_auc_score(labels, preds)
print("AUC:", auc)

h2o_valid_aucs.append(auc)
h2o_plot_labels.append("H2O GBM CPU")


# In[31]:


data = pd.DataFrame({'algorithm'  :h2o_plot_labels,
                     'training time':h2o_train_times,
                     'scoring time':h2o_score_times,
                     'training AUC' :h2o_valid_aucs}).sort_values(by="training time")
data


# In[32]:


data.plot(label="DMatrix + training time",kind='barh',x='algorithm',y='training time')
data.plot(title="training AUC",kind='barh',x='algorithm',y='training AUC',legend=False)


# ### Summary: Fastest GPU algorithm (XGBoost histogram) takes 5s, fastest CPU algorithm (H2O) takes 50s
# 
# ##### Note: H2O's XGBoost integration has some internal overhead still (DMatrix creation is single-threaded, and some parameters have different default values, hence the slightly slower training speed and slightly higher training accuracy) - this doesn't affect the summary conclusion