#!/usr/bin/env python
# coding: utf-8

# # H2O.ai XGBoost GPU Benchmarks

# In this notebook, we benchmark the latest version of [XGBoost](https://github.com/h2oai/xgboost), the well-known Kaggle-winning gradient boosting algorithm, and in particular the [XGBoost GPU plugin](https://github.com/h2oai/xgboost/blob/master/plugin/updater_gpu/README.md). We also showcase the integration of XGBoost (including the GPU version) into H2O.

# In[1]:

## For a fair comparison between 1 GPU and 1 CPU, we use only 1 CPU:
#numactl -C 0 -N 0 -m 0 jupyter notebook
## This ensures that we only use the first CPU on multi-CPU systems

# ![1CPU](./1CPUonNUMA.png)

# In[2]:

## First time only: install XGBoost and H2O, and restart the kernel afterwards
if False:
    ## Build XGBoost from source and install its Python module
    import os
    os.system("mkdir -p tmp && cd tmp && git clone https://github.com/h2oai/xgboost --recursive && cd xgboost && mkdir build && cd build && cmake .. -DPLUGIN_UPDATER_GPU=ON -DCUB_DIRECTORY=../cub -DCUDA_NVCC_FLAGS=\"--expt-extended-lambda -arch=sm_30\" && make -j; make; cd ../python-package && python3.6 setup.py install")

    ## Download and install H2O and its Python module
    os.system("cd tmp && wget http://h2o-release.s3.amazonaws.com/h2o/rel-vajda/1/h2o-3.10.5.1.zip && unzip h2o-3.10.5.1.zip")
    os.system("python3.6 -m pip install h2o-3.10.5.1/python/h2o-3.10.5.1-py2.py3-none-any.whl --upgrade")
    ## restart the kernel!

# In[3]:

get_ipython().run_line_magic('matplotlib', 'inline')
import xgboost as xgb
import pandas as pd
import numpy as np
import scipy as sp
import os
import time
from sklearn import metrics

# In[4]:

path = "/opt/higgs_head_2M.csv"
if not os.path.exists(path):
    os.system("cd /opt/ && wget https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/higgs_head_2M.csv")

num_class = 2
num_round = 100
learn_rate = 0.02
max_depth = 10

## Parse the data into a pandas DataFrame
df = pd.read_csv(path, header=None)

# In[5]:

df_target = df.iloc[:,0]
df.drop(df.columns[0], axis=1, inplace=True)  ## drop the target (first) column from the features
cols = df.columns.values
df.shape

# In[6]:

train = df

# In[7]:

train_target = df_target

# In[8]:

print(train.shape)

# In[9]:

get_ipython().system('lscpu')

# In[10]:

get_ipython().system('cat /proc/meminfo | grep MemTotal')

# In[11]:

get_ipython().system('nvidia-smi -L')

# In[12]:

def runXGBoost(param):
    have_updater = "updater" in param.keys()
    label = "XGBoost " \
        + ("GPU hist" if have_updater and param["updater"]=="grow_gpu_hist" else "GPU exact" if have_updater and param["updater"]=="grow_gpu" else "CPU") \
        + " " + (param["tree_method"] if "updater" not in param.keys() else "")
    print(label)
    print("=====================")
    for k, v in param.items():
        print(k, v)
    print("=====================")

    t_start = time.time()
    dtrain = xgb.DMatrix(train.values, label=train_target.values, feature_names=[str(c) for c in cols])
    tt = time.time() - t_start
    print("Time to create DMatrix (sec): ", tt)
    dmatrix_times.append(tt)

    t_start = time.time()
    bst = xgb.train(param, dtrain, num_round)
    tt = time.time() - t_start
    print("Time to train (sec): ", tt)
    train_times.append(tt)

    t_start = time.time()
    preds = bst.predict(dtrain)
    tt = time.time() - t_start
    print("Time to predict (sec): ", tt)
    score_times.append(tt)

    labels = dtrain.get_label()
    auc = metrics.roc_auc_score(labels, preds)
    print("Training AUC:", auc)
    valid_aucs.append(auc)
    plot_labels.append(label)

    fs = bst.get_fscore()
    # Optional: Uncomment to show variable importance
    #varimp = pd.DataFrame({'Importance': list(fs.values()), 'Feature': list(fs.keys())})
    #varimp.sort_values(by='Importance', inplace=True, ascending=False)
    #varimp.head(10).plot(label='importance',kind="barh",x="Feature",y="Importance").invert_yaxis()
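
# In[ ]:

## Optional sanity check (not part of the original benchmark): probe whether this XGBoost build
## actually exposes the GPU updaters before running the GPU configurations below. The updater
## name "grow_gpu_hist" matches the h2oai fork built above; adjust it if your build differs.
def gpu_updater_available():
    try:
        tiny = xgb.DMatrix(np.random.rand(16, 4), label=np.random.randint(0, 2, 16))
        xgb.train({"updater": "grow_gpu_hist", "max_depth": 2}, tiny, 1)  # train 1 round on a throw-away DMatrix
        return True
    except xgb.core.XGBoostError:
        return False

print("GPU hist updater available:", gpu_updater_available())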

# In[13]:

valid_aucs = []
dmatrix_times = []
train_times = []
score_times = []
plot_labels = []

# In[14]:

param = {
    "objective":('reg:logistic' if num_class>1 else 'reg:linear')
    , "max_depth":max_depth
    , "eta":learn_rate
    , "tree_method":"exact"
    , "subsample":0.7
    , "colsample_bytree":0.9
    , "min_child_weight":5
    , "seed":12345
}
runXGBoost(param)

# In[15]:

param = {
    "objective":('reg:logistic' if num_class>1 else 'reg:linear')
    , "max_depth":max_depth
    , "eta":learn_rate
    , "tree_method":"approx"
    , "subsample":0.7
    , "colsample_bytree":0.9
    , "min_child_weight":5
    , "seed":12345
}
runXGBoost(param)

# In[16]:

param = {
    "objective":('reg:logistic' if num_class>1 else 'reg:linear')
    , "max_depth":max_depth
    , "eta":learn_rate
    , "tree_method":"hist"
    , "subsample":0.7
    , "colsample_bytree":0.9
    , "min_child_weight":5
    , "seed":12345
}
runXGBoost(param)

# In[17]:

param = {
    "objective":('reg:logistic' if num_class>1 else 'reg:linear')
    , "max_depth":max_depth
    , "eta":learn_rate
    , "tree_method":"exact"
    , "updater":"grow_gpu"
    , "subsample":0.7
    , "colsample_bytree":0.9
    , "min_child_weight":5
    , "seed":12345
}
runXGBoost(param)

# In[18]:

param = {
    "objective":('reg:logistic' if num_class>1 else 'reg:linear')
    , "max_depth":max_depth
    , "eta":learn_rate
    , "tree_method":"exact"
    , "updater":"grow_gpu_hist"
    , "n_gpus":1
    , "subsample":0.7
    , "colsample_bytree":0.9
    , "min_child_weight":5
    , "seed":12345
}
runXGBoost(param)

# In[19]:

data = pd.DataFrame({'algorithm'    :plot_labels,
                     'dmatrix time' :dmatrix_times,
                     'training time':train_times,
                     'scoring time' :score_times,
                     'training AUC' :valid_aucs}).sort_values(by="training time")
data

# In[20]:

data.plot(label="training time",kind='barh',x='algorithm',y='training time')
data.plot(title="training AUC",kind='barh',x='algorithm',y='training AUC',legend=False)

# ## Now call XGBoost from H2O

# In[35]:

import h2o
from h2o.estimators import H2OXGBoostEstimator
h2o.init()

t_start = time.time()
df_hex = h2o.import_file(path)
print("Time to parse by H2O (sec): ", time.time() - t_start)

trainhex = df_hex
trainhex[0] = (trainhex[0]).asfactor()

# In[23]:

def runH2OXGBoost(param):
    label = "H2O XGBoost " \
        + ("GPU" if "backend" in param.keys() and "gpu"==param["backend"] else "CPU") \
        + (" " + param["tree_method"] if "tree_method" in param.keys() else "")
    print(label)
    print("=====================")
    for k, v in param.items():
        print(k, v)
    print("=====================")

    t_start = time.time()
    model = H2OXGBoostEstimator(**param)
    model.train(x=list(range(1, trainhex.shape[1])), y=0, training_frame=trainhex)
    tt = time.time() - t_start
    print("Time to train (sec): ", tt)
    h2o_train_times.append(tt)

    t_start = time.time()
    preds = model.predict(trainhex)[:,2]
    tt = time.time() - t_start
    print("Time to predict (sec): ", tt)
    h2o_score_times.append(tt)

    preds = h2o.as_list(preds)
    labels = train_target.values
    auc = metrics.roc_auc_score(labels, preds)
    print("Training AUC:", auc)
    h2o_valid_aucs.append(auc)
    h2o_plot_labels.append(label)

    #pd.DataFrame(model.varimp(),columns=["Feature","","Importance",""]).head(10).plot(label='importance',kind="barh",x="Feature",y="Importance").invert_yaxis()

# In[24]:

h2o_valid_aucs = []
h2o_train_times = []
h2o_score_times = []
h2o_plot_labels = []
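
# In[ ]:

## Optional (not part of the original notebook): the five H2O XGBoost runs below differ essentially
## only in "backend" and "tree_method" (the GPU runs use H2O's native aliases learn_rate, sample_rate,
## col_sample_rate_per_tree and min_rows for the same values), so they could also be driven from one
## base dict, e.g.:
#h2o_xgb_base = {
#    "ntrees":num_round, "max_depth":max_depth, "eta":learn_rate,
#    "subsample":0.7, "colsample_bytree":0.9, "min_child_weight":5,
#    "seed":12345, "score_tree_interval":num_round
#}
#for backend, tree_method in [("cpu","exact"), ("cpu","approx"), ("cpu","hist"),
#                             ("gpu","exact"), ("gpu","hist")]:
#    runH2OXGBoost(dict(h2o_xgb_base, backend=backend, tree_method=tree_method))
## The explicit per-run parameter dicts are kept below, as in the original benchmark.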
"score_tree_interval":num_round , "backend":"cpu" , "tree_method":"exact" } runH2OXGBoost(param) # In[26]: param = { "ntrees":num_round , "max_depth":max_depth , "eta":learn_rate , "subsample":0.7 , "colsample_bytree":0.9 , "min_child_weight":5 , "seed":12345 , "score_tree_interval":num_round , "backend":"cpu" , "tree_method":"approx" } runH2OXGBoost(param) # In[27]: param = { "ntrees":num_round , "max_depth":max_depth , "eta":learn_rate , "subsample":0.7 , "colsample_bytree":0.9 , "min_child_weight":5 , "seed":12345 , "score_tree_interval":num_round , "backend":"cpu" , "tree_method":"hist" } runH2OXGBoost(param) # In[28]: param = { "ntrees":num_round , "max_depth":max_depth , "learn_rate":learn_rate , "sample_rate":0.7 , "col_sample_rate_per_tree":0.9 , "min_rows":5 , "seed":12345 , "score_tree_interval":num_round , "backend":"gpu" , "tree_method":"exact" } runH2OXGBoost(param) # In[29]: param = { "ntrees":num_round , "max_depth":max_depth , "learn_rate":learn_rate , "sample_rate":0.7 , "col_sample_rate_per_tree":0.9 , "min_rows":5 , "seed":12345 , "score_tree_interval":num_round , "backend":"gpu" , "tree_method":"hist" } runH2OXGBoost(param) # ## H2O GBM (CPU) # In[30]: from h2o.estimators.gbm import H2OGradientBoostingEstimator param = { "ntrees":num_round , "max_depth":max_depth , "learn_rate":learn_rate , "sample_rate":0.7 , "col_sample_rate_per_tree":0.9 , "min_rows":5 , "seed":12345 , "score_tree_interval":num_round } t_start = time.time() model = H2OGradientBoostingEstimator(**param) model.train(x = list(range(1,trainhex.shape[1])), y = 0, training_frame = trainhex) tt = time.time() - t_start print("Time to train (sec): ", tt) h2o_train_times.append(tt) t_start = time.time() preds = model.predict(trainhex)[:,2] tt = time.time() - t_start print("Time to predict (sec): ", tt) h2o_score_times.append(tt) preds = h2o.as_list(preds) labels = train_target.values auc = metrics.roc_auc_score(labels, preds) print("AUC:", auc) h2o_valid_aucs.append(auc) h2o_plot_labels.append("H2O GBM CPU") # In[31]: data = pd.DataFrame({'algorithm' :h2o_plot_labels, 'training time':h2o_train_times, 'scoring time':h2o_score_times, 'training AUC' :h2o_valid_aucs}).sort_values(by="training time") data # In[32]: data.plot(label="DMatrix + training time",kind='barh',x='algorithm',y='training time') data.plot(title="training AUC",kind='barh',x='algorithm',y='training AUC',legend=False) # ### Summary: Fastest GPU algorithm (XGBoost histogram) takes 5s, fastest CPU algorithm (H2O) takes 50s # # ##### Note: H2O's XGBoost integration has some internal overhead still (DMatrix creation is single-threaded, and some parameters have different default values, hence the slightly slower training speed and slightly higher training accuracy) - this doesn't affect the summary conclusion