#!/usr/bin/env python
# coding: utf-8

# # Predicting Forest Cover Type using Extreme Gradient Boosting (xgboost)

# ## 1. Introduction
# The data was obtained from the UCI Machine Learning Repository, submitted by Jock A. Blackard. The dataset contains 581,012 records with 54 attributes. The dataset characteristics can be viewed on the [UCI Forest CoverType description page](https://archive.ics.uci.edu/ml/datasets/Covertype), but for convenience the summary statistics are replicated below.
#
# **Summary Statistics**
# * *Number of instances (observations):* 581,012
# * *Number of attributes:* 54
# * *Attribute breakdown:* 12 measures spread over 54 columns of data (10 quantitative variables, 4 binary wilderness-area variables and 40 binary soil-type variables)
# * *Missing attribute values:* None

# ## 2. Import relevant classes and functions

# In[1]:

# import classes and functions
import numpy
from pandas import read_csv
from xgboost import XGBClassifier  # xgboost
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

# In[2]:

# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

# ## 3. Load the Dataset

# In[3]:

# load dataset
dataframe = read_csv("covtype.data", header=None)
dataset = dataframe.values

# ## 4. Data Preparation
# * To improve the performance of our model we shuffle the dataset. Some learning algorithms have a better chance at improved accuracy when the order of the input data is randomized.
# * Due to computational constraints we train and evaluate our model on 50,000 of the 581,012 available records. Even with that reduction, XGBoost took over 18 hours to process the data.

# In[4]:

# reshuffle dataset
dataset = numpy.random.permutation(dataset)

# use reduced dataset
dataset = dataset[0:50000, :]

# split into input (X) and output (Y) variables
X = dataset[:, 0:54].astype(float)
Y = dataset[:, 54]

# In[5]:

# encode class values as integers
encoder = LabelEncoder()
encoder = encoder.fit(Y)
encoded_Y = encoder.transform(Y)

# ## 5. XGBoost Grid Search
# The following hyper-parameters are tuned to improve our model:
# * the number of decision trees (n_estimators),
# * the maximum depth of each decision tree (max_depth),
# * the learning rate (learning_rate).

# In[6]:

# XGBoost grid search
model = XGBClassifier(nthread=-1)
n_estimators = range(50, 400, 50)  # tune the number of trees
max_depth = range(1, 11, 2)  # tune the size of the decision trees
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]  # tune the learning rate
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="accuracy", n_jobs=-1, cv=kfold)
results = grid_search.fit(X, encoded_Y)

# In[7]:

print("Best: %f%% using %s" % (results.best_score_ * 100, results.best_params_))
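
# ## 6. Re-evaluating the best configuration (optional sketch)
# The cell below is a minimal follow-up sketch and was not part of the original run. It simply
# re-fits an XGBoost model with the best parameter combination reported above and scores it with a
# standalone 10-fold cross-validation, reusing the `cross_val_score` import from Section 2. The
# variable names `best_model` and `cv_scores` are illustrative.

# In[8]:

# rebuild a classifier from the best grid-search parameters (an assumed follow-up step)
best_model = XGBClassifier(nthread=-1, **results.best_params_)

# score it with the same stratified 10-fold splitter used during the grid search
cv_scores = cross_val_score(best_model, X, encoded_Y, cv=kfold, scoring="accuracy", n_jobs=-1)
print("Cross-validated accuracy: %.2f%% (+/- %.2f%%)" % (cv_scores.mean() * 100, cv_scores.std() * 100))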