Homepage: https://spkit.github.io
Nikesh Bajaj : http://nikeshbajaj.in
Note: This notebook covers the use of (1) Classification Trees and (2) Regression Trees from the spkit library, with different verbosity modes while training and plotting of the resulting decision tree after training. We use the Iris and Breast Cancer datasets for classification, and Boston housing prices for regression.
import numpy as np
import matplotlib.pyplot as plt
import spkit
# check version; this notebook was run with spkit version 0.0.9.1
spkit.__version__
np.random.seed(11) # fix the seed to make sure results are reproducible
# import ClassificationTree and RegressionTree from spkit
from spkit.ml import ClassificationTree, RegressionTree
# import datasets and train_test_split from sklearn (or use your own dataset)
from sklearn import datasets
from sklearn.model_selection import train_test_split
Loading the Iris dataset and splitting it into training and testing sets
data = datasets.load_iris()
X = data.data
y = data.target
feature_names = data.feature_names #Optional
Xt,Xs, yt, ys = train_test_split(X,y,test_size=0.3)
print(X.shape,y.shape, Xt.shape, yt.shape, Xs.shape, ys.shape)
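Since no random_state is passed to train_test_split, the split depends on the global NumPy seed set above. As a hedged alternative (random_state and stratify are standard sklearn arguments, not used in the original run), an explicitly reproducible, class-balanced split would look like this:
# optional: explicit seed and class-stratified split
# Xt,Xs, yt, ys = train_test_split(X, y, test_size=0.3, random_state=11, stratify=y)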
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=0,feature_names=feature_names)
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=1,feature_names=feature_names)
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=2,feature_names=feature_names)
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=3,feature_names=feature_names)
%matplotlib notebook
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=4,feature_names=feature_names)
%matplotlib inline
plt.figure(figsize=(10,6))
clf.plotTree(show=True,scale=False)
plt.figure(figsize=(8,6))
clf.plotTree(DiffBranchColor=False)
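The rendered tree can be saved like any other matplotlib figure. A minimal sketch, assuming show=False suppresses the immediate display (only show=True appears elsewhere in this notebook):
plt.figure(figsize=(10,6))
clf.plotTree(show=False, DiffBranchColor=True)  # show=False assumed to skip plt.show()
plt.savefig('iris_tree.png', dpi=150, bbox_inches='tight')  # standard matplotlib saving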
ytp = clf.predict(Xt)
ysp = clf.predict(Xs)
ytpr = clf.predict_proba(Xt)[:,1]
yspr = clf.predict_proba(Xs)[:,1]
print('Depth of trained Tree ', clf.getTreeDepth())
print('Accuracy')
print('- Training : ',np.mean(ytp==yt))
print('- Testing : ',np.mean(ysp==ys))
print('Logloss')
Trloss = -np.mean(yt*np.log(ytpr+1e-10)+(1-yt)*np.log(1-ytpr+1e-10))
Tsloss = -np.mean(ys*np.log(yspr+1e-10)+(1-ys)*np.log(1-yspr+1e-10))
print('- Training : ',Trloss)
print('- Testing : ',Tsloss)
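Note that Iris is a 3-class problem, so the binary log-loss above, which uses only the class-1 probability, is at best a rough diagnostic. A hedged alternative using sklearn's log_loss, assuming predict_proba returns one column per class:
from sklearn.metrics import log_loss
# multiclass log-loss over all class probabilities (assumes shape (n_samples, n_classes))
print('- Training : ', log_loss(yt, clf.predict_proba(Xt)))
print('- Testing  : ', log_loss(ys, clf.predict_proba(Xs)))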
clf = ClassificationTree(max_depth=3)
clf.fit(Xt,yt,verbose=1,feature_names=feature_names)
plt.figure(figsize=(5,5))
clf.plotTree(show=True,DiffBranchColor=True)
ytp = clf.predict(Xt)
ysp = clf.predict(Xs)
ytpr = clf.predict_proba(Xt)[:,1]
yspr = clf.predict_proba(Xs)[:,1]
print('Depth of trained Tree ', clf.getTreeDepth())
print('Accuracy')
print('- Training : ',np.mean(ytp==yt))
print('- Testing : ',np.mean(ysp==ys))
print('Logloss')
Trloss = -np.mean(yt*np.log(ytpr+1e-10)+(1-yt)*np.log(1-ytpr+1e-10))
Tsloss = -np.mean(ys*np.log(yspr+1e-10)+(1-ys)*np.log(1-yspr+1e-10))
print('- Training : ',Trloss)
print('- Testing : ',Tsloss)
data = datasets.load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names #Optional
Xt,Xs, yt, ys = train_test_split(X,y,test_size=0.3)
print(X.shape,y.shape, Xt.shape, yt.shape, Xs.shape, ys.shape)
While building the tree, to expand the True branch first and then the False branch, set randomBranch=False
%matplotlib notebook
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=4,feature_names=feature_names,randomBranch=False)
plt.close(clf.fig)
To randomly select the True or False branch at each split, set randomBranch=True
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=4,feature_names=feature_names,randomBranch=True)
plt.close(clf.fig)
%matplotlib inline
plt.figure(figsize=(10,6))
clf.plotTree(show=True,DiffBranchColor=True,scale=False)
plt.close(clf.fig)
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=1,feature_names=feature_names)
plt.figure(figsize=(6,6))
clf.plotTree()
ytp = clf.predict(Xt)
ysp = clf.predict(Xs)
ytpr = clf.predict_proba(Xt)[:,1]
yspr = clf.predict_proba(Xs)[:,1]
print('Depth of trained Tree ', clf.getTreeDepth())
print('Accuracy')
print('- Training : ',np.mean(ytp==yt))
print('- Testing : ',np.mean(ysp==ys))
print('Logloss')
Trloss = -np.mean(yt*np.log(ytpr+1e-10)+(1-yt)*np.log(1-ytpr+1e-10))
Tsloss = -np.mean(ys*np.log(yspr+1e-10)+(1-ys)*np.log(1-yspr+1e-10))
print('- Training : ',Trloss)
print('- Testing : ',Tsloss)
The model is overfitting; try smaller trees by decreasing the max_depth of the classifier, as sketched below.
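One way to choose a smaller tree is to sweep max_depth and compare training and testing accuracy; a minimal sketch (the loop itself is illustrative, not part of spkit):
# compare train/test accuracy for a range of tree depths
for d in range(1, 8):
    clf_d = ClassificationTree(max_depth=d)
    clf_d.fit(Xt, yt, verbose=0, feature_names=feature_names)
    acc_tr = np.mean(clf_d.predict(Xt) == yt)
    acc_ts = np.mean(clf_d.predict(Xs) == ys)
    print(f'depth={d}  train acc={acc_tr:.3f}  test acc={acc_ts:.3f}')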
data = datasets.load_boston()  # note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2; see the alternative below
X = data.data
y = data.target
feature_names = data.feature_names #Optional
Xt,Xs, yt, ys = train_test_split(X,y,test_size=0.3)
print(X.shape,y.shape, Xt.shape, yt.shape, Xs.shape, ys.shape)
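If you are on scikit-learn 1.2 or newer, load_boston is no longer available. A hedged substitute is the California housing dataset (a different regression dataset, so the numbers below will differ):
# alternative for scikit-learn >= 1.2, where load_boston has been removed
# from sklearn.datasets import fetch_california_housing
# data = fetch_california_housing()
# X, y = data.data, data.target
# feature_names = data.feature_names
# Xt,Xs, yt, ys = train_test_split(X, y, test_size=0.3)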
rgr = RegressionTree()
rgr.fit(Xt,yt,verbose=1,feature_names=feature_names)
%matplotlib inline
plt.style.use('default')
plt.figure(figsize=(15,15))
rgr.plotTree(show=True, scale=True, showtitle=False, showDirection=False)
ytp = rgr.predict(Xt)
ysp = rgr.predict(Xs)
print('Training MSE: ',np.mean((ytp-yt)**2))
print('Testing MSE: ',np.mean((ysp-ys)**2))
rgr = RegressionTree(max_depth=4)
rgr.fit(Xt,yt,verbose=1,feature_names=feature_names)
%matplotlib inline
plt.style.use('default')
plt.figure(figsize=(6,5))
rgr.plotTree(show=True, scale=True, showtitle=True, showDirection=False, DiffBranchColor=True)
ytp = rgr.predict(Xt)
ysp = rgr.predict(Xs)
print('Training MSE: ',np.mean((ytp-yt)**2))
print('Testing MSE: ',np.mean((ysp-ys)**2))
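As with the classifier, max_depth can be swept to see where the test MSE bottoms out; a minimal sketch (illustrative loop, not spkit API; verbose=0 keeps training quiet):
# compare train/test MSE across depths to pick a good max_depth
for d in range(2, 8):
    rgr_d = RegressionTree(max_depth=d)
    rgr_d.fit(Xt, yt, verbose=0, feature_names=feature_names)
    mse_tr = np.mean((rgr_d.predict(Xt) - yt)**2)
    mse_ts = np.mean((rgr_d.predict(Xs) - ys)**2)
    print(f'depth={d}  train MSE={mse_tr:.2f}  test MSE={mse_ts:.2f}')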