The class is implemented using only numpy, and visualizations are shown using matplotlib. The code for the decision tree class is given in the trees.py file.
import numpy as np
import matplotlib.pyplot as plt
from trees import ClassificationTree, RegressionTree
from sklearn import datasets
from sklearn.model_selection import train_test_split
Loading the data and splitting it into training and testing sets
# Load the iris dataset: 150 samples, 4 features, 3 classes.
data = datasets.load_iris()
X = data.data
y = data.target
# Feature names are optional; they only label the nodes in verbose/plot output.
feature_names = data.feature_names #Optional
# 70/30 train/test split; no random_state is set, so the split (and the
# accuracies below) differ on every run.
Xt,Xs, yt, ys = train_test_split(X,y,test_size=0.3)
print(X.shape,y.shape, Xt.shape, yt.shape, Xs.shape, ys.shape)
(150, 4) (150,) (105, 4) (105,) (45, 4) (45,)
# verbose=0: build the tree silently (only the header banner is printed).
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=0,feature_names=feature_names)
Number of features:: 4 Number of samples :: 105 --------------------------------------- |Building the tree..................... | |.........................tree is buit! ---------------------------------------
# verbose=1: show a textual progress bar while the subtrees are built.
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=1,feature_names=feature_names)
Number of features:: 4 Number of samples :: 105 --------------------------------------- |Building the tree..................... |subtrees::|100%|-------------------->|| |.........................tree is buit! ---------------------------------------
# verbose=2: print every split as it is chosen — feature, information gain,
# threshold and depth — plus each leaf value (see the transcript below).
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=2,feature_names=feature_names)
Number of features:: 4 Number of samples :: 105 --------------------------------------- |Building the tree..................... |-Feature::3_petal length (cm) Gain::0.96 thr::3.3_Depth = 0 |->True branch (>>>).. |--Feature::3_petal length (cm) Gain::0.83 thr::4.8_Depth = 1 |-->True branch (>>>).. |---Feature::3_petal length (cm) Gain::0.18 thr::5.0_Depth = 2 |--->True branch (>>>).. |---->{Leaf Node:: value: 2 }_Depth =3 | |--->False branch (<<<).. |---Feature::4_petal width (cm) Gain::0.97 thr::1.8_Depth = 3 |--->False branch (<<<).. |--->{Leaf Node:: value: 1 }_Depth =4 | |--->True branch (>>>).. |---->{Leaf Node:: value: 2 }_Depth =4 | |-->False branch (<<<).. |-->{Leaf Node:: value: 1 }_Depth =2 | |->False branch (<<<).. |->{Leaf Node:: value: 0 }_Depth =1 | |.........................tree is buit! ---------------------------------------
# verbose=3: print a compact True/False branch trace (T/F path per node)
# with the depth of each node, instead of the full split details.
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=3,feature_names=feature_names)
Number of features:: 4 Number of samples :: 105 --------------------------------------- |Building the tree..................... None 0 | True 1 | T True 2 | TT True 3 | TTT False 3 | TTF True 4 | TTFT False 4 | TTFF False 2 | TF False 1 | F | |.........................tree is buit! ---------------------------------------
# The interactive notebook backend is needed for verbose=4, which appears to
# draw the tree live while it is built (fit creates clf.fig) — TODO confirm
# in trees.py. randomBranch=True randomly picks which branch (True/False)
# to expand first.
%matplotlib notebook
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=4,feature_names=feature_names, randomBranch=True)
# Close the figure created during the animated fit.
plt.close(clf.fig)
Number of features:: 4 Number of samples :: 105 --------------------------------------- |Building the tree..................... | |.........................tree is buit! ---------------------------------------
# Plot the learned tree twice: once with differently colored True/False
# branches and unscaled layout, once with the default options.
# (Exact semantics of scale/DiffBranchColor live in trees.py — see plotTree.)
plt.figure(figsize=(8,6))
clf.plotTree(show=True,DiffBranchColor=True,scale=False)
plt.figure(figsize=(8,6))
clf.plotTree(show=True)
# Evaluate on the train and test splits; accuracy = fraction of exact matches.
ytp = clf.predict(Xt)
ysp = clf.predict(Xs)
print('Training Accuracy: ',np.mean(ytp==yt))
print('Testing Accuracy: ',np.mean(ysp==ys))
Training Accuracy: 1.0 Testing Accuracy: 0.8888888888888888
# Restrict the tree to max_depth=2: a smaller tree trades a little training
# accuracy for less overfitting (compare with the full-depth tree above).
clf = ClassificationTree(max_depth=2)
clf.fit(Xt,yt,verbose=1,feature_names=feature_names)
#plt.figure(figsize=(15,8))
plt.figure(figsize=(5,5))
clf.plotTree(show=True,DiffBranchColor=True)
ytp = clf.predict(Xt)
ysp = clf.predict(Xs)
print('Training Accuracy: ',np.mean(ytp==yt))
print('Testing Accuracy: ',np.mean(ysp==ys))
Number of features:: 4 Number of samples :: 105 --------------------------------------- |Building the tree..................... |subtrees::|100%|-------------------->|| |.........................tree is buit! ---------------------------------------
Training Accuracy: 0.9809523809523809 Testing Accuracy: 0.8666666666666667
# Load the breast-cancer dataset: 569 samples, 30 features, binary target.
data = datasets.load_breast_cancer()
X = data.data
y = data.target
# Feature names are optional; they only label nodes in verbose/plot output.
feature_names = data.feature_names #Optional
# 70/30 split, again without a fixed random_state.
Xt,Xs, yt, ys = train_test_split(X,y,test_size=0.3)
print(X.shape,y.shape, Xt.shape, yt.shape, Xs.shape, ys.shape)
(569, 30) (569,) (398, 30) (398,) (171, 30) (171,)
While building the tree, to always expand the True branch first and then the False branch, set randomBranch=False.
# verbose=4 with randomBranch=False: per the note above, the True branch is
# always expanded before the False branch during the (live-plotted) build.
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=4,feature_names=feature_names,randomBranch=False)
# Figure intentionally left open here (close was commented out).
#plt.close(clf.fig)
Number of features:: 30 Number of samples :: 398 --------------------------------------- |Building the tree..................... | |.........................tree is buit! ---------------------------------------
To randomly select the True or False branch, set randomBranch=True.
# Same build but with randomBranch=True: the branch to expand first is
# chosen at random at each node; close the animation figure afterwards.
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=4,feature_names=feature_names,randomBranch=True)
plt.close(clf.fig)
Number of features:: 30 Number of samples :: 398 --------------------------------------- |Building the tree..................... | |.........................tree is buit! ---------------------------------------
# Plot the tree from the previous fit with colored branches and no scaling.
plt.figure(figsize=(6,6))
clf.plotTree(show=True,DiffBranchColor=True,scale=False)
# Refit a fresh full-depth tree with the progress-bar verbosity and plot it
# with default options.
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=1,feature_names=feature_names)
plt.figure(figsize=(6,6))
clf.plotTree(show=True)
Number of features:: 30 Number of samples :: 398 --------------------------------------- |Building the tree..................... |subtrees::|100%|-------------------->|- |.........................tree is buit! ---------------------------------------
# Plot the fitted tree and report train/test accuracy.
plt.figure(figsize=(6,6))
clf.plotTree(show=True)
ytp = clf.predict(Xt)
ysp = clf.predict(Xs)
# 1.0 training accuracy with lower test accuracy indicates overfitting
# (see the note below).
print('Training Accuracy: ',np.mean(ytp==yt))
print('Testing Accuracy: ',np.mean(ysp==ys))
Training Accuracy: 1.0 Testing Accuracy: 0.9473684210526315
It's overfitting; try smaller trees by decreasing the max_depth of the classifier.
# Load a regression dataset for the RegressionTree demo.
# NOTE: datasets.load_boston was deprecated in scikit-learn 1.0 and removed
# in 1.2, so accessing it raises AttributeError on modern versions. Fall
# back to the California housing data (20640 samples, 8 features) there;
# the rest of the demo is dataset-agnostic.
if hasattr(datasets, "load_boston"):
    data = datasets.load_boston()
else:
    data = datasets.fetch_california_housing()
X = data.data
y = data.target
# Feature names are optional; they only label nodes in verbose/plot output.
feature_names = data.feature_names #Optional
# 70/30 train/test split; no random_state, so results vary between runs.
Xt,Xs, yt, ys = train_test_split(X,y,test_size=0.3)
print(X.shape,y.shape, Xt.shape, yt.shape, Xs.shape, ys.shape)
(506, 13) (506,) (354, 13) (354,) (152, 13) (152,)
# Fit a full-depth regression tree with the progress-bar verbosity.
rgr = RegressionTree()
rgr.fit(Xt,yt,verbose=1,feature_names = feature_names)
Number of features:: 13 Number of samples :: 354 --------------------------------------- |Building the tree..................... |subtrees::|100%|-------------------->|\ |.........................tree is buit! ---------------------------------------
# Switch back to the static inline backend for plain (non-animated) plots.
%matplotlib inline
plt.style.use('default')
plt.figure(figsize=(10,10))
rgr.plotTree(show=True,scale=False,DiffBranchColor=True)
# Larger, scaled rendering without title/direction labels
# (showtitle/showDirection semantics defined in trees.py — see plotTree).
plt.figure(figsize=(15,15))
rgr.plotTree(show=True,scale=True, showtitle =False, showDirection=False,DiffBranchColor=True)
# Interactive backend again for the verbose=4 (live-plotted) build,
# this time with the tree depth capped at 4.
%matplotlib notebook
rgr = RegressionTree(max_depth=4)
rgr.fit(Xt,yt,verbose=4,feature_names = feature_names)
Number of features:: 13 Number of samples :: 354 --------------------------------------- |Building the tree..................... | |.........................tree is buit! ---------------------------------------
# Plot the depth-4 tree and report mean squared error on both splits.
plt.figure(figsize=(10,6))
rgr.plotTree(show=True,scale=True, showtitle =False, showDirection=False,DiffBranchColor=True)
ytp = rgr.predict(Xt)
ysp = rgr.predict(Xs)
print('Training MSE: ',np.mean((ytp-yt)**2))
print('Testing MSE: ',np.mean((ysp-ys)**2))
Training MSE: 6.178914224439429 Testing MSE: 12.187349420981135
# Even smaller tree (max_depth=3) to compare train/test MSE trade-off.
rgr = RegressionTree(max_depth=3)
rgr.fit(Xt,yt,verbose=1,feature_names = feature_names)
Number of features:: 13 Number of samples :: 354 --------------------------------------- |Building the tree..................... |subtrees::|100%|-------------------->|\ |.........................tree is buit! ---------------------------------------
# Plot the depth-3 tree (with title this time) and report train/test MSE.
plt.figure(figsize=(8,6))
rgr.plotTree(show=True,scale=True, showtitle =True, showDirection=False,DiffBranchColor=True)
ytp = rgr.predict(Xt)
ysp = rgr.predict(Xs)
print('Training MSE: ',np.mean((ytp-yt)**2))
print('Testing MSE: ',np.mean((ysp-ys)**2))
Training MSE: 8.674185349676321 Testing MSE: 12.839154053729468