The class is implemented using only NumPy, and visualizations are shown using matplotlib. The code for the decision tree class is given in the trees.py file.
import numpy as np
import matplotlib.pyplot as plt
from spkit.ml import ClassificationTree, RegressionTree
from sklearn import datasets
from sklearn.model_selection import train_test_split
# Load the iris dataset and hold out 30% of the samples for testing.
data = datasets.load_iris()
X, y = data.data, data.target
feature_names = data.feature_names  # optional: used to label tree splits
Xt, Xs, yt, ys = train_test_split(X, y, test_size=0.3)
print(*(a.shape for a in (X, y, Xt, yt, Xs, ys)))
(150, 4) (150,) (105, 4) (105,) (45, 4) (45,)
# Fit fresh trees at verbosity 0 (silent) and 1 (progress bar).
for level in (0, 1):
    clf = ClassificationTree()
    clf.fit(Xt, yt, verbose=level, feature_names=feature_names)
Number of features:: 4 Number of samples :: 105 --------------------------------------- |Building the tree..................... |subtrees::|100%|-------------------->|| |.........................tree is buit! ---------------------------------------
# verbose=2 prints the full split trace (feature, gain, threshold, depth).
clf = ClassificationTree()
clf.fit(Xt, yt, feature_names=feature_names, verbose=2)
Number of features:: 4 Number of samples :: 105 --------------------------------------- |Building the tree..................... |-Feature::3_petal length (cm) Gain::0.94 thr::3.0_Depth = 0 |->True branch (>>>).. |--Feature::3_petal length (cm) Gain::0.72 thr::4.8_Depth = 1 |-->True branch (>>>).. |---Feature::4_petal width (cm) Gain::0.16 thr::1.9_Depth = 2 |--->False branch (<<<).. |---Feature::3_petal length (cm) Gain::0.19 thr::5.5_Depth = 3 |--->False branch (<<<).. |---Feature::1_sepal length (cm) Gain::0.32 thr::6.7_Depth = 4 |--->True branch (>>>).. |---->{Leaf Node:: value: 1 }_Depth =5 | |--->False branch (<<<).. |---Feature::1_sepal length (cm) Gain::0.31 thr::6.1_Depth = 5 |--->True branch (>>>).. |---->{Leaf Node:: value: 2 }_Depth =6 | |--->False branch (<<<).. |---Feature::2_sepal width (cm) Gain::0.31 thr::3.0_Depth = 6 |--->False branch (<<<).. |--->{Leaf Node:: value: 1 }_Depth =7 | |--->True branch (>>>).. |----Feature::2_sepal width (cm) Gain::0.92 thr::3.2_Depth = 7 |---->True branch (>>>).. |----->{Leaf Node:: value: 1 }_Depth =8 | |---->False branch (<<<).. |---->{Leaf Node:: value: 2 }_Depth =8 | |--->True branch (>>>).. |---->{Leaf Node:: value: 2 }_Depth =4 | |--->True branch (>>>).. |---->{Leaf Node:: value: 2 }_Depth =3 | |-->False branch (<<<).. |-->{Leaf Node:: value: 1 }_Depth =2 | |->False branch (<<<).. |->{Leaf Node:: value: 0 }_Depth =1 | |.........................tree is buit! ---------------------------------------
# verbose=3 prints a compact True/False branch trace per depth.
clf = ClassificationTree()
clf.fit(Xt, yt, feature_names=feature_names, verbose=3)
Number of features:: 4 Number of samples :: 105 --------------------------------------- |Building the tree..................... None 0 | True 1 | T True 2 | TT True 3 | TTT False 3 | TTF True 4 | TTFT False 4 | TTFF True 5 | TTFFT False 5 | TTFFF True 6 | TTFFFT False 6 | TTFFFF True 7 | TTFFFFT True 8 | TTFFFFTT False 8 | TTFFFFTF False 7 | TTFFFFF False 2 | TF False 1 | F | |.........................tree is buit! ---------------------------------------
%matplotlib notebook
# verbose=4 animates the tree as it is built, here descending branches in
# random order; close the figure the fit created when done.
clf = ClassificationTree()
clf.fit(Xt, yt, randomBranch=True, feature_names=feature_names, verbose=4)
plt.close(clf.fig)
Number of features:: 4 Number of samples :: 105 --------------------------------------- |Building the tree..................... | |.........................tree is buit! ---------------------------------------
# Plot the learned tree twice (with and without per-branch colors),
# then report train/test accuracy.
plt.figure(figsize=(8, 6))
clf.plotTree(show=True, scale=False, DiffBranchColor=True)
plt.figure(figsize=(8, 6))
clf.plotTree(show=True)
ytp, ysp = clf.predict(Xt), clf.predict(Xs)
print('Training Accuracy: ', np.mean(ytp == yt))
print('Testing Accuracy: ', np.mean(ysp == ys))
Training Accuracy: 1.0 Testing Accuracy: 0.9333333333333333
# Constrain the tree to depth 2 to reduce overfitting, then compare
# train/test accuracy with the unconstrained tree.
clf = ClassificationTree(max_depth=2)
clf.fit(Xt, yt, feature_names=feature_names, verbose=1)
plt.figure(figsize=(5, 5))
clf.plotTree(show=True, DiffBranchColor=True)
ytp, ysp = clf.predict(Xt), clf.predict(Xs)
print('Training Accuracy: ', np.mean(ytp == yt))
print('Testing Accuracy: ', np.mean(ysp == ys))
Number of features:: 4 Number of samples :: 105 --------------------------------------- |Building the tree..................... |subtrees::|100%|-------------------->|| |.........................tree is buit! ---------------------------------------
Training Accuracy: 0.9619047619047619 Testing Accuracy: 0.9333333333333333
# Load the breast-cancer dataset and hold out 30% for testing.
data = datasets.load_breast_cancer()
X, y = data.data, data.target
feature_names = data.feature_names  # optional: used to label tree splits
Xt, Xs, yt, ys = train_test_split(X, y, test_size=0.3)
print(*(a.shape for a in (X, y, Xt, yt, Xs, ys)))
(569, 30) (569,) (398, 30) (398,) (171, 30) (171,)
While building the tree, set randomBranch=False to always descend the True branch first and then the False branch.
# Animated build (verbose=4), always taking the True branch first.
clf = ClassificationTree()
clf.fit(Xt, yt, randomBranch=False, feature_names=feature_names, verbose=4)
#plt.close(clf.fig)
Number of features:: 30 Number of samples :: 398 --------------------------------------- |Building the tree..................... | |.........................tree is buit! ---------------------------------------
To randomly select the True or False branch while building, set randomBranch=True.
# Animated build (verbose=4) with randomly-ordered branches; discard
# the figure created during fitting.
clf = ClassificationTree()
clf.fit(Xt, yt, randomBranch=True, feature_names=feature_names, verbose=4)
plt.close(clf.fig)
Number of features:: 30 Number of samples :: 398 --------------------------------------- |Building the tree..................... | |.........................tree is buit! ---------------------------------------
# Plot the animated-build tree, then refit with a progress bar
# (verbose=1) and plot the new tree with default styling.
plt.figure(figsize=(6, 6))
clf.plotTree(show=True, scale=False, DiffBranchColor=True)
clf = ClassificationTree()
clf.fit(Xt, yt, feature_names=feature_names, verbose=1)
plt.figure(figsize=(6, 6))
clf.plotTree(show=True)
Number of features:: 30 Number of samples :: 398 --------------------------------------- |Building the tree..................... |subtrees::|100%|-------------------->|- |.........................tree is buit! ---------------------------------------
# Plot the fitted tree and report train/test accuracy.
plt.figure(figsize=(6, 6))
clf.plotTree(show=True)
ytp, ysp = clf.predict(Xt), clf.predict(Xs)
print('Training Accuracy: ', np.mean(ytp == yt))
print('Testing Accuracy: ', np.mean(ysp == ys))
Training Accuracy: 1.0 Testing Accuracy: 0.9532163742690059
It's overfitting; try smaller trees by decreasing the max_depth of the classifier.
# Load a regression dataset and hold out 30% for testing.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2
# (ethical concerns with the dataset). Fall back to the California housing
# data on modern scikit-learn so this regression demo still runs; on older
# versions the original Boston data is used unchanged.
try:
    data = datasets.load_boston()
except (AttributeError, ImportError):
    data = datasets.fetch_california_housing()
X = data.data
y = data.target
feature_names = data.feature_names #Optional
Xt,Xs, yt, ys = train_test_split(X,y,test_size=0.3)
print(X.shape,y.shape, Xt.shape, yt.shape, Xs.shape, ys.shape)
(506, 13) (506,) (354, 13) (354,) (152, 13) (152,)
# Fit an unconstrained regression tree with a progress bar.
rgr = RegressionTree()
rgr.fit(Xt, yt, feature_names=feature_names, verbose=1)
Number of features:: 13 Number of samples :: 354 --------------------------------------- |Building the tree..................... |subtrees::|100%|-------------------->|\ |.........................tree is buit! ---------------------------------------
%matplotlib inline
# Plot the regression tree unscaled, then scaled without titles or
# direction markers.
plt.style.use('default')
plt.figure(figsize=(10, 10))
rgr.plotTree(show=True, DiffBranchColor=True, scale=False)
plt.figure(figsize=(15, 15))
rgr.plotTree(show=True, DiffBranchColor=True, scale=True,
             showtitle=False, showDirection=False)
%matplotlib notebook
# Refit with depth capped at 4 and animate the build (verbose=4).
rgr = RegressionTree(max_depth=4)
rgr.fit(Xt, yt, feature_names=feature_names, verbose=4)
Number of features:: 13 Number of samples :: 354 --------------------------------------- |Building the tree..................... | |.........................tree is buit! ---------------------------------------
# Plot the depth-4 tree and report train/test mean squared error.
plt.figure(figsize=(10, 6))
rgr.plotTree(show=True, DiffBranchColor=True, scale=True,
             showtitle=False, showDirection=False)
ytp, ysp = rgr.predict(Xt), rgr.predict(Xs)
print('Training MSE: ', np.mean((ytp - yt) ** 2))
print('Testing MSE: ', np.mean((ysp - ys) ** 2))
Training MSE: 6.178914224439429 Testing MSE: 12.187349420981135
# Even smaller tree: depth capped at 3, progress-bar verbosity.
rgr = RegressionTree(max_depth=3)
rgr.fit(Xt, yt, feature_names=feature_names, verbose=1)
Number of features:: 13 Number of samples :: 354 --------------------------------------- |Building the tree..................... |subtrees::|100%|-------------------->|\ |.........................tree is buit! ---------------------------------------
# Plot the depth-3 tree (with titles) and report train/test MSE.
plt.figure(figsize=(8, 6))
rgr.plotTree(show=True, DiffBranchColor=True, scale=True,
             showtitle=True, showDirection=False)
ytp, ysp = rgr.predict(Xt), rgr.predict(Xs)
print('Training MSE: ', np.mean((ytp - yt) ** 2))
print('Testing MSE: ', np.mean((ysp - ys) ** 2))
Training MSE: 8.674185349676321 Testing MSE: 12.839154053729468