import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz
from typing import Tuple
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib
from mlxtend.plotting import plot_decision_regions
import graphviz
from hdtree import HDTreeClassifier, EntropyMeasure, SmallerThanSplit, TwentyQuantileSplit, TwentyQuantileRangeSplit, SingleCategorySplit, FixedValueSplit
import pandas as pd
%load_ext autoreload
%autoreload 2
# Colours shared by every figure below, one per class.
blue = "#2a6bb7"
orange = "#E97817"

# Global matplotlib font settings applied to all subsequent plots.
font = dict(family='serif', weight='normal', size=13)
matplotlib.rc('font', **font)

# NOTE(review): `standard_cmap` is not referenced anywhere in the visible code,
# and 'pastell' does not look like a stock colormap/palette name -- confirm
# before using it.
standard_cmap = 'pastell'
sns.set_style("whitegrid")
def generate_example_data(n_samples: int = 3000,
                          street_size: float = .75,
                          flip_probability: float = 0.05,
                          seed: "int | None" = 1) -> Tuple[np.ndarray, np.ndarray]:
    """Generate a noisy two-class 2-D data set split along the diagonal x == y.

    Points are scattered around the origin, then every point closer than
    ``street_size`` to the diagonal is pushed away from it so that an empty
    "street" separates the classes. Points above the diagonal (x < y) are
    labelled 1, all others 0, and finally a fraction of the labels is flipped
    to simulate label noise.

    With the default arguments the output is identical to the original
    hard-coded version (3000 samples, street 0.75, 5 % flips, seed 1).

    Args:
        n_samples: Number of points to generate (must be >= 0).
        street_size: Points with ``|x - y| < street_size`` are shifted by this
            amount on both axes, away from the diagonal.
        flip_probability: Probability in ``[0, 1]`` that a label is inverted.
        seed: Seed passed to ``np.random.seed``. ``None`` makes NumPy reseed
            itself, giving a different data set on every call.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n_samples, 2)`` and ``y`` has shape
        ``(n_samples,)`` with integer labels 0 / 1.

    Raises:
        ValueError: If ``n_samples`` is negative or ``flip_probability`` lies
            outside ``[0, 1]``.
    """
    if n_samples < 0:
        raise ValueError(f"n_samples must be >= 0, got {n_samples}")
    if not 0.0 <= flip_probability <= 1.0:
        raise ValueError(
            f"flip_probability must be within [0, 1], got {flip_probability}")

    # The *global* NumPy RNG is seeded on purpose: code that runs after this
    # function and draws from np.random without its own seed stays
    # reproducible, exactly as it did before this function was parameterized.
    np.random.seed(seed)

    # The uniform and normal draws are interleaved per coordinate; keeping
    # this exact call order is what keeps the generated values stable.
    points = np.array(
        [[np.random.rand() * 10 - 5 + np.random.normal(scale=2),
          np.random.rand() * 10 - 5 + np.random.normal(scale=2)]
         for _ in range(n_samples)],
        dtype=float,
    ).reshape(-1, 2)  # keeps shape (0, 2) when n_samples == 0

    # Free the street in the middle: push near-diagonal points outwards.
    x = points[:, 0].copy()
    y = points[:, 1].copy()
    on_street = np.abs(x - y) < street_size
    above = on_street & (x < y)
    below = on_street & ~(x < y)
    x[above] -= street_size
    y[above] += street_size
    x[below] += street_size
    y[below] -= street_size
    points = np.column_stack([x, y])

    # Label by side of the diagonal: 1 above (x < y), 0 otherwise.
    labels = (points[:, 0] < points[:, 1]).astype(int)

    # Invert a random subset of the labels (one uniform draw per sample).
    keep = np.random.rand(n_samples) < 1 - flip_probability
    labels = np.where(keep, labels, 1 - labels)

    return points, labels
# Build the synthetic "street" data set and hold out a third of it for testing.
# NOTE(review): no random_state is passed to the split or the trees below;
# presumably they rely on the global NumPy seed set inside
# generate_example_data() -- confirm before reordering these calls.
X_street, y_street = generate_example_data()
street_split = train_test_split(X_street, y_street, test_size=.33)
X_street_train, X_street_test, y_street_train, y_street_test = street_split

# Fit one decision tree per depth limit, shallowest first.
trees = [
    DecisionTreeClassifier(max_depth=depth).fit(X_street_train, y_street_train)
    for depth in (4, 8, 16)
]
tree_4, tree_8, tree_16 = trees
# Plot the two-class problem as is, together with the diagonal f(x) = y.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
class_0 = X_street[y_street == 0]
class_1 = X_street[y_street == 1]
class_0_points = ax.scatter(class_0[:, 0], class_0[:, 1], c=orange, marker='+',
                            s=70, alpha=0.8, label='Class 1')
class_1_points = ax.scatter(class_1[:, 0], class_1[:, 1], c=blue, marker='*',
                            s=50, alpha=0.8, label='Class 2')
regressor_line = ax.plot([-10, 10], [-10, 10], linestyle='--', c='black',
                         label='$f(x)=y$')

# Bind every legend entry to its artist explicitly. Passing only a list of
# labels pairs them with the artists in an order that depends on the
# matplotlib version, which can attach the line's label to a scatter plot.
_ = ax.legend(handles=[*regressor_line, class_0_points, class_1_points])

ax.set_xlim([-11, 11])
ax.set_ylim([-11, 11])
# Soften the frame around the plot.
for side in ('bottom', 'top', 'right', 'left'):
    ax.spines[side].set_color('0.5')
plt.tight_layout()
# plt.savefig("linear.png", bbox_inches="tight")
# Plot the decision boundaries of the three decision trees side by side.
fig, axs = plt.subplots(1, 3, figsize=(18, 6))
titles = ["Decision Tree Max. Depth 4", "Decision Tree Max. Depth 8", "Decision Tree Max. Depth 16"]
for ax, fitted_tree, title in zip(axs, trees, titles):
    # Keep pyplot's "current axes" in sync with the panel being drawn.
    plt.sca(ax)
    plot_decision_regions(
        X_street,
        y_street,
        clf=fitted_tree,
        markers='+*',
        colors=','.join([orange, blue]),
        ax=ax,
        legend=0,
        scatter_kwargs={'s': 80, 'edgecolor': None},
        hide_spines=False,
    )
    # Hide the tick labels on both axes.
    for tick_labels in (ax.get_yticklabels(), ax.get_xticklabels()):
        plt.setp(tick_labels, visible=False)
    ax.set_title(title)
    ax.set_xlim([-11, 11])
    ax.set_ylim([-11, 11])
    # Soften the frame around each panel.
    for side in ('bottom', 'top', 'right', 'left'):
        ax.spines[side].set_color('0.5')
plt.tight_layout()
# plt.savefig("tree_decision_region.png", bbox_inches="tight")
# Export the depth-4 tree to DOT source and wrap it in a Graphviz object.
tree_4_dot = export_graphviz(
    tree_4,
    class_names=["Class 1", "Class 2"],
    feature_names=['x', 'y'],
    filled=True,
)
graph = graphviz.Source(tree_4_dot)
# Bare expression: shows the graph when this is the last line of a notebook cell.
graph
# graph.render("dt4.png", format='png', )
# Print the score of every tree on the test split, then on the training split.
for tree in trees:
    test_accuracy = tree.score(X_street_test, y_street_test)
    train_accuracy = tree.score(X_street_train, y_street_train)
    print(test_accuracy, train_accuracy)
from hdtree import HDTreeClassifier, SmallerThanSplit, EntropyMeasure
# Train an HDTree on the street data that may only use SmallerThanSplit rules.
linear_splits = [SmallerThanSplit.build()]
hdtree_linear = HDTreeClassifier(
    allowed_splits=linear_splits,
    information_measure=EntropyMeasure(),
    attribute_names=['x', 'y'],
)
hdtree_linear.fit(X_street_train, y_street_train)
# Bare expressions: only shown when they end a notebook cell.
hdtree_linear.score(X_street_test, y_street_test)
# hdtree_linear.generate_dot_graph().render('linear_tree', format='png')
hdtree_linear.generate_dot_graph()
# Load the Titanic data and reshape it so the target ends up as the last column.
df_titanic = pd.read_csv("data/titanic.csv")
df_titanic.head()

# Replace the free-text name by its length.
df_titanic['Name Length'] = df_titanic['Name'].map(len)

# Pull the numeric target out of the frame and turn it into readable labels.
y_titanic = df_titanic.pop('Survived').map(lambda s: 'Survived' if s == 1 else 'Death')

# Remove the columns that are not used as features.
df_titanic.drop(columns=['Name', 'Ticket'], inplace=True)

# Re-attach the labelled target as the final column.
df_titanic['Survived'] = y_titanic
df_titanic.head()
# Every column but the last is a feature; the last column is the target.
titanic_features = df_titanic.iloc[:, :-1]
titanic_target = df_titanic.iloc[:, -1]
X_titanic_train, X_titanic_test, y_titanic_train, y_titanic_test = train_test_split(
    titanic_features,
    titanic_target,
    test_size=0.33,
    random_state=42,
)
# Feature names, handed to the HDTree classifiers as attribute names.
col_names = list(df_titanic.columns[:-1])
# First Titanic tree: all four split types allowed without restrictions,
# limited to three levels.
titanic_splits = [
    FixedValueSplit.build(),
    SingleCategorySplit.build(),
    TwentyQuantileRangeSplit.build(),
    TwentyQuantileSplit.build(),
]
hdtree_titanic = HDTreeClassifier(
    allowed_splits=titanic_splits,
    information_measure=EntropyMeasure(),
    attribute_names=col_names,
    max_levels=3,
)
hdtree_titanic.fit(X_titanic_train.values, y_titanic_train.values)
hdtree_titanic.generate_dot_graph()
# (train score, test score) -- only shown when this ends a notebook cell.
(
    hdtree_titanic.score(X_titanic_train.values, y_titanic_train),
    hdtree_titanic.score(X_titanic_test.values, y_titanic_test),
)
# hdtree_titanic.generate_dot_graph().render('hd_tree_titanic_1', format='png')
# Second Titanic tree: the fixed-value and single-category splits are built
# with the restriction min_level=1; the quantile splits stay unrestricted.
titanic_splits_2 = [
    FixedValueSplit.build_with_restrictions(min_level=1),
    SingleCategorySplit.build_with_restrictions(min_level=1),
    TwentyQuantileRangeSplit.build(),
    TwentyQuantileSplit.build(),
]
hdtree_titanic_2 = HDTreeClassifier(
    allowed_splits=titanic_splits_2,
    information_measure=EntropyMeasure(),
    attribute_names=col_names,
    max_levels=3,
)
hdtree_titanic_2.fit(X_titanic_train.values, y_titanic_train.values)
hdtree_titanic_2.generate_dot_graph()
# (train score, test score) -- only shown when this ends a notebook cell.
(
    hdtree_titanic_2.score(X_titanic_train.values, y_titanic_train),
    hdtree_titanic_2.score(X_titanic_test.values, y_titanic_test),
)
# Unlike the other trees, this one is actually rendered to disk as a PNG.
hdtree_titanic_2.generate_dot_graph().render('hd_tree_titanic_2', format='png')
# Third Titanic tree: 'Name Length' is blacklisted for the fixed-value split
# and 'PassengerId' for the quantile-range split.
titanic_splits_3 = [
    FixedValueSplit.build_with_restrictions(blacklist_attribute_indices=['Name Length']),
    SingleCategorySplit.build(),
    TwentyQuantileRangeSplit.build_with_restrictions(blacklist_attribute_indices=['PassengerId']),
    TwentyQuantileSplit.build(),
]
hdtree_titanic_3 = HDTreeClassifier(
    allowed_splits=titanic_splits_3,
    information_measure=EntropyMeasure(),
    attribute_names=col_names,
    max_levels=3,
)
hdtree_titanic_3.fit(X_titanic_train.values, y_titanic_train.values)
hdtree_titanic_3.generate_dot_graph()
# (train score, test score) -- only shown when this ends a notebook cell.
(
    hdtree_titanic_3.score(X_titanic_train.values, y_titanic_train),
    hdtree_titanic_3.score(X_titanic_test.values, y_titanic_test),
)
# Print the tree's explanation for training row 42.
titanic_train_rows = X_titanic_train.values
print(hdtree_titanic_3.explain_decision(titanic_train_rows[42]))

# Same passenger with the attribute at position 2 blanked out.
# NOTE(review): presumably this demonstrates how a missing value is handled --
# confirm which column index 2 is.
passenger_42 = titanic_train_rows[42].copy()
passenger_42[2] = None
print(hdtree_titanic_3.explain_decision(passenger_42))

# Text form of the whole tree, then of the nodes returned for min_score=0.5.
print(hdtree_titanic_3)
list(map(str, hdtree_titanic_3.get_clean_nodes(min_score=0.5)))