#!/usr/bin/env python
# coding: utf-8

# ### From the Titanic Dataset
# ![](https://upload.wikimedia.org/wikipedia/commons/f/f3/CART_tree_titanic_survivors.png)
# By Stephen Milborrow (Own work) [CC BY-SA 3.0], via Wikimedia Commons
# In[1]:


import mglearn  # credits to Muller and Guido (https://www.amazon.com/dp/1449369413/)
import matplotlib.pyplot as plt
import numpy as np
get_ipython().run_line_magic('matplotlib', 'inline')

# mglearn demo: a decision tree can capture a non-monotonic relationship
# between a feature and the target
mglearn.plots.plot_tree_not_monotone()


# In[2]:


from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)

# An unpruned tree memorizes the training set but generalizes worse
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
print('Accuracy on the training subset: {:.3f}'.format(tree.score(X_train, y_train)))
print('Accuracy on the test subset: {:.3f}'.format(tree.score(X_test, y_test)))


# In[3]:


# Pre-pruning: capping the depth trades a little training accuracy
# for better generalization
tree = DecisionTreeClassifier(max_depth=4, random_state=0)
tree.fit(X_train, y_train)
print('Accuracy on the training subset: {:.3f}'.format(tree.score(X_train, y_train)))
print('Accuracy on the test subset: {:.3f}'.format(tree.score(X_test, y_test)))


# In[4]:


import graphviz
from sklearn.tree import export_graphviz

export_graphviz(tree, out_file='cancertree.dot', class_names=['malignant', 'benign'],
                feature_names=cancer.feature_names, impurity=False, filled=True)

# Render the exported .dot file to the PNG displayed below
with open('cancertree.dot') as f:
    dot_graph = f.read()
graphviz.Source(dot_graph).render('cancertree', format='png')


# ![](cancertree.png)

# In[5]:


print('Feature importances: {}'.format(tree.feature_importances_))
type(tree.feature_importances_)


# In[6]:


print(cancer.feature_names)


# In[7]:


n_features = cancer.data.shape[1]
plt.barh(range(n_features), tree.feature_importances_, align='center')
plt.yticks(np.arange(n_features), cancer.feature_names)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.show()


# ### Advantages of Decision Trees
#
# - easy to visualize and understand
# - no need to pre-process, normalize, scale, or standardize features
#
# ### Parameters to work with
#
# - max_depth
# - min_samples_leaf, min_samples_split
# - max_leaf_nodes
# - etc.
#
# A minimal tuning sketch over these parameters follows below.
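# The grid values in the next cell are illustrative assumptions, not tuned
# recommendations; GridSearchCV cross-validates every combination on the
# training subset and keeps the best one.

# In[8]:


from sklearn.model_selection import GridSearchCV

# Illustrative search space over the pre-pruning parameters listed above
param_grid = {'max_depth': [3, 4, 5, None],
              'min_samples_leaf': [1, 5, 10],
              'max_leaf_nodes': [None, 10, 20]}

grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid, cv=5)
grid.fit(X_train, y_train)

print('Best parameters: {}'.format(grid.best_params_))
print('Accuracy on the test subset: {:.3f}'.format(grid.score(X_test, y_test)))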
# ### Main Disadvantages
#
# - tendency to overfit the training data
# - consequently, poor generalization to unseen data
#
# #### Possible work-around: ensembles of decision trees
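# A minimal sketch of the ensemble work-around using a random forest on the
# same split; n_estimators=100 is an illustrative choice, not a tuned value.

# In[9]:


from sklearn.ensemble import RandomForestClassifier

# Averaging many randomized trees typically curbs the overfitting
# seen with the single unpruned tree above
forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest.fit(X_train, y_train)

print('Accuracy on the training subset: {:.3f}'.format(forest.score(X_train, y_train)))
print('Accuracy on the test subset: {:.3f}'.format(forest.score(X_test, y_test)))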