In [133]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import graphviz
import IPython
import re
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import export_graphviz
from sklearn import metrics

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
In [134]:
# This is the exact code from fastai 0.7's structured.py

def draw_tree(t, df, size=10, ratio=0.6, precision=0):
    """ Draws a representation of a random forest in IPython.
    Parameters:
    -----------
    t: The tree you wish to draw
    df: The data used to train the tree. This is used to get the names of the features.
    """
    s=export_graphviz(t, out_file=None, feature_names=df.columns, filled=True,
                      special_characters=True, rotate=True, precision=precision)
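    # Patch the DOT source so the graph gets explicit size/ratio attributes.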
    IPython.display.display(graphviz.Source(re.sub('Tree {',
       f'Tree {{ size={size}; ratio={ratio}', s)))
In [135]:
# From Notebook Cells

class TreeEnsemble():
    def __init__(self, x, y, n_trees, sample_sz, min_leaf=5):
        np.random.seed(42)
        self.x,self.y,self.sample_sz,self.min_leaf = x,y,sample_sz,min_leaf
        self.trees = [self.create_tree() for i in range(n_trees)]

    def create_tree(self):
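        # Draw a without-replacement sample of sample_sz rows for this tree.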
        rnd_idxs = np.random.permutation(len(self.y))[:self.sample_sz]
        return DecisionTree(self.x.iloc[rnd_idxs], self.y[rnd_idxs], 
                    idxs=np.array(range(self.sample_sz)), min_leaf=self.min_leaf)
        
    def predict(self, x):
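        # Bagging: each tree predicts, and the ensemble returns the row-wise mean.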
        return np.mean([t.predict(x) for t in self.trees], axis=0)

# One-pass standard deviation from running aggregates:
# sqrt(E[y^2] - E[y]^2) = sqrt(s2/cnt - (s1/cnt)**2), with s1 = sum(y), s2 = sum(y**2).
def std_agg(cnt, s1, s2): return math.sqrt((s2/cnt) - (s1/cnt)**2)

class DecisionTree():
    def __init__(self, x, y, idxs=None, min_leaf=5):
        if idxs is None: idxs=np.arange(len(y))
        self.x,self.y,self.idxs,self.min_leaf = x,y,idxs,min_leaf
        self.n,self.c = len(idxs), x.shape[1]
        self.val = np.mean(y[idxs])
        self.score = float('inf')
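        # score stays +inf until find_better_split accepts a split;
        # is_leaf later keys off this sentinel.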
        self.find_varsplit()
        
    def find_varsplit(self):
        for i in range(self.c): self.find_better_split(i)
        if self.is_leaf: return
        x = self.split_col
        lhs = np.nonzero(x<=self.split)[0]
        rhs = np.nonzero(x>self.split)[0]
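        # NB: min_leaf is not forwarded to the children, so they fall back
        # to the constructor default of 5.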
        self.lhs = DecisionTree(self.x, self.y, self.idxs[lhs])
        self.rhs = DecisionTree(self.x, self.y, self.idxs[rhs])

    def find_better_split(self, var_idx):
        x,y = self.x.values[self.idxs,var_idx], self.y[self.idxs]
        sort_idx = np.argsort(x)
        sort_y,sort_x = y[sort_idx], x[sort_idx]
        rhs_cnt,rhs_sum,rhs_sum2 = self.n, sort_y.sum(), (sort_y**2).sum()
        lhs_cnt,lhs_sum,lhs_sum2 = 0,0.,0.

        # Walk the sorted column once, shifting one row at a time from the
        # right side to the left and updating running count/sum/sum-of-squares,
        # so every candidate split is scored in O(1).
        for i in range(0,self.n-self.min_leaf):
            xi,yi = sort_x[i],sort_y[i]
            lhs_cnt += 1; rhs_cnt -= 1
            lhs_sum += yi; rhs_sum -= yi
            lhs_sum2 += yi**2; rhs_sum2 -= yi**2
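            # Skip candidates that leave fewer than min_leaf rows on the left,
            # or that would cut between two identical x values.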
            if i<self.min_leaf-1 or xi==sort_x[i+1]:
                continue

            lhs_std = std_agg(lhs_cnt, lhs_sum, lhs_sum2)
            rhs_std = std_agg(rhs_cnt, rhs_sum, rhs_sum2)
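            # Size-weighted sum of the side std devs; lower is better.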
            curr_score = lhs_std*lhs_cnt + rhs_std*rhs_cnt
            if curr_score<self.score: 
                self.var_idx,self.score,self.split = var_idx,curr_score,xi

    @property
    def split_name(self): return self.x.columns[self.var_idx]
    
    @property
    def split_col(self): return self.x.values[self.idxs,self.var_idx]

    @property
    def is_leaf(self): return self.score == float('inf')
    
    def __repr__(self):
        s = f'n: {self.n}; val:{self.val}'
        if not self.is_leaf:
            s += f'; score:{self.score}; split:{self.split}; var:{self.split_name}'
        return s

    def predict(self, x):
        return np.array([self.predict_row(xi) for xi in x])

    def predict_row(self, xi):
        if self.is_leaf: return self.val
        t = self.lhs if xi[self.var_idx]<=self.split else self.rhs
        return t.predict_row(xi)

Spoof some data: y is built from columns 1, 2, 3 and 5 of df_trn, so the remaining columns are pure noise.

In [155]:
df_trn = pd.DataFrame(np.random.uniform(low=-0.5, high=1, size=(500, 10)))
y_trn = pd.Series(df_trn.loc[:,1] + 0.5*df_trn.loc[:,2] + 5*df_trn.loc[:,3]*df_trn.loc[:,5])
In [156]:
def split_vals(a,n): return a[:n], a[n:]
n_valid = 100
n_trn = len(df_trn)-n_valid
X_train, X_valid = split_vals(df_trn, n_trn)
y_train, y_valid = split_vals(y_trn, n_trn)

The sklearn version first, as a reference point: one tree, no bootstrapping, min_samples_leaf=100, which should match the TreeEnsemble run below.

In [157]:
m = RandomForestRegressor(n_estimators=1, min_samples_leaf=100, max_features=None, bootstrap=False)
m.fit(X_train, y_train)
preds = m.predict(X_train)
In [158]:
draw_tree(m.estimators_[0], X_train, precision=4)
[Rendered graphviz tree: node 0 splits on feature 5 ≤ 0.2967 (mse=1.5863, samples=400, value=0.7368). True branch: node 1 splits on feature 1 ≤ 0.3243 (mse=0.5262, samples=217, value=0.295), giving leaf node 2 (mse=0.4037, samples=105, value=-0.07) and leaf node 3 (mse=0.399, samples=112, value=0.6373). False branch: leaf node 4 (mse=2.3377, samples=183, value=1.2605).]
In [159]:
m_rf = TreeEnsemble(X_train, y_train, n_trees=1, sample_sz=400, min_leaf=100)
In [160]:
tree = m_rf.trees[0]
In [161]:
tree
Out[161]:
n: 400; val:0.7367552209079351; score:500.9992481377045; split:0.31205441523590305; var:6
In [162]:
tree.lhs
Out[162]:
n: 223; val:0.6844595432665761
In [163]:
tree.rhs
Out[163]:
n: 177; val:0.8026418656199293
In [164]:
tree.lhs.is_leaf
Out[164]:
True
In [165]:
tree.rhs.is_leaf
Out[165]:
True

Why does this tree stop splitting after just the root? Both tree.lhs and tree.rhs come back as leaves even though they hold 223 and 177 rows, and the root splits on column 6, which does not even appear in the formula for y, while the sklearn tree above splits on column 5 and goes a level deeper.
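
My best guess at where to look, though I have not confirmed it: y_trn is a pandas Series here (the fastai notebook this code comes from used a numpy array for y, if I remember right), so expressions like y[sort_idx] in find_better_split index by label, while x goes through .values and is indexed by position. At the root, idxs is 0..399 and, because sample_sz equals len(y_train), every label happens to exist; inside a child node the sub-Series only carries that child's scattered labels. Note also that the FutureWarning filter at the top would hide the warning older pandas emits before filling missing labels with NaN, and a NaN score never beats inf. The cell below is a quick probe of that hypothesis.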

In [ ]:
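# Quick probe of the label-vs-positional indexing hypothesis (a sketch, not a fix)
child = tree.lhs
y_sub = child.y[child.idxs]   # label-based lookup, exactly what find_better_split does
print(child.score)            # inf -> no candidate split was ever accepted here
# find_better_split then asks y_sub for labels 0..n-1 via y[sort_idx];
# what fraction of those labels actually exists in this node?
print(pd.Index(range(child.n)).isin(y_sub.index).mean())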