#!/usr/bin/env python # coding: utf-8 # In[133]: import numpy as np import pandas as pd import matplotlib.pyplot as plt import math import graphviz import IPython import re from sklearn.ensemble import RandomForestRegressor from sklearn.tree import export_graphviz from sklearn import metrics import warnings warnings.filterwarnings("ignore", category=FutureWarning) # In[134]: # This is exact code from fastai 0.7 structured.py def draw_tree(t, df, size=10, ratio=0.6, precision=0): """ Draws a representation of a random forest in IPython. Parameters: ----------- t: The tree you wish to draw df: The data used to train the tree. This is used to get the names of the features. """ s=export_graphviz(t, out_file=None, feature_names=df.columns, filled=True, special_characters=True, rotate=True, precision=precision) IPython.display.display(graphviz.Source(re.sub('Tree {', f'Tree {{ size={size}; ratio={ratio}', s))) # In[135]: # From Notebook Cells class TreeEnsemble(): def __init__(self, x, y, n_trees, sample_sz, min_leaf=5): np.random.seed(42) self.x,self.y,self.sample_sz,self.min_leaf = x,y,sample_sz,min_leaf self.trees = [self.create_tree() for i in range(n_trees)] def create_tree(self): rnd_idxs = np.random.permutation(len(self.y))[:self.sample_sz] return DecisionTree(self.x.iloc[rnd_idxs], self.y[rnd_idxs], idxs=np.array(range(self.sample_sz)), min_leaf=self.min_leaf) def predict(self, x): return np.mean([t.predict(x) for t in self.trees], axis=0) def std_agg(cnt, s1, s2): return math.sqrt((s2/cnt) - (s1/cnt)**2) class DecisionTree(): def __init__(self, x, y, idxs=None, min_leaf=5): if idxs is None: idxs=np.arange(len(y)) self.x,self.y,self.idxs,self.min_leaf = x,y,idxs,min_leaf self.n,self.c = len(idxs), x.shape[1] self.val = np.mean(y[idxs]) self.score = float('inf') self.find_varsplit() def find_varsplit(self): for i in range(self.c): self.find_better_split(i) if self.is_leaf: return x = self.split_col lhs = np.nonzero(x<=self.split)[0] rhs = np.nonzero(x>self.split)[0] self.lhs = DecisionTree(self.x, self.y, self.idxs[lhs]) self.rhs = DecisionTree(self.x, self.y, self.idxs[rhs]) def find_better_split(self, var_idx): x,y = self.x.values[self.idxs,var_idx], self.y[self.idxs] sort_idx = np.argsort(x) sort_y,sort_x = y[sort_idx], x[sort_idx] rhs_cnt,rhs_sum,rhs_sum2 = self.n, sort_y.sum(), (sort_y**2).sum() lhs_cnt,lhs_sum,lhs_sum2 = 0,0.,0. for i in range(0,self.n-self.min_leaf): xi,yi = sort_x[i],sort_y[i] lhs_cnt += 1; rhs_cnt -= 1 lhs_sum += yi; rhs_sum -= yi lhs_sum2 += yi**2; rhs_sum2 -= yi**2 if i