# setup our standard computation environment
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
sns.set_context('poster')

# set random seed for reproducibility
np.random.seed(12345)

# some data, based on a real Abie project
df = pd.DataFrame(dict(
    age=np.array([19., 19., 20., 22., 19., 20., 20., 24., 20., 20.,
                  19., 20., 19., 21., 21., 20., 19., 19., 22., 20.,
                  20., 19., 18., 19., 19., 19., 20., 18., 19., 19.,
                  19., 22., 21., 20., 20., 18., 19., 20., 22., 18.,
                  19., 21., 22., 21., 26., 20., 19., 20., 18., 20.,
                  20., 18., 27., 19., 20., 18., 20., 19., 20., 19.,
                  21., 19., 19., 22., 18., 20., 20., 24., 18., 19.,
                  18., 20., 19., 20., 22., 19., 19., 18., 19., 26.,
                  22., 20., 19., 19., 20., 21., 21., 19., 22., 19.,
                  19., 20., 19., 22., 22., 22., 20., 20., 18., 20.]),
    level=np.array(['Undergraduate']*67 + ['Medical Student']*33, dtype=object),
    meansalt17to23=np.array([0.0555556, 0.3333333, 0.7222222, 0., 0.0555556, 0.0555556, 0.2777778, 0.2222222, 0.3333333, 0.5555556,
                             0.2777778, 0.1111111, 0.5555556, 0.5555556, 0.3333333, 0.5, 0.1666667, 0.4444444, 0.1111111, 0.6666667,
                             0.6666667, 0.2222222, 0.2222222, 0.0555556, 0.1111111, 0.2777778, 0., 0.1666667, 0.3333333, 0.4444444,
                             0.2777778, 0.1666667, 0.2222222, 0.3333333, 0.0555556, 0.4444444, 0.5555556, 0.2777778, 0.3888889, 0.2222222,
                             0.2777778, 0.3888889, 0.3333333, 0.0555556, 0.4444444, 0.3333333, 0.5555556, 0.2222222, 0.1666667, 0.1666667,
                             0.2222222, 0.2777778, 0.1666667, 0.0555556, 0.6111111, 0.8333333, 0.2777778, 0.1111111, 0.9444444, 0.3333333,
                             0.2777778, 0.2222222, 0.0555556, 0.8888889, 0.0555556, 0.5, 0., 0.3125, 0.5625, 0.,
                             0.0625, 0.125, 0.625, 0.0625, 0.125, 0., 0.5625, 0.125, 0.3125, 0.8125,
                             0.4375, 0., 0.1875, 0.25, 0.4375, 0.3125, 0., 0.1875, 0.0625, 0.25,
                             0., 0.5, 0.375, 0.0625, 0.1875, 0.1875, 0.4375, 0.375, 0.1875, 0.3125]),
))

# use negative MSE, so that higher values are better
def neg_mse(example_indices):
    # impurity of a set of examples: the (negative) variance of their outcomes
    # print(example_indices)
    return -np.var(df.meansalt17to23[example_indices])
# will have a special one for finding splits that have a large difference between treatment and control

# overall MSE impurity
neg_mse(df.index)

# for a given split, eval split quality
# two cases: categorical and numeric
split = 'level'
df[split].dtype

split = 'age'
df[split].dtype

# TODO: make this a robust method
def data_type(s):
    if type(s) == int:
        return 'numeric'
    if type(s) == float:
        return 'numeric'
    if type(s) == str:
        return 'categorical'
    # np.dtype() falls back to the .dtype attribute, so this also handles pd.Series and numpy scalars
    if np.dtype(s) == np.dtype('O'):
        return 'categorical'
    elif np.dtype(s) == np.float64:
        return 'numeric'

assert data_type(20) == 'numeric'
assert data_type(20.) == 'numeric'
assert data_type('Undergraduate') == 'categorical'
assert data_type(df['level']) == 'categorical'
assert data_type(df['age']) == 'numeric'

df.level.unique()
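# Before automating the search, it helps to spell out what "split quality" means for
# one concrete split. (Illustrative sketch added for exposition; not in the original
# notebook.) For a categorical split, quality is the example-weighted average of the
# child impurities, which is exactly the quantity that fit() below will maximize.
n = float(len(df.index))
quality = 0.0
for cat in df['level'].unique():
    child_rows = df.index[df['level'] == cat]
    quality += len(child_rows) / n * neg_mse(child_rows)
print('impurity without split:', neg_mse(df.index))
print('quality of level split:', quality)  # higher (less negative) is better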
# each node is a dict, with keys for:
#   split : str, optional; feature column that the node splits on (node is a leaf if not present)
#   thresh : float, optional; if the split column is numeric, value to split above and below
#   child_VALS : dicts, with VALS = leq, greater for numeric splits, and VALS = df[split].unique() for categorical
#   examples : rows of the pd.DataFrame which pass through this node

# depth 0 tree:
dt_ex0 = {'examples': df.index}

# depth 1 tree:
dt_ex = \
    {'examples': df.index,
     'split': 'age', 'thresh': 25,
     'child_leq': {'examples': df[df.age <= 25].index},
     'child_greater': {'examples': df[df.age > 25].index},
     }

# depth 2 example:
dt_ex = \
    {'examples': df.index,
     'split': 'age', 'thresh': 25,
     'child_leq': {'examples': df[df.age <= 25].index,
                   'split': 'level',
                   'child_Undergraduate': {'examples': df[(df.age <= 25) & (df.level == 'Undergraduate')].index},
                   'child_Medical Student': {'examples': df[(df.age <= 25) & (df.level == 'Medical Student')].index},
                   'child_Resident': {'examples': df[(df.age <= 25) & (df.level == 'Resident')].index},
                   'child_Attending': {'examples': df[(df.age <= 25) & (df.level == 'Attending')].index},
                   },
     'child_greater': {'examples': df[df.age > 25].index,
                       'split': 'level',
                       'child_Undergraduate': {'examples': df[(df.age > 25) & (df.level == 'Undergraduate')].index},
                       'child_Medical Student': {'examples': df[(df.age > 25) & (df.level == 'Medical Student')].index},
                       'child_Resident': {'examples': df[(df.age > 25) & (df.level == 'Resident')].index},
                       'child_Attending': {'examples': df[(df.age > 25) & (df.level == 'Attending')].index},
                       },
     }

# prediction: walk from the root down to the leaf that contains x
def predict(tree, x):
    split = tree.get('split')
    if split:
        # internal node
        if data_type(x[split]) == 'numeric':
            if x[split] <= tree['thresh']:
                return predict(tree['child_leq'], x)
            else:
                return predict(tree['child_greater'], x)
        else:
            # data_type is 'categorical'
            subtree = tree['child_' + str(x[split])]
            return predict(subtree, x)
    else:
        # leaf node
        rows = np.array(tree['examples'])
        return df.meansalt17to23[rows].mean()

predict(dt_ex, {'age': 20, 'level': 'Undergraduate'})

for level in df.level.unique():
    print(level, predict(dt_ex, {'age': 26, 'level': level}))
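# (Illustrative check added for exposition; not in the original notebook.)
# The hand-built depth-2 tree can already be scored on the training data:
# predict every row and compare with the observed outcome via mean squared error.
y_hat = np.array([predict(dt_ex, df.loc[i]) for i in df.index])
print('training MSE of hand-built tree:', np.mean((df.meansalt17to23.values - y_hat) ** 2))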
def mean(rows):
    return df.meansalt17to23[rows].mean()

# to predict for a row of data, recursively descend the tree and use the mean of the leaf node
def predict(tree, x, agg_func):
    """ predict with decision tree `tree` for example `x`

    tree : nested dict, as described above
    x : example dict or pd.Series
    agg_func : function that takes the rows of examples in a leaf and calculates the prediction value
    """
    split = tree.get('split')
    if split:
        # internal node
        if data_type(x[split]) == 'numeric':
            if x[split] <= tree['thresh']:
                return predict(tree['child_leq'], x, agg_func)
            else:
                return predict(tree['child_greater'], x, agg_func)
        else:
            # data_type is 'categorical'
            subtree = tree['child_' + str(x[split])]
            return predict(subtree, x, agg_func)
    else:
        # leaf node
        rows = np.array(tree['examples'])
        return agg_func(rows)

predict(dt_ex, {'age': 20, 'level': 'Undergraduate'}, mean)

for level in df.level.unique():
    print(level, predict(dt_ex, {'age': 26, 'level': level}, mean))

predict(dt_ex, df.loc[0], mean)

def fit(X, y, criteria, depth=0, max_depth=np.inf, min_examples=2):
    """ recursively build a greedy tree (depth-first)

    X : pd.DataFrame of feature vectors (as rows)
    y : pd.Series of labels
    criteria : function that measures the quality of a set of labels (e.g. MSE or gini)
    depth : int, current depth of recursion
    max_depth : int, maximum depth of tree
    min_examples : int, minimum number of examples for an internal node
    """
    # simple cases:
    # not enough examples
    if len(X.index) <= min_examples:
        return {'examples': X.index}
    # tree too deep
    if depth >= max_depth:
        return {'examples': X.index}

    # hard case: search for the best split
    n = float(len(X.index))
    max_quality = -np.inf
    best_split = None
    for split in X.columns:
        if data_type(X[split]) == 'numeric':
            for thresh in X[split].unique():
                left_vals = y[X[split] <= thresh]
                right_vals = y[X[split] > thresh]
                quality = len(left_vals) / n * criteria(left_vals.index) \
                    + len(right_vals) / n * criteria(right_vals.index)
                if quality > max_quality:
                    max_quality = quality
                    best_split = split
                    best_thresh = thresh
        else:
            # split var is categorical
            quality = 0
            for cat in X[split].unique():
                cat_vals = y[X[split] == cat]
                quality += len(cat_vals) / n * criteria(cat_vals.index)
            if quality > max_quality:
                max_quality = quality
                best_split = split

    if not best_split:
        return {'examples': X.index}
    else:
        # found a split to use, so partition on it and recurse
        tree = {'examples': X.index, 'split': best_split}
        if data_type(X[best_split]) == 'numeric':
            tree['thresh'] = best_thresh
            rows = X[best_split] <= best_thresh
            tree['child_leq'] = fit(X[rows], y[rows], criteria, depth+1, max_depth, min_examples)
            rows = X[best_split] > best_thresh
            tree['child_greater'] = fit(X[rows], y[rows], criteria, depth+1, max_depth, min_examples)
        else:
            # best split var is categorical
            for cat in X[best_split].unique():
                rows = X[best_split] == cat
                tree['child_'+cat] = fit(X[rows].drop(best_split, axis=1), y[rows], criteria,
                                         depth+1, max_depth, min_examples)
        return tree

tree = fit(df.filter(['age', 'level']), df.meansalt17to23, neg_mse, max_depth=1)
predict(tree, df.loc[0], mean)
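# (Illustrative check added for exposition; not in the original notebook.)
# Inspect which split fit() chose, and confirm that the depth-1 tree is no worse
# than a constant (root-mean) prediction on the training data.
print('chosen split:', tree['split'], 'thresh:', tree.get('thresh'))
y_hat_tree = np.array([predict(tree, df.loc[i], mean) for i in df.index])
y_hat_const = np.full(len(df), df.meansalt17to23.mean())
print('tree  MSE:', np.mean((df.meansalt17to23.values - y_hat_tree) ** 2))
print('const MSE:', np.mean((df.meansalt17to23.values - y_hat_const) ** 2))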
# interactively, with a recursive tree printer
def print_tree(tree, depth=0):
    indent = ' '*depth
    if 'split' in tree:
        if 'thresh' in tree:
            # numeric split
            print(indent + 'if X[%s] <= %.2f:' % (tree['split'], tree['thresh']))
            print_tree(tree['child_leq'], depth+1)
            print(indent + 'else:')
            print_tree(tree['child_greater'], depth+1)
        else:
            # categorical split
            for k in tree:
                if k.startswith('child_'):
                    print(indent + 'if X[%s] == "%s":' % (tree['split'], k.replace('child_', '')))
                    print_tree(tree[k], depth+1)
    else:
        # leaf node; show a couple of the example rows that ended up here
        rows = np.array(tree['examples'])
        print(indent + 'return agg_func(%s, ...) [%d examples]' % (list(rows[:2]), len(rows)))

# depth one, see that it works right
n = 3
tree = fit(df.loc[:n, ['age']], df.meansalt17to23.loc[:n], neg_mse, max_depth=1)  # .loc so X and y cover the same rows
print_tree(tree)

df.filter(['age', 'level', 'meansalt17to23']).loc[:n]

np.random.seed(123456)

x_true = np.linspace(0, 10, 1000)
y_true = np.cos(x_true)

x_train = np.random.choice(x_true, size=100)
y_train = np.random.normal(np.cos(x_train), 0.4)

plt.plot(x_true, y_true, label='truth')
plt.plot(x_train, y_train, 's', label='train')
plt.legend()

X = pd.DataFrame({'x': x_train})
y = pd.Series(y_train)

# redefine the criterion and the leaf aggregator to use the new outcome y
def neg_mse(rows):
    return -y[rows].var()

def mean(rows):
    return y[rows].mean()

tree = fit(X, y, neg_mse, 0, 25, 25)  # depth=0, max_depth=25, min_examples=25
print_tree(tree)

y_pred = [predict(tree, {'x': x}, mean) for x in x_true]

plt.plot(x_true, y_true, label='truth')
plt.plot(x_train, y_train, 's', label='train')
plt.plot(x_true, y_pred, label='pred')
plt.legend()
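# (Optional sanity check added for exposition; not in the original notebook, and it
# assumes scikit-learn is installed.) sklearn's DecisionTreeRegressor with roughly
# comparable settings should give a similar piecewise-constant fit to the cosine data.
from sklearn.tree import DecisionTreeRegressor

sk_tree = DecisionTreeRegressor(max_depth=25, min_samples_split=25)
sk_tree.fit(X, y)
y_pred_sk = sk_tree.predict(pd.DataFrame({'x': x_true}))

plt.plot(x_true, y_true, label='truth')
plt.plot(x_true, y_pred, label='pred (greedy tree above)')
plt.plot(x_true, y_pred_sk, label='pred (sklearn)')
plt.legend()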