#!/usr/bin/env python
# coding: utf-8

# In[1]:

# %load /Users/facai/Study/book_notes/preconfig.py
get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
sns.set(font='SimHei')
plt.rcParams['axes.grid'] = False

from IPython.display import SVG

def show_image(filename, figsize=None):
    if figsize:
        plt.figure(figsize=figsize)
    plt.imshow(plt.imread(filename))


# The Tree-Building Module `_tree.*` in Detail
# =====================

# ### 0. Outline
#
# This module contains two kinds of classes: `Tree`, which implements the binary tree itself, and the `TreeBuilder` classes, which construct the whole tree. We focus on the builder classes, and briefly cover a few functions of `Tree` at the end.

# In[3]:

SVG("./res/uml/Model___tree_3.svg")


# ### 1. Builder classes

# #### 1.0 TreeBuilder

# `TreeBuilder` defines the interface method `build` and one concrete input-checking method, `_check_input`; there is little else to say about it.

# #### 1.1 DepthFirstTreeBuilder

# `DepthFirstTreeBuilder` grows the whole decision tree in pre-order fashion, using a stack as its working data structure.
#
# The main flow is (a runnable sketch follows section 1.2):
#
# 1. Pop a node record off the stack.
# 2. Compute the split point:
#    + if the node meets a leaf condition, do nothing further with it;
#    + otherwise, push the right child onto the stack, then the left child.
# 3. When the stack is empty, the tree is complete.

# The main body of the code (line numbers refer to `_tree.pyx`):
#
# ```Python
# 195 with nogil:
# 196     # push root node onto stack
# 197     rc = stack.push(0, n_node_samples, 0, _TREE_UNDEFINED, 0, INFINITY, 0)
# 198     #+-- 4 lines: if rc == -1:-----------------------------------------------------------------
# 202
# 203     while not stack.is_empty():
# 204         stack.pop(&stack_record)
# 205         #+-- 9 lines: start = stack_record.start---------------------------------------------------
# 214         n_node_samples = end - start
# 215         splitter.node_reset(start, end, &weighted_n_node_samples)
# 216
# 217         is_leaf = ((depth >= max_depth) or
# 218                    (n_node_samples < min_samples_split) or
# 219                    (n_node_samples < 2 * min_samples_leaf) or
# 220                    (weighted_n_node_samples < min_weight_leaf))
# 221
# 222         if first:
# 223             impurity = splitter.node_impurity()
# 224             first = 0
# 225
# 226         is_leaf = is_leaf or (impurity <= MIN_IMPURITY_SPLIT)
# 227
# 228         if not is_leaf:
# 229             splitter.node_split(impurity, &split, &n_constant_features)
# 230             is_leaf = is_leaf or (split.pos >= end)
# 231
# 232         node_id = tree._add_node(parent, is_left, is_leaf, split.feature,
# 233                                  split.threshold, impurity, n_node_samples,
# 234                                  weighted_n_node_samples)
# 235
# 236         #+-- 4 lines: if node_id == (-1):--------------------------------------------------
# 240         # Store value for all nodes, to facilitate tree/model
# 241         # inspection and interpretation
# 242         splitter.node_value(tree.value + node_id * tree.value_stride)
# 243
# 244         if not is_leaf:
# 245             # Push right child on stack
# 246             rc = stack.push(split.pos, end, depth + 1, node_id, 0,
# 247                             split.impurity_right, n_constant_features)
# 248             #+-- 3 lines: if rc == -1:-----------------------------------------------------------------
# 251             # Push left child on stack
# 252             rc = stack.push(start, split.pos, depth + 1, node_id, 1,
# 253                             split.impurity_left, n_constant_features)
# 254             #+-- 2 lines: if rc == -1:-----------------------------------------------------------------
# 256
# ```

# #### 1.2 BestFirstTreeBuilder
#
# `BestFirstTreeBuilder` always splits the most impure node first (the node with the largest impurity), using a max-heap as its working data structure. Its flow is otherwise much the same as `DepthFirstTreeBuilder`, so we do not repeat it here.
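# To make the depth-first flow in section 1.1 concrete, here is a minimal,
# runnable pure-Python sketch of the same stack-driven loop. It is an
# illustration, not sklearn's code: the helpers `gini` and `find_best_split`,
# the dict-based node records, and the toy 1-D dataset are all hypothetical
# stand-ins for the Cython `Splitter`/`Tree` machinery.

# In[ ]:

from collections import Counter

def gini(labels):
    """Gini impurity of a list of class labels."""
    n = len(labels)
    return 1.0 - sum((c / n) ** 2 for c in Counter(labels).values())

def find_best_split(xs, ys):
    """Best 1-D threshold by weighted child impurity; None if no split exists."""
    order = sorted(range(len(xs)), key=lambda i: xs[i])
    best = None
    for k in range(1, len(xs)):
        lo, hi = xs[order[k - 1]], xs[order[k]]
        if lo == hi:                       # cannot separate identical values
            continue
        left = [ys[i] for i in order[:k]]
        right = [ys[i] for i in order[k:]]
        score = (len(left) * gini(left) + len(right) * gini(right)) / len(xs)
        if best is None or score < best[1]:
            best = ((lo + hi) / 2.0, score)
    return best

def build_depth_first(xs, ys, max_depth=3, min_samples_split=2):
    """Pre-order growth with an explicit stack: pop a record, test the leaf
    conditions, split, then push the right child before the left one."""
    nodes = []                                        # flat node array, like Tree
    stack = [(list(range(len(xs))), 0, None, True)]   # (sample ids, depth, parent, is_left)
    while stack:
        idx, depth, parent, is_left = stack.pop()
        sub_y = [ys[i] for i in idx]
        impurity = gini(sub_y)
        # leaf tests, mirroring the depth / sample-count / impurity checks above
        is_leaf = (depth >= max_depth or len(idx) < min_samples_split
                   or impurity == 0.0)
        split = None if is_leaf else find_best_split([xs[i] for i in idx], sub_y)
        is_leaf = is_leaf or split is None
        node_id = len(nodes)
        nodes.append({'threshold': None if is_leaf else split[0],
                      'impurity': impurity,
                      'n_samples': len(idx),
                      'value': dict(Counter(sub_y))})  # per-class counts, like node_value
        if parent is not None:
            nodes[parent]['left' if is_left else 'right'] = node_id
        if not is_leaf:
            left_idx = [i for i in idx if xs[i] <= split[0]]
            right_idx = [i for i in idx if xs[i] > split[0]]
            stack.append((right_idx, depth + 1, node_id, False))  # right first,
            stack.append((left_idx, depth + 1, node_id, True))    # so left pops first
    return nodes

xs = [1.0, 2.0, 3.0, 10.0, 11.0, 12.0]
ys = [0, 0, 0, 1, 1, 1]
for node_id, node in enumerate(build_depth_first(xs, ys)):
    print(node_id, node)


# Because each right child is pushed before its sibling, nodes are popped and numbered in pre-order: the root gets id 0 and the whole left subtree precedes the right one, which matches how sklearn lays out its node array.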
# ### 2. Implementation classes

# #### 2.0 Tree

# sklearn implements the binary tree with flat arrays. The function I find most interesting here is `compute_feature_importances`, which computes the importance of each feature.
#
# The idea behind it is quite simple: walk over the internal decision nodes and, for each feature, accumulate how much impurity its splits removed. The code is short and easy to follow (line numbers again refer to `_tree.pyx`; a runnable cross-check appears at the end of this note):
#
# ```Python
# 1033 cpdef compute_feature_importances(self, normalize=True):
# 1034     """Computes the importance of each feature (aka variable)."""
# 1035     #+-- 3 lines: cdef Node* left--------------------------------------------------------------
# 1038     cdef Node* node = nodes
# 1039     cdef Node* end_node = node + self.node_count
# 1040     #+-- 3 lines: cdef double normalizer = 0.--------------------------------------------------
# 1043     cdef np.ndarray[np.float64_t, ndim=1] importances
# 1044     #+-- 2 lines: importances = np.zeros((self.n_features,))-----------------------------------
# 1046
# 1047     with nogil:
# 1048         while node != end_node:
# 1049             if node.left_child != _TREE_LEAF:
# 1050                 # ... and node.right_child != _TREE_LEAF:
# 1051                 left = &nodes[node.left_child]
# 1052                 right = &nodes[node.right_child]
# 1053
# 1054                 importance_data[node.feature] += (
# 1055                     node.weighted_n_node_samples * node.impurity -
# 1056                     left.weighted_n_node_samples * left.impurity -
# 1057                     right.weighted_n_node_samples * right.impurity)
# 1058             node += 1
# 1059
# 1060     importances /= nodes[0].weighted_n_node_samples
# 1061
# 1062     if normalize:
# 1063         normalizer = np.sum(importances)
# 1064         #+-- 3 lines: if normalizer > 0.0:---------------------------------------------------------
# 1067         importances /= normalizer
# 1068
# 1069     return importances
# ```

# In addition, in the `tree.py` module the decision tree classifier can also produce probability estimates: the probability is simply each class's share of the samples that fall into the reached leaf.
#
# The full computation path is:
#
# 1. `_tree.pyx:Tree.predict` uses `_tree.pyx:Tree.apply` to find the leaf a sample lands in, then reads the per-class sample counts stored at that leaf via `_tree.pyx:Tree._get_value_ndarray`.
#
# 2. `tree.py:DecisionTreeClassifier.predict_proba` turns those counts into fractions.

# ### Summary
#
# This note walked through the two tree-building strategies, and through how feature importances and predicted class probabilities are computed.
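# As a closing sanity check, both calculations from section 2 can be reproduced
# with scikit-learn's public `tree_` arrays (`children_left`, `children_right`,
# `feature`, `impurity`, `weighted_n_node_samples`, `value`). This is a sketch
# assuming a recent scikit-learn and the iris toy dataset, not the library's
# internal code path.

# In[ ]:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
clf = DecisionTreeClassifier(random_state=0).fit(X, y)
t = clf.tree_

# Re-derive feature_importances_: for every internal node, credit its feature
# with the weighted impurity removed, then divide by the root weight and normalize.
importances = np.zeros(t.n_features)
for node in range(t.node_count):
    left, right = t.children_left[node], t.children_right[node]
    if left == -1:                     # _TREE_LEAF: leaves remove no impurity
        continue
    importances[t.feature[node]] += (
        t.weighted_n_node_samples[node] * t.impurity[node]
        - t.weighted_n_node_samples[left] * t.impurity[left]
        - t.weighted_n_node_samples[right] * t.impurity[right])
importances /= t.weighted_n_node_samples[0]
importances /= importances.sum()       # the normalize=True branch
print(np.allclose(importances, clf.feature_importances_))   # expect: True

# Re-derive predict_proba for one sample: the value stored at its leaf holds
# per-class statistics, and the probability is each class's share of the total.
leaf = clf.apply(X[:1])[0]
counts = t.value[leaf, 0]
print(counts / counts.sum())
print(clf.predict_proba(X[:1])[0])     # expect: the same fractions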