#!/usr/bin/env python
# coding: utf-8

# In[1]:

# %load /Users/facai/Study/book_notes/preconfig.py
get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
sns.set(font='SimHei')
plt.rcParams['axes.grid'] = False

from IPython.display import SVG

def show_image(filename, figsize=None):
    if figsize:
        plt.figure(figsize=figsize)
    plt.imshow(plt.imread(filename))


# The criterion module `_criterion.*` in depth
# ============================================

# ### 0. Outline
#
# Following the class inheritance hierarchy from top to bottom, this article introduces each class's members and implementation details, in as much depth as possible.

# In[17]:

SVG("./res/uml/Model___criterion_1.svg")


# ### 1. Criterion, the interface class

# Criterion serves as an interface, comparable to an abstract class in Java. Its interface is declared in `_criterion.pxd` and implemented in `_criterion.pyx`.
#
# We cover Criterion in two parts: internal members, and methods.
#
# ##### 1.0 Internal members
# For compute and storage efficiency, sklearn uses some purpose-built data layouts. The ideas are fairly intuitive; let's walk through them via the concrete member variables. A short sketch after this list makes the first two tricks concrete.
#
# + Data striding (stride)
#
#     A two-dimensional array is implemented as a one-dimensional array (really one contiguous buffer) plus offsets, a trick I used to see often in digital image processing.
#
#     ```Python
#     26     # Internal structures
#     27     cdef DOUBLE_t* y                     # Values of y
#     28     cdef SIZE_t y_stride                 # Stride in y (since n_outputs >= 1)
#     29     cdef DOUBLE_t* sample_weight         # Sample weights
#     ```
#
#     The classification labels or regression targets ($y$) may be vectors, so the $y$ values are laid out in strips, each strip naturally of length `y_stride`.
#     `sample_weight` holds the weight of each sample and needs no further explanation.
# + Working with indices
#
#     Splitting a node involves moving samples around, which is costly because $x$ usually has many dimensions. A common remedy for this kind of problem is to swap indices instead of values; in terms of array operations, the index is simply the subscript.
#
#     ```Python
#     31     cdef SIZE_t* samples                 # Sample indices in X, y
#     32     cdef SIZE_t start                    # samples[start:pos] are the samples in the left node
#     33     cdef SIZE_t pos                      # samples[pos:end] are the samples in the right node
#     34     cdef SIZE_t end
#     ```
#
#     Note that the sample index ranges here are half-open, i.e. samples[start:pos) and samples[pos:end).
# + Weighted bookkeeping
#
#     Weighted sample counts:
#     ```Python
#     36     cdef SIZE_t n_outputs                # Number of outputs
#     37     cdef SIZE_t n_node_samples           # Number of samples in the node (end-start)
#     38     cdef double weighted_n_samples       # Weighted number of samples (in total)
#     39     cdef double weighted_n_node_samples  # Weighted number of samples in the node
#     40     cdef double weighted_n_left          # Weighted number of samples in the left node
#     41     cdef double weighted_n_right         # Weighted number of samples in the right node
#     ```
#
#     Weighted cumulative sums of $y$ (for classification, accumulated separately per class):
#     ```Python
#     43     cdef double* sum_total               # For classification criteria, the sum of the
#     44                                          # weighted count of each label. For regression,
#     45                                          # the sum of w*y. sum_total[k] is equal to
#     46                                          # sum_{i=start}^{end-1} w[samples[i]]*y[samples[i], k],
#     47                                          # where k is output index.
#     48     cdef double* sum_left                # Same as above, but for the left side of the split
#     49     cdef double* sum_right               # same as above, but for the right side of the split
#     ```
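# To make the layout concrete, here is a minimal pure-Python sketch (my own
# illustration with made-up data, not sklearn's code) of the two tricks above:
# stride-based 2D indexing into a flat buffer, and a split expressed purely by
# reordering the `samples` index array.
#
# ```Python
# n_samples, n_outputs = 4, 2
# y_stride = n_outputs
#
# # y for 4 samples and 2 outputs, stored as one contiguous buffer
# y_flat = [0., 1.,   1., 0.,   1., 1.,   0., 0.]
#
# def y_at(i, k):
#     """Read y[i, k] out of the flat buffer."""
#     return y_flat[i * y_stride + k]
#
# samples = [0, 1, 2, 3]     # indices into X / y; only these get reordered
# start, pos, end = 0, 2, 4  # left child: samples[start:pos), right: samples[pos:end)
#
# left  = [y_at(i, 0) for i in samples[start:pos]]
# right = [y_at(i, 0) for i in samples[pos:end]]
# print(left, right)         # [0.0, 1.0] [1.0, 0.0]
# ```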
# ##### 1.1 Methods
# + Pure interface methods
#
#     ```Python
#     54     # Methods
#     55     cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
#     56                    double weighted_n_samples, SIZE_t* samples, SIZE_t start,
#     57                    SIZE_t end) nogil
#     58     cdef void reset(self) nogil
#     59     cdef void reverse_reset(self) nogil
#     60     cdef void update(self, SIZE_t new_pos) nogil
#     61     cdef double node_impurity(self) nogil
#     62     cdef void children_impurity(self, double* impurity_left,
#     63                                 double* impurity_right) nogil
#     64     cdef void node_value(self, double* dest) nogil
#     ```
# + Computing the impurity improvement
#
#     ```Python
#     65     cdef double impurity_improvement(self, double impurity) nogil
#     66     cdef double proxy_impurity_improvement(self) nogil
#     ```
#
#     `impurity_improvement` computes the first line below; rearranging it isolates the proxy:
#
#     \begin{align}
#     & \frac{N_t}{N} \left ( \text{ impurity } - \frac{N_{t,R}}{N_t} \text{ right_impurity } - \frac{N_{t,L}}{N_t} \text{ left_impurity } \right ) \\
#     &= \frac{N_t}{N} \text{ impurity } - \frac{N_{t,R}}{N} \text{ right_impurity } - \frac{N_{t,L}}{N} \text{ left_impurity } \\
#     &= \frac{N_t}{N} \text{ impurity } + \frac{1}{N} \underbrace{ \left ( - N_{t,R} \text{ right_impurity } - N_{t,L} \text{ left_impurity } \right ) }_{\text{proxy_impurity_improvement}} \\
#     \end{align}
#
#     For a given node, the remaining terms are constant across all candidate splits, so they can be dropped: `proxy_impurity_improvement` keeps only the underbraced part, ranking splits identically while costing less to compute. A small sanity check follows.
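# The following sketch (my own illustration with made-up numbers, not sklearn
# code) confirms the derivation above: ranking candidate splits by the proxy
# agrees with ranking by the full improvement, since the two differ only by
# per-node constants.
#
# ```Python
# N, N_t, impurity = 100.0, 40.0, 0.48   # fixed for the node during split search
#
# # hypothetical candidate splits: (N_t_L, left_impurity, right_impurity)
# candidates = [(10.0, 0.20, 0.45), (20.0, 0.32, 0.30), (30.0, 0.40, 0.15)]
#
# def full_improvement(n_tl, left, right):
#     n_tr = N_t - n_tl
#     return N_t / N * (impurity - n_tr / N_t * right - n_tl / N_t * left)
#
# def proxy(n_tl, left, right):
#     return -(N_t - n_tl) * right - n_tl * left
#
# # same argmax: full = (N_t / N) * impurity + proxy / N, a monotonic map
# assert (max(candidates, key=lambda c: full_improvement(*c))
#         == max(candidates, key=lambda c: proxy(*c)))
# ```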
# ### 2. Classification criteria

# #### 2.0 ClassificationCriterion

# ClassificationCriterion is the parent class of the classification criteria; it mainly implements initialization and some preprocessing routines. Let's read the code from top to bottom.
#
# ##### 2.0.0 Variables
#
# ```Python
# 207 cdef class ClassificationCriterion(Criterion):
# 208     """Abstract criterion for classification."""
# 209
# 210     cdef SIZE_t* n_classes
# 211     cdef SIZE_t sum_stride
# ```
#
# To support multi-output (multi-target) classification, `n_classes` records the number of classes of each output, and `sum_stride` aligns the results of the different outputs. This is hard to convey in words; the figure below makes it clearer.

# In[18]:

show_image("./res/ClassificationCriterion.jpg", figsize=(8,15))


# ##### 2.0.1 `__cinit__`

# Next, the constructor:
#
# ```Python
# 213     def __cinit__(self, SIZE_t n_outputs,
# 214                   np.ndarray[SIZE_t, ndim=1] n_classes):
# 215         """Initialize attributes for this criterion.
# 216 +-- 7 lines: Parameters---------------------------------------------------------
# 223         """
# 224
# 225 #+-- 20 lines: self.y = NULL------------------------------------------------------
# 245
# 246         safe_realloc(&self.n_classes, n_outputs)
# 247 #+-- 6 lines: cdef SIZE_t k = 0--------------------------------------------------
# 253         for k in range(n_outputs):
# 254             self.n_classes[k] = n_classes[k]
# 255
# 256             if n_classes[k] > sum_stride:
# 257                 sum_stride = n_classes[k]
# 258
# 259         self.sum_stride = sum_stride
# 260
# 261         cdef SIZE_t n_elements = n_outputs * sum_stride
# 262         self.sum_total = <double*> calloc(n_elements, sizeof(double))
# 263 #+-- 7 lines: self.sum_left = calloc(n_elements, sizeof(double))-------
# ```
#
# It does three main jobs:
#
# + initialize the member variables with default values;
# + copy `n_classes`, recording its maximum as `sum_stride` for alignment;
# + allocate space for the `sum_*` buffers that hold the per-class statistics.

# ##### 2.0.2 `init`

# Presumably so that a criterion object can be reused, initialization is factored out into a separate `init` function.
#
# ```Python
# 282     cdef void init(self, DOUBLE_t* y, SIZE_t y_stride,
# 283                    DOUBLE_t* sample_weight, double weighted_n_samples,
# 284                    SIZE_t* samples, SIZE_t start, SIZE_t end) nogil:
# 285         """Initialize the criterion at node samples[start:end] and
# 286 +-- 19 lines: children samples[start:start] and samples[start:end].--------------
# 305         """
# 306
# 307 #+-- 19 lines: self.y = y---------------------------------------------------------
# 326
# 327         for k in range(self.n_outputs):
# 328             memset(sum_total + offset, 0, n_classes[k] * sizeof(double))
# 329             offset += self.sum_stride
# 330
# 331         for p in range(start, end):
# 332             i = samples[p]
# 333 #+-- 4 lines: w is originally set to be 1.0, meaning that if no sample weights---
# 337                 w = sample_weight[i]
# 338
# 339             # Count weighted class frequency for each target
# 340             for k in range(self.n_outputs):
# 341                 c = <SIZE_t> y[i * y_stride + k]
# 342                 sum_total[k * self.sum_stride + c] += w
# 343
# 344             self.weighted_n_node_samples += w
# 345
# 346         # Reset to pos=start
# 347         self.reset()
# ```
#
# The main flow:
#
# + 307L-326L: copy the arguments and set default values;
# + 327L-330L: zero out `sum_total`.
#     Note the optimization at 328L: the last argument to `memset` is each output's actual class count, so only the space actually in use gets cleared.
# + 331L-344L: accumulate the per-class weights into `sum_total` and the total weight into `weighted_n_node_samples`.
#     Here 340L selects the output $k$, 341L reads the class $c$, and the corresponding total $\text{sum_total}[k, c]$ is incremented. (A pure-Python sketch of this loop follows the `reset` section below.)

# ##### 2.0.3 `reset`

# As its name suggests, `reset` does the resetting work: it restores the split position `pos` to the leftmost position `start` and updates the statistics accordingly.
#
# ```Python
# 349     cdef void reset(self) nogil:
# 350         """Reset the criterion at pos=start."""
# 351
# 352         self.pos = self.start
# 353
# 354         self.weighted_n_left = 0.0
# 355         self.weighted_n_right = self.weighted_n_node_samples
# 356
# 357 #+-- 7 lines: cdef double* sum_total = self.sum_total----------------------------
# 364         for k in range(self.n_outputs):
# 365             memset(sum_left, 0, n_classes[k] * sizeof(double))
# 366             memcpy(sum_right, sum_total, n_classes[k] * sizeof(double))
# 367
# 368             sum_total += self.sum_stride
# 369 #+--- 2 lines: sum_left += self.sum_stride---------------------------------------
# ```
#
# The main work: clear the left child's statistics, and overwrite the right child's with the node totals.
#
# `reverse_reset` is the mirror image of `reset`, so we skip it.
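# As promised above, a minimal pure-Python sketch (illustration only, with
# made-up data; not the Cython code) of what `init` computes: per-output,
# per-class weighted counts laid out in a buffer whose rows are padded to
# `sum_stride`, exactly as in the figure in §2.0.0.
#
# ```Python
# n_outputs = 2
# n_classes = [3, 2]           # output 0 has 3 classes, output 1 has 2
# sum_stride = max(n_classes)  # every output's row is padded to the widest one
#
# y = [[0, 1], [2, 0], [0, 1], [1, 1]]   # y[i][k]: class of sample i, output k
# sample_weight = [1.0, 2.0, 1.0, 0.5]
# samples, start, end = [0, 1, 2, 3], 0, 4
#
# sum_total = [0.0] * (n_outputs * sum_stride)
# weighted_n_node_samples = 0.0
# for p in range(start, end):
#     i = samples[p]
#     w = sample_weight[i]
#     for k in range(n_outputs):
#         c = y[i][k]
#         sum_total[k * sum_stride + c] += w   # sum_total[k, c] += w
#     weighted_n_node_samples += w
#
# print(sum_total)                # [2.0, 0.5, 2.0,  2.0, 2.5, 0.0]  (last slot is padding)
# print(weighted_n_node_samples)  # 4.5
# ```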
# ##### 2.0.4 `update`

# `update` moves the split position to the requested place and updates the statistics.
#
# ```Python
# 394     cdef void update(self, SIZE_t new_pos) nogil:
# 395         """Updated statistics by moving samples[pos:new_pos] to the left child.
# 396 +-- 6 lines: Parameters----------------------------------------------------------
# 402         """
# 403         cdef DOUBLE_t* y = self.y
# 404 #+-- 17 lines: cdef SIZE_t pos = self.pos-----------------------------------------
# 421
# 422 #+-- 8 lines: Update statistics up to new_pos-------------------------------------
# 430         if (new_pos - pos) <= (end - new_pos):
# 431             for p in range(pos, new_pos):
# 432 #+-- 8 lines: i = samples[p]------------------------------------------------------
# 440                     sum_left[label_index] += w
# 441
# 442                 self.weighted_n_left += w
# 443
# 444         else:
# 445             self.reverse_reset()
# 446
# 447             for p in range(end - 1, new_pos - 1, -1):
# 448 #+-- 8 lines: i = samples[p]------------------------------------------------------
# 456                     sum_left[label_index] -= w
# 457
# 458                 self.weighted_n_left -= w
# 459
# 460         # Update right part statistics
# 461         self.weighted_n_right = self.weighted_n_node_samples - self.weighted_n_left
# 462         for k in range(self.n_outputs):
# 463             for c in range(n_classes[k]):
# 464                 sum_right[c] = sum_total[c] - sum_left[c]
# 465 #+-- 4 lines: sum_right += self.sum_stride----------------------------------------
# 469
# 470         self.pos = new_pos
# ```
#
# The main flow:
#
# + initialize the local variables;
# + 422L-458L: update the left child's statistics.
#     The optimization here is to compare the distance from the current position to the target against the distance from the target to the right end, and traverse whichever span is shorter.
# + 460L-465L: update the right child's statistics: $\text{sum_right} = \text{sum_total} - \text{sum_left}$.

# ##### 2.0.5 `node_value`

# This simply copies the data in `sum_total` out into `dest`; nothing more to say.
#
# ```Python
# 479     cdef void node_value(self, double* dest) nogil:
# 480         """Compute the node value of samples[start:end] and save it into dest.
# 481 +-- 5 lines: Parameters----------------------------------------------------------
# 486         """
# 487 #+-- 4 lines: cdef double* sum_total = self.sum_total-----------------------------
# 491
# 492         for k in range(self.n_outputs):
# 493             memcpy(dest, sum_total, n_classes[k] * sizeof(double))
# 494 #+-- 2 lines: dest += self.sum_stride---------------------------------------------
# ```

# ##### 2.0.6 Recap

# As we can see, the main job of `ClassificationCriterion` is maintaining the `sum_*` variables. These three buffers hold the per-class weights within the corresponding node, from which the weighted proportion $p_i^k$ of each class $k$ in node $i$ can be computed. And as we know, the common measures gini, entropy, and misclassification rate are all expressed in terms of $p_i^k$. So gathering the weight-updating machinery in one place for reuse is precisely this parent class's reason to exist; the sketch after this section pulls the bookkeeping together.
# Given the proportions $p_i^k$, the subclasses below are pure arithmetic, and since the formulas are simple, the code is very clear. Note that for multi-output problems, the impurity is summed over the outputs and averaged.
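# Here is the promised sketch (illustration only: single output, made-up data,
# not the Cython code) of the incremental bookkeeping in `update`: move
# samples[pos:new_pos] into the left child by adjusting `sum_left` from
# whichever end is closer, then derive `sum_right` by subtraction instead of
# recounting.
#
# ```Python
# def update(new_pos, pos, end, samples, y, w, sum_total, sum_left):
#     if (new_pos - pos) <= (end - new_pos):
#         # walking forward from pos is cheaper: add the moved samples
#         for p in range(pos, new_pos):
#             i = samples[p]
#             sum_left[y[i]] += w[i]
#     else:
#         # walking backward from end is cheaper: start from the node totals
#         # (cf. reverse_reset) and subtract the samples staying on the right
#         sum_left[:] = sum_total
#         for p in range(end - 1, new_pos - 1, -1):
#             i = samples[p]
#             sum_left[y[i]] -= w[i]
#     sum_right = [t - l for t, l in zip(sum_total, sum_left)]
#     return sum_left, sum_right
#
# y, w = [0, 1, 1, 0], [1.0, 1.0, 2.0, 1.0]
# sum_total = [2.0, 3.0]   # weighted class counts over the whole node
# left, right = update(3, 0, 4, [0, 1, 2, 3], y, w, sum_total, [0.0, 0.0])
# print(left, right)       # [1.0, 3.0] [1.0, 0.0]
# ```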
# #### 2.1 Entropy
#
# $\text{cross-entropy} = -\sum_{k=0}^{K-1} \text{count_k} \log(\text{count_k})$
#
# ```Python
# 498 cdef class Entropy(ClassificationCriterion):
# 499     """Cross Entropy impurity criterion.
# 500 +-- 11 lines: This handles cases where the target is a classification taking valu
# 511         cross-entropy = -\sum_{k=0}^{K-1} count_k log(count_k)
# 512     """
# 513
# 514     cdef double node_impurity(self) nogil:
# 515 #+-- 10 lines: """Evaluate the impurity of the current node, i.e. the impurity of-
# 525         for k in range(self.n_outputs):
# 526             for c in range(n_classes[k]):
# 527                 count_k = sum_total[c]
# 528                 if count_k > 0.0:
# 529                     count_k /= self.weighted_n_node_samples
# 530                     entropy -= count_k * log(count_k)
# 531 #+-- 3 lines: sum_total += self.sum_stride----------------------------------------
# 534         return entropy / self.n_outputs
# 535
# 536     cdef void children_impurity(self, double* impurity_left,
# 537                                 double* impurity_right) nogil:
# 538 #+-- 22 lines: """Evaluate the impurity in children nodes--------------------------
# 560         for k in range(self.n_outputs):
# 561             for c in range(n_classes[k]):
# 562                 count_k = sum_left[c]
# 563                 if count_k > 0.0:
# 564                     count_k /= self.weighted_n_left
# 565                     entropy_left -= count_k * log(count_k)
# 566
# 567                 count_k = sum_right[c]
# 568                 if count_k > 0.0:
# 569                     count_k /= self.weighted_n_right
# 570                     entropy_right -= count_k * log(count_k)
# 571 #+-- 3 lines: sum_left += self.sum_stride-----------------------------------------
# 574
# 575         impurity_left[0] = entropy_left / self.n_outputs
# 576         impurity_right[0] = entropy_right / self.n_outputs
# ```

# #### 2.2 Gini

# $\text{index} = 1 - \sum_{k=0}^{K-1} \text{count_k}^2$
#
# ```Python
# 579 cdef class Gini(ClassificationCriterion):
# 580     """Gini Index impurity criterion.
# 581 +-- 11 lines: This handles cases where the target is a classification taking valu
# 592         index = \sum_{k=0}^{K-1} count_k (1 - count_k)
# 593               = 1 - \sum_{k=0}^{K-1} count_k ** 2
# 594     """
# 595
# 596     cdef double node_impurity(self) nogil:
# 597 #+-- 12 lines: """Evaluate the impurity of the current node, i.e. the impurity of-
# 609         for k in range(self.n_outputs):
# 610             sq_count = 0.0
# 611
# 612             for c in range(n_classes[k]):
# 613                 count_k = sum_total[c]
# 614                 sq_count += count_k * count_k
# 615
# 616             gini += 1.0 - sq_count / (self.weighted_n_node_samples *
# 617                                       self.weighted_n_node_samples)
# 618
# 619             sum_total += self.sum_stride
# 620
# 621         return gini / self.n_outputs
# 622
# 623     cdef void children_impurity(self, double* impurity_left,
# 624                                 double* impurity_right) nogil:
# 625         """Evaluate the impurity in children nodes
# 626 #+-- 45 lines: i.e. the impurity of the left child (samples[start:pos]) and the---
# ```

# ### 3. Regression criteria

# #### 3.0 RegressionCriterion

# RegressionCriterion is the parent class of the regression criteria. Just as classification precomputes the weighted proportions $p_i^k$, regression precomputes the node's variance. To save computation, the formula is rearranged so that fewer operations are needed; the derivation goes as follows:
#
# \begin{align}
# \text{var} &= \frac{1}{n} \sum_i^n (y_i - \bar{y})^2 \\
# &= E \left [ (y_i - E(y_i))^2 \right ] \\
# &= E[y_i^2] - E^2[y_i] \\
# &= \frac{1}{n} \sum_i^n y_i^2 - \bar{y}^2 \\
# &= \frac{1}{n} \sum_i^n y_i^2 - \left ( \frac{1}{n} \sum_i^n y_i \right )^2 \\
# & \text{ introducing the sample weights } w_i \\
# &= \sum_i^n w_i y_i^2 / \sum_i^n w_i - \left ( \sum_i^n w_i y_i / \sum_i^n w_i \right )^2 \\
# &= \frac{\text{sq_sum_total}}{\text{weighted_n_node_samples}} - \left (\frac{\text{sum_total}}{\text{weighted_n_node_samples}} \right )^2
# \end{align}
#
# So RegressionCriterion's methods are all about computing and updating the three quantities in the expression above (plus their counterparts for the child nodes). The code closely parallels the classification case, so we do not walk through it again. (A quick numerical check of the rearrangement closes this article.)

# #### 3.1 MSE

# The formula MSE uses here is exactly the variance formula above. Strictly speaking, $\text{MSE} = E \left [ (y_i - \hat{y}_i)^2 \right ]$, but since a decision tree predicts the node mean, $\hat{y}_i = \bar{y}$ for every sample, and the expression reduces to the plain variance.

# #### 3.2 FriedmanMSE

# FriedmanMSE comes from equation (35) of Friedman's paper [Greedy Function Approximation: A Gradient Boosting Machine](https://statweb.stanford.edu/~jhf/ftp/trebst.pdf). It is tied to Gradient Boosting, which I have not studied yet; I will come back and fill this in when writing about GBDT.

# ### Summary
#
# This article went through the implementation of sklearn's criterion module in some detail: one interface class, two parent classes handling the shared bookkeeping, and four concrete subclasses doing the actual computation.
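# Appendix: a quick numerical check (my own sketch, with made-up data) of the
# weighted-variance rearrangement used by RegressionCriterion: the mean-centred
# weighted sum of squares equals sq_sum_total / W - (sum_total / W)**2.
#
# ```Python
# y = [1.0, 2.0, 4.0, 7.0]
# w = [1.0, 0.5, 2.0, 1.5]
#
# W            = sum(w)                                # weighted_n_node_samples
# sum_total    = sum(wi * yi for wi, yi in zip(w, y))
# sq_sum_total = sum(wi * yi * yi for wi, yi in zip(w, y))
#
# mean = sum_total / W
# var_direct     = sum(wi * (yi - mean) ** 2 for wi, yi in zip(w, y)) / W
# var_rearranged = sq_sum_total / W - (sum_total / W) ** 2
# assert abs(var_direct - var_rearranged) < 1e-12     # both equal 4.89 here
# ```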