#!/usr/bin/env python
# coding: utf-8

# In[1]:

# %load /Users/facai/Study/book_notes/preconfig.py
get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
sns.set(font='SimHei')
plt.rcParams['axes.grid'] = False

from IPython.display import SVG

def show_image(filename, figsize=None):
    if figsize:
        plt.figure(figsize=figsize)
    plt.imshow(plt.imread(filename))


# The Splitter Module `_splitter.*` in Detail
# =================================

# ### 0. Outline
#
# The splitter is specialized into four subclasses, according to the sparsity of the input data and the way candidate thresholds are traversed. We explore each of them below.

# In[2]:

SVG("./res/uml/Model___splitter_2.svg")


# ### 1. `Splitter`

# `Splitter` is the interface class of the splitting module. It is declared in `_splitter.pxd` and implemented in `_splitter.pyx`.
#
# To record the outcome of each split, the `SplitRecord` struct is defined first. Its comments are self-explanatory:
#
# ```Python
#  23 cdef struct SplitRecord:
#  24     # Data to track sample split
#  25     SIZE_t feature         # Which feature to split on.
#  26     SIZE_t pos             # Split samples array at the given position,
#  27                            # i.e. count of samples below threshold for feature.
#  28                            # pos is >= end if the node is a leaf.
#  29     double threshold       # Threshold to split at.
#  30     double improvement     # Impurity improvement given parent node.
#  31     double impurity_left   # Impurity of the left split.
#  32     double impurity_right  # Impurity of the right split.
# ```

# #### 1.0 Internal members

# The member variables fall roughly into six parts:
#
# 1. Criterion and stopping conditions:
#
# ```Python
#  34 cdef class Splitter:
#  35     #+-- 5 lines: The splitter searches in the input space for a feature and a thresho
#  40     # Internal structures
#  41     cdef public Criterion criterion      # Impurity criterion
#  42     cdef public SIZE_t max_features      # Number of features to test
#  43     cdef public SIZE_t min_samples_leaf  # Min samples in a leaf
#  44     cdef public double min_weight_leaf   # Minimum weight in a leaf
# ```
#
# 2. Random seed, used when drawing features and thresholds:
#
# ```Python
#  46     cdef object random_state             # Random state
#  47     cdef UINT32_t rand_r_state           # sklearn_rand_r random number state
# ```
#
# 3. Sample indices and feature indices:
#    + to keep data movement cheap, the code relies heavily on swapping indices instead of the data itself;
#    + note `constant_features`: at some nodes, certain features (mostly categorical ones) have collapsed to a single value and can no longer be split on. To avoid wasting computation on them, sklearn records which features are already constant at each split and hands this information down from parent to child, generation after generation.
#
# ```Python
#  49     cdef SIZE_t* samples                 # Sample indices in X, y
#  50     cdef SIZE_t n_samples                # X.shape[0]
#  51     cdef double weighted_n_samples       # Weighted number of samples
#  52     cdef SIZE_t* features                # Feature indices in X
#  53     cdef SIZE_t* constant_features       # Constant features indices
#  54     cdef SIZE_t n_features               # X.shape[1]
#  55     cdef DTYPE_t* feature_values         # temp. array holding feature values
# ```
#
# 4. Node boundaries: sklearn stores X and y as flat arrays and, after each split, groups the indices of the samples belonging to one child node together, so a node is fully described by its start and end positions (a sketch follows this list).
#
# ```Python
#  57     cdef SIZE_t start                    # Start position for the current node
#  58     cdef SIZE_t end                      # End position for the current node
# ```
#
# 5. Presorting: an index matrix sorted by each feature's values is supplied in advance.
#
# ```Python
#  60     cdef bint presort                    # Whether to use presorting, only
#  61                                          # allowed on dense data
# ```
#
# 6. Labels and weights:
#
# ```Python
#  63     cdef DOUBLE_t* y
#  64     cdef SIZE_t y_stride
#  65     cdef DOUBLE_t* sample_weight
# ```
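# Points 3 and 4 become concrete with a small example. Below is a minimal pure-Python sketch (our illustration, not sklearn's code, which works on raw C pointers): a node is the slice `samples[start:end]`, and splitting it partitions that slice in place by swapping indices, so no row of X ever moves.

# In[ ]:

import numpy as np

def partition_node(X, samples, start, end, feature, threshold):
    """Group samples[start:end] so that rows with X[i, feature] <= threshold
    come first, and return the split position. Only the index array is
    rearranged; X itself is never touched."""
    p, partition_end = start, end
    while p < partition_end:
        if X[samples[p], feature] <= threshold:
            p += 1
        else:
            partition_end -= 1
            samples[p], samples[partition_end] = samples[partition_end], samples[p]
    return p

X = np.array([[0.1], [0.9], [0.4], [0.7]])
samples = [0, 1, 2, 3]                  # one node covering all four samples
pos = partition_node(X, samples, 0, 4, feature=0, threshold=0.5)
# left child = samples[0:pos] -> [0, 2]; right child = samples[pos:4] -> [3, 1]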
# #### 1.1 Methods

# ##### 1.1.0 `init`
#
# `init` is the initialization routine run when the splitter is invoked.
#
# ```Python
# 119     cdef void init(self,
# 120                    object X,
# 121                    np.ndarray[DOUBLE_t, ndim=2, mode="c"] y,
# 122                    DOUBLE_t* sample_weight,
# 123                    np.ndarray X_idx_sorted=None) except *:
# 124         """Initialize the splitter.
# 125 +-- 15 lines: Take in the input data X, the target Y, and optional sample weights
# 140         """
# 141
# 142         self.rand_r_state = self.random_state.randint(0, RAND_R_MAX)
# 143         cdef SIZE_t n_samples = X.shape[0]
# 144 #+-- 3 lines: Create a new array which will be used to store nonzero-------------
# 147         cdef SIZE_t* samples = safe_realloc(&self.samples, n_samples)
# 148 #+-- 5 lines: cdef SIZE_t i, j---------------------------------------------------
# 153         for i in range(n_samples):
# 154             # Only work with positively weighted samples
# 155             if sample_weight == NULL or sample_weight[i] != 0.0:
# 156                 samples[j] = i
# 157                 j += 1
# 158
# 159             if sample_weight != NULL:
# 160                 weighted_n_samples += sample_weight[i]
# 161             else:
# 162                 weighted_n_samples += 1.0
# 163
# 164         # Number of samples is number of positively weighted samples
# 165         self.n_samples = j
# 166         self.weighted_n_samples = weighted_n_samples
# 167
# 168         cdef SIZE_t n_features = X.shape[1]
# 169         cdef SIZE_t* features = safe_realloc(&self.features, n_features)
# 170
# 171         for i in range(n_features):
# 172             features[i] = i
# 173
# 174         self.n_features = n_features
# 175
# 176 #+-- 7 lines: safe_realloc(&self.feature_values, n_samples)----------------------
# ```
#
# The main flow:
#
# 1. 142L, seed the random number generator;
# 2. 143L-166L, copy the indices of the samples whose weight is nonzero, and accumulate the total weight;
# 3. 168L-174L, set up the feature indices;
# 4. the rest is further allocation and preparation.
#
# The remaining methods are all very simple, so we only list their signatures:
#
# ```Python
#  88     cdef void node_reset(self, SIZE_t start, SIZE_t end,
#  89                          double* weighted_n_node_samples) nogil
#  90
#  91     cdef void node_split(self,
#  92                          double impurity,   # Impurity of the node
#  93                          SplitRecord* split,
#  94                          SIZE_t* n_constant_features) nogil
#  95
#  96     cdef void node_value(self, double* dest) nogil
#  97
#  98     cdef double node_impurity(self) nogil
# ```

# ### 2. Splitting dense matrices

# #### 2.0 `BaseDenseSplitter`

# `BaseDenseSplitter` adds members for the dense matrix X and for the corresponding presorted index matrix `X_idx_sorted`, and overrides `init` to set them up.
#
# ```Python
# 232 cdef class BaseDenseSplitter(Splitter):
# 233     cdef DTYPE_t* X
# 234     cdef SIZE_t X_sample_stride
# 235     cdef SIZE_t X_feature_stride
# 236
# 237     cdef np.ndarray X_idx_sorted
# 238     cdef INT32_t* X_idx_sorted_ptr
# 239     cdef SIZE_t X_idx_sorted_stride
# 240     cdef SIZE_t n_total_samples
# 241     cdef SIZE_t* sample_mask
# 242
# 243 #+-- 16 lines: def __cinit__(self, Criterion criterion, SIZE_t max_features,------
# 259
# 260     cdef void init(self,
# 261                    object X,
# 262                    np.ndarray[DOUBLE_t, ndim=2, mode="c"] y,
# 263                    DOUBLE_t* sample_weight,
# 264                    np.ndarray X_idx_sorted=None) except *:
# 265         """Initialize the splitter."""
# 266
# 267         # Call parent init
# 268         Splitter.init(self, X, y, sample_weight)
# 269
# 270         # Initialize X
# 271         cdef np.ndarray X_ndarray = X
# 272
# 273         self.X = X_ndarray.data
# 274 #+-- 2 lines: self.X_sample_stride = X.strides[0] / X.itemsize-
# 276
# 277         if self.presort == 1:
# 278             self.X_idx_sorted = X_idx_sorted
# 279 #+-- 4 lines: self.X_idx_sorted_ptr = self.X_idx_sorted.data----------
# 283             self.n_total_samples = X.shape[0]
# 284             safe_realloc(&self.sample_mask, self.n_total_samples)
# 285             memset(self.sample_mask, 0, self.n_total_samples*sizeof(SIZE_t))
# ```
#
# The main flow:
#
# 1. store the dense sample matrix X;
# 2. store the presorted index data `X_idx_sorted`;
# 3. zero out the sample mask `sample_mask` (a NumPy sketch of this whole bookkeeping follows).
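# Taken together, the index bookkeeping of `Splitter.init` and `BaseDenseSplitter.init` amounts to the following plain NumPy (a sketch under our own names, not sklearn's code):

# In[ ]:

import numpy as np

def init_indices(X, sample_weight=None):
    """Mimic the index set-up of Splitter.init / BaseDenseSplitter.init:
    keep only the indices of nonzero-weight samples, total up the weights,
    enumerate the features, and clear a sample mask."""
    n_samples, n_features = X.shape
    if sample_weight is None:
        samples = np.arange(n_samples)
        weighted_n_samples = float(n_samples)
    else:
        samples = np.flatnonzero(sample_weight != 0.0)
        weighted_n_samples = float(sample_weight.sum())
    features = np.arange(n_features)
    sample_mask = np.zeros(n_samples, dtype=np.intp)   # cleared, as in step 3
    return samples, weighted_n_samples, features, sample_mask

X = np.random.rand(5, 3)
w = np.array([1.0, 0.0, 2.0, 1.0, 0.0])
samples, total_w, features, mask = init_indices(X, w)
# samples -> [0, 2, 3], total_w -> 4.0, features -> [0, 1, 2]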
# #### 2.1 `BestSplitter`
#
# `BestSplitter` draws at most `max_features` features at random. For each drawn feature it sorts the node's samples by that feature, then evaluates every value in turn as a candidate threshold. Finally it keeps the split (feature and threshold) that scored best among all candidates.
#
# The work is concentrated in the `node_split` method; we walk through the code piece by piece:
#
# ```Python
# 288 cdef class BestSplitter(BaseDenseSplitter):
# 289 #+-- 8 lines: """Splitter for finding the best split."""-----------------------------------
# 297
# 298     cdef void node_split(self, double impurity, SplitRecord* split,
# 299                          SIZE_t* n_constant_features) nogil:
# 300         """Find the best split on node samples[start:end]."""
# 301 #+-- 20 lines: Find the best split----------------------------------------------------------
# 321
# ```
#
# The opening of the function is mostly local copies and initialization, so we pass over it quickly.
#
# ```Python
# 322 #+--- 4 lines: cdef SplitRecord best, current-----------------------------------------------
# 326         cdef SIZE_t f_i = n_features
# 327 #+--- 18 lines: cdef SIZE_t f_j-------------------------------------------------------------
# 345
# 346         _init_split(&best, end)
# 347
# 348         if self.presort == 1:
# 349             for p in range(start, end):
# 350                 sample_mask[samples[p]] = 1
# ```
#
# 322L-350L initialize the parameters involved in the split: 346L initializes the best split record, and 348L-350L mark this node's samples in the presort mask.
#
# Next comes the heart of the split search. Before walking through it, we need a quick look at how the `features` index array is organized.
#
# As mentioned earlier, some features take a single value within a child's subset; they are then constant and no longer splittable. To avoid wasting computation on constant features, the `features` array keeps marking the constants discovered during splitting and packs them at the front of the array. Several boundary pointers separate the constant entries from the non-constant ones, and the drawn entries from the undrawn ones. Keeping the positions and roles of these pointers in mind makes the code much easier to follow.
#
# The main pointers divide the `features` array as follows:
#
# | Interval                           | Contents                      |
# |------------------------------------|-------------------------------|
# | [0, `n_drawn_const`)               | old constants, already drawn  |
# | [`n_drawn_const`, `n_known_const`) | old constants, not yet drawn  |
# | [`n_known_const`, `n_total_const`) | new constants (drawn)         |
# | [`n_total_const`, `f_i`)           | not yet drawn                 |
# | [`f_i`, `n_features`)              | non-constant, already drawn   |
#
# Old constants are information inherited from the parent node, known before the split; new constants are discovered while drawing features at the current node.
#
# The basic procedure for picking a split feature is then:
#
# + Draw a feature `f_j` at random from [`n_drawn_const`, `f_i`).
#     + If `f_j` falls in [`n_drawn_const`, `n_known_const`), it is an old constant:
#       move it into the drawn old constants [0, `n_drawn_const`).
#     + If `f_j` falls in [`n_total_const`, `f_i`), its status is unknown; examine it further:
#         - if constant, move it into the newly found constants [`n_known_const`, `n_total_const`);
#         - if non-constant, evaluate its candidate splits, then move it into the drawn non-constants [`f_i`, `n_features`).
#
# Two points deserve explanation:
#
# 1. By construction, the draw can never land `f_j` in the newly found constants [`n_known_const`, `n_total_const`); we will see why shortly.
#
# 2. "Moving" here really means swapping two entries and then shifting a boundary. Described in words it sounds abstract, but the trick is so common in array algorithms that the code will look familiar at first sight (a sketch follows below).
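# Here is a toy Python rendering of that bookkeeping (our own sketch, using the shorthand names from the table above; sklearn inlines all of this in `node_split`):

# In[ ]:

import random

def move_to_front(arr, j, boundary):
    """The 'swap, then shift the border' trick: move arr[j] into the
    region [0, boundary) and grow that region by one entry."""
    arr[boundary], arr[j] = arr[j], arr[boundary]
    return boundary + 1

# Toy state for 6 features, regions laid out as in the table above.
features = [5, 0, 3, 1, 4, 2]
n_drawn_const = 1    # [0, 1): old constants, drawn
n_known_const = 2    # [1, 2): old constants, not yet drawn
n_found_const = 1
n_total_const = 3    # [2, 3): newly found constants
f_i = 5              # [3, 5): not yet drawn; [5, 6): non-constant, drawn

# Draw from a range shrunk by n_found_const, then jump past the
# new-constants block: f_j can never land in [n_known_const, n_total_const).
f_j = random.randrange(n_drawn_const, f_i - n_found_const)
if f_j >= n_known_const:
    f_j += n_found_const

if f_j < n_known_const:
    # an old constant was drawn: swap it into [0, n_drawn_const)
    n_drawn_const = move_to_front(features, f_j, n_drawn_const)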
# The main body of the code is as follows:
#
# ```Python
# 361         while (f_i > n_total_constants and  # Stop early if remaining features
# 362                                             # are constant
# 363                 (n_visited_features < max_features or
# 364                  # At least one drawn features must be non constant
# 365                  n_visited_features <= n_found_constants + n_drawn_constants)):
# 366
# 367             n_visited_features += 1
# 368 #+-- 12 lines: Loop invariant: elements of features in--------------------------------------
# 380             # Draw a feature at random
# 381             f_j = rand_int(n_drawn_constants, f_i - n_found_constants,
# 382                            random_state)
# 383
# 384             if f_j < n_known_constants:
# 385 #+-- 6 lines: f_j in the interval [n_drawn_constants, n_known_constants[-------------------
# 391
# 392             else:
# 393                 # f_j in the interval [n_known_constants, f_i - n_found_constants[
# 394                 f_j += n_found_constants
# 395                 # f_j in the interval [n_total_constants, f_i[
# 396                 current.feature = features[f_j]
# 397                 feature_offset = self.X_feature_stride * current.feature
# 398
# 399 #+-- 4 lines: Sort samples along that feature; either by utilizing-------------------------
# 403                 if self.presort == 1:
# 404 #+-- 9 lines: p = start--------------------------------------------------------------------
# 413                 else:
# 414 #+--- 3 lines: for i in range(start, end):--------------------------------------------------
# 417                     sort(Xf + start, samples + start, end - start)
# 418
# 419                 if Xf[end - 1] <= Xf[start] + FEATURE_THRESHOLD:
# 420 #+-- 6 lines: features[f_j] = features[n_total_constants]----------------------------------
# 426                 else:
# 427                     f_i -= 1
# 428                     features[f_i], features[f_j] = features[f_j], features[f_i]
# 429
# 430                     # Evaluate all splits
# 431                     self.criterion.reset()
# 432                     p = start
# 433
# 434                     while p < end:
# 435 #+-- 11 lines: while (p + 1 < end and-------------------------------------------------------
# 446                         current.pos = p
# 447
# 448                         # Reject if min_samples_leaf is not guaranteed
# 449 #+-- 3 lines: if (((current.pos - start) < min_samples_leaf) or----------------------------
# 452
# 453                         self.criterion.update(current.pos)
# 454
# 455                         # Reject if min_weight_leaf is not satisfied
# 456 #+-- 3 lines: if ((self.criterion.weighted_n_left < min_weight_leaf) or--------------------
# 459
# 460                         current_proxy_improvement = self.criterion.proxy_impurity_improvement()
# 461
# 462                         if current_proxy_improvement > best_proxy_improvement:
# 463 #+-- 6 lines: best_proxy_improvement = current_proxy_improvement---------------------------
# 469                             best = current  # copy
# 470
# ```
#
# The rough flow:
#
# 1. 361L-365L, the loop conditions: stop early once no undrawn features remain; otherwise keep drawing until `max_features` features have been visited, with the proviso that at least one drawn feature must be non-constant.
#
# 2. Drawing: the random draw at 381L, combined with the addition at 394L, guarantees that `f_j` only falls in the undrawn-old-constants region or the undrawn region. The figure below illustrates this; the red circles 1 and 2 make it plain at a glance.
#
# 3. 384L-391L, handling the case where `f_j` is an old constant, as described above.
#
# 4. Finally, the handling of an `f_j` of unknown status:
#     + 399L-417L, sort the samples by this feature's values;
#     + 420L-425L, `f_j` turns out to be a newly discovered constant feature;
#     + 427L-469L, sweep over the feature values, recording the best split found so far (a sketch of this sweep follows the figure).

# In[6]:

show_image("./res/BestSplitter.jpg", figsize=(10,15))
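# To see what the sweep at 427L-469L computes, here is a minimal pure-Python analogue (our sketch; a plain variance-reduction score stands in for `criterion.proxy_impurity_improvement()`): sort by the feature, then try each boundary between distinct values as a threshold.

# In[ ]:

import numpy as np

def best_threshold(x, y, min_samples_leaf=1):
    """Sweep all candidate thresholds on one feature and return the best.
    Splits are scored by negated weighted variance, a stand-in for the
    regression criterion's proxy improvement."""
    order = np.argsort(x)
    xs, ys = x[order], y[order]
    n = len(xs)
    best_score, best_thr = -np.inf, None
    for p in range(min_samples_leaf, n - min_samples_leaf + 1):
        # skip near-identical neighbours, analogous to FEATURE_THRESHOLD
        if xs[p] <= xs[p - 1] + 1e-7:
            continue
        left, right = ys[:p], ys[p:]
        score = -(len(left) * left.var() + len(right) * right.var())
        if score > best_score:
            best_score, best_thr = score, (xs[p - 1] + xs[p]) / 2.0
    return best_thr, best_score

x = np.array([0.2, 0.8, 0.5, 0.9, 0.1])
y = np.array([1.0, 3.0, 1.2, 3.1, 0.9])
thr, _ = best_threshold(x, y)   # thr -> 0.65, between the two value clusters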
# After the best split has been found, what remains is cleanup.
#
# 1. Partition the node's samples around the split point.
#
# ```Python
# 471         # Reorganize into samples[start:best.pos] + samples[best.pos:end]
# 472         if best.pos < end:
# 473             feature_offset = X_feature_stride * best.feature
# 474             partition_end = end
# 475             p = start
# 476
# 477             while p < partition_end:
# 478                 if X[X_sample_stride * samples[p] + feature_offset] <= best.threshold:
# 479                     p += 1
# 480
# 481                 else:
# 482                     partition_end -= 1
# 483
# 484                     tmp = samples[partition_end]
# 485                     samples[partition_end] = samples[p]
# 486                     samples[p] = tmp
# 487
# 488             self.criterion.reset()
# 489             self.criterion.update(best.pos)
# 490             best.improvement = self.criterion.impurity_improvement(impurity)
# 491             self.criterion.children_impurity(&best.impurity_left,
# 492                                              &best.impurity_right)
# ```
#
# 2. Copy the constant-feature information, keeping `features` and `constant_features` consistent.
#
# ```Python
# 494         # Reset sample mask
# 495 #+-- 3 lines: if self.presort == 1:---------------------------------------------------------
# 498
# 499         # Respect invariant for constant features: the original order of
# 500         # element in features[:n_known_constants] must be preserved for sibling
# 501         # and child nodes
# 502         memcpy(features, constant_features, sizeof(SIZE_t) * n_known_constants)
# 503
# 504         # Copy newly found constant features
# 505         memcpy(constant_features + n_known_constants,
# 506                features + n_known_constants,
# 507                sizeof(SIZE_t) * n_found_constants)
# ```
#
# 3. Return the results.
#
# ```Python
# 509         # Return values
# 510         split[0] = best
# 511         n_constant_features[0] = n_total_constants
# ```
#
# Appendix: the sorting algorithm used on the feature values deserves a write-up of its own, to be added when the occasion arises.

# #### 2.2 `RandomSplitter`

# `RandomSplitter` is much the same as `BestSplitter`; the essential difference is how the split threshold is found. `BestSplitter` sweeps all of a feature's values to determine the best threshold, whereas `RandomSplitter` merely finds the feature's minimum and maximum and draws a threshold uniformly at random within that range, leaving the quality of the split to chance.
#
# The differing code is shown below and needs little comment:
#
# ```Python
# 726                 # Find min, max
# 727                 min_feature_value = X[X_sample_stride * samples[start] + feature_stride]
# 728                 max_feature_value = min_feature_value
# 729                 Xf[start] = min_feature_value
# 730
# 731                 for p in range(start + 1, end):
# 732 #+-- 7 lines: current_feature_value = X[X_sample_stride * samples[p] + feature_stride]-----
# 739
# 740                 if max_feature_value <= min_feature_value + FEATURE_THRESHOLD:
# 741 #+-- 6 lines: features[f_j] = features[n_total_constants]----------------------------------
# 747                 else:
# 748                     f_i -= 1
# 749                     features[f_i], features[f_j] = features[f_j], features[f_i]
# 750
# 751                     # Draw a random threshold
# 752                     current.threshold = rand_uniform(min_feature_value,
# 753                                                      max_feature_value,
# 754                                                      random_state)
# ```

# ### 3. Splitting sparse matrices

# #### 3.0 `BaseSparseSplitter`
#
# For sparse matrices, the algorithmic ideas and flow are nearly identical to the dense case; the differences lie in how the data are looked up and partitioned. We will not go through them here, and leave the details for a future occasion when sparse matrices come up.

# ### Summary
#
# This article walked through the two splitting strategies, `BestSplitter` and `RandomSplitter`.
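# At the user level, the two splitters are selected through the `splitter` argument of the tree estimators. A quick illustrative comparison (exact scores depend on the data and the seed):

# In[ ]:

from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)

for splitter in ("best", "random"):   # BestSplitter vs RandomSplitter
    clf = DecisionTreeClassifier(splitter=splitter, random_state=0)
    print(splitter, cross_val_score(clf, X, y, cv=5).mean())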