#!/usr/bin/env python
# coding: utf-8

# ## Overview

# - How to use K-fold cross-validation to search for optimal tuning parameters
# - How to make the parameter search more efficient
# - How to search multiple tuning parameters at once
# - What to do with the tuning parameters before making real predictions
# - How to reduce the computational expense of the process

# ## 1. Review of K-fold cross-validation

# **The cross-validation process**
# - Choose a value for K (typically 10) and split the dataset into K equal folds
# - Train the model on K-1 of the folds and evaluate it on the remaining fold
# - Measure the model's predictive performance with an evaluation metric
# - Repeat K times, so that every fold serves as the test set exactly once, and average the K scores
#
# **Advantages of cross-validation**
# - Cross-validation gives a more stable estimate of model performance, because it reduces the variance that comes from a single train/test split
# - Cross-validation can be used for selecting tuning parameters, comparing models, and selecting features
#
# **Disadvantages of cross-validation**
# - Cross-validation adds computational expense, especially on large datasets, so the process can become slow

# ## 2. Efficient parameter tuning with GridSearchCV

# GridSearchCV runs cross-validation automatically for the model you supply, tracking the score for every candidate parameter value; in effect, it replaces the for loop you would otherwise write for a parameter search (a version of that loop is sketched after Section 4).

# In[1]:

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
# GridSearchCV lives in sklearn.model_selection
# (the old sklearn.grid_search module was removed in scikit-learn 0.20)
from sklearn.model_selection import GridSearchCV

# In[2]:

# read in the iris data
iris = load_iris()

# create X (features) and y (response)
X = iris.data
y = iris.target

# In[3]:

# define the parameter values that should be searched
k_range = list(range(1, 31))
print(k_range)

# In[4]:

# create a parameter grid: map the parameter names to the values that should be searched
# the parameter grid is a dict whose keys are parameter names and whose
# values are lists of candidate values
param_grid = dict(n_neighbors=k_range)
print(param_grid)

# In[5]:

knn = KNeighborsClassifier(n_neighbors=5)

# instantiate the grid
# GridSearchCV takes much the same arguments as cross_val_score, plus param_grid
# setting n_jobs=-1 enables parallel computation (if your machine supports it)
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
# this grid search runs 10-fold cross-validation for each of the 30 candidate
# parameter values, i.e. 300 model fits in total

# In[6]:

grid.fit(X, y)

# In[7]:

# view the complete results
# (cv_results_ is a dict of arrays; it replaced the old grid_scores_ attribute)
grid.cv_results_

# In[8]:

# examine the first candidate: its parameters, its 10 fold scores, and their mean
print(grid.cv_results_['params'][0])
print([grid.cv_results_['split%d_test_score' % i][0] for i in range(10)])
print(grid.cv_results_['mean_test_score'][0])

# In[9]:

# create an array of the mean scores only
grid_mean_scores = grid.cv_results_['mean_test_score']
print(grid_mean_scores)

# In[10]:

# plot the results
plt.plot(k_range, grid_mean_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')

# In[11]:

# examine the best model
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

# ## 3. Searching multiple parameters simultaneously

# Here we search two KNN parameters, n_neighbors and weights. The weights parameter defaults to 'uniform', which weights all neighbors equally; the alternative value 'distance' gives closer neighbors more weight and more distant neighbors less.

# In[12]:

# define the parameter values that should be searched
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']

# In[13]:

# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_neighbors=k_range, weights=weight_options)
print(param_grid)

# In[14]:

# instantiate and fit the grid
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
grid.fit(X, y)

# In[15]:

# view the complete results
grid.cv_results_

# In[16]:

# examine the best model
print(grid.best_score_)
print(grid.best_params_)

# ## 4. Making predictions with the best parameters

# In[17]:

# train your model using all data and the best known parameters
knn = KNeighborsClassifier(n_neighbors=13, weights='uniform')
knn.fit(X, y)

# make a prediction on out-of-sample data
# (predict expects a 2D array, hence the nested list)
knn.predict([[3, 5, 4, 2]])

# **Here we retrain the model with the best parameters found earlier. At this stage, all of the data can be used for training, so that no observations are wasted.**

# In[18]:

# shortcut: GridSearchCV automatically refits the best model using all of the data
grid.predict([[3, 5, 4, 2]])
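# **Aside (not in the original notebook): a minimal sketch of the for loop that GridSearchCV replaces, mentioned in Section 2. It redoes the single-parameter search by hand with cross_val_score: 10-fold cross-validation for each of the 30 candidate values of n_neighbors, 300 model fits in total.**

# In[ ]:

from sklearn.model_selection import cross_val_score

# hand-rolled search over n_neighbors, for illustration only
k_scores = []
for k in k_range:
    knn_k = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn_k, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())

# report the best mean score and the k that produced it
print(max(k_scores), k_range[k_scores.index(max(k_scores))])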
# ## 5. Reducing computational expense with RandomizedSearchCV

# - RandomizedSearchCV addresses the problem that searching many parameters at once can be computationally expensive
# - RandomizedSearchCV searches only a subset of the parameter combinations, so you can control the computational budget

# ![](Image/grid_vs_random.jpeg)

# In[19]:

# RandomizedSearchCV also moved from sklearn.grid_search to sklearn.model_selection
from sklearn.model_selection import RandomizedSearchCV

# In[20]:

# specify "parameter distributions" rather than a "parameter grid"
param_dist = dict(n_neighbors=k_range, weights=weight_options)

# In[21]:

# n_iter controls the number of parameter settings that are sampled
rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10, random_state=5)
rand.fit(X, y)
rand.cv_results_

# In[22]:

# examine the best model
print(rand.best_score_)
print(rand.best_params_)

# In[23]:

# run RandomizedSearchCV 20 times (with n_iter=10) and record the best score each time
best_scores = []
for _ in range(20):
    rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10)
    rand.fit(X, y)
    best_scores.append(round(rand.best_score_, 3))
print(best_scores)

# **When a tuning parameter is continuous, such as the regularization parameter in a regression problem, it is important to specify a continuous distribution rather than a list of possible values, so that RandomizedSearchCV can explore the parameter space more effectively (a minimal sketch follows the references).**

# ## References

# - scikit-learn documentation: [Grid search](http://scikit-learn.org/stable/modules/grid_search.html), [GridSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html), [RandomizedSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.RandomizedSearchCV.html)
# - Timed example: [Comparing randomized search and grid search](http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html)
# - scikit-learn workshop by Andreas Mueller: [Video segment on randomized search (3 minutes)](https://www.youtube.com/watch?v=0wUF_Ov8b0A&feature=youtu.be&t=17m38s), [related notebook](http://nbviewer.ipython.org/github/amueller/pydata-nyc-advanced-sklearn/blob/master/Chapter%203%20-%20Randomized%20Hyper%20Parameter%20Search.ipynb)
# - Paper by James Bergstra and Yoshua Bengio: [Random Search for Hyper-Parameter Optimization](http://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf)
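# **A minimal sketch of the continuous-distribution note above (not in the original notebook). The choices of logistic regression, its regularization parameter C, and scipy.stats.uniform are illustrative assumptions, not part of the lesson.**

# In[ ]:

from scipy.stats import uniform
from sklearn.linear_model import LogisticRegression

# sample C from a continuous uniform distribution on [0.01, 10.01]
# instead of enumerating a list of candidate values
param_dist = dict(C=uniform(loc=0.01, scale=10))
logreg = LogisticRegression(solver='liblinear')
rand = RandomizedSearchCV(logreg, param_dist, cv=10, scoring='accuracy',
                          n_iter=10, random_state=5)
rand.fit(X, y)
print(rand.best_score_)
print(rand.best_params_)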