#!/usr/bin/env python
# coding: utf-8

# #### Cross Validation

# In[77]:

# imports (the cross-validation utilities live in sklearn.model_selection;
# the old sklearn.cross_validation module has been removed)
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, KFold, train_test_split, cross_val_predict, LeaveOneOut, LeavePOut
from sklearn.model_selection import ShuffleSplit, StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from scipy.stats import sem
import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# In[2]:

iris = load_iris()
X, y = iris.data, iris.target


# In[5]:

# splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)
print(X_train.shape, X_test.shape, X_train.shape[0])


# cross_val_score uses the KFold or StratifiedKFold strategies by default

# In[49]:

# define a cross-validation scoring helper
def xVal_score(clf, X, y, K):
    # create the K folds using KFold
    cv = KFold(n_splits=K, shuffle=True, random_state=27)
    # Can use ShuffleSplit as well
    # cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
    # run cross-validation
    scores = cross_val_score(clf, X, y, cv=cv)
    print(scores)
    print("Accuracy Mean : %0.3f" % np.mean(scores))
    print("Std (2x) :", np.std(scores) * 2)
    print("Standard Err : +/- {0:0.3f}".format(sem(scores)))


# In[50]:

svc1 = SVC()
xVal_score(svc1, X_train, y_train, 10)


# In[53]:

# define a cross-validation prediction helper
# The function cross_val_predict has a similar interface to cross_val_score, but returns,
# for each element in the input, the prediction that was obtained for that element when it
# was in the test set. Only cross-validation strategies that assign all elements to a test
# set exactly once can be used (otherwise, an exception is raised).
def xVal_predict(clf, X, y, K):
    # create the K folds using KFold
    cv = KFold(n_splits=K, shuffle=True, random_state=27)
    # Can use ShuffleSplit as well
    # cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
    # run cross-validated prediction
    predicted = cross_val_predict(clf, X, y, cv=cv)
    print(predicted)
    print("Accuracy Score : %0.3f" % accuracy_score(y, predicted))


# In[54]:

xVal_predict(svc1, X_train, y_train, 10)


# **Cross Validation Iterators**
# 
# **K-Fold** - KFold divides all the samples into k groups of samples, called folds (if k = n, this is equivalent to the Leave One Out strategy), of equal size (if possible). The prediction function is learned using k - 1 folds, and the fold left out is used for testing.

# In[68]:

X = [1, 2, 3, 4, 5]
kf = KFold(n_splits=2)
print(kf)
for train_index, test_index in kf.split(X):
    print(train_index, test_index)


# **Leave One Out (LOO)** - LeaveOneOut (or LOO) is a simple cross-validation. Each learning set is created by taking all the samples except one, the test set being the sample left out. Thus, for n samples, we have n different training sets and n different test sets. This cross-validation procedure does not waste much data, as only one sample is removed from the training set.

# In[70]:

X = [1, 2, 3, 4, 5]
loo = LeaveOneOut()
print(loo)
for train_index, test_index in loo.split(X):
    print(train_index, test_index)


# **Leave P Out (LPO)** - LeavePOut is very similar to LeaveOneOut as it creates all the possible training/test sets by removing p samples from the complete set. For n samples, this produces $\binom{n}{p}$ train/test pairs. Unlike LeaveOneOut and KFold, the test sets will overlap for p > 1.

# In[72]:

X = [1, 2, 3, 4, 5]
lpo = LeavePOut(p=3)
print(lpo)
for train_index, test_index in lpo.split(X):
    print(train_index, test_index)
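# As a quick sanity check of the $\binom{n}{p}$ count, the cell below is a minimal
# illustrative sketch (it assumes Python 3.8+ for math.comb): for 5 samples and p = 3,
# LeavePOut should yield C(5, 3) = 10 train/test pairs.

# In[ ]:

from math import comb

X = [1, 2, 3, 4, 5]
lpo = LeavePOut(p=3)
# get_n_splits reports how many train/test pairs the iterator will generate
print(lpo.get_n_splits(X))   # 10
print(comb(len(X), 3))       # 10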
# **Random permutations cross-validation a.k.a. Shuffle & Split** - The ShuffleSplit iterator will generate a user-defined number of independent train/test dataset splits. Samples are first shuffled and then split into a pair of train and test sets.
# 
# It is possible to control the randomness for reproducibility of the results by explicitly seeding the random_state pseudo random number generator.

# In[76]:

X = [1, 2, 3, 4, 5]
ss = ShuffleSplit(n_splits=3, test_size=0.4, random_state=0)
print(ss)
for train_index, test_index in ss.split(X):
    print(train_index, test_index)


# Some classification problems can exhibit a large imbalance in the distribution of the target classes: for instance there could be several times more negative samples than positive samples. In such cases it is recommended to use stratified sampling as implemented in StratifiedKFold and StratifiedShuffleSplit to ensure that relative class frequencies are approximately preserved in each train and validation fold.
# 
# **Stratified k-fold**
# 
# StratifiedKFold is a variation of k-fold which returns stratified folds: each set contains approximately the same percentage of samples of each target class as the complete set.

# In[91]:

X = np.ones(10)
y = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
skf = StratifiedKFold(n_splits=4)
for train_index, test_index in skf.split(X, y):
    print(train_index, test_index)
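# **Stratified Shuffle Split** - StratifiedShuffleSplit (imported above but not used yet) is a merge of ShuffleSplit and StratifiedKFold: it returns a user-defined number of randomized splits in which each split preserves the class percentages of y. The cell below is a minimal illustrative sketch; the n_splits, test_size and random_state values are arbitrary choices.

# In[ ]:

X = np.ones(10)
y = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
for train_index, test_index in sss.split(X, y):
    # each test set keeps roughly the 40/60 class balance of y
    print("TRAIN:", train_index, "TEST:", test_index)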