#!/usr/bin/env python
# coding: utf-8

# Notebook export: compare SVC vs GaussianNB on iris data augmented with
# random synthetic samples, then use a bootstrap test on the accuracy gap.

# In[1]:

import random

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.naive_bayes import GaussianNB

# In[2]:

iris = datasets.load_iris()
X, y = iris.data, iris.target
print(X.shape, y.shape)

# In[3]:

def simulate(X, y, n=100):
    """Generate *n* synthetic samples uniformly over the observed feature ranges.

    For each feature, values are drawn from a 30-point grid spanning that
    feature's [min, max] in X and rounded to one decimal; labels are drawn
    uniformly from the distinct labels in y.

    Returns (X_new, y_new) as numpy arrays of shape (n, X.shape[1]) and (n,).
    """
    Xmax = np.max(X, axis=0)
    Xmin = np.min(X, axis=0)
    # Hoist loop invariants: per-feature grids and the label pool are the
    # same for every synthetic sample (original rebuilt them each iteration).
    grids = [np.linspace(Xmin[i], Xmax[i], 30) for i in range(X.shape[1])]
    labels = list(set(y))
    X_new, y_new = [], []
    for _ in range(n):
        X_new.append([round(np.random.choice(g), 1) for g in grids])
        y_new.append(random.choice(labels))
    return np.array(X_new), np.array(y_new)

# In[4]:

X_new, y_new = simulate(X, y, 1000)

# In[5]:

X_all = np.concatenate((X, X_new), axis=0)
y_all = np.concatenate((y, y_new), axis=0)
print(X_all.shape, y_all.shape)

# In[6]:

X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

# In[7]:

m1 = svm.SVC(gamma='auto')
m2 = GaussianNB()
m1.fit(X_train, y_train)
m2.fit(X_train, y_train)
m1.score(X_test, y_test), m2.score(X_test, y_test)

# In[8]:

def bootstrap(test, m1, m2, b=10, n=10, delta=0.05):
    """Bootstrap test of whether m1's accuracy beats m2's by more than 2*delta.

    Draws *b* resamples of size *n* (with replacement) from the test set,
    computes the accuracy difference m1 - m2 on each, and returns the
    proportion of resamples where the difference exceeds 2*delta.

    Parameters:
        test: tuple (X_test, y_test).
        m1, m2: fitted estimators with a .score(X, y) method.
        b: number of bootstrap resamples.
        n: size of each resample.
        delta: one-sided significance level (threshold is 2*delta).

    Returns:
        float in [0, 1] — the empirical proportion of exceedances.
    """
    X_test, y_test = test
    diffs = []
    for _ in range(b):
        idx = random.choices(range(X_test.shape[0]), k=n)
        Xb = X_test[idx, :]
        yb = y_test[idx]
        diffs.append(m1.score(Xb, yb) - m2.score(Xb, yb))
    # BUG FIX: the original summed the difference *values* above the
    # threshold (sum(d for d in diffs if d > 2*delta) / b), which is not a
    # proportion. The comments below interpret the result as the fraction
    # of resamples exceeding 2*delta, so we count exceedances instead.
    return sum(1 for d in diffs if d > 2 * delta) / b

# Backward-compatible alias preserving the original (misspelled) name.
boostrap = bootstrap

# In[9]:

bootstrap((X_test, y_test), m1, m2, 30, 10, 0.025)
# Cannot reject the null hypothesis: at two-sided significance 0.05,
# m1 is not better than m2 — so we conclude m1 is not better than m2.

# In[10]:

bootstrap((X_test, y_test), m1, m2, 30, 10, 0.05)
# Reject the null hypothesis: at two-sided significance 0.1,
# "m1 is not better than m2" is rejected — so m1 is better than m2.

# In[ ]: