import random
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
iris = datasets.load_iris()
X, y = iris.data, iris.target
print(X.shape, y.shape)
(150, 4) (150,)
def simulate(X, y, n=100):
X_new, y_new = [], []
Xmax = np.max(X, axis=0)
Xmin = np.min(X, axis=0)
for t in range(n):
tmp = []
for i in range(X.shape[1]):
rg = np.linspace(Xmin[i], Xmax[i], 30)
choice = round(np.random.choice(rg), 1)
tmp.append(choice)
X_new.append(tmp)
y_new.append(random.choice(list(set(y))))
return np.array(X_new), np.array(y_new)
X_new, y_new = simulate(X, y, 1000)
X_all = np.concatenate((X, X_new), axis=0)
y_all = np.concatenate((y, y_new), axis=0)
print(X_all.shape, y_all.shape)
(1150, 4) (1150,)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((920, 4), (920,), (230, 4), (230,))
m1 = svm.SVC(gamma='auto')
m2 = GaussianNB()
m1.fit(X_train, y_train);
m2.fit(X_train, y_train);
m1.score(X_test, y_test), m2.score(X_test, y_test)
(0.3869565217391304, 0.30434782608695654)
def boostrap(test, m1, m2, b=10, n=10, delta=0.05):
ds = []
X_test, y_test = test
for i in range(b):
choices = random.choices(range(X_test.shape[0]), k=n)
_X_test = X_test[choices, :]
_y_test = y_test[choices]
dm1 = m1.score(_X_test, _y_test)
dm2 = m2.score(_X_test, _y_test)
d = dm1 - dm2
ds.append(d)
p = sum([_ for _ in ds if _ > 2*delta])/b
return p
boostrap((X_test, y_test), m1, m2, 30, 10, 0.025)
# 不能拒绝原假设:m1 在双边置信度 0.05 下不比 m2 好,所以 m1 不比 m2 好
0.1166666666666667
boostrap((X_test, y_test), m1, m2, 30, 10, 0.05)
# 拒绝原假设:m1 在双边置信度 0.1 下不比 m2 好,所以 m1 比 m2 好
0.08666666666666666