#!/usr/bin/env python
# coding: utf-8

# Notebook export: compare SVC vs GaussianNB on iris data augmented with
# random synthetic samples, then use a bootstrap test on the accuracy gap.

# In[1]:

import random

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.naive_bayes import GaussianNB

# In[2]:

iris = datasets.load_iris()
X, y = iris.data, iris.target
print(X.shape, y.shape)

# In[3]:

def simulate(X, y, n=100):
    """Generate *n* synthetic samples uniformly over the observed feature ranges.

    For each feature, values are drawn from a 30-point grid spanning that
    feature's [min, max] in X and rounded to one decimal; labels are drawn
    uniformly from the distinct labels in y.

    Returns (X_new, y_new) as numpy arrays of shape (n, X.shape[1]) and (n,).
    """
    Xmax = np.max(X, axis=0)
    Xmin = np.min(X, axis=0)
    # Hoist loop invariants: per-feature grids and the label pool are the
    # same for every synthetic sample (original rebuilt them each iteration).
    grids = [np.linspace(Xmin[i], Xmax[i], 30) for i in range(X.shape[1])]
    labels = list(set(y))
    X_new, y_new = [], []
    for _ in range(n):
        X_new.append([round(np.random.choice(g), 1) for g in grids])
        y_new.append(random.choice(labels))
    return np.array(X_new), np.array(y_new)

# In[4]:

X_new, y_new = simulate(X, y, 1000)

# In[5]:

X_all = np.concatenate((X, X_new), axis=0)
y_all = np.concatenate((y, y_new), axis=0)
print(X_all.shape, y_all.shape)

# In[6]:

X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

# In[7]:

m1 = svm.SVC(gamma='auto')
m2 = GaussianNB()
m1.fit(X_train, y_train)
m2.fit(X_train, y_train)
m1.score(X_test, y_test), m2.score(X_test, y_test)

# In[8]:

def bootstrap(test, m1, m2, b=10, n=10, delta=0.05):
    """Bootstrap test of whether m1's accuracy beats m2's by more than 2*delta.

    Draws *b* resamples of size *n* (with replacement) from the test set,
    computes the accuracy difference m1 - m2 on each, and returns the
    proportion of resamples where the difference exceeds 2*delta.

    Parameters:
        test: tuple (X_test, y_test).
        m1, m2: fitted estimators with a .score(X, y) method.
        b: number of bootstrap resamples.
        n: size of each resample.
        delta: one-sided significance level (threshold is 2*delta).

    Returns:
        float in [0, 1] — the empirical proportion of exceedances.
    """
    X_test, y_test = test
    diffs = []
    for _ in range(b):
        idx = random.choices(range(X_test.shape[0]), k=n)
        Xb = X_test[idx, :]
        yb = y_test[idx]
        diffs.append(m1.score(Xb, yb) - m2.score(Xb, yb))
    # BUG FIX: the original summed the difference *values* above the
    # threshold (sum(d for d in diffs if d > 2*delta) / b), which is not a
    # proportion. The comments below interpret the result as the fraction
    # of resamples exceeding 2*delta, so we count exceedances instead.
    return sum(1 for d in diffs if d > 2 * delta) / b

# Backward-compatible alias preserving the original (misspelled) name.
boostrap = bootstrap

# In[9]:

bootstrap((X_test, y_test), m1, m2, 30, 10, 0.025)
# Cannot reject the null hypothesis: at two-sided significance 0.05,
# m1 is not better than m2 — so we conclude m1 is not better than m2.

# In[10]:

bootstrap((X_test, y_test), m1, m2, 30, 10, 0.05)
# Reject the null hypothesis: at two-sided significance 0.1,
# "m1 is not better than m2" is rejected — so m1 is better than m2.

# In[ ]: