from IPython.core.display import HTML

print("Setting custom CSS for the IPython Notebook")
styles = open('custom.css', 'r').read()
HTML(styles)

import scipy
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("talk")
sns.set_style("white")
import pandas as pd
from matplotlib.colors import ListedColormap
import sklearn
import sklearn.cross_validation  # used below for train_test_split
import sklearn.decomposition
import sklearn.metrics           # used below for accuracy and confusion matrices
import sklearn.neighbors
import sklearn.datasets

cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])

# generate plots for later

# ML in 10 minutes plot
# cheat to get the same "random" numbers
np.random.seed(seed=99)

# make some data up
mean1 = [3, 3]
mean2 = [8, 8]
cov = [[1.0, 0.0], [0.0, 1.0]]

# create some points
x1 = np.random.multivariate_normal(mean1, cov, 50)
x2 = np.random.multivariate_normal(mean2, cov, 50)

plt.figure()  # start a fresh figure for each plot
plt.scatter(x1[:,0], x1[:,1], c='r', s=100)
plt.scatter(x2[:,0], x2[:,1], c='b', s=100)
plt.plot([2, 10], [10, 1], c='g', linewidth=5.0)
plt.title("ML in One Picture")
plt.xlabel("feature 1")
plt.ylabel("feature 2")
fig_ml_in_10 = plt.gcf()

# features matter plot
# cheat to get the same "random" numbers
np.random.seed(seed=99)

# make some data up: identical means, so these features cannot separate the classes
mean1 = [6, 6]
mean2 = [6, 6]
cov = [[1.0, 0.0], [0.0, 1.0]]

# create some points
x1 = np.random.multivariate_normal(mean1, cov, 50)
x2 = np.random.multivariate_normal(mean2, cov, 50)

plt.figure()
plt.scatter(x1[:,0], x1[:,1], c='r', s=100)
plt.scatter(x2[:,0], x2[:,1], c='b', s=100)
plt.plot([2, 10], [10, 1], c='g', linewidth=5.0)
plt.title("Apples and Oranges")
plt.xlabel("Roundness")
plt.ylabel("Weight")
fig_features_matter_confused = plt.gcf()

# cheat to get the same "random" numbers
np.random.seed(seed=42)

# make some data up: well-separated means, so these features work
mean1 = [5, 5]
mean2 = [11, 5]
cov = [[1.0, 0.0], [0.0, 1.0]]

# create some points
x1 = np.random.multivariate_normal(mean1, cov, 50)
x2 = np.random.multivariate_normal(mean2, cov, 50)

plt.figure()
plt.scatter(x1[:,0], x1[:,1], c='r', s=100)
plt.scatter(x2[:,0], x2[:,1], c='b', s=100)
plt.plot([8, 8], [2, 10], c='g', linewidth=5.0)
plt.title("Apples and Oranges")
plt.xlabel("Color")
plt.ylabel("Shape")
fig_features_matter_separated = plt.gcf()

# cheat to get the same "random" numbers
np.random.seed(seed=99)

# make some data up
mean1 = [3, 3]
mean2 = [8, 8]
cov = [[1.0, 0.0], [0.0, 1.0]]

# create some points
x1 = np.random.multivariate_normal(mean1, cov, 50)
x2 = np.random.multivariate_normal(mean2, cov, 50)

plt.figure()
plt.scatter(x1[:,0], x1[:,1], c='r', s=100)
plt.scatter(x2[:,0], x2[:,1], c='b', s=100)
plt.scatter([6], [9], c='k', s=200)  # a new, unlabeled point
plt.plot([2, 10], [10, 1], c='g', linewidth=5.0)
plt.title("ML in One Picture")
plt.xlabel("feature 1")
plt.ylabel("feature 2")
fig_prediction = plt.gcf()

# re-display the figures (notebook cell outputs)
fig_ml_in_10
fig_features_matter_confused
fig_features_matter_separated
fig_prediction

# get the data
x = np.concatenate((x1, x2))
# get the labels: class 0 for x1, class 1 for x2
ones = np.ones((50,))
y = np.concatenate((ones * 0, ones))

k = 1  # number of neighbors
classifier = sklearn.neighbors.KNeighborsClassifier(n_neighbors=k)
classifier.fit(x, y)

newPoint_1 = np.array([6, 9])
newPoint_2 = np.array([2, 4])
print classifier.predict([newPoint_1, newPoint_2])

fig_ml_in_10
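# A quick sanity check on the predictions above (a sketch, not part of the
# original notebook): kneighbors returns the distances to, and indices of,
# the k nearest training points, so for k=1 the neighbor's label *is* the
# prediction.
distances, indices = classifier.kneighbors([newPoint_1, newPoint_2])
print distances   # distance from each new point to its single nearest neighbor
print y[indices]  # that neighbor's label, matching predict() for k=1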
# code based on an sklearn example
def visualize_decision_boundary(classifier, xmin, xmax, ymin, ymax,
                                step_size=0.02, cmap=cmap_light):
    # evaluate the classifier at every point of a dense grid and
    # color each grid cell by the predicted class
    xx, yy = np.meshgrid(np.arange(xmin, xmax, step_size),
                         np.arange(ymin, ymax, step_size))
    colors = classifier.predict(np.c_[xx.ravel(), yy.ravel()])
    colors = colors.reshape(xx.shape)
    ax = plt.subplot(111)
    ax.pcolormesh(xx, yy, colors, cmap=cmap)
    return ax

# a toy illustration of pcolormesh: color a coarse grid at random
xx, yy = np.meshgrid(np.arange(0, 10, 2),
                     np.arange(0, 10, 2))
plt.figure()
plt.scatter(xx, yy, c='b', zorder=2)
random_colors = np.random.randint(10, size=25)
random_colors = random_colors.reshape(xx.shape)
plt.pcolormesh(xx, yy, random_colors, cmap=plt.cm.Pastel2, zorder=1)

# now visualize the boundary
plt.figure()
boundary_vis = visualize_decision_boundary(classifier=classifier,
                                           xmin=-2, xmax=12,
                                           ymin=0, ymax=12,
                                           step_size=0.02)
# and plot our data points
boundary_vis.scatter(x1[:,0], x1[:,1], c='r', s=100)
boundary_vis.scatter(x2[:,0], x2[:,1], c='b', s=100)
plt.show()

data = sklearn.datasets.fetch_mldata('MNIST original')
print data.keys()

# keep every 50th example so everything runs quickly
x = data['data'][::50,:]
y = data['target'][::50]
print x.shape, y.shape

# shuffle the data
randomized_data = np.random.permutation(x)

# tile a rows x cols sample of digits into one image
rows = 5
cols = 10
img = []
for col in xrange(rows, rows * cols + 1, rows):
    examples = randomized_data[col - rows:col,:]
    col_img = np.reshape(examples, (28 * rows, 28))  # stack `rows` digits vertically
    img.append(col_img)
plt.figure()
plt.imshow(np.hstack(img))
plt.show()

data = sklearn.datasets.fetch_mldata('MNIST original')  # cached after the first fetch
x = data['data'][::50,:]
y = data['target'][::50]

svd = sklearn.decomposition.TruncatedSVD(n_components=2)
x = x - np.mean(x, axis=0)  # center the data so TruncatedSVD acts like PCA
#x = x / (np.std(x, axis=0) + 0.00001)
x_2d = svd.fit_transform(x)

plt.figure()
plt.scatter(x_2d[:,0], x_2d[:,1], c=y, s=50, cmap=plt.cm.Paired)
plt.colorbar()
plt.show()

# keep only the 0s and 8s to get a two-class problem
x_sub_2d = x_2d[np.logical_or(y == 8, y == 0),:]
y_sub = y[np.logical_or(y == 8, y == 0)]
plt.figure()
plt.scatter(x_sub_2d[:,0], x_sub_2d[:,1], c=y_sub, s=50, cmap=plt.cm.Paired)
plt.show()

k = 1
knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors=k, weights='uniform')
knn.fit(x_sub_2d, y_sub)

plt.figure()
boundary_vis = visualize_decision_boundary(classifier=knn,
                                           xmin=500, xmax=4000,
                                           ymin=-2500, ymax=1500,
                                           step_size=20.0, cmap=plt.cm.Paired)
# and plot our data points
boundary_vis.scatter(x_sub_2d[:,0], x_sub_2d[:,1], c=y_sub, s=10,
                     linewidth=1, cmap=plt.cm.Paired)
fig_knn = plt.gcf()

k = 20
knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors=k, weights='uniform')
knn.fit(x_sub_2d, y_sub)
y_hat = knn.predict(x_sub_2d)
confusion_matrix = sklearn.metrics.confusion_matrix(y_sub, y_hat)
print confusion_matrix

# show the confusion matrix as a grayscale image
def show_confusion_matrix(confusion_matrix):
    plt.matshow(confusion_matrix, cmap=plt.cm.Greys_r)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

show_confusion_matrix(confusion_matrix)

# now all ten digit classes, using the full 784-dimensional features
k = 20
knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors=k, weights='uniform')
knn.fit(x, y)
y_hat = knn.predict(x)
confusion_matrix = sklearn.metrics.confusion_matrix(y, y_hat)
show_confusion_matrix(confusion_matrix)
print confusion_matrix

print sklearn.metrics.accuracy_score(y, y_hat, normalize=True)

def accuracy_for_k(k, x, y):
    # training accuracy: fit on (x, y) and evaluate on the same data
    knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors=k, weights='uniform')
    knn.fit(x, y)
    y_hat = knn.predict(x)
    return sklearn.metrics.accuracy_score(y, y_hat, normalize=True)

## Why is this code inefficient?
acc_values = []
k_values = xrange(1, 50, 2)
for k in k_values:
    acc_values.append(accuracy_for_k(k=k, x=x_sub_2d, y=y_sub))

plt.figure()
plt.plot(k_values, acc_values)
plt.xlabel('Value of k')
plt.ylabel('Accuracy')
fig_train_acc = plt.gcf()

fig_train_acc
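# One answer to the question above: every call to accuracy_for_k refits the
# classifier and redoes the neighbor search from scratch, even though the
# neighbors for k=49 already contain the neighbors for every smaller k.
# A sketch of the cheaper approach (assumption: a majority vote over
# precomputed neighbors reproduces KNeighborsClassifier with uniform
# weights; ties cannot occur here since every k is odd):
nn = sklearn.neighbors.NearestNeighbors(n_neighbors=49)
nn.fit(x_sub_2d)
neighbor_idx = nn.kneighbors(x_sub_2d, return_distance=False)
neighbor_labels = y_sub[neighbor_idx]          # shape (n_samples, 49)
for k in xrange(1, 50, 2):
    votes = np.mean(neighbor_labels[:, :k] == 8, axis=1)
    y_hat = np.where(votes > 0.5, 8.0, 0.0)    # majority vote among the first k
    print k, np.mean(y_hat == y_sub)           # training accuracy, no refitting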
def accuracy_for_k_val(k, x, y, random_state=99):
    # hold out a third of the data and measure accuracy on the held-out part
    split_data = sklearn.cross_validation.train_test_split(
        x, y, test_size=0.33, random_state=random_state)
    x_train, x_val, y_train, y_val = split_data
    knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors=k, weights='uniform')
    knn.fit(x_train, y_train)
    y_hat = knn.predict(x_val)
    return sklearn.metrics.accuracy_score(y_val, y_hat, normalize=True)

acc_values_val = []
acc_values_train = []
k_values = xrange(1, 150, 2)
for k in k_values:
    acc_values_train.append(accuracy_for_k(k=k, x=x_sub_2d, y=y_sub))
    acc_values_val.append(accuracy_for_k_val(k=k, x=x_sub_2d, y=y_sub))

plt.figure()
plt.plot(k_values, acc_values_train, c='r', label="training accuracy")
plt.plot(k_values, acc_values_val, c='b', label="validation accuracy")
plt.xlabel('Value of k')
plt.ylabel('Accuracy')
plt.legend()
fig_cross_val = plt.gcf()

fig_knn
fig_cross_val

# how sensitive is validation accuracy to the particular random split?
k_values = range(1, 150, 10)
np.random.seed(seed=99)
random_seeds = np.random.randint(1000, size=50)
values = np.zeros((len(k_values), random_seeds.shape[0]))
for c_k, k in enumerate(k_values):
    for c_rs, rs in enumerate(random_seeds):
        values[c_k, c_rs] = accuracy_for_k_val(k=k, x=x_sub_2d, y=y_sub,
                                               random_state=rs)

# note: the x axis below is the index into k_values, not k itself
plt.figure()
sns.tsplot(values.T)
plt.xlabel('k')
plt.ylabel('accuracy')
plt.show()

# the same values matrix, shown as box plots
plt.figure()
sns.boxplot(values.T)
plt.xlabel('k')
plt.ylabel('accuracy')
plt.show()
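# The plots above estimate the spread over repeated random splits by hand;
# a sketch of the same idea using the library's k-fold routine instead
# (assumption: 10 folds and accuracy scoring are reasonable choices here):
mean_acc = []
for k in k_values:
    knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors=k, weights='uniform')
    scores = sklearn.cross_validation.cross_val_score(knn, x_sub_2d, y_sub,
                                                      cv=10, scoring='accuracy')
    mean_acc.append(scores.mean())  # average accuracy over the 10 folds
plt.figure()
plt.plot(k_values, mean_acc)
plt.xlabel('k')
plt.ylabel('mean 10-fold CV accuracy')
plt.show()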