%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import brewer2mpl
from matplotlib import rcParams

#colorbrewer2 Dark2 qualitative color table
dark2_cmap = brewer2mpl.get_map('Dark2', 'Qualitative', 7)
dark2_colors = dark2_cmap.mpl_colors

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'


def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecesasry plot borders and axis ticks
    
    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)
    
    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    
    #now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()
        
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
import warnings
warnings.filterwarnings('ignore', message='Polyfit*')

import random
import copy
def scatter_by(df, scatterx, scattery, by=None, figure=None, axes=None, colorscale=dark2_cmap, labeler={}, mfunc=None, setupfunc=None, mms=8):
    cs=copy.deepcopy(colorscale.mpl_colors)
    if not figure:
        figure=plt.figure(figsize=(8,8))
    if not axes:
        axes=figure.gca()
    x=df[scatterx]
    y=df[scattery]
    if not by:
        col=random.choice(cs)
        axes.scatter(x, y, cmap=colorscale, c=col)
        if setupfunc:
            axeslist=setupfunc(axes, figure)
        else:
            axeslist=[axes]
        if mfunc:
            mfunc(axeslist,x,y,color=col, mms=mms)
    else:
        cs=list(np.linspace(0,1,len(df.groupby(by))))
        xlimsd={}
        ylimsd={}
        xs={}
        ys={}
        cold={}
        for k,g in df.groupby(by):
            col=cs.pop()
            x=g[scatterx]
            y=g[scattery]
            xs[k]=x
            ys[k]=y
            c=colorscale.mpl_colormap(col)
            cold[k]=c
            axes.scatter(x, y, c=c, label=labeler.get(k,k), s=40, alpha=0.3);
            xlimsd[k]=axes.get_xlim()
            ylimsd[k]=axes.get_ylim()
        xlims=[min([xlimsd[k][0] for k in xlimsd.keys()]), max([xlimsd[k][1] for k in xlimsd.keys()])]
        ylims=[min([ylimsd[k][0] for k in ylimsd.keys()]), max([ylimsd[k][1] for k in ylimsd.keys()])]
        axes.set_xlim(xlims)
        axes.set_ylim(ylims)
        if setupfunc:
            axeslist=setupfunc(axes, figure)
        else:
            axeslist=[axes]
        if mfunc:
            for k in xs.keys():
                mfunc(axeslist,xs[k],ys[k],color=cold[k], mms=mms);
    axes.set_xlabel(scatterx);
    axes.set_ylabel(scattery);
    
    return axes

def make_rug(axeslist, x, y, color='b', mms=8):
    axes=axeslist[0]
    zerosx1=np.zeros(len(x))
    zerosx2=np.zeros(len(x))
    xlims=axes.get_xlim()
    ylims=axes.get_ylim()
    zerosx1.fill(ylims[1])
    zerosx2.fill(xlims[1])
    axes.plot(x, zerosx1, marker='|', color=color, ms=mms)
    axes.plot(zerosx2, y, marker='_', color=color, ms=mms)
    axes.set_xlim(xlims)
    axes.set_ylim(ylims)
    return axes

np.random.seed(42)

def rmse(p,x,y):
    yfit = np.polyval(p, x)
    return np.sqrt(np.mean((y - yfit) ** 2))

def generate_curve(x, sigma):
    return np.random.normal(10 - 1. / (x + 0.1), sigma)
x = 10 ** np.linspace(-2, 0, 8)
intrinsic_error=1.
y=generate_curve(x, intrinsic_error)
plt.scatter(x,y)

x_new=np.linspace(-0.2, 1.2, 1000)
plt.scatter(x,y, s=50)
f1=np.polyfit(x,y,1)
plt.plot(x_new,np.polyval(f1,x_new))
print "d=1, rmse=",rmse(f1,x,y)
f2=np.polyfit(x,y,2)
plt.plot(x_new,np.polyval(f2,x_new))
print "d=2, rmse=",rmse(f2,x,y)
f4=np.polyfit(x,y,4)
plt.plot(x_new,np.polyval(f4,x_new))
print "d=4, rmse=",rmse(f4,x,y)
f6=np.polyfit(x,y,6)
plt.plot(x_new,np.polyval(f6,x_new))
print "d=6, rmse=",rmse(f6,x,y)
plt.xlim(-0.2, 1.2)
plt.ylim(-1, 12)

N = 200
x = np.random.random(N)
y = generate_curve(x, intrinsic_error)
plt.scatter(x,y)

from sklearn.cross_validation import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, train_size=0.6)
plt.scatter(xtrain, ytrain, color='red')
plt.scatter(xtest, ytest, color='blue')

ds = np.arange(21)
train_err = np.zeros(len(ds))
test_err = np.zeros(len(ds))

for i, d in enumerate(ds):
    p = np.polyfit(xtrain, ytrain, d)

    train_err[i] = rmse(p, xtrain, ytrain)
    test_err[i] = rmse(p, xtest, ytest)

fig, ax = plt.subplots()

ax.plot(ds, test_err, lw=2, label = 'test error')
ax.plot(ds, train_err, lw=2, label = 'training error')
ax.legend(loc=0)
ax.set_xlabel('degree of fit')
ax.set_ylabel('rms error')

#taken lock stock and barrel from Vanderplas.
def plot_learning_curve(d):
    sizes = np.linspace(2, N, 50).astype(int)
    train_err = np.zeros(sizes.shape)
    crossval_err = np.zeros(sizes.shape)

    for i, size in enumerate(sizes):
        # Train on only the first `size` points
        p = np.polyfit(xtrain[:size], ytrain[:size], d)
        
        # Validation error is on the *entire* validation set
        crossval_err[i] = rmse(p, xtest, ytest)
        
        # Training error is on only the points used for training
        train_err[i] = rmse(p, xtrain[:size], ytrain[:size])

    fig, ax = plt.subplots()
    ax.plot(sizes, crossval_err, lw=2, label='validation error')
    ax.plot(sizes, train_err, lw=2, label='training error')
    ax.plot([0, N], [intrinsic_error, intrinsic_error], '--k', label='intrinsic error')

    ax.set_xlabel('training set size')
    ax.set_ylabel('rms error')
    
    ax.legend(loc=0)
    
    ax.set_xlim(0, 99)

    ax.set_title('d = %i' % d)

plot_learning_curve(d=1)
plt.ylim(0, 10)

plot_learning_curve(d=5)
plt.ylim(0, 10)

plot_learning_curve(d=20)
plt.ylim(0, 10)

df=pd.read_csv("data/olive.csv")
df.head(5)

df.rename(columns={df.columns[0]:'areastring'}, inplace=True)
df.areastring=df.areastring.map(lambda x: x.split('.')[-1])
acidlist=['palmitic', 'palmitoleic', 'stearic', 'oleic', 'linoleic', 'linolenic', 'arachidic', 'eicosenoic']
dfsub=df[acidlist].apply(lambda x: x/100.0)
df[acidlist]=dfsub
df.head()

dfsouth = df[df.region==1]
dfsouthns = dfsouth[df.area != 4]
dfsouthns.head()

akeys=[1,2,3]
avals=['North-Apulia','Calabria','South-Apulia']
amap={e[0]:e[1] for e in zip(akeys,avals)}
ax=scatter_by(dfsouthns, 'palmitic', 'palmitoleic', by='area', labeler=amap, mfunc=make_rug, mms=20)
ax.grid(False)
ax.legend(loc='upper right');

from matplotlib.colors import ListedColormap
#cm_bright = ListedColormap(['#FF0000', '#000000','#0000FF'])
#cm = plt.cm.RdBu
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

def points_plot(X, Xtr, Xte, ytr, yte, clf, colorscale=cmap_light, cdiscrete=cmap_bold):
    h = .02
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 50),
                         np.linspace(y_min, y_max, 50))

    plt.figure()
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=0.2)
    plt.scatter(Xtr[:, 0], Xtr[:, 1], c=ytr-1, cmap=cdiscrete, s=50, alpha=0.2,edgecolor="k")
    # and testing points
    yact=clf.predict(Xte)
    print "SCORE", clf.score(Xte, yte)
    plt.scatter(Xte[:, 0], Xte[:, 1], c=yte-1, cmap=cdiscrete, alpha=0.5, marker="s", s=35)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    return ax


from sklearn.neighbors import KNeighborsClassifier
subdf=dfsouthns[['palmitic','palmitoleic']]
subdfstd=(subdf - subdf.mean())/subdf.std()
X=subdfstd.values
y=dfsouthns['area'].values
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.6)
Xtr=np.concatenate((Xtrain, Xtest))

clf = KNeighborsClassifier(20, warn_on_equidistant=False).fit(Xtrain, ytrain)
points_plot(Xtr, Xtrain, Xtest, ytrain, ytest, clf)

#your code here
clf = KNeighborsClassifier(1, warn_on_equidistant=False).fit(Xtrain, ytrain)
points_plot(Xtr, Xtrain, Xtest, ytrain, ytest, clf)

#your code here
clf = KNeighborsClassifier(35, warn_on_equidistant=False).fit(Xtrain, ytrain)
points_plot(Xtr, Xtrain, Xtest, ytrain, ytest, clf)

Xcv,Xte,ycv,yte=train_test_split(Xtest, ytest, train_size=0.5)
print ytrain.shape, ycv.shape, yte.shape
trscores=[]
cvscores=[]
ns=np.arange(1,80,1)
ones=np.ones(len(ns))
for n in ns:
    clf = KNeighborsClassifier(n, warn_on_equidistant=False).fit(Xtrain, ytrain)
    trscores.append(clf.score(Xtrain, ytrain))
    cvscores.append(clf.score(Xcv, ycv))
plt.plot(ns, ones-trscores, label="training")
plt.plot(ns, ones-cvscores, label="cv")
plt.legend(loc='upper left');
print clf.score(Xte, yte)

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.8)
Xtr=np.concatenate((Xtrain, Xtest))

from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
parameters = {"n_neighbors": np.arange(1,80,1)}
clf = KNeighborsClassifier(warn_on_equidistant=False)
gs = GridSearchCV(clf, param_grid=parameters, cv=10)
gs.fit(Xtrain, ytrain)
#print gs.grid_scores_
print gs.best_params_, gs.best_score_
y_true, y_pred = ytest, gs.predict(Xtest)
print(classification_report(y_true, y_pred))

points_plot(Xtr, Xtrain, Xtest, ytrain, ytest, gs)