# download the blood-transfusion dataset into ./data (wget's -nc flag skips
# the download when the file already exists)
!wget -nc --directory-prefix data \
    https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data

# peek at the first raw lines of the downloaded file
!head data/transfusion.data

import numpy as np
import pandas as pd

# load the raw CSV into a dataframe and preview its first rows
df = pd.read_csv('data/transfusion.data')
df.head()

# dimensions of the table as (rows, columns)
df.shape

# dtype pandas inferred for each column
df.dtypes

# save the current names in case we need them later
original_column_names = df.columns

# make the names less ugly
names = ['recency', 'frequency', 'cc', 'time', 'donated']
df.columns = names

# preview again to confirm the new column names
df.head()

# import our graphics tools
# (the inline backend draws figures directly in the notebook output)
%matplotlib inline
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns  # nice defaults for matplotlib styles
# keep the Set2 palette around; plot_scores below picks its bar color from it
set2 = sns.color_palette('Set2')

# add on some settings from 'Bayesian Methods for Hackers'
plt.style.use('bmh')

# set larger default fonts for presentation-friendliness
mpl.rc('figure', figsize=(10, 8))
mpl.rc('axes', labelsize=16, titlesize=20)

from pandas.tools.plotting import scatter_matrix

# pairwise scatterplots of every column against every other column;
# switch the background grid off on each individual panel
axeslist = scatter_matrix(df, alpha=0.8, figsize=(10, 10))
for ax in axeslist.flat:
    ax.grid(False)

import numpy as np

# one panel per feature, laid out on a 2x2 grid
fig, axs = plt.subplots(nrows=2, ncols=2)
panels = axs.ravel()

# every column except the last is a feature; the last column is the label
feature_column_names = df.columns[:-1]
label_column_name = df.columns[-1]

for i, col in enumerate(feature_column_names):

    # panel that belongs to this feature
    ax = panels[i]

    # shift each label vertically by a small random amount so that points
    # sharing the same label value do not sit exactly on top of each other
    jitter = np.random.uniform(low=-0.05, high=0.05, size=len(df))
    jittered_label = df[label_column_name] + jitter

    # feature value on x, jittered label on y, colored by the label
    ax.scatter(x=df[col], y=jittered_label,
               c=df.donated, cmap='coolwarm', alpha=0.5)

    ax.set_xlabel(col)
    ax.set_ylabel(label_column_name)

plt.tight_layout()
plt.show()

from sklearn.cross_validation import train_test_split

# using conventional sklearn variable names
# X: the feature columns cast to float; y: the `donated` label column
X = df[feature_column_names].astype(float)
y = df.donated.ravel()

# break up the data into train and test
# (test_size=0.5 holds out half of the rows for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

# check the shapes of the four resulting pieces
X_train.shape

y_train.shape

X_test.shape

y_test.shape

from sklearn.tree import DecisionTreeClassifier

# fit a decision tree limited to depth 3 on the training split
clf_tree = DecisionTreeClassifier(max_depth=3)
clf_tree.fit(X_train, y_train)

# report the tree's score on the held-out test split
print 'Score:', clf_tree.score(X_test, y_test)

# viz adapted from http://scikit-learn.org/stable/modules/tree.html
import os

import pydot
from sklearn.externals.six import StringIO
from sklearn.tree import export_graphviz

# make sure the target directory exists; otherwise write_png below fails
# on a fresh checkout where 'output' has not been created yet
if not os.path.isdir('output'):
    os.makedirs('output')

# export the fitted tree as Graphviz dot text into an in-memory buffer,
# then render that text to a PNG file
dot_data = StringIO()
export_graphviz(clf_tree, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_png('output/decision_tree.png')

from sklearn.linear_model import LogisticRegression

# fit an L2-penalized logistic regression (with intercept) on the training split
clf = LogisticRegression(penalty='l2', fit_intercept=True)
clf.fit(X_train, y_train)

# report its score on the held-out test split
print 'Score:', clf.score(X_test, y_test)

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

other_clfs = {
    'Random forest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Naive Bayes': MultinomialNB(),
    'Linear SVC': LinearSVC(),
    'KNN': KNeighborsClassifier(5),
}

# iterating through all of these models we want to fit ...
for name, other_clf in other_clfs.iteritems():
    
    # fit the model with the training data
    print other_clf.fit(X_train, y_train)
    
    # cross validation score
    print '---\nScore:', other_clf.score(X_test, y_test)
    print '\n'

# display the fitted logistic regression estimator
clf

# learned coefficients
clf.coef_

# learned intercept
clf.intercept_

# parameters the estimator was configured with
clf.get_params()

# predicted labels for the test split
clf.predict(X_test)

# predicted probabilities for the test split, first 10 rows only
pd.DataFrame(clf.predict_proba(X_test))\
    .head(10)

# score on the test split
clf.score(X_test, y_test)

from sklearn import cross_validation

# come up with random folds of the data
# (5 folds over all len(X) rows, shuffled before splitting)
kf = cross_validation.KFold(len(X), n_folds=5, shuffle=True)

def plot_scores(scores):
    """Draw a bar chart with one bar per cross-validation fold score.

    Parameters
    ----------
    scores : sequence of float
        One score per fold; its length sets the number of bars and the
        N shown in the title.

    The y axis is fixed to [0, 1] and the figure is shown immediately.
    """
    n_folds = len(scores)
    fold_numbers = np.arange(1, n_folds + 1)

    # Center every bar on its fold number explicitly. Shifting the positions
    # by -0.4 only centers the bars when matplotlib aligns bars by their left
    # edge; align='center' gives the same picture whatever the default is.
    plt.bar(fold_numbers, scores, align='center', color=set2[2])

    plt.title('{}-fold cross-validation scores'.format(n_folds), fontsize=18)
    plt.xlabel('fold', fontsize=14)
    plt.ylabel('score', fontsize=14)
    plt.xlim(0.5, n_folds + 0.5)
    plt.ylim(0, 1)
    plt.show()

# score the model across the folds defined by kf; one score comes back per fold
scores = cross_validation.cross_val_score(clf, X, y, cv=kf, n_jobs=1)

# print the per-fold scores and their mean, then chart them
print 'scores:', scores
print 'average score:', np.mean(scores)
plot_scores(scores)

from sklearn.metrics import log_loss

# log loss of the predicted probabilities on the test split
log_loss(y_test, clf.predict_proba(X_test))

from sklearn.metrics import f1_score

# F1 score of the predicted labels on the test split
f1_score(y_test, clf.predict(X_test))

from itertools import permutations
from sklearn.metrics import confusion_matrix

# raw confusion matrix of the test-split predictions
cm = confusion_matrix(y_test, clf.predict(X_test))

# wrap it in a dataframe whose rows are labelled 'actual <i>' and whose
# columns are labelled 'pred <j>'
n_actual, n_pred = cm.shape
cmdf = pd.DataFrame(
    cm,
    index=['actual {}'.format(i) for i in range(n_actual)],
    columns=['pred {}'.format(j) for j in range(n_pred)],
)
cmdf

from IPython.html.widgets import interact

from sklearn.metrics import roc_curve, auc

def plot_roc_curve(y_test, probas):
    """Plot a ROC curve, with its AUC in the legend, for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    probas : 2-d array
        Predicted scores; column 1 is the one passed to roc_curve.

    The current figure is cleared before drawing and shown afterwards.
    """
    # ROC points computed from the scores in column 1, and the area under them
    scores_col_1 = probas[:, 1]
    fpr, tpr, _ = roc_curve(y_test, scores_col_1)
    curve_label = 'ROC curve (AUC = %0.2f)' % auc(fpr, tpr)

    plt.clf()

    # the curve itself, plus the dashed diagonal for reference
    plt.plot(fpr, tpr, label=curve_label)
    plt.plot([0, 1], [0, 1], 'k--')

    plt.xlabel('False Positive Rate', fontsize=14)
    plt.ylabel('True Positive Rate', fontsize=14)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.legend(loc="lower right", fontsize=20)
    plt.show()

def fit_model(penalty, C):
    """Fit a logistic regression with the given settings and plot its ROC curve.

    Parameters
    ----------
    penalty : str
        Passed to LogisticRegression as ``penalty``.
    C : float
        Passed to LogisticRegression as ``C``.

    The model is trained on X_train / y_train and the ROC curve is drawn
    from its predicted probabilities on X_test / y_test.
    """
    # a local name that does not shadow the notebook-level `clf`
    model = LogisticRegression(penalty=penalty, C=C)
    model.fit(X_train, y_train)

    # use the public predict_proba API instead of the private
    # _predict_proba_lr helper
    plot_roc_curve(y_test, model.predict_proba(X_test))
    
# interactive controls for fit_model: penalty picks from ('l2', 'l1') and C is
# given as (0.01, 100, 1) -- presumably (min, max, step) for a slider
interact(fit_model, penalty=('l2','l1'), C=(0.01, 100, 1))

from sklearn.grid_search import GridSearchCV

params_to_try = {
    'C': [0.01, 0.1, 1, 10, 100, 100],
    'penalty': ['l1', 'l2']
}

gs = GridSearchCV(clf, param_grid=params_to_try, cv=5)
gs.fit(X, y)

print "Best parameters:", gs.best_params_
print "Best score:", gs.best_score_

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_standardized = scaler.fit_transform(X_train.astype(np.float))
X_train_standardized

print 'column means:', np.round(X_train_standardized.mean(axis=0))
print 'column variances:', np.round(X_train_standardized.var(axis=0))

# one new observation, shaped (1, n_features): a single row with the four
# feature values. scikit-learn transformers expect 2-d input, so the sample
# is written as a row rather than as a flat 1-d array.
X_new = np.array([[25., 35., 9200., 90.]])

# apply the scaling learned from the training data to the new observation
scaler.transform(X_new)

from sklearn.decomposition import PCA

# project the standardized training data onto 2 whitened principal components
pca = PCA(n_components=2, whiten=True)
X_train_pca = pca.fit_transform(X_train_standardized)

# a single 2d axes to draw the projected points on
fig = plt.figure()
ax = fig.add_subplot(111)

# split the projection into its two columns and scatter one against the
# other, colored by the training label
component_1, component_2 = np.hsplit(X_train_pca, 2)
ax.scatter(component_1, component_2, c=y_train, s=40,
           cmap='coolwarm')

# label the axes and show the figure
ax.set_xlabel('component 1')
ax.set_ylabel('component 2')

plt.show()

from sklearn.pipeline import Pipeline

# define a pipeline with some transforms and a simple classifier
pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('reduce_dim', PCA()),
    ('clf', LogisticRegression()),
])

# enumerate all of the different settings we wish to try out
# (keys are '<step name>__<parameter name>')
parameters = {
    'reduce_dim__n_components': (1, 2, 3),
    'reduce_dim__whiten': (True, False),
    'clf__penalty': ('l1', 'l2'),
    'clf__C': (1e-3, 1e-2, 1, 1e1, 1e2, 1e3),
}

# grid search the parameter space
grid_search = GridSearchCV(pipeline, parameters, n_jobs=1, verbose=1)

print("Performing grid search...\n")
# format into a single string: print("a", b) with two arguments prints a
# tuple under the Python 2 print statement used throughout this notebook
print("pipeline: %s" % [name for name, _ in pipeline.steps])
print("parameters:")
print(parameters)
grid_search.fit(X, y)

# report the best score and the winning value of every searched parameter
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# load the %R cell magic extension
%load_ext rmagic

# send the dataframe over to the R instance
%Rpush df

%%R
# scatter log(time) against log(cc), coloring the points by `donated`
library(ggplot2)
qplot(log(time), log(cc), data=df, color=donated)

%%R
# binomial GLM of donated on log(cc) and log(time); print the fit summary
blood.glm <- glm(donated ~ log(cc) + log(time), data=df, family="binomial")
print(summary(blood.glm))

# plot the fitted model on a 2x2 grid of panels
par(mfrow=c(2, 2))
plot(blood.glm)

# pull the fitted coefficients back from R into a Python variable and show them
r_coeffs = %R coef(blood.glm)
r_coeffs