%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import interp
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cross_validation import cross_val_score, StratifiedKFold, train_test_split, KFold
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.preprocessing import LabelEncoder, label_binarize

#get the CHIMP training data

X_chimp = pd.read_csv('data/chimp/chimp.Qsorted.rdpout.xtab.norm', delimiter="\t", index_col=0)
y_chimp = pd.read_csv('data/chimp/sampledata.training.chimp.csv', index_col=0)

#just make sure the labels are the same
X_chimp.sort_index(inplace=True)
y_chimp.sort_index(inplace=True)
assert (X_chimp.index == y_chimp.index).all()

## do the same for the blind validation test data
X_blind = pd.read_csv('data/chimp/blind.sorted.rdpout.xtab.norm',
                        delimiter="\t", index_col=0)
y_blind = pd.read_csv('data/chimp/sampledata.validation.blind.csv',
                        index_col=0)

X_blind.sort_index(inplace=True)
y_blind.sort_index(inplace=True)
assert (X_blind.index == y_blind.index).all()

#concatenate using pandas
X = pd.concat([X_chimp, X_blind], keys=['chimp','blind'])
X.head()

X.fillna(value=0,inplace=True) #replace NAs with zeroes

y_dx = pd.concat([y_chimp.dx, y_blind.dx], keys=['chimp','blind'])
y_dx #btw, what joy is to use pandas over R/dplyr for this. so intuitive and fast.

#convert the training and testing labels to numerical values
le = LabelEncoder()
le.fit(y_dx)
y = le.transform(y_dx)

# just for reference, the columns of the binarized label read respectively:
le.inverse_transform([0,1,2])

clf = RandomForestClassifier(n_estimators=50, oob_score=True)
clf.fit(X.values, y)
scores = cross_val_score(clf, X.values, y, cv=10)

print("Cross validation score:")
print(scores.mean())

importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_],axis = 0)
indices = np.argsort(importances)[::-1]

print("feature ranking:")

for f in range(20):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

y_bin = label_binarize(y,classes=[0,1,2])
n_classes = y_bin.shape[1]

X_train, X_test, y_train, y_test = train_test_split(X.values, y_bin, test_size=.3)

clf1 = OneVsRestClassifier(RandomForestClassifier(n_estimators=50))
y_score = clf1.fit(X_train, y_train).predict_proba(X_test)

y_score[:10,:]

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = roc_auc_score(y_test[:,i], y_score[:,i],average="micro")

# Plot of a ROC curve for a specific class
plt.figure()
plt.plot(fpr[2], tpr[2], label='ROC curve (area = %0.2f)' % roc_auc[2])
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC - UC vs all')
plt.legend(loc="lower right")
plt.show()

# Plot ROC curves all together now
plt.figure()

for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label='ROC curve {0} (area = {1:0.2f})'
                                   ''.format(le.inverse_transform(i), roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC - one random 1/3 split')
plt.legend(loc="lower right")
plt.show()

# Run classifier with cross-validation and plot ROC curves

for dx in range(n_classes):
    cv = StratifiedKFold(y_bin[:,dx], n_folds=10)
    classifier = RandomForestClassifier()

    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []

    for i, (train, test) in enumerate(cv):
        probas_ = classifier.fit(X.iloc[train], y_bin[train,dx]).predict_proba(X.iloc[test])
        # Compute ROC curve and area the curve
        fpr, tpr, thresholds = roc_curve(y_bin[test,dx], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        #plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))


    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 
             label='Mean ROC %s (area = %0.2f)' % (le.inverse_transform(dx), mean_auc), lw=1)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Figure 6A - ROC - crossvalidated - one vs. all')
plt.legend(loc="lower right")
plt.show()

# Only CD vs UC

X_cduc = X[(y == 0) | (y == 2)]
y_cduc = y[(y == 0) | (y == 2)]
np.place(y_cduc,y_cduc == 2, 1)

cv = StratifiedKFold(y_cduc, n_folds=10)
clf_cduc = RandomForestClassifier()

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

for i, (train, test) in enumerate(cv):
    fitted = classifier.fit(X_cduc.iloc[train], y_cduc[train])
    probas_ = fitted.predict_proba(X_cduc.iloc[test])
    scored_ = fitted.predict(X_cduc.iloc[test])
    # Compute ROC curve and area the curve
    fpr, tpr, thresholds = roc_curve(y_cduc[test], probas_[:, 1])
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    #roc_auc = auc(fpr, tpr)
    roc_auc = roc_auc_score(scored_, y_cduc[test], average="micro")
    #plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))


mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=1)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Figure 6A - ROC - crossvalidated - CD vs UC')
plt.legend(loc="lower right")
plt.show()

frank = pd.read_csv('data/pace/pace.rdpout.xtab.norm.ibd.csv')
frank_label = frank['ibd']
del frank['ibd']
frank_label = (frank_label == 'yes')
frank_label.value_counts()

def crossval_roc(X, y):
    cv = StratifiedKFold(y, n_folds=10)
    clf = RandomForestClassifier()

    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []

    for i, (train, test) in enumerate(cv):
        fitted = clf.fit(X[train], y[train])
        probas_ = fitted.predict_proba(X[test])
        scored_ = fitted.predict(X[test])
        # Compute ROC curve and area the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        #roc_auc = auc(fpr, tpr)
        roc_auc = roc_auc_score(scored_, y[test], average="micro")
        #plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))


    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    return plt.plot(mean_fpr, mean_tpr, 
                    label='Mean ROC (area = %0.2f)' % mean_auc, lw=1)

fig = crossval_roc(frank.values,frank_label.values)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Frank et al. cross-validated ROC')
plt.legend(loc="lower right")
plt.show()