%matplotlib inline import pandas as pd import numpy as np import matplotlib.pyplot as plt from scipy import interp from sklearn.ensemble import RandomForestClassifier from sklearn.multiclass import OneVsRestClassifier from sklearn.cross_validation import cross_val_score, StratifiedKFold, train_test_split, KFold from sklearn.metrics import roc_curve, roc_auc_score, auc from sklearn.preprocessing import LabelEncoder, label_binarize #get the CHIMP training data X_chimp = pd.read_csv('data/chimp/chimp.Qsorted.rdpout.xtab.norm', delimiter="\t", index_col=0) y_chimp = pd.read_csv('data/chimp/sampledata.training.chimp.csv', index_col=0) #just make sure the labels are the same X_chimp.sort_index(inplace=True) y_chimp.sort_index(inplace=True) assert (X_chimp.index == y_chimp.index).all() ## do the same for the blind validation test data X_blind = pd.read_csv('data/chimp/blind.sorted.rdpout.xtab.norm', delimiter="\t", index_col=0) y_blind = pd.read_csv('data/chimp/sampledata.validation.blind.csv', index_col=0) X_blind.sort_index(inplace=True) y_blind.sort_index(inplace=True) assert (X_blind.index == y_blind.index).all() #concatenate using pandas X = pd.concat([X_chimp, X_blind], keys=['chimp','blind']) X.head() X.fillna(value=0,inplace=True) #replace NAs with zeroes y_dx = pd.concat([y_chimp.dx, y_blind.dx], keys=['chimp','blind']) y_dx #btw, what joy is to use pandas over R/dplyr for this. so intuitive and fast. #convert the training and testing labels to numerical values le = LabelEncoder() le.fit(y_dx) y = le.transform(y_dx) # just for reference, the columns of the binarized label read respectively: le.inverse_transform([0,1,2]) clf = RandomForestClassifier(n_estimators=50, oob_score=True) clf.fit(X.values, y) scores = cross_val_score(clf, X.values, y, cv=10) print("Cross validation score:") print(scores.mean()) importances = clf.feature_importances_ std = np.std([tree.feature_importances_ for tree in clf.estimators_],axis = 0) indices = np.argsort(importances)[::-1] print("feature ranking:") for f in range(20): print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]])) y_bin = label_binarize(y,classes=[0,1,2]) n_classes = y_bin.shape[1] X_train, X_test, y_train, y_test = train_test_split(X.values, y_bin, test_size=.3) clf1 = OneVsRestClassifier(RandomForestClassifier(n_estimators=50)) y_score = clf1.fit(X_train, y_train).predict_proba(X_test) y_score[:10,:] # Compute ROC curve and ROC area for each class fpr = dict() tpr = dict() roc_auc = dict() for i in range(n_classes): fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i]) roc_auc[i] = roc_auc_score(y_test[:,i], y_score[:,i],average="micro") # Plot of a ROC curve for a specific class plt.figure() plt.plot(fpr[2], tpr[2], label='ROC curve (area = %0.2f)' % roc_auc[2]) plt.plot([0, 1], [0, 1], 'k--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC - UC vs all') plt.legend(loc="lower right") plt.show() # Plot ROC curves all together now plt.figure() for i in range(n_classes): plt.plot(fpr[i], tpr[i], label='ROC curve {0} (area = {1:0.2f})' ''.format(le.inverse_transform(i), roc_auc[i])) plt.plot([0, 1], [0, 1], 'k--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC - one random 1/3 split') plt.legend(loc="lower right") plt.show() # Run classifier with cross-validation and plot ROC curves for dx in range(n_classes): cv = StratifiedKFold(y_bin[:,dx], n_folds=10) classifier = RandomForestClassifier() mean_tpr = 0.0 mean_fpr = np.linspace(0, 1, 100) all_tpr = [] for i, (train, test) in enumerate(cv): probas_ = classifier.fit(X.iloc[train], y_bin[train,dx]).predict_proba(X.iloc[test]) # Compute ROC curve and area the curve fpr, tpr, thresholds = roc_curve(y_bin[test,dx], probas_[:, 1]) mean_tpr += interp(mean_fpr, fpr, tpr) mean_tpr[0] = 0.0 roc_auc = auc(fpr, tpr) #plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc)) mean_tpr /= len(cv) mean_tpr[-1] = 1.0 mean_auc = auc(mean_fpr, mean_tpr) plt.plot(mean_fpr, mean_tpr, label='Mean ROC %s (area = %0.2f)' % (le.inverse_transform(dx), mean_auc), lw=1) plt.xlim([-0.05, 1.05]) plt.ylim([-0.05, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Figure 6A - ROC - crossvalidated - one vs. all') plt.legend(loc="lower right") plt.show() # Only CD vs UC X_cduc = X[(y == 0) | (y == 2)] y_cduc = y[(y == 0) | (y == 2)] np.place(y_cduc,y_cduc == 2, 1) cv = StratifiedKFold(y_cduc, n_folds=10) clf_cduc = RandomForestClassifier() mean_tpr = 0.0 mean_fpr = np.linspace(0, 1, 100) all_tpr = [] for i, (train, test) in enumerate(cv): fitted = classifier.fit(X_cduc.iloc[train], y_cduc[train]) probas_ = fitted.predict_proba(X_cduc.iloc[test]) scored_ = fitted.predict(X_cduc.iloc[test]) # Compute ROC curve and area the curve fpr, tpr, thresholds = roc_curve(y_cduc[test], probas_[:, 1]) mean_tpr += interp(mean_fpr, fpr, tpr) mean_tpr[0] = 0.0 #roc_auc = auc(fpr, tpr) roc_auc = roc_auc_score(scored_, y_cduc[test], average="micro") #plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc)) mean_tpr /= len(cv) mean_tpr[-1] = 1.0 mean_auc = auc(mean_fpr, mean_tpr) plt.plot(mean_fpr, mean_tpr, label='Mean ROC (area = %0.2f)' % mean_auc, lw=1) plt.xlim([-0.05, 1.05]) plt.ylim([-0.05, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Figure 6A - ROC - crossvalidated - CD vs UC') plt.legend(loc="lower right") plt.show() frank = pd.read_csv('data/pace/pace.rdpout.xtab.norm.ibd.csv') frank_label = frank['ibd'] del frank['ibd'] frank_label = (frank_label == 'yes') frank_label.value_counts() def crossval_roc(X, y): cv = StratifiedKFold(y, n_folds=10) clf = RandomForestClassifier() mean_tpr = 0.0 mean_fpr = np.linspace(0, 1, 100) all_tpr = [] for i, (train, test) in enumerate(cv): fitted = clf.fit(X[train], y[train]) probas_ = fitted.predict_proba(X[test]) scored_ = fitted.predict(X[test]) # Compute ROC curve and area the curve fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1]) mean_tpr += interp(mean_fpr, fpr, tpr) mean_tpr[0] = 0.0 #roc_auc = auc(fpr, tpr) roc_auc = roc_auc_score(scored_, y[test], average="micro") #plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc)) mean_tpr /= len(cv) mean_tpr[-1] = 1.0 mean_auc = auc(mean_fpr, mean_tpr) return plt.plot(mean_fpr, mean_tpr, label='Mean ROC (area = %0.2f)' % mean_auc, lw=1) fig = crossval_roc(frank.values,frank_label.values) plt.xlim([-0.05, 1.05]) plt.ylim([-0.05, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Frank et al. cross-validated ROC') plt.legend(loc="lower right") plt.show()