Notebook

In [ ]:

%matplotlib inline
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import os.path as op
import pandas as pd
import numpy as np
import seaborn as sn

sn.set(style="whitegrid")

In [ ]:

from mriqc.classifier import data as mcd
abide, _ = mcd.read_dataset(x_path, y_path, rate_label='rater_1')
sites = list(sorted(set(abide.site.values.ravel())))

fmt = r'{site} & \pixmat{{{size[0]:d}$\pm${sr[0]:d}}}{{{size[1]:d}$\pm${sr[1]:d}}}{{{size[2]:d}$\pm${sr[1]:d}}}'
fmt += r'& \pixmat[mm]{{{sp[0]:.2f}$\pm${spr[0]:.2f}}}{{{sp[1]:.2f}$\pm${spr[1]:.2f}}}{{{sp[2]:.2f}$\pm${spr[1]:.2f}}}'


for site in sites:
    subabide = abide.loc[abide.site.str.contains(site)]
    
    medians = np.median(subabide[['size_x', 'size_y', 'size_z', 'spacing_x', 'spacing_y', 'spacing_z']],
                        axis=0)
    
    mins = np.abs(medians - np.min(
            subabide[['size_x', 'size_y', 'size_z', 'spacing_x', 'spacing_y', 'spacing_z']], axis=0))

    maxs = np.abs(medians - np.max(
            subabide[['size_x', 'size_y', 'size_z', 'spacing_x', 'spacing_y', 'spacing_z']], axis=0))

    ranges = np.max(np.vstack((maxs, mins)), axis=0)
   
    print(
        fmt.format(
            site=site,
            size=tuple(medians[:3].astype(int)),
            sr=tuple(ranges[:3].astype(int)),
            sp=tuple(medians[3:]),
            spr=tuple(ranges[3:]),

))

In [ ]:

#data_path = '/home/oesteban/Google Drive/mriqc'
data_path = '/home/oesteban/tmp/mriqc-ml-tests-2/'
out_path = data_path
loso = pd.read_csv(op.join(data_path, 'cv_loso_inner.csv'), index_col=False)
kfold = pd.read_csv(op.join(data_path, 'cv_kfold_inner.csv'), index_col=False)

kfold_outer = pd.read_csv(op.join(data_path, 'cv_kfold_outer.csv'), index_col=False)
loso_outer = pd.read_csv(op.join(data_path, 'cv_loso_outer.csv'), index_col=False)

def gen_newparams(dataframe):
    thisdf = dataframe.copy()
    thisdf['zscored_str'] = ['nzs'] * len(thisdf['zscored'])
    thisdf.loc[thisdf.zscored == 1, 'zscored_str'] = 'zs'
    thisdf['params'] = thisdf['clf'] + '-' + thisdf['zscored_str'] + ' ' + thisdf['params']
    del thisdf['zscored_str']
    return thisdf

loso = gen_newparams(loso)
kfold = gen_newparams(kfold)

In [ ]:

loso_models_list = list(set(loso.params.values.ravel().tolist()))
kfold_models_list = list(set(kfold.params.values.ravel().tolist()))

best_param = {}

spstr = ['LoSo', '10-fold']
best_models = {}
for i, split_cv in enumerate([loso, kfold]):
    best_models[spstr[i]] = {}
    splitcols = [col for col in split_cv.columns.ravel() if col.startswith('split0')]
    for clf in ['svc_linear-nzs', 'svc_rbf-nzs', 'rfc-nzs', 'svc_linear-zs', 'svc_rbf-zs', 'rfc-zs']:
        thismodeldf = split_cv.loc[split_cv.params.str.contains(clf)]
        max_auc = thismodeldf.mean_auc.max()
        best = thismodeldf.loc[thismodeldf.mean_auc >= max_auc]
        best_list = best.params.values.ravel().tolist()
        
        if len(best_list) == 1:
            best_models[spstr[i]][clf] = best_list[0]
        else:
            overall_means = [thismodeldf.loc[thismodeldf.params.str.contains(pset), 'mean_auc'].mean()
                             for pset in best_list]
            overall_max = np.max(overall_means)
            if sum([val >= overall_max for val in overall_means]) == 1:
                best_models[spstr[i]][clf] = best_list[np.argmax(overall_means)]
            else:
                best_models[spstr[i]][clf] = best_list[0]
                
newdict = {'AUC': [], 'Classifier': [], 'Split scheme': []}

modelnames = {'rfc-nzs': 'RFC-nzs', 'rfc-zs': 'RFC-zs',
              'svc_linear-nzs': 'SVC_lin-nzs', 'svc_linear-zs': 'SVC_lin-zs',
              'svc_rbf-nzs': 'SVC_rbf-nzs', 'svc_rbf-zs': 'SVC_rbf-zs'}

for key, val in list(best_models['LoSo'].items()):
    scores = loso.loc[loso.params.str.contains(val), 'mean_auc'].values.ravel().tolist()
    nscores = len(scores)
   
    newdict['AUC'] += scores
    newdict['Classifier'] += [modelnames[key]] * nscores
    newdict['Split scheme'] += ['LoSo (16 folds)'] * nscores
    
for key, val in list(best_models['10-fold'].items()):
    scores = kfold.loc[kfold.params.str.contains(val), 'mean_auc'].values.ravel().tolist()
    nscores = len(scores)
   
    newdict['AUC'] += scores
    newdict['Classifier'] += [modelnames[key]] * nscores
    newdict['Split scheme'] += ['10-fold'] * nscores

newdf = pd.DataFrame(newdict).sort_values(by=['Split scheme', 'Classifier'])

In [ ]:

def plot_cv_outer(data, score='auc', zscored=0, ax=None, ds030_score=None,
                  split_type='LoSo', color='dodgerblue'):
    
    if ax is None:
        ax = plt.gca()
            
    outer_score = data.loc[data[score].notnull(), [score, 'zscored']]
    sn.distplot(outer_score.loc[outer_score.zscored==zscored, score],
                hist=True, norm_hist=True, ax=ax, color=color, label=split_type)
    ax.set_xlim([0.4, 1.0])
    ax.grid(False)
    ax.set_yticklabels([])
    
    mean = outer_score.loc[outer_score.zscored==zscored, score].mean()
    std = outer_score.loc[outer_score.zscored==zscored, score].std()

    mean_coord = draw_line(mean, ax=ax, color=color, lw=2.0, marker='o', extend=True)
    
    ymax = ax.get_ylim()[1]
    draw_line(mean - std, ax=ax, color=color, extend=True)
    draw_line(mean + std, ax=ax, color=color, extend=True)
    
    
    ax.annotate(
        '$\mu$=%0.3f' % mean, xy=(mean_coord[0], 0.75*ymax), xytext=(-35, 30),
        textcoords='offset points', va='center', color='w', size=14,
        bbox=dict(boxstyle='round', fc=color, ec='none', color='none', lw=0),
        arrowprops=dict(
            arrowstyle='wedge,tail_width=0.8', lw=0, patchA=None, patchB=None,
            fc=color, ec='none', relpos=(0.5, 0.5)))
    sigmay = 0.70*ymax
    ax.annotate(s='', xy=(mean - std, sigmay), xytext=(mean + std, sigmay), arrowprops=dict(arrowstyle='<->'))
    ax.annotate(
        '$2\sigma$=%0.3f' % (2 * std), xy=(mean_coord[0], 0.70*ymax), xytext=(-25, -12),
        textcoords='offset points', va='center', color='k', size=12,
        bbox=dict(boxstyle='round', fc='w', ec='none', color='none', alpha=.7, lw=0))
    
    if ds030_score is not None:
        ds030_coord = draw_line(ds030_score, ax=ax, color='k', marker='o')
        ax.annotate(
            'DS030', xy=ds030_coord, xytext=(-100, 0),
            textcoords='offset points', va='center', color='w', size=16,
            bbox=dict(boxstyle='round', fc=color, ec='none', color='none', lw=0),
            arrowprops=dict(
                arrowstyle='wedge,tail_width=0.8', lw=0, patchA=None, patchB=None,
                fc=color, ec='none', relpos=(0.5, 0.5)))
        
        
def draw_line(score, ax=None, color='k', marker=None, lw=.7, extend=False):
    if ax is None:
        ax = plt.gca()
    
    if score > 1.0:
        score = 1.0
        
    coords = [score, -1]
    pdf_points = ax.lines[0].get_data()
    coords[1] = np.interp([coords[0]], pdf_points[0], pdf_points[1])
    
    if extend:
        ax.axvline(coords[0], ymin=coords[1] / ax.get_ylim()[1], ymax=0.75, color='gray', lw=.7)
    
    ax.axvline(coords[0], ymin=coords[1] / ax.get_ylim()[1], ymax=0, color=color, marker=marker, markevery=2,
                markeredgewidth=1.5, markerfacecolor='w', markeredgecolor=color, lw=lw)

    return coords

In [ ]:

sn.set(style="whitegrid")

fig = plt.figure(figsize=(20, 8)) 
ax1 = plt.subplot2grid((2,4), (0,0), colspan=2, rowspan=2)

sn.violinplot(x='Classifier', y='AUC', hue='Split scheme', data=newdf, split=True,
              palette=['dodgerblue', 'darkorange'], ax=ax1)
ax1.set_ylim([0.70, 1.0])
ax1.set_ylabel('AUC')
ax1.set_xlabel('Model')
ax1.set_title('Model selection - Inner loop of nested cross-validation')

ax2 = plt.subplot2grid((2,4), (0, 2))
plot_cv_outer(kfold_outer, zscored=0, score='auc', ax=ax2, ds030_score=0.695, split_type='10-fold')
ax2.set_xlabel('')
ax2.legend()
ax2.set_title('Evaluation - Outer loop of nested cross-validation')
ax2.title.set_position([1.1, 1.0])

ax3 = plt.subplot2grid((2,4), (1, 2))
plot_cv_outer(loso_outer, zscored=0, score='auc', ax=ax3, ds030_score=0.695, color='darkorange', split_type='LoSo (17 folds)')
ax3.legend()
ax3.set_xlabel('AUC')

ax4 = plt.subplot2grid((2,4), (0, 3))
plot_cv_outer(kfold_outer, zscored=0, score='acc', ax=ax4, ds030_score=0.7283, split_type='10-fold')
ax4.set_xlabel('')
ax4.legend()

ax5 = plt.subplot2grid((2,4), (1, 3))
plot_cv_outer(loso_outer, zscored=0, score='acc', ax=ax5, ds030_score=0.7283, color='darkorange', split_type='LoSo (17 folds)')
ax5.legend()
ax5.set_xlabel('Accuracy')


fig.savefig(op.join(out_path, 'crossvalidation.pdf'),
            bbox_inches='tight', pad_inches=0, dpi=300)

In [ ]:

zscoreddf = loso_outer.loc[loso_outer.zscored == 0, ['auc', 'acc', 'site']]
palette = sn.color_palette("cubehelix", len(set(zscoreddf.site)))
sn.pairplot(zscoreddf.loc[zscoreddf.auc.notnull(), ['auc', 'acc', 'site']], hue='site', palette=palette)

In [ ]:

sites = sorted(list(set(loso_outer.site.ravel().tolist())))
palette = sn.color_palette("husl", len(sites))
fig = plt.figure()
for i, site in enumerate(sites):
    sitedf = loso_outer.loc[loso_outer.site == site]
    accdf = sitedf.loc[sitedf.zscored==0]
    sn.distplot(accdf.acc.values.ravel(), bins=20, kde=0, label=site, color=palette[i])

fig.gca().legend()
fig.gca().set_xlim([0.5, 1.0])