%matplotlib inline
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import os.path as op
import pandas as pd
import numpy as np
import seaborn as sn
sn.set(style="whitegrid")
from mriqc.classifier import data as mcd
abide, _ = mcd.read_dataset(x_path, y_path, rate_label='rater_1')
sites = list(sorted(set(abide.site.values.ravel())))
fmt = r'{site} & \pixmat{{{size[0]:d}$\pm${sr[0]:d}}}{{{size[1]:d}$\pm${sr[1]:d}}}{{{size[2]:d}$\pm${sr[1]:d}}}'
fmt += r'& \pixmat[mm]{{{sp[0]:.2f}$\pm${spr[0]:.2f}}}{{{sp[1]:.2f}$\pm${spr[1]:.2f}}}{{{sp[2]:.2f}$\pm${spr[1]:.2f}}}'
for site in sites:
subabide = abide.loc[abide.site.str.contains(site)]
medians = np.median(subabide[['size_x', 'size_y', 'size_z', 'spacing_x', 'spacing_y', 'spacing_z']],
axis=0)
mins = np.abs(medians - np.min(
subabide[['size_x', 'size_y', 'size_z', 'spacing_x', 'spacing_y', 'spacing_z']], axis=0))
maxs = np.abs(medians - np.max(
subabide[['size_x', 'size_y', 'size_z', 'spacing_x', 'spacing_y', 'spacing_z']], axis=0))
ranges = np.max(np.vstack((maxs, mins)), axis=0)
print(
fmt.format(
site=site,
size=tuple(medians[:3].astype(int)),
sr=tuple(ranges[:3].astype(int)),
sp=tuple(medians[3:]),
spr=tuple(ranges[3:]),
))
#data_path = '/home/oesteban/Google Drive/mriqc'
data_path = '/home/oesteban/tmp/mriqc-ml-tests-2/'
out_path = data_path
loso = pd.read_csv(op.join(data_path, 'cv_loso_inner.csv'), index_col=False)
kfold = pd.read_csv(op.join(data_path, 'cv_kfold_inner.csv'), index_col=False)
kfold_outer = pd.read_csv(op.join(data_path, 'cv_kfold_outer.csv'), index_col=False)
loso_outer = pd.read_csv(op.join(data_path, 'cv_loso_outer.csv'), index_col=False)
def gen_newparams(dataframe):
thisdf = dataframe.copy()
thisdf['zscored_str'] = ['nzs'] * len(thisdf['zscored'])
thisdf.loc[thisdf.zscored == 1, 'zscored_str'] = 'zs'
thisdf['params'] = thisdf['clf'] + '-' + thisdf['zscored_str'] + ' ' + thisdf['params']
del thisdf['zscored_str']
return thisdf
loso = gen_newparams(loso)
kfold = gen_newparams(kfold)
loso_models_list = list(set(loso.params.values.ravel().tolist()))
kfold_models_list = list(set(kfold.params.values.ravel().tolist()))
best_param = {}
spstr = ['LoSo', '10-fold']
best_models = {}
for i, split_cv in enumerate([loso, kfold]):
best_models[spstr[i]] = {}
splitcols = [col for col in split_cv.columns.ravel() if col.startswith('split0')]
for clf in ['svc_linear-nzs', 'svc_rbf-nzs', 'rfc-nzs', 'svc_linear-zs', 'svc_rbf-zs', 'rfc-zs']:
thismodeldf = split_cv.loc[split_cv.params.str.contains(clf)]
max_auc = thismodeldf.mean_auc.max()
best = thismodeldf.loc[thismodeldf.mean_auc >= max_auc]
best_list = best.params.values.ravel().tolist()
if len(best_list) == 1:
best_models[spstr[i]][clf] = best_list[0]
else:
overall_means = [thismodeldf.loc[thismodeldf.params.str.contains(pset), 'mean_auc'].mean()
for pset in best_list]
overall_max = np.max(overall_means)
if sum([val >= overall_max for val in overall_means]) == 1:
best_models[spstr[i]][clf] = best_list[np.argmax(overall_means)]
else:
best_models[spstr[i]][clf] = best_list[0]
newdict = {'AUC': [], 'Classifier': [], 'Split scheme': []}
modelnames = {'rfc-nzs': 'RFC-nzs', 'rfc-zs': 'RFC-zs',
'svc_linear-nzs': 'SVC_lin-nzs', 'svc_linear-zs': 'SVC_lin-zs',
'svc_rbf-nzs': 'SVC_rbf-nzs', 'svc_rbf-zs': 'SVC_rbf-zs'}
for key, val in list(best_models['LoSo'].items()):
scores = loso.loc[loso.params.str.contains(val), 'mean_auc'].values.ravel().tolist()
nscores = len(scores)
newdict['AUC'] += scores
newdict['Classifier'] += [modelnames[key]] * nscores
newdict['Split scheme'] += ['LoSo (16 folds)'] * nscores
for key, val in list(best_models['10-fold'].items()):
scores = kfold.loc[kfold.params.str.contains(val), 'mean_auc'].values.ravel().tolist()
nscores = len(scores)
newdict['AUC'] += scores
newdict['Classifier'] += [modelnames[key]] * nscores
newdict['Split scheme'] += ['10-fold'] * nscores
newdf = pd.DataFrame(newdict).sort_values(by=['Split scheme', 'Classifier'])
def plot_cv_outer(data, score='auc', zscored=0, ax=None, ds030_score=None,
split_type='LoSo', color='dodgerblue'):
if ax is None:
ax = plt.gca()
outer_score = data.loc[data[score].notnull(), [score, 'zscored']]
sn.distplot(outer_score.loc[outer_score.zscored==zscored, score],
hist=True, norm_hist=True, ax=ax, color=color, label=split_type)
ax.set_xlim([0.4, 1.0])
ax.grid(False)
ax.set_yticklabels([])
mean = outer_score.loc[outer_score.zscored==zscored, score].mean()
std = outer_score.loc[outer_score.zscored==zscored, score].std()
mean_coord = draw_line(mean, ax=ax, color=color, lw=2.0, marker='o', extend=True)
ymax = ax.get_ylim()[1]
draw_line(mean - std, ax=ax, color=color, extend=True)
draw_line(mean + std, ax=ax, color=color, extend=True)
ax.annotate(
'$\mu$=%0.3f' % mean, xy=(mean_coord[0], 0.75*ymax), xytext=(-35, 30),
textcoords='offset points', va='center', color='w', size=14,
bbox=dict(boxstyle='round', fc=color, ec='none', color='none', lw=0),
arrowprops=dict(
arrowstyle='wedge,tail_width=0.8', lw=0, patchA=None, patchB=None,
fc=color, ec='none', relpos=(0.5, 0.5)))
sigmay = 0.70*ymax
ax.annotate(s='', xy=(mean - std, sigmay), xytext=(mean + std, sigmay), arrowprops=dict(arrowstyle='<->'))
ax.annotate(
'$2\sigma$=%0.3f' % (2 * std), xy=(mean_coord[0], 0.70*ymax), xytext=(-25, -12),
textcoords='offset points', va='center', color='k', size=12,
bbox=dict(boxstyle='round', fc='w', ec='none', color='none', alpha=.7, lw=0))
if ds030_score is not None:
ds030_coord = draw_line(ds030_score, ax=ax, color='k', marker='o')
ax.annotate(
'DS030', xy=ds030_coord, xytext=(-100, 0),
textcoords='offset points', va='center', color='w', size=16,
bbox=dict(boxstyle='round', fc=color, ec='none', color='none', lw=0),
arrowprops=dict(
arrowstyle='wedge,tail_width=0.8', lw=0, patchA=None, patchB=None,
fc=color, ec='none', relpos=(0.5, 0.5)))
def draw_line(score, ax=None, color='k', marker=None, lw=.7, extend=False):
if ax is None:
ax = plt.gca()
if score > 1.0:
score = 1.0
coords = [score, -1]
pdf_points = ax.lines[0].get_data()
coords[1] = np.interp([coords[0]], pdf_points[0], pdf_points[1])
if extend:
ax.axvline(coords[0], ymin=coords[1] / ax.get_ylim()[1], ymax=0.75, color='gray', lw=.7)
ax.axvline(coords[0], ymin=coords[1] / ax.get_ylim()[1], ymax=0, color=color, marker=marker, markevery=2,
markeredgewidth=1.5, markerfacecolor='w', markeredgecolor=color, lw=lw)
return coords
sn.set(style="whitegrid")
fig = plt.figure(figsize=(20, 8))
ax1 = plt.subplot2grid((2,4), (0,0), colspan=2, rowspan=2)
sn.violinplot(x='Classifier', y='AUC', hue='Split scheme', data=newdf, split=True,
palette=['dodgerblue', 'darkorange'], ax=ax1)
ax1.set_ylim([0.70, 1.0])
ax1.set_ylabel('AUC')
ax1.set_xlabel('Model')
ax1.set_title('Model selection - Inner loop of nested cross-validation')
ax2 = plt.subplot2grid((2,4), (0, 2))
plot_cv_outer(kfold_outer, zscored=0, score='auc', ax=ax2, ds030_score=0.695, split_type='10-fold')
ax2.set_xlabel('')
ax2.legend()
ax2.set_title('Evaluation - Outer loop of nested cross-validation')
ax2.title.set_position([1.1, 1.0])
ax3 = plt.subplot2grid((2,4), (1, 2))
plot_cv_outer(loso_outer, zscored=0, score='auc', ax=ax3, ds030_score=0.695, color='darkorange', split_type='LoSo (17 folds)')
ax3.legend()
ax3.set_xlabel('AUC')
ax4 = plt.subplot2grid((2,4), (0, 3))
plot_cv_outer(kfold_outer, zscored=0, score='acc', ax=ax4, ds030_score=0.7283, split_type='10-fold')
ax4.set_xlabel('')
ax4.legend()
ax5 = plt.subplot2grid((2,4), (1, 3))
plot_cv_outer(loso_outer, zscored=0, score='acc', ax=ax5, ds030_score=0.7283, color='darkorange', split_type='LoSo (17 folds)')
ax5.legend()
ax5.set_xlabel('Accuracy')
fig.savefig(op.join(out_path, 'crossvalidation.pdf'),
bbox_inches='tight', pad_inches=0, dpi=300)
zscoreddf = loso_outer.loc[loso_outer.zscored == 0, ['auc', 'acc', 'site']]
palette = sn.color_palette("cubehelix", len(set(zscoreddf.site)))
sn.pairplot(zscoreddf.loc[zscoreddf.auc.notnull(), ['auc', 'acc', 'site']], hue='site', palette=palette)
sites = sorted(list(set(loso_outer.site.ravel().tolist())))
palette = sn.color_palette("husl", len(sites))
fig = plt.figure()
for i, site in enumerate(sites):
sitedf = loso_outer.loc[loso_outer.site == site]
accdf = sitedf.loc[sitedf.zscored==0]
sn.distplot(accdf.acc.values.ravel(), bins=20, kde=0, label=site, color=palette[i])
fig.gca().legend()
fig.gca().set_xlim([0.5, 1.0])