#!/usr/bin/env python
# coding: utf-8

# In[ ]:

get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

import matplotlib.pyplot as plt
import os.path as op
import pandas as pd
import numpy as np
import seaborn as sn

sn.set(style="whitegrid")


# In[ ]:

from mriqc.classifier import data as mcd

# NOTE(review): x_path and y_path are not defined anywhere in this script;
# they must be set (e.g. in an earlier interactive cell) before running this.
abide, _ = mcd.read_dataset(x_path, y_path, rate_label='rater_1')

sites = list(sorted(set(abide.site.values.ravel())))

# One LaTeX table row per site: median +/- range of the image matrix size
# (integers) followed by median +/- range of the spacing (two decimals).
# Each axis must index its own range entry (0, 1, 2).
fmt = (
    r'{site} & \pixmat{{{size[0]:d}$\pm${sr[0]:d}}}'
    r'{{{size[1]:d}$\pm${sr[1]:d}}}'
    r'{{{size[2]:d}$\pm${sr[2]:d}}}'
)
fmt += (
    r'& \pixmat[mm]{{{sp[0]:.2f}$\pm${spr[0]:.2f}}}'
    r'{{{sp[1]:.2f}$\pm${spr[1]:.2f}}}'
    r'{{{sp[2]:.2f}$\pm${spr[2]:.2f}}}'
)

geometry_cols = ['size_x', 'size_y', 'size_z',
                 'spacing_x', 'spacing_y', 'spacing_z']

for site in sites:
    subabide = abide.loc[abide.site.str.contains(site)]
    medians = np.median(subabide[geometry_cols], axis=0)
    mins = np.abs(medians - np.min(subabide[geometry_cols], axis=0))
    maxs = np.abs(medians - np.max(subabide[geometry_cols], axis=0))
    # Range = largest absolute deviation from the median, per column.
    ranges = np.max(np.vstack((maxs, mins)), axis=0)
    print(
        fmt.format(
            site=site,
            size=tuple(medians[:3].astype(int)),
            sr=tuple(ranges[:3].astype(int)),
            sp=tuple(medians[3:]),
            spr=tuple(ranges[3:]),
        ))


# In[ ]:

#data_path = '/home/oesteban/Google Drive/mriqc'
data_path = '/home/oesteban/tmp/mriqc-ml-tests-2/'
out_path = data_path

loso = pd.read_csv(op.join(data_path, 'cv_loso_inner.csv'), index_col=False)
kfold = pd.read_csv(op.join(data_path, 'cv_kfold_inner.csv'), index_col=False)
kfold_outer = pd.read_csv(op.join(data_path, 'cv_kfold_outer.csv'), index_col=False)
loso_outer = pd.read_csv(op.join(data_path, 'cv_loso_outer.csv'), index_col=False)


def gen_newparams(dataframe):
    """Return a copy of *dataframe* with a prefixed ``params`` column.

    Each ``params`` value becomes ``'<clf>-<zs|nzs> <params>'``, where the
    middle tag is ``'zs'`` for rows with ``zscored == 1`` and ``'nzs'``
    otherwise. The input dataframe is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table with (at least) the columns ``clf``, ``zscored`` and ``params``.

    Returns
    -------
    pandas.DataFrame
        The modified copy.
    """
    thisdf = dataframe.copy()
    thisdf['zscored_str'] = ['nzs'] * len(thisdf['zscored'])
    thisdf.loc[thisdf.zscored == 1, 'zscored_str'] = 'zs'
    thisdf['params'] = thisdf['clf'] + '-' + thisdf['zscored_str'] + ' ' + thisdf['params']
    # The helper column was only needed to build the new params string.
    del thisdf['zscored_str']
    return thisdf


loso = gen_newparams(loso)
kfold = gen_newparams(kfold)


# In[ ]:

loso_models_list = list(set(loso.params.values.ravel().tolist()))
kfold_models_list = list(set(kfold.params.values.ravel().tolist()))

best_param = {}

spstr = ['LoSo', '10-fold']
best_models = {}
for i, split_cv in enumerate([loso, kfold]):
    best_models[spstr[i]] = {}
    splitcols = [col for col in split_cv.columns if col.startswith('split0')]
    for clf in ['svc_linear-nzs', 'svc_rbf-nzs', 'rfc-nzs',
                'svc_linear-zs', 'svc_rbf-zs', 'rfc-zs']:
        # Literal substring matching: these strings are not regex patterns.
        thismodeldf = split_cv.loc[split_cv.params.str.contains(clf, regex=False)]
        max_auc = thismodeldf.mean_auc.max()
        best = thismodeldf.loc[thismodeldf.mean_auc >= max_auc]
        best_list = best.params.values.ravel().tolist()

        if len(best_list) == 1:
            best_models[spstr[i]][clf] = best_list[0]
        else:
            # Tie on the maximum AUC: break it with the mean AUC over all rows
            # sharing each parameter string; fall back to the first candidate
            # when that is tied as well.
            overall_means = [
                thismodeldf.loc[
                    thismodeldf.params.str.contains(pset, regex=False), 'mean_auc'
                ].mean()
                for pset in best_list
            ]
            overall_max = np.max(overall_means)
            if sum([val >= overall_max for val in overall_means]) == 1:
                best_models[spstr[i]][clf] = best_list[np.argmax(overall_means)]
            else:
                best_models[spstr[i]][clf] = best_list[0]

newdict = {'AUC': [], 'Classifier': [], 'Split scheme': []}

modelnames = {
    'rfc-nzs': 'RFC-nzs',
    'rfc-zs': 'RFC-zs',
    'svc_linear-nzs': 'SVC_lin-nzs',
    'svc_linear-zs': 'SVC_lin-zs',
    'svc_rbf-nzs': 'SVC_rbf-nzs',
    'svc_rbf-zs': 'SVC_rbf-zs',
}

for key, val in list(best_models['LoSo'].items()):
    scores = loso.loc[
        loso.params.str.contains(val, regex=False), 'mean_auc'
    ].values.ravel().tolist()
    nscores = len(scores)

    newdict['AUC'] += scores
    newdict['Classifier'] += [modelnames[key]] * nscores
    newdict['Split scheme'] += ['LoSo (16 folds)'] * nscores

for key, val in list(best_models['10-fold'].items()):
    scores = kfold.loc[
        kfold.params.str.contains(val, regex=False), 'mean_auc'
    ].values.ravel().tolist()
    nscores = len(scores)

    newdict['AUC'] += scores
    newdict['Classifier'] += [modelnames[key]] * nscores
    newdict['Split scheme'] += ['10-fold'] * nscores

newdf = pd.DataFrame(newdict).sort_values(by=['Split scheme', 'Classifier'])


# In[ ]:

def plot_cv_outer(data, score='auc', zscored=0, ax=None, ds030_score=None,
                  split_type='LoSo', color='dodgerblue'):
    """Plot the distribution of an outer-loop score with mean/std markers.

    Rows of *data* with a null *score* are dropped and only rows whose
    ``zscored`` column equals *zscored* are plotted. Vertical lines are drawn
    at the mean and at mean +/- one standard deviation, with annotations for
    the mean and for twice the standard deviation.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the *score* column and a ``zscored`` column.
    score : str
        Name of the column to plot.
    zscored : int
        Value of the ``zscored`` column selecting the rows to plot.
    ax : matplotlib axes, optional
        Axes to draw on; defaults to the current axes.
    ds030_score : float, optional
        When given, an extra line labelled 'DS030' is drawn at this value.
    split_type : str
        Label attached to the distribution (shown by ``ax.legend()``).
    color : str
        Color for the distribution, mean line and annotation boxes.
    """
    if ax is None:
        ax = plt.gca()

    outer_score = data.loc[data[score].notnull(), [score, 'zscored']]
    selected = outer_score.loc[outer_score.zscored == zscored, score]

    sn.distplot(selected, hist=True, norm_hist=True, ax=ax, color=color,
                label=split_type)
    ax.set_xlim([0.4, 1.0])
    ax.grid(False)
    ax.set_yticklabels([])

    mean = selected.mean()
    std = selected.std()

    mean_coord = draw_line(mean, ax=ax, color=color, lw=2.0, marker='o', extend=True)
    ymax = ax.get_ylim()[1]

    draw_line(mean - std, ax=ax, color=color, extend=True)
    draw_line(mean + std, ax=ax, color=color, extend=True)

    ax.annotate(
        r'$\mu$=%0.3f' % mean, xy=(mean_coord[0], 0.75*ymax), xytext=(-35, 30),
        textcoords='offset points', va='center', color='w', size=14,
        bbox=dict(boxstyle='round', fc=color, ec='none', color='none', lw=0),
        arrowprops=dict(
            arrowstyle='wedge,tail_width=0.8', lw=0, patchA=None, patchB=None,
            fc=color, ec='none', relpos=(0.5, 0.5)))

    sigmay = 0.70*ymax
    # Empty text passed positionally: the keyword for it differs between
    # Matplotlib versions (``s`` vs ``text``).
    ax.annotate('', xy=(mean - std, sigmay), xytext=(mean + std, sigmay),
                arrowprops=dict(arrowstyle='<->'))
    ax.annotate(
        r'$2\sigma$=%0.3f' % (2 * std), xy=(mean_coord[0], 0.70*ymax), xytext=(-25, -12),
        textcoords='offset points', va='center', color='k', size=12,
        bbox=dict(boxstyle='round', fc='w', ec='none', color='none', alpha=.7, lw=0))

    if ds030_score is not None:
        ds030_coord = draw_line(ds030_score, ax=ax, color='k', marker='o')
        ax.annotate(
            'DS030', xy=ds030_coord, xytext=(-100, 0),
            textcoords='offset points', va='center', color='w', size=16,
            bbox=dict(boxstyle='round', fc=color, ec='none', color='none', lw=0),
            arrowprops=dict(
                arrowstyle='wedge,tail_width=0.8', lw=0, patchA=None, patchB=None,
                fc=color, ec='none', relpos=(0.5, 0.5)))


def draw_line(score, ax=None, color='k', marker=None, lw=.7, extend=False):
    """Draw a vertical line at *score* up to the first curve plotted on *ax*.

    The height is obtained by linearly interpolating the data of
    ``ax.lines[0]`` at *score* (values above 1.0 are clipped to 1.0), so a
    line must already have been plotted on the axes.

    Parameters
    ----------
    score : float
        Position of the line on the x axis.
    ax : matplotlib axes, optional
        Axes to draw on; defaults to the current axes.
    color, marker, lw
        Style of the main line.
    extend : bool
        When true, also draw a thin gray line from the curve up to 0.75 of
        the axes height.

    Returns
    -------
    list
        ``[x, y]`` with the (clipped) score and the interpolated height,
        both as plain floats/scalars.
    """
    if ax is None:
        ax = plt.gca()

    if score > 1.0:
        score = 1.0

    xdata, ydata = ax.lines[0].get_data()
    # Scalar interpolation keeps the y coordinate a plain float so it can be
    # used directly as an ``xy`` component and as an axes fraction.
    coords = [score, float(np.interp(score, xdata, ydata))]

    # Curve height as a fraction of the upper y-limit
    # (assumes the lower y-limit is 0 -- TODO confirm).
    fraction = coords[1] / ax.get_ylim()[1]

    if extend:
        ax.axvline(coords[0], ymin=fraction, ymax=0.75, color='gray', lw=.7)

    ax.axvline(coords[0], ymin=fraction, ymax=0, color=color,
               marker=marker, markevery=2, markeredgewidth=1.5,
               markerfacecolor='w', markeredgecolor=color, lw=lw)
    return coords


# In[ ]:

sn.set(style="whitegrid")

fig = plt.figure(figsize=(20, 8))
ax1 = plt.subplot2grid((2, 4), (0, 0), colspan=2, rowspan=2)

sn.violinplot(x='Classifier', y='AUC', hue='Split scheme', data=newdf, split=True,
              palette=['dodgerblue', 'darkorange'], ax=ax1)
ax1.set_ylim([0.70, 1.0])
ax1.set_ylabel('AUC')
ax1.set_xlabel('Model')
ax1.set_title('Model selection - Inner loop of nested cross-validation')

ax2 = plt.subplot2grid((2, 4), (0, 2))
plot_cv_outer(kfold_outer, zscored=0, score='auc', ax=ax2, ds030_score=0.695,
              split_type='10-fold')
ax2.set_xlabel('')
ax2.legend()
ax2.set_title('Evaluation - Outer loop of nested cross-validation')
ax2.title.set_position([1.1, 1.0])

ax3 = plt.subplot2grid((2, 4), (1, 2))
plot_cv_outer(loso_outer, zscored=0, score='auc', ax=ax3, ds030_score=0.695,
              color='darkorange', split_type='LoSo (17 folds)')
ax3.legend()
ax3.set_xlabel('AUC')

ax4 = plt.subplot2grid((2, 4), (0, 3))
plot_cv_outer(kfold_outer, zscored=0, score='acc', ax=ax4, ds030_score=0.7283,
              split_type='10-fold')
ax4.set_xlabel('')
ax4.legend()

ax5 = plt.subplot2grid((2, 4), (1, 3))
plot_cv_outer(loso_outer, zscored=0, score='acc', ax=ax5, ds030_score=0.7283,
              color='darkorange', split_type='LoSo (17 folds)')
ax5.legend()
ax5.set_xlabel('Accuracy')

fig.savefig(op.join(out_path, 'crossvalidation.pdf'),
            bbox_inches='tight', pad_inches=0, dpi=300)


# In[ ]:

zscoreddf = loso_outer.loc[loso_outer.zscored == 0, ['auc', 'acc', 'site']]
palette = sn.color_palette("cubehelix", len(set(zscoreddf.site)))
sn.pairplot(zscoreddf.loc[zscoreddf.auc.notnull(), ['auc', 'acc', 'site']],
            hue='site', palette=palette)


# In[ ]:

sites = sorted(list(set(loso_outer.site.values.ravel().tolist())))
palette = sn.color_palette("husl", len(sites))
fig = plt.figure()
for i, site in enumerate(sites):
    sitedf = loso_outer.loc[loso_outer.site == site]
    accdf = sitedf.loc[sitedf.zscored == 0]
    sn.distplot(accdf.acc.values.ravel(), bins=20, kde=0, label=site, color=palette[i])

fig.gca().legend()
fig.gca().set_xlim([0.5, 1.0])