Notebook

Generation of tables and figures of MRIQC paper¶

This notebook is associated to the paper:

Esteban O, Birman D, Schaer M, Koyejo OO, Poldrack RA, Gorgolewski KJ; MRIQC: Predicting Quality in Manual MRI Assessment Protocols Using No-Reference Image Quality Measures; PLoS ONE 12(9): e0184661; doi:10.1371/journal.pone.0184661.

In [ ]:

%matplotlib inline
%load_ext autoreload
%autoreload 2
import os.path as op
import numpy as np
import pandas as pd
from pkg_resources import resource_filename as pkgrf

from mriqc.viz import misc as mviz
from mriqc.classifier.data import read_dataset, combine_datasets

# Where the outputs should be saved
outputs_path = '../../mriqc-data/'

# Path to ABIDE's BIDS structure
abide_path = '/home/oesteban/Data/ABIDE/'
# Path to DS030's BIDS structure
ds030_path = '/home/oesteban/Data/ds030/'

Read some data (from mriqc package)

In [ ]:

x_path = pkgrf('mriqc', 'data/csv/x_abide.csv')
y_path = pkgrf('mriqc', 'data/csv/y_abide.csv')
ds030_x_path = pkgrf('mriqc', 'data/csv/x_ds030.csv')
ds030_y_path = pkgrf('mriqc', 'data/csv/y_ds030.csv')

rater_types = {'rater_1': float, 'rater_2': float, 'rater_3': float}
mdata = pd.read_csv(y_path, index_col=False, dtype=rater_types)

sites = list(sorted(list(set(mdata.site.values.ravel().tolist()))))

Figure 1: artifacts in MRI¶

Shows a couple of subpar datasets from the ABIDE dataset

In [ ]:

out_file = op.join(outputs_path, 'figures', 'fig01-artifacts.svg')
mviz.figure1(
    op.join(abide_path, 'sub-50137', 'anat', 'sub-50137_T1w.nii.gz'),
    op.join(abide_path, 'sub-50110', 'anat', 'sub-50110_T1w.nii.gz'),
    out_file)

In [ ]:

out_file_pdf = out_file[:4] + '.pdf'
!rsvg-convert -f pdf -o $out_file_pdf $out_file

Figure 2: batch effects¶

This code was use to generate the second figure

In [ ]:

from mriqc.classifier.sklearn import preprocessing as mcsp

# Concatenate ABIDE & DS030
fulldata = combine_datasets([
    (x_path, y_path, 'ABIDE'),
    (ds030_x_path, ds030_y_path, 'DS030'),
])

# Names of all features
features =[
    'cjv', 'cnr', 'efc', 'fber',
    'fwhm_avg', 'fwhm_x', 'fwhm_y', 'fwhm_z',
    'icvs_csf', 'icvs_gm', 'icvs_wm',
    'inu_med', 'inu_range', 
    'qi_1', 'qi_2',
    'rpve_csf', 'rpve_gm', 'rpve_wm',
    'size_x', 'size_y', 'size_z',
    'snr_csf', 'snr_gm', 'snr_total', 'snr_wm',
    'snrd_csf', 'snrd_gm', 'snrd_total', 'snrd_wm',
    'spacing_x', 'spacing_y', 'spacing_z',
    'summary_bg_k', 'summary_bg_mad', 'summary_bg_mean', 'summary_bg_median', 'summary_bg_n', 'summary_bg_p05', 'summary_bg_p95', 'summary_bg_stdv',
    'summary_csf_k', 'summary_csf_mad', 'summary_csf_mean', 'summary_csf_median', 'summary_csf_n', 'summary_csf_p05', 'summary_csf_p95', 'summary_csf_stdv',
    'summary_gm_k', 'summary_gm_mad', 'summary_gm_mean', 'summary_gm_median', 'summary_gm_n', 'summary_gm_p05', 'summary_gm_p95', 'summary_gm_stdv',
    'summary_wm_k', 'summary_wm_mad', 'summary_wm_mean', 'summary_wm_median', 'summary_wm_n', 'summary_wm_p05', 'summary_wm_p95', 'summary_wm_stdv',
    'tpm_overlap_csf', 'tpm_overlap_gm', 'tpm_overlap_wm',
    'wm2max'
]

# Names of features that can be normalized
coi = [
    'cjv', 'cnr', 'efc', 'fber', 'fwhm_avg', 'fwhm_x', 'fwhm_y', 'fwhm_z',
    'snr_csf', 'snr_gm', 'snr_total', 'snr_wm', 'snrd_csf', 'snrd_gm', 'snrd_total', 'snrd_wm',
    'summary_csf_mad', 'summary_csf_mean', 'summary_csf_median', 'summary_csf_p05', 'summary_csf_p95', 'summary_csf_stdv', 'summary_gm_k', 'summary_gm_mad', 'summary_gm_mean', 'summary_gm_median', 'summary_gm_p05', 'summary_gm_p95', 'summary_gm_stdv', 'summary_wm_k', 'summary_wm_mad', 'summary_wm_mean', 'summary_wm_median', 'summary_wm_p05', 'summary_wm_p95', 'summary_wm_stdv'
]

# Plot batches
fig = mviz.plot_batches(fulldata, cols=list(reversed(coi)),
                        out_file=op.join(outputs_path, 'figures/fig02-batches-a.pdf'))

# Apply new site-wise scaler
scaler = mcsp.BatchRobustScaler(by='site', columns=coi)
scaled = scaler.fit_transform(fulldata)
fig = mviz.plot_batches(scaled, cols=coi, site_labels='right',
                        out_file=op.join(outputs_path, 'figures/fig02-batches-b.pdf'))

Figure 3: Inter-rater variability¶

In this figure we evaluate the inter-observer agreement between both raters on the 100 data points overlapping of ABIDE. Also the Cohen's Kappa is computed.

In [ ]:

from sklearn.metrics import cohen_kappa_score
overlap = mdata[np.all(~np.isnan(mdata[['rater_1', 'rater_2']]), axis=1)]
y1 = overlap.rater_1.values.ravel().tolist()
y2 = overlap.rater_2.values.ravel().tolist()
fig = mviz.inter_rater_variability(y1, y2, out_file=op.join(outputs_path, 'figures', 'fig02-irv.pdf'))

print("Cohen's Kappa %f" % cohen_kappa_score(y1, y2))

y1 = overlap.rater_1.values.ravel()
y1[y1 == 0] = 1

y2 = overlap.rater_2.values.ravel()
y2[y2 == 0] = 1
print("Cohen's Kappa (binarized): %f" % cohen_kappa_score(y1, y2))

Figure 5: Model selection¶

In [ ]:

import matplotlib.pyplot as plt
import seaborn as sn
rfc_acc=[0.842, 0.815, 0.648, 0.609, 0.789, 0.761, 0.893, 0.833, 0.842, 0.767, 0.806, 0.850, 0.878, 0.798, 0.559, 0.881, 0.375]
svc_lin_acc=[0.947, 0.667, 0.870, 0.734, 0.754, 0.701, 0.750, 0.639, 0.877, 0.767, 0.500, 0.475, 0.837, 0.768, 0.717, 0.050, 0.429]
svc_rbf_acc=[0.947, 0.852, 0.500, 0.578, 0.772, 0.712, 0.821, 0.583, 0.912, 0.767, 0.500, 0.450, 0.837, 0.778, 0.441, 0.950, 0.339]

df = pd.DataFrame({
    'site': list(range(len(sites))) * 3,
    'accuracy': rfc_acc + svc_lin_acc + svc_rbf_acc,
    'Model': ['RFC'] * len(sites) + ['SVC_lin'] * len(sites) + ['SVC_rbf'] * len(sites)
    
})


x = np.arange(len(sites))
data = list(zip(rfc_acc, svc_lin_acc, svc_rbf_acc))

dim = len(data[0])
w = 0.81
dimw = w / dim

colors = ['dodgerblue', 'orange', 'darkorange']

allvals = [rfc_acc, svc_lin_acc, svc_rbf_acc]

fig = plt.figure(figsize=(10, 3))
ax2 = plt.subplot2grid((1, 4), (0, 3))
plot = sn.violinplot(data=df, x='Model', y="accuracy", ax=ax2, palette=colors, bw=.1, linewidth=.7)
for i in range(dim):
    ax2.axhline(np.average(allvals[i]), ls='--', color=colors[i], lw=.8)
#     ax2.axhline(np.percentile(allvals[i], 50), ls='--', color=colors[i], lw=.8)
# sn.swarmplot(x="model", y="accuracy", data=df, color="w", alpha=.5, ax=ax2);
ax2.yaxis.tick_right()
ax2.set_ylabel('')
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=40)
ax2.set_ylim([0.0, 1.0])

ax1 = plt.subplot2grid((1, 4), (0, 0), colspan=3)    
for i in range(dim):
    y = [d[i] for d in data]
    b = ax1.bar(x + i * dimw, y, dimw, bottom=0.001, color=colors[i], alpha=.6)
    print(np.average(allvals[i]), np.std(allvals[i]))
    ax1.axhline(np.average(allvals[i]), ls='--', color=colors[i], lw=.8)
    
    

plt.xlim([-0.2, 16.75])
plt.grid(False)
_ = plt.xticks(np.arange(0, 17) + 0.33, sites, rotation='vertical')
ax1.set_ylim([0.0, 1.0])
ax1.set_ylabel('Accuracy (ACC)')
fig.savefig(op.join(outputs_path, 'figures/fig05-acc.pdf'), bbox_inches='tight', dpi=300)

In [ ]:

rfc_roc_auc=[0.597, 0.380, 0.857, 0.610, 0.698, 0.692, 0.963, 0.898, 0.772, 0.596, 0.873, 0.729, 0.784, 0.860, 0.751, 0.900, 0.489]
svc_lin_roc_auc=[0.583, 0.304, 0.943, 0.668, 0.691, 0.754, 1.000, 0.778, 0.847, 0.590, 0.857, 0.604, 0.604, 0.838, 0.447, 0.650, 0.501]
svc_rbf_roc_auc=[0.681, 0.217, 0.827, 0.553, 0.738, 0.616, 0.889, 0.813, 0.845, 0.658, 0.779, 0.493, 0.726, 0.510, 0.544, 0.500, 0.447]

df = pd.DataFrame({
    'site': list(range(len(sites))) * 3,
    'auc': rfc_roc_auc + svc_lin_roc_auc + svc_rbf_roc_auc,
    'Model': ['RFC'] * len(sites) + ['SVC_lin'] * len(sites) + ['SVC_rbf'] * len(sites)
    
})

x = np.arange(len(sites))
data = list(zip(rfc_roc_auc, svc_lin_roc_auc, svc_rbf_roc_auc))

dim = len(data[0])
w = 0.81
dimw = w / dim

colors = ['dodgerblue', 'orange', 'darkorange']

allvals = [rfc_roc_auc, svc_lin_roc_auc, svc_rbf_roc_auc]

fig = plt.figure(figsize=(10, 3))
ax2 = plt.subplot2grid((1, 4), (0, 3))
plot = sn.violinplot(data=df, x='Model', y="auc", ax=ax2, palette=colors, bw=.1, linewidth=.7)
for i in range(dim):
    ax2.axhline(np.average(allvals[i]), ls='--', color=colors[i], lw=.8)

ax2.yaxis.tick_right()
ax2.set_ylabel('')
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=40)
ax2.set_ylim([0.0, 1.0])

ax1 = plt.subplot2grid((1, 4), (0, 0), colspan=3)    
for i in range(dim):
    y = [d[i] for d in data]
    b = ax1.bar(x + i * dimw, y, dimw, bottom=0.001, color=colors[i], alpha=.6)
    print(np.average(allvals[i]), np.std(allvals[i]))
    ax1.axhline(np.average(allvals[i]), ls='--', color=colors[i], lw=.8)
    
    

plt.xlim([-0.2, 16.75])
plt.grid(False)
_ = plt.xticks(np.arange(0, 17) + 0.33, sites, rotation='vertical')
ax1.set_ylim([0.0, 1.0])
ax1.set_ylabel('Area under the curve (AUC)')
fig.savefig(op.join(outputs_path, 'figures/fig05-auc.pdf'), bbox_inches='tight', dpi=300)

Evaluation on DS030¶

This section deals with the results obtained on DS030.

Table 4: Confusion matrix¶

In [ ]:

from sklearn.metrics import confusion_matrix

pred_file = op.abspath(op.join(
    '..', 'mriqc/data/csv',
    'mclf_run-20170724-191452_mod-rfc_ver-0.9.7-rc8_class-2_cv-loso_data-test_pred.csv'))

pred_y = pd.read_csv(pred_file)
true_y = pd.read_csv(ds030_y_path)
true_y.rater_1 *= -1
true_y.rater_1[true_y.rater_1 < 0] = 0
print(confusion_matrix(true_y.rater_1.tolist(), pred_y.pred_y.values.ravel().tolist(), labels=[0, 1]))

Figure 6A: Feature importances¶

In [ ]:

import seaborn as sn
from sklearn.externals.joblib import load as loadpkl
sn.set_style("white")

# Get the RFC
estimator = loadpkl(pkgrf('mriqc', 'data/mclf_run-20170724-191452_mod-rfc_ver-0.9.7-rc8_class-2_cv-loso_data-train_estimator.pklz'))
forest = estimator.named_steps['rfc']

# Features selected in cross-validation
features = [
    "cjv", "cnr", "efc", "fber", "fwhm_avg", "fwhm_x", "fwhm_y", "fwhm_z", "icvs_csf", "icvs_gm", "icvs_wm",
    "qi_1", "qi_2", "rpve_csf", "rpve_gm", "rpve_wm", "snr_csf", "snr_gm", "snr_total", "snr_wm", "snrd_csf",
    "snrd_gm", "snrd_total", "snrd_wm", "summary_bg_k", "summary_bg_stdv", "summary_csf_k", "summary_csf_mad",
    "summary_csf_mean", "summary_csf_median", "summary_csf_p05", "summary_csf_p95", "summary_csf_stdv",
    "summary_gm_k", "summary_gm_mad", "summary_gm_mean", "summary_gm_median", "summary_gm_p05", "summary_gm_p95",
    "summary_gm_stdv", "summary_wm_k", "summary_wm_mad", "summary_wm_mean", "summary_wm_median", "summary_wm_p05",
    "summary_wm_p95", "summary_wm_stdv", "tpm_overlap_csf", "tpm_overlap_gm", "tpm_overlap_wm"]

nft = len(features)

forest = estimator.named_steps['rfc']
importances = np.median([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
# importances = np.median(, axis=0)
indices = np.argsort(importances)[::-1]

df = {'Feature': [], 'Importance': []}
for tree in forest.estimators_:
    for i in indices:
        df['Feature'] += [features[i]]
        df['Importance'] += [tree.feature_importances_[i]]
fig = plt.figure(figsize=(20, 6))
# plt.title("Feature importance plot")
sn.boxplot(x='Feature', y='Importance', data=pd.DataFrame(df), linewidth=1, notch=True)
plt.xlabel('Features selected (%d)' % len(features))
# plt.bar(range(nft), importances[indices],
#        color="r", yerr=std[indices], align="center")
plt.xticks(range(nft))
plt.gca().set_xticklabels([features[i] for i in indices], rotation=90)
plt.xlim([-1, nft])
plt.show()
fig.savefig(op.join(outputs_path, 'figures', 'fig06-exp2-fi.pdf'),
            bbox_inches='tight', pad_inches=0, dpi=300)

Figure 6B: Misclassified images of DS030¶

In [ ]:

fn = ['10225', '10235', '10316', '10339', '10365', '10376',
      '10429', '10460', '10506', '10527', '10530', '10624',
      '10696', '10891', '10948', '10968', '10977', '11050',
      '11052', '11142', '11143', '11149', '50004', '50005',
      '50008', '50010', '50016', '50027', '50029', '50033',
      '50034', '50036', '50043', '50047', '50049', '50053',
      '50054', '50055', '50085', '60006', '60010', '60012',
      '60014', '60016', '60021', '60046', '60052', '60072',
      '60073', '60084', '60087', '70051', '70060', '70072']
fp = ['10280', '10455', '10523', '11112', '50020', '50048',
      '50052', '50061', '50073', '60077']

In [ ]:

fn_clear = [
    ('10316', 98),
    ('10968', 122),
    ('11050', 110),
    ('11149', 111)
]

In [ ]:

import matplotlib.pyplot as plt
from mriqc.viz.utils import plot_slice
import nibabel as nb
for im, z in fn_clear:
    image_path = op.join(ds030_path, 'sub-%s' % im, 'anat', 'sub-%s_T1w.nii.gz' % im)
    imdata = nb.load(image_path).get_data()
    
    fig, ax = plt.subplots()
    plot_slice(imdata[..., z], annotate=True)
    fig.savefig(op.join(outputs_path, 'figures', 'fig-06_sub-%s_slice-%03d.svg' % (im, z)),
                dpi=300, bbox_inches='tight')
    plt.clf()
    plt.close()

In [ ]:

fp_clear = [
    ('10455', 140),
    ('50073', 162),
]
for im, z in fp_clear:
    image_path = op.join(ds030_path, 'sub-%s' % im, 'anat', 'sub-%s_T1w.nii.gz' % im)
    imdata = nb.load(image_path).get_data()
    
    fig, ax = plt.subplots()
    plot_slice(imdata[..., z], annotate=True)
    fig.savefig(op.join(outputs_path, 'figures', 'fig-06_sub-%s_slice-%03d.svg' % (im, z)),
                dpi=300, bbox_inches='tight')
    plt.clf()
    plt.close()

In [ ]: