This notebook is associated to the paper:
Esteban O, Birman D, Schaer M, Koyejo OO, Poldrack RA, Gorgolewski KJ; MRIQC: Predicting Quality in Manual MRI Assessment Protocols Using No-Reference Image Quality Measures; PLoS ONE 12(9): e0184661; doi:10.1371/journal.pone.0184661.
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os.path as op
import numpy as np
import pandas as pd
from pkg_resources import resource_filename as pkgrf
from mriqc.viz import misc as mviz
from mriqc.classifier.data import read_dataset, combine_datasets
# Where the outputs should be saved
outputs_path = '../../mriqc-data/'
# Path to ABIDE's BIDS structure
abide_path = '/home/oesteban/Data/ABIDE/'
# Path to DS030's BIDS structure
ds030_path = '/home/oesteban/Data/ds030/'
Read some data (from mriqc package)
x_path = pkgrf('mriqc', 'data/csv/x_abide.csv')
y_path = pkgrf('mriqc', 'data/csv/y_abide.csv')
ds030_x_path = pkgrf('mriqc', 'data/csv/x_ds030.csv')
ds030_y_path = pkgrf('mriqc', 'data/csv/y_ds030.csv')
rater_types = {'rater_1': float, 'rater_2': float, 'rater_3': float}
mdata = pd.read_csv(y_path, index_col=False, dtype=rater_types)
sites = list(sorted(list(set(mdata.site.values.ravel().tolist()))))
Shows a couple of subpar datasets from the ABIDE dataset
out_file = op.join(outputs_path, 'figures', 'fig01-artifacts.svg')
mviz.figure1(
op.join(abide_path, 'sub-50137', 'anat', 'sub-50137_T1w.nii.gz'),
op.join(abide_path, 'sub-50110', 'anat', 'sub-50110_T1w.nii.gz'),
out_file)
out_file_pdf = out_file[:4] + '.pdf'
!rsvg-convert -f pdf -o $out_file_pdf $out_file
This code was use to generate the second figure
from mriqc.classifier.sklearn import preprocessing as mcsp
# Concatenate ABIDE & DS030
fulldata = combine_datasets([
(x_path, y_path, 'ABIDE'),
(ds030_x_path, ds030_y_path, 'DS030'),
])
# Names of all features
features =[
'cjv', 'cnr', 'efc', 'fber',
'fwhm_avg', 'fwhm_x', 'fwhm_y', 'fwhm_z',
'icvs_csf', 'icvs_gm', 'icvs_wm',
'inu_med', 'inu_range',
'qi_1', 'qi_2',
'rpve_csf', 'rpve_gm', 'rpve_wm',
'size_x', 'size_y', 'size_z',
'snr_csf', 'snr_gm', 'snr_total', 'snr_wm',
'snrd_csf', 'snrd_gm', 'snrd_total', 'snrd_wm',
'spacing_x', 'spacing_y', 'spacing_z',
'summary_bg_k', 'summary_bg_mad', 'summary_bg_mean', 'summary_bg_median', 'summary_bg_n', 'summary_bg_p05', 'summary_bg_p95', 'summary_bg_stdv',
'summary_csf_k', 'summary_csf_mad', 'summary_csf_mean', 'summary_csf_median', 'summary_csf_n', 'summary_csf_p05', 'summary_csf_p95', 'summary_csf_stdv',
'summary_gm_k', 'summary_gm_mad', 'summary_gm_mean', 'summary_gm_median', 'summary_gm_n', 'summary_gm_p05', 'summary_gm_p95', 'summary_gm_stdv',
'summary_wm_k', 'summary_wm_mad', 'summary_wm_mean', 'summary_wm_median', 'summary_wm_n', 'summary_wm_p05', 'summary_wm_p95', 'summary_wm_stdv',
'tpm_overlap_csf', 'tpm_overlap_gm', 'tpm_overlap_wm',
'wm2max'
]
# Names of features that can be normalized
coi = [
'cjv', 'cnr', 'efc', 'fber', 'fwhm_avg', 'fwhm_x', 'fwhm_y', 'fwhm_z',
'snr_csf', 'snr_gm', 'snr_total', 'snr_wm', 'snrd_csf', 'snrd_gm', 'snrd_total', 'snrd_wm',
'summary_csf_mad', 'summary_csf_mean', 'summary_csf_median', 'summary_csf_p05', 'summary_csf_p95', 'summary_csf_stdv', 'summary_gm_k', 'summary_gm_mad', 'summary_gm_mean', 'summary_gm_median', 'summary_gm_p05', 'summary_gm_p95', 'summary_gm_stdv', 'summary_wm_k', 'summary_wm_mad', 'summary_wm_mean', 'summary_wm_median', 'summary_wm_p05', 'summary_wm_p95', 'summary_wm_stdv'
]
# Plot batches
fig = mviz.plot_batches(fulldata, cols=list(reversed(coi)),
out_file=op.join(outputs_path, 'figures/fig02-batches-a.pdf'))
# Apply new site-wise scaler
scaler = mcsp.BatchRobustScaler(by='site', columns=coi)
scaled = scaler.fit_transform(fulldata)
fig = mviz.plot_batches(scaled, cols=coi, site_labels='right',
out_file=op.join(outputs_path, 'figures/fig02-batches-b.pdf'))
In this figure we evaluate the inter-observer agreement between both raters on the 100 data points overlapping of ABIDE. Also the Cohen's Kappa is computed.
from sklearn.metrics import cohen_kappa_score
overlap = mdata[np.all(~np.isnan(mdata[['rater_1', 'rater_2']]), axis=1)]
y1 = overlap.rater_1.values.ravel().tolist()
y2 = overlap.rater_2.values.ravel().tolist()
fig = mviz.inter_rater_variability(y1, y2, out_file=op.join(outputs_path, 'figures', 'fig02-irv.pdf'))
print("Cohen's Kappa %f" % cohen_kappa_score(y1, y2))
y1 = overlap.rater_1.values.ravel()
y1[y1 == 0] = 1
y2 = overlap.rater_2.values.ravel()
y2[y2 == 0] = 1
print("Cohen's Kappa (binarized): %f" % cohen_kappa_score(y1, y2))
import matplotlib.pyplot as plt
import seaborn as sn
rfc_acc=[0.842, 0.815, 0.648, 0.609, 0.789, 0.761, 0.893, 0.833, 0.842, 0.767, 0.806, 0.850, 0.878, 0.798, 0.559, 0.881, 0.375]
svc_lin_acc=[0.947, 0.667, 0.870, 0.734, 0.754, 0.701, 0.750, 0.639, 0.877, 0.767, 0.500, 0.475, 0.837, 0.768, 0.717, 0.050, 0.429]
svc_rbf_acc=[0.947, 0.852, 0.500, 0.578, 0.772, 0.712, 0.821, 0.583, 0.912, 0.767, 0.500, 0.450, 0.837, 0.778, 0.441, 0.950, 0.339]
df = pd.DataFrame({
'site': list(range(len(sites))) * 3,
'accuracy': rfc_acc + svc_lin_acc + svc_rbf_acc,
'Model': ['RFC'] * len(sites) + ['SVC_lin'] * len(sites) + ['SVC_rbf'] * len(sites)
})
x = np.arange(len(sites))
data = list(zip(rfc_acc, svc_lin_acc, svc_rbf_acc))
dim = len(data[0])
w = 0.81
dimw = w / dim
colors = ['dodgerblue', 'orange', 'darkorange']
allvals = [rfc_acc, svc_lin_acc, svc_rbf_acc]
fig = plt.figure(figsize=(10, 3))
ax2 = plt.subplot2grid((1, 4), (0, 3))
plot = sn.violinplot(data=df, x='Model', y="accuracy", ax=ax2, palette=colors, bw=.1, linewidth=.7)
for i in range(dim):
ax2.axhline(np.average(allvals[i]), ls='--', color=colors[i], lw=.8)
# ax2.axhline(np.percentile(allvals[i], 50), ls='--', color=colors[i], lw=.8)
# sn.swarmplot(x="model", y="accuracy", data=df, color="w", alpha=.5, ax=ax2);
ax2.yaxis.tick_right()
ax2.set_ylabel('')
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=40)
ax2.set_ylim([0.0, 1.0])
ax1 = plt.subplot2grid((1, 4), (0, 0), colspan=3)
for i in range(dim):
y = [d[i] for d in data]
b = ax1.bar(x + i * dimw, y, dimw, bottom=0.001, color=colors[i], alpha=.6)
print(np.average(allvals[i]), np.std(allvals[i]))
ax1.axhline(np.average(allvals[i]), ls='--', color=colors[i], lw=.8)
plt.xlim([-0.2, 16.75])
plt.grid(False)
_ = plt.xticks(np.arange(0, 17) + 0.33, sites, rotation='vertical')
ax1.set_ylim([0.0, 1.0])
ax1.set_ylabel('Accuracy (ACC)')
fig.savefig(op.join(outputs_path, 'figures/fig05-acc.pdf'), bbox_inches='tight', dpi=300)
rfc_roc_auc=[0.597, 0.380, 0.857, 0.610, 0.698, 0.692, 0.963, 0.898, 0.772, 0.596, 0.873, 0.729, 0.784, 0.860, 0.751, 0.900, 0.489]
svc_lin_roc_auc=[0.583, 0.304, 0.943, 0.668, 0.691, 0.754, 1.000, 0.778, 0.847, 0.590, 0.857, 0.604, 0.604, 0.838, 0.447, 0.650, 0.501]
svc_rbf_roc_auc=[0.681, 0.217, 0.827, 0.553, 0.738, 0.616, 0.889, 0.813, 0.845, 0.658, 0.779, 0.493, 0.726, 0.510, 0.544, 0.500, 0.447]
df = pd.DataFrame({
'site': list(range(len(sites))) * 3,
'auc': rfc_roc_auc + svc_lin_roc_auc + svc_rbf_roc_auc,
'Model': ['RFC'] * len(sites) + ['SVC_lin'] * len(sites) + ['SVC_rbf'] * len(sites)
})
x = np.arange(len(sites))
data = list(zip(rfc_roc_auc, svc_lin_roc_auc, svc_rbf_roc_auc))
dim = len(data[0])
w = 0.81
dimw = w / dim
colors = ['dodgerblue', 'orange', 'darkorange']
allvals = [rfc_roc_auc, svc_lin_roc_auc, svc_rbf_roc_auc]
fig = plt.figure(figsize=(10, 3))
ax2 = plt.subplot2grid((1, 4), (0, 3))
plot = sn.violinplot(data=df, x='Model', y="auc", ax=ax2, palette=colors, bw=.1, linewidth=.7)
for i in range(dim):
ax2.axhline(np.average(allvals[i]), ls='--', color=colors[i], lw=.8)
ax2.yaxis.tick_right()
ax2.set_ylabel('')
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=40)
ax2.set_ylim([0.0, 1.0])
ax1 = plt.subplot2grid((1, 4), (0, 0), colspan=3)
for i in range(dim):
y = [d[i] for d in data]
b = ax1.bar(x + i * dimw, y, dimw, bottom=0.001, color=colors[i], alpha=.6)
print(np.average(allvals[i]), np.std(allvals[i]))
ax1.axhline(np.average(allvals[i]), ls='--', color=colors[i], lw=.8)
plt.xlim([-0.2, 16.75])
plt.grid(False)
_ = plt.xticks(np.arange(0, 17) + 0.33, sites, rotation='vertical')
ax1.set_ylim([0.0, 1.0])
ax1.set_ylabel('Area under the curve (AUC)')
fig.savefig(op.join(outputs_path, 'figures/fig05-auc.pdf'), bbox_inches='tight', dpi=300)
from sklearn.metrics import confusion_matrix
pred_file = op.abspath(op.join(
'..', 'mriqc/data/csv',
'mclf_run-20170724-191452_mod-rfc_ver-0.9.7-rc8_class-2_cv-loso_data-test_pred.csv'))
pred_y = pd.read_csv(pred_file)
true_y = pd.read_csv(ds030_y_path)
true_y.rater_1 *= -1
true_y.rater_1[true_y.rater_1 < 0] = 0
print(confusion_matrix(true_y.rater_1.tolist(), pred_y.pred_y.values.ravel().tolist(), labels=[0, 1]))
import seaborn as sn
from sklearn.externals.joblib import load as loadpkl
sn.set_style("white")
# Get the RFC
estimator = loadpkl(pkgrf('mriqc', 'data/mclf_run-20170724-191452_mod-rfc_ver-0.9.7-rc8_class-2_cv-loso_data-train_estimator.pklz'))
forest = estimator.named_steps['rfc']
# Features selected in cross-validation
features = [
"cjv", "cnr", "efc", "fber", "fwhm_avg", "fwhm_x", "fwhm_y", "fwhm_z", "icvs_csf", "icvs_gm", "icvs_wm",
"qi_1", "qi_2", "rpve_csf", "rpve_gm", "rpve_wm", "snr_csf", "snr_gm", "snr_total", "snr_wm", "snrd_csf",
"snrd_gm", "snrd_total", "snrd_wm", "summary_bg_k", "summary_bg_stdv", "summary_csf_k", "summary_csf_mad",
"summary_csf_mean", "summary_csf_median", "summary_csf_p05", "summary_csf_p95", "summary_csf_stdv",
"summary_gm_k", "summary_gm_mad", "summary_gm_mean", "summary_gm_median", "summary_gm_p05", "summary_gm_p95",
"summary_gm_stdv", "summary_wm_k", "summary_wm_mad", "summary_wm_mean", "summary_wm_median", "summary_wm_p05",
"summary_wm_p95", "summary_wm_stdv", "tpm_overlap_csf", "tpm_overlap_gm", "tpm_overlap_wm"]
nft = len(features)
forest = estimator.named_steps['rfc']
importances = np.median([tree.feature_importances_ for tree in forest.estimators_],
axis=0)
# importances = np.median(, axis=0)
indices = np.argsort(importances)[::-1]
df = {'Feature': [], 'Importance': []}
for tree in forest.estimators_:
for i in indices:
df['Feature'] += [features[i]]
df['Importance'] += [tree.feature_importances_[i]]
fig = plt.figure(figsize=(20, 6))
# plt.title("Feature importance plot")
sn.boxplot(x='Feature', y='Importance', data=pd.DataFrame(df), linewidth=1, notch=True)
plt.xlabel('Features selected (%d)' % len(features))
# plt.bar(range(nft), importances[indices],
# color="r", yerr=std[indices], align="center")
plt.xticks(range(nft))
plt.gca().set_xticklabels([features[i] for i in indices], rotation=90)
plt.xlim([-1, nft])
plt.show()
fig.savefig(op.join(outputs_path, 'figures', 'fig06-exp2-fi.pdf'),
bbox_inches='tight', pad_inches=0, dpi=300)
fn = ['10225', '10235', '10316', '10339', '10365', '10376',
'10429', '10460', '10506', '10527', '10530', '10624',
'10696', '10891', '10948', '10968', '10977', '11050',
'11052', '11142', '11143', '11149', '50004', '50005',
'50008', '50010', '50016', '50027', '50029', '50033',
'50034', '50036', '50043', '50047', '50049', '50053',
'50054', '50055', '50085', '60006', '60010', '60012',
'60014', '60016', '60021', '60046', '60052', '60072',
'60073', '60084', '60087', '70051', '70060', '70072']
fp = ['10280', '10455', '10523', '11112', '50020', '50048',
'50052', '50061', '50073', '60077']
fn_clear = [
('10316', 98),
('10968', 122),
('11050', 110),
('11149', 111)
]
import matplotlib.pyplot as plt
from mriqc.viz.utils import plot_slice
import nibabel as nb
for im, z in fn_clear:
image_path = op.join(ds030_path, 'sub-%s' % im, 'anat', 'sub-%s_T1w.nii.gz' % im)
imdata = nb.load(image_path).get_data()
fig, ax = plt.subplots()
plot_slice(imdata[..., z], annotate=True)
fig.savefig(op.join(outputs_path, 'figures', 'fig-06_sub-%s_slice-%03d.svg' % (im, z)),
dpi=300, bbox_inches='tight')
plt.clf()
plt.close()
fp_clear = [
('10455', 140),
('50073', 162),
]
for im, z in fp_clear:
image_path = op.join(ds030_path, 'sub-%s' % im, 'anat', 'sub-%s_T1w.nii.gz' % im)
imdata = nb.load(image_path).get_data()
fig, ax = plt.subplots()
plot_slice(imdata[..., z], annotate=True)
fig.savefig(op.join(outputs_path, 'figures', 'fig-06_sub-%s_slice-%03d.svg' % (im, z)),
dpi=300, bbox_inches='tight')
plt.clf()
plt.close()