In [2]:

# tools for handling files
import sys
import os

# pandas/numpy for handling data
import pandas as pd
import numpy as np
# seaborn/matplotlib for graphing
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import seaborn as sns
# statistics
from statistics import mean 
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats

# for reading individual telomere length data from files
from ast import literal_eval
# for grabbing individual cells
import more_itertools
# my module containing functions for handling/visualizing/analyzing telomere length/chr rearrangement data
import telomere_methods_rad_patient as trp

# machine learning 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, auc, accuracy_score, r2_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from bayes_opt import BayesianOptimization
import xgboost
import shap

import re
from ast import literal_eval
from matplotlib import lines
from matplotlib.offsetbox import AnchoredText
from sklearn.metrics import r2_score

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# incase reloading modules is required
import importlib
%load_ext autoreload
%autoreload 

# setting darkgrid style for seaborn figures
# sns.set(font='helvetica')
sns.set_style(style="darkgrid",rc= {'patch.edgecolor': 'black'})

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Machine Learning (ML) - predicting mean telomere length post-therapy (4 C) using pre-therapy individual telos¶

Loading/merging data for ML¶

In [75]:

exploded_telos_all_patients_df = pd.read_csv('../data/compiled patient data csv files/exploded_telos_all_patients_df.csv')
all_patients_df = pd.read_csv('../data/compiled patient data csv files/all_patients_df.csv')

In [76]:

# cleaning & combing data; retaining features of interest
telo_data = trp.combine_data(exploded_telos=exploded_telos_all_patients_df, all_patients_df=all_patients_df)

In [77]:

# saving data to stylized table for manuscript

print(telo_data.shape)

example = telo_data.copy()
example.rename({'timepoint':'pre-therapy sample origin', 
                'individual telomeres':'individual telomeres (RFI)'}, axis=1, inplace=True)
example_8 = example[10:16].reset_index(drop=True)

# path=f'../graphs/paper figures/supp figs/view of precleaned individual telomere length dataframe.png'
# trp.render_mpl_table(example_8, col_width=4, path=path)

(128800, 4)

Train/test split¶

In [78]:

telo_test = telo_data.copy()

train_set, test_set = train_test_split(telo_test, test_size=0.2, shuffle=True, stratify=telo_test[['patient id', 
                                                                                                   'timepoint']])

Initializing cleaning/model pipelines¶

In [79]:

clean_process_pipe = Pipeline([('features', trp.make_features(make_log_target=False)), 
                               ('dummies', trp.make_dummies(drop_first=True)),
                               ('cleaner', trp.clean_data(drop_patient_id=True))
                              ])

In [80]:

model = XGBRegressor(n_estimators=200, max_depth=7, learning_rate=0.2, 
                     objective ='reg:squarederror', random_state=1)

xgb_pipe = Pipeline([('XGB', model)
                    ])

full_pipe = Pipeline([('clean_process', clean_process_pipe),
                      ('model', model)
                     ])

Cleaning data with pipeline¶

In [81]:

train_clean = train_set.copy()
test_clean = test_set.copy()

train_clean = clean_process_pipe.fit_transform(train_clean)
test_clean = clean_process_pipe.fit_transform(test_clean)

In [82]:

example8 = train_clean[['timepoint_2 irrad @ 4 Gy', 'individual telomeres', '4 C telo means']].copy()
example = example8.copy()
example.rename({'timepoint_2 irrad @ 4 Gy':'encoded sample origin', 
                'individual telomeres':'individual telomeres (RFI)'}, axis=1, inplace=True)

example_8 = example[10:16].reset_index(drop=True)
path=f'../graphs/paper figures/supp figs/view of cleaned individual telomere length dataframe.png'
trp.render_mpl_table(example_8, col_width=4, path=path)

Random GridSearch for pipeline/model params¶

In [10]:

# #XGBoost params for random grid search
# param_grid = {'XGB__max_depth': [3, 7, 9],
#               'XGB__learning_rate': [0.05, 0.1]}

# X_train = train_clean[['individual telomeres', 'timepoint_1']].copy()
# y_train = train_clean['4 C telo means'].copy()

# folds = KFold(5, shuffle=True, random_state=0)
# df_results, best_estimator = trp.grid_search(X_train, y_train, xgb_pipe, param_grid, 
#                                           scoring='neg_mean_absolute_error', cv=folds, n_iter=2)
# print(best_estimator)

Bayesian Optimization¶

In [11]:

# target = '4 C telo means'
# features = [col for col in train_clean if col != target and col != 'patient id']

# X_train = train_clean[features].copy()
# y_train = train_clean[target].copy()

In [12]:

# pbounds = {
#     'learning_rate': (0.1, 0.2),
#     'n_estimators': (100, 200),
#     'max_depth': (3, 30),
#     'subsample': (.80, 1.0),  
#     'colsample': (1.0, 1.0),  # Change for datasets with lots of features
#     'gamma': (0, 1)}

# def xgboost_hyper_param(learning_rate, n_estimators, max_depth,
#                         subsample, colsample, gamma):
 
#     max_depth = int(max_depth)
#     n_estimators = int(n_estimators)
 
#     clf = XGBRegressor(max_depth=max_depth,
#                        learning_rate=learning_rate,
#                        n_estimators=n_estimators,
#                        gamma=gamma, objective='reg:squarederror')
    
#     return np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='neg_mean_absolute_error'))
 
# optimizer = BayesianOptimization(
#     f=xgboost_hyper_param,
#     pbounds=pbounds,
#     random_state=1,)

# optimizer.maximize(init_points=10, n_iter=5)

In [13]:

# bayesian optimized model
# bo_model = XGBRegressor(n_estimators=260, max_depth=17, learning_rate=0.25,
#                        objective ='reg:squarederror', random_state=0, subsample=.9,
#                        gamma=1.7)

Visualizing model objective¶

In [14]:

# train_viz = train_set.copy()
# train_viz.sort_values(by=['patient id'], axis=0, ascending=True, inplace=True)
# trp.plot_individ_telos_ML_objective(df=train_viz, target='4 C telo means')

Cross validation; MAE & R2 score¶

In [83]:

# predicting mean telo telo post-therapy (4 C) using individual telos
fit_xgb_model, telo_row = trp.cv_score_fit_mae_test(train_set=train_clean, test_set=test_clean,
                                                    model=model, cv=5)

MAE per CV fold: 
[3.14995355 3.13661234 3.23190584 3.27706171 3.15953341] 

MEAN of MAE all folds: 3.1910133720439524
STD of MAE all folds: 0.0542455147393282

MAE of predict_y_test & y_test: 3.193159481699425
R2 between predict_y_test & y_test: 0.8854687344373416

In [84]:

# evaluating model based on N samples in training data
telo_model_df = []

for n in [100, 500, 1000, 2000, 4000, 8000, 16000, 32000, 64000, 103040]:
    fit_xgb_model, telo_row = trp.cv_score_fit_mae_test(train_set=train_clean.sample(n), test_set=test_clean,
                                                        target='4 C telo means',
                                                        model=model, cv=5, verbose=False)
    telo_row[0].append(n)
    telo_model_df += (telo_row)

In [85]:

model_metrics_n_train = pd.DataFrame(telo_model_df, 
                                     columns=['Model', 'Features', 'Target', 'Average MAE of CV folds', 
                                              'Std dev of MAE of CV folds', 'MAE predicted vs. test values', 
                                              'R2 predicted vs. test values', 'N samples training data'])
model_metrics_n_train['Features'] = model_metrics_n_train['Features'].apply(lambda row: ', '.join(row))
model_metrics_n_train['Features'] = 'individual telos, encoded samples'
model_metrics_n_train.drop(['Model', 'Features', 'Target'], axis=1, inplace=True)

# saving table for manuscript
# path=f'../graphs/paper figures/supp figs/ML model metrics target_4C mean telos | per N training samples 100 to 100000.png'
# trp.render_mpl_table(model_metrics_n_train, col_width=4, path=path)

In [86]:

y_predict_list, y_true_list, ext = trp.predict_target_4C_compare_actual(telo_data=telo_test, test_set=test_set, 
                                                                        model=fit_xgb_model, target='4 C telo means',
                                                                        clean_process_pipe=clean_process_pipe, verbose=True)

patient 1: ACTUAL 4 C telo means: 99.35 --- PREDICTED 4 C telo means: 99.32
patient 2: ACTUAL 4 C telo means: 108.92 --- PREDICTED 4 C telo means: 104.56
patient 3: ACTUAL 4 C telo means: 95.67 --- PREDICTED 4 C telo means: 96.65
patient 5: ACTUAL 4 C telo means: 97.83 --- PREDICTED 4 C telo means: 99.33
patient 6: ACTUAL 4 C telo means: 130.12 --- PREDICTED 4 C telo means: 123.33
patient 7: ACTUAL 4 C telo means: 101.40 --- PREDICTED 4 C telo means: 100.78
patient 8: ACTUAL 4 C telo means: 106.65 --- PREDICTED 4 C telo means: 104.11
patient 9: ACTUAL 4 C telo means: 107.67 --- PREDICTED 4 C telo means: 105.05
patient 10: ACTUAL 4 C telo means: 93.35 --- PREDICTED 4 C telo means: 97.47
patient 11: ACTUAL 4 C telo means: 108.57 --- PREDICTED 4 C telo means: 104.66
patient 12: ACTUAL 4 C telo means: 73.99 --- PREDICTED 4 C telo means: 80.68
patient 14: ACTUAL 4 C telo means: 93.28 --- PREDICTED 4 C telo means: 94.61
patient 15: ACTUAL 4 C telo means: 90.68 --- PREDICTED 4 C telo means: 93.33
patient 16: ACTUAL 4 C telo means: 77.96 --- PREDICTED 4 C telo means: 81.25
MAE predicted vs. actual 4 C telo means: 2.9594924684998127
R2 predicted vs. actual 4 C telo means: 0.9288917111959061

Closer look @ model performance¶

In [29]:

xgboost.plot_importance(fit_xgb_model)

Out[29]:

<matplotlib.axes._subplots.AxesSubplot at 0x1381768d0>

In [56]:

train_clean123 = clean_process_pipe.set_params(cleaner__drop_patient_id=True).fit_transform(train_set)
test_clean123 = clean_process_pipe.set_params(cleaner__drop_patient_id=False).fit_transform(test_set)

patient_info = test_clean123[['patient id', '4 C telo means']]
test_clean123.drop(['patient id', '4 C telo means'], axis=1, inplace=True)
preds = fit_xgb_model.predict(test_clean123)

In [57]:

predictions_df = pd.concat([test_clean123, patient_info, pd.DataFrame(preds)], axis=1)
predictions_df.rename({0:'predicted 4 C telo means'}, axis=1, inplace=True)

In [13]:

sns.set_color_codes()

plt.figure(figsize=(10,6))

ax = sns.regplot(x='individual telomeres', y='predicted 4 C telo means', 
                 scatter_kws={'s':200, 'alpha':.05, 'edgecolor':'k', 'linewidth':0.5}, color='grey', data=predictions_df)


ax.set_xlabel("Actual", fontsize=14)
ax.set_ylabel("Predicted", fontsize=14)
ax.tick_params(labelsize=14)

In [88]:

blank = Rectangle((0,0), 1, 1, fill=False, edgecolor='none', visible=False)

In [59]:

ax = sns.lmplot(x='4 C telo means', y='predicted 4 C telo means', data=predictions_df, 
                scatter_kws={'s':250, 'alpha':None, 'edgecolor':'k', 'linewidth':0},
                height=6, aspect=1, hue='patient id', fit_reg=False, palette=trp.patients_palette(), legend=False)

# sns.regplot(x='True', y='Predicted', data=df, color='white', scatter=False, 
#             line_kws={'alpha':0.5}, ax=ax.axes[0, 0])

ax = plt.gca()
    
# colors data on axes
for patch in range(len(ax.patches)):
    color = ax.patches[patch].get_facecolor()
    color = list(color)
    color[3] = 0.01
    color = tuple(color)
    ax.patches[patch].set_facecolor(color)
    
    
# colors legend handle markers
handles, labels = ax.get_legend_handles_labels()
    
for handle, color in zip(handles, trp.patients_palette()):
    copy = color[0:3] + (.1,)
    handle.set_edgecolor('black')
    handle.set_linewidth(.05)
    handle.set_facecolor(copy)

plt.legend([blank] + handles, ['patient id'] + labels, loc='upper left', bbox_to_anchor=(.99, 1.02),
           fancybox=True, markerscale=1, fontsize=13)


plt.xlabel("Actual post-therapy mean telomere lengths", fontsize=14)
plt.ylabel("Predicted post-therapy mean telomere lengths", fontsize=14)
plt.tick_params(labelsize=14)
# plt.title('mean telomere length', fontsize=14, )
plt.savefig(f'../graphs/paper figures/main figs/ML model actual 4 C mean telos vs ALL predicted.png', 
            dpi=600, bbox_inches = "tight")

In [139]:

df.head(3)

Out[139]:

	True	Predicted	patient id
0	99.346299	99.279747	1
1	108.915327	104.641365	2
2	95.669501	96.624336	3

In [89]:

import scipy

x = df['True']
y = df['Predicted']

scipy.stats.linregress(x, y)

Out[89]:

LinregressResult(slope=0.7491764622791022, intercept=24.801334982266937, rvalue=0.9927797608067482, pvalue=2.014060834569976e-12, stderr=0.026130398367336685)

In [90]:

# 0.99**2

In [99]:

df = pd.DataFrame({'True':y_true_list, 'Predicted':y_predict_list, 'patient id':ext})

ax = sns.lmplot(x='True', y='Predicted', data=df, scatter_kws={'s':250, 'alpha':None, 'edgecolor':'k'},
                height=6, aspect=1, hue='patient id', fit_reg=False, palette=trp.patients_palette(), legend=False, 
                truncate=False)

sns.regplot(x='True', y='Predicted', data=df, color='blue', scatter=False, truncate=False,
            line_kws={'alpha':0.3}, ax=ax.axes[0, 0])

ax = plt.gca()

handles, labels = ax.get_legend_handles_labels()

for handle, color in zip(handles, trp.patients_palette()):
    handle.set_edgecolor('black')
    handle.set_linewidth(.75)
    handle.set_facecolor(color)
    
for patch in range(len(ax.patches)):
    color = ax.patches[patch].get_facecolor()
    color = list(color)
    color[3] = 0.4
    color = tuple(color)
    ax.patches[patch].set_facecolor(color)


plt.legend([blank] + handles, ['patient id'] + labels, loc='upper left', bbox_to_anchor=(.99, 1.02),
           fancybox=True, markerscale=1, fontsize=13)

text = AnchoredText(f'R2= 0.931', loc='upper left', frameon=False, 
                    prop={'fontsize':14,'fontweight':'bold'})
ax.add_artist(text)

plt.xlabel("Actual post-therapy mean telomere lengths", fontsize=14)
plt.ylabel("Average predicted post-therapy mean telomere lengths", fontsize=14)
plt.tick_params(labelsize=14)
# plt.title('mean telomere length', fontsize=14, )
plt.savefig(f'../graphs/paper figures/main figs/ML model actual 4 C mean telos vs AVERAGE predicted.png', 
            dpi=600, bbox_inches = "tight")

Analysis of XGBoost performance when 1/2/3 of each patients is omitted from training data¶

In [ ]:

# loading data
exploded_telos_all_patients_df = pd.read_csv('../data/compiled patient data csv files/exploded_telos_all_patients_df.csv')
all_patients_df = pd.read_csv('../data/compiled patient data csv files/all_patients_df.csv')

# cleaning & combing data; retaining features of interest
telo_data = trp.combine_data(exploded_telos=exploded_telos_all_patients_df, all_patients_df=all_patients_df)

telo_data_full = telo_data.copy()
train_full, test_full = train_test_split(telo_data_full, test_size=0.2, shuffle=True, 
                                         stratify=telo_data_full[['patient id', 'timepoint']])

both_dfs = []
for less_patient in [1, 2, 3]:
    all_data = pd.DataFrame()
    patient_ids = telo_data['patient id'].unique()
    for ind, val in enumerate(patient_ids):
        if less_patient == 2 and val == 16:
            break
        elif less_patient == 3 and val == 15:
            break
        else:
            patients = patient_ids[ind:ind+less_patient]
            telo_data_reduced = telo_data[~telo_data['patient id'].isin(patients)].copy()    
            train_set, test_set = train_test_split(telo_data_reduced, test_size=0.2, shuffle=True, 
                                                   stratify=telo_data_reduced[['patient id', 'timepoint']])

            clean_process_pipe = Pipeline([('features', trp.make_features(make_log_target=False)), 
                                           ('dummies', trp.make_dummies(drop_first=True)),
                                           ('cleaner', trp.clean_data(drop_patient_id=True))])

            # XGBoost model
            model = XGBRegressor(n_estimators=200, max_depth=7, learning_rate=0.2, 
                                 objective ='reg:squarederror', random_state=1)

            train_clean = train_set.copy()
            test_clean = test_set.copy()
            train_clean = clean_process_pipe.fit_transform(train_clean)
            test_clean = clean_process_pipe.fit_transform(test_clean)

            # predicting mean telo telo post-therapy (4 C) using individual telos
            fit_xgb_model, telo_row = trp.cv_score_fit_mae_test(train_set=train_clean, test_set=test_clean, 
                                                                model=model, cv=5, verbose=False)
        #     print('\n')
            y_predict_list, y_true_list, patient_ids = trp.predict_target_4C_compare_actual(telo_data=telo_data_full, 
                                                                                            test_set=test_full, 
                                                                                            model=fit_xgb_model, 
                                                                                            target='4 C telo means',
                                                                                            clean_process_pipe=clean_process_pipe, 
                                                                                            verbose=False)

            temp_df = pd.DataFrame({'predicted vals':y_predict_list,
                                    'true vals':y_true_list,
                                    'patient ids':patient_ids})
            temp_df['less patient #'] = f'less patient #{patients}'
            all_data = pd.concat([all_data, temp_df], axis=0)

            print(f'passed PATIENT# {patients}')
    both_dfs.append(all_data)

Graphing of XGBoost performance when 1/2/3 of each patients is omitted from training data¶

In [ ]:

leave_one_org, leave_two_org, leave_thr_org = both_dfs[0], both_dfs[1], both_dfs[2]

leave_one_org.to_csv('../data/compiled patient data csv files/leave_one_out_ML_df.csv', index=False)
leave_two_org.to_csv('../data/compiled patient data csv files/leave_two_out_ML_df.csv', index=False)
leave_thr_org.to_csv('../data/compiled patient data csv files/leave_thr_out_ML_df.csv', index=False)

leave_one = leave_one_org.copy()
leave_two = leave_two_org.copy()
leave_thr = leave_thr_org.copy()

In [157]:

leave_one = pd.read_csv('../data/compiled patient data csv files/leave_one_out_ML_df.csv')
leave_two = pd.read_csv('../data/compiled patient data csv files/leave_two_out_ML_df.csv')
leave_thr = pd.read_csv('../data/compiled patient data csv files/leave_thr_out_ML_df.csv')

In [158]:

leave_two.head(3)

Out[158]:

	predicted vals	true vals	patient ids	less patient #
0	98.744576	99.346299	1	less patient #[1 2]
1	101.707565	108.915327	2	less patient #[1 2]
2	96.010017	95.669501	3	less patient #[1 2]

Leave out (one of each patient)¶

In [159]:

def return_real_ids(row):
    found = re.findall('(?<=#).*$', row)
    return literal_eval(found[0])[0]

leave_one['patient ids'] = leave_one['patient ids'].astype('int64')
leave_one['less patient #'] = leave_one['less patient #'].apply(lambda row: return_real_ids(row))
leave_one['less patient #'] = leave_one['less patient #'].astype('int64')

In [171]:

sns.set(font_scale=1.1)
plt.figure(figsize=(7,7))

leave_one.rename({'true vals':'Actual values', 'predicted vals':'Predicted values'}, axis=1, inplace=True)

ax = sns.lmplot(x='Actual values', y='Predicted values', data=leave_one, 
                hue='less patient #', col='less patient #', 
                scatter_kws={'s':200, 'edgecolor':'k', 'facecolor':None, 'alpha':0.7}, height=3,
                sharey=False, sharex=False, col_wrap=5)

ax_list = ax.axes

for index, val in enumerate(list(leave_one['less patient #'].unique())):
    
    # pull out data for patient not present in training data
    parse_df = leave_one[leave_one['less patient #'] == val].copy()
    missing_patient = parse_df[parse_df['patient ids'] == val].copy()
    
    # color patient marker black
    ax_list[index].plot(missing_patient['Actual values'], missing_patient['Predicted values'],
                        markersize=12, marker='o', mec='black', color='black', alpha=.7)
    
    # annotate coefficient of determination between predicted vs true vals
    r2_value = r2_score(parse_df['Actual values'], parse_df['Predicted values'])
    text = AnchoredText(f'R2= {r2_value.round(3)}', loc='upper left', frameon=False, 
                        prop={'fontsize':13,'fontweight':'bold'})
    ax_list[index].add_artist(text)
    
plt.tight_layout(pad=.7)
plt.savefig(f'../graphs/paper figures/main figs/XGBoost leave one patient out performance telo means.png', 
            dpi=600, bbox_inches = "tight")

<Figure size 504x504 with 0 Axes>

Leave out two¶

In [161]:

def correct_data(row):
    if row == 'less patient #[1 2]':
        return 'less patient #[1, 2]'
    else:
        return row
    
def return_real_ids_multiple(row):
    found = re.findall('(?<=#).*$', row)
    return literal_eval(found[0])

leave_two['less patient #'] = leave_two['less patient #'].apply(lambda row: correct_data(row))
leave_two['less patient #'] = leave_two['less patient #'].apply(lambda row: return_real_ids_multiple(row))
leave_two['hue col'] = leave_two['less patient #'].astype('str')
leave_two['hue col'] = leave_two['hue col'].astype('category')

In [170]:

sns.set(font_scale=1.1)
plt.figure(figsize=(7,7))

leave_two.rename({'true vals':'Actual values', 'predicted vals':'Predicted values'}, axis=1, inplace=True)


ax = sns.lmplot(x='Actual values', y='Predicted values', data=leave_two, 
                hue='hue col', col='hue col',
                scatter_kws={'s':200, 'edgecolor':'k', 'alpha':0.7}, height=3,
                sharey=False, sharex=False, col_wrap=5)

ax_list = ax.axes

for index, val in enumerate(list(leave_two['hue col'].unique())):

    # pull out data for patient(s) not present in training data
    parse_df = leave_two[leave_two['hue col'] == val].copy()
    val = literal_eval(val)
    missing_p1 = parse_df[parse_df['patient ids'] == val[0]].copy()
    
    # color patients missing from training data
    ax_list[index].plot(missing_p1['Actual values'], missing_p1['Predicted values'],
                        markersize=12, marker='o', mec='black', color='black', alpha=.7)
    
    missing_p2 = parse_df[parse_df['patient ids'] == val[1]].copy()
    ax_list[index].plot(missing_p2['Actual values'], missing_p2['Predicted values'],
                        markersize=12, marker='o', mec='black', color='white', alpha=.8)
    
    # annotate coefficient of determination for predicted vs true
    r2_value = r2_score(parse_df['Actual values'], parse_df['Predicted values'])
    text = AnchoredText(f'R2= {r2_value.round(3)}', loc='upper left', frameon=False, 
                        prop={'fontsize':13,'fontweight':'bold'})
    ax_list[index].add_artist(text)
    
    ax_list[index].set_title(f'less patient #s = {val}')

plt.tight_layout(pad=.7)
plt.savefig(f'../graphs/paper figures/main figs/XGBoost leave two patient out performance telo means.png', 
            dpi=600, bbox_inches = "tight")

<Figure size 504x504 with 0 Axes>

Leave out three¶

In [166]:

def correct_data_thr(row):
    if row == 'less patient #[1 2 3]':
        return 'less patient #[1, 2, 3]'
    else:
        return row
    
def return_real_ids_multiple(row):
    found = re.findall('(?<=#).*$', row)
    return literal_eval(found[0])

leave_thr['less patient #'] = leave_thr['less patient #'].apply(lambda row: correct_data_thr(row))
leave_thr['less patient #'] = leave_thr['less patient #'].apply(lambda row: return_real_ids_multiple(row))
leave_thr['hue col'] = leave_thr['less patient #'].astype('str')
leave_thr['hue col'] = leave_thr['hue col'].astype('category')

In [169]:

sns.set(font_scale=1.1)
plt.figure(figsize=(7,7))

leave_thr.rename({'true vals':'Actual values', 'predicted vals':'Predicted values'}, axis=1, inplace=True)

ax = sns.lmplot(x='Actual values', y='Predicted values', data=leave_thr, 
                hue='hue col', col='hue col',
                scatter_kws={'s':200, 'edgecolor':'k', 'alpha':0.7}, height=3,
                sharey=False, sharex=False, col_wrap=5)

ax_list = ax.axes

for index, val in enumerate(list(leave_thr['hue col'].unique())):

    # pull out data for patient(s) not present in training data
    parse_df = leave_thr[leave_thr['hue col'] == val].copy()
    
    # read patients IDs as list
    val = literal_eval(val)
    
    # identify patients that were missing from training data and color those patients 
    # on graph to emphasize XGBoost performance on patients not trained on
    missing_p1 = parse_df[parse_df['patient ids'] == val[0]].copy()
    ax_list[index].plot(missing_p1['Actual values'], missing_p1['Predicted values'],
                        markersize=12, marker='o', mec='black', color='black', alpha=.7)
    
    missing_p2 = parse_df[parse_df['patient ids'] == val[1]].copy()
    ax_list[index].plot(missing_p2['Actual values'], missing_p2['Predicted values'],
                        markersize=12, marker='o', mec='black', color='white', alpha=.8)
    
    missing_p3 = parse_df[parse_df['patient ids'] == val[2]].copy()
    ax_list[index].plot(missing_p3['Actual values'], missing_p3['Predicted values'],
                        markersize=12, marker='o', mec='black', color='yellow', alpha=.7)
    
    # annotate coefficient of determination for predicted vs true
    r2_value = r2_score(parse_df['Actual values'], parse_df['Predicted values'])
    text = AnchoredText(f'R2= {r2_value.round(3)}', loc='upper left', frameon=False, 
                        prop={'fontsize':13,'fontweight':'bold'})
    ax_list[index].add_artist(text)
    ax_list[index].set_title(f'less patient #s = {val}')

plt.tight_layout(pad=.7)
plt.savefig(f'../graphs/paper figures/main figs/XGBoost leave three patient out performance telo means.png', 
            dpi=600, bbox_inches = "tight")

<Figure size 504x504 with 0 Axes>

Dataframe for model metrics¶

In [ ]:

stats_df = trp.make_stats_df(stats_list=telo_row)

path=f'../graphs/paper figures/supp figs/ML model metrics target_4C mean telos | features_individ telos.png'
trp.render_mpl_table(stats_df, col_width=6, path=path)

Testing if linear regression can predict 4C mean telos using individual telos¶

In [ ]:

lr_model = LinearRegression(normalize=True)
lr_pipe = Pipeline([('lr_model', lr_model)
                   ])

In [ ]:

# predicting mean telo telo post-therapy (4 C) using individual telos
fit_lr_model, row = trp.cv_score_fit_mae_test(train_set=train_clean, test_set=test_clean,
                                              model=lr_model, cv=5)

In [ ]:

lr_y_predict_list, y_true_list = trp.predict_target_4C_compare_actual(telo_data=telo_test, test_set=test_set, 
                                     model=fit_lr_model, target='4 C telo means',
                                     clean_process_pipe=clean_process_pipe, verbose=False)

In [ ]:

plt.figure(figsize=(6,6))
ax = sns.regplot(x=y_true_list, y=lr_y_predict_list, scatter_kws={'s':200, 'edgecolor':'k'},)
ax.set_xlabel('Actual mean telomere lengths post-therapy', fontsize=18)
ax.set_ylabel('Predicted mean telomere lengths post-therapy', fontsize=18)
ax.tick_params(labelsize=14)

Machine Learning - exploring predictions of #s of short telomeres post-therapy¶

Loading/merging data for ML¶

In [100]:

exploded_telos_all_patients_df = pd.read_csv('../data/compiled patient data csv files/exploded_telos_all_patients_df.csv')
all_patients_df = pd.read_csv('../data/compiled patient data csv files/all_patients_df.csv')

In [101]:

# cleaning & combing data; retaining features of interest
quartile_telo_data = trp.combine_data(exploded_telos=exploded_telos_all_patients_df, 
                                      all_patients_df=all_patients_df,
                                      pred_obj='4 C # short telos from individual telos')

In [102]:

# # saving view of preprocessed data
# example = quartile_telo_data.copy()
# example.rename({'timepoint':'pre-therapy sample origin', 
#                 'individual telomeres':'individual telomeres (RFI)'}, axis=1, inplace=True)
# example_8 = example[10:16].reset_index(drop=True)
# path=f'../graphs/paper figures/supp figs/view of precleaned individual telomere length SHORT TELOS dataframe.png'
# trp.render_mpl_table(example_8, col_width=4, path=path)

Train/test split¶

In [103]:

quartile_telo_test = quartile_telo_data.copy()

y = quartile_telo_test[['4 C # short telos']]
X = quartile_telo_test.drop(['4 C # short telos'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

q_train_set = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
q_test_set = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)

Initializing cleaning/model pipelines¶

In [104]:

q_clean_process_pipe = Pipeline([('features', trp.make_features(make_log_target=False)), 
                               ('dummies', trp.make_dummies(drop_first=True)),
                               ('cleaner', trp.clean_data(drop_patient_id=True))
                              ])

In [105]:

# initialize XGBoost model & pipeline for hyperparams gridsearch
q_model = XGBRegressor(n_estimators=200, max_depth=6, learning_rate=0.2,
                       objective='reg:squarederror', random_state=1,)
q_xgb_pipe = Pipeline([('XGB', q_model)
                      ])

q_full_pipe = Pipeline([('clean_process', q_clean_process_pipe),
                        ('model', q_model)
                       ])

Cleaning/modeling data with pipeline¶

In [106]:

q_train_clean = q_train_set.copy()
q_test_clean = q_test_set.copy()

q_train_clean = q_clean_process_pipe.fit_transform(q_train_clean)
q_test_clean = q_clean_process_pipe.fit_transform(q_test_clean)

In [107]:

# saving view of post-processed data
# example8 = q_train_clean[['timepoint_2 irrad @ 4 Gy', 
#                                        'individual telomeres', 
#                                        '4 C # short telos']].copy()
# example = example8.copy()
# example.rename({'timepoint_2 irrad @ 4 Gy':'encoded sample origin', 
#                 'individual telomeres':'individual telomeres (RFI)'}, axis=1, inplace=True)

# example_8 = example[10:16].reset_index(drop=True)

# path=f'../graphs/paper figures/supp figs/view of cleaned individual telomere length SHORT TELOS dataframe.png'
# trp.render_mpl_table(example_8, col_width=4, path=path)

Visualizing model objective¶

In [108]:

# q_train_set.sort_values(by='patient id', ascending=True, inplace=True)
# q_train_set.reset_index(drop=True, inplace=True)
# temp = q_train_set.copy()
# temp['4 C # short telos'] = temp['4 C # short telos'] / 10

# trp.plot_individ_telos_ML_objective(df=temp, target='4 C # short telos')

Cross validation; MAE & R2 score when predicting # of short telomeres post-therapy (4 C)¶

In [109]:

q_fit_xgb_model, short_telos_row  = trp.cv_score_fit_mae_test(train_set=q_train_clean, 
                                                              test_set=q_test_clean, model=q_model, 
                                                              cv=5, target='4 C # short telos',
                                                              verbose=True)

MAE per CV fold: 
[233.13383983 235.34554623 232.85092682 239.2461323  234.41554351] 

MEAN of MAE all folds: 234.99839773740828
STD of MAE all folds: 2.3061521031767955

MAE of predict_y_test & y_test: 241.95471416615553
R2 between predict_y_test & y_test: 0.8021182712876767

In [ ]:

# evaluating model based on N samples in training data
short_telos_model_df = []

for n in [100, 500, 1000, 2000, 4000, 8000, 16000, 32000, 64000, 103040]:
    q_fit_xgb_model, short_telos_row = trp.cv_score_fit_mae_test(train_set=q_train_clean.sample(n), 
                                                                 test_set=q_test_clean, model=q_model, 
                                                                 cv=5, target='4 C # short telos',
                                                                 verbose=False)
    short_telos_row[0].append(n)
    short_telos_model_df += (short_telos_row)

In [ ]:

q_model_metrics_n_train = pd.DataFrame(short_telos_model_df, 
                                     columns=['Model', 'Features', 'Target', 'Average MAE of CV folds', 
                                              'Std dev of MAE of CV folds', 'MAE predicted vs. test values', 
                                              'R2 predicted vs. test values', 'N samples training data'])

q_model_metrics_n_train['Features'] = q_model_metrics_n_train['Features'].apply(lambda row: ', '.join(row))
q_model_metrics_n_train['Features'] = 'individual telos, encoded samples'
q_model_metrics_n_train.drop(['Model', 'Features', 'Target'], axis=1, inplace=True)

In [ ]:

# display(q_model_metrics_n_train)
path=f'../graphs/paper figures/supp figs/ML model metrics target_4C number short telos | per N training samples 100 to 100000.png'
trp.render_mpl_table(q_model_metrics_n_train, col_width=4, path=path)

In [110]:

q_y_xgb_predict, q_y_true, q_ext = trp.predict_target_4C_compare_actual(telo_data=quartile_telo_data, test_set=q_test_set, 
                                                                      model=q_fit_xgb_model, target='4 C # short telos',
                                                                      clean_process_pipe=q_clean_process_pipe, verbose=False)

MAE predicted vs. actual 4 C # short telos: 219.9083251953125
R2 predicted vs. actual 4 C # short telos: 0.8615439801059009

In [111]:

sns.set_color_codes()

plt.figure(figsize=(6,6))
ax = sns.regplot(x=q_y_true, y=q_y_xgb_predict, scatter_kws={'s':200, 'edgecolor':'k'}, color='#fdff38')
ax.set_xlabel("Actual post-therapy numbers of short telomeres", fontsize=14)
ax.set_ylabel("Predicted post-therapy numbers of short telomeres", fontsize=14)
ax.tick_params(labelsize=14)
ax.set_title('XGBoost predictions of post-therapy numbers of short telomeres', fontsize=14, )
plt.savefig(f'../graphs/paper figures/main figs/ML model actual 4 C # short telomeres vs predicted.png', 
            dpi=400, bbox_inches = "tight")

In [101]:

train_clean123 = clean_process_pipe.set_params(cleaner__drop_patient_id=True).fit_transform(q_train_set)
test_clean123 = clean_process_pipe.set_params(cleaner__drop_patient_id=False).fit_transform(q_test_set)

patient_info = test_clean123[['patient id', '4 C # short telos']]
test_clean123.drop(['patient id', '4 C # short telos'], axis=1, inplace=True)
preds = q_fit_xgb_model.predict(test_clean123)

In [102]:

predictions_df = pd.concat([test_clean123, patient_info, pd.DataFrame(preds)], axis=1)
predictions_df.rename({0:'predicted 4 C # short telos'}, axis=1, inplace=True)

In [ ]:

predictions_df.to_csv('imrt xgboost predict 4C telo means from individual telos.csv', index=False)

In [103]:

ax = sns.lmplot(x='4 C # short telos', y='predicted 4 C # short telos', data=predictions_df, 
                scatter_kws={'s':250, 'alpha':None, 'edgecolor':'k', 'linewidth':0},
                height=6, aspect=1, hue='patient id', fit_reg=False, palette=trp.patients_palette(), legend=False)

# sns.regplot(x='True', y='Predicted', data=df, color='white', scatter=False, 
#             line_kws={'alpha':0.5}, ax=ax.axes[0, 0])

ax = plt.gca()
    
# colors data on axes
for patch in range(len(ax.patches)):
    color = ax.patches[patch].get_facecolor()
    color = list(color)
    color[3] = 0.01
    color = tuple(color)
    ax.patches[patch].set_facecolor(color)
    
    
# colors legend handle markers
handles, labels = ax.get_legend_handles_labels()
    
for handle, color in zip(handles, trp.patients_palette()):
    copy = color[0:3] + (.1,)
    handle.set_edgecolor('black')
    handle.set_linewidth(.05)
    handle.set_facecolor(copy)

plt.legend([blank] + handles, ['patient id'] + labels, loc='upper left', bbox_to_anchor=(.99, 1.02),
           fancybox=True, markerscale=1, fontsize=13)


plt.xlabel("Actual post-therapy numbers of short telomeres", fontsize=14)
plt.ylabel("Predicted post-therapy numbers of short telomeres", fontsize=14)
plt.tick_params(labelsize=14)
# plt.title('numbers of short telomeres', fontsize=14, )
plt.savefig(f'../graphs/paper figures/main figs/ML model actual 4 C short telos vs ALL predicted.png', 
            dpi=600, bbox_inches = "tight")

In [113]:

df = pd.DataFrame({'True':q_y_true, 'Predicted':q_y_xgb_predict, 'patient id':q_ext})

ax = sns.lmplot(x='True', y='Predicted', data=df, scatter_kws={'s':250, 'alpha':None, 'edgecolor':'k'},
                height=6, aspect=1, hue='patient id', fit_reg=False, palette=trp.patients_palette(), legend=False,
               truncate=False)

sns.regplot(x='True', y='Predicted', data=df, color='blue', scatter=False, truncate=False,
            line_kws={'alpha':0.3}, ax=ax.axes[0, 0])

ax = plt.gca()

handles, labels = ax.get_legend_handles_labels()

for handle, color in zip(handles, trp.patients_palette()):
    handle.set_edgecolor('black')
    handle.set_linewidth(.75)
    handle.set_facecolor(color)
    
for patch in range(len(ax.patches)):
    color = ax.patches[patch].get_facecolor()
    color = list(color)
    color[3] = 0.4
    color = tuple(color)
    ax.patches[patch].set_facecolor(color)

plt.legend([blank] + handles, ['patient id'] + labels, loc='upper left', bbox_to_anchor=(.99, 1.02),
           fancybox=True, markerscale=1, fontsize=13)

text = AnchoredText(f'R2= 0.877', loc='upper left', frameon=False, 
                    prop={'fontsize':14,'fontweight':'bold'})
ax.add_artist(text)

plt.xlabel("Actual post-therapy numbers of short telomeres", fontsize=14)
plt.ylabel("Average of predicted post-therapy numbers of short telomeres", fontsize=14)
plt.tick_params(labelsize=14)
# plt.title('numbers of short telomeres', fontsize=14, )
plt.savefig(f'../graphs/paper figures/main figs/ML model actual 4 C short telos vs AVERAGE predicted.png', 
            dpi=600, bbox_inches = "tight")

Dataframe for model metrics¶

In [ ]:

telo_row[0].append(103040)
short_telos_row[0].append(103040)

telos_short_telos_rows = telo_row + short_telos_row
stats_df = trp.make_stats_df(stats_list=telos_short_telos_rows)
stats_df

In [ ]:

path=f'../graphs/paper figures/supp figs/ML models predicting #1 4 C telo means and #2 # short telos.png'
trp.render_mpl_table(stats_df, col_width=6.5, path=path)

Machine Learning - exploring predictions of #s of long telomeres post-therapy¶

Loading/merging data for ML¶

In [114]:

exploded_telos_all_patients_df = pd.read_csv('../data/compiled patient data csv files/exploded_telos_all_patients_df.csv')
all_patients_df = pd.read_csv('../data/compiled patient data csv files/all_patients_df.csv')

In [115]:

long_telo_data = trp.combine_data(exploded_telos=exploded_telos_all_patients_df, 
                                      all_patients_df=all_patients_df,
                                      pred_obj='4 C # long telos from individual telos')

In [116]:

# saving view of preprocessed data
example = long_telo_data.copy()
example.rename({'timepoint':'pre-therapy sample origin', 
                'individual telomeres':'individual telomeres (RFI)'}, axis=1, inplace=True)
example_8 = example[10:16].reset_index(drop=True)
path=f'../graphs/paper figures/supp figs/view of precleaned individual telomere length LONG TELOS dataframe.png'
trp.render_mpl_table(example_8, col_width=4, path=path)

Train/test split¶

In [117]:

long_telo_test = long_telo_data.copy()

y = long_telo_test[['4 C # long telos']]
X = long_telo_test.drop(['4 C # long telos'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

long_train_set = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
long_test_set = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)

Initializing cleaning/model pipelines¶

In [118]:

long_clean_process_pipe = Pipeline([('features', trp.make_features(make_log_target=False)), 
                               ('dummies', trp.make_dummies(drop_first=True)),
                               ('cleaner', trp.clean_data(drop_patient_id=True))
                              ])

In [119]:

# initialize XGBoost model & pipeline for hyperparams gridsearch
long_model = XGBRegressor(n_estimators=200, max_depth=6, learning_rate=0.2,
                       objective='reg:squarederror', random_state=1, )

long_xgb_pipe = Pipeline([('XGB', long_model)
                      ])

long_full_pipe = Pipeline([('clean_process', long_clean_process_pipe),
                        ('model', long_model)
                       ])

Cleaning/modeling data with pipeline¶

In [120]:

long_train_clean = long_train_set.copy()
long_test_clean = long_test_set.copy()

long_train_clean = long_clean_process_pipe.fit_transform(long_train_clean)
long_test_clean = long_clean_process_pipe.fit_transform(long_test_clean)

In [ ]:

# saving view of post-processed data
example = long_train_clean[['timepoint_2 irrad @ 4 Gy', 
                                       'individual telomeres', 
                                       '4 C # long telos']].copy()

example.rename({'timepoint_2 irrad @ 4 Gy':'encoded sample origin', 
                'individual telomeres':'individual telomeres (RFI)'}, axis=1, inplace=True)

example_8 = example[10:16].reset_index(drop=True)

path=f'../graphs/paper figures/supp figs/view of cleaned individual telomere length LONG TELOS dataframe.png'
trp.render_mpl_table(example_8, col_width=4, path=path)

Visualizing model objective¶

In [ ]:

# long_train_set.sort_values(by='patient id', ascending=True, inplace=True)
# long_train_set.reset_index(drop=True, inplace=True)
# temp = long_train_set.copy()
# temp['4 C # long telos'] = temp['4 C # long telos'] / 20

# trp.plot_individ_telos_ML_objective(df=temp, target='4 C # long telos')

Cross validation; MAE & R2 score when predicting # of long telomeres post-therapy (4 C)¶

In [121]:

long_fit_xgb_model, long_telos_row = trp.cv_score_fit_mae_test(train_set=long_train_clean, 
                                                                 test_set=long_test_clean, model=long_model, 
                                                                 cv=5, target='4 C # long telos',
                                                                 verbose=True)

MAE per CV fold: 
[332.84047094 324.62619424 327.02681255 333.22888125 323.96610815] 

MEAN of MAE all folds: 328.3376934270681
STD of MAE all folds: 3.969965148722421

MAE of predict_y_test & y_test: 323.36791061496143
R2 between predict_y_test & y_test: 0.8249590662884294

In [ ]:

# evaluating model based on N samples in training data
long_telos_model_df = []

for n in [100, 500, 1000, 2000, 4000, 8000, 16000, 32000, 64000, 103040]:
    long_fit_xgb_model, long_telos_row = trp.cv_score_fit_mae_test(train_set=long_train_clean.sample(n), 
                                                                 test_set=long_test_clean, model=long_model, 
                                                                 cv=5, target='4 C # long telos',
                                                                 verbose=False)
    long_telos_row[0].append(n)
    long_telos_model_df += (long_telos_row)

In [ ]:

long_model_metrics_n_train = pd.DataFrame(long_telos_model_df, 
                                     columns=['Model', 'Features', 'Target', 'Average MAE of CV folds', 
                                              'Std dev of MAE of CV folds', 'MAE predicted vs. test values', 
                                              'R2 predicted vs. test values', 'N samples training data'])

long_model_metrics_n_train['Features'] = long_model_metrics_n_train['Features'].apply(lambda row: ', '.join(row))
long_model_metrics_n_train['Features'] = 'individual telos, encoded samples'
long_model_metrics_n_train.drop(['Model', 'Features', 'Target'], axis=1, inplace=True)

In [ ]:

# display(long_model_metrics_n_train)
path=f'../graphs/paper figures/supp figs/ML model metrics target_4C number long telos | per N training samples 100 to 100000.png'
trp.render_mpl_table(long_model_metrics_n_train, col_width=4, path=path)

In [122]:

long_y_xgb_predict, long_y_true, ok = trp.predict_target_4C_compare_actual(telo_data=long_telo_data, test_set=long_test_set, 
                                                                 model=long_fit_xgb_model, target='4 C # long telos',
                                                                 clean_process_pipe=long_clean_process_pipe, verbose=False)

MAE predicted vs. actual 4 C # long telos: 282.83311680385043
R2 predicted vs. actual 4 C # long telos: 0.891164080582533

In [123]:

sns.set_color_codes()

plt.figure(figsize=(6,6))
ax = sns.regplot(x=long_y_true, y=long_y_xgb_predict, scatter_kws={'s':200, 'edgecolor':'k'}, color='#ffbacd')
ax.set_xlabel("Actual post-therapy numbers of long telomeres", fontsize=14)
ax.set_ylabel("Predicted post-therapy numbers of long telomeres", fontsize=14)
ax.tick_params(labelsize=14)
ax.set_title('XGBoost predictions of post-therapy numbers of long telomeres', fontsize=14, )
plt.ylim(500, 3700)
plt.savefig(f'../graphs/paper figures/main figs/ML model actual 4 C # long telomeres vs predicted.png', 
            dpi=400, bbox_inches = "tight")

In [124]:

train_clean123 = clean_process_pipe.set_params(cleaner__drop_patient_id=True).fit_transform(long_train_set)
test_clean123 = clean_process_pipe.set_params(cleaner__drop_patient_id=False).fit_transform(long_test_set)

patient_info = test_clean123[['patient id', '4 C # long telos']]
test_clean123.drop(['patient id', '4 C # long telos'], axis=1, inplace=True)
preds = long_fit_xgb_model.predict(test_clean123)

In [124]:

predictions_df = pd.concat([test_clean123, patient_info, pd.DataFrame(preds)], axis=1)
predictions_df.rename({0:'predicted 4 C # long telos'}, axis=1, inplace=True)

In [125]:

ax = sns.lmplot(x='4 C # long telos', y='predicted 4 C # long telos', data=predictions_df, 
                scatter_kws={'s':250, 'alpha':None, 'edgecolor':'k', 'linewidth':0},
                height=6, aspect=1, hue='patient id', fit_reg=False, palette=trp.patients_palette(), legend=False)

# sns.regplot(x='True', y='Predicted', data=df, color='white', scatter=False, 
#             line_kws={'alpha':0.5}, ax=ax.axes[0, 0])

ax = plt.gca()
    
# colors data on axes
for patch in range(len(ax.patches)):
    color = ax.patches[patch].get_facecolor()
    color = list(color)
    color[3] = 0.01
    color = tuple(color)
    ax.patches[patch].set_facecolor(color)
    
    
# colors legend handle markers
handles, labels = ax.get_legend_handles_labels()
    
for handle, color in zip(handles, trp.patients_palette()):
    copy = color[0:3] + (.1,)
    handle.set_edgecolor('black')
    handle.set_linewidth(.05)
    handle.set_facecolor(copy)

plt.legend([blank] + handles, ['patient id'] + labels, loc='upper left', bbox_to_anchor=(.99, 1.02),
           fancybox=True, markerscale=1, fontsize=13)


plt.xlabel("Actual post-therapy numbers of long telomeres", fontsize=14)
plt.ylabel("Predicted post-therapy numbers of long telomeres", fontsize=14)
plt.tick_params(labelsize=14)
# plt.title('numbers of short telomeres', fontsize=14, )
plt.savefig(f'../graphs/paper figures/main figs/ML model actual 4 C long telos vs ALL predicted.png', 
            dpi=600, bbox_inches = "tight")

In [133]:

# long_y_xgb_predict, long_y_true
x = pd.Series(long_y_true).values.reshape(-1, 1)
y = pd.Series(long_y_xgb_predict).values.reshape(-1, 1)

reg = LinearRegression().fit(x, y)
r2_score(x, y)

Out[133]:

0.8941064026554766

In [125]:

df = pd.DataFrame({'True':long_y_xgb_predict, 'Predicted':long_y_true, 'patient id':ok})

ax = sns.lmplot(x='True', y='Predicted', data=df, scatter_kws={'s':250, 'alpha':None, 'edgecolor':'k'},
                height=6, aspect=1, hue='patient id', fit_reg=False, palette=trp.patients_palette(), legend=False,
               truncate=False)

sns.regplot(x='True', y='Predicted', data=df, color='blue', scatter=False, truncate=False,
            line_kws={'alpha':0.3}, ax=ax.axes[0, 0])

ax = plt.gca()

handles, labels = ax.get_legend_handles_labels()

for handle, color in zip(handles, trp.patients_palette()):
    handle.set_edgecolor('black')
    handle.set_linewidth(.75)
    handle.set_facecolor(color)
    
for patch in range(len(ax.patches)):
    color = ax.patches[patch].get_facecolor()
    color = list(color)
    color[3] = 0.4
    color = tuple(color)
    ax.patches[patch].set_facecolor(color)

plt.legend([blank] + handles, ['Patient ID'] + labels, loc='upper left', bbox_to_anchor=(.99, 1.02),
           fancybox=True, markerscale=1, fontsize=13)

# annotate coefficient of determination between predicted vs true vals
text = AnchoredText(f'R2= 0.890', loc='upper left', frameon=False, 
                    prop={'fontsize':14,'fontweight':'bold'})
ax.add_artist(text)

plt.xlabel("Actual post-therapy numbers of long telomeres", fontsize=14)
plt.ylabel("Average of predicted post-therapy numbers of long telomeres", fontsize=14)
plt.tick_params(labelsize=14)
# plt.title('numbers of short telomeres', fontsize=14, )
plt.savefig(f'../graphs/paper figures/main figs/ML model actual 4 C long telos vs AVERAGE predicted.png', 
            dpi=600, bbox_inches = "tight")

Dataframe for model metrics¶

In [ ]:

telos_short_telos_rows = telo_row + short_telos_row + long_telos_row
stats_df = trp.make_stats_df(stats_list=telos_short_telos_rows)
stats_df

In [ ]:

path=f'../graphs/paper figures/supp figs/ML models predicting #1 4 C telo means and #2 short telos and #3 long telos.png'
trp.df_to_png(df=stats_df, path=path)

Machine Learning - exploring predictions of chromosome rearrangements¶

Loading data & general cleaning¶

In [49]:

all_chr_aberr_df = pd.read_csv('../data/compiled patient data csv files/all_chr_aberr_df.csv')

general_cleaner = Pipeline([('cleaner', trp.general_chr_aberr_cleaner())])
cleaned_chr_df = general_cleaner.fit_transform(all_chr_aberr_df)

Train/test split¶

In [50]:

chr_train, chr_test = train_test_split(cleaned_chr_df, test_size=0.2, shuffle=True, 
                                       stratify=cleaned_chr_df[['patient id', 'timepoint']])

Initializing cleaning/model pipelines¶

In [51]:

features = ['# inversions']
target = '# inversions'

make_new_features_target = Pipeline([('make features', trp.make_chr_features(combine_inversions=True, 
                                                                             bool_features=False,
                                                                             features=features)),
                                     ('make target merge', trp.make_target_merge(target=target, features=features))])

In [62]:

# initialize XGBoost model & pipeline for hyperparams gridsearch
chr_model = XGBRegressor(n_estimators=100, max_depth=10, learning_rate=0.3,
                         objective='reg:squarederror',
                         random_state=0,)

chr_xgb_pipe = Pipeline([('XGB', chr_model)
                        ])

chr_full_pipe = Pipeline([('make ftr target', make_new_features_target),
                          ('model', chr_model)
                         ])

In [63]:

cleaned_chr_train = chr_train.copy()
cleaned_chr_test = chr_test.copy()

cleaned_chr_train = make_new_features_target.fit_transform(cleaned_chr_train)
cleaned_chr_test = make_new_features_target.fit_transform(cleaned_chr_test)

Cross validation; MAE & R2 score when predicting # of chr aberrations post-therapy (4 C)¶

In [64]:

# score model by cross validation, 5 folds, on X/y_train data 
# fit model on train data; w/ model, predict y_test from X_test; score model by MAE/R2 - return model
chr_fit_xgb_model, row = trp.cv_score_fit_mae_test(train_set=cleaned_chr_train, test_set=cleaned_chr_test,
                                              model=chr_model, cv=5, target='4 C # inversions')

MAE per CV fold: 
[0.24528411 0.21034984 0.14726143 0.07887109 0.16814507] 

MEAN of MAE all folds: 0.16998230615197266
STD of MAE all folds: 0.056781627487390664

MAE of predict_y_test & y_test: 0.3064803949424199
R2 between predict_y_test & y_test: -1.8529178772564503

In [65]:

chr_y_predict, y_true, = trp.chr_aberr_predict_target_4C_compare_actual(cleaned_unsplit_chr_data=cleaned_chr_df, 
                                                                        cleaned_test_set=cleaned_chr_test, 
                                                                        model=chr_fit_xgb_model, target='4 C # inversions',
                                                                        clean_process_pipe=make_new_features_target, verbose=False)

In [66]:

plt.figure(figsize=(6,6))
ax = sns.regplot(x=y_true, y=chr_y_predict, scatter_kws={'s':200, 'edgecolor':'k'}, )
ax.set_xlabel('Actual # inversions post-therapy', fontsize=18)
ax.set_ylabel('Predicted # inversions post-therapy', fontsize=18)
ax.tick_params(labelsize=14)
# ax.set_title('Fig. 2', weight='bold', fontsize=20)

Looping through all chr aberration types for XGBoost model fitting, creating dict for graphing & list for displaying model metrics¶

In [67]:

features_list = [['# inversions'], ['# translocations'], 
                 ['# dicentrics'], ['# excess chr fragments'],
                 ['# inversions', '# translocations', '# dicentrics', '# excess chr fragments']]

           
target1_list = ['# inversions', '# translocations', 
                '# dicentrics', '# excess chr fragments',
                'aberration index']

target2_list = ['4 C # inversions', '4 C # translocations', 
                '4 C # dicentrics', '4 C # excess chr fragments',
                '4 C aberration index']

stats_list = []
stats_list, graphing_dict = trp.script_load_clean_data_ml_pipeline_loop_aberrations(features_list=features_list,
                                                                                    target1_list=target1_list,
                                                                                    target2_list=target2_list,
                                                                                    stats_list=stats_list,
                                                                                    verbose=False)

Displaying model metrics in dataframe & graphing performance¶

In [68]:

import copy

stats_list_copy = copy.deepcopy(stats_list)
n_rows = cleaned_chr_train.shape[0]

for row in stats_list_copy:
    row.append(n_rows)

In [69]:

# saving df of model metrics for chr aberr models

graphing_df = trp.make_graphing_df(graphing_dict=graphing_dict)
stats_df = trp.make_stats_df(stats_list=stats_list_copy)
stats_df['Features'] = stats_df['Features'].apply(lambda row: ', '.join(row))

stats_df.iloc[0, 1] = '# inversions, encoded samples'
stats_df.iloc[1, 1] = '# translocations, encoded samples'
stats_df.iloc[2, 1] = '# dicentrics, encoded samples'
stats_df.iloc[3, 1] = '# excess chr fragments, encoded samples'
stats_df.iloc[4, 1] = 'all aberrations, encoded samples'
stats_df.drop(['Model', 'N samples training data'], axis=1, inplace=True)

path=f'../graphs/paper figures/supp figs/1 ML models metrics for all chr aberration types.png'
trp.render_mpl_table(stats_df, col_width=5.3, path=path)

In [70]:

# saving df of model metrics for all models
# all_rows = telo_row + short_telos_row + stats_list_copy
# stats_df = trp.make_stats_df(stats_list=all_rows)

# path=f'../graphs/paper figures/supp figs/ML model metrics for all telo and chr aberr endpoints.png'
# trp.df_to_png(df=stats_df, path=path)

In [71]:

graphing_df.rename({'predicted values':'Predicted values',
                    'actual values':'Actual values'}, axis=1, inplace=True)

In [72]:

ax = sns.set(font_scale=1.1)
ax = sns.lmplot(x='Actual values', y='Predicted values', col='aberration type', hue='aberration type',
                col_order=['# inversions', '# translocations', 
                           '# dicentrics', '# excess chr fragments', 'aberration index'],
                sharex=False, sharey=False, col_wrap=3, data=graphing_df, height=3.5)

ax_list = ax.axes
ax_list[0].set_title('inversions')
ax_list[1].set_title('translocations')
ax_list[2].set_title('dicentrics')
ax_list[3].set_title('excess chr fragments')
ax_list[4].set_title('aberration index')

plt.tight_layout(pad=.7)
plt.savefig(f'../graphs/paper figures/supp figs/ML models performance for all chr aberr types.png',
            dpi=600, bbox_iches='tight')

Making dataframes showing chr aberr data setup¶

In [43]:

features = ['# inversions', '# translocations', '# dicentrics', '# excess chr fragments']
target = 'aberration index'

# features = ['# inversions']
# target = '# inversions'

make_new_features_target = Pipeline([('make features', trp.make_chr_features(combine_inversions=True, 
                                                                             bool_features=False,
                                                                             features=features)),
                                     ('make target merge', trp.make_target_merge(target=target, features=features))])

In [44]:

# initialize XGBoost model & pipeline for hyperparams gridsearch
chr_model = XGBRegressor(n_estimators=200, max_depth=15, learning_rate=0.1,
                         objective='reg:squarederror',
                         random_state=0,)

chr_xgb_pipe = Pipeline([('XGB', chr_model)
                        ])

chr_full_pipe = Pipeline([('make ftr target', make_new_features_target),
                          ('model', chr_model)
                         ])

Cleaning/modeling data with pipeline¶

In [45]:

cleaned_chr_train = chr_train.copy()
cleaned_chr_test = chr_test.copy()

cleaned_chr_train = make_new_features_target.fit_transform(cleaned_chr_train)
cleaned_chr_test = make_new_features_target.fit_transform(cleaned_chr_test)

In [23]:

# aberration index

test_viz_post = cleaned_chr_train[['timepoint_2 irrad @ 4 Gy', '# inversions', '# translocations', 
                                   '# dicentrics', '# excess chr fragments', '4 C aberration index']]

test_viz_pre = chr_train[['patient id', 'timepoint', '# inversions', '# translocations', 
                          '# dicentrics', '# excess chr fragments',]]

test_viz_PRE = (test_viz_pre[test_viz_pre['timepoint'].isin(['1 non irrad', '2 irrad @ 4 Gy'])]
                .merge(cleaned_chr_train[['patient id', '4 C aberration index']])
                .sort_values(by=['patient id', 'timepoint'])
                .reset_index(drop=True))

test_viz_PRE.rename({'timepoint':'pre-therapy sample origin'}, axis=1, inplace=True)
test_viz_post.rename({'timepoint_2 irrad @ 4 Gy':'encoded sample origin'}, axis=1, inplace=True)

path=f'../graphs/paper figures/supp figs/example view ALL chr aberr PRE CLEAN dataframe for ML.png'
trp.render_mpl_table(test_viz_PRE.sample(6).reset_index(drop=True), col_width=3.2, path=path)
path=f'../graphs/paper figures/supp figs/example view ALL chr aberr POST CLEAN dataframe for ML.png'
trp.render_mpl_table(test_viz_post.sample(6).reset_index(drop=True), col_width=3.2, path=path)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [27]:

# inversions

test_viz_post = cleaned_chr_train[['timepoint_2 irrad @ 4 Gy', '# inversions', 
                                   '4 C # inversions']]

test_viz_pre = chr_train[['patient id', 'timepoint', '# inversions']]

test_viz_PRE = (test_viz_pre[test_viz_pre['timepoint'].isin(['1 non irrad', '2 irrad @ 4 Gy'])]
                .merge(cleaned_chr_train[['patient id', '4 C # inversions']])
                .sort_values(by=['patient id', 'timepoint'])
                .reset_index(drop=True))

test_viz_PRE.rename({'timepoint':'pre-therapy sample origin'}, axis=1, inplace=True)
test_viz_post.rename({'timepoint_2 irrad @ 4 Gy':'encoded sample origin'}, axis=1, inplace=True)

path=f'../graphs/paper figures/supp figs/example view INVERSIONS PRE CLEAN chr aberr dataframe for ML.png'
trp.render_mpl_table(test_viz_PRE.sample(6).reset_index(drop=True), col_width=3.2, path=path)
path=f'../graphs/paper figures/supp figs/example view INVERSIONS POST CLEAN chr aberr dataframe for ML.png'
trp.render_mpl_table(test_viz_post.sample(6).reset_index(drop=True), col_width=3.2, path=path)

Clustering telomere data¶

Mean telomere length (teloFISH)¶

In [11]:

all_patients_df = pd.read_csv('../data/compiled patient data csv files/all_patients_df.csv')
means = all_patients_df.drop(['telo data', 'Q1', 'Q2-3', 'Q4'], axis=1)

In [12]:

means.head(4)

Out[12]:

	patient id	timepoint	telo means
0	1	1 non irrad	84.796483
1	1	2 irrad @ 4 Gy	90.975826
2	1	3 B	116.779989
3	1	4 C	99.346299

In [7]:

pivot = means.pivot_table(index=['patient id'], columns='timepoint', values='telo means').reset_index()
pivot.columns.name = ''

pivot = pivot[pivot['patient id'] != 13].copy()
pivot.set_index(pivot['patient id'], inplace=True)
pivot.drop(['patient id'], axis=1, inplace=True)

In [463]:

g = sns.clustermap(pivot, method='single', metric='correlation',
                   z_score=0, figsize=(7,7), cmap='PRGn',
#                    standard_scale=0, 
                   col_cluster=False,
                   cbar_kws={},) 
font_size=14

# colorbar 
g.cax.set_position([-0.05, .2, .03, .45])
g.cax.set_ylabel('Mean Telomere Length', rotation=90, fontsize=font_size)
g.cax.tick_params(labelsize=12)

# modifying y axis
g.ax_heatmap.set_ylabel('Patient ID', fontsize=font_size)
labels = g.ax_heatmap.yaxis.get_majorticklabels()
plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), fontsize=font_size)
plt.setp(g.ax_heatmap.yaxis.get_minorticklabels(), fontsize=font_size)
g.ax_heatmap.set_yticklabels(labels, rotation=0, fontsize=font_size, va="center")

# modifying x axis
plt.setp(g.ax_heatmap.xaxis.get_majorticklabels(), rotation=45, fontsize=font_size)

for a in g.ax_row_dendrogram.collections:
    a.set_linewidth(1)
for a in g.ax_col_dendrogram.collections:
    a.set_linewidth(1)

plt.savefig('../graphs/paper figures/main figs/CLUSTERING heatmap all patient by mean telomere length means teloFISH.png', 
            dpi=400, bbox_inches = "tight")

In [14]:

means.head(4)

Out[14]:

	patient id	timepoint	telo means
0	1	1 non irrad	84.796483
1	1	2 irrad @ 4 Gy	90.975826
2	1	3 B	116.779989
3	1	4 C	99.346299

In [18]:

clustered_telos = trp.cluster_data_return_df(means, target='telo means', cut_off_n=2,
                                             metric='correlation', method='single',
                                             x_size=8, y_size=4)

	clusters	patient id	1	2	3	4
0	2	1	84.796483	90.975826	116.779989	99.346299
1	2	2	119.773675	133.199443	159.827558	108.915327
2	2	3	83.350928	87.295453	101.432564	95.669501
3	2	5	85.506106	113.095980	118.340459	97.832190
4	2	6	81.577970	86.403786	96.898929	130.118940
5	1	7	106.123851	110.051355	85.958697	101.402469
6	2	8	110.560114	113.514599	127.614062	106.652869
7	2	9	94.452253	99.629321	128.528917	107.674947
8	1	10	120.920889	124.741261	81.237654	93.352253
9	2	11	109.637490	122.192509	137.957127	108.571714
10	1	12	84.042609	88.342149	79.343869	73.988380
11	2	14	67.928442	78.336315	100.696436	93.279099
12	2	15	72.733567	77.396112	69.570508	90.680352
13	2	16	57.133220	64.074691	81.232799	77.956683

0 Cluster number 2 has 11 elements
1 Cluster number 1 has 3 elements

In [340]:

clustered_graphing = clustered_telos.rename({'telo means cluster groups': 'Clustered groups'}, axis=1).copy()

trp.graph_cluster_groups(clustered_graphing, target='telo means', hue='Clustered groups')

In [235]:

df = clustered_graphing[clustered_graphing['Clustered groups'] == 1] 

trp.telos_scipy_anova_post_hoc_tests(df0=df, time_col='timepoint', target='telo means',
                                     sig_test=stats.f_oneway, post_hoc='tukeyHSD', repeated_measures=True)

REPEATED MEASURES ANOVA for telomere length: 0.04395417105675324
      Multiple Comparison of Means - Tukey HSD,FWER=0.05      
==============================================================
    group1         group2     meandiff  lower    upper  reject
--------------------------------------------------------------
 1 non irrad   2 irrad @ 4 Gy  4.0158  -34.9883 43.0199 False 
 1 non irrad        3 B       -21.5157 -60.5198 17.4884 False 
 1 non irrad        4 C       -14.1147 -53.1189 24.8894 False 
2 irrad @ 4 Gy      3 B       -25.5315 -64.5356 13.4726 False 
2 irrad @ 4 Gy      4 C       -18.1306 -57.1347 20.8736 False 
     3 B            4 C        7.401   -31.6032 46.4051 False 
--------------------------------------------------------------

In [25]:

trp.graph_clusters_per_patient(clustered_telos, target='telo means',
                               y_dimen=1, x_dimen=2,
                               fsize=(12,3.5))

telo means CLUSTER 1 | patient IDs: [7, 10, 12]
telo means CLUSTER 2 | patient IDs: [1, 2, 3, 5, 6, 8, 9, 11, 14, 15, 16]

Short telomeres¶

In [325]:

all_patients_df = pd.read_csv('../data/compiled patient data csv files/all_patients_df.csv')
short_telos = all_patients_df.drop(['telo data', 'Q2-3', 'Q4', 'telo means'], axis=1)
short_telos.rename(columns={'Q1': 'Number of short telomeres'}, inplace=True)

In [326]:

pivot1 = short_telos.pivot_table(index=['patient id'], columns='timepoint', values='Number of short telomeres').reset_index()
pivot1.columns.name = ''

pivot1 = pivot1[pivot1['patient id'] != 13].copy()
pivot1.set_index(pivot1['patient id'], inplace=True)
pivot1.drop(['patient id'], axis=1, inplace=True)

In [327]:

g = sns.clustermap(pivot1, method='single', metric='correlation',
                   z_score=0, figsize=(7,7), cmap='PRGn',
#                    standard_scale=0, 
                   col_cluster=False,
                   cbar_kws={},) 
font_size=14

# colorbar 
g.cax.set_position([-0.05, .2, .03, .45])
g.cax.set_ylabel('Number of short telomeres', rotation=90, fontsize=font_size)
g.cax.tick_params(labelsize=12)

# modifying y axis
g.ax_heatmap.set_ylabel('Patient ID', fontsize=font_size)
labels = g.ax_heatmap.yaxis.get_majorticklabels()
plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), fontsize=font_size)
plt.setp(g.ax_heatmap.yaxis.get_minorticklabels(), fontsize=font_size)
g.ax_heatmap.set_yticklabels(labels, rotation=0, fontsize=font_size, va="center")

# modifying x axis
plt.setp(g.ax_heatmap.xaxis.get_majorticklabels(), rotation=45, fontsize=font_size)

for a in g.ax_row_dendrogram.collections:
    a.set_linewidth(1)
for a in g.ax_col_dendrogram.collections:
    a.set_linewidth(1)

plt.savefig('../graphs/paper figures/main figs/CLUSTERING heatmap all patient by short telomeres teloFISH.png', 
            dpi=400, bbox_inches = "tight")

In [328]:

clustered_short_telos = trp.cluster_data_return_df(short_telos, target='Number of short telomeres', cut_off_n=2)

0 Cluster number 1 has 11 elements
1 Cluster number 2 has 3 elements

In [329]:

clustered_short_telos['Clustered groups'] = (clustered_short_telos['Number of short telomeres cluster groups']
                                                             .apply(lambda row: trp.swap_short_telos_group_number(row)))

In [332]:

trp.graph_cluster_groups(clustered_short_telos, target='Number of short telomeres', hue='Clustered groups')

In [31]:

trp.graph_clusters_per_patient(clustered_short_telos, target='# short telomeres',
                               y_dimen=1, x_dimen=2, fsize=(12, 3.5))

# short telomeres CLUSTER 1 | patient IDs: [7, 10, 12]
# short telomeres CLUSTER 2 | patient IDs: [1, 2, 3, 5, 6, 8, 9, 11, 14, 15, 16]

Clustering analysis by hierarchical clustering of longitudinal changes in mean telomere length and number of short telomeres in the patients yielded 4 groups in each case, distinguishable by differential responses to radiation therapy. When patients were clustered by mean telomere length or number of short telomeres, the clustered groups differed by only one patient, suggesting strong agreement between mean telomere length and number of short telomeres for data clustering.

Combining cluster groups of means + short telos¶

In [33]:

y_list = ['telo means', '# short telomeres']
hue_list = ['telo means cluster groups', '# short telomeres cluster groups']
df_list = [clustered_telos, clustered_short_telos]
ylim_dict = {'telo means': (75, 150),
             '# short telomeres': (200, 4500)}

trp.plot_multiple_types_clusters(y_list=y_list, hue_list=hue_list,
                                 df_list=df_list, ylim_dict=ylim_dict)

Long telomeres¶

In [310]:

all_patients_df = pd.read_csv('../data/compiled patient data csv files/all_patients_df.csv')
long_telos = all_patients_df.drop(['telo data', 'Q1', 'Q2-3', 'telo means'], axis=1)
long_telos.rename(columns={'Q4': 'Number of long telomeres'}, inplace=True)

In [312]:

pivot2 = long_telos.pivot_table(index=['patient id'], columns='timepoint', values='Number of long telomeres').reset_index()
pivot2.columns.name = ''

pivot2 = pivot2[pivot2['patient id'] != 13].copy()
pivot2.set_index(pivot2['patient id'], inplace=True)
pivot2.drop(['patient id'], axis=1, inplace=True)

In [313]:

g = sns.clustermap(pivot2, method='single', metric='correlation',
                   z_score=0, figsize=(7,7), cmap='PRGn',
#                    standard_scale=0, 
                   col_cluster=False,
                   cbar_kws={},) 
font_size=14

# colorbar 
g.cax.set_position([-0.05, .2, .03, .45])
g.cax.set_ylabel('Number of long telomeres', rotation=90, fontsize=font_size)
g.cax.tick_params(labelsize=12)

# modifying y axis
g.ax_heatmap.set_ylabel('Patient ID', fontsize=font_size)
labels = g.ax_heatmap.yaxis.get_majorticklabels()
plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), fontsize=font_size)
plt.setp(g.ax_heatmap.yaxis.get_minorticklabels(), fontsize=font_size)
g.ax_heatmap.set_yticklabels(labels, rotation=0, fontsize=font_size, va="center")

# modifying x axis
plt.setp(g.ax_heatmap.xaxis.get_majorticklabels(), rotation=45, fontsize=font_size)

for a in g.ax_row_dendrogram.collections:
    a.set_linewidth(1)
for a in g.ax_col_dendrogram.collections:
    a.set_linewidth(1)

plt.savefig('../graphs/paper figures/main figs/CLUSTERING heatmap all patient by long telomeres teloFISH.png', 
            dpi=400, bbox_inches = "tight")

In [314]:

clustered_long_telos = trp.cluster_data_return_df(long_telos, target='Number of long telomeres', cut_off_n=2)

0 Cluster number 1 has 11 elements
1 Cluster number 2 has 3 elements

In [317]:

clustered_long_telos['Clustered groups'] = (clustered_long_telos['Number of long telomeres cluster groups']
                                                             .apply(lambda row: trp.swap_short_telos_group_number(row)))

In [342]:

trp.graph_cluster_groups(clustered_long_telos, target='Number of long telomeres', hue='Clustered groups')

In [322]:

trp.graph_clusters_per_patient(clustered_long_telos, target='Number of long telomeres',
                               y_dimen=1, x_dimen=2, fsize=(12, 3.5))

Number of long telomeres CLUSTER 1 | patient IDs: [1, 2, 3, 5, 6, 8, 9, 11, 14, 15, 16]
Number of long telomeres CLUSTER 2 | patient IDs: [7, 10, 12]

Clustering chromosome rearrangement data¶

In [6]:

all_chr_aberr_df = pd.read_csv('../data/compiled patient data csv files/all_chr_aberr_df.csv')

general_cleaner = Pipeline([('cleaner', trp.general_chr_aberr_cleaner(drop_what_timepoint=False)),
                            ('features', trp.make_chr_features(combine_inversions=True))
                           ])
cleaned_chr_df = general_cleaner.fit_transform(all_chr_aberr_df)
mean_chr = cleaned_chr_df.groupby(['patient id', 'timepoint']).agg('mean').reset_index()

In [7]:

mean_chr.head()

Out[7]:

	patient id	timepoint	# inversions	# sister chromatid exchanges	# dicentrics	# excess chr fragments	# sat associations	# terminal SCEs	# translocations
0	1	1 non irrad	0.233333	0.633333	0.000000	0.000000	0.133333	0.566667	0.033333
1	1	2 irrad @ 4 Gy	0.766667	0.800000	0.400000	0.366667	0.033333	0.666667	0.133333
2	1	3 B	1.266667	0.700000	0.366667	0.433333	0.766667	0.800000	0.100000
3	1	4 C	0.566667	0.933333	0.066667	0.266667	0.366667	0.766667	0.133333
4	2	1 non irrad	0.300000	0.533333	0.000000	0.033333	0.333333	0.766667	0.033333

Testing cluster functions on inversions / excess chr fragments¶

In [8]:

clustered_invs = trp.cluster_data_return_df(mean_chr, target='# inversions', cut_off_n=2,
                                            y_size=5, x_size=10)

	clusters	patient id	1	2	3	4
0	1	1	0.233333	0.766667	1.266667	0.566667
1	2	2	0.300000	0.966667	1.066667	0.866667
2	1	3	0.366667	0.600000	1.000000	0.666667
3	2	5	0.300000	0.766667	0.866667	0.800000
4	1	6	0.333333	0.633333	0.933333	0.533333
5	1	7	0.333333	0.766667	0.900000	0.566667
6	2	8	0.333333	0.633333	0.633333	0.533333
7	2	9	0.266667	0.600000	0.766667	0.566667
8	2	10	0.200000	0.566667	0.800000	0.633333
9	1	11	0.366667	0.600000	0.833333	0.566667
10	1	12	0.400000	0.633333	1.333333	0.900000
11	1	14	0.366667	0.633333	1.133333	0.833333
12	1	15	0.300000	0.733333	1.033333	0.566667
13	2	16	0.566667	0.800000	0.866667	0.833333

0 Cluster number 1 has 8 elements
1 Cluster number 2 has 6 elements

In [9]:

trp.graph_cluster_groups(clustered_invs, target='# inversions', hue='# inversions cluster groups')

In [10]:

trp.graph_clusters_per_patient(clustered_invs, target='# inversions', y_dimen=1, x_dimen=2, fsize=(12,3.5))

# inversions CLUSTER 1 | patient IDs: [1, 3, 6, 7, 11, 12, 14, 15]
# inversions CLUSTER 2 | patient IDs: [2, 5, 8, 9, 10, 16]

In [11]:

trp.make_clustered_heatmap(df=mean_chr, target='# inversions', cb_target_label='# inversions')

In [609]:

trp.make_clustered_heatmap(df=mean_chr, target='# excess chr fragments', cb_target_label='# excess chr fragments')

In [12]:

target = '# excess chr fragments'

clustered_invs = trp.cluster_data_return_df(mean_chr, target=target, cut_off_n=2, y_size=5, x_size=10)
trp.graph_cluster_groups(clustered_invs, target=target, hue=f'{target} cluster groups')
trp.graph_clusters_per_patient(clustered_invs, target=target, y_dimen=1, x_dimen=2, fsize=(12,3.5))

	clusters	patient id	1	2	3	4
0	2	1	0.000000	0.366667	0.433333	0.266667
1	1	2	0.033333	0.666667	0.166667	0.300000
2	1	3	0.000000	0.733333	0.266667	0.366667
3	1	5	0.000000	0.533333	0.200000	0.200000
4	2	6	0.000000	0.400000	0.333333	0.300000
5	1	7	0.000000	0.233333	0.066667	0.133333
6	2	8	0.000000	0.333333	0.233333	0.166667
7	1	9	0.000000	0.466667	0.166667	0.200000
8	1	10	0.000000	0.766667	0.233333	0.133333
9	1	11	0.066667	0.566667	0.266667	0.133333
10	2	12	0.066667	0.400000	0.333333	0.300000
11	1	14	0.000000	0.400000	0.066667	0.200000
12	1	15	0.100000	0.300000	0.100000	0.066667
13	2	16	0.066667	0.466667	0.400000	0.200000

0 Cluster number 2 has 5 elements
1 Cluster number 1 has 9 elements
# excess chr fragments CLUSTER 1 | patient IDs: [2, 3, 5, 7, 9, 10, 11, 14, 15]
# excess chr fragments CLUSTER 2 | patient IDs: [1, 6, 8, 12, 16]

Looping through all clustered groups graphs¶

In [71]:

# chr_aberr = ['# inversions', '# terminal inversions', '# translocations', '# dicentrics']

# for target in chr_aberr:
#     if target == '# inversions':
#         n = 2
#     else:
#         n = 1
#     clustered = trp.cluster_data_return_df(mean_chr, target=target, cut_off_n=n)
#     trp.graph_cluster_groups(clustered, target=target, hue=f'{target} cluster groups')
#     trp.graph_clusters_per_patient(clustered, target=target, y_dimen=1, x_dimen=2, fsize=(12,3.5))

Graphing all aberration cluster groups at once¶

In [42]:

mean_chr['aberration index'] = (mean_chr['# inversions'] + mean_chr['# translocations'] + 
                                mean_chr['# dicentrics'] + mean_chr['# excess chr fragments'])

In [43]:

mean_chr.head()

Out[43]:

	patient id	timepoint	# inversions	# sister chromatid exchanges	# dicentrics	# excess chr fragments	# sat associations	# terminal SCEs	# translocations	aberration index
0	1	1 non irrad	0.233333	0.633333	0.000000	0.000000	0.133333	0.566667	0.033333	0.266667
1	1	2 irrad @ 4 Gy	0.766667	0.800000	0.400000	0.366667	0.033333	0.666667	0.133333	1.666667
2	1	3 B	1.266667	0.700000	0.366667	0.433333	0.766667	0.800000	0.100000	2.166667
3	1	4 C	0.566667	0.933333	0.066667	0.266667	0.366667	0.766667	0.133333	1.033333
4	2	1 non irrad	0.300000	0.533333	0.000000	0.033333	0.333333	0.766667	0.033333	0.366667

In [54]:

# initializing list of aberrations to cluster by
y_list = ['# inversions', 
          '# translocations', 
          '# dicentrics',
          '# excess chr fragments',
          'aberration index']

# initializing list of dfs bearing clustered groups
inv_df = trp.cluster_data_return_df(mean_chr, target='# inversions', cut_off_n=2, verbose=False)
trans_df = trp.cluster_data_return_df(mean_chr, target='# translocations', cut_off_n=1, verbose=False)
dicent_df = trp.cluster_data_return_df(mean_chr, target='# dicentrics', cut_off_n=1, verbose=False)
chr_frag_df = trp.cluster_data_return_df(mean_chr, target='# excess chr fragments', cut_off_n=2, verbose=False)
aberr_index_df = trp.cluster_data_return_df(mean_chr, target='aberration index', cut_off_n=1, verbose=False)

df_list = [inv_df,
           trans_df, 
           dicent_df,
           chr_frag_df,
           aberr_index_df]

# hues to graph by
hue_list = ['# inversions cluster groups',
            '# translocations cluster groups', 
            '# dicentrics cluster groups',
            '# excess chr fragments cluster groups',
            'aberration index cluster groups']

# dimensions for each aberration
ylim_dict = {'# inversions': (.1, 1.7),
             '# translocations': (0, .4),
             '# dicentrics': (0, 1),
             '# excess chr fragments': (0, .95),
             'aberration index': (.2, 3)}

In [58]:

trp.plot_multiple_types_clusters(y_list=y_list, hue_list=hue_list,
                                 df_list=df_list, ylim_dict=ylim_dict)

In [59]:

aberr_list = ['# inversions', '# translocations', 
              '# dicentrics', '# excess chr fragments', 'aberration index']

for aberr in aberr_list:
    trp.make_clustered_heatmap(df=mean_chr, target=aberr, cb_target_label=aberr)

Exploratory analysis (UMAP) of telomere data and chromosome aberrations¶

In [222]:

all_chr_aberr_df = pd.read_csv('../data/compiled patient data csv files/all_chr_aberr_df.csv')
general_cleaner = Pipeline([('cleaner', trp.general_chr_aberr_cleaner(drop_what_timepoint=False))])
cleaned_chr_df = general_cleaner.fit_transform(all_chr_aberr_df)
all_patients_df = pd.read_csv('../data/compiled patient data csv files/all_patients_df.csv')

In [223]:

combined = all_patients_df.merge(cleaned_chr_df.groupby(['patient id', 'timepoint']).agg('mean'), on=['patient id', 'timepoint'])

In [227]:

combined['total inversions'] = combined['# inversions'] + combined['# terminal inversions']
combined.to_csv('all imrt data.csv', index=False)

In [229]:

combined.head(3)

Out[229]:

	patient id	timepoint	telo data	telo means	Q1	Q2-3	Q4	# inversions	# terminal inversions	# sister chromatid exchanges	# dicentrics	# excess chr fragments	# sat associations	# terminal SCEs	# translocations	total inversions
0	1	1 non irrad	[79.18994405924711, 58.07204491719145, 95.0279...	84.796483	1195	2225	1180	0.066667	0.166667	0.633333	0.000000	0.000000	0.133333	0.566667	0.033333	0.233333
1	1	2 irrad @ 4 Gy	[149.93296075217452, 138.31843562348496, 106.6...	90.975826	724	2350	1526	0.466667	0.300000	0.800000	0.400000	0.366667	0.033333	0.666667	0.133333	0.766667
2	1	3 B	[176.32960877192357, 111.92066838585988, 123.5...	116.779989	231	1457	2912	0.800000	0.466667	0.700000	0.366667	0.433333	0.766667	0.800000	0.100000	1.266667

In [17]:

import umap
import numba

In [18]:

combined['total inversions'] = combined['# inversions'] + combined['# terminal inversions']

parsed = combined[['patient id', 'timepoint', 'telo means', 
                   'total inversions', '# dicentrics', 
                   '# excess chr fragments', '# translocations']].copy()

parsed['telo means'] = parsed['telo means'] / parsed['telo means'].mean()

def encode_row(row):
    time_dict = {'1 non irrad':1,
                 '2 irrad @ 4 Gy':2,
                 '3 B':3,
                 '4 C':4}
    return time_dict[row]

target = parsed[['patient id', 'timepoint']].copy()
parsed['timepoint'] = parsed['timepoint'].apply(lambda row: encode_row(row))
features = parsed.drop(['patient id', 'timepoint'], axis=1)

In [27]:

#UMAP DIMENSIONALITY REDUCTION
reducer = umap.UMAP(random_state=36, metric='manhattan', n_neighbors=15,
                    min_dist=.5, spread=2,)
embeddings = reducer.fit_transform(features)

# RECOMBINING ASTRO ID/FLIGHT STATUS W/ UMAP VECTORS
embed = pd.DataFrame(embeddings, columns=['dim1', 'dim2'])
embed = pd.concat([embed, target], axis=1)

In [38]:

fontsize=16

plt.figure(figsize=(7,7))
# embed.rename({'astro id':'Astro ID', 'flight status':'Flight status'}, axis=1, inplace=True)

colors = sns.color_palette('Paired', len(embed['patient id'].unique())), 
t = (0.7,)
test = [x + t for x in colors[0]]

ax = sns.scatterplot(x='dim1', y='dim2', data=embed, 
                     hue='patient id', 
                     style_order=['1 non irrad', '2 irrad @ 4 Gy', '3 B', '4 C'],
                     alpha=None, style='timepoint', legend='full',
#                      palette=sns.color_palette('Paired', len(embed['patient id'].unique())), 
                     palette=test,  

                    **{'s':300, 'linewidths':1, 'edgecolor':'black'})

plt.tick_params(labelsize=fontsize)
plt.xlabel('', fontsize=fontsize)
plt.ylabel('', fontsize=fontsize)

handles, labels = ax.get_legend_handles_labels()

for handle, color in zip(handles, test):
    handle.set_edgecolor('black')
    handle.set_linewidth(.75)
#     handle.set_facecolor(color)

num=14
handles[num].set_edgecolor('black')
handles[num].set_linewidth(.75)
# handles[num].set_facecolor(test[0])

plt.legend(handles[15:]+handles[0:15], labels[15:]+labels[0:15],
           loc='upper left', bbox_to_anchor=(.99, 1.02),
           fancybox=True, markerscale=2, fontsize=13)

plt.rc_context({"lines.markeredgewidth":1,
                "lines.markeredgecolor":'black'})


# plt.savefig(f'../graphs/paper figures/main figs/UMAP unlabled dimensionality reduction teloFISH rad induced aberr.png', 
#             dpi=600, bbox_inches = "tight")

Out[38]:

<matplotlib.rc_context at 0x131f384d0>

In [ ]: