In [4]:

# tools for handling files
import sys
import os

# pandas/numpy for handling data
import pandas as pd
import numpy as np

# seaborn/matplotlib for graphing
import matplotlib.pyplot as plt
import seaborn as sns

# statistics
from statistics import mean 
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats
from statsmodels.stats.multitest import multipletests
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison
import statsmodels.formula.api as smf

# regex
import re

# for reading individual telomere length data from files
from ast import literal_eval

# for grabbing individual cells
import more_itertools

# my module containing functions for handling/visualizing/analyzing telomere length/chr rearrangement data
import telomere_methods_rad_patient as trp

from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.stats import zscore
from scipy.stats import ks_2samp
from scipy.stats import mannwhitneyu

# incase reloading modules is required
import importlib
%load_ext autoreload
%autoreload 

# setting darkgrid style for seaborn figures
sns.set_style(style="darkgrid",rc={'patch.edgecolor': 'black'})

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

...

Analyzing Telomere Length Data from TeloFISH¶

Mean Telomere Length analyses¶

Visualizations¶

In [5]:

all_patients_df = pd.read_csv('../data/compiled patient data csv files/all_patients_df.csv')
all_patients_df['telo data'] = all_patients_df['telo data'].map(literal_eval)

In [81]:

sns.set_style(rc= {'patch.edgecolor': 'black'})

fig = plt.figure(figsize=(7,3.2))
ax = sns.boxenplot(x='timepoint',y='telo means', data=all_patients_df,
                  linewidth=1,)
ax = sns.swarmplot(x='timepoint',y='telo means', data=all_patients_df, size=8,
                  linewidth=.5, edgecolor='black')

# ax.set_title("Mean Telomere Length (TeloFISH) per timepoint") 
ax.set_ylabel('Mean Telomere Length', fontsize=14)
ax.set_xlabel('', fontsize=14)
ax.tick_params(labelsize=14)
plt.savefig('../graphs/paper figures/main figs/all patient Mean telomere length means teloFISH.png', 
            dpi=400, bbox_inches = "tight")

ANOVA, correlations, linear regressions¶

In [20]:

df = all_patients_df[all_patients_df['patient id'] != 13].copy()

trp.telos_scipy_anova_post_hoc_tests(df0=df, time_col='timepoint', target='telo means',
                                     sig_test=stats.f_oneway, post_hoc='tukeyHSD', repeated_measures=True)

REPEATED MEASURES ANOVA for telomere length: 0.05898000440859586

In [21]:

lin_reg_df = all_patients_df.pivot(index='patient id', columns='timepoint', values='telo means')
lin_reg_df = lin_reg_df.drop(13)
lin_reg_df['constant'] = 1

In [22]:

lin_reg_df.corr()

Out[22]:

timepoint	1 non irrad	2 irrad @ 4 Gy	3 B	4 C	constant
timepoint
1 non irrad	1.000000	0.947341	0.509322	0.401291	NaN
2 irrad @ 4 Gy	0.947341	1.000000	0.620616	0.400194	NaN
3 B	0.509322	0.620616	1.000000	0.534244	NaN
4 C	0.401291	0.400194	0.534244	1.000000	NaN
constant	NaN	NaN	NaN	NaN	NaN

In [23]:

x_names = [['1 non irrad'], ['1 non irrad', '2 irrad @ 4 Gy'],]
#            ['1 non irrad', '2 irrad @ 4 Gy', '3 B']]   
y_name = '4 C'

mtl_df_list = []

for x_name in x_names:
    x = lin_reg_df[x_name].values.reshape(-1, len(x_name))
    y = lin_reg_df['4 C'].values.reshape(-1, 1)
    regression = LinearRegression().fit(x, y)
    print(f"Linear regression for {x_name} vs. {y_name}:\nR2 is {regression.score(x, y):.4f}")
    
    mtl_df_list.append([', '.join(x_name), '4 C', round(regression.score(x, y), 4)])

Linear regression for ['1 non irrad'] vs. 4 C:
R2 is 0.1610
Linear regression for ['1 non irrad', '2 irrad @ 4 Gy'] vs. 4 C:
R2 is 0.1649

In [24]:

mtl_LM_metrics = pd.DataFrame(mtl_df_list, columns=['Model features', 'Target', 'Linear regression R2 score'])
mtl_LM_metrics['Model features'] = mtl_LM_metrics['Model features'].astype('str')

In [25]:

mtl_LM_metrics

Out[25]:

	Model features	Target	Linear regression R2 score
0	1 non irrad	4 C	0.1610
1	1 non irrad, 2 irrad @ 4 Gy	4 C	0.1649

In [26]:

sns.set_style(rc= {'patch.edgecolor': 'black'})
sns.set_color_codes()

fig = plt.figure(figsize=(6,4))
ax = sns.lineplot(x='Model features', y='Linear regression R2 score', data=mtl_LM_metrics, 
                  linewidth=6, sort=False, hue='Target', palette={'4 C':'grey'}, legend=False, marker='o', 
                  **{'markersize':14, 
                     'mec':'black', 
                     'mew':1})
                       
ax.set_ylabel('Linear regression R2 score', fontsize=14)
ax.set_xlabel('Model features', fontsize=14,)

plt.xticks(fontsize=14,)
plt.yticks(fontsize=14)
plt.ylim(0,1)
plt.title('Predicting post-therapy mean telomere length', fontsize=14)

plt.savefig('../graphs/paper figures/main figs/linear regression metrics Mean telomere length means teloFISH.png', 
            dpi=400, bbox_inches = "tight")

In [7]:

# more indepth stats

# target = lin_reg_df['4 C']
# linear_m = sm.OLS(endog=target, exog=lin_reg_df[['1 non irrad', '2 irrad @ 4 Gy', 'constant']], missing='drop')
# results = linear_m.fit()
# results.summary()

Individual Telomere Length analyses¶

In [27]:

exploded_telos_all_patients_df = pd.read_csv('../data/compiled patient data csv files/exploded_telos_all_patients_df.csv')

Visualizations¶

In [707]:

# graphing individual telomeres per individual per timepoint, personal telomere length dynamics as fxn of radiotherapy
patient_ids = list(exploded_telos_all_patients_df['patient id'].unique())
trp.histogram_plot_groups(x='individual telomeres', 
                          data=exploded_telos_all_patients_df.copy(), 
                          groupby='patient id', 
#                           iterable=[patient_ids[1]],
                          iterable=patient_ids,
                          n_bins=50,
                          znorm=False)

Statistics¶

Linear Mixed Effect Modeling of telomere means¶

In [75]:

# removing spaces from column names for linear model (doesn't work w/ spaces)
df = exploded_telos_all_patients_df.copy()
df.rename({'individual telomeres': 'individual_telomeres',
           'patient id': 'patient_id'}, axis=1, inplace=True)

# encoding timepoint as numerical, log normalizing individual telos
label_encoder = preprocessing.LabelEncoder()
df['timepoint_encoded'] = label_encoder.fit_transform(df['timepoint'])
df['log_individ_telos'] = np.log(df['individual_telomeres'])

# df = df.groupby(by=['patient_id', 'timepoint']).agg('mean').reset_index()

In [76]:

df = all_patients_df[all_patients_df['patient id'] != 13].copy()

trp.telos_scipy_anova_post_hoc_tests(df0=df, time_col='timepoint', target='telo means',
                                     sig_test=stats.f_oneway, post_hoc='tukeyHSD', repeated_measures=True)

REPEATED MEASURES ANOVA for telomere length: 0.05898000440859586

In [413]:

# regressing time onto individual telos w/ random intercepts for patients

model = smf.mixedlm("individual_telomeres ~ timepoint_encoded", 
                    df, 
#                     re_formula='timepoint_encoded',
                    groups=df['patient_id']) 

results = model.fit(method='powell')
results.summary()
results.pvalues[0:4]

Out[413]:

Intercept            4.887136e-108
timepoint_encoded     0.000000e+00
Group Var             6.910804e-03
dtype: float64

Post-hoc Tukey HSD¶

In [414]:

obj = pairwise_tukeyhsd(df['individual_telomeres'], df['timepoint'])
obj.summary()

Out[414]:

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1	group2	meandiff	lower	upper	reject
1 non irrad	2 irrad @ 4 Gy	8.297	7.7796	8.8144	True
1 non irrad	3 B	14.4682	13.9509	14.9856	True
1 non irrad	4 C	8.7314	8.2049	9.258	True
2 irrad @ 4 Gy	3 B	6.1713	5.6539	6.6886	True
2 irrad @ 4 Gy	4 C	0.4344	-0.0921	0.961	False
3 B	4 C	-5.7368	-6.2633	-5.2103	True

In [28]:

# test_df = exploded_telos_all_patients_df.copy()
# for col in test_df.columns:
#     test_df.rename({col: col.replace(' ', '_')}, axis=1, inplace=True)
# test_df = test_df[test_df['patient_id'] != 13].copy()

Graphing all patients overall individual telomere length distributions¶

In [113]:

# graphing individual telomeres per timepoint, overall cohort telomere length dynamics as fxn of radiotherapy
df = exploded_telos_all_patients_df.copy()

patient_ids = list(df['patient id'].unique())
trp.histogram_plot_groups(x='individual telomeres', 
                          data=df, 
                          groupby='timepoint', 
                          n_bins=50,
                          znorm=False)

Graphing cluster 1 patients' individual telomere length distributions¶

In [108]:

# parsing overall individual telomere dist for cluster 1 
df = exploded_telos_all_patients_df[exploded_telos_all_patients_df['patient id'].isin([7,10,12])].copy()

patient_ids = list(df['patient id'].unique())
trp.histogram_plot_groups(x='individual telomeres', 
                          data=df, 
                          groupby='timepoint', 
                          n_bins=50,
                          znorm=False)

Graphing cluster 2 patients' individual telomere length distributions¶

In [109]:

# parsing overall individual telomere dist for cluster 2
df = exploded_telos_all_patients_df[~exploded_telos_all_patients_df['patient id'].isin([7,10,12])].copy()

patient_ids = list(df['patient id'].unique())
trp.histogram_plot_groups(x='individual telomeres', 
                          data=df, 
                          groupby='timepoint', 
                          n_bins=50,
                          znorm=False)

Kruskal wallis + Z-norm + KS-test w/ bonferroni correction for shape differences between timepoints¶

In [478]:

df = exploded_telos_all_patients_df[exploded_telos_all_patients_df['patient id'] != 13].copy()
z_norm = trp.z_norm_individual_telos(exploded_telos_df=df)

# z-norming distributions of individual telomeres per timepoint to enable statistical analysis between
# shapes of overall cohort telomere length dynamics
time_points = list(z_norm['timepoint'].unique())
trp.histogram_plot_groups(x='z-norm_individual_telos', 
                          data=z_norm, 
                          groupby='timepoint', 
                          n_bins=50, 
                          znorm=True)

In [486]:

df = exploded_telos_all_patients_df[exploded_telos_all_patients_df['patient id'] != 13].copy()
target='individual telomeres'

g_1 = df[df['timepoint'] == '1 non irrad'][target]
g_2 = df[df['timepoint'] == '2 irrad @ 4 Gy'][target]
g_3 = df[df['timepoint'] == '3 B'][target]
g_4 = df[df['timepoint'] == '4 C'][target]
statistic, p_value = stats.kruskal(g_1, g_2, g_3, g_4)

print(f'KRUSKAL-WALLIS TEST FOR SIG BETWEEN DISTRIBUTIONS: H STAT: {statistic} P VALUE: {p_value}')

KRUSKAL-WALLIS TEST FOR SIG BETWEEN DISTRIBUTIONS: H STAT: 4417.2500567902225 P VALUE: 0.0

In [479]:

# we see a diff. in shape between all timepoints, as well we see a sig diff between irrad @ 4 Gy & 4 C shapes,
# though mean was the same (ANOVA)

test = ks_2samp
test_name = 'Kolmogorov-Smirnov'
timept_pairs, row = trp.eval_make_test_comparisons(df=z_norm, target='z-norm_individual_telos',
                                                   test=test, test_name=test_name,)

KS_stats_df = pd.DataFrame(row, columns=[test_name, 'timepoint 1', 'timepoint 2', 
                                         'p value', 'KS statistic'])

# iterate for between patients as well? would just be loop using above fxn, passing df per patient

Kolmogorov-Smirnov | 1 non irrad vs 2 irrad @ 4 Gy P-VALUE: 0.07238870179647408 KS-STAT 0.007173913043478253
Kolmogorov-Smirnov | 1 non irrad vs 3 B P-VALUE: 2.3151814439595848e-05 KS-STAT 0.013276397515527949
Kolmogorov-Smirnov | 1 non irrad vs 4 C P-VALUE: 8.402756230273283e-16 KS-STAT 0.023431677018633537
Kolmogorov-Smirnov | 2 irrad @ 4 Gy vs 3 B P-VALUE: 6.233890108982835e-06 KS-STAT 0.014021739130434793
Kolmogorov-Smirnov | 2 irrad @ 4 Gy vs 4 C P-VALUE: 5.758230093941806e-19 KS-STAT 0.025729813664596257
Kolmogorov-Smirnov | 3 B vs 4 C P-VALUE: 1.341631992664129e-27 KS-STAT 0.031149068322981366

In [480]:

# bonferroni pvalue correction to KS-tests
bonferroni_corr = multipletests(KS_stats_df['p value'], method='bonferroni')

# assigning elements of correction to objects
true_false_reject_hyp, pval_corr = bonferroni_corr[0], bonferroni_corr[1]
alpha_c_sidak, alpha_c_bonf = bonferroni_corr[2], bonferroni_corr[3]

# df from vals
bonf_corr_df = pd.DataFrame({'Reject null?': true_false_reject_hyp, 'Bonferroni corrected p values': pval_corr,
                             'Sidak corrected alpha': alpha_c_sidak, 'Bonferroni corrected alpha': alpha_c_bonf})

In [481]:

# merging bonferroni pvals w/ original KS stats & saving to file
KS_stats_df[['Bonferroni corrected p values', 'Reject null?']] = bonf_corr_df[['Bonferroni corrected p values', 
                                                                               'Reject null?']]
display(KS_stats_df)

trp.df_to_png(df=KS_stats_df, 
              path='../graphs/paper figures/supp figs/KS test between overall shapes of individ telo dist.png')

	Kolmogorov-Smirnov	timepoint 1	timepoint 2	p value	KS statistic	Bonferroni corrected p values	Reject null?
0	Kolmogorov-Smirnov	1 non irrad	2 irrad @ 4 Gy	7.238870e-02	0.007174	4.343322e-01	False
1	Kolmogorov-Smirnov	1 non irrad	3 B	2.315181e-05	0.013276	1.389109e-04	True
2	Kolmogorov-Smirnov	1 non irrad	4 C	8.402756e-16	0.023432	5.041654e-15	True
3	Kolmogorov-Smirnov	2 irrad @ 4 Gy	3 B	6.233890e-06	0.014022	3.740334e-05	True
4	Kolmogorov-Smirnov	2 irrad @ 4 Gy	4 C	5.758230e-19	0.025730	3.454938e-18	True
5	Kolmogorov-Smirnov	3 B	4 C	1.341632e-27	0.031149	8.049792e-27	True

Loading page (1/2)
Rendering (2/2)                                                    
Done

Feature Engineering Short/Long Individual Telomeres¶

In [28]:

melted_all_patients_df = pd.melt(
    all_patients_df,
    id_vars = [col for col in all_patients_df.columns if col != 'Q1' and col != 'Q2-3' and col != 'Q4'],
    var_name='relative Q',
    value_name='Q freq counts')

melted_all_patients_df['Q freq counts'] = melted_all_patients_df['Q freq counts'].astype('float64')

In [29]:

def change_qcol_descrip(row):
    if row == 'Q1':
        return 'short'
    elif row == 'Q2-3':
        return 'medium'
    elif row == 'Q4':
        return 'long'

melted_all_patients_df['relative Q'] = melted_all_patients_df['relative Q'].apply(lambda row: change_qcol_descrip(row))

In [221]:

sns.set_style(rc= {'patch.edgecolor': 'black'})
palette ={"Q1":"#fdff38","medium":"#d0fefe","long":"#ffbacd"}

fig = plt.figure(figsize=(7,3.2))
ax = sns.boxenplot(x='timepoint',y='Q1', data=all_patients_df,
                  linewidth=1, color="#fdff38", saturation=.95)
ax = sns.swarmplot(x='timepoint',y='Q1', data=all_patients_df, size=8,
                  linewidth=.5, edgecolor='black', color="#fdff38")

# ax.set_title("Mean Telomere Length (TeloFISH) per timepoint") 
ax.set_ylabel('Number of short telomeres', fontsize=14)
ax.set_xlabel('', fontsize=14)
ax.tick_params(labelsize=14)
plt.savefig('../graphs/paper figures/main figs/all patient Number of short telomeres teloFISH.png', 
            dpi=400, bbox_inches = "tight")

In [68]:

stats_df_cts = all_patients_df.copy()
stats_df_cts['Q1_sqrt'] = np.sqrt(stats_df_cts['Q1'])
stats_df_cts['Q4_sqrt'] = np.sqrt(stats_df_cts['Q4'])
stats_df_cts.head()

Out[68]:

	patient id	timepoint	telo data	telo means	Q1	Q2-3	Q4	Q1_sqrt	Q4_sqrt
0	1	1 non irrad	[70.74301669292741, 91.86033510872664, 59.1284...	84.796483	1195	2225	1180	34.568772	34.351128
1	1	2 irrad @ 4 Gy	[91.86033510872664, 119.31165592077517, 99.251...	90.975826	724	2350	1526	26.907248	39.064050
2	1	3 B	[191.10982054566642, 141.48603338585482, 114.0...	116.779989	231	1457	2912	15.198684	53.962950
3	1	4 C	[86.58100550477684, 139.3729078012595, 99.2504...	99.346299	372	2241	1987	19.287302	44.575778
4	2	1 non irrad	[149.47242207268806, 104.23734697174298, 171.1...	119.773675	1166	2270	1164	34.146742	34.117444

In [69]:

df = stats_df_cts[stats_df_cts['patient id'] != 13].copy()
# df = all_patients_df[all_patients_df['patient id'] != 13].copy()

trp.telos_scipy_anova_post_hoc_tests(df0=df, time_col='timepoint', target='Q1_sqrt',
                                     sig_test=stats.f_oneway, post_hoc='tukeyHSD', repeated_measures=True)

REPEATED MEASURES ANOVA for telomere length: 0.041484514376121384
      Multiple Comparison of Means - Tukey HSD,FWER=0.05      
==============================================================
    group1         group2     meandiff  lower    upper  reject
--------------------------------------------------------------
 1 non irrad   2 irrad @ 4 Gy -6.4897  -16.0887  3.1094 False 
 1 non irrad        3 B        -8.561   -18.16   1.038  False 
 1 non irrad        4 C       -6.0259  -15.6249  3.5731 False 
2 irrad @ 4 Gy      3 B       -2.0713  -11.6704  7.5277 False 
2 irrad @ 4 Gy      4 C        0.4638  -9.1353  10.0628 False 
     3 B            4 C        2.5351  -7.0639  12.1341 False 
--------------------------------------------------------------
pvalues: [0.2877665730794384, 0.0963089267871261, 0.3521526580484169, 0.9, 0.9, 0.8917208985830509]

In [70]:

df = stats_df_cts[stats_df_cts['patient id'] != 13].copy()
# df = all_patients_df[all_patients_df['patient id'] != 13].copy()

trp.telos_scipy_anova_post_hoc_tests(df0=df, time_col='timepoint', target='Q4_sqrt',
                                     sig_test=stats.f_oneway, post_hoc='tukeyHSD', repeated_measures=True)

REPEATED MEASURES ANOVA for telomere length: 0.054153500056322156

In [142]:

# ax = sns.set(font_scale=1)
fig = plt.figure(figsize=(7,3.2))
sns.set_style(style="darkgrid",rc= {'patch.edgecolor': 'black'})
# palette ={"Q1":"#fdff38","Q2-3":"#d0fefe","Q4":"#ffbacd"}
palette ={"short":"#fdff38","medium":"#d0fefe","long":"#ffbacd"}


ax = sns.boxenplot(x='timepoint', y='Q freq counts', hue='relative Q', data=melted_all_patients_df, palette=palette,
             linewidth=2, saturation=5, color="black", )
ax = sns.stripplot(x='timepoint', y='Q freq counts', hue='relative Q', data=melted_all_patients_df, palette=palette,
             linewidth=1, color="black", dodge=True, )

ax=fig.gca()
# ax.set_title('Changes in Distribution Individual Telos Relative to Pre-Rad Therapy Time point', fontsize=14)
ax.set_xlabel('', fontsize=14)
ax.set_ylabel('Individual Telomere Counts', fontsize=14)
ax.tick_params(labelsize=14)

ax.get_legend().remove()
plt.ylim(0,4300)


# handles, labels = ax.get_legend_handles_labels()
plt.legend(handles[0:3], labels[0:3], bbox_to_anchor=(.99,0.75), loc="upper left", 
#            borderaxespad=0)

plt.savefig('../graphs/paper figures/supp figs/counting # short long telos.png', 
            dpi=400,
            bbox_inches='tight')

In [150]:

# nb_regression = all_patients_df.copy()
# nb_regression['constant'] = 1
# label_encoder = preprocessing.LabelEncoder()
# nb_regression['encoded_timepoint'] = label_encoder.fit_transform(nb_regression['timepoint'])

# results = sm.GLM(nb_regression['Q4'], nb_regression[['encoded_timepoint', 'constant']], 
#           family=sm.families.NegativeBinomial()).fit()
# results.summary()

Linear regression short telo counts¶

In [71]:

q_lr_df = all_patients_df.pivot(index='patient id', columns='timepoint', values='Q1')
q_lr_df.drop(13, inplace=True)
q_lr_df['constant'] = 1

q_lr_df.head()

Out[71]:

timepoint	1 non irrad	2 irrad @ 4 Gy	3 B	4 C	constant
patient id
1	1195.0	724.0	231.0	372.0	1
2	1166.0	481.0	292.0	1182.0	1
3	1190.0	1006.0	407.0	503.0	1
5	1159.0	319.0	234.0	521.0	1
6	1188.0	929.0	587.0	124.0	1

In [119]:

# from statsmodels.genmod.generalized_linear_model import GLM

In [134]:

results = sm.GLM(q_lr_df['4 C'], q_lr_df[['1 non irrad', '2 irrad @ 4 Gy', 'constant']], 
          family=sm.families.NegativeBinomial()).fit()

In [135]:

results.summary()

Out[135]:

Generalized Linear Model Regression Results
Dep. Variable:	4 C	No. Observations:	14
Model:	GLM	Df Residuals:	11
Model Family:	NegativeBinomial	Df Model:	2
Link Function:	log	Scale:	1.0000
Method:	IRLS	Log-Likelihood:	-106.83
Date:	Thu, 20 Feb 2020	Deviance:	3.8896
Time:	15:40:07	Pearson chi2:	3.13
No. Iterations:	6	Covariance Type:	nonrobust

	coef	std err	z	P>\|z\|	[0.025	0.975]
1 non irrad	-0.0359	0.016	-2.296	0.022	-0.067	-0.005
2 irrad @ 4 Gy	0.0011	0.001	0.881	0.378	-0.001	0.003
constant	47.9832	18.544	2.587	0.010	11.637	84.330

In [72]:

x_names = [['1 non irrad'], ['1 non irrad', '2 irrad @ 4 Gy'],] 
y_name = '4 C'

numst_df_list = []

for x_name in x_names:
    name = x_name[0]
    
    x = q_lr_df[x_name].values.reshape(-1, len(x_name))
    y = q_lr_df['4 C'].values.reshape(-1, 1)
    regression = LinearRegression().fit(x, y)
    print(f"Linear regression for {x_name} vs. {y_name}:\nR2 is {regression.score(x, y):.4f}")
    
    numst_df_list.append([', '.join(x_name), '4 C', round(regression.score(x, y), 4)])

Linear regression for ['1 non irrad'] vs. 4 C:
R2 is 0.4328
Linear regression for ['1 non irrad', '2 irrad @ 4 Gy'] vs. 4 C:
R2 is 0.5536

In [73]:

qLM_metrics = pd.DataFrame(numst_df_list, columns=['Model features', 'Target', 'Linear regression R2 score'])
qLM_metrics['Model features'] = qLM_metrics['Model features'].astype('str')
qLM_metrics.head()

Out[73]:

	Model features	Target	Linear regression R2 score
0	1 non irrad	4 C	0.4328
1	1 non irrad, 2 irrad @ 4 Gy	4 C	0.5536

In [74]:

sns.set_style(rc= {'patch.edgecolor': 'black'})
sns.set_color_codes()

fig = plt.figure(figsize=(6,4))
ax = sns.lineplot(x='Model features', y='Linear regression R2 score', data=qLM_metrics, 
                  linewidth=6, sort=False, hue='Target', palette={'4 C':'#fdff38'}, legend=False, marker='o', 
                  **{'markersize':14, 
                     'mec':'black', 
                     'mew':1})

ax.set_ylabel('Linear regression R2 score', fontsize=14)
ax.set_xlabel('Model features', fontsize=14,)
# ax.tick_params(labelsize=14,)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.ylim(0,1)
plt.title('Predicting post-therapy numbers of short telomeres', fontsize=14)

plt.savefig('../graphs/paper figures/main figs/linear regression metrics numbers short telos teloFISH.png', 
            dpi=400, bbox_inches = "tight")

Linear regression long telo counts¶

In [75]:

long_lr_df = all_patients_df.pivot(index='patient id', columns='timepoint', values='Q4')
long_lr_df.drop(13, inplace=True)
long_lr_df['constant'] = 1

long_lr_df.head()

Out[75]:

timepoint	1 non irrad	2 irrad @ 4 Gy	3 B	4 C	constant
patient id
1	1180.0	1526.0	2912.0	1987.0	1
2	1164.0	1537.0	2527.0	602.0	1
3	1164.0	1350.0	2074.0	1808.0	1
5	1166.0	2831.0	2893.0	2026.0	1
6	1174.0	1515.0	2184.0	3806.0	1

In [76]:

# results = sm.GLM(long_lr_df['4 C'], long_lr_df[['1 non irrad', '2 irrad @ 4 Gy', 'constant']], 
#           family=sm.families.NegativeBinomial()).fit()

# results.summary()

In [77]:

x_names = [['1 non irrad'], ['1 non irrad', '2 irrad @ 4 Gy'],] 
y_name = '4 C'

long_df_list = []

for x_name in x_names:
    name = x_name[0]
    
    x = long_lr_df[x_name].values.reshape(-1, len(x_name))
    y = long_lr_df['4 C'].values.reshape(-1, 1)
    regression = LinearRegression().fit(x, y)
    print(f"Linear regression for {x_name} vs. {y_name}:\nR2 is {regression.score(x, y):.4f}")
    
    long_df_list.append([', '.join(x_name), '4 C', round(regression.score(x, y), 4)])

Linear regression for ['1 non irrad'] vs. 4 C:
R2 is 0.0459
Linear regression for ['1 non irrad', '2 irrad @ 4 Gy'] vs. 4 C:
R2 is 0.2082

In [78]:

long_LM_metrics = pd.DataFrame(long_df_list, columns=['Model features', 'Target', 'Linear regression R2 score'])
long_LM_metrics['Model features'] = long_LM_metrics['Model features'].astype('str')
long_LM_metrics.head()

Out[78]:

	Model features	Target	Linear regression R2 score
0	1 non irrad	4 C	0.0459
1	1 non irrad, 2 irrad @ 4 Gy	4 C	0.2082

In [79]:

sns.set_style(rc= {'patch.edgecolor': 'black'})
sns.set_color_codes()

fig = plt.figure(figsize=(6,4))
ax = sns.lineplot(x='Model features', y='Linear regression R2 score', data=long_LM_metrics, 
                  linewidth=6, sort=False, hue='Target', palette={'4 C':'#ffbacd'}, legend=False, marker='o', 
                  **{'markersize':14, 
                     'mec':'black', 
                     'mew':1})

ax.set_ylabel('Linear regression R2 score', fontsize=14)
ax.set_xlabel('Model features', fontsize=14,)
# ax.tick_params(labelsize=14,)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.ylim(-.05,1)
plt.title('Predicting post-therapy numbers of long telomeres', fontsize=14)

plt.savefig('../graphs/paper figures/main figs/linear regression metrics numbers long telos teloFISH.png', 
            dpi=400, bbox_inches = "tight")

Analyzing Chromosome Aberration Data from dGH¶

In [6]:

all_patients_df[['patient id', 'timepoint', 'telo means']].shape

Out[6]:

(59, 3)

In [7]:

# alteryx testing

inter = pd.read_csv('../data/compiled patient data csv files/all_chr_aberr_df.csv')
inter = inter[['patient id', 'timepoint', 'cell number', '# inversions', '# terminal inversions']].copy()
inter['total inversions'] = inter['# inversions'] + inter['# terminal inversions']

inter = inter.groupby(['patient id', 'timepoint', 'cell number']).agg('mean').reset_index()
inter.drop(['# inversions', '# terminal inversions'], axis=1, inplace=True)
merge = inter.merge(all_patients_df[['patient id', 'timepoint', 'telo means']], on=['patient id', 'timepoint'], how='inner')

print(merge.shape)
merge.head(4)

(1770, 5)

Out[7]:

	patient id	timepoint	cell number	total inversions	telo means
0	1	1 non irrad	1	0.000000	84.796483
1	1	1 non irrad	2	0.333333	84.796483
2	1	1 non irrad	3	0.000000	84.796483
3	1	1 non irrad	4	0.000000	84.796483

In [8]:

# cleaned_chr_df.head(8)

In [9]:

all_chr_aberr_df = pd.read_csv('../data/compiled patient data csv files/all_chr_aberr_df.csv')
general_cleaner = Pipeline([('cleaner', trp.general_chr_aberr_cleaner(drop_what_timepoint=False, adjust_clonality=True))])
cleaned_chr_df = general_cleaner.fit_transform(all_chr_aberr_df)

In [10]:

cleaned_chr_df['# inversions'] = cleaned_chr_df['# inversions'] + cleaned_chr_df['# terminal inversions']
cleaned_chr_df.drop(['# terminal inversions'], axis=1, inplace=True)

In [11]:

melt_aberrations = pd.melt(cleaned_chr_df, id_vars=['patient id', 'timepoint'],
                           var_name='aberration type', value_name='count per cell')

melt_aberrations['count per cell'] = melt_aberrations['count per cell'].astype('int64')
melt_aberrations['aberration type'] = melt_aberrations['aberration type'].astype('str')

In [12]:

melt_aberrations['aberration type'].unique()

Out[12]:

array(['# inversions', '# sister chromatid exchanges', '# dicentrics',
       '# excess chr fragments', '# sat associations', '# terminal SCEs',
       '# translocations'], dtype=object)

In [13]:

melt_aberrations.head(8)

Out[13]:

	patient id	timepoint	aberration type	count per cell
0	1	1 non irrad	# inversions	0
1	1	1 non irrad	# inversions	1
2	1	1 non irrad	# inversions	0
3	1	1 non irrad	# inversions	0
4	1	1 non irrad	# inversions	0
5	1	1 non irrad	# inversions	0
6	1	1 non irrad	# inversions	0
7	1	1 non irrad	# inversions	0

Visualizing Chromosome Rearrangements¶

In [14]:

# melt_aberrations_chr_only = melt_aberrations[~melt_aberrations['aberration type'].isin(['# sub-telo SCEs', 'tricentrics',
#                                                                                         '# dicentrics', '# translocations',
#                                                                                         '# sat associations', 'cell number'])].copy()

# ax = sns.set(font_scale=2)
# ax = sns.catplot(y='aberration type', x='count per cell', hue='chromosome', 
#                  col='timepoint', col_wrap=2, 
#                  data=melt_aberrations_chr_only, kind='bar', height=7, aspect=1.5, orient="h",)

# ax.set_ylabels('')
# ax.set_xlabels('average count per cell')

In [13]:

df = melt_aberrations[melt_aberrations['aberration type'] != '# terminal SCEs'].copy()
df.rename({'timepoint':'Time point'}, axis=1, inplace=True)

ax = sns.set_style(style="darkgrid",rc= {'patch.edgecolor': 'black'})
# ax = sns.set(font_scale=1.35)
ax = sns.catplot(y='aberration type', x='count per cell',
                 col='Time point', col_wrap=2,  
                 data=df, 
                 kind='bar', height=3., aspect=1.5, orient="h",)

fontsize=14

ax.set_ylabels('', fontsize=fontsize)
ax.set_xlabels('', fontsize=fontsize)
plt.xlabel('Average aberration frequency per cell', horizontalalignment='right', x=0.45,
           fontsize=14)

axes = ax.axes.flatten()
axes[0].set_title("1 non irrad", fontsize=fontsize)
axes[1].set_title("2 irrad @ 4 Gy", fontsize=fontsize)
axes[2].set_title("3 B", fontsize=fontsize)
axes[3].set_title("4 C", fontsize=fontsize)

ax.savefig('../graphs/paper figures/main figs/all patients rearrangements.png', dpi=400,
          bbox_inches = "tight")

In [27]:

aberr = melt_aberrations[melt_aberrations['aberration type'] == '# inversions']
aberr.head()

Out[27]:

	patient id	timepoint	aberration type	count per cell
0	1	1 non irrad	# inversions	0
1	1	1 non irrad	# inversions	1
2	1	1 non irrad	# inversions	0
3	1	1 non irrad	# inversions	0
4	1	1 non irrad	# inversions	0

In [31]:

res = pairwise_tukeyhsd(aberr['count per cell'], aberr['timepoint'])
res.summary()

Out[31]:

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1	group2	meandiff	lower	upper	reject
1 non irrad	2 irrad @ 4 Gy	0.3595	0.2046	0.5144	True
1 non irrad	3 B	0.6262	0.4713	0.7811	True
1 non irrad	4 C	0.3405	0.1856	0.4954	True
2 irrad @ 4 Gy	3 B	0.2667	0.1117	0.4216	True
2 irrad @ 4 Gy	4 C	-0.019	-0.174	0.1359	False
3 B	4 C	-0.2857	-0.4406	-0.1308	True

In [32]:

from statsmodels.stats.libqsturng import psturng

print(list(psturng(np.abs(res.meandiffs / res.std_pairs), len(res.groupsunique), res.df_total)))

[0.001, 0.001, 0.001, 0.001, 0.9, 0.001]

In [62]:

trp.graph_all_aberrations_freq(melt_aberrations=melt_aberrations)

In [61]:

import importlib
%load_ext autoreload
%autoreload 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [60]:

trp.graph_all_aberrations_freq(melt_aberrations=melt_aberrations, aberr_list=['# sister chromatid exchanges'])

In [33]:

plt.figure(figsize=(6,6))
ax=sns.set(font_scale=1.5)
ax = sns.lineplot(x='timepoint', y='count per cell', data=melt_aberrations[melt_aberrations['aberration type'] != '# terminal SCEs'],
                  hue='aberration type', 
#                   palette=sns.color_palette("terrain", melt_aberrations['aberration type'].nunique()),
                  )
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)

Out[33]:

<matplotlib.legend.Legend at 0x12c47d150>

Statistics Chromosome Rearrangements¶

In [17]:

label_encoder = preprocessing.LabelEncoder()
melt_aberrations['timepoint_encoded'] = label_encoder.fit_transform(melt_aberrations['timepoint'])

In [18]:

trp.chr_scipy_anova_post_hoc_tests(df0=melt_aberrations[melt_aberrations['aberration type'] != '# terminal SCEs'],
                                   post_hoc='tukeyHSD', repeated_measures=True)

# inversions 2.8793128491455715e-15
        Multiple Comparison of Means - Tukey HSD, FWER=0.05        
===================================================================
    group1         group2     meandiff p-adj  lower   upper  reject
-------------------------------------------------------------------
   1 non irrad 2 irrad @ 4 Gy   0.3595 0.001  0.2046  0.5144   True
   1 non irrad            3 B   0.6262 0.001  0.4713  0.7811   True
   1 non irrad            4 C   0.3405 0.001  0.1856  0.4954   True
2 irrad @ 4 Gy            3 B   0.2667 0.001  0.1117  0.4216   True
2 irrad @ 4 Gy            4 C   -0.019   0.9  -0.174  0.1359  False
           3 B            4 C  -0.2857 0.001 -0.4406 -0.1308   True
-------------------------------------------------------------------
# inversions pvalues: [0.001, 0.001, 0.001, 0.001, 0.9, 0.001]


# sister chromatid exchanges 0.06700814306980919


# dicentrics 3.774831915280363e-21
        Multiple Comparison of Means - Tukey HSD, FWER=0.05         
====================================================================
    group1         group2     meandiff p-adj   lower   upper  reject
--------------------------------------------------------------------
   1 non irrad 2 irrad @ 4 Gy    0.669  0.001  0.5682  0.7699   True
   1 non irrad            3 B     0.35  0.001  0.2492  0.4508   True
   1 non irrad            4 C   0.2357  0.001  0.1349  0.3365   True
2 irrad @ 4 Gy            3 B   -0.319  0.001 -0.4199 -0.2182   True
2 irrad @ 4 Gy            4 C  -0.4333  0.001 -0.5341 -0.3325   True
           3 B            4 C  -0.1143 0.0189 -0.2151 -0.0135   True
--------------------------------------------------------------------
# dicentrics pvalues: [0.001, 0.001, 0.001, 0.001, 0.001, 0.01885686962035671]


# excess chr fragments 5.420958629361807e-13
        Multiple Comparison of Means - Tukey HSD, FWER=0.05        
===================================================================
    group1         group2     meandiff p-adj  lower   upper  reject
-------------------------------------------------------------------
   1 non irrad 2 irrad @ 4 Gy     0.45 0.001  0.3478  0.5522   True
   1 non irrad            3 B   0.2095 0.001  0.1073  0.3117   True
   1 non irrad            4 C   0.1881 0.001  0.0859  0.2903   True
2 irrad @ 4 Gy            3 B  -0.2405 0.001 -0.3427 -0.1383   True
2 irrad @ 4 Gy            4 C  -0.2619 0.001 -0.3641 -0.1597   True
           3 B            4 C  -0.0214   0.9 -0.1236  0.0808  False
-------------------------------------------------------------------
# excess chr fragments pvalues: [0.001, 0.001, 0.001, 0.001, 0.001, 0.9]


# sat associations 2.5079736386352722e-05
        Multiple Comparison of Means - Tukey HSD, FWER=0.05         
====================================================================
    group1         group2     meandiff p-adj   lower   upper  reject
--------------------------------------------------------------------
   1 non irrad 2 irrad @ 4 Gy   0.0595 0.4338 -0.0419   0.161  False
   1 non irrad            3 B   0.2143  0.001  0.1128  0.3157   True
   1 non irrad            4 C   0.1024 0.0469  0.0009  0.2038   True
2 irrad @ 4 Gy            3 B   0.1548  0.001  0.0533  0.2562   True
2 irrad @ 4 Gy            4 C   0.0429 0.6754 -0.0586  0.1443  False
           3 B            4 C  -0.1119 0.0238 -0.2133 -0.0105   True
--------------------------------------------------------------------
# sat associations pvalues: [0.4337670435960369, 0.001, 0.04694268219187914, 0.001, 0.6754137733207328, 0.02383659311705344]


# translocations 1.9719725719939544e-10
        Multiple Comparison of Means - Tukey HSD, FWER=0.05         
====================================================================
    group1         group2     meandiff p-adj   lower   upper  reject
--------------------------------------------------------------------
   1 non irrad 2 irrad @ 4 Gy   0.2071  0.001   0.148  0.2663   True
   1 non irrad            3 B   0.0738 0.0074  0.0146   0.133   True
   1 non irrad            4 C   0.0738 0.0074  0.0146   0.133   True
2 irrad @ 4 Gy            3 B  -0.1333  0.001 -0.1925 -0.0742   True
2 irrad @ 4 Gy            4 C  -0.1333  0.001 -0.1925 -0.0742   True
           3 B            4 C      0.0    0.9 -0.0592  0.0592  False
--------------------------------------------------------------------
# translocations pvalues: [0.001, 0.007448304853158949, 0.007448304853158949, 0.001, 0.001, 0.9]

In [19]:

cleaned_chr_df['aberration index'] = (cleaned_chr_df['# inversions'] + 
                                      cleaned_chr_df['# translocations'] + cleaned_chr_df['# dicentrics'] + 
                                      cleaned_chr_df['# excess chr fragments'])

In [20]:

# pivoting out aberrations for linear regression

group_chr = cleaned_chr_df.groupby(['patient id', 'timepoint']).agg('mean').reset_index()
# pivot_chr = group_chr.pivot(index='patient id', columns='timepoint', values='# inversions')

In [21]:

row = []
aberr_types = [col for col in group_chr.columns if col != 'patient id' and col != 'timepoint']

for aberr in aberr_types:
    pivot_chr = group_chr.pivot(index='patient id', columns='timepoint', values=aberr)
    x_name2 = ['1 non irrad']
    x_name3 = ['1 non irrad', '2 irrad @ 4 Gy']
    y_name = '4 C'

#     print(f'ABERRATION TYPE | {aberr}')
    for x_name in [x_name2, x_name3]:
        x = pivot_chr[x_name].values.reshape(-1, len(x_name))
        y = pivot_chr['4 C'].values.reshape(-1, 1)

        regression = LinearRegression().fit(x, y)
#         print(f"Linear regression for {x_name} vs. {y_name}:\nR2 is {regression.score(x, y):.4f}")
#         print('\n')
        row.append(['Linear Regression', aberr, x_name, y_name, f'{regression.score(x, y):.4f}'])
    
LM_aberr_r2 = pd.DataFrame(data=row, columns=['Model', 'Aberration type', 'Features', 'Target', 'Linear regression R2 score'])

In [22]:

LM_aberr_r2['Features'] = LM_aberr_r2['Features'].apply(lambda row: ', '.join(row))
LM_aberr_r2['Linear regression R2 score'] = LM_aberr_r2['Linear regression R2 score'].astype('float64')

In [23]:

chr_interest = ['# inversions', 
                '# translocations', '# dicentrics', '# excess chr fragments',
                'aberration index']

chr_df = LM_aberr_r2[LM_aberr_r2['Aberration type'].isin(chr_interest)]

In [54]:

chr_df.reset_index(inplace=True, drop=True)

In [64]:

path=f'../graphs/paper figures/supp figs/linear regression chr aberr models.png'
trp.render_mpl_table(chr_df, col_width=3.5, path=path)

Out[64]:

<matplotlib.axes._subplots.AxesSubplot at 0x1298fdb90>

In [65]:

mtl_LM_metrics = pd.DataFrame(mtl_df_list, columns=['Features', 'Target', 'Linear regression R2 score'])
mtl_LM_metrics['Features'] = mtl_LM_metrics['Features'].astype('str')
mtl_LM_metrics['Model'] = 'Linear Regression'
mtl_LM_metrics['Telomeric outcome'] = 'Mean Telomere Length'
mtl_ordered = mtl_LM_metrics[['Model', 'Telomeric outcome', 'Features', 'Target', 'Linear regression R2 score']]

path=f'../graphs/paper figures/supp figs/linear regression mean telomere length models.png'
trp.render_mpl_table(mtl_ordered, col_width=3.5, path=path)

Out[65]:

<matplotlib.axes._subplots.AxesSubplot at 0x12b2a8750>

In [66]:

qLM_LM_metrics = pd.DataFrame(numst_df_list, columns=['Features', 'Target', 'Linear regression R2 score'])
qLM_LM_metrics['Features'] = qLM_LM_metrics['Features'].astype('str')
qLM_LM_metrics['Model'] = 'Linear Regression'
qLM_LM_metrics['Telomeric outcome'] = 'Number of short telomeres'
qLM_ordered = qLM_LM_metrics[['Model', 'Telomeric outcome', 'Features', 'Target', 'Linear regression R2 score']]

path=f'../graphs/paper figures/supp figs/linear regression number of short telomeres models.png'
trp.render_mpl_table(qLM_ordered, col_width=3.5, path=path)

Out[66]:

<matplotlib.axes._subplots.AxesSubplot at 0x12bb661d0>

In [67]:

long_LM_metrics = pd.DataFrame(long_df_list, columns=['Features', 'Target', 'Linear regression R2 score'])
long_LM_metrics['Features'] = long_LM_metrics['Features'].astype('str')
long_LM_metrics['Model'] = 'Linear Regression'
long_LM_metrics['Telomeric outcome'] = 'Number of long telomeres'
long_ordered = long_LM_metrics[['Model', 'Telomeric outcome', 'Features', 'Target', 'Linear regression R2 score']]

path=f'../graphs/paper figures/supp figs/linear regression number of long telomeres models.png'
trp.render_mpl_table(long_ordered, col_width=3.5, path=path)

Out[67]:

<matplotlib.axes._subplots.AxesSubplot at 0x12a37eed0>

In [41]:

sns.color_palette("tab10")

Out[41]:

[(0.12156862745098039, 0.4666666666666667, 0.7058823529411765),
 (1.0, 0.4980392156862745, 0.054901960784313725),
 (0.17254901960784313, 0.6274509803921569, 0.17254901960784313),
 (0.8392156862745098, 0.15294117647058825, 0.1568627450980392),
 (0.5803921568627451, 0.403921568627451, 0.7411764705882353),
 (0.5490196078431373, 0.33725490196078434, 0.29411764705882354),
 (0.8901960784313725, 0.4666666666666667, 0.7607843137254902),
 (0.4980392156862745, 0.4980392156862745, 0.4980392156862745),
 (0.7372549019607844, 0.7411764705882353, 0.13333333333333333),
 (0.09019607843137255, 0.7450980392156863, 0.8117647058823529)]

In [43]:

blue = sns.color_palette("tab10")[0]
red = sns.color_palette("tab10")[1]
orange = sns.color_palette("tab10")[2]
green = sns.color_palette("tab10")[3]
purple = sns.color_palette("tab10")[4]

# blue orange green red purple

palette_choice = [blue, orange, green, red, purple]
palette_choice

Out[43]:

[(0.12156862745098039, 0.4666666666666667, 0.7058823529411765),
 (0.17254901960784313, 0.6274509803921569, 0.17254901960784313),
 (0.8392156862745098, 0.15294117647058825, 0.1568627450980392),
 (1.0, 0.4980392156862745, 0.054901960784313725),
 (0.5803921568627451, 0.403921568627451, 0.7411764705882353)]

In [45]:

ax = sns.set(font_scale=1.2)
col_order=['# inversions', '# translocations', '# dicentrics', '# excess chr fragments', 'aberration index']
ax = sns.relplot(x='Features', y='Linear regression R2 score', data=chr_df, 
                 col='Aberration type', hue='Aberration type', kind='line', col_order=col_order, 
                 col_wrap=3, legend=False, height=2.9, aspect=1.40, palette=palette_choice,
                 facet_kws={'sharey':True, 'sharex':True},
                 **{'linewidth':3, 'marker':'o', 'markersize':10,
                    'mec':'black', 'mew':0.5, })

ax.set_xlabels('Model features')

# ax.set_xticklabels(ha='center')
# ax[0].set_title('# inversions')
ax_list = ax.axes
ax_list[0].set_title('inversions')
ax_list[1].set_title('translocations')
ax_list[2].set_title('dicentrics')
ax_list[3].set_title('excess chr fragments')
ax_list[4].set_title('aberration index')

plt.ylim(-0.05,0.6)

plt.tight_layout(pad=.7)
plt.savefig(f'../graphs/paper figures/supp figs/linear regression performance for all chr aberr types.png',
           dpi=400, bbox_inches='tight')

In [27]:

ax = sns.set(font_scale=1.2)
col_order=['# inversions', '# translocations', '# dicentrics', '# excess chr fragments', 'aberration index']
ax = sns.lmplot(x='Features', y='Linear regression R2 score', data=chr_df, 
                 col='Aberration type', hue='Aberration type', col_order=col_order, 
                 col_wrap=3, legend=False, height=2.9, aspect=1.40, 
#                  facet_kws={'sharey':True, 'sharex':True},
#                  **{'linewidth':3, 'marker':'o', 'markersize':10,
#                     'mec':'black', 'mew':0.5, }
               )

ax.set_xlabels('Model features')

# ax.set_xticklabels(ha='center')
# ax[0].set_title('# inversions')
ax_list = ax.axes
ax_list[0].set_title('inversions')
ax_list[1].set_title('translocations')
ax_list[2].set_title('dicentrics')
ax_list[3].set_title('excess chr fragments')
ax_list[4].set_title('aberration index')

plt.ylim(-0.05,0.6)

plt.tight_layout(pad=.7)
# plt.savefig(f'../graphs/paper figures/supp figs/linear regression performance for all chr aberr types.png',
#            dpi=400, bbox_inches='tight')

---------------------------------------------------------------------------
UFuncTypeError                            Traceback (most recent call last)
<ipython-input-27-0fd50c5976da> in <module>
      1 ax = sns.set(font_scale=1.2)
      2 col_order=['# inversions', '# translocations', '# dicentrics', '# excess chr fragments', 'aberration index']
----> 3 ax = sns.lmplot(x='Features', y='Linear regression R2 score', data=chr_df, 
      4                  col='Aberration type', hue='Aberration type', col_order=col_order,
      5                  col_wrap=3, legend=False, height=2.9, aspect=1.40,

/usr/local/lib/python3.8/site-packages/seaborn/regression.py in lmplot(x, y, data, hue, col, row, palette, col_wrap, height, aspect, markers, sharex, sharey, hue_order, col_order, row_order, legend, legend_out, x_estimator, x_bins, x_ci, scatter, fit_reg, ci, n_boot, units, seed, order, logistic, lowess, robust, logx, x_partial, y_partial, truncate, x_jitter, y_jitter, scatter_kws, line_kws, size)
    614         scatter_kws=scatter_kws, line_kws=line_kws,
    615         )
--> 616     facets.map_dataframe(regplot, x, y, **regplot_kws)
    617 
    618     # Add a legend

/usr/local/lib/python3.8/site-packages/seaborn/axisgrid.py in map_dataframe(self, func, *args, **kwargs)
    826 
    827             # Draw the plot
--> 828             self._facet_plot(func, ax, args, kwargs)
    829 
    830         # Finalize the annotations and layout

/usr/local/lib/python3.8/site-packages/seaborn/axisgrid.py in _facet_plot(self, func, ax, plot_args, plot_kwargs)
    844 
    845         # Draw the plot
--> 846         func(*plot_args, **plot_kwargs)
    847 
    848         # Sort out the supporting information

/usr/local/lib/python3.8/site-packages/seaborn/regression.py in regplot(x, y, data, x_estimator, x_bins, x_ci, scatter, fit_reg, ci, n_boot, units, seed, order, logistic, lowess, robust, logx, x_partial, y_partial, truncate, dropna, x_jitter, y_jitter, label, color, marker, scatter_kws, line_kws, ax)
    815     scatter_kws["marker"] = marker
    816     line_kws = {} if line_kws is None else copy.copy(line_kws)
--> 817     plotter.plot(ax, scatter_kws, line_kws)
    818     return ax
    819 

/usr/local/lib/python3.8/site-packages/seaborn/regression.py in plot(self, ax, scatter_kws, line_kws)
    367 
    368         if self.fit_reg:
--> 369             self.lineplot(ax, line_kws)
    370 
    371         # Label the axes

/usr/local/lib/python3.8/site-packages/seaborn/regression.py in lineplot(self, ax, kws)
    410         """Draw the model."""
    411         # Fit the regression model
--> 412         grid, yhat, err_bands = self.fit_regression(ax)
    413         edges = grid[0], grid[-1]
    414 

/usr/local/lib/python3.8/site-packages/seaborn/regression.py in fit_regression(self, ax, x_range, grid)
    198                 else:
    199                     x_min, x_max = ax.get_xlim()
--> 200             grid = np.linspace(x_min, x_max, 100)
    201         ci = self.ci
    202 

<__array_function__ internals> in linspace(*args, **kwargs)

/usr/local/lib/python3.8/site-packages/numpy/core/function_base.py in linspace(start, stop, num, endpoint, retstep, dtype, axis)
    118     # Convert float/complex array scalars to float, gh-3504
    119     # and make sure one can use variables that have an __array_interface__, gh-6634
--> 120     start = asanyarray(start) * 1.0
    121     stop  = asanyarray(stop)  * 1.0
    122 

UFuncTypeError: ufunc 'multiply' did not contain a loop with signature matching types (dtype('<U32'), dtype('<U32')) -> dtype('<U32')

Analyzing Complete Blood Count Data¶

Loading TeloFISH, chromosome aberration, CBC data¶

In [2]:

# loading TeloFISH data

all_patients_df = pd.read_csv('../data/compiled patient data csv files/all_patients_df.csv')
all_patients_df['telo data'] = all_patients_df['telo data'].map(literal_eval)
all_patients_df.drop(['telo data', 'Q1', 'Q2-3', 'Q4'], axis=1, inplace=True)

In [3]:

# loading chromosome aberration data

all_chr_aberr_df = pd.read_csv('../data/compiled patient data csv files/all_chr_aberr_df.csv')

general_cleaner = Pipeline([('cleaner', trp.general_chr_aberr_cleaner(drop_what_timepoint=False, adjust_clonality=True))])
cleaned_chr_df = general_cleaner.fit_transform(all_chr_aberr_df)
cleaned_chr_df['# inversions'] = cleaned_chr_df['# inversions'] + cleaned_chr_df['# terminal inversions']
cleaned_chr_df.drop(['# terminal inversions'], axis=1, inplace=True)

grp_chr_aberr = cleaned_chr_df.groupby(['patient id', 'timepoint']).agg('mean').reset_index()

In [4]:

# loading qPCR telo data

all_qPCR_df = pd.read_csv('../data/qPCR telo data/all_qPCR_df.csv')

In [5]:

# loading cbc counts data

cbc_data = pd.read_csv('../data/compiled patient data csv files/cleaned cbc data.csv')
cbc_data.drop(['date', 'on AA 0=no 1=yes', 'LN 0=n 1=y', '  dose   0=78  1=70   2= implant 3=70/28 4=1800/3 bst'], 
              axis=1, inplace=True)

In [6]:

# merging teloFISH + CBC, qPCR + CBC, chr aberrations + CBC, & merging all

merge_telofish_cbc = all_patients_df.merge(cbc_data, on=['patient id', 'timepoint'])

merge_qpcr = all_qPCR_df.merge(cbc_data, on=['patient id', 'timepoint'])

merge_chr_cbc = grp_chr_aberr.merge(cbc_data, on=['patient id', 'timepoint'])

merge_all = merge_telofish_cbc.merge(grp_chr_aberr, on=['patient id', 'timepoint'])

TeloFISH vs. CBC data¶

In [7]:

merge_telofish_cbc.corr()

Out[7]:

	patient id	telo means	WBC	RBC	Hgb	Hct	mcv	mch	mchc	rdw	...	cd3 abs	cd4%	cd4 abs	cd 8%	cd 8 abs	cd 19%	cd 19 abs	cd4/cd8	NK %	NK abs
patient id	1.000000	-0.482406	0.045076	0.056003	0.216487	0.086520	0.151196	0.270799	0.347655	-0.220234	...	0.047696	-0.345496	-0.060331	0.377962	0.201079	0.113894	0.093738	-0.419719	-0.240242	-0.141554
telo means	-0.482406	1.000000	-0.126405	-0.063405	-0.048631	0.009699	0.117051	0.054275	-0.145268	-0.007756	...	-0.265807	0.281594	-0.190007	-0.250713	-0.331702	-0.288298	-0.304504	0.429559	0.408281	0.118537
WBC	0.045076	-0.126405	1.000000	0.001271	0.220666	0.227359	0.271600	0.260500	0.054146	-0.372158	...	0.539096	0.306203	0.639397	-0.225986	0.340598	0.170815	0.437403	0.267823	-0.185876	0.289421
RBC	0.056003	-0.063405	0.001271	1.000000	0.330533	0.325687	-0.551065	-0.448978	0.063059	0.378307	...	0.407696	-0.037649	0.386152	0.158299	0.333421	-0.075314	0.126309	-0.060543	-0.103375	0.323583
Hgb	0.216487	-0.048631	0.220666	0.330533	1.000000	0.921122	0.374157	0.494990	0.439348	-0.352577	...	0.361162	0.053451	0.324169	0.382794	0.368324	-0.038466	0.089672	-0.257840	-0.265620	0.210809
Hct	0.086520	0.009699	0.227359	0.325687	0.921122	1.000000	0.383547	0.350255	0.056114	-0.230922	...	0.369794	0.032424	0.346737	0.253293	0.343606	0.072362	0.176192	-0.152870	-0.256686	0.213496
mcv	0.151196	0.117051	0.271600	-0.551065	0.374157	0.383547	1.000000	0.918503	0.106445	-0.582201	...	-0.118412	0.056052	-0.117447	0.046161	-0.052865	0.130086	0.044068	-0.050749	-0.095841	-0.169225
mch	0.270799	0.054275	0.260500	-0.448978	0.494990	0.350255	0.918503	1.000000	0.489273	-0.661689	...	-0.078138	0.083939	-0.091850	0.193523	0.008687	0.001873	-0.034821	-0.155774	-0.118079	-0.130829
mchc	0.347655	-0.145268	0.054146	0.063059	0.439348	0.056114	0.106445	0.489273	1.000000	-0.394782	...	0.066935	0.089432	0.028703	0.397382	0.143590	-0.265709	-0.178768	-0.299353	-0.106153	0.023885
rdw	-0.220234	-0.007756	-0.372158	0.378307	-0.352577	-0.230922	-0.582201	-0.661689	-0.394782	1.000000	...	-0.078095	-0.075194	-0.119642	0.022586	-0.052700	-0.114159	-0.089429	-0.001233	-0.051859	-0.198498
plt	-0.412318	0.172354	0.243272	-0.074261	0.049405	0.040060	0.055999	0.066981	0.028868	-0.220423	...	0.408450	0.197672	0.410429	0.046549	0.358229	-0.147660	0.042702	0.075712	0.000616	0.289992
mpv	-0.258088	-0.000113	0.169130	0.225262	0.234935	0.392096	0.139594	0.006954	-0.288044	0.203591	...	0.268907	0.082519	0.290063	-0.089949	0.183797	0.216494	0.284548	0.109532	-0.146806	0.078580
gran%	-0.265054	0.180082	0.009123	-0.559908	-0.411022	-0.424544	0.206388	0.156887	-0.056742	-0.062757	...	-0.714490	0.252633	-0.595861	-0.415881	-0.760667	-0.166623	-0.422975	0.364609	0.293757	-0.377525
lmp%	0.027142	-0.196474	0.088627	0.559527	0.375069	0.414239	-0.273757	-0.244848	-0.013491	0.060101	...	0.801260	-0.197470	0.716819	0.283195	0.786176	0.213352	0.513762	-0.262537	-0.262625	0.501336
mono%	0.547701	0.015066	-0.153847	-0.176707	-0.027397	-0.054069	0.331129	0.321180	0.066422	-0.099505	...	-0.348530	-0.200644	-0.394392	0.152340	-0.217742	0.023082	-0.179080	-0.149404	0.054731	-0.315797
eos%	0.206229	0.119629	-0.121313	0.079285	0.145357	0.107267	0.007345	0.054057	0.125795	-0.066832	...	-0.258075	0.203747	-0.239980	-0.027444	-0.256531	-0.285941	-0.343021	0.210397	0.066135	-0.135670
baso%	0.010755	0.301144	0.013914	-0.005616	-0.001234	0.094450	0.073392	-0.015497	-0.204428	-0.250134	...	-0.041744	0.346744	0.016373	-0.219396	-0.118437	-0.178747	-0.176549	0.465297	0.098092	0.027343
nrbc	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
gran#	-0.118379	0.016935	0.876199	-0.243316	0.031462	0.036594	0.352784	0.316216	0.017757	-0.354281	...	0.119143	0.397890	0.262774	-0.377870	-0.079640	0.065011	0.170038	0.405330	-0.031604	0.064017
lymph#	0.076382	-0.293638	0.586373	0.391279	0.336236	0.357343	-0.104188	-0.080423	0.026185	-0.118091	...	0.948706	-0.033763	0.928520	0.105314	0.847489	0.278416	0.673670	-0.110802	-0.308250	0.546756
mono#	0.501696	-0.094394	0.618297	-0.153716	0.145985	0.101529	0.448138	0.460818	0.158968	-0.360540	...	0.129578	0.072046	0.151939	0.016709	0.102851	0.116811	0.174917	0.020066	-0.128706	-0.043695
eos#	0.214030	0.023208	0.563596	0.073564	0.307509	0.278495	0.200661	0.236600	0.154073	-0.290308	...	0.116345	0.401192	0.206862	-0.177053	-0.028970	-0.157342	-0.065269	0.372025	-0.074399	0.078128
baso#	-0.001150	0.278747	0.409894	0.070176	0.166313	0.276380	0.195955	0.099034	-0.187635	-0.342541	...	0.118416	0.433728	0.231272	-0.331775	-0.063651	-0.088281	-0.019791	0.566157	0.059975	0.150053
wbc	0.032423	-0.115737	0.998091	-0.003767	0.211040	0.221447	0.272310	0.257000	0.043900	-0.371530	...	0.540795	0.309680	0.643888	-0.235111	0.339412	0.172790	0.441893	0.277237	-0.178744	0.295736
Lym%	-0.009158	-0.182016	0.058556	0.547182	0.338682	0.376962	-0.296733	-0.267865	-0.020465	0.091123	...	0.804096	-0.206268	0.717933	0.291912	0.792425	0.210414	0.518444	-0.282559	-0.258935	0.503031
Abs lym	0.045180	-0.265932	0.568844	0.386510	0.313005	0.341351	-0.111267	-0.095086	0.005047	-0.114043	...	0.958922	-0.025345	0.944371	0.087354	0.851973	0.278000	0.683801	-0.088489	-0.290770	0.569064
cd3%	0.020425	-0.001117	0.051438	0.165321	0.336405	0.239413	0.005171	0.130770	0.327250	0.036569	...	0.270924	0.651917	0.217464	0.638208	0.319912	-0.654257	-0.358162	-0.088182	-0.630943	-0.314475
cd3 abs	0.047696	-0.265807	0.539096	0.407696	0.361162	0.369794	-0.118412	-0.078138	0.066935	-0.078095	...	1.000000	0.103674	0.964798	0.239357	0.919864	0.085409	0.503935	-0.116753	-0.396157	0.430248
cd4%	-0.345496	0.281594	0.306203	-0.037649	0.053451	0.032424	0.056052	0.083939	0.089432	-0.075194	...	0.103674	1.000000	0.239781	-0.148355	-0.082118	-0.593569	-0.337081	0.636785	-0.193412	-0.050243
cd4 abs	-0.060331	-0.190007	0.639397	0.386152	0.324169	0.346737	-0.117447	-0.091850	0.028703	-0.119642	...	0.964798	0.239781	1.000000	0.032233	0.789152	0.060347	0.482205	0.095431	-0.284642	0.540355
cd 8%	0.377962	-0.250713	-0.225986	0.158299	0.382794	0.253293	0.046161	0.193523	0.397382	0.022586	...	0.239357	-0.148355	0.032233	1.000000	0.509250	-0.240891	-0.109063	-0.785708	-0.615321	-0.363972
cd 8 abs	0.201079	-0.331702	0.340598	0.333421	0.368324	0.343606	-0.052865	0.008687	0.143590	-0.052700	...	0.919864	-0.082118	0.789152	0.509250	1.000000	0.097894	0.462219	-0.406645	-0.504392	0.201142
cd 19%	0.113894	-0.288298	0.170815	-0.075314	-0.038466	0.072362	0.130086	0.001873	-0.265709	-0.114159	...	0.085409	-0.593569	0.060347	-0.240891	0.097894	1.000000	0.811833	-0.229258	-0.088258	0.074464
cd 19 abs	0.093738	-0.304504	0.437403	0.126309	0.089672	0.176192	0.044068	-0.034821	-0.178768	-0.089429	...	0.503935	-0.337081	0.482205	-0.109063	0.462219	0.811833	1.000000	-0.177355	-0.288432	0.284034
cd4/cd8	-0.419719	0.429559	0.267823	-0.060543	-0.257840	-0.152870	-0.050749	-0.155774	-0.299353	-0.001233	...	-0.116753	0.636785	0.095431	-0.785708	-0.406645	-0.229258	-0.177355	1.000000	0.401723	0.272365
NK %	-0.240242	0.408281	-0.185876	-0.103375	-0.265620	-0.256686	-0.095841	-0.118079	-0.106153	-0.051859	...	-0.396157	-0.193412	-0.284642	-0.615321	-0.504392	-0.088258	-0.288432	0.401723	1.000000	0.458031
NK abs	-0.141554	0.118537	0.289421	0.323583	0.210809	0.213496	-0.169225	-0.130829	0.023885	-0.198498	...	0.430248	-0.050243	0.540355	-0.363972	0.201142	0.074464	0.284034	0.272365	0.458031	1.000000

37 rows × 37 columns

Longitudinal correlations between mean telomere length & CBC data¶

In [11]:

# parsing high correlation data

df = merge_telofish_cbc.copy()

hi_r_variables = []

for col in df.columns:
    if col != 'patient id' and col != 'telo means' and col != 'timepoint':
        r2_value = df[['telo means', col]].corr().iloc[0][1]
        if abs(r2_value) > 0:
            hi_r_variables.append([col, r2_value])
#             print(col, r2_value)

In [12]:

r2_telos_corr = pd.DataFrame(hi_r_variables, columns=['marker', 'R2 correlation'])
r2_telos_celltype = r2_telos_corr[r2_telos_corr['marker'].isin(['lymph#', 'mono#', 'gran#', 'eos#', 'baso#'])].copy()
r2_telos_lymph_percent = r2_telos_corr[r2_telos_corr['marker'].isin(['NK %', 'cd4%', 'cd 8%', 'cd 19%'])].copy()

In [13]:

m_r2_telos_celltype = r2_telos_celltype.copy()

m_r2_telos_celltype['marker'].iloc[0] = 'granulocyes'
m_r2_telos_celltype['marker'].iloc[1] = 'lymphocytes'
m_r2_telos_celltype['marker'].iloc[2] = 'monocytes'
m_r2_telos_celltype['marker'].iloc[3] = 'eosinophils'
m_r2_telos_celltype['marker'].iloc[4] = 'basophils'

In [18]:

df = m_r2_telos_celltype.copy()
x = df['R2 correlation']
df['colors'] = ['black' if x < 0 else 'green' for x in df['R2 correlation']]
df.sort_values('R2 correlation', inplace=True)
df.reset_index(inplace=True, drop=True)

plt.figure(figsize=(7,3.2))
plt.hlines(y=df.index, xmin=0, xmax=df['R2 correlation'], color=df['colors'], alpha=0.6, linewidth=14,)

# Decorations
plt.yticks(df.index, df['marker'], fontsize=14)
plt.xticks(fontsize=14)
plt.xlabel('Correlation (R) values with Mean Telomere Length', fontsize=14)
plt.ylabel('Counts of peripheral WBC types', fontsize=14)

plt.grid(linestyle='-', alpha=.2, color='black')
plt.tight_layout()

plt.savefig('../graphs/paper figures/supp figs/correlations between WBC cell type counts and telomere length.png', dpi=600,
            bbox_inches='tight')

In [19]:

m_r2_telos_lymph_percent = r2_telos_lymph_percent.copy()

m_r2_telos_lymph_percent['marker'].iloc[0] = 'CD4 %'
m_r2_telos_lymph_percent['marker'].iloc[1] = 'CD8 %'
m_r2_telos_lymph_percent['marker'].iloc[2] = 'CD19 %'
m_r2_telos_lymph_percent['marker'].iloc[3] = 'NK %'

In [20]:

df = m_r2_telos_lymph_percent
x = df['R2 correlation']
df['colors'] = ['black' if x < 0 else 'green' for x in df['R2 correlation']]
df.sort_values('R2 correlation', inplace=True)
df.reset_index(inplace=True, drop=True)

plt.figure(figsize=(7,3.2))
plt.hlines(y=df.index, xmin=0, xmax=df['R2 correlation'], color=df['colors'], alpha=0.6, linewidth=14,)

# Decorations
plt.yticks(df.index, df['marker'], fontsize=14)
plt.xticks(fontsize=14)
plt.xlabel('Correlation (R) values with Mean Telomere Length', fontsize=14)
plt.ylabel('Percentage of lymphocytes', fontsize=14)

plt.grid(linestyle='-', alpha=.2, color='black')
plt.tight_layout()

plt.savefig('../graphs/paper figures/supp figs/correlations between proportions of lymphocytes and telomere length.png', 
            dpi=600,
            bbox_inches='tight')

In [21]:

merge_telofish_cbc.head(3)

Out[21]:

	patient id	timepoint	telo means	WBC	RBC	Hgb	Hct	mcv	mch	mchc	...	cd3 abs	cd4%	cd4 abs	cd 8%	cd 8 abs	cd 19%	cd 19 abs	cd4/cd8	NK %	NK abs
0	1	1 non irrad	84.796483	8.7	4.86	14.7	43.6	89.7	30.2	33.7	...	1740	50	1175	23	541	17	400	2.17	7	158
1	1	3 B	116.779989	5.7	4.79	14.7	43.0	89.8	30.7	34.2	...	528	64	404	19	119	4	27	3.40	9	57
2	1	4 C	99.346299	5.9	4.58	14.8	43.9	95.9	32.3	33.7	...	431	50	294	21	124	11	64	2.38	14	83

3 rows × 38 columns

In [28]:

trp.graph_cyto_cbc_data(plot_cyto_y='telo means', plot_cbc_y='WBC', df=merge_telofish_cbc,
                        cyto_name='Mean Telomere Length', cbc_name='Total WBC counts',
                        ylim1=(80, 120),
                        ylim2=(3.25, 8))

Chromosome aberrations vs. CBC data¶

In [704]:

# merge_chr_cbc.corr()

In [705]:

merge_chr_cbc.head()

Out[705]:

	patient id	timepoint	# inversions	# sister chromatid exchanges	# dicentrics	# excess chr fragments	# sat associations	# terminal SCEs	# translocations	WBC	...	cd3 abs	cd4%	cd4 abs	cd 8%	cd 8 abs	cd 19%	cd 19 abs	cd4/cd8	NK %	NK abs
0	1	1 non irrad	0.233333	0.633333	0.000000	0.000000	0.133333	0.566667	0.033333	8.7	...	1740	50	1175	23	541	17	400	2.17	7	158
1	1	3 B	1.266667	0.700000	0.366667	0.433333	0.766667	0.800000	0.100000	5.7	...	528	64	404	19	119	4	27	3.40	9	57
2	1	4 C	0.566667	0.933333	0.066667	0.266667	0.366667	0.766667	0.133333	5.9	...	431	50	294	21	124	11	64	2.38	14	83
3	2	1 non irrad	0.300000	0.533333	0.000000	0.033333	0.333333	0.766667	0.033333	5.3	...	740	42	472	25	275	15	170	1.72	15	171
4	2	3 B	1.066667	0.700000	0.300000	0.166667	0.466667	1.000000	0.033333	5.6	...	203	53	147	22	61	2	6	2.43	24	67

5 rows × 44 columns

In [707]:

sns.lmplot(x='lymph#', y='# inversions', lowess=True, data=merge_chr_cbc)

Out[707]:

<seaborn.axisgrid.FacetGrid at 0x14d47d1d0>

Longitudinal correlations between chromosome aberrations & CBC data¶

In [23]:

chr_corr_df = pd.DataFrame()

for aberr in grp_chr_aberr.columns:
    if aberr != 'patient id' and aberr != 'timepoint':
        test = grp_chr_aberr[['patient id', 'timepoint', aberr]].merge(cbc_data, on=['patient id', 'timepoint']) 

        df = test.copy()

        hi_r_variables = []

        for col in df:
            if col == 'lymph#':
                r2_value = df[[aberr, col]].corr().iloc[0][1]
                if abs(r2_value) > 0.60:
                    hi_r_variables.append([col, r2_value])
                    print(aberr, col, r2_value)
                    
#         single_aberr_df = pd.DataFrame(hi_r_variables, columns=['CBC data', f'R2 correlation {aberr}'])
    

# inversions lymph# -0.7522869776095474
# dicentrics lymph# -0.7509787076100194
# excess chr fragments lymph# -0.6717065000901444
# translocations lymph# -0.6015085894070261

In [24]:

cbc_corr_telo = pd.DataFrame(hi_r_variables, columns=['CBC data', 'R2 correlation'])
cbc_corr_telo.sort_values(by='R2 correlation', ascending=False).reset_index(drop=True)

Out[24]:

	CBC data	R2 correlation
0	lymph#	-0.601509

In [29]:

trp.graph_cyto_cbc_data(plot_cyto_y='# inversions', plot_cbc_y='lymph#', df=merge_chr_cbc,
                    cyto_name='# inversions', cbc_name='Lymphocyte cell counts',
                    ax_color1='teal'
                    )

In [32]:

trp.graph_cyto_cbc_data(plot_cyto_y='# translocations', plot_cbc_y='lymph#', df=merge_chr_cbc,
                    cyto_name='# translocations', cbc_name='Lymphocyte cell counts',
                    ax_color1='red',
                    ylim1=(0, 0.15)
                   )

In [33]:

trp.graph_cyto_cbc_data(plot_cyto_y='# dicentrics', plot_cbc_y='lymph#', df=merge_chr_cbc,
                    cyto_name='# dicentrics', cbc_name='Lymphocyte cell counts',
                    ax_color1='orange'
                    )

In [34]:

trp.graph_cyto_cbc_data(plot_cyto_y='# excess chr fragments', plot_cbc_y='lymph#', df=merge_chr_cbc,
                    cyto_name='# excess chr fragments', cbc_name='Lymphocyte cell counts',
                    ax_color1='purple',
                    ylim1=(0, 0.35)
                    )

In [127]:

cbc_corr_telo = pd.DataFrame(hi_r_variables, columns=['CBC data', 'R2 correlation'])
cbc_corr_telo.sort_values(by='R2 correlation', ascending=False).reset_index(drop=True)

Out[127]:

	CBC data	R2 correlation
0	cd4/cd8	0.429559
1	NK %	0.408281
2	baso%	0.301144
3	cd4%	0.281594
4	baso#	0.278747
5	cd 8%	-0.250713
6	cd3 abs	-0.265807
7	Abs lym	-0.265932
8	cd 19%	-0.288298
9	lymph#	-0.293638
10	cd 19 abs	-0.304504
11	cd 8 abs	-0.331702

In [149]:

chr_corr_df = pd.DataFrame()

for aberr in grp_chr_aberr.columns:
    if aberr != 'patient id' and aberr != 'timepoint':
        test = grp_chr_aberr[['patient id', 'timepoint', aberr]].merge(cbc_data, on=['patient id', 'timepoint']) 

        df = test.copy()

        hi_r_variables = []

        for col in df:
            if col == 'lymph#':
                r2_value = df[[aberr, col]].corr().iloc[0][1]
                if abs(r2_value) > 0.60:
                    hi_r_variables.append([col, r2_value])
                    print(aberr, col, r2_value)
                    
#         single_aberr_df = pd.DataFrame(hi_r_variables, columns=['CBC data', f'R2 correlation {aberr}'])
    

# inversions lymph# -0.7522869776095474
# dicentrics lymph# -0.7509787076100194
# excess chr fragments lymph# -0.6717065000901444
# translocations lymph# -0.6015085894070261

Addressing whether recovery fraction of lymphocytes (pre vs. post) correlates with chromosome aberrations¶

In [361]:

# parsing out time / lymph#
time_cbc = merge_chr_cbc[['patient id', 'timepoint', 'lymph#']]
recovery_percent = time_cbc.pivot_table(index=['patient id'], values='lymph#', columns='timepoint').reset_index()

In [487]:

# feature engineering lymph# percent recovery
recovery_percent['pre vs. post recov %'] = (recovery_percent['4 C'] / recovery_percent['1 non irrad']) * 100
recovery_percent['pre vs. immed post %'] = (recovery_percent['3 B'] / recovery_percent['1 non irrad']) * 100

In [488]:

plt.figure(figsize=(7,3.2))

fontsize=14

ax = sns.barplot(x='patient id', y='pre vs. immed post %', data=recovery_percent)

ax.set_ylabel('Lymphocyte percent recovery', fontsize=fontsize)  
ax.set_xlabel('patient IDs', fontsize=fontsize)

# enforcing tick sizes
ax.tick_params(labelsize=14)

labels = ax.get_yticks().tolist()
labels_percent = []
for l in labels:
    labels_percent.append(str(l) + '%')
ax.set_yticklabels(labels_percent)

# plt.savefig('../graphs/paper figures/supp figs/lymphocyte percent recovery.png', dpi=400,
#            bbox_inches = "tight")

Out[488]:

[Text(0, 0, '0.0%'),
 Text(0, 0, '10.0%'),
 Text(0, 0, '20.0%'),
 Text(0, 0, '30.0%'),
 Text(0, 0, '40.0%'),
 Text(0, 0, '50.0%'),
 Text(0, 0, '60.0%')]

In [489]:

# merging lymphocyte percent recovery w/ chr aberr & cbc dataframe

merge_recov_percent_chr = merge_chr_cbc.merge(recovery_percent[['patient id', 
                                                                'pre vs. post recov %',
                                                                'pre vs. immed post %']],
                                                                on=['patient id'])

merge_recov_percent_chr = merge_recov_percent_chr[merge_recov_percent_chr['timepoint'] == '4 C'].copy()

In [495]:

# calculating R2 correlation values between aberrations of interest & pre/post lymphocyte recovery

metric = 'pre vs. post recov %'

aberrations = ['# inversions', '# dicentrics', '# translocations', '# excess chr fragments']
r2_list = []

for aberr in aberrations:
    r2_value = merge_recov_percent_chr[[aberr, metric]].corr()[metric][0]
    r2_list.append([aberr, round(r2_value, 3)])

r2_df = pd.DataFrame(r2_list, columns=['Aberration', 'R2 value'])

In [496]:

plt.figure(figsize=(7,3.2))

fontsize=14

ax = sns.barplot(y='Aberration', x='R2 value', data=r2_df, orient='h')

ax.set_ylabel('Average frequencies', fontsize=fontsize)  
ax.set_xlabel('R2 Correlation with post-therapy lymphocyte recovery', fontsize=fontsize)
# plt.xticks(rotation=30)

# enforcing tick sizes
ax.tick_params(labelsize=14)

# plt.savefig('../graphs/paper figures/supp figs/correlation aberrations lymphocyte percent recovery.png', dpi=400,
#            bbox_inches='tight')

In [497]:

merge_chr_cbc.head()

Out[497]:

	patient id	timepoint	# inversions	# sister chromatid exchanges	# dicentrics	# excess chr fragments	# sat associations	# terminal SCEs	# translocations	WBC	...	cd3 abs	cd4%	cd4 abs	cd 8%	cd 8 abs	cd 19%	cd 19 abs	cd4/cd8	NK %	NK abs
0	1	1 non irrad	0.233333	0.633333	0.000000	0.000000	0.133333	0.566667	0.033333	8.7	...	1740	50	1175	23	541	17	400	2.17	7	158
1	1	3 B	1.266667	0.700000	0.366667	0.433333	0.766667	0.800000	0.100000	5.7	...	528	64	404	19	119	4	27	3.40	9	57
2	1	4 C	0.566667	0.933333	0.066667	0.266667	0.366667	0.766667	0.133333	5.9	...	431	50	294	21	124	11	64	2.38	14	83
3	2	1 non irrad	0.300000	0.533333	0.000000	0.033333	0.333333	0.766667	0.033333	5.3	...	740	42	472	25	275	15	170	1.72	15	171
4	2	3 B	1.066667	0.700000	0.300000	0.166667	0.466667	1.000000	0.033333	5.6	...	203	53	147	22	61	2	6	2.43	24	67

5 rows × 44 columns