In [11]:

# tools for handling files
import sys
import os

# pandas/numpy for handling data
import pandas as pd
import numpy as np
from pandas import ExcelWriter
from pandas import ExcelFile

# for reading individual telomere length data from files
from ast import literal_eval

# for grabbing individual cells
import more_itertools

# my module containing functions for handling/visualizing/analyzing telomere length/chr rearrangement data
import telomere_methods_rad_patient as trp

# incase reloading modules is required
import importlib
%load_ext autoreload
%autoreload 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

...

Loading Telomere Length Data from TeloFISH¶

Extracting telomere length data output from ImageJ from all radiation therapy patients¶

In [12]:

all_patients_dict = trp.generate_dictionary_from_TeloLength_data('../data/raw patient teloFISH data/')

SW9A non irrad.xlsx data extraction in progress..
BJ1 for SW9_.xlsx data extraction in progress..
SW11A non irrad.xlsx data extraction in progress..
BJ1 for SW15_.xlsx data extraction in progress..
SW6A non irrad.xlsx data extraction in progress..
SW6A irrad @ 4 Gy.xlsx data extraction in progress..
SW8B.xlsx data extraction in progress..
SW14A irrad @ 4 Gy.xlsx data extraction in progress..
SW8A irrad @ 4 Gy.xlsx data extraction in progress..
SW5A irrad @ 4 Gy.xlsx data extraction in progress..
SW8C.xlsx data extraction in progress..
SW1A non irrad.xlsx data extraction in progress..
BJ1 for SW11_.xlsx data extraction in progress..
SW16A non irrad.xlsx data extraction in progress..
BJ1 for SW13_.xlsx data extraction in progress..
BJ-hTERT for SW9_.xlsx data extraction in progress..
BJ1 for SW14_.xlsx data extraction in progress..
SW9B.xlsx data extraction in progress..
BJ1 for SW8_.xlsx data extraction in progress..
SW_1_ok_3_C_.xlsx data extraction in progress..
SW3A irrad @ 4 Gy.xlsx data extraction in progress..
SW11A irrad @ 4 Gy.xlsx data extraction in progress..
BJ1 for SW16_.xlsx data extraction in progress..
BJ1 for SW12_.xlsx data extraction in progress..
SW8A non irrad.xlsx data extraction in progress..
BJ-hTERT for SW8_.xlsx data extraction in progress..
SW10A non irrad.xlsx data extraction in progress..
SW12A irrad @ 4 Gy.xlsx data extraction in progress..
SW9C.xlsx data extraction in progress..
BJ1 for SW10_.xlsx data extraction in progress..
SW7A non irrad.xlsx data extraction in progress..
SW1A irrad @ 4 Gy.xlsx data extraction in progress..
SW13A irrad @ 4 Gy.xlsx data extraction in progress..
SW1B.xlsx data extraction in progress..
BJ-hTERT for SW6_.xlsx data extraction in progress..
SW13B.xlsx data extraction in progress..
BJ1 for SW2_.xlsx data extraction in progress..
SW2A non irrad.xlsx data extraction in progress..
SW5C.xlsx data extraction in progress..
SW15C.xlsx data extraction in progress..
SW7C.xlsx data extraction in progress..
SW11B.xlsx data extraction in progress..
SW3B.xlsx data extraction in progress..
BJ-hTERT for SW15_.xlsx data extraction in progress..
SW15A non irrad.xlsx data extraction in progress..
SW12A non irrad.xlsx data extraction in progress..
BJ-hTERT for SW11_.xlsx data extraction in progress..
SW3C.xlsx data extraction in progress..
SW11C.xlsx data extraction in progress..
SW7B.xlsx data extraction in progress..
SW15B.xlsx data extraction in progress..
BJ1 for SW6_.xlsx data extraction in progress..
BJ-hTERT for SW2_.xlsx data extraction in progress..
SW5B.xlsx data extraction in progress..
SW5A non irrad.xlsx data extraction in progress..
SW1C.xlsx data extraction in progress..
BJ-hTERT for SW13_.xlsx data extraction in progress..
SW10A irrad @ 4 Gy.xlsx data extraction in progress..
SW2A irrad @ 4 Gy.xlsx data extraction in progress..
BJ1 for SW1_.xlsx data extraction in progress..
SW10B.xlsx data extraction in progress..
BJ-hTERT for SW5_.xlsx data extraction in progress..
SW2B.xlsx data extraction in progress..
SW13A non irrad.xlsx data extraction in progress..
SW14C.xlsx data extraction in progress..
SW6C.xlsx data extraction in progress..
SW9A irrad @ 4 Gy.xlsx data extraction in progress..
SW16A irrad @ 4 Gy.xlsx data extraction in progress..
BJ-hTERT for SW14_.xlsx data extraction in progress..
BJ-hTERT for SW16_.xlsx data extraction in progress..
SW16C.xlsx data extraction in progress..
BJ1 for SW3_.xlsx data extraction in progress..
SW12B.xlsx data extraction in progress..
BJ-hTERT for SW7_.xlsx data extraction in progress..
SW12C.xlsx data extraction in progress..
SW16B.xlsx data extraction in progress..
BJ-hTERT for SW3_.xlsx data extraction in progress..
BJ1 for SW7_.xlsx data extraction in progress..
BJ-hTERT for SW12_.xlsx data extraction in progress..
SW3A non irrad.xlsx data extraction in progress..
SW15A irrad @ 4 Gy.xlsx data extraction in progress..
SW7A irrad @ 4 Gy.xlsx data extraction in progress..
BJ-hTERT for SW10_.xlsx data extraction in progress..
SW6B.xlsx data extraction in progress..
SW14B.xlsx data extraction in progress..
BJ-hTERT for SW1_.xlsx data extraction in progress..
SW14A non irrad.xlsx data extraction in progress..
BJ1 for SW5_.xlsx data extraction in progress..
SW2C.xlsx data extraction in progress..
SW10C.xlsx data extraction in progress..
completed file collection

Making dataframe from dict w/ all patients telomere length data, contains telo means & individual telos as list¶

In [13]:

all_patients_df = trp.generate_dataframe_from_dict(all_patients_dict)

# don't need telo means per cell @ this time
all_patients_df = all_patients_df.drop(['cell data'], axis=1)
print(all_patients_df.shape)

(59, 7)

Saving all patients telomere length data for later retrieval¶

In [14]:

# changing telo data to list in prep for saving to csv
all_patients_df['telo data'] = all_patients_df['telo data'].apply(lambda row: row.tolist())
all_patients_df.to_csv('../data/compiled patient data csv files/all_patients_df.csv', index=False)

Generating all patients telo df containing telo counts per quartile melted into tidy data format¶

In [15]:

melted_all_patients_df = pd.melt(
    all_patients_df,
    id_vars = [col for col in all_patients_df.columns if col != 'Q1' and col != 'Q2-3' and col != 'Q4'],
    var_name='relative Q',
    value_name='Q freq counts')

melted_all_patients_df['Q freq counts'] = melted_all_patients_df['Q freq counts'].astype('float64')
melted_all_patients_df.head(4)

Out[15]:

	patient id	timepoint	telo data	telo means	relative Q	Q freq counts
0	1	1 non irrad	[79.18994405924711, 58.07204491719145, 95.0279...	84.796483	Q1	1195.0
1	1	2 irrad @ 4 Gy	[149.93296075217452, 138.31843562348496, 106.6...	90.975826	Q1	724.0
2	1	3 B	[176.32960877192357, 111.92066838585988, 123.5...	116.779989	Q1	231.0
3	1	4 C	[144.65363114822472, 84.46927366319692, 78.133...	99.346299	Q1	372.0

Saving melted all patients df to csv¶

In [16]:

melted_all_patients_df.to_csv('../data/compiled patient data csv files/melted_all_patients_df.csv', index=False)

Pivoted Dataframe w/ timepoints as columns, and telomere length means for each patient timepoint in rows¶

In [17]:

pivot_patients_telo_means_df = all_patients_df.pivot(index='patient id', columns='timepoint', values='telo means')
pivot_patients_telo_means_df = pivot_patients_telo_means_df.drop(13)

Saving pivoted telo means df to file¶

In [18]:

pivot_patients_telo_means_df.to_csv('../data/compiled patient data csv files/pivot_patients_telo_means_df.csv', index=False)

In [19]:

# can imagine the lists containing the individual telos per patient exploding to the right; maintains the index relationship
explode_telos_raw = all_patients_df['telo data'].apply(pd.Series)

print(explode_telos_raw.shape)
explode_telos_raw.head(4)

(59, 4600)

Out[19]:

	0	1	2	3	4	5	6	7	8	9	...	4590	4591	4592	4593	4594	4595	4596	4597	4598	4599
0	79.189944	58.072045	95.027933	106.642458	55.960334	62.296089	81.301676	109.810056	123.536313	181.608938	...	71.798883	122.480447	50.681564	81.301676	40.122905	115.088234	114.032379	59.128492	63.351322	46.457636
1	149.932961	138.318436	106.642458	101.363128	101.363128	80.245810	111.921788	127.758499	102.418994	57.016760	...	47.513966	92.916201	84.469274	89.748603	47.513966	102.418994	117.201117	107.697247	92.916201	71.798883
2	176.329609	111.920668	123.535077	177.385475	117.201117	139.374302	99.251397	51.736913	145.708040	154.156424	...	127.759776	66.519553	92.916201	48.569832	145.709497	114.033519	156.268156	137.262570	141.486033	182.664804
3	144.653631	84.469274	78.133297	139.372908	98.195531	53.849162	68.631285	62.296089	76.022346	125.578131	...	112.977654	109.808958	111.920668	127.759776	105.586592	100.307262	128.814354	143.597765	157.664571	115.089385

4 rows × 4600 columns

In [20]:

exploded_telos_all_patients_df = (explode_telos_raw
                                  
    # we'll merge the exploded telos df w/ our original all patients df on the index!
    .merge(all_patients_df, right_index = True, left_index = True)
    .drop(['telo data', 'Q1', 'Q2-3', 'Q4'], axis = 1)
    .melt(id_vars = ['patient id', 'timepoint', 'telo means'], value_name = "individual telomeres") 
    .drop("variable", axis = 1)
    .dropna())

exploded_telos_all_patients_df.head(4)

Out[20]:

	patient id	timepoint	telo means	individual telomeres
0	1	1 non irrad	84.796483	79.189944
1	1	2 irrad @ 4 Gy	90.975826	149.932961
2	1	3 B	116.779989	176.329609
3	1	4 C	99.346299	144.653631

Saving exploded telomere df for later retrieval¶

In [21]:

exploded_telos_all_patients_df.to_csv('../data/compiled patient data csv files/exploded_telos_all_patients_df.csv', index=False)

Loading Chromosome Aberration Data from Subtelo-dGH¶

In [17]:

all_chr_aberr_df = trp.make_dataframe_chr_aberr_data('../data/dGH scoresheets/')

<DirEntry 'SW14_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW3_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW8_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW13_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW9_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW5_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW12_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW15_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW2_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW10_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW7_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW1_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW16_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW11_timepoints_subtelodGH.xlsx'>
<DirEntry 'SW6_timepoints_subtelodGH.xlsx'>

In [18]:

all_chr_aberr_df.to_csv('../data/compiled patient data csv files/all_chr_aberr_df.csv', index=False)

Loading Complete Blood Count data¶

In [31]:

# loading excel file
cbc_data = pd.read_excel('../data/to colorado.xlsx')

# minor data cleaning
cbc_data.rename({'patient': 'patient id',
                 'mrn': 'timepoint'}, axis=1, inplace=True)

def extract_patient_ID(row):
    if 'SW' in row:
        row = row.replace('SW', '  ').strip()
    return row

cbc_data['patient id'] = cbc_data['patient id'].apply(lambda row: extract_patient_ID(row))
cbc_data['patient id'] = cbc_data['patient id'].astype('int64')

In [32]:

# saving to file
cbc_data.to_csv('../data/compiled patient data csv files/cleaned cbc data.csv', index=False)

In [ ]:

Loading Telomere Length Data from TeloFISH¶

Extracting telomere length data output from ImageJ from all radiation therapy patients¶

Making dataframe from dict w/ all patients telomere length data, contains telo means & individual telos as list¶

Saving all patients telomere length data for later retrieval¶

Generating all patients telo df containing telo counts per quartile melted into tidy data format¶

Saving melted all patients df to csv¶

Pivoted Dataframe w/ timepoints as columns, and telomere length means for each patient timepoint in rows¶

Saving pivoted telo means df to file¶

Exploding individual telomere length measurements from contained list into dataframe (i.e row per individual telomere) while retaining related column info¶

Saving exploded telomere df for later retrieval¶

Loading Chromosome Aberration Data from Subtelo-dGH¶

Loading Complete Blood Count data¶