# tools for handling files
import sys
import os
# pandas/numpy for handling data
import pandas as pd
import numpy as np
from pandas import ExcelWriter
from pandas import ExcelFile
# for reading individual telomere length data from files
from ast import literal_eval
# for grabbing individual cells
import more_itertools
# my module containing functions for handling/visualizing/analyzing telomere length/chr rearrangement data
import telomere_methods_rad_patient as trp
# incase reloading modules is required
import importlib
%load_ext autoreload
%autoreload
The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload
...
all_patients_dict = trp.generate_dictionary_from_TeloLength_data('../data/raw patient teloFISH data/')
SW9A non irrad.xlsx data extraction in progress.. BJ1 for SW9_.xlsx data extraction in progress.. SW11A non irrad.xlsx data extraction in progress.. BJ1 for SW15_.xlsx data extraction in progress.. SW6A non irrad.xlsx data extraction in progress.. SW6A irrad @ 4 Gy.xlsx data extraction in progress.. SW8B.xlsx data extraction in progress.. SW14A irrad @ 4 Gy.xlsx data extraction in progress.. SW8A irrad @ 4 Gy.xlsx data extraction in progress.. SW5A irrad @ 4 Gy.xlsx data extraction in progress.. SW8C.xlsx data extraction in progress.. SW1A non irrad.xlsx data extraction in progress.. BJ1 for SW11_.xlsx data extraction in progress.. SW16A non irrad.xlsx data extraction in progress.. BJ1 for SW13_.xlsx data extraction in progress.. BJ-hTERT for SW9_.xlsx data extraction in progress.. BJ1 for SW14_.xlsx data extraction in progress.. SW9B.xlsx data extraction in progress.. BJ1 for SW8_.xlsx data extraction in progress.. SW_1_ok_3_C_.xlsx data extraction in progress.. SW3A irrad @ 4 Gy.xlsx data extraction in progress.. SW11A irrad @ 4 Gy.xlsx data extraction in progress.. BJ1 for SW16_.xlsx data extraction in progress.. BJ1 for SW12_.xlsx data extraction in progress.. SW8A non irrad.xlsx data extraction in progress.. BJ-hTERT for SW8_.xlsx data extraction in progress.. SW10A non irrad.xlsx data extraction in progress.. SW12A irrad @ 4 Gy.xlsx data extraction in progress.. SW9C.xlsx data extraction in progress.. BJ1 for SW10_.xlsx data extraction in progress.. SW7A non irrad.xlsx data extraction in progress.. SW1A irrad @ 4 Gy.xlsx data extraction in progress.. SW13A irrad @ 4 Gy.xlsx data extraction in progress.. SW1B.xlsx data extraction in progress.. BJ-hTERT for SW6_.xlsx data extraction in progress.. SW13B.xlsx data extraction in progress.. BJ1 for SW2_.xlsx data extraction in progress.. SW2A non irrad.xlsx data extraction in progress.. SW5C.xlsx data extraction in progress.. SW15C.xlsx data extraction in progress.. SW7C.xlsx data extraction in progress.. SW11B.xlsx data extraction in progress.. SW3B.xlsx data extraction in progress.. BJ-hTERT for SW15_.xlsx data extraction in progress.. SW15A non irrad.xlsx data extraction in progress.. SW12A non irrad.xlsx data extraction in progress.. BJ-hTERT for SW11_.xlsx data extraction in progress.. SW3C.xlsx data extraction in progress.. SW11C.xlsx data extraction in progress.. SW7B.xlsx data extraction in progress.. SW15B.xlsx data extraction in progress.. BJ1 for SW6_.xlsx data extraction in progress.. BJ-hTERT for SW2_.xlsx data extraction in progress.. SW5B.xlsx data extraction in progress.. SW5A non irrad.xlsx data extraction in progress.. SW1C.xlsx data extraction in progress.. BJ-hTERT for SW13_.xlsx data extraction in progress.. SW10A irrad @ 4 Gy.xlsx data extraction in progress.. SW2A irrad @ 4 Gy.xlsx data extraction in progress.. BJ1 for SW1_.xlsx data extraction in progress.. SW10B.xlsx data extraction in progress.. BJ-hTERT for SW5_.xlsx data extraction in progress.. SW2B.xlsx data extraction in progress.. SW13A non irrad.xlsx data extraction in progress.. SW14C.xlsx data extraction in progress.. SW6C.xlsx data extraction in progress.. SW9A irrad @ 4 Gy.xlsx data extraction in progress.. SW16A irrad @ 4 Gy.xlsx data extraction in progress.. BJ-hTERT for SW14_.xlsx data extraction in progress.. BJ-hTERT for SW16_.xlsx data extraction in progress.. SW16C.xlsx data extraction in progress.. BJ1 for SW3_.xlsx data extraction in progress.. SW12B.xlsx data extraction in progress.. BJ-hTERT for SW7_.xlsx data extraction in progress.. SW12C.xlsx data extraction in progress.. SW16B.xlsx data extraction in progress.. BJ-hTERT for SW3_.xlsx data extraction in progress.. BJ1 for SW7_.xlsx data extraction in progress.. BJ-hTERT for SW12_.xlsx data extraction in progress.. SW3A non irrad.xlsx data extraction in progress.. SW15A irrad @ 4 Gy.xlsx data extraction in progress.. SW7A irrad @ 4 Gy.xlsx data extraction in progress.. BJ-hTERT for SW10_.xlsx data extraction in progress.. SW6B.xlsx data extraction in progress.. SW14B.xlsx data extraction in progress.. BJ-hTERT for SW1_.xlsx data extraction in progress.. SW14A non irrad.xlsx data extraction in progress.. BJ1 for SW5_.xlsx data extraction in progress.. SW2C.xlsx data extraction in progress.. SW10C.xlsx data extraction in progress.. completed file collection
all_patients_df = trp.generate_dataframe_from_dict(all_patients_dict)
# don't need telo means per cell @ this time
all_patients_df = all_patients_df.drop(['cell data'], axis=1)
print(all_patients_df.shape)
(59, 7)
# changing telo data to list in prep for saving to csv
all_patients_df['telo data'] = all_patients_df['telo data'].apply(lambda row: row.tolist())
all_patients_df.to_csv('../data/compiled patient data csv files/all_patients_df.csv', index=False)
melted_all_patients_df = pd.melt(
all_patients_df,
id_vars = [col for col in all_patients_df.columns if col != 'Q1' and col != 'Q2-3' and col != 'Q4'],
var_name='relative Q',
value_name='Q freq counts')
melted_all_patients_df['Q freq counts'] = melted_all_patients_df['Q freq counts'].astype('float64')
melted_all_patients_df.head(4)
patient id | timepoint | telo data | telo means | relative Q | Q freq counts | |
---|---|---|---|---|---|---|
0 | 1 | 1 non irrad | [79.18994405924711, 58.07204491719145, 95.0279... | 84.796483 | Q1 | 1195.0 |
1 | 1 | 2 irrad @ 4 Gy | [149.93296075217452, 138.31843562348496, 106.6... | 90.975826 | Q1 | 724.0 |
2 | 1 | 3 B | [176.32960877192357, 111.92066838585988, 123.5... | 116.779989 | Q1 | 231.0 |
3 | 1 | 4 C | [144.65363114822472, 84.46927366319692, 78.133... | 99.346299 | Q1 | 372.0 |
melted_all_patients_df.to_csv('../data/compiled patient data csv files/melted_all_patients_df.csv', index=False)
pivot_patients_telo_means_df = all_patients_df.pivot(index='patient id', columns='timepoint', values='telo means')
pivot_patients_telo_means_df = pivot_patients_telo_means_df.drop(13)
pivot_patients_telo_means_df.to_csv('../data/compiled patient data csv files/pivot_patients_telo_means_df.csv', index=False)
# can imagine the lists containing the individual telos per patient exploding to the right; maintains the index relationship
explode_telos_raw = all_patients_df['telo data'].apply(pd.Series)
print(explode_telos_raw.shape)
explode_telos_raw.head(4)
(59, 4600)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 4590 | 4591 | 4592 | 4593 | 4594 | 4595 | 4596 | 4597 | 4598 | 4599 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 79.189944 | 58.072045 | 95.027933 | 106.642458 | 55.960334 | 62.296089 | 81.301676 | 109.810056 | 123.536313 | 181.608938 | ... | 71.798883 | 122.480447 | 50.681564 | 81.301676 | 40.122905 | 115.088234 | 114.032379 | 59.128492 | 63.351322 | 46.457636 |
1 | 149.932961 | 138.318436 | 106.642458 | 101.363128 | 101.363128 | 80.245810 | 111.921788 | 127.758499 | 102.418994 | 57.016760 | ... | 47.513966 | 92.916201 | 84.469274 | 89.748603 | 47.513966 | 102.418994 | 117.201117 | 107.697247 | 92.916201 | 71.798883 |
2 | 176.329609 | 111.920668 | 123.535077 | 177.385475 | 117.201117 | 139.374302 | 99.251397 | 51.736913 | 145.708040 | 154.156424 | ... | 127.759776 | 66.519553 | 92.916201 | 48.569832 | 145.709497 | 114.033519 | 156.268156 | 137.262570 | 141.486033 | 182.664804 |
3 | 144.653631 | 84.469274 | 78.133297 | 139.372908 | 98.195531 | 53.849162 | 68.631285 | 62.296089 | 76.022346 | 125.578131 | ... | 112.977654 | 109.808958 | 111.920668 | 127.759776 | 105.586592 | 100.307262 | 128.814354 | 143.597765 | 157.664571 | 115.089385 |
4 rows × 4600 columns
exploded_telos_all_patients_df = (explode_telos_raw
# we'll merge the exploded telos df w/ our original all patients df on the index!
.merge(all_patients_df, right_index = True, left_index = True)
.drop(['telo data', 'Q1', 'Q2-3', 'Q4'], axis = 1)
.melt(id_vars = ['patient id', 'timepoint', 'telo means'], value_name = "individual telomeres")
.drop("variable", axis = 1)
.dropna())
exploded_telos_all_patients_df.head(4)
patient id | timepoint | telo means | individual telomeres | |
---|---|---|---|---|
0 | 1 | 1 non irrad | 84.796483 | 79.189944 |
1 | 1 | 2 irrad @ 4 Gy | 90.975826 | 149.932961 |
2 | 1 | 3 B | 116.779989 | 176.329609 |
3 | 1 | 4 C | 99.346299 | 144.653631 |
exploded_telos_all_patients_df.to_csv('../data/compiled patient data csv files/exploded_telos_all_patients_df.csv', index=False)
all_chr_aberr_df = trp.make_dataframe_chr_aberr_data('../data/dGH scoresheets/')
<DirEntry 'SW14_timepoints_subtelodGH.xlsx'> <DirEntry 'SW3_timepoints_subtelodGH.xlsx'> <DirEntry 'SW8_timepoints_subtelodGH.xlsx'> <DirEntry 'SW13_timepoints_subtelodGH.xlsx'> <DirEntry 'SW9_timepoints_subtelodGH.xlsx'> <DirEntry 'SW5_timepoints_subtelodGH.xlsx'> <DirEntry 'SW12_timepoints_subtelodGH.xlsx'> <DirEntry 'SW15_timepoints_subtelodGH.xlsx'> <DirEntry 'SW2_timepoints_subtelodGH.xlsx'> <DirEntry 'SW10_timepoints_subtelodGH.xlsx'> <DirEntry 'SW7_timepoints_subtelodGH.xlsx'> <DirEntry 'SW1_timepoints_subtelodGH.xlsx'> <DirEntry 'SW16_timepoints_subtelodGH.xlsx'> <DirEntry 'SW11_timepoints_subtelodGH.xlsx'> <DirEntry 'SW6_timepoints_subtelodGH.xlsx'>
all_chr_aberr_df.to_csv('../data/compiled patient data csv files/all_chr_aberr_df.csv', index=False)
# loading excel file
cbc_data = pd.read_excel('../data/to colorado.xlsx')
# minor data cleaning
cbc_data.rename({'patient': 'patient id',
'mrn': 'timepoint'}, axis=1, inplace=True)
def extract_patient_ID(row):
if 'SW' in row:
row = row.replace('SW', ' ').strip()
return row
cbc_data['patient id'] = cbc_data['patient id'].apply(lambda row: extract_patient_ID(row))
cbc_data['patient id'] = cbc_data['patient id'].astype('int64')
# saving to file
cbc_data.to_csv('../data/compiled patient data csv files/cleaned cbc data.csv', index=False)