Here we are looking at the TP53-3p subtyping within the context of clinical variables, one at a time. For multivariate analysis see the Multivariate_Modeling notebook.
import NotebookImport
from Imports import *
importing IPython notebook from Imports.ipynb Populating the interactive namespace from numpy and matplotlib changing to source dirctory populating namespace with data
p53_mut = mut.features.ix['TP53'].ix[keepers_o].dropna()
del_3p = cn.features.ix['Deletion'].ix['3p14.2'].ix[0].ix[keepers_o].dropna()
combo = combine(p53_mut==1, del_3p==-1)
combo = combo.map({'Lesion':'b', 'neither':'a', 'TP53':'c', 'both':'d'})
two_hit = combo=='d'
c2 = colors_st
def st_plot(f):
'''
Global funciton for plotting subtypes in the background of a
clinical variable.
'''
n_unique = len(f.unique())
fig, axs = subplots(1,n_unique, figsize=(n_unique*3+1,3))
for i,g in enumerate(sorted(f.unique())):
ax = axs[i]
draw_survival_curve(two_hit.ix[true_index(f==g)], surv, ax=ax,
colors={0:c2[1], 1: c2[0]})
ax.get_legend().set_visible(False)
ax.set_title(g)
prettify_ax(ax)
axs[-1].legend(loc='upper right', frameon=False)
fig.tight_layout()
print 'Counts for plot below:\n'
print pd.crosstab(f, two_hit).T
print '\n'
def surv_models(f):
lr = get_cox_ph_ms(surv, '_' + two_hit.astype(str), [age, old, f], interactions=None)['LR']
print 'Likelihood ratio of full model compared to background: {0:.1e}\n'.format(lr)
print 'Full model with covariate and age:'
get_cox_ph(surv, '_' + two_hit.astype(str), [age, old, f], print_desc=True, interactions=None);
clinical_processed = clinical.processed
st_plot(clinical_processed.old_age.dropna())
plt.gca().legend().set_visible(False)
Counts for plot below: old_age Age < 75 Age > 75 TP53 False 59 12 True 161 18
smoker = clinical_processed.smoker.dropna()
smoker = smoker.replace({'current reformed smoker for < or = 15 years': 'reformed <= 15y',
'current reformed smoker for > 15 years': 'reformed >= 15y'})
st_plot(smoker)
plt.gca().legend().set_visible(False)
Counts for plot below: smoker current smoker lifelong non-smoker reformed <= 15y reformed >= 15y TP53 False 18 18 12 22 True 66 26 60 18
/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas-0.14.0.dev-py2.7-linux-x86_64.egg/pandas/core/common.py:1847: RuntimeWarning: tp_compare didn't return -1 or -2 for exception sample = v[:min(3,len(v))]
st_plot(clinical_processed.smoker_inferred.dropna())
plt.gca().legend().set_visible(False)
Counts for plot below: smoker_inferred 0 1 TP53 False 41 30 True 47 132 [2 rows x 2 columns]
pts = ti(smoker != 'lifelong non-smoker').intersection(ti(clinical_processed.old_age=='Age < 75'))
c = combine(two_hit.ix[pts], mut.features.ix['MUC5B'])
c = c.replace(['neither','MUC5B'], 'TP53-3p neg.')
c = c.replace(['TP53'], 'TP53-3p alt.')
c = c.replace(['both'], 'TP53-3p, MUC5B')
survival_and_stats(c, surv, colors=colors_st, order=[1,0,2])
c = combine(two_hit.ix[pts], mirna.features.ix['binary'].ix['hsa-mir-548k'])
c = c.replace(['neither','hsa-mir-548k'], 'TP53-3p neg.')
c = c.replace(['TP53'], 'TP53-3p alt.')
c = c.replace(['both'], 'TP53-3p, mir-548k')
survival_and_stats(c, surv, colors=colors_st)
survival_and_stats(combo.ix[pts].dropna(), surv, colors=colors_th)
pd.crosstab(clinical.processed.stage, two_hit).T
stage | Stage i | Stage ii | Stage iii | Stage iv | nx |
---|---|---|---|---|---|
TP53 | |||||
False | 7 | 17 | 12 | 29 | 6 |
True | 7 | 20 | 24 | 106 | 22 |
2 rows × 5 columns
clin_stage = clinical.processed.stage.ix[keepers_o].dropna()
clin_stage.name = 'Clinical_Stage'
survival_and_stats(clin_stage, surv)
clin_stage.dropna().value_counts()
Stage iv 135 Stage ii 37 Stage iii 36 nx 28 Stage i 14 dtype: int64
clin_stage = clinical.stage.clinicalstage.ix[keepers_o]
clin_stage = clin_stage.replace({'stage iva': 'Stage IV', 'stage ivb':'Stage IV', 'stage ivc':'Stage IV',
'stage iii': 'Stage I/II/III', 'stage i': 'Stage I/II/III', 'stage ii': 'Stage I/II/III'})
st_plot(clin_stage)
plt.gca().legend().set_visible(False)
Counts for plot below: clinicalstage Stage I/II/III Stage IV TP53 False 42 29 True 81 98 [2 rows x 2 columns]
surv_models(clin_stage)
Likelihood ratio of full model compared to background: 1.2e-06 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.203 3.33 0.2814 4.28 1.9e-05 old 0.262 1.30 0.0873 3.00 2.7e-03 Likelihood ratio test=29.1 on 2 df, p=4.9e-07 n= 250, number of events= 102
/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas-0.13.0_247_g82bcbb8-py2.7-linux-x86_64.egg/pandas/core/indexing.py:344: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_index,col_indexer] = value instead self.obj[item] = s
grade = clinical.clinical.neoplasmhistologicgrade.replace('gx',nan).dropna()
grade = grade.replace(['g3','g4'], 'g3/g4')
grade.name = 'Grade'
st_plot(grade)
plt.gca().legend().set_visible(False)
Counts for plot below: Grade g1 g2 g3/g4 TP53 False 14 37 16 True 9 125 44 [2 rows x 3 columns]
surv_models(grade)
Likelihood ratio of full model compared to background: 2.7e-05 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.125 3.08 0.304 3.71 0.00021 old 0.276 1.32 0.090 3.06 0.00220 Gradeg2 1.066 2.90 0.527 2.02 0.04300 Gradeg3/g4 0.928 2.53 0.551 1.69 0.09200 Likelihood ratio test=34.7 on 4 df, p=5.3e-07 n= 245, number of events= 99 (120 observations deleted due to missingness)
This is only for 232/259 patients
#st=st.astype(int)
f = clinical.stage.pathologicstage.ix[two_hit.index].fillna('Missing')
f = f.replace('stage iva','stage iv')
f = f.replace('stage ivb','stage iv')
f = f.replace('stage ivc','stage iv')
f.name = 'Pathologic_Stage'
path_stage = f
survival_and_stats(path_stage, surv)
st_plot(path_stage)
Counts for plot below: Pathologic_Stage Missing stage i stage ii stage iii stage iv TP53 False 6 7 17 12 29 True 22 7 20 24 106 [2 rows x 5 columns]
surv_models(path_stage)
Likelihood ratio of full model compared to background: 1.4e-05 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.1228 3.073 0.2879 3.900 9.6e-05 old 0.2925 1.340 0.0892 3.277 1.0e-03 Pathologic_Stagestage i -1.2169 0.296 0.7625 -1.596 1.1e-01 Pathologic_Stagestage ii -0.3196 0.726 0.3920 -0.815 4.1e-01 Pathologic_Stagestage iii -0.2139 0.807 0.3831 -0.558 5.8e-01 Pathologic_Stagestage iv -0.0374 0.963 0.3078 -0.122 9.0e-01 Likelihood ratio test=33.6 on 6 df, p=7.87e-06 n= 250, number of events= 102
ecs = clinical.processed.spread
ecs.name = 'ECS'
survival_and_stats(ecs, surv)
st_plot(ecs.dropna())
Counts for plot below: ECS no yes TP53 False 40 5 True 78 46 [2 rows x 2 columns]
surv_models(ecs)
Likelihood ratio of full model compared to background: 3.8e-03 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 0.960 2.61 0.360 2.67 0.00760 old 0.389 1.48 0.117 3.33 0.00087 ECSyes 0.741 2.10 0.260 2.85 0.00430 Likelihood ratio test=27 on 3 df, p=5.92e-06 n= 169, number of events= 69
spread_infered = clinical.processed.spread_inferred.ix[keepers_o].dropna()
spread_infered = spread_infered.map({0:'No ECS', 1:'ECS'})
survival_and_stats(spread_infered, surv)
st_plot(spread_infered)
Counts for plot below: spread_inferred ECS No ECS TP53 False 6 65 True 53 126 [2 rows x 2 columns]
surv_models(spread_infered)
Likelihood ratio of full model compared to background: 5.9e-05 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.053 2.866 0.2886 3.65 0.00026 old 0.311 1.364 0.0897 3.46 0.00054 spread_inferredNo ECS -0.651 0.522 0.2228 -2.92 0.00350 Likelihood ratio test=37.1 on 3 df, p=4.38e-08 n= 250, number of events= 102
lymph_stage = clinical_processed.lymph_stage.replace('n2', 'n2+').replace('n3', 'n2+').dropna()
lymph_stage = lymph_stage.ix[keepers_o].fillna('nx')
st_plot(lymph_stage)
Counts for plot below: lymph_stage n0 n1 n2+ nx TP53 False 31 9 14 17 True 53 20 76 30 [2 rows x 4 columns]
surv_models(lymph_stage)
Likelihood ratio of full model compared to background: 1.3e-05 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.134 3.11 0.2889 3.925 8.7e-05 old 0.273 1.31 0.0881 3.104 1.9e-03 lymph_stagen1 0.125 1.13 0.3935 0.317 7.5e-01 lymph_stagen2+ 0.612 1.84 0.2642 2.315 2.1e-02 lymph_stagenx 0.801 2.23 0.2840 2.819 4.8e-03 Likelihood ratio test=39.4 on 5 df, p=1.94e-07 n= 250, number of events= 102
f = lymph_stage.replace('n0','n0/n1')
f = f.replace('n1','n0/n1')
f = f.replace('Missing', nan).dropna()
f.name = 'N_stage'
lymph_n0n1 = f
st_plot(lymph_n0n1)
Counts for plot below: N_stage n0/n1 n2+ nx TP53 False 40 14 17 True 73 76 30 [2 rows x 3 columns]
surv_models(lymph_n0n1)
Likelihood ratio of full model compared to background: 1.3e-05 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.134 3.11 0.2889 3.93 8.7e-05 old 0.273 1.31 0.0881 3.10 1.9e-03 N_stagen2+ 0.578 1.78 0.2396 2.41 1.6e-02 N_stagenx 0.767 2.15 0.2616 2.93 3.4e-03 Likelihood ratio test=39.3 on 4 df, p=5.94e-08 n= 250, number of events= 102
lymph_n0n1_noECS =lymph_n0n1.ix[true_index(clinical.processed.spread_inferred==0)].dropna()
st_plot(lymph_n0n1_noECS)
Counts for plot below: N_stage n0/n1 n2+ nx TP53 False 38 10 17 True 63 39 24 [2 rows x 3 columns]
surv_models(lymph_n0n1_noECS)
Likelihood ratio of full model compared to background: 3.2e-05 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.203 3.33 0.317 3.788 0.00015 old 0.221 1.25 0.103 2.141 0.03200 N_stagen2+ 0.268 1.31 0.304 0.881 0.38000 N_stagenx 0.808 2.24 0.288 2.802 0.00510 Likelihood ratio test=27.5 on 4 df, p=1.6e-05 n= 191, number of events= 68
f = clinical.stage.pathologicn.ix[two_hit.index].fillna('Missing')
f = f.map(str.upper)
f = f.replace('NX',nan)
f = f.replace('N2','n2+')
f = f.replace('N2A','n2+')
f = f.replace('N2B','n2+')
f = f.replace('N2C','n2+')
f = f.replace('N3','n2+')
f = f.fillna('MISSING')
f = f.map(str.lower)
f = f.ix[keepers_o].fillna('nx')
f.name = 'Lymph_Stage_Pathologic'
pathologic_n = f
survival_and_stats(pathologic_n, surv)
st_plot(pathologic_n)
Counts for plot below: Lymph_Stage_Pathologic missing n0 n1 n2+ TP53 False 17 31 9 14 True 30 53 20 76 [2 rows x 4 columns]
surv_models(pathologic_n)
Likelihood ratio of full model compared to background: 1.3e-05 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.134 3.107 0.2889 3.925 8.7e-05 old 0.273 1.314 0.0881 3.104 1.9e-03 Lymph_Stage_Pathologicn0 -0.801 0.449 0.2840 -2.819 4.8e-03 Lymph_Stage_Pathologicn1 -0.676 0.509 0.3854 -1.754 8.0e-02 Lymph_Stage_Pathologicn2+ -0.189 0.828 0.2529 -0.748 4.5e-01 Likelihood ratio test=39.4 on 5 df, p=1.94e-07 n= 250, number of events= 102
f = pathologic_n.replace('n0','n0/n1')
f = f.replace('n1','n0/n1')
f = f.replace('Missing', nan).dropna()
pathologic_n0n1 = f
pathologic_n0n1.name = 'path_n0n1'
survival_and_stats(pathologic_n0n1.ix[keepers_o].fillna('Missing Pathology'), surv)
st_plot(pathologic_n0n1.ix[keepers_o].fillna('Missing Pathology'))
plt.gca().legend().set_visible(False)
Counts for plot below: path_n0n1 missing n0/n1 n2+ TP53 False 17 40 14 True 30 73 76 [2 rows x 3 columns]
surv_models(pathologic_n0n1)
Likelihood ratio of full model compared to background: 1.3e-05 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.134 3.108 0.2889 3.925 8.7e-05 old 0.273 1.314 0.0881 3.100 1.9e-03 path_n0n1n0/n1 -0.767 0.464 0.2616 -2.932 3.4e-03 path_n0n1n2+ -0.189 0.827 0.2529 -0.749 4.5e-01 Likelihood ratio test=39.3 on 4 df, p=5.94e-08 n= 250, number of events= 102
pathologic_n0n1_noECS = pathologic_n0n1.ix[true_index(clinical.processed.spread_inferred==0)].dropna()
st_plot(pathologic_n0n1_noECS.ix[keepers_o].fillna('ECS / Missing'))
Counts for plot below: path_n0n1 ECS / Missing missing n0/n1 n2+ TP53 False 6 17 38 10 True 53 24 63 39 [2 rows x 4 columns]
surv_models(pathologic_n0n1_noECS)
Likelihood ratio of full model compared to background: 3.2e-05 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.203 3.329 0.317 3.79 0.00015 old 0.221 1.247 0.103 2.14 0.03200 path_n0n1n0/n1 -0.808 0.446 0.288 -2.80 0.00510 path_n0n1n2+ -0.540 0.583 0.322 -1.68 0.09400 Likelihood ratio test=27.5 on 4 df, p=1.6e-05 n= 191, number of events= 68
st_plot(clinical_processed.spread_inferred.dropna().map({1:'ECS',0:'No ECS'}))
plt.gca().legend().set_visible(False)
Counts for plot below: spread_inferred ECS No ECS TP53 False 6 65 True 53 126 [2 rows x 2 columns]
bsc = pd.read_table('../Extra_Data/HNSC_Followup/biospecimen_tumor_sample_hnsc.txt',
index_col=0, na_values=['[Not Available]'])
tw = bsc.tumor_weight.dropna()
tw = tw.groupby(level=0).first()
tn = bsc.tumor_nuclei_percent.dropna()
tn = tn.groupby(level=0).first()
ts = bsc.tumor_necrosis_percent.dropna()
ts = ts.groupby(level=0).first()
tw.clip_upper(500).hist()
<matplotlib.axes.AxesSubplot at 0x9217050>
tumor_weight = (tw >= 200)*1. + (tw > 200)*1
tumor_weight = tumor_weight.map({0:'200-', 1:'200', 2:'200+'})
tumor_weight.name = 'weight'
survival_and_stats(tumor_weight, surv, figsize=(6,4))
st_plot(tumor_weight)
Counts for plot below: weight 200 200+ 200- TP53 False 24 8 37 True 73 29 70 [2 rows x 3 columns]
surv_models(tumor_weight)
Likelihood ratio of full model compared to background: 1.3e-05 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.106 3.022 0.2840 3.89 9.8e-05 old 0.263 1.301 0.0877 3.00 2.7e-03 weight200- -0.587 0.556 0.2479 -2.37 1.8e-02 weight200+ 0.505 1.657 0.2678 1.89 5.9e-02 Likelihood ratio test=43.9 on 4 df, p=6.84e-09 n= 241, number of events= 101 (59 observations deleted due to missingness)
tn.hist()
<matplotlib.axes.AxesSubplot at 0x9052410>
tumor_nec = (tn >= 70)*1. + (tn > 70)*1
tumor_nec = tumor_nec.map({0:'70-', 1:'70', 2:'70+'})
tumor_nec.name = 'tumor_necrosis'
survival_and_stats(tumor_nec, surv, figsize=(6,4))
st_plot(tumor_nec)
Counts for plot below: tumor_necrosis 70 70+ 70- TP53 False 36 32 3 True 120 51 7 [2 rows x 3 columns]
surv_models(tumor_nec)
Likelihood ratio of full model compared to background: 3.5e-06 Full model with covariate and age: coef exp(coef) se(coef) z p feature_True 1.160 3.190 0.2823 4.11 0.00004 old 0.254 1.289 0.0882 2.88 0.00400 tumor_necrosis70- 0.615 1.850 0.6012 1.02 0.31000 tumor_necrosis70+ -1.008 0.365 0.2678 -3.76 0.00017 Likelihood ratio test=50.1 on 4 df, p=3.51e-10 n= 249, number of events= 102 (118 observations deleted due to missingness)