UPMC HNSCC Cohort Validation of Primary Findings in UPMC Cohort

Import Data and Packages¶

For full list of data and packages imported see the Imports notebook.

In [1]:

import NotebookImport
from Imports import *

importing IPython notebook from Imports.ipynb
Populating the interactive namespace from numpy and matplotlib
changing to source dirctory
populating namespace with data

Read in UPMC Data¶

In [2]:

meta = pd.read_csv('../Extra_Data/UPMC_cohort/meta.csv', index_col=0)
meta.index = pd.MultiIndex.from_tuples([('-'.join(i.split('-')[:-1]), i.split('-')[-1]) 
                                        for i in meta.index])

In [3]:

clin = pd.read_csv('../Extra_Data/UPMC_cohort/pitt_broad_data.csv', index_col=0)
clin.Tumor_type = clin.Tumor_type.map(str.strip)
surv = pd.concat([clin.os_5yr, clin.os_5yr_mons*30.5], keys=['event','days'], axis=1).stack()
clin2 = pd.read_csv('../Extra_Data/UPMC_cohort/pitt_broad_data_update.csv', index_col=0)
surv2 = pd.concat([clin2.os_5yr, clin2.os_5yr_mons*30.5], keys=['event','days'], axis=1).stack()
surv = surv2.combine_first(surv)

Explore Clinical Features of UPMC Cohort¶

In [4]:

keepers = ti(clin.HPV == 'Negative')

In [5]:

cc = clin.ix[true_index(meta.Final_74_exome_analysis[:,'Tumor']>0)]
cc = cc.ix[:,cc.apply(pd.value_counts).count().isin([2,3,4])]
cc = cc.dropna(1)

In [6]:

cox_screen(cc.T, surv)[7:].head(10)

Out[6]:

	hazard				LR				concordance
	exp(-coef)	exp(coef)	lower .95	upper .95	df	p	stat	q	se	stat
Neoadj_tx	0.01	115.91	10.25	1311.28	1	4.32e-04	12.39	0.00	0.00	0.53
Chemo	0.04	27.59	5.33	142.93	2	1.54e-03	12.95	0.00	0.04	0.62
Adj_chemo	0.04	27.59	5.33	142.93	2	1.54e-03	12.95	0.00	0.04	0.62
Recurrence	0.45	2.21	1.18	4.12	1	1.41e-02	6.02	0.04	0.04	0.57
PNI	0.42	2.36	0.86	6.50	2	1.49e-02	8.41	0.04	0.04	0.62
DistMets	0.35	2.82	1.33	6.01	1	1.44e-02	5.98	0.04	0.03	0.55
EPS	1.45	0.69	0.30	1.58	2	2.14e-02	7.69	0.05	0.04	0.62
Dif	0.90	1.11	0.58	2.13	3	6.69e-02	7.16	0.14	0.04	0.54
Ethnicity	6.71	0.15	0.02	1.16	1	1.54e-01	2.04	0.28	0.01	0.51
Local_recurr	0.55	1.83	0.84	3.98	1	1.53e-01	2.04	0.28	0.03	0.53

In [7]:

f = clin.Primary_site.ix[keepers]
f = f.replace('Hypopharynx','Other').replace('Sinonasal','Other')
get_surv_fit_lr(surv, f)

Out[7]:

	Stats		Median Survival			5y Survival			Log-Rank
	# Patients	# Events	Median	Lower	Upper	Surv	Lower	Upper	chi2	p
									4.02	0.259
Oral cavity	36	22	2.99	1.3	NaN	0.411	0.277	0.61
Larynx	15	6	NaN	1.5	NaN	0.574	0.366	0.901
Other	8	4	3.7	1.8	NaN	0.5	0.25	1
Oropharynx	4	3	0.844	0.535	NaN	NaN	NaN	NaN

In [8]:

f = clin.Nodal_stage.ix[keepers].fillna('.')
f = f.replace('N2A','N2').replace('N2B','N2').replace('N2C','N2')
get_surv_fit_lr(surv, f)

Out[8]:

	Stats		Median Survival			5y Survival			Log-Rank
	# Patients	# Events	Median	Lower	Upper	Surv	Lower	Upper	chi2	p
									22.1	0.000192
N2	28	20	1.15	0.978	NaN	0.298	0.167	0.532
N0	19	6	NaN	3.98	NaN	0.672	0.487	0.928
N1	8	4	3.7	2.11	NaN	0.5	0.25	1
.	5	2	NaN	1.55	NaN	0.6	0.293	1
N3	3	3	0.894	0.535	NaN	NaN	NaN	NaN

In [9]:

survival_and_stats(f.isin(['N2','N3']), surv)

In [10]:

f = clin.clinical_stage.ix[keepers]
f = f.replace('IVa','IV').replace('IVb','IV').replace('.', 'Unstaged').fillna('Unstaged')
get_surv_fit_lr(surv, f)

Out[10]:

	Stats		Median Survival			5y Survival			Log-Rank
	# Patients	# Events	Median	Lower	Upper	Surv	Lower	Upper	chi2	p
									4.3	0.231
IV	43	27	1.6	1.1	NaN	0.38	0.258	0.56
III	12	5	NaN	2.11	NaN	0.583	0.362	0.941
Unstaged	5	2	NaN	1.55	NaN	0.6	0.293	1
II	3	1	NaN	3.98	NaN	0.667	0.3	1

In [11]:

f = clin.Alcohol_amt.fillna(0).ix[keepers].replace('.', 0).astype(float) > 13
f2 = clin.Alcohol_amt.fillna(0).ix[keepers].replace('.', 0).astype(float) < 7
get_surv_fit_lr(surv, 1.*f - 1.*f2)

Out[11]:

	Stats		Median Survival			5y Survival			Log-Rank
	# Patients	# Events	Median	Lower	Upper	Surv	Lower	Upper	chi2	p
									3.25	0.197
-1	36	19	3.98	1.8	NaN	0.497	0.357	0.692
0	18	9	3.65	1.29	NaN	0.463	0.276	0.778
1	9	7	1.1	0.953	NaN	0.222	0.0655	0.754

In [12]:

get_surv_fit_lr(surv, clin.Tob_history.ix[keepers])

Out[12]:

	Stats		Median Survival			5y Survival			Log-Rank
	# Patients	# Events	Median	Lower	Upper	Surv	Lower	Upper	chi2	p
									0.0913	0.763
yes	55	31	3.65	1.55	NaN	0.44	0.324	0.596
no	8	4	1.43	1.3	NaN	0.5	0.25	1

In [13]:

get_surv_fit_lr(surv, clin.PNI.ix[keepers])

Out[13]:

	Stats		Median Survival			5y Survival			Log-Rank
	# Patients	# Events	Median	Lower	Upper	Surv	Lower	Upper	chi2	p
									6.78	0.0338
yes	31	22	1.55	1	NaN	0.323	0.194	0.537
no	26	9	NaN	3.98	NaN	0.636	0.471	0.858
unknown	6	4	3.25	1.1	NaN	NaN	NaN	NaN

In [14]:

survival_and_stats(clin['Chemo '], surv, figsize=(5,4), title='Chemo')

In [15]:

get_surv_fit_lr(surv, clin.Date_Dx.map(lambda s: s[-2:]).ix[keepers].astype(int)>10)

Out[15]:

	Stats		Median Survival			5y Survival			Log-Rank
	# Patients	# Events	Median	Lower	Upper	Surv	Lower	Upper	chi2	p
									0.0558	0.813
0	62	34	3.17	1.5	NaN	0.457	0.347	0.602
1	1	1	3.98	NaN	NaN	NaN	NaN	NaN

In [16]:

survival_and_stats(clin.EPS, surv, figsize=(5,4), title='EPS')

In [17]:

survival_and_stats(clin.HPV, surv, figsize=(5,4), title='HPV')

In [18]:

survival_and_stats(to_quants(clin.Age_Dx, q=.33, labels=True), surv, figsize=(5,4), title='Age')

Process Mutation Data¶

In [19]:

maf = pd.read_csv('../Extra_Data/UPMC_cohort/maf.csv')
maf = maf.dropna(how='all', axis=[0,1])
maf['Tumor_Sample_Barcode'] = maf['Tumor_Sample_Barcode'].str.replace('-Tumor','').copy()
maf = maf.set_index(['Hugo_Symbol','Tumor_Sample_Barcode'])
maf = maf[maf.Validation_Status != 'Wildtype']
non_silent = maf[maf.Variant_Classification != 'Synonymous']
non_silent['counter'] = 1
hit_matrix = non_silent.counter.groupby(level=[0,1]).sum().unstack()
hit_matrix.columns = map(lambda s: s.replace('-Tumor',''), hit_matrix.columns)
hit_matrix = hit_matrix.fillna(0)

The TCGA has a much higher mutation rate as compared to this data. This could be due to differences in sample prep, sequencing depth, or bioinformatic processing. The two cohorts are fairly well matched clinically, so we expect that this difference can be contributed to technical factors as compared to biological ones.

In [20]:

fig, axs = subplots(1,3, figsize=(8,3), sharey=True)
c = pd.concat([(mut.df > 0).sum(), (hit_matrix > 0).sum(0)], keys=['TCGA','UPMC'], axis=1).stack()
violin_plot_series(c.clip_upper(500), ax=axs[0])

hh = hit_matrix.ix[:, true_index(clin.HPV == 'Negative')].sum()
yy = mut.df.ix[:, true_index(clinical.hpv.dropna() == False)].sum()
c = pd.concat([yy,hh], keys=['TCGA','UPMC'], axis=1).stack()
violin_plot_series(c.clip_upper(500), ax=axs[1])

hh = hit_matrix.ix[:, true_index(clin.HPV == 'Positive')].sum()
yy = mut.df.ix[:, true_index(clinical.hpv.dropna())].sum()
c = pd.concat([yy,hh], keys=['TCGA','UPMC'], axis=1).stack()
violin_plot_series(c.clip_upper(500), ax=axs[2])

for ax in axs:
    prettify_ax(ax)
    ax.set_ylabel('')
    ax.set_xlabel('Study')
axs[0].set_ylabel('# of Mutations')
fig.tight_layout()
fig.savefig(FIGDIR +  'mutation_rate_comparison.pdf')

In [21]:

hh = hit_matrix.ix[:, true_index(clin.HPV == 'Negative')].sum()
yy = mut.df.ix[:, true_index(clinical.hpv.dropna() == False)].sum()
c = pd.concat([yy,hh], keys=['TCGA','UPMC'], axis=1).stack()

In [22]:

stats.mannwhitneyu(hh.dropna(), yy.dropna())

Out[22]:

(5909.0, 0.0002041546634473598)

The median mutation rate of TCGA HPV- samples is 104 verses 74 in the UPMC cohort.

In [23]:

yy = mut.df.ix[:, true_index(clinical.hpv.dropna() == False)].sum()
yy.median()

Out[23]:

104.0

In [24]:

hh = hit_matrix.ix[:, true_index(clin.HPV == 'Negative')].sum()
hh.median()

Out[24]:

73.0

TP53 Mutation¶

In [25]:

p53_mut = hit_matrix.ix['TP53'].ix[true_index(clin.HPV=='Negative')] > 0
survival_and_stats(p53_mut, surv)

Broken down by functional type.

In [26]:

import re as re
get_nums = lambda s: re.findall(r'\d+', s)

def is_disruptive(v):
    c = v.Variant_Classification
    if c != 'Missense':
        if 'Ins' in c or 'Del' in c:
            return 'InDel'
        else:
            return v.Variant_Classification.split('_')[0]
    else:
        s = v.Protein_Change
        aa = int(get_nums(s)[0])
        if int(aa) in range(163,196):
            return 'L2'
        if int(aa) in range(236, 252):
            return 'L3'
    return 'other'

In [27]:

p53 = maf.ix['TP53'].reset_index()
dd = p53.apply(is_disruptive, 1)
dd = dd.replace('Synonymous',nan).dropna()
others = hit_matrix.columns.diff(p53.Tumor_Sample_Barcode.ix[dd.index])
dd.index = p53.Tumor_Sample_Barcode.ix[dd.index]
dd = pd.concat([pd.Series('WT', others), dd])
dd = dd.ix[keepers]

In [28]:

s2 = surv.unstack().ix[dd.index]
s2.index = range(len(dd))
s2 = s2.stack()
dd.index = range(len(dd))

In [29]:

survival_and_stats(dd, s2, colors=colors[:6] + ['grey'] + colors[6:], figsize=(7,6))

By Validation Status. Validated mutations do seem to have worse prognosis, leading us to think there could be a number of false positives in the mix.

In [30]:

v = maf.ix['TP53'].Validation_Status == 'Valid'
v = v.groupby(level=0).sum() > 0
v.name = 'valid'
v = v.ix[hit_matrix.columns]
v = v.fillna('WT')

In [31]:

v.value_counts()

Out[31]:

False    37
WT       28
True      9
dtype: int64

In [32]:

draw_survival_curve(v.ix[true_index(clin.HPV=='Negative')], surv)

MUC5B mutation status. Can't really say much here.

In [33]:

survival_and_stats(hit_matrix.ix['MUC5B'].ix[true_index(clin.HPV=='Negative')]>0, surv)

Copy Number Data¶

In [34]:

cn_tumor = meta.xs('Tumor', level=1)['SNP array ID'].dropna()
lu = pd.Series({v:i for i,v in cn_tumor.iteritems()})

I am loading the GISTIC2 processed data for this cohort. The data was processed together with the TCGA samples.

In [35]:

path = '../Extra_Data/UPMC_cohort/'
gistic = pd.read_table(path + 'all_thresholded.by_genes.txt', index_col=[0,1, 2])
gistic.index = gistic.index.droplevel(1)
cn_upmc = pd.DataFrame({i: gistic[t] for i,t in cn_tumor.iteritems() if t in gistic})

In [36]:

pct_del = (cn_upmc < 0).sum() / (1.*len(cn_upmc))
pct_amp = (cn_upmc > 0).sum() / (1.*len(cn_upmc))
pct_altered = pct_amp + pct_del
pct_altered.name = 'CIN_pct_altered'

Chromosomal instability in the UPMC cohort.

In [37]:

survival_and_stats(to_quants(pct_altered, q=.33, labels=True), surv)

Taking a look at 3p deletion.

In [38]:

del_3p = cn_upmc.xs('3p14.2',0,1).median().ix[true_index(clin.HPV=='Negative')].dropna() < 0

In [39]:

b = combine(p53_mut, del_3p).index

In [40]:

keepers = meta['SNP array ID'].notnull() & (meta['Final_74_exome_analysis'] == 1)
keepers = keepers.groupby(level=0).sum() 
keepers = (keepers > 0) & (clin.HPV == 'Negative')
keepers = keepers.ix[surv.index.get_level_values(0)].dropna()
keepers = keepers.groupby(level=0).last()
keepers.value_counts()

Out[40]:

True     49
False    25
dtype: int64

In [41]:

survival_and_stats(del_3p, surv)

Association of TP53 mutation with 3p deletion

len(combine(p53_mut, del_3p))

In [42]:

fisher_exact_test(p53_mut, del_3p, alternative='greater')

Out[42]:

odds_ratio    2.50
p             0.17
dtype: float64

Survival association

In [43]:

combo = combine(p53_mut, del_3p)
survival_and_stats(combo, surv, figsize=(5,4), title='PITT')

In [44]:

pd.crosstab(clin.Alcohol, clin.Alcohol_amt.fillna(0))

Out[44]:

Alcohol_amt	0	.	1	10	12	15	16	2	20	24	27	3	5	6	7	8
Alcohol
.	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
no	15	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
yes	0	1	1	8	4	3	1	3	2	3	1	4	4	11	3	9

In [45]:

alch = clin.Alcohol_amt.fillna(0).replace('.', nan).astype(float)

In [46]:

alch.hist()

Out[46]:

<matplotlib.axes.AxesSubplot at 0x970b550>

In [47]:

get_surv_fit_lr(surv, (alch.ix[combo.index].dropna() > 10).fillna('M'))

Out[47]:

	Stats		Median Survival			5y Survival			Log-Rank
	# Patients	# Events	Median	Lower	Upper	Surv	Lower	Upper	chi2	p
									6.83	0.00898
0	39	19	5.01	1.8	NaN	0.526	0.389	0.712
1	9	8	1.1	0.953	NaN	0.111	0.0175	0.705

In [48]:

get_surv_fit_lr(surv, clin.Smoking_history.ix[combo.index])

Out[48]:

	Stats		Median Survival			5y Survival			Log-Rank
	# Patients	# Events	Median	Lower	Upper	Surv	Lower	Upper	chi2	p
									1.14	0.285
yes	41	24	1.85	1.15	NaN	0.399	0.273	0.585
no	7	3	5.01	1.43	NaN	0.714	0.447	1

In [49]:

p2 = p53_mut
p2.name = 'TP53'
p2 = p2.map({True:'Mut.',False:'WT'})

In [50]:

draw_survival_curves(del_3p.map({0:'3p +/+', 1:'3p +/-'}), surv, p2, legend=True)

This UPMC cohort confirmed that patients with the TP53 / 3p aggregate event have worse prognosis than patients with TP53 mutation alone.

In [51]:

get_cox_ph(surv, del_3p.ix[true_index(p53_mut)], print_desc=True, interactions=False);

        coef exp(coef) se(coef)    z     p
feature 1.92      6.84     1.03 1.87 0.061

Likelihood ratio test=6.37  on 1 df, p=0.0116  n= 35, number of events= 20 
   (11 observations deleted due to missingness)

In [52]:

exp(1.92), exp(1.92) -  exp(1.92 - 1.03)

Out[52]:

(6.8209584692907494, 4.3858288180008751)

In [53]:

pd.crosstab(p53_mut, del_3p.ix[p53_mut.index].fillna('M'))

Out[53]:

d	False	True	M
TP53
False	5	8	4
True	7	28	11

In [54]:

two_hit_both = combine(p53_mut>0, del_3p) == 'both'
no_cna = true_index(del_3p.ix[true_index(p53_mut==False)].isnull())
no_cna = pd.Series(False, index=no_cna)
two_hit = two_hit_both.append(no_cna).ix[keepers].dropna()

In [55]:

survival_and_stats(two_hit.ix[keepers].dropna(), surv)

Coin Tests¶

This was a suggestion from a review. The R coin package allows for exact permutation tests for log-rank statistics.

In [56]:

%load_ext rmagic
coin = robjects.packages.importr('coin')

Test 1: p53/3p Worse Survival than wt¶

In [57]:

th2 = two_hit[true_index(clin.Age_Dx < 75)].dropna()
time = robjects.r.c(*list(surv.unstack().days.ix[th2.index]))
event = robjects.r.c(*list(surv.unstack().event.ix[th2.index]))
g = robjects.r.c(*list(th2*1.))

In [58]:

%%R -i time,event,g

exdata <- data.frame(time = time,
                     event = event,
                     group = factor(g))
surv_test(Surv(time, event) ~ group, data=exdata, 
          distribution=exact(), alternative='less')

	Exact Logrank Test

data:  Surv(time, event) by group (0, 1)
Z = -1.5691, p-value = 0.05894
alternative hypothesis: less

Test 2: p53/3p Worse Survival than p53¶

In [59]:

f = del_3p.ix[true_index(p53_mut)].dropna()*1.
f = f.ix[clin.Age_Dx < 77]
g = robjects.r.c(*list(f))
time = robjects.r.c(*list(surv.unstack().days.ix[f.index]))
event = robjects.r.c(*list(surv.unstack().event.ix[f.index]))

In [60]:

cox(f, surv)

Out[60]:

hazard       exp(coef)      6.84
             exp(-coef)     0.15
             lower .95      0.91
             upper .95     51.20
LR           stat           6.37
             df             1.00
             p              0.01
concordance  stat           0.61
             se             0.05
dtype: float64

In [61]:

%%R -i time,event,g

exdata <- data.frame(time = time,
                     event = event,
                     group = factor(g))
surv_test(Surv(time, event) ~ group, data=exdata, 
          distribution=exact(), alternative='less')

	Exact Logrank Test

data:  Surv(time, event) by group (0, 1)
Z = -2.4025, p-value = 0.009194
alternative hypothesis: less

Test p53/3p Worst Survival than 3p¶

In [62]:

f = p53_mut.ix[true_index(del_3p)].dropna()*1.
g = robjects.r.c(*list(f))
time = robjects.r.c(*list(surv.unstack().days.ix[f.index]))
event = robjects.r.c(*list(surv.unstack().event.ix[f.index]))

In [63]:

%%R -i time,event,g

exdata <- data.frame(time = time,
                     event = event,
                     group = factor(g))
surv_test(Surv(time, event) ~ group, data=exdata, 
          distribution=exact(), alternative='less')

	Exact Logrank Test

data:  Surv(time, event) by group (0, 1)
Z = 0.0197, p-value = 0.5014
alternative hypothesis: less

MIR548K Copy Number¶

We do not have expression data for this cohort so we use copy number. The trend seems to hold here but we do not report this in the manuscript.

In [64]:

cn_upmc = cn_upmc.ix[:, two_hit.index]

In [65]:

survival_and_stats(cn_upmc.ix['MIR548K'].ix[0].dropna()>0, surv)

In [66]:

st = (two_hit + 1).add(2*(cn_upmc.ix['MIR548K'].ix[0] > 0)).replace(3,0).replace(4,3)
st = st.dropna()

In [67]:

mir = cn_upmc.ix['MIR548K'].ix[0].ix[two_hit.index]

In [68]:

eq = cn_upmc.ix['MIR548K'].ix[0].ix[two_hit.index]
#eq = cn_upmc.xs('11q13.3',0,1).median()

In [69]:

st = (two_hit + 1).add(2*(eq > 1)).replace(3,1).replace(4,3)
st = st.dropna()

In [70]:

survival_and_stats(st, surv)

In [71]:

get_cox_ph(surv, st, print_desc=True, interactions=False);

         coef exp(coef) se(coef)    z     p
feature 0.433      1.54    0.243 1.78 0.074

Likelihood ratio test=3.14  on 1 df, p=0.0763  n= 48, number of events= 27

HPV+ Combined Patients¶

In [72]:

cn_upmc = pd.DataFrame({i: gistic[t] for i,t in cn_tumor.iteritems() if t in gistic})

In [73]:

cn_upmc = pd.DataFrame({i: gistic[t] for i,t in cn_tumor.iteritems() if t in gistic})
del_3p_upmc = cn_upmc.xs('3p14.2',0,1).median().ix[true_index(clin.HPV=='Positive')].dropna()

In [74]:

surv_combined = pd.concat([surv, clinical.survival.survival_5y])

In [75]:

cn_2 = cn_upmc.groupby(level=1).median().ix[:, true_index(clin.HPV=='Positive')].dropna(1, how='all')
cn_3 = cn.features.copy().ix[:, ti(clinical.hpv)]
cn_3.index = cn_3.index.get_level_values(1)

In [76]:

cn_4 = cn_2.join(cn_3)
cn_4 = cn_4.dropna(1, how='all').dropna(how='any')
cn_d = (cn_4 < 0)
cn_d = cn_d[cn_d.sum(1) > 0]

cn_a = (cn_4 > 0)
cn_a = cn_a[cn_a.sum(1) > 0]

cn_all = pd.concat([cn_a, cn_d], keys=['Amplification','Deletion'])

In [77]:

cox_screen(cn_all, surv_combined).sort([('LR','p')]).head()

Out[77]:

		hazard				LR				concordance
		exp(-coef)	exp(coef)	lower .95	upper .95	df	p	stat	q	se	stat
Deletion	3p14.2	0.18	5.54	1.51	20.33	1	0.00	8.21	0.23	0.08	0.75
Amplification	8q24.21	6.06	0.17	0.04	0.76	1	0.01	7.44	0.23	0.08	0.68
	13q22.1	0.16	6.25	1.82	21.46	1	0.01	7.38	0.23	0.06	0.69
	20q11.22	7.82	0.13	0.02	0.99	1	0.01	6.69	0.23	0.08	0.64
	8p11.23	7.71	0.13	0.02	1.02	1	0.01	6.44	0.23	0.08	0.60

In [78]:

del_3p_all = cn_all.ix['Deletion'].ix['3p14.2']

In [79]:

p53_upmc = hit_matrix.ix['TP53', true_index(clin.HPV=='Positive')].dropna()
p53_tcga = mut.features.ix['TP53', true_index(clinical.hpv==True)]

We noted that TP53 mutation is very rare in HPV+ patients, consistent with the idea that mutation confers little selective advantage once TP53 is inactivated by HPV.

In [80]:

p53_all = (mut.features.ix['TP53'].append(hit_matrix.ix['TP53']>0)).map({True:'mut',False:'wt'})
hpv_all = clinical.hpv.append(clin.HPV=='Positive')
pd.crosstab(hpv_all, p53_all)

Out[80]:

TP53	mut	wt
HPV
False	257	69
True	2	52

In [81]:

1 / fisher_exact_test(hpv_all, p53_all)['odds_ratio']

Out[81]:

0.010326249625860521

In [82]:

fisher_exact_test(hit_matrix.ix['TP53']>0, clin.HPV=='Positive')

Out[82]:

odds_ratio    0.00e+00
p             5.14e-06
dtype: float64

In [83]:

print len(combine(hit_matrix.ix['TP53']>0, clin.HPV=='Positive'))
pd.crosstab(hit_matrix.ix['TP53']>0, clin.HPV)

Out[83]:

HPV	Negative	Positive
TP53
False	17	11
True	46	0

In [84]:

fisher_exact_test(hpv_all, p53_all)

Out[84]:

odds_ratio    9.68e+01
p             1.17e-27
dtype: float64

Moreover, the 25 HPV+ patients harboring a 3p deletion had significantly worse prognosis than the 34 HPV+ patients without the 3p event.

In [85]:

survival_and_stats(del_3p_all, surv_combined)

In [86]:

log_rank(del_3p_all, surv_combined).ix['p']

Out[86]:

0.0038336554312233214

In [87]:

get_cox_ph(surv_combined, del_3p_all, print_desc=True);

        coef exp(coef) se(coef)    z      p
feature 1.71      5.54    0.663 2.58 0.0098

Likelihood ratio test=8.21  on 1 df, p=0.00417  n= 59, number of events= 13

In [88]:

exp(1.71), exp(1.71) -  exp(1.71 - .663)

Out[88]:

(5.5289614776240041, 2.6798704663347901)

UPMC Validation Cohort Figures¶

Figure 3a¶

In [89]:

p53_mut.name = 'TP53'
del_3p.name = '3p'
combo = combine(p53_mut, del_3p)

In [90]:

fig, ax = subplots(figsize=(3,2.5))
c = {'TP53': colors_th[1], 'both': colors_th[0]}
draw_survival_curve(combo[combo.isin(['both','TP53'])], surv, colors=c, ax=ax)
ax.legend(loc='lower left', frameon=False)
prettify_ax(ax)
fig.tight_layout()
fig.savefig(FIGDIR + 'fig3a.pdf')

In [91]:

c = combo[combo.isin(['both','TP53'])]
c = c.replace({'both': 'TP53 and 3p'})
df = pd.concat([c, surv[:,'days'], surv[:,'event']],
                keys=['Molecular Status', 'Days to Death/Censoring', 'Death Indicator'],
                axis=1).dropna()
df.to_csv(FIGDIR + 'fig3a.csv')

In [92]:

fig, ax = subplots(figsize=(3,2.5))
surv_fit = get_surv_fit(surv, combo)
tt = surv_fit['5y Survival']
b = (tt['Surv']).plot(kind='bar', ax=ax, color=[colors_th[2], colors_th[0], 
                                                colors_th[3], colors_th[1]],
         yerr=[tt.Surv-tt.Lower, tt.Upper-tt.Surv], ecolor='black')
ax.set_ylabel('5Y Survival')
ax.set_yticks([0, .5, 1.0])
bar_labels = ['{}\n\n{}'.format(j,int(v)) for j,v in 
              surv_fit['Stats']['# Patients'].iteritems()]
ax.set_xticklabels(bar_labels, rotation=0);
prettify_ax(ax)
fig.tight_layout()

Figure 3b¶

In [103]:

fig, ax = subplots(figsize=(3,2.5))
draw_survival_curve(del_3p_all.astype(int), surv_combined, ax=ax, 
                    colors=[colors_th[3], colors_th[2]])
ax.legend().set_visible(False)
prettify_ax(ax)
fig.tight_layout()
fig.savefig(FIGDIR + 'fig3d.pdf')

In [101]:

df = pd.concat([del_3p_all, surv_combined[:,'days'], surv_combined[:,'event']],
                keys=['3p Deletion', 'Days to Death/Censoring', 'Death Indicator'],
                axis=1).dropna()
df.to_csv(FIGDIR + 'fig3ba.csv')