import NotebookImport
from Imports import *
importing IPython notebook from Imports.ipynb Populating the interactive namespace from numpy and matplotlib changing to source dirctory populating namespace with data
cancers = {c: run.load_cancer(c) for c in run.cancers}
from Data.ProcessClinicalDataPortal import update_clinical_object
clinical = {c: cancer.load_clinical() for c, cancer in cancers.iteritems()}
for cancer,clin in clinical.iteritems():
try:
path = params['OUT_PATH'] + '/Followup/' + cancer + '/'
clinical[cancer] = update_clinical_object(clin, path)
except:
print cancer
do_nothing = True
Data/ProcessClinicalDataPortal.py:37: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_index,col_indexer] = value instead f['vitalstatus'] = (f['daystodeath'].isnull() == True)
surv = pd.DataFrame({c: v.survival.survival for c,v in clinical.iteritems()})
surv = surv.stack()
codes = pd.Series({s[0]: s[1] for s in surv[:,'days'].index})
codes = codes.groupby(level=0).first()
codes.name = 'codes'
surv.index = surv.index.droplevel(2)
surv = surv.groupby(level=[0,1]).first()
surv = surv.ix[ti(surv[:,'days'] >= 7)]
surv_5y = pd.DataFrame({c: v.survival.survival_5y for c,v in clinical.iteritems()})
surv_5y = surv_5y.stack()
surv_5y.index = surv_5y.index.droplevel(2)
surv_5y = surv_5y.groupby(level=[0,1]).first()
#bad = ((surv_5y.unstack()['days'] < 8) * (surv_5y.unstack()['event'] == 1)) == False
#surv_5y = surv_5y.unstack().ix[bad].stack()
surv_5y = surv_5y.ix[ti(surv[:,'days'] >= 7)]
for c in clinical.values():
c = c.artificially_censor(10)
surv_10y = pd.DataFrame({c: v.survival.survival_10y for c,v in clinical.iteritems()})
surv_10y = surv_10y.stack()
surv_10y.index = surv_10y.index.droplevel(2)
surv_10y = surv_10y.groupby(level=[0,1]).first()
surv_10y = surv_10y.ix[ti(surv[:,'days'] >= 7)]
for c in clinical.values():
c = c.artificially_censor(3)
surv_3y = pd.DataFrame({c: v.survival.survival_3y for c,v in clinical.iteritems()})
surv_3y = surv_3y.stack()
surv_3y.index = surv_3y.index.droplevel(2)
surv_3y = surv_3y.groupby(level=[0,1]).first()
surv_3y = surv_3y.ix[ti(surv[:,'days'] >= 7)]
all_mut = pd.read_csv(params['OUT_PATH'] + '/MAFs/mega_maf.csv', index_col=0)
all_mut = all_mut[all_mut.Tumor_Sample_Barcode.apply(lambda s: s[13:16]) == '01A']
all_mut.Variant_Classification.value_counts()
#all_mut = pd.read_csv(params['OUT_PATH'] + '/MAFs/mega_maf.csv', index_col=0)
all_mut = all_mut[all_mut.Tumor_Sample_Barcode.apply(lambda s: s[13:16]) == '01A']
non_coding = ['Silent','RNA','IGR',"5'Flank", "3'UTR",'Intron',"5'UTR"]
all_mut = all_mut[all_mut.Variant_Classification.isin(non_coding)==False]
all_mut.Tumor_Sample_Barcode = all_mut.Tumor_Sample_Barcode.map(lambda s: s[:12])
all_mut = all_mut.groupby(['Tumor_Sample_Barcode', 'Hugo_Symbol']).size()
all_mut = all_mut.unstack().T.fillna(0)
/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas-0.13.0_247_g82bcbb8-py2.7-linux-x86_64.egg/pandas/core/generic.py:1642: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_index,col_indexer] = value instead self[name] = value
mut = {c: cancer.load_data('Mutation') for c, cancer in cancers.iteritems()
if run.sample_matrix.MAF.ix[c] > 0}
hit_mat = pd.concat([m.df.unstack() for c,m in mut.iteritems()], axis=1).fillna(0)
hit_mat = hit_mat.groupby(axis=1, level=0).first()
hit_mat = pd.concat([hit_mat, all_mut], axis=1)
hit_mat = hit_mat.groupby(axis=1, level=0).sum() > 0
cn_all = {}
for c in codes.unique():
try:
cn_all[c] = FH.get_gistic_gene_matrix(run.data_path, c)
except:
print c
cn_all = pd.concat(cn_all.values(), axis=1)
cn_all = cn_all.groupby(axis=1, level=0).first()
non_embargo = ['AML','BRCA','KIRC','COAD','READ','COADREAD','SKCM','GBM',
'HNSC','LUAD','LUSC','OV','STAD','THCA','UCEC']
#codes = codes[codes.isin(non_embargo)]
c2 = codes[codes.isin(ti(codes.value_counts() > 30))]
survival_and_stats(c2, surv_10y, upper_lim=10, figsize=(10,12))
fig, axs = subplots(2,1, figsize=(8,8))
ax = axs[0]
t = get_surv_fit(surv, codes)
t = t.sort([('5y Survival','Surv')])
t['Stats'].plot(kind='bar', ax=ax)
prettify_ax(ax)
ax2 = axs[1]
t = get_surv_fit(surv, codes)
tt = t['5y Survival'].sort('Surv')
b = (tt['Surv']).plot(kind='bar', ax=ax2, color='grey',
yerr=[tt.Surv-tt.Lower, tt.Upper-tt.Surv], ecolor='black')
ax2.set_ylabel('5Y Survival')
ax2.set_yticks([0, .5, 1.])
prettify_ax(ax2)
fig.tight_layout()
fys = get_surv_fit(surv, codes)['5y Survival']['Surv'].order()
nn = codes.isin(true_index((fys > .1) * (fys < .83)))
fys
GBM 0.06 LAML 0.23 OV 0.31 STAD 0.35 LUAD 0.35 LIHC 0.36 BLCA 0.39 HNSC 0.44 LUSC 0.45 READ 0.47 ACC 0.48 SARC 0.58 SKCM 0.61 KIRC 0.62 COAD 0.63 LGG 0.65 CESC 0.65 KIRP 0.68 UCEC 0.79 BRCA 0.81 DLBC 0.84 KICH 0.87 THCA 0.89 PRAD 0.97 ESCA NaN PAAD NaN Name: Surv, dtype: float64
age = pd.concat([c.clinical.daystobirth for c in clinical.values()])
age = -1*np.floor(age / 365.25)
a2 = a2 = pd.concat([c.clinical.age for c in clinical.values()])
age = age.combine_first(a2)
age = age.groupby(level=0).min()
age = age.fillna(age.median())
age.name = 'age'
survival_and_stats((age / 15).round(), surv_5y)
survival_and_stats(age.dropna() >= 85, surv_5y)
get_cox_ph(surv_5y, age >= 85, print_desc=True);
coef exp(coef) se(coef) z p feature 0.723 2.06 0.123 5.89 3.8e-09 Likelihood ratio test=28.1 on 1 df, p=1.18e-07 n= 7294, number of events= 1959
old = age >= 75
old.name = 'age_over_75'
survival_and_stats(old.ix[ti(age < 85)], surv_5y)
exp(.773), exp(.773)-(exp(.773-.139))
(2.1662552812206535, 0.28111921870672885)
year = pd.concat([c.clinical.yearofinitialpathologicdiagnosis for c in clinical.values()])
year = year.groupby(level=0).first().replace('[Discrepancy]', nan).astype(float).dropna()
survival_and_stats(year <= 2000, surv_5y)
get_cox_ph_ms(surv_5y, year < 2000, [codes])
/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas-0.13.0_247_g82bcbb8-py2.7-linux-x86_64.egg/pandas/core/indexing.py:344: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_index,col_indexer] = value instead self.obj[item] = s
LR 5.92e-09 feature_p NaN fmla Surv(days, event) ~ codes + codes:feature\n hazzard NaN dtype: object
r_status = pd.concat([c.clinical.residualtumor for c in clinical.values() if 'residualtumor'
in c.clinical])
r_status = r_status.replace(['[Not Evaluated]', '[Unknown]', 'RX'])
r_status = r_status.str.lower()
r_status = r_status.groupby(level=0).first()
survival_and_stats(r_status.dropna(), surv_5y)
get_cox_ph(surv_5y, r_status=='r2', print_desc=True);
coef exp(coef) se(coef) z p feature 1.05 2.86 0.18 5.82 5.8e-09 Likelihood ratio test=25.1 on 1 df, p=5.57e-07 n= 3555, number of events= 809
exp(1.05), exp(1.05)-(exp(1.05-.18))
(2.8576511180631639, 0.47074026453888695)
margin_status = pd.concat([c.clinical.marginstatus for c in clinical.values() if 'marginstatus' in c.clinical])
margin_status = margin_status.groupby(level=0).first()
survival_and_stats(margin_status.dropna(), surv_5y)
s = {}
for can,c in clinical.iteritems():
if 'pathologicstage' in c.stage:
s[can] = c.stage.pathologicstage
elif 'clinicalstage' in c.stage:
s[can] = c.stage.clinicalstage
stage = pd.concat(s.values())
stage = stage.groupby(level=0).first()
stage = stage.dropna().map(lambda s: s.replace('A','').replace('B','').replace('C',''))
stage = stage.dropna().map(lambda s: s.replace('1','').replace('2',''))
stage = stage.replace('stage iic','stage ii')
stage = stage.ix[age.index].fillna('missing')
stage = stage.str.lower()
stage = stage.replace(['[unknown]','stage x','stage tis','i or ii nos', 'stage 0'], nan).dropna()
stage.name = 'stage'
survival_and_stats(stage.ix[true_index(nn)].dropna(), surv_5y)
get_cox_ph(surv_5y, stage == 'stage iv', print_desc=True);
coef exp(coef) se(coef) z p feature 0.712 2.04 0.0625 11.4 0 Likelihood ratio test=110 on 1 df, p=0 n= 7263, number of events= 1952
exp(.712), exp(.712)-(exp(.712-.0625))
(2.0380633118599061, 0.1234800138578156)
mets = pd.concat([c.stage.pathologicm for can,c in clinical.iteritems()
if 'pathologicm' in c.stage])
mets = mets.groupby(level=0).first()
mets = pd.concat([c.stage.pathologicm for can,c in clinical.iteritems() if 'pathologicm' in c.stage])
mets = mets.groupby(level=0).first()
mets = mets.dropna().map(lambda s: s.replace('a','').replace('b','').replace('c',''))
mets = mets.replace(['[Unknown]','MX'], nan).dropna()
mets = mets=='M1'
mets.name = 'metastasis'
survival_and_stats(mets, surv_5y)
survival_and_stats(combine(mets, stage=='stage iv'), surv_5y)
new_p53_calls = pd.read_csv('../Extra_Data/p53_calls_pancancer.csv', header=None, index_col=0,
squeeze=True)
p53_new = (hit_mat.ix['TP53'].combine_first(new_p53_calls)) > 0
survival_and_stats(combine(new_p53_calls, hit_mat.ix['TP53']), surv_5y)
filters = pd.concat([age > 85, mets, r_status=='r2', stage=='stage iv', nn==False,
codes.isin(['ACC','ESCA'])], 1)
keepers = true_index((filters > 0).sum(1) == 0)
keepers = keepers.intersection(surv_5y.unstack().index)
keepers = keepers.intersection(p53_new.index).intersection(cn_all.columns)
keepers = keepers.intersection(codes.index)
stage = stage.ix[keepers].fillna('missing')
len(keepers)
4583
survival_and_stats(codes.ix[keepers].order(), surv_10y, upper_lim=10, figsize=(10,10))
plt.savefig('/cellar/users/agross/figures/pancan_supp.pdf', transparent=True)
p53_all = p53_new > 0
p53_mut = p53_all.ix[keepers]
p53_all.name = 'TP53'
p53_mut.name = 'TP53'
survival_and_stats(p53_mut, surv_5y)
get_cox_ph_ms(surv_5y, p53_mut, [codes, age, stage], interactions=None)
LR 0.112 feature_p 0.112 fmla Surv(days, event) ~ feature + codes + age + st... hazzard 1.14 dtype: object
cn_all2 = cn_all.ix[[i for i in cn_all.index if i[2] in rna.df.index]]
pct_del = (cn_all2 < 0).sum() / (1.*len(cn_all2))
pct_amp = (cn_all2 > 0).sum() / (1.*len(cn_all2))
pct_altered = pct_amp + pct_del
pct_altered.name = 'CIN_pct_altered'
pct_del.name = 'pct_del'
survival_and_stats(to_quants(pct_altered, q=.33), surv)
del_3p_all = cn_all.ix['3p14.2'].median().round()
#del_3p_all = (cn_all.ix[[i for i in cn_all.index if '3p14.2' in i[0]]] < 0).mean()
del_3p_all = del_3p_all < 0
del_3p_all = del_3p_all.map({True:-1, False:0})
del_3p = del_3p_all.ix[keepers].dropna()
del_3p.name = 'del_3p'
del_3p_all.name = 'del_3p'
survival_and_stats(del_3p, surv_5y)
get_cox_ph_ms(surv_5y, del_3p < 0, [codes, age, stage, pct_altered], interactions=None)
LR 0.0575 feature_p 0.0568 fmla Surv(days, event) ~ feature + codes + age + st... hazzard 1.18 dtype: object
old.name = 'age_over_75'
fig, axs = subplots(3, 1, figsize=(3.5,4.5), sharey=True, sharex=True)
fmla = 'Surv(days, event) ~ del_3p + pct_del + age + codes + age_over_75'
fmla2 = 'Surv(days, event) ~ pct_del + age + codes + age_over_75'
k2 = keepers.diff(codes[codes.isin(['HNSC','LUSC'])].index)
pts = [k2, ti(p53_mut>0).intersection(k2),
ti(p53_mut==0).intersection(k2)]
color_list = ['grey', '#a1d99b','#9ecae1']
for i, pt in enumerate(pts):
ax = axs[i]
m1 = get_cox_ph(surv_5y, covariates=[del_3p.ix[pt].dropna() < 0, pct_del, stage, age, old, codes],
formula=fmla, print_desc=False, interactions=False);
m2 = get_cox_ph(surv_5y, covariates=[del_3p.ix[pt].dropna() < 0, pct_del, stage, age, old, codes],
formula=fmla2, print_desc=False, interactions=False);
print LR_test(m1, m2)
ci = convert_robj(robjects.r.summary(m1)[7])
haz = ci['exp(coef)']
for j,h in enumerate(haz):
ax.scatter(h, j, marker='s', s=100, color=color_list[i],
edgecolors=['black'], zorder=10)
ax.plot(*zip(*((ci.iloc[j]['lower .95'],j), (ci.iloc[j]['upper .95'],j))),
lw=3, ls='-', marker='o', dash_joinstyle='bevel', color=color_list[i])
ax.axvline(1, ls='--', color='black')
ax.set_xscale('log')
ax.set_xbound(.8,1.5)
ax.set_ybound(-.5,len(ci.index) - .5)
ax.set_xticks([1])
ax.set_xticklabels([1])
ax.set_yticks(range(len(ci.index)))
ax.set_yticklabels(ci.index)
ax.set_ybound(-.5, 1.5)
prettify_ax(ax)
axs[2].set_xlabel('Hazard Ratio')
fig.tight_layout()
#fig.savefig('/cellar/users/agross/figures/fig2c_alt.pdf', transparent=True)
0.0385826115252 0.00706808331344 0.885903546672
combo = combine(del_3p < 0, p53_mut>0)
combo = combo.ix[keepers].dropna()
combo.name = 'TP53 / 3p14'
len(combo), len(combo.ix[codes[codes != 'HNSC'].index].dropna())
(4583, 4404)
venn_pandas(del_3p_all < 0, p53_all>0)
<matplotlib_venn._venn2.Venn2 instance at 0x76ca2d8>
len(combine(del_3p_all.ix[del_3p_all.index.diff(true_index(codes=='HNSC'))].dropna() < 0,
p53_all>0))
7081
fisher_exact_test(del_3p_all.ix[del_3p_all.index.diff(true_index(codes=='HNSC'))].dropna() < 0,
p53_all>0)
odds_ratio 2.47e+00 p 3.24e-62 dtype: float64
venn_pandas(del_3p.ix[del_3p.index.diff(true_index(codes=='HNSC'))].dropna() < 0,
p53_mut>0)
<matplotlib_venn._venn2.Venn2 instance at 0x6ea85f0>
len(combine(del_3p.ix[combo.index.diff(true_index(codes=='HNSC'))] < 0, p53_mut>0))
4404
fisher_exact_test(del_3p.ix[combo.index.diff(true_index(codes=='HNSC'))] < 0, p53_mut>0)
odds_ratio 2.00e+00 p 4.67e-26 dtype: float64
combo_all = combine(p53_all>0, del_3p_all<0)
combo_all.name = 'TP53 / 3p14'
ct = pd.crosstab(combo_all, stage[stage != 'missing']=='stage iv').T
(ct.ix[True] / (1.*ct.sum())).order().plot(kind='bar');
plt.ylabel('% of Patients in Stage IV')
plt.xlabel('');
c = clinical['LGG'].clinical
combo_all = combine(p53_all>0, del_3p_all<0)
fisher_exact_test(combo_all=='both', stage[stage != 'M']=='stage iv')
odds_ratio 2.45e+00 p 4.16e-20 dtype: float64
combo_all = combine(p53_all>0, del_3p_all<0)
fisher_exact_test(combo_all=='both', stage[stage != 'M']=='stage iv')
odds_ratio 2.45e+00 p 4.16e-20 dtype: float64
Univarite model, all patients
get_cox_ph(surv_5y, combo=='both', print_desc=True);
coef exp(coef) se(coef) z p feature 0.505 1.66 0.0767 6.59 4.3e-11 Likelihood ratio test=39.5 on 1 df, p=3.22e-10 n= 4583, number of events= 955
Multivariate model, all patients
get_cox_ph_ms(surv_5y, combo=='both', [codes, age, stage], interactions=None)
LR 0.000308 feature_p 0.000235 fmla Surv(days, event) ~ feature + codes + age + st... hazzard 1.4 dtype: object
codes.ix[keepers].value_counts().shape
(18,)
Univariate model, HNSCC excluded
get_cox_ph(surv_5y, combo.ix[true_index(codes!='HNSC')].dropna()=='both', print_desc=True);
coef exp(coef) se(coef) z p feature 0.452 1.57 0.082 5.52 3.4e-08 Likelihood ratio test=27.8 on 1 df, p=1.35e-07 n= 4404, number of events= 904
cox((combo[combo.isin(['TP53','both'])].dropna()=='both').ix[true_index(codes!='HNSC')].dropna(),
surv_5y).ix['LR'].ix['p']
0.001689913203002158
pats = true_index(codes!='HNSC').intersection(true_index(age < 75))
log_rank((combo[combo.isin(['TP53','both'])]=='both').ix[pats].dropna(), surv_5y).ix['p']
0.0031807460590553751
cox((combo[combo.isin(['TP53','both'])]=='both').ix[pats].dropna(), surv_5y)
hazard exp(coef) 1.39 exp(-coef) 0.72 lower .95 1.12 upper .95 1.74 LR stat 8.50 df 1.00 p 0.00 concordance stat 0.56 se 0.02 dtype: float64
Multivariate model, HNSCC excluded
get_cox_ph_ms(surv_5y, (combo=='both').ix[true_index(codes!='HNSC')].dropna(),
[codes, stage, age, old], interactions=False)
LR 0.00495 feature_p 0.0042 fmla Surv(days, event) ~ feature + codes + stage + ... hazzard 1.32 dtype: object
f = (combo[combo.isin(['del_3p','both'])]=='both').ix[true_index(codes!='HNSC')].dropna()
pts = f.index
cox(f, surv_3y)
hazard exp(coef) 1.93e+00 exp(-coef) 5.17e-01 lower .95 1.50e+00 upper .95 2.49e+00 LR stat 2.68e+01 df 1.00e+00 p 2.22e-07 concordance stat 5.80e-01 se 1.63e-02 dtype: float64
two_hit = combo == 'both'
two_hit.name = 'two_hit'
p53_mut.name = 'TP53'
pct_del.name = 'pct_del'
old = age >= 75
old.name = 'old'
pts = keepers.diff(ti(codes=='HNSC'))
pts = keepers.diff(ti(codes.isin(['HNSC','LUSC'])))
fmla0 = 'Surv(days, event) ~ TP53 + del_3p'
fmla1 = 'Surv(days, event) ~ TP53 + del_3p + pct_del + TP53:pct_del'
fmla2 = 'Surv(days, event) ~ TP53 + del_3p + pct_del + two_hit + TP53:pct_del'
m0 = get_cox_ph(surv_3y, covariates=[p53_mut.ix[pts], del_3p < 0, two_hit, old, age, pct_del], formula=fmla0)
m1 = get_cox_ph(surv_3y, covariates=[p53_mut.ix[pts], del_3p < 0, two_hit, old, age, pct_del], formula=fmla1)
m2 = get_cox_ph(surv_3y, covariates=[p53_mut.ix[pts], del_3p < 0, two_hit, old, age, pct_del], formula=fmla2)
LR_test(m2,m0), LR_test(m2, m1)
(2.6714203811114096e-05, 1.7904393791766563e-05)
fmla0 = 'Surv(days, event) ~ TP53 + del_3p + pct_del + old + age + codes + stage'
fmla1 = 'Surv(days, event) ~ TP53 + del_3p + pct_del + TP53:pct_del + old + age + codes + stage'
fmla2 = 'Surv(days, event) ~ TP53 + del_3p + pct_del + two_hit + TP53:pct_del + old + age + codes + stage'
m0 = get_cox_ph(surv_3y, covariates=[p53_mut.ix[pts], del_3p < 0, two_hit, old, age, pct_del, codes, stage], formula=fmla0)
m1 = get_cox_ph(surv_3y, covariates=[p53_mut.ix[pts], del_3p < 0, two_hit, old, age, pct_del, codes, stage], formula=fmla1)
m2 = get_cox_ph(surv_3y, covariates=[p53_mut.ix[pts], del_3p < 0, two_hit, old, age, pct_del, codes, stage], formula=fmla2)
LR_test(m2,m0), LR_test(m2, m1)
(0.0011125269418744536, 0.00025387690377041955)
pts = keepers.diff(ti(codes.isin(['HNSC','LUSC'])))
#pts = keepers.diff(ti(codes=='HNSC'))
fmla0 = 'Surv(days, event) ~ TP53 + del_3p + pct_del + codes + stage + age'
fmla1 = 'Surv(days, event) ~ TP53 + del_3p + pct_del + TP53:pct_del + codes + stage + age'
fmla2 = 'Surv(days, event) ~ TP53 + del_3p + pct_del + two_hit + TP53:pct_del + codes + stage + age'
m0 = get_cox_ph(surv_3y, covariates=[p53_mut.ix[pts], del_3p < 0, two_hit, age, pct_del, codes, stage], formula=fmla0)
m1 = get_cox_ph(surv_3y, covariates=[p53_mut.ix[pts], del_3p < 0, two_hit, age, pct_del, codes, stage], formula=fmla1)
m2 = get_cox_ph(surv_3y, covariates=[p53_mut.ix[pts], del_3p < 0, two_hit, age, pct_del, codes, stage], formula=fmla2)
LR_test(m2,m0), LR_test(m2, m1)
(0.0011057684389119127, 0.00025285253518623135)
cc = array(colors_th)
fig, ax = subplots(figsize=(5,3))
draw_survival_curve(combo, surv_5y, ax=ax,
colors=cc[[2,0,3,1]], ms=30, alpha=.7)
ax.get_legend().set_visible(False)
ax.set_ybound(0,1)
ax.set_xbound(0,5)
prettify_ax(ax)
combo.ix[true_index(codes!='HNSC')].dropna().ix[surv_5y.unstack().index].dropna().value_counts()
neither 1954 TP53 1047 both 726 del_3p 677 dtype: int64
k2 = combo.index.intersection(true_index(codes!='HNSC'))
#k2 = k2.intersection(ti(year <=2009))
f = (combo[combo.isin(['TP53','both'])]=='both').ix[true_index(codes!='HNSC')].dropna()
pts = f.index
fmla = 'Surv(days, event) ~ feature + age + old + pct_del + stage + strata(codes)'
fmla2 = 'Surv(days, event)~ age + stage + old + pct_del + strata(codes)'
m1 = get_cox_ph(surv_3y, f, [codes, age.ix[pts], old, stage, pct_del], formula=fmla, print_desc=True)
m2 = get_cox_ph(surv_3y, f, [codes, age.ix[pts], old, stage, pct_del], formula=fmla2, print_desc=True)
LR_test(m1, m2)
coef exp(coef) se(coef) z p feature 0.3677 1.444 0.1385 2.654 0.00800 age 0.3628 1.437 0.0977 3.712 0.00021 old 0.0342 1.035 0.0679 0.505 0.61000 pct_del 0.0502 1.051 0.0868 0.578 0.56000 stagestage i -1.6426 0.193 0.6038 -2.720 0.00650 stagestage ii -1.3456 0.260 0.6051 -2.224 0.02600 stagestage iii -0.6685 0.512 0.6019 -1.111 0.27000 Likelihood ratio test=71.7 on 7 df, p=6.7e-13 n= 1769, number of events= 309 coef exp(coef) se(coef) z p age 0.3511 1.421 0.0973 3.609 0.00031 stagestage i -1.6499 0.192 0.6036 -2.733 0.00630 stagestage ii -1.3478 0.260 0.6042 -2.231 0.02600 stagestage iii -0.6726 0.510 0.6017 -1.118 0.26000 old 0.0397 1.041 0.0676 0.588 0.56000 pct_del 0.1273 1.136 0.0812 1.568 0.12000 Likelihood ratio test=64.7 on 6 df, p=4.97e-12 n= 1769, number of events= 309
0.0081606251451832607
f = (combo[combo.isin(['del_3p','both'])]=='both').ix[true_index(codes!='HNSC')].dropna()
pts = f.index
fmla = 'Surv(days, event) ~ feature + age + old + pct_del + stage + strata(codes)'
fmla2 = 'Surv(days, event)~ age + stage + old + pct_del + strata(codes)'
m1 = get_cox_ph(surv_3y, f, [codes, age.ix[pts], old, stage, pct_del], formula=fmla, print_desc=True)
m2 = get_cox_ph(surv_3y, f, [codes, age.ix[pts], old, stage, pct_del], formula=fmla2, print_desc=True)
LR_test(m1, m2)
coef exp(coef) se(coef) z p feature 0.3043 1.356 0.1787 1.703 0.0890 age 0.3546 1.426 0.1068 3.320 0.0009 old -0.0536 0.948 0.0774 -0.692 0.4900 pct_del 0.1163 1.123 0.1005 1.157 0.2500 stagestage i -1.1684 0.311 0.7444 -1.570 0.1200 stagestage ii -0.7240 0.485 0.7477 -0.968 0.3300 stagestage iii 0.1114 1.118 0.7382 0.151 0.8800 Likelihood ratio test=76.3 on 7 df, p=7.72e-14 n= 1395, number of events= 252 coef exp(coef) se(coef) z p age 0.3447 1.412 0.1066 3.232 0.0012 stagestage i -1.1732 0.309 0.7469 -1.571 0.1200 stagestage ii -0.7312 0.481 0.7501 -0.975 0.3300 stagestage iii 0.1030 1.108 0.7405 0.139 0.8900 old -0.0556 0.946 0.0773 -0.720 0.4700 pct_del 0.1320 1.141 0.0996 1.325 0.1900 Likelihood ratio test=73.3 on 6 df, p=8.47e-14 n= 1395, number of events= 252
0.083554101863663729
f = (combo[combo.isin(['TP53','both'])]=='both').ix[true_index(codes!='HNSC')].dropna()
pts = f.index
fmla = 'Surv(days, event) ~ feature + age + stage + strata(codes)'
fmla2 = 'Surv(days, event)~ age + stage + strata(codes)'
m1 = get_cox_ph(surv_3y, f, [codes, age.ix[pts]>75, stage], formula=fmla, print_desc=True)
m2 = get_cox_ph(surv_3y, f, [codes, age.ix[pts]>75, stage], formula=fmla2, print_desc=True)
LR_test(m1, m2)
coef exp(coef) se(coef) z p feature 0.367 1.443 0.1298 2.83 4.7e-03 age 0.207 1.230 0.0513 4.04 5.4e-05 stagestage i -1.812 0.163 0.6007 -3.02 2.5e-03 stagestage ii -1.518 0.219 0.5995 -2.53 1.1e-02 stagestage iii -0.824 0.439 0.5940 -1.39 1.7e-01 Likelihood ratio test=58 on 5 df, p=3.08e-11 n= 1769, number of events= 309 coef exp(coef) se(coef) z p age 0.210 1.234 0.0511 4.11 3.9e-05 stagestage i -1.831 0.160 0.6011 -3.05 2.3e-03 stagestage ii -1.524 0.218 0.5992 -2.54 1.1e-02 stagestage iii -0.825 0.438 0.5945 -1.39 1.7e-01 Likelihood ratio test=50.1 on 4 df, p=3.38e-10 n= 1769, number of events= 309
0.004919026262837513
cc = array(colors_th)
fig, ax = subplots(figsize=(4,3))
draw_survival_curve(combo.ix[k2], surv_5y, ax=ax,
colors={'both': cc[0], 'TP53': cc[1], 'del_3p': cc[2], 'neither':cc[3]},
ms=30, alpha=.7)
ax.get_legend().set_visible(False)
ax.set_ybound(0,1)
ax.set_xbound(0,5)
prettify_ax(ax)
fig.tight_layout()
figdir = '/cellar/users/agross/figures/'
fig, ax = subplots(figsize=(4,3))
draw_survival_curve(combo.ix[k2], surv_5y, ax=ax,
colors={'both': cc[0], 'TP53': cc[1], 'del_3p': cc[2], 'neither':cc[3]},
ms=30, alpha=.7)
ax.get_legend().set_visible(False)
ax.set_ybound(0,1)
ax.set_xbound(0,5)
prettify_ax(ax)
fig.tight_layout()
fig.savefig(figdir + 'fig3e.pdf', transparent=True)
len(k2)
4404
df = pd.concat([p53_mut, del_3p, combo.ix[k2], surv[:,'days'], surv[:,'event']],
keys=['TP53 Mutation', '3p Deletion', 'TP53 / 3p', 'Days to Death/Censoring', 'Death Indicator'],
axis=1).dropna()
df.to_csv(figdir + 'fig3c.csv')
get_surv_fit(surv, combo.ix[ti(codes != 'HNSCC')], time_cutoff=3)
Stats | Median Survival | 3y Survival | ||||||
---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | |
both | 823 | 246 | 4.78 | 4.11 | 5.72 | 0.62 | 0.58 | 0.67 |
del_3p | 705 | 182 | 8.12 | 6.42 | 9.48 | 0.80 | 0.76 | 0.83 |
neither | 1993 | 433 | 8.20 | 6.96 | 10.60 | 0.77 | 0.75 | 0.80 |
TP53 | 1062 | 272 | 5.56 | 4.85 | 7.17 | 0.75 | 0.72 | 0.79 |
4 rows × 8 columns
fmla = 'Surv(days, event) ~ feature + age + old + stage + strata(codes)'
m = {'TP53':'c', 'both':'d', 'neither':'a', 'del_3p':'b'}
cm = combo.map(m).ix[true_index(codes != 'HNSC')]
f = get_cox_ph(surv_3y, cm, interactions=False)
ci = convert_robj(robjects.r.summary(f)[7])
ci.index = ['3p','TP53','both']
n = ci.ix[0]*0 +1
n.name = 'neither'
ci = ci.append(n)
ci = ci.ix[['neither','3p', 'TP53','both']]
ci_uni = ci
f = get_cox_ph(surv_3y, cm, [codes, age, old, stage],
formula=fmla, interactions=False)
ci = convert_robj(robjects.r.summary(f)[7])
ci = ci.ix[:3]
ci.index = ['3p','TP53','both']
n = ci.ix[0]*0 +1
n.name = 'neither'
ci = ci.append(n)
ci = ci.ix[['neither','3p', 'TP53','both']]
ci_full = ci
fig, ax = subplots(1,1, figsize=(3,2.5))
ci = ci_full
haz = ci['exp(coef)']
b = haz.plot(kind='bar', ax=ax, color=cc[[3,2,1,0]],
yerr=[haz - ci['lower .95'], ci['upper .95'] - haz], ecolor='black')
prettify_ax(ax)
ax.set_ylabel('hazard')
ax.set_ylim(0)
ax.axhline(y=1, ls='--', lw=3, color='black', alpha=.5)
fig.tight_layout()
cm.value_counts().sum()
4404
fig, ax = subplots(1,1, figsize=(2.3,3))
ci = ci_full.ix[::-1]
haz = ci['exp(coef)']
color_list = colors_th
for j,h in enumerate(haz):
ax.scatter(h, j, marker='s', s=100, color=color_list[j],
edgecolors=['black'], zorder=10, )
ax.plot(*zip(*((ci.iloc[j]['lower .95'],j), (ci.iloc[j]['upper .95'],j))),
lw=3, ls='-', marker='o', dash_joinstyle='bevel', color=color_list[j])
ax.axvline(1, ls='--', color='black')
ax.set_xscale('log')
ax.set_xbound(.7,2.)
ax.set_ybound(-.5,len(ci.index) - .5)
ax.set_xticks([.75, 1, 1.25, 1.5, 2])
ax.set_xticklabels([.75, 1, 1.25, 1.5, 2])
ax.set_yticks(range(len(ci.index)))
ax.set_yticklabels('')
ax.set_ybound(-.5, 3.5)
prettify_ax(ax)
ax.set_xlabel('3 Year Hazard Ratio')
fig.tight_layout()
fig.savefig('/cellar/users/agross/figures/fig3f.pdf', transparent=True)
fig, axs = subplots(1,2, figsize=(5,2.5), sharey=True)
for i,ci in enumerate([ci_uni, ci_full]):
ax = axs[i]
haz = ci['exp(coef)']
b = haz.plot(kind='bar', ax=ax, color=cc[[3,2,1,0]],
yerr=[haz - ci['lower .95'], ci['upper .95'] - haz], ecolor='black')
prettify_ax(ax)
ax.set_ylabel('hazard')
ax.set_ylim(0)
ax.axhline(y=1, ls='--', lw=3, color='black', alpha=.5)
fig.tight_layout()
from itertools import combinations
c = list(combinations(cm.unique(),2))[0]
cm = combo.ix[true_index(codes!='HNSC')].dropna()
sig = pd.Series({c: get_cox_ph_ms(surv_5y, cm[cm.isin(c)], interactions=False)['LR']
for c in combinations(cm.unique(),2)})
sig
(TP53, both) 1.69e-03 (TP53, del_3p) 1.28e-02 (TP53, neither) 3.15e-02 (both, del_3p) 8.11e-07 (both, neither) 1.64e-07 (neither, del_3p) 6.53e-01 dtype: float64
cm = combo.ix[true_index(codes!='HNSC')].dropna()
sig = pd.Series({c: get_cox_ph_ms(surv_3y, cm[cm.isin(c)], interactions=False)['LR']
for c in combinations(cm.unique(),2)})
sig
(TP53, both) 2.23e-05 (TP53, del_3p) 1.49e-01 (TP53, neither) 9.25e-01 (both, del_3p) 2.22e-07 (both, neither) 3.87e-06 (neither, del_3p) 1.19e-01 dtype: float64
cm = combo.ix[true_index(codes!='HNSC')].dropna()
sig = pd.Series({c: get_cox_ph_ms(surv_3y, cm[cm.isin(c)], [codes, age], interactions=False)['LR']
for c in combinations(cm.unique(),2)})
sig
(TP53, both) 1.49e-03 (TP53, del_3p) 8.57e-01 (TP53, neither) 4.18e-01 (both, del_3p) 7.11e-02 (both, neither) 2.78e-04 (neither, del_3p) 8.42e-01 dtype: float64
cm = combo.ix[true_index(codes!='HNSC')].dropna()
sig = pd.Series({c: get_cox_ph_ms(surv_5y, cm[cm.isin(c)], [codes, age, stage], interactions=False)['LR']
for c in combinations(cm.unique(),2)})
sig
(TP53, both) 5.51e-03 (TP53, del_3p) 4.85e-01 (TP53, neither) 7.44e-01 (both, del_3p) 3.07e-01 (both, neither) 2.83e-04 (neither, del_3p) 7.52e-01 dtype: float64
pts = keepers.diff(ti(codes=='HNSC'))
cm = combo.ix[pts].dropna()
sig = pd.Series({c: get_cox_ph_ms(surv_3y, cm[cm.isin(c)], [codes, age, old, stage],
interactions=False)['LR']
for c in combinations(cm.unique(),2)})
sig
(TP53, both) 2.33e-03 (TP53, del_3p) 8.95e-01 (TP53, neither) 3.91e-01 (both, del_3p) 5.88e-02 (both, neither) 9.21e-04 (neither, del_3p) 7.22e-01 dtype: float64
df = pd.concat([p53_mut, del_3p, codes, age, old, stage.ix[keepers].fillna('missing'), combo.ix[k2], surv[:,'days'], surv[:,'event']],
keys=['TP53 Mutation', '3p Deletion',
'Tissue Type', 'Age', 'Age > 75', 'Stage'
'TP53 / 3p', 'Days to Death/Censoring', 'Death Indicator'],
axis=1).ix[keepers]
df.isnull().sum()
TP53 Mutation 0 3p Deletion 0 Tissue Type 0 Age 0 Age > 75 0 StageTP53 / 3p 0 Days to Death/Censoring 179 Death Indicator 0 dtype: int64
df = pd.concat([p53_mut, del_3p, codes, age, old, stage.ix[keepers].fillna('M'), combo.ix[k2], surv[:,'days'], surv[:,'event']],
keys=['TP53 Mutation', '3p Deletion',
'Tissue Type', 'Age', 'Age > 75', 'Stage',
'TP53 / 3p', 'Days to Death/Censoring', 'Death Indicator'],
axis=1).dropna()
df.to_csv(figdir + 'fig3c.csv')
df.shape
(4404, 9)
sig.ix[3]
0.050512825852065762
f = {}
for cancer in codes.ix[combo.index].unique():
try:
c = combo[combo.isin(['both','TP53'])] == 'both'
c = c.ix[ti(codes==cancer)].dropna()
f[cancer] = get_cox_ph_ms(surv_3y, c, [age, old], interactions=False)
except:
print 'fail'
fail
pd.DataFrame(f).T.sort('LR')
LR | feature_p | fmla | hazzard | |
---|---|---|---|---|
OV | 0.0167 | 0.0137 | Surv(days, event) ~ feature + age\n | 1.75 |
LUAD | 0.0579 | 0.063 | Surv(days, event) ~ feature + old\n | 1.79 |
LGG | 0.123 | 0.0339 | Surv(days, event) ~ feature\n | 4.48 |
LIHC | 0.216 | 0.163 | Surv(days, event) ~ feature\n | 5.53 |
BRCA | 0.248 | 0.251 | Surv(days, event) ~ feature + age\n | 1.64 |
KIRC | 0.28 | 0.999 | Surv(days, event) ~ feature\n | 2.64e+08 |
COAD | 0.324 | 0.17 | Surv(days, event) ~ feature\n | 2.56 |
LAML | 0.422 | 0.421 | Surv(days, event) ~ feature + old\n | 1.7 |
STAD | 0.514 | 0.478 | Surv(days, event) ~ feature\n | 1.35 |
HNSC | 0.527 | 0.175 | Surv(days, event) ~ feature\n | 2.06 |
READ | 0.572 | 0.549 | Surv(days, event) ~ feature + age\n | 2.01 |
UCEC | 0.88 | 0.88 | Surv(days, event) ~ feature + age\n | 0.919 |
BLCA | 1 | 0.655 | Surv(days, event) ~ feature\n | 0.745 |
CESC | 1 | 1 | Surv(days, event) ~ feature\n | 1.62e+09 |
LUSC | 1 | 0.317 | Surv(days, event) ~ feature\n | 0.681 |
SARC | 1 | 1 | Surv(days, event) ~ feature\n | 1 |
SKCM | 1 | 0.702 | Surv(days, event) ~ feature\n | 0.665 |
17 rows × 4 columns
seg = cn_all2.groupby(level=0).median()
seg.shape
(799, 8323)
rr = screen_feature(p53_all.ix[ti(codes=='LUAD')], chi2_cont_test, seg < 0)
rr.head(20)
chi2 | p | dof | q | |
---|---|---|---|---|
8p23.3 | 59.48 | 1.23e-14 | 1 | 9.85e-12 |
8p23.2 | 56.86 | 4.68e-14 | 1 | 1.45e-11 |
5q31.3 | 56.56 | 5.45e-14 | 1 | 1.45e-11 |
5q31.2 | 54.40 | 1.64e-13 | 1 | 3.00e-11 |
8p21.2 | 54.13 | 1.88e-13 | 1 | 3.00e-11 |
8p22 | 53.18 | 3.05e-13 | 1 | 4.06e-11 |
8p23.1 | 52.08 | 5.33e-13 | 1 | 5.23e-11 |
5q12.3 | 51.91 | 5.82e-13 | 1 | 5.23e-11 |
8p12 | 51.88 | 5.89e-13 | 1 | 5.23e-11 |
5q23.1 | 51.58 | 6.87e-13 | 1 | 5.49e-11 |
8p21.3 | 51.03 | 9.09e-13 | 1 | 6.34e-11 |
5q22.3 | 50.74 | 1.05e-12 | 1 | 6.34e-11 |
5q23.3 | 50.74 | 1.05e-12 | 1 | 6.34e-11 |
5q32 | 50.52 | 1.18e-12 | 1 | 6.34e-11 |
5q31.1 | 50.38 | 1.27e-12 | 1 | 6.34e-11 |
5q22.1 | 50.38 | 1.27e-12 | 1 | 6.34e-11 |
8p21.1 | 50.24 | 1.36e-12 | 1 | 6.40e-11 |
5q15 | 49.55 | 1.93e-12 | 1 | 8.59e-11 |
5q21.2 | 48.83 | 2.79e-12 | 1 | 1.11e-10 |
5q14.3 | 48.83 | 2.79e-12 | 1 | 1.11e-10 |
20 rows × 4 columns
cc = ti(codes.ix[combo_all.index].value_counts() > 50)
ct = pd.DataFrame({c: fisher_exact_test(p53_all.ix[ti(codes==c)].dropna()>0, del_3p_all<0)
for c in cc}).T
ct.dropna().sort('odds_ratio', ascending=False)
odds_ratio | p | |
---|---|---|
LAML | inf | 7.40e-08 |
UCEC | 28.35 | 3.46e-21 |
HNSC | 9.38 | 9.74e-18 |
LGG | 5.67 | 2.29e-03 |
KIRP | 4.75 | 2.73e-01 |
COAD | 3.94 | 4.21e-05 |
BRCA | 3.47 | 7.22e-17 |
STAD | 3.46 | 9.20e-06 |
LIHC | 2.60 | 1.54e-01 |
LUSC | 2.06 | 4.92e-02 |
LUAD | 1.96 | 5.50e-04 |
BLCA | 1.51 | 2.49e-01 |
GBM | 1.45 | 3.98e-01 |
OV | 1.42 | 3.36e-01 |
PAAD | 1.39 | 7.64e-01 |
SKCM | 1.35 | 4.89e-01 |
KICH | 1.14 | 1.00e+00 |
READ | 0.93 | 1.00e+00 |
KIRC | 0.82 | 7.81e-01 |
CESC | 0.66 | 1.00e+00 |
SARC | 0.28 | 4.35e-01 |
THCA | 0.00 | 1.00e+00 |
PRAD | 0.00 | 1.00e+00 |
23 rows × 2 columns
survival_and_stats(combo_all.ix[true_index(codes=='LUAD')].dropna(), surv_5y)
fig, axs = subplots(1, 7, figsize=(10,3), sharey=True)
cc = array(colors_th)
for i,c in enumerate(['HNSC','BRCA','LUAD', 'OV','KIRC','LGG','LUSC']):
draw_survival_curve(combo.ix[true_index(codes==c)].dropna(),
surv_3y, ax=axs[i],
colors={'both': cc[0], 'TP53': cc[1], 'del_3p': cc[2], 'neither':cc[3]})
axs[i].get_legend().set_visible(False)
axs[i].set_ylim(.4,1.05)
axs[i].set_xticks([0,1,2,3])
prettify_ax(axs[i])
axs[i].annotate(c, (2, .42))
fig.tight_layout()
fig, axs = subplots(1, 7, figsize=(10,3), sharey=True)
cc = array(colors_th)
for i,c in enumerate(['HNSC','BRCA','LUAD', 'OV','KIRC','LGG','LUSC']):
f = combo[combo.isin(['both','TP53'])].ix[true_index(codes==c)].dropna()
#f = f.ix[ti(age < 75)].dropna()
draw_survival_curve(f, surv_5y, ax=axs[i],
colors={'both': cc[0], 'TP53': cc[1], 'del_3p': cc[2], 'neither':cc[3]})
axs[i].get_legend().set_visible(False)
axs[i].set_ylim(.4,1.05)
axs[i].set_xticks([0,1,2,3,4,5])
prettify_ax(axs[i])
axs[i].annotate(c, (2, .42))
fig.tight_layout()