%pylab inline
Populating the interactive namespace from numpy and matplotlib
cd ../src
/cellar/users/agross/TCGA_Code/TCGA/src
import os
import pickle as pickle

import pandas as pd
Here we read in the pre-processed data that we downloaded and initialized in the download_data notebook.
def get_run(firehose_dir, version='Latest'):
    '''
    Helper to get a run from the file-system.

    Looks inside ``<firehose_dir>/ucsd_analyses`` for a version directory and
    unpickles the ``RunObject.p`` file stored in it.

    firehose_dir: directory that contains the ``ucsd_analyses`` folder.
    version: name of the version sub-directory to load. The default,
        'Latest', picks the entry of ``ucsd_analyses`` that sorts last by
        name.

    Returns the unpickled run object.

    Raises IndexError if 'Latest' is requested and ``ucsd_analyses`` is
    empty; errors from ``os.listdir`` / ``open`` propagate when a path does
    not exist.
    '''
    path = '{}/ucsd_analyses'.format(firehose_dir)
    # Compare by value: an identity test (`is`) on strings only succeeds when
    # both objects happen to be interned, so a 'Latest' built at runtime
    # (e.g. read from a config file) would silently be treated as a real
    # directory name.
    if version == 'Latest':
        version = sorted(os.listdir(path))[-1]
    # NOTE: unpickling executes arbitrary code; only load run objects from
    # directories you trust.
    with open('{}/{}/RunObject.p'.format(path, version), 'rb') as handle:
        return pickle.load(handle)
# Global settings are read from a header-less table whose first column is used
# as the index; squeeze=True is expected to collapse the remaining column into
# a Series keyed by parameter name.
params = pd.read_table('../global_params.txt', header=None, squeeze=True,
                       index_col=0)

# `.loc` replaces the deprecated `.ix` indexer (removed in pandas 1.0); for
# these string labels the lookup is identical.
run_path = '{}/Firehose__{}/'.format(params.loc['OUT_PATH'],
                                     params.loc['RUN_DATE'])
run = get_run(run_path, 'Run_' + params.loc['VERSION'])
cancer = run.load_cancer(params.loc['CANCER'])

# Pull the clinical table and each molecular data layer for this cancer.
clinical = cancer.load_clinical()
mut = cancer.load_data('Mutation')
mut.uncompress()
cn = cancer.load_data('CN_broad')
cn.uncompress()
rna = cancer.load_data('mRNASeq')
mirna = cancer.load_data('miRNASeq')

# Survival series used further down to restrict the patient set.
surv = clinical.survival.survival_5y
# HPV calls from the external summary sheet: the 0/1 values of the
# Molecular_HPV column are relabelled as 'HPV-' / 'HPV+'.
hpv_all = pd.read_csv(
    '../Extra_Data/hpv_summary_3_20_13_distribute.csv',
    index_col=0,
)
hpv = hpv_all['Molecular_HPV'].map({0: 'HPV-', 1: 'HPV+'})
hpv.name = 'HPV'
# hpv_seq is a second name for the same Series object, not a copy.
hpv_seq = hpv
hpv_seq.value_counts()
HPV- 244 HPV+ 35 dtype: int64
# HPV status from the two clinical assay columns. Only patients with both
# values present are kept; the number of 'positive' results per patient is
# then mapped to a label: 2 -> 'HPV+', 0 -> 'HPV-', and a split result (1) is
# turned into NaN and dropped.
status = clinical.clinical[['hpvstatusbyishtesting',
                            'hpvstatusbyp16testing']]
hpv_clin = (
    (status.dropna() == 'positive')
    .sum(axis=1)
    .map({0: 'HPV-', 1: nan, 2: 'HPV+'})
    .dropna()
)
hpv_clin.value_counts()
HPV- 50 HPV+ 8 dtype: int64
# Label counts for patients that have a clinical HPV call but whose index label
# is absent from hpv_seq.
# NOTE(review): `Index.diff` and the `.ix` indexer were removed from later
# pandas releases (`Index.difference` and `.loc` are the replacements) --
# confirm which pandas version this notebook targets before re-running.
hpv_clin.ix[hpv_clin.index.diff(hpv_seq.index)].value_counts()
HPV- 9 HPV+ 4 dtype: int64
# Second source of HPV calls: the auxiliary HNSC table. The file's second line
# is skipped and '[Not Available]' entries are read as missing values.
hpv_new = pd.read_table('../Extra_Data/nationwidechildrens.org_auxiliary_hnsc.txt',
skiprows=[1], index_col=0, na_values=['[Not Available]'])
hpv_new = hpv_new['hpv_status']
# Boolean "is HPV+" flag: the sequencing-based call wins where one exists, and
# the auxiliary status fills in index labels that hpv_seq does not cover.
# NOTE(review): a missing hpv_status compares unequal to 'Positive' and so ends
# up as False here, i.e. indistinguishable from a non-positive call -- confirm
# that treating unknown status this way is intended.
hpv_combo = (hpv_seq.dropna() == 'HPV+').combine_first(hpv_new == 'Positive')
# Copies the series to the system clipboard (needs a clipboard backend).
hpv_combo.to_clipboard()
# Attach the combined flag to the clinical object and save it.
clinical.hpv = hpv_combo
clinical.save() #I keep the same object as there are no side effects
# Patients whose combined HPV flag is False (== 0), narrowed step by step to
# those that also appear in the mutation, copy-number, survival, mRNA and
# miRNA data.
keepers_o = (
    hpv_combo[hpv_combo == 0].index
    .intersection(mut.features.columns)
    .intersection(cn.features.columns)
    .intersection(surv.unstack().index)
    .intersection(rna.features.columns)
    .intersection(mirna.features.columns)
)
len(keepers_o)
258
from Initialization.InitializeReal import RealDataset
from Processing.Helpers import make_path_dump
# Rebuild the mRNA and miRNA datasets with keepers_o passed as the patient
# argument; these replace the rna / mirna objects loaded earlier.
# NOTE(review): presumably this restricts each dataset to the keepers_o
# patients -- verify against RealDataset.
rna = RealDataset(run, cancer, 'mRNASeq', keepers_o)
mirna = RealDataset(run, cancer, 'miRNASeq', keepers_o, create_meta_features=False)
# Dump each rebuilt dataset under its own path as store/no_hpv2.p.
# NOTE(review): presumably make_path_dump creates missing directories and
# pickles the object -- confirm in Processing.Helpers.
make_path_dump(rna, rna.path + '/store/no_hpv2.p')
make_path_dump(mirna, mirna.path + '/store/no_hpv2.p')