RNAseq, mutation, and copy number data were accessed from the UCSC Xena Data Browser. Clinical data were downloaded from the TCGA Snaptron curation effort. See data_download.sh
for more details.
import os
import requests
import numpy as np
import pandas as pd
from sklearn import preprocessing
# Input files (raw downloads; see data_download.sh)
_raw_dir = os.path.join('data', 'raw')
rna_file = os.path.join(_raw_dir, 'HiSeqV2')
mut_file = os.path.join(_raw_dir, 'PANCAN_mutation')
copy_file = os.path.join(_raw_dir, 'Gistic2_CopyNumber_Gistic2_all_thresholded.by_genes')
clinical_file = os.path.join(_raw_dir, 'samples.tsv')

# Output files
# RNAseq matrices scaled by z-score and by zero-one normalization
rna_out_file = os.path.join('data', 'pancan_scaled_rnaseq.tsv.gz')
rna_out_zeroone_file = os.path.join('data', 'pancan_scaled_zeroone_rnaseq.tsv.gz')
# Binary mutation matrix
mut_out_file = os.path.join('data', 'pancan_mutation.tsv.gz')
# Per-sample mutation burden
mut_burden_file = os.path.join('data', 'pancan_mutation_burden.tsv')
# Two copy number matrices, for thresholded (2) gains and losses
copy_gain_out_file = os.path.join('data', 'copy_number_gain.tsv.gz')
copy_loss_out_file = os.path.join('data', 'copy_number_loss.tsv.gz')
# Processed clinical covariates
clinical_processed_out_file = os.path.join('data', 'clinical_data.tsv')
# OncoKB gene-type annotations
oncokb_out_file = os.path.join('data', 'oncokb_genetypes.tsv')
# Status matrix integrating mutation and copy number events
known_status_file = os.path.join('data', 'status_matrix.tsv.gz')
# Load the four raw matrices. The clinical file mixes dtypes in many
# columns (see the DtypeWarning emitted below); low_memory=False makes
# pandas infer each column's type from the full file instead of chunks.
rnaseq_df = pd.read_table(rna_file, index_col=0)
mutation_df = pd.read_table(mut_file)
copy_df = pd.read_table(copy_file, index_col=0)
clinical_df = pd.read_table(clinical_file, index_col=0, low_memory=False)
/home/gway/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (32,129,130,131,132,133,134,137,138,139,140,141,142,144,147,217,256,283,284,285,286,287,288,289,290,294,295,296,299,301,304,305,306,307,308,309,310,311,315,317,320,321,322,323,325,326,328,329,332,335,336,341,344,346,349,351,357,359,361,362,365,367,368,369,370,374,376,377,378,379,380,381,382,384,385,386,387,388,392,393,395,396,397,408,409,418,419,420,421,422,423,424,425,426,427,428,431,433,434,435,436,444,447,448,449,451,453,454,455,456,457,458,459,460,461,464,466,468,469,470,471,472,473,481,483,484,485,488,491,493,496,497,499,502,503,504,505,509,510,511,512,513,514,515,516,517,518,520,521,523,524,525,526,527,528,529,530,531,532,533,535,536,537,538,539,540,542,543,545,546,549,550,551,552,553,554,555,558,559,560,561,562,564,565,566,567,568,569,570,571,572,573,574,575,577,579,580,581,582,583,584,585,586,588,589,592,593,594,595,596,598,599,601,602,603,604,606,607,626,628,629,630,631,632,633,634,635,636,637,638,639,640,641,642,643,644,645,646,658,660,662,669,670,671,672,673,675,676,678,679,687,688,689,690,692,693,694,695,696,698,699,700,701,702,703,706,709,710,711,726,727,728,729,730,731,732,733,734,735,736,744,746,747,748,749,750,753,754,755,756,757,759,761,762,766,767,769,770,771,772,809,810,811,812,813,815,816,817,818,819,820,822,824,825,826,827,828,850,851,852,853,855,856,857) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
# Process RNAseq file: rows are genes labeled "symbol|entrez_id" and
# columns are sample barcodes. Keep only the gene symbol, trim barcodes
# to 15 characters, and remove unannotated ('?') genes.
rnaseq_df.index = [gene_id.split('|')[0] for gene_id in rnaseq_df.index]
rnaseq_df.columns = rnaseq_df.columns.str[:15]
rnaseq_df = (
    rnaseq_df
    .drop('?')
    .fillna(0)
    .sort_index(axis=1)
)
# 'SLC35E2' appears twice in the RNAseq data; drop removes both rows
rnaseq_df = rnaseq_df.drop('SLC35E2', axis=0)
# Transpose so samples are rows and genes are columns
rnaseq_df = rnaseq_df.T
# Determine most variably expressed genes and subset.
# Variability is ranked by mean absolute deviation (MAD). Compute it
# explicitly because DataFrame.mad() was removed in pandas 2.0; the
# formula below matches the old method exactly on any pandas version.
num_mad_genes = 5000
gene_mad = (rnaseq_df - rnaseq_df.mean(axis=0)).abs().mean(axis=0)
mad_genes = gene_mad.sort_values(ascending=False)
top_mad_genes = mad_genes.index[:num_mad_genes]
rnaseq_subset_df = rnaseq_df.loc[:, top_mad_genes]
# Scale RNAseq data using z-scores (per-gene mean 0, unit variance),
# restoring the sample index and gene columns lost by the scaler
zscore_scaler = preprocessing.StandardScaler()
rnaseq_scaled_df = pd.DataFrame(
    zscore_scaler.fit_transform(rnaseq_subset_df),
    index=rnaseq_subset_df.index,
    columns=rnaseq_subset_df.columns,
)
rnaseq_scaled_df.to_csv(rna_out_file, sep='\t', compression='gzip')
# Scale RNAseq data using zero-one normalization (per-gene min 0, max 1),
# restoring the sample index and gene columns lost by the scaler
minmax_scaler = preprocessing.MinMaxScaler()
rnaseq_scaled_zeroone_df = pd.DataFrame(
    minmax_scaler.fit_transform(rnaseq_subset_df),
    index=rnaseq_subset_df.index,
    columns=rnaseq_subset_df.columns,
)
rnaseq_scaled_zeroone_df.to_csv(rna_out_zeroone_file, sep='\t', compression='gzip')
Mutation data are stored in a long format. First, subset the data to only deleterious mutations listed below. Next, pivot the dataframe to have samples as the index, genes as the columns, and either a 1 or 0 to indicate a deleterious mutation or wild-type sample by gene.
# Filter mutation types and generate binary matrix.
# Effect classes treated as potentially deleterious events; all other
# effects (e.g. Silent) are excluded downstream.
mutations = {
    'Frame_Shift_Del', 'Frame_Shift_Ins',
    'In_Frame_Del', 'In_Frame_Ins',
    'Missense_Mutation', 'Nonsense_Mutation', 'Nonstop_Mutation',
    'RNA', 'Splice_Site', 'Translation_Start_Site',
}
# Process mutation in long format to a binary sample-by-gene matrix.
# groupby(...).size() replaces apply(len): it yields the same per-group
# row counts without a Python-level call per group, and reset_index(name=)
# names the count column directly instead of renaming column 0 afterwards.
mut_pivot = (mutation_df.query("effect in @mutations")
             .groupby(['#sample', 'chr', 'gene'])
             .size()
             .reset_index(name='mutation'))
# Pivot to samples x genes; any positive count becomes 1, absence 0
mut_pivot = (mut_pivot.pivot_table(index='#sample',
                                   columns='gene', values='mutation',
                                   fill_value=0)
             .astype(bool).astype(int))
mut_pivot.index.name = 'SAMPLE_BARCODE'
mut_pivot.to_csv(mut_out_file, sep='\t', compression='gzip')
# Process per sample mutation burden.
# Mutation burden quantifies how many mutations are observed in a single
# tumor. It is a measure of how unstable a tumor's genome is, potentially
# reflecting deficiencies in DNA repair and/or specific etiology. It is a
# major source of variation in expression data and is used by alternative
# downstream models or to explore relationships in tybalt features.
is_deleterious = mutation_df['effect'].isin(mutations)
mut_counts = mutation_df.loc[is_deleterious].groupby('#sample').apply(len)
burden_df = np.log10(mut_counts).reset_index()
burden_df.columns = ['SAMPLE_BARCODE', 'log10_mut']
burden_df.to_csv(mut_burden_file, index=False, sep='\t')
Copy number data contains thresholded GISTIC2.0 calls where 0 equals wild-type copy number, 1 and -1 mean slight gain and slight loss, respectively, and 2 and -2 mean high gain and high loss, respectively.
# GISTIC2 thresholded calls are integers in {-2, -1, 0, 1, 2}; transpose
# to samples-by-genes and label both axes in one chained expression
copy_df = copy_df.astype(int).T.rename_axis(index='Sample', columns='gene')
# For our purposes, a copy loss status event (1 vs. 0) is conservatively
# defined only as a deep loss (GISTIC2 call of -2). A single vectorized
# comparison replaces the original two-step replace() chain and yields
# the identical 0/1 matrix.
copy_loss_df = (copy_df == -2).astype(int)
copy_loss_df.to_csv(copy_loss_out_file, sep='\t', compression='gzip')
# A copy gain status event (1 vs. 0) is defined only as a high gain
# (GISTIC2 call of +2); one vectorized comparison replaces the two
# replace() calls and yields the identical 0/1 matrix
copy_gain_df = (copy_df == 2).astype(int)
copy_gain_df.to_csv(copy_gain_out_file, sep='\t', compression='gzip')
The clinical data consist of hundreds of parameters collected for the TCGA samples. Some columns are redundant, while others contain high amounts of missing data. Select and rename only a small set of columns of interest.
# Map verbose Xena clinical column names to short, analysis-friendly ones.
# Insertion order matters: it sets the column order of the processed file.
clinical_columns_dict = {
    # sequencing / processing metadata
    'gdc_platform': 'platform',
    'gdc_center.short_name': 'analysis_center',
    'gdc_cases.submitter_id': 'sample_id',
    # patient demographics
    'gdc_cases.demographic.gender': 'gender',
    'gdc_cases.demographic.race': 'race',
    'gdc_cases.demographic.ethnicity': 'ethnicity',
    # project / tissue descriptors
    'gdc_cases.project.primary_site': 'organ',
    'gdc_cases.project.project_id': 'acronym',
    'gdc_cases.tissue_source_site.project': 'disease',
    'gdc_cases.diagnoses.vital_status': 'vital_status',
    'gdc_cases.samples.sample_type': 'sample_type',
    # clinical measurements and outcomes
    'cgc_case_age_at_diagnosis': 'age_at_diagnosis',
    'cgc_portion_id': 'portion_id',
    'cgc_slide_percent_tumor_nuclei': 'percent_tumor_nuclei',
    'cgc_drug_therapy_drug_name': 'drug',
    'xml_year_of_initial_pathologic_diagnosis': 'year_of_diagnosis',
    'xml_stage_event_pathologic_stage': 'stage'
}
# Subset to the columns of interest, rename them, and index by sample id.
# set_index replaces the manual index-assign-then-drop of the same column.
clinical_sub_df = (clinical_df.filter(items=clinical_columns_dict.keys())
                   .rename(columns=clinical_columns_dict)
                   .set_index('sample_id'))
# Project ids look like 'TCGA-BRCA'; strip the first 5 chars ('TCGA-')
# to keep only the disease acronym
clinical_sub_df['acronym'] = clinical_sub_df['acronym'].str[5:]
clinical_sub_df.to_csv(clinical_processed_out_file, sep='\t')
Here, we use the OncoKB API of Chakravarty et al. 2017 to download all oncogenes and tumor suppressor genes. These help to subset the copy number gain and copy loss data frames to identify a full status matrix.
# Download gene annotations (oncogene / tumor suppressor flags) from the
# OncoKB API. A timeout prevents a hung connection from stalling the
# pipeline indefinitely, and raise_for_status fails loudly on HTTP errors
# instead of silently parsing an error page as JSON.
response = requests.get('http://oncokb.org/api/v1/genes', timeout=60)
response.raise_for_status()
oncokb_df = pd.read_json(response.content)
oncokb_df.to_csv(oncokb_out_file, sep='\t')
We create the full status matrix with:
# Integrate copy number, oncokb gene-type, and mutation status to define
# the full status matrix
oncogenes_df = oncokb_df[oncokb_df['oncogene']]
tsg_df = oncokb_df[oncokb_df['tsg']]
# Subset copy gains by oncogenes and copy losses by tumor suppressors (tsg)
status_gain = copy_gain_df.loc[:, oncogenes_df['hugoSymbol']]
status_loss = copy_loss_df.loc[:, tsg_df['hugoSymbol']]
copy_status = pd.concat([status_gain, status_loss], axis=1)
# NOTCH1 and NOTCH2 are both tumor suppressors and oncogenes depending on
# tissue context; drop them from copy number consideration for now
copy_status = copy_status.drop(['NOTCH1', 'NOTCH2'], axis=1)
# Subset each dataframe to samples profiled on both platforms. Sort the
# intersection: modern pandas rejects a raw set in .loc, and sorting makes
# row order deterministic across runs.
status_samples = sorted(set(mut_pivot.index) & set(copy_status.index))
mutation_status = mut_pivot.loc[status_samples, :]
copy_status = copy_status.loc[status_samples, :]
# Align copy status to the mutation gene set. reindex (not .loc) tolerates
# genes absent from copy_status -- they become NaN columns, zeroed by
# fillna, matching the intent of the original fillna(0).
copy_status = (copy_status.reindex(columns=mutation_status.columns)
               .fillna(0)
               .astype(int))
# Combine and write to file: 1 indicates a mutation, a copy number event,
# or both (sums of 2 are collapsed back to 1)
full_status = mutation_status.add(copy_status, fill_value=0)
full_status.replace(to_replace=2, value=1, inplace=True)
full_status.to_csv(known_status_file, sep='\t', compression='gzip')