%matplotlib inline
import pandas as pd
import numpy as np
import re
import os
import math
from multiprocessing import Pool
from tqdm import tqdm
from scipy import stats
## init
mySpecie='Homo_sapiens'
#prealigned_dir='/cellar/users/btsui/all_seq_snp/Homo_sapiens_all_merged_snp.TCGA.prealigned.pickle'
targetted_align_dir='/cellar/users/btsui/all_seq_snp/Homo_sapiens_all_merged_snp.TCGA.pickle'
manifest_dir='/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/./tcga_lgg_wgs_bams.df.wxs_rnaseq.pickle'
targetted_df=pd.read_pickle(targetted_align_dir).loc["TCGA"]
#targetted_df
##bam-readcount sometime have duplicate rows, need deduplication
#%time targetted_df=targetted_df.groupby(targetted_df.index.names).first()
all_UUIDs=targetted_df.index.get_level_values('Run_digits').unique()
#883, 1427
print ('n UUID:',len(all_UUIDs))
n UUID: 1570
### use andrea mapping to map from TCGA barcode to UUID.
manifest_df=pd.read_pickle(manifest_dir)
manifest_df['processed']=manifest_df.file_id.isin(all_UUIDs)
uuid_barcode_mapDf=pd.read_csv('/cellar/users/andreabc/GDC_barcodes/uuid_barcode_map.txt',sep='\t').set_index('file_id')
manifest_df['sample_barcode']=uuid_barcode_mapDf.loc[manifest_df.file_id]['sample_barcode'].values
m_data_category=manifest_df.data_category=='Raw Sequencing Data'
m_experimental_strategy=manifest_df['experimental_strategy'].isin(['RNA-Seq','WXS'])
manifest_df_sub=manifest_df[manifest_df['processed']&m_data_category&m_experimental_strategy]
tmpVC=manifest_df_sub['sample_barcode'].value_counts()
with_both=tmpVC.index[tmpVC==2]
len(with_both)
524
manifest_df_w_RNA_WXS=manifest_df_sub[manifest_df_sub.sample_barcode.isin(with_both)]
#g=manifest_df_w_RNA_WXS.groupby(['sample_barcode','experimental_strategy'])
#[manifest_df_sub['sample_barcode']=='TCGA-HT-A4DV-01A']
#manifest_df_sub
#queryBarcode='TCGA-HT-A4DV-01A'
#rnaseq_uuid=g.get_group((queryBarcode,'RNA-Seq'))['file_id'].iloc[0]
#wxs_uuid=g.get_group((queryBarcode,'WXS'))['file_id'].iloc[0]
#targetted_df.head()
query_chromosome, qeury_corrdinate='2',208248388
m_chr=targetted_df.index.get_level_values('Chr')=='2'
Pos_array=targetted_df.index.get_level_values('Pos')
window_size=10
m_pos=(Pos_array>(qeury_corrdinate-window_size))&(Pos_array<(qeury_corrdinate+window_size))
#
targetted_df_sub=targetted_df[m_pos]
rnaseq_uuids=manifest_df_w_RNA_WXS[manifest_df_w_RNA_WXS['experimental_strategy']=='WXS']['file_id'].unique()
m_uuid=targetted_df_sub.index.get_level_values('Run_digits').isin(rnaseq_uuids)
index_metaDf=targetted_df_sub.index.to_frame()
index_metaDf['sample_barcode']=manifest_df_w_RNA_WXS.set_index('file_id').loc[index_metaDf['Run_digits']]['sample_barcode'].values
/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:3: FutureWarning: Passing list-likes to .loc or [] with any missing label will raise KeyError in the future, you can use .reindex() as an alternative. See the documentation here: https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike This is separate from the ipykernel package so we can avoid doing imports until
multI=index_metaDf.set_index(['sample_barcode','Run_digits','Chr','Pos','base']).index
targetted_df_sub.index=multI
wxs_df=targetted_df_sub[m_uuid]#.loc[wxs_uuid]
groupings_L=['sample_barcode','Chr','Pos','base']
%time wxs_df=wxs_df.groupby(groupings_L).first()
CPU times: user 64 ms, sys: 4 ms, total: 68 ms Wall time: 67.4 ms
rnaseq_uuids=manifest_df_w_RNA_WXS[manifest_df_w_RNA_WXS['experimental_strategy']=='RNA-Seq']['file_id'].unique()
m_uuid=targetted_df_sub.index.get_level_values('Run_digits').isin(rnaseq_uuids)
rnaseq_df=targetted_df_sub[m_uuid]#.loc[rnaseq_uuid]
%time rnaseq_df=rnaseq_df.groupby(groupings_L).first()
CPU times: user 24 ms, sys: 0 ns, total: 24 ms Wall time: 20.7 ms
mergedDf=pd.concat([ rnaseq_df,wxs_df],axis=1,keys=['rna-seq','wxs'])
mergedDf=mergedDf[~mergedDf.isnull().all(axis=1)]
#ref C
IDH1_corrdinate=208248388
mergedDf_subDf=mergedDf[mergedDf.index.get_level_values('Pos')==IDH1_corrdinate]
m_quality=(mergedDf_subDf['wxs']['AverageBaseQuality']>20)
ref_alt='T'
m_alt=(mergedDf_subDf.index.get_level_values('base'))==ref_alt
qualFilteredDf=mergedDf_subDf[m_quality]
unstackDf=qualFilteredDf.unstack()
unstackDf.head()
rna-seq | wxs | |||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
features | ReadDepth | AverageBaseQuality | ReadDepth | AverageBaseQuality | ||||||||||||||
base | A | C | G | T | A | C | G | T | A | C | G | T | A | C | G | T | ||
sample_barcode | Chr | Pos | ||||||||||||||||
TCGA-CS-4938-01B | 2 | 208248388 | NaN | 68.0 | NaN | 24.0 | NaN | 37.0 | NaN | 38.0 | NaN | 87.0 | NaN | 29.0 | NaN | 29.0 | NaN | 29.0 |
TCGA-CS-4941-01A | 2 | 208248388 | NaN | 235.0 | NaN | NaN | NaN | 37.0 | NaN | NaN | NaN | 182.0 | NaN | NaN | NaN | 27.0 | NaN | NaN |
TCGA-CS-4942-01A | 2 | 208248388 | NaN | 140.0 | NaN | 19.0 | NaN | 38.0 | NaN | 38.0 | NaN | 192.0 | NaN | 46.0 | NaN | 25.0 | NaN | 28.0 |
TCGA-CS-4943-01A | 2 | 208248388 | NaN | 179.0 | NaN | 73.0 | NaN | 38.0 | NaN | 37.0 | NaN | 116.0 | NaN | 59.0 | NaN | 26.0 | NaN | 28.0 |
TCGA-CS-4944-01A | 2 | 208248388 | NaN | 43.0 | NaN | 17.0 | NaN | 37.0 | NaN | 37.0 | NaN | 149.0 | NaN | 43.0 | NaN | 27.0 | NaN | 28.0 |
from sklearn import metrics
import matplotlib.pyplot as plt
fig,ax=plt.subplots(figsize=(5,3))
unstackDf[('wxs','ReadDepth','T')].fillna(0).hist(bins=30,label='Minor alle',ax=ax)
#unstackDf[('wxs','ReadDepth','C')].fillna(0).hist(bins=30,ax=ax,
# label='Reference allele')
ax.axvline(x=10,c='red')
ax.set_ylabel('# of tumor samples')
#ax.legend([])
ax.set_xlabel('Allelic read counts')
ax.grid(False)
(unstackDf[('wxs','ReadDepth','T')]>0).sum()
351
asdasdas
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-41-61aacb46f36f> in <module>() ----> 1 asdasdas NameError: name 'asdasdas' is not defined
y_true=(>10)
y_true.value_counts()
print ('ROC AUC',metrics.roc_auc_score(y_true,
unstackDf[('rna-seq','ReadDepth','T')].fillna(0)))
precision, recall, thresholds=metrics.precision_recall_curve(y_true,
unstackDf[('rna-seq','ReadDepth','T')].fillna(0))
ax=pd.DataFrame({'precision':precision,'recall':recall}).plot(x='recall',y='precision')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')
metrics.auc(recall,precision)
ax.set_title('IDH1 C>T mutation using RNAseq')
import seaborn as sns
qualFilteredDf[('wxs','ReadDepth')].index.get_level_values('base').value_counts()
qualFilteredDf[('rna-seq','ReadDepth')].index.get_level_values('base').value_counts()
qualFilteredDf[('rna-seq','ReadDepth')].index.get_level_values('base').value_counts()
mergedDf_subDf
qualFilteredDf['ReadDepth'],qualFilteredDf['wxs']>3
##c
#qualFilteredDf[]
unstackDf=qualFilteredDf['wxs'].unstack()['ReadDepth']
#qualFilteredDf['wxs'].unstack()['ReadDepth']
sns.jointplot(data=qualFilteredDf[qualFilteredDf.index.get_level_values('base')=='T'],x=('rna-seq','ReadDepth'),y=('wxs','ReadDepth'))
#categorical_crossentropy
### generate the correlation between the data