%matplotlib inline
import pandas as pd
import numpy as np
import re
import os
import math
from multiprocessing import Pool
from tqdm import tqdm
from scipy import stats
## init
mySpecie='Homo_sapiens'
#prealigned_dir='/cellar/users/btsui/all_seq_snp/Homo_sapiens_all_merged_snp.TCGA.prealigned.pickle'
targetted_align_dir='/cellar/users/btsui/all_seq_snp/Homo_sapiens_all_merged_snp.TCGA.pickle'
manifest_dir='/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/./tcga_lgg_wgs_bams.df.wxs_rnaseq.pickle'
targetted_df=pd.read_pickle(targetted_align_dir).loc["TCGA"]
all_UUIDs=targetted_df.index.get_level_values('Run_digits').unique()
manifest_df=pd.read_pickle(manifest_dir)
manifest_df['processed']=manifest_df.file_id.isin(all_UUIDs)
uuid_barcode_mapDf=pd.read_csv('/cellar/users/andreabc/GDC_barcodes/uuid_barcode_map.txt',sep='\t').set_index('file_id')
manifest_df['sample_barcode']=uuid_barcode_mapDf.loc[manifest_df.file_id]['sample_barcode'].values
m_data_category=manifest_df.data_category=='Raw Sequencing Data'
m_experimental_strategy=manifest_df['experimental_strategy'].isin(['RNA-Seq','WXS'])
manifest_df_sub=manifest_df[manifest_df['processed']&m_data_category&m_experimental_strategy]
tmpVC=manifest_df_sub['sample_barcode'].value_counts()
#identify data with both RNAseq and WXS
with_both=tmpVC.index[tmpVC==2]
manifest_df_sub['is_tumor']=manifest_df_sub['sample_barcode'].str.contains('TCGA-\w+-\w+-01')
manifest_df_w_RNA_WXS=manifest_df_sub[manifest_df_sub.sample_barcode.isin(with_both)&manifest_df_sub['is_tumor']]
/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:20: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
inVcfDir='/data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.f1_byte2_not_00.vcf.gz'
vcfDf=pd.read_csv(inVcfDir,sep='\t',header=None)
vcfDf.columns=['Chr','Pos','RsId','RefBase','AltBase','','','Annot']
vcfDf['Chr']=vcfDf['Chr'].astype(np.str)
#queryChr,querySite,refBase='17',7673803,'G','A' #TP53
#8 142877758
#6, 31356729
#G A
#1 237591774
#
queryChr,querySite,refBase='1',237591774,'C',
m_chrom=(targetted_df.index.get_level_values('Chr')==queryChr)
vcfDf[vcfDf.Pos==querySite]
Chr | Pos | RsId | RefBase | AltBase | Annot | |||
---|---|---|---|---|---|---|---|---|
28989 | 1 | 237591774 | rs75901196 | C | A | . | . | RS=75901196;RSPOS=237591774;dbSNPBuildID=131;S... |
targetted_df.head()
features | ReadDepth | AverageBaseQuality | |||
---|---|---|---|---|---|
Run_digits | Chr | Pos | base | ||
08ce1dd9-3167-4fe4-8619-724727a32a36 | 1 | 14727 | A | 18 | 28 |
G | 280 | 32 | |||
630825 | T | 11 | 32 | ||
630833 | C | 9 | 34 | ||
833068 | G | 2 | 34 |
m_site=(targetted_df.index.get_level_values('Pos')==querySite)
hitDf=targetted_df[m_site&m_chrom]
hitDfResetDf=hitDf.reset_index()
hitDfResetDf['is_ref']=hitDfResetDf.base==refBase
uuidToExperimentS=manifest_df_w_RNA_WXS.set_index('file_id')['experimental_strategy']
uuidToBarcodeS=manifest_df_w_RNA_WXS.set_index('file_id')['sample_barcode']
hitDfResetDf['Strategy']=uuidToExperimentS.loc[hitDfResetDf['Run_digits']].values
hitDfResetDf['Barcode']=uuidToBarcodeS.loc[hitDfResetDf['Run_digits']].values
/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: FutureWarning: Passing list-likes to .loc or [] with any missing label will raise KeyError in the future, you can use .reindex() as an alternative. See the documentation here: https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike """Entry point for launching an IPython kernel. /cellar/users/btsui/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:2: FutureWarning: Passing list-likes to .loc or [] with any missing label will raise KeyError in the future, you can use .reindex() as an alternative. See the documentation here: https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
validHitResetDf=hitDfResetDf.dropna()
unstackDf=validHitResetDf.set_index(['Barcode','base','Strategy'])['ReadDepth'].unstack().unstack().fillna(0)#.set_index(['Strategy'])
import seaborn as sns
#unstackDf
'ref at {}:{}'.format(queryChr,querySite)
'ref at 1:237591774'
altBase='A'
#refBase,altBase
g=sns.jointplot(data=unstackDf['RNA-Seq'],x=refBase,y=altBase)#['G']#['RNA-Seq']
g.set_axis_labels('ref allele read count at {}:{}'.format(queryChr,querySite),'alt allle')
/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been " /cellar/users/btsui/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been "
<seaborn.axisgrid.JointGrid at 0x2af947b404a8>
g=sns.jointplot(x=np.log2(unstackDf['RNA-Seq'][altBase]+1),y=np.log2(unstackDf['WXS'][altBase]+1))
g.set_axis_labels('Alt allele read count at {}:{} in RNAseq'.format(queryChr,querySite),
'Alt allele read count at {}:{} in WXS'.format(queryChr,querySite))
/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been " /cellar/users/btsui/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been "
<seaborn.axisgrid.JointGrid at 0x2af9484b5d30>
sns.jointplot(data=unstackDf['WXS'],x='G',y='A')
/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been " /cellar/users/btsui/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been "
<seaborn.axisgrid.JointGrid at 0x2af947f95ac8>
unstackDf['A'].head()
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) ~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3062 try: -> 3063 return self._engine.get_loc(key) 3064 except KeyError: pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 'A' During handling of the above exception, another exception occurred: KeyError Traceback (most recent call last) <ipython-input-296-8988e14ab09b> in <module>() ----> 1 unstackDf['A'].head() ~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key) 2681 return self._getitem_frame(key) 2682 elif is_mi_columns: -> 2683 return self._getitem_multilevel(key) 2684 else: 2685 return self._getitem_column(key) ~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_multilevel(self, key) 2725 2726 def _getitem_multilevel(self, key): -> 2727 loc = self.columns.get_loc(key) 2728 if isinstance(loc, (slice, Series, np.ndarray, Index)): 2729 new_columns = self.columns[loc] ~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/multi.py in get_loc(self, key, method) 2262 2263 if not isinstance(key, tuple): -> 2264 loc = self._get_level_indexer(key, level=0) 2265 2266 # _get_level_indexer returns an empty slice if the key has ~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/multi.py in _get_level_indexer(self, key, level, indexer) 2521 else: 2522 -> 2523 loc = level_index.get_loc(key) 2524 if isinstance(loc, slice): 2525 return loc ~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3063 return self._engine.get_loc(key) 3064 except KeyError: -> 3065 return self._engine.get_loc(self._maybe_cast_indexer(key)) 3066 3067 indexer = self.get_indexer([key], method=method, tolerance=tolerance) pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 'A'
sns.jointplot(data=np.log2(unstackDf['A']+1),x=('RNA-Seq'),y=('WXS'))
unstackDf['WXS'].sum(axis=1)
manifest_dmanifest_g=manifest_df_w_RNA_WXS.groupby(['sample_barcode','experimental_strategy'])
manifest_df_w_RNA_WXS.set_index('file_id')['experimental_strategy']
file_id 1985b367-00c4-4c25-b049-5858e937cc6d RNA-Seq a7aff6a7-cea4-42e9-8d56-b211843a0302 RNA-Seq 4038984b-21d1-45d3-a5bb-208f9e71aa78 RNA-Seq 7b17a5ef-c778-4753-8c30-427af9171f7a RNA-Seq 93e20db7-4892-4ef8-9331-ecd665e4cb91 RNA-Seq 35a18f9e-0ef8-4e55-b655-33397b63fd1a RNA-Seq bc27f71a-4222-4940-8a64-246e2b9f3d44 WXS 84943493-e5ef-4099-8706-09afa625b076 WXS f36d778a-2c49-43aa-a577-7a6a921ad6cc WXS 243e9ccb-7c93-4c6a-9e3e-105c379e7f78 RNA-Seq f895b46c-7811-4ebb-ac9c-970bbfed657a RNA-Seq 3594993e-f65c-434b-a64e-b1873f4b04ca WXS 4522becf-1566-4de8-b760-79bb93513ecf WXS 65c95f19-84f0-4e8b-99c3-186f881181a4 RNA-Seq eb4dda76-4215-458b-b9a0-60f9ff24bd6e RNA-Seq f4db57f0-ba62-4a60-b65d-d2354612eb7b RNA-Seq 00b53e00-d640-49e5-b2bd-3a3bdf867998 RNA-Seq e55c8de3-3a92-42f2-bc1e-70a21355d696 WXS 3a0e5ae0-dc79-468d-b459-a6d43b612851 WXS 3011867e-1c3b-4791-849e-4e7d636ddc88 RNA-Seq fd5d9171-979e-4742-adec-179f19bf6c06 WXS aeebf360-49c4-4db3-bdf4-daffdc5279cc RNA-Seq 25650a4b-775b-485b-92c2-3f8f30ba4169 WXS b2bd014e-44fe-4d24-9b75-91baf36b3c0a RNA-Seq dcbaf670-8100-4c98-bd4d-2880a5805f23 RNA-Seq eda26d7b-0725-46c4-9264-bcdf94dc163d RNA-Seq e52ef468-33a1-4241-8286-be7387218f18 RNA-Seq 06c8a70b-3f0c-42bc-875a-f3e13c887a73 RNA-Seq 386d69e7-b4a7-4981-beb8-98f088c689f7 WXS 37589627-0b38-41b0-b7a3-d2dd1f343550 RNA-Seq ... e2a550e9-4f8c-48ea-838d-2e4bcfcbbfe4 WXS 4b5c011b-efe1-476c-91cf-fd2776ce80a5 WXS 78be435a-35d5-4ef7-8ded-e323a9c342d1 WXS ff151b36-6dc1-4462-af67-0d5947d38df4 WXS 5dbbebdc-a12b-4b04-8f4f-510f34e87e05 WXS c6bfb8ca-6824-4dac-a415-b2a676be4cc6 WXS 6eff4d5c-1647-4505-9d70-f36e4a532124 WXS 761633fb-a98a-4a62-973b-69a65e4da5a7 WXS bd2695cb-879e-4f57-865b-1289a036ac52 WXS 419a5f50-4217-41b5-a534-9437c0be4747 WXS 5d931e04-0e84-4b36-b555-e3d82bfc8e6f WXS 71266893-25de-4a54-a1db-3cdafb34b44a WXS de3dfa71-a490-4344-95dc-99301d1dee08 WXS cdb99f45-9d8a-4f5f-8e10-9b3ef51eac2e WXS 6bd2f0f6-df8f-4b4e-804b-328923ec382b WXS 4e30cef6-d1af-4d77-ad93-6e31ec58fc35 WXS 3772a453-c9cb-4084-b73a-181304516637 WXS 5de82077-8540-474f-a569-c4b3b951f81c WXS 704f5788-ca6e-499f-8d43-47134c5ba275 WXS 2207155d-348a-4ed0-91f1-71b60f179ee2 WXS e45b1fae-1a83-438e-a6e3-967813831346 WXS 67a2edc2-4776-4442-b74c-ff8b9b91867e WXS 2d622f8a-a91d-44ed-a818-ea7c2fd698cd WXS 06350102-0c44-4846-935d-515a5b0da989 WXS 17e5b895-6ec1-4bfc-a55f-a2114adb0af1 WXS db95c082-fe8d-4c85-a4d8-b63d4a39c1ea WXS 5b96100f-1815-453d-b2a4-b32bb747b4ad WXS d1ff8258-be02-44bf-9cbb-0bbc895452bf WXS e81ecc22-c399-4d35-b4b0-abcec7f895d5 WXS 2b0048e0-a062-40d2-a1e1-4bb763ea0ead WXS Name: experimental_strategy, Length: 1048, dtype: object