### parameters
%matplotlib inline
import seaborn as sns
import pandas as pd
import numpy as np
# Species name substituted into the SNP directory path below.
mySpecie = 'Homo_sapiens'

# Download from Synapse if it is not installed.
skymap_snp_dir = (
    '/cellar/users/btsui/Data/SRA/snp/{specie}_snp_pos/'
    .format(specie=mySpecie)
)
%%time
# Location of the pickled annotation table (one row per sample attribute term).
metaDataMappingSDir = (
    '/cellar/users/btsui/Data/nrnb01_nobackup/METAMAP//input/allAttrib.v5.csv.NCI.prefilter.pyc'
)
# NOTE: unpickling can execute arbitrary code; only load files you trust.
bio_metaDf = pd.read_pickle(metaDataMappingSDir)
CPU times: user 16.7 s, sys: 3.02 s, total: 19.7 s Wall time: 19.8 s
# Number of annotation rows per NCI term name.
termCountS = bio_metaDf.NciEng.value_counts()
# Exploration helpers, kept for reference:
#termCountS[termCountS.index.str.contains('breast',case=False)]
#bio_metaDf.attrib.value_counts()

# Row-level boolean masks over bio_metaDf, used to pick samples (srs).
# Rows whose attribute is 'cell type' or 'source name'.
m22 = bio_metaDf.attrib.isin(['cell type', 'source name'])
# Rows whose NCI term mentions "primary" (case-insensitive).
# na=False maps missing NciEng values to False; without it the mask would
# hold NaN for those rows and boolean indexing with it raises.
m_primary = bio_metaDf.NciEng.str.contains('primary', case=False, na=False)
# Rows annotated with exactly the term 'Malignant Breast Neoplasm'.
m_breast = bio_metaDf.NciEng == 'Malignant Breast Neoplasm'
# Rows whose attribute is 'cell line'.
m_cell_line = bio_metaDf.attrib == 'cell line'

# Independent copy of the full table.
# NOTE(review): not referenced in the visible cells -- confirm it is needed.
inTmpBioMetaDf = bio_metaDf.copy()
# Last expression of the cell: previews the first rows of the table.
bio_metaDf.head()
srs | attrib | CUI | score | NCI | NciEng | |
---|---|---|---|---|---|---|
0 | SRS286232 | sex | C1706180 | 1000 | C46109 | Male Gender |
1 | SRS286232 | sex | C1706429 | 1000 | C46107 | Male, Self-Report |
2 | SRS286232 | sex | C1706428 | 1000 | C46112 | Male Phenotype |
3 | DRS052357 | BioSampleModel | C1332821 | 694 | C24597 | CXCL9 Gene |
4 | DRS052357 | BioSampleModel | C1707170 | 694 | C49770 | CXCL9 wt Allele |
# Samples that have at least one row selected by m_primary and at least one
# row selected by m22 (the two rows need not be the same row).
tmpSrs = np.intersect1d(
    bio_metaDf.loc[m_primary, 'srs'],
    bio_metaDf.loc[m22, 'srs'],
)
# Case group: those samples that also have a row selected by m_breast.
caseSrs = np.intersect1d(tmpSrs, bio_metaDf.loc[m_breast, 'srs'])
# Earlier variants, kept for reference (the row-wise `&` form requires both
# conditions to hold on the same row):
#caseSrs=bio_metaDf_hit.srs.unique()
#caseSrs
#bio_metaDf_hit=bio_metaDf[m_cell_line&m_breast]

# Control group: samples with a row selected by m_cell_line and a row
# selected by m_breast.
ctrlSrs = np.intersect1d(
    bio_metaDf.loc[m_cell_line, 'srs'],
    bio_metaDf.loc[m_breast, 'srs'],
)
# Last expression of the cell: displays the control sample IDs.
ctrlSrs
array(['DRS007732', 'DRS007733', 'DRS008484', ..., 'SRS987464', 'SRS987465', 'SRS987466'], dtype=object)
%%time
# Location of the pickled SRA dump table.
sra_dump_pickle_dir = (
    '/cellar/users/btsui/Data/SRA/DUMP/sra_dump.pickle'
)
# NOTE: unpickling can execute arbitrary code; only load files you trust.
technical_meta_data_df = pd.read_pickle(sra_dump_pickle_dir)
CPU times: user 2 s, sys: 352 ms, total: 2.35 s Wall time: 2.36 s
# Rows of the SRA dump whose Sample is one of the case samples.
technical_meta_data_df_sub = technical_meta_data_df.loc[
    technical_meta_data_df['Sample'].isin(caseSrs)
]
# Rows of the SRA dump whose Sample is one of the control samples.
technical_meta_data_df_sub_ctrl = technical_meta_data_df.loc[
    technical_meta_data_df['Sample'].isin(ctrlSrs)
]
# The bare string below is the cell's last expression, so the notebook
# echoes it as the cell output.
"""
extract the run from the sequencing data
"""
'\nextract the run from the sequencing data \n'
def loadDf(fname,mmap_mode='r'):
    """Load a labelled matrix stored as three sibling files.

    The values come from ``fname + '.npy'``, the row labels from
    ``fname + '.index.txt'`` and the column labels from
    ``fname + '.columns.txt'`` (one label per line in both text files).

    Parameters
    ----------
    fname : str
        Common path prefix of the three files (without extension).
    mmap_mode : str or None, default 'r'
        Forwarded to ``numpy.load``; pass ``None`` to read the whole array
        into memory instead of memory-mapping it.

    Returns
    -------
    pandas.DataFrame
        The matrix with the labels attached; the column axis is named 'Run'.
    """
    def _read_labels(path):
        # Materialise the labels as a list (not a single-use iterator) and
        # drop only the line terminator, so other whitespace is preserved.
        with open(path) as f:
            return [line.rstrip("\n") for line in f]

    myIndex = _read_labels(fname + '.index.txt')
    myColumns = _read_labels(fname + '.columns.txt')
    tmpMatrix = np.load(fname + ".npy", mmap_mode=mmap_mode)
    tmpDf = pd.DataFrame(tmpMatrix, index=myIndex, columns=myColumns)
    tmpDf.columns.name = 'Run'
    return tmpDf
# Expression metric substituted into the matrix file name.
expression_metric = 'TPM'
#change this to where the matrix is located on your computer
baseDir = '/cellar/users/btsui/Data/nrnb01_nobackup/Data/SRA/MATRIX/DATA/hgGRC38/'
# Path prefix handed to loadDf; .format only touches the file-name literal.
data_matrix_dir_fmt = baseDir+'/allSRAmatrix.realign.v9.base.{feature}.gene.symbol'.format(feature=expression_metric)
rnaseqDf = loadDf(data_matrix_dir_fmt)

# Labels present both in the case metadata index and in the matrix columns,
# down-sampled to 400. random_state pins the draw so that re-running the
# notebook selects the same runs.
caseIds = pd.Series(
    np.intersect1d(technical_meta_data_df_sub.index, rnaseqDf.columns)
).sample(n=400, random_state=0).values
# Same for the control metadata.
ctrlIds = pd.Series(
    np.intersect1d(technical_meta_data_df_sub_ctrl.index, rnaseqDf.columns)
).sample(n=400, random_state=0).values#ctrlSrs
# Last expression of the cell: number of sampled case runs.
len(caseIds)
400
# Background group: 200 matrix columns that are not case runs.
# np.setdiff1d returns the candidates sorted and de-duplicated, and
# random_state pins the draw; together they make the selection repeatable
# (a set-derived list has no stable order between interpreter sessions).
otherIds = pd.Series(
    np.setdiff1d(rnaseqDf.columns, caseIds)
).sample(n=200, random_state=0).values
otherDf = rnaseqDf.loc[:, otherIds]
#CD44, CD45, CD90 CD29 and CD105
caseDf = rnaseqDf.loc[:, caseIds]
# log2(x + 1) of both groups side by side; the keys become the outer column
# level.
# NOTE(review): the second group is otherDf but is labelled 'ctrl', and
# ctrlIds is not used here -- confirm this is the intended comparison.
mergedDf = np.log2(pd.concat([caseDf, otherDf], axis=1, keys=['case', 'ctrl'])+1)
#https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5278527/
stromalMarkers = ['VIM']
# Long format: one value per (group, run, gene).
stackS = mergedDf.loc[stromalMarkers].T.stack()
# Only strictly positive values are plotted. After reset_index the unnamed
# index levels become 'level_0' (group key) and 'level_2' (gene), and the
# values land in column 0.
sns.boxplot(data=stackS[stackS>0].reset_index(), x='level_2', hue='level_0', y=0)
<matplotlib.axes._subplots.AxesSubplot at 0x2adef460b128>
!echo $PWD/Project/METAMAP/notebook/RapMapTest/Analysis/CheckForStromalMarkers.ipynb
/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/Analysis/Project/METAMAP/notebook/RapMapTest/Analysis/CheckForStromalMarkers.ipynb