compare alignment statisictics with TCGA
the data contain patients allelic fraction information, to ensure hippa compliance, data is not provided in here
"""
for TCGA alginment, filter down to only sites that overlahps
"""
'\nfor TCGA alginment, filter down to only sites that overlahps \n'
#
inMySnpDir='./2b0048e0-a062-40d2-a1e1-4bb763ea0ead.snp.txt.gz'
#count directly from TCGA data
"""
the following file contain all the sites from TCGA bam
"""
tcgaSnpDir='./2b0048e0-a062-40d2-a1e1-4bb763ea0ead.tcga.txt.gz'
import pandas as pd
%matplotlib inline
import seaborn as sns
import numpy as np
import gzip
f=gzip.open(inMySnpDir,'r')
myList=[]
for l in f:
splitL=l.split('\t')
myList.append(splitL[0]+'-'+splitL[1])
f.close()
mySnpSites=set(myList)
overlapTcgaDir='./tmp.tcga.txt.gz'
inf=gzip.open(tcgaSnpDir,'r')
outf=gzip.open(overlapTcgaDir,'w')
#with open(tcgaSnpDir)as f:
for i,l in enumerate(inf):
splitL=l.split('\t')
name=splitL[0].replace('chr','')+'-'+splitL[1]
if name in mySnpSites:
splitL[0]=splitL[0].replace('chr','')
outf.write("\t".join(splitL))
if (i%(10**9))==0:
print i
inf.close()
outf.close()
0
#fname=inSrrDir+inSrr+'.txt.snp.gz'
def readBamRead(fname):
tmpDf_all=pd.read_csv(fname,sep='\s+',header=None,names=np.arange(50),index_col=None,error_bad_lines=False)
myCols=['Chr','Pos','Ref','rd_all','','A','C','G','T','N']
tmpDf=tmpDf_all.iloc[:,:len(myCols)]
tmpDf.columns=myCols
tmpDf2=tmpDf.set_index(['Chr','Pos'])
myBases=['A','C','G','T']
myL=[]
for base in myBases:
splitL=tmpDf2[base].str.split(':',expand=True)
### extract the read count and base quality
tmpDf5=splitL[[1,3]].astype(np.float)
tmpDf5.columns=['ReadDepth','AverageBaseQuality']
myL.append(tmpDf5)
tmpDf6=pd.concat(myL,keys=myBases,axis=0,names=['base'])
tmpDf6.columns.name='features'
mergedDf=tmpDf6.astype(np.uint16)
non_zero_df=mergedDf[mergedDf['ReadDepth']>0]
return non_zero_df
tcgaDf=readBamRead(overlapTcgaDir)
myDf=readBamRead(inMySnpDir)
### drop identical duplicates
myDf=myDf.groupby(['base','Chr','Pos']).first()
### compare the two file.
ctrl_label='TCGA aligned: Read count per (base, position)'
case_label='My pipeline aligned: Read count per (base, position)'
myMergedStatDf=pd.concat([tcgaDf['ReadDepth'],myDf['ReadDepth']],axis=1,keys=[case_label,ctrl_label])
inPlotDf=np.log2(myMergedStatDf.dropna()+1)
print 'All alleles (n='+str(inPlotDf.shape[0])+')'
All alleles (n=270987)
g=sns.jointplot(data=inPlotDf,x=ctrl_label,y=case_label,kind='hex',xlim=[0,10],ylim=[0,10])
g.savefig('./TCGA_compare.pdf')
g.savefig('./TCGA_compare.png',dpi=300)
Conclusion: Counts of the ACGT are highly similar between my simplified pipeline as compared to TCGA.
### identify reference allelels
tmpDf_all=pd.read_csv('./2b0048e0-a062-40d2-a1e1-4bb763ea0ead.snp.txt.gz',
sep='\s+',header=None,names=np.arange(50),index_col=None,error_bad_lines=False)
myCols=['Chr','Pos','Ref','rd_all','','A','C','G','T','N']
tmpDf=tmpDf_all.iloc[:,:len(myCols)]
tmpDf.columns=myCols
myReindexedDf=tmpDf.set_index(['Ref','Chr','Pos'])
refIndex=myReindexedDf.index
## extract non reference alleles
altAllelDf=inPlotDf[~inPlotDf.index.isin(refIndex)]
m1=altAllelDf[ctrl_label]>1.0
inPlotDf2=altAllelDf[m1]
print 'Alternative alleles only (n='+str(inPlotDf2.shape[0])+')'
Alternative alleles only (n=31370)
g=sns.jointplot(data=inPlotDf2,x=ctrl_label,y=case_label,kind='hex',xlim=[0,10],ylim=[0,10])
g.savefig('./TCGA_compare.alternative_allele.pdf')
g.savefig('./TCGA_compare.alternative_allele.png',dpi=300)
Conclusion: Some people might wonder Counts of the ACGT are highly similar between my simplified pipeline as compared to TCGA even among just the reference alleles.
#!cp ./../XGS_WGS/TCGA_compare.* .
#!ls ./Figures/