import pandas as pd
import os
import numpy as np
%matplotlib inline
inVcfDir='/data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.f1_byte2_not_00.vcf.gz'
vcfDf=pd.read_csv(inVcfDir,sep='\t',header=None)
vcfDf.columns=['Chr','Pos','RsId','RefBase','AltBase','','','Annot']
vcfDf['Chr']=vcfDf['Chr'].astype(str)
/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
vcfDf.drop_duplicates(['Chr','Pos']).shape
(387950, 8)
#'\w{22}'
vcfDf['VP_Int']=vcfDf['Annot'].str.extract('VP=(0x\w+)',expand=False).apply(lambda Str:int(Str,16))
#from right to left
nth_bit=6
vcfDf['somatic']=(vcfDf['VP_Int']%(2**(nth_bit-1))).astype(bool).values
vcfDf['specificSNP']=vcfDf['AltBase'].str.contains('^[ACGT]$')
somaticI=vcfDf[vcfDf['somatic']&vcfDf['specificSNP']].set_index(['Chr','Pos','AltBase']).index
germline_I=vcfDf[(~vcfDf['somatic'])&vcfDf['specificSNP']].set_index(['Chr','Pos','AltBase']).index
#somatic_I=vcfDf.set_index('AltBase')
myG=['Run_db','Run_digits',u'Chr', u'Pos',u'base']
vcfDf.head()
Chr | Pos | RsId | RefBase | AltBase | Annot | VP_Int | somatic | specificSNP | |||
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 14727 | rs1045587 | G | A | . | . | RS=1045587;RSPOS=14727;RV;dbSNPBuildID=117;SSR... | 1547614017554423799187767552 | False | True |
1 | 1 | 630825 | rs9783068 | T | C | . | . | RS=9783068;RSPOS=630825;dbSNPBuildID=119;SSR=1... | 1547613980660935651768664384 | False | True |
2 | 1 | 630833 | rs9701099 | C | T | . | . | RS=9701099;RSPOS=630833;dbSNPBuildID=119;SSR=1... | 1547613980660935651768664384 | False | True |
3 | 1 | 817186 | rs3094315 | G | A | . | . | RS=3094315;RSPOS=817186;RV;dbSNPBuildID=103;SS... | 1548822906480573393185800449 | True | True |
4 | 1 | 833068 | rs12562034 | G | A | . | . | RS=12562034;RSPOS=833068;dbSNPBuildID=120;SSR=... | 1548823017161040034332147968 | False | True |
refI=vcfDf.set_index(['Chr','Pos','RefBase']).index
tmp_dir='/nrnb/users/btsui/Data/all_seq/tmp/'
inFnameS=pd.Series(os.listdir(tmp_dir)).sample(n=30)
#!mkdir /tmp/btsui/jupyter/
varToPlot='pass rd'
myG=['ref base','Run_db','Run_digits']
#inFname=inFnameS.iloc[1]
def inF(inFname):
try:
tmpDf=pd.read_pickle(tmp_dir+inFname)
except:
print ('failed:',inFname)
return None
tmpDf2=tmpDf.reset_index().drop_duplicates()
myI=tmpDf2.set_index(['Chr','Pos','base']).index
m_inRef=myI.isin(refI)
m_somatic=myI.isin(somaticI)
m_germline=myI.isin(germline_I)
tmpDf2.loc[m_inRef,'ref base']='Reference alleles'
tmpDf2.loc[m_somatic,'ref base']='Known somatic \nmutations'
tmpDf2.loc[m_germline,'ref base']='Known germline \nmutations'
#tmpDf2['ref base'].fillna('Known germline \nmutations',inplace=True)
tmpDf2['pass rd']=tmpDf2['ReadDepth']>=2
#myG=['ref base','base','Run_db','Run_digits']
tmpS=tmpDf2.groupby(myG)[varToPlot].sum()
tmpS.to_pickle('/tmp/btsui/jupyter/'+inFname)
return tmpS
from multiprocessing import Pool
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
#tmpSL=list(tqdm(map(inF,inFnameS.values),
# total=len(inFnameS)))
#take one hour and twenty mins
#ls_out.txt: screwed up the process
with Pool(48) as p:
tmpSL=list(tqdm(p.imap(inF,inFnameS.values),total=len(inFnameS)))#,
#total=len(inFnameS))
100%|██████████| 30/30 [00:56<00:00, 1.89s/it]
#tmpSL[0]
len(tmpSL)
30
mergeS_raw=pd.concat(tmpSL,axis=0)#.groupby(myG).sum()
ax=sns.boxplot(data=mergeS_raw.reset_index(),x='ref base',y='pass rd')
ax.set_ylabel('# of sites with at least two reads')
ax.set_xlabel('')
Text(0.5,0,'')
(mergeS_raw>2000).groupby('ref base').mean()
ref base Known germline \nmutations 0.587196 Known somatic \nmutations 0.133583 Reference alleles 0.846616 Name: pass rd, dtype: float64
mergeS_raw.groupby('ref base').median()
ref base Known germline \nmutations 3608.0 Known somatic \nmutations 384.5 Reference alleles 23044.0 Name: pass rd, dtype: float64
#mergeS_raw[~(mergeS_raw.index.get_level_values('ref base')=='Reference alleles')]
minorAllelS=mergeS_raw[~(mergeS_raw.index.get_level_values('ref base')!='Reference alleles')]
#minorAllelS
(minorAllelS>1000).median()
1.0
mergeS_raw.median()
2471.5
mergeS_raw.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x2b36b02b7320>
mergeS_raw=mergeS_raw.groupby(myG).sum()
mergedS=np.log10(mergeS_raw+1)
mergedS.head()
Run_db Run_digits DRR 61036 5.055466 ERR 6319 3.775028 6338 4.457019 6358 3.744762 6376 4.650094 6422 4.491320 15993 4.365113 16147 4.581540 16149 4.993908 16177 4.762829 16213 5.020034 16217 4.541205 16251 5.168886 16257 4.703996 16267 5.075006 16307 4.379541 18464 5.090322 19901 4.592177 22333 4.751164 22355 4.689309 22379 4.396356 22444 4.426853 22449 5.281465 31876 5.367432 31879 5.362268 31920 5.475671 31984 5.407880 32040 5.262268 42505 5.497302 42532 5.491119 ... SRR 6040477 3.582972 6040599 3.482445 6051218 5.047049 6054443 2.326336 6054539 1.880814 6066112 4.068483 6075271 2.761176 6078306 5.177444 6081961 5.232078 6082022 5.162409 6161416 4.295567 6161467 4.535243 6161499 4.401711 6161567 4.161937 6161598 4.125676 6197347 2.336460 6205895 3.523876 6212971 5.352844 6213081 5.372166 6213145 5.247553 6213652 5.212446 6214153 5.300163 6231125 5.060286 6237236 5.163313 6301139 3.975294 6301209 4.463505 6307936 4.395519 6308005 4.363913 6310117 5.159077 6315419 2.416641 Name: pass rd, Length: 2999, dtype: float64
#%matplotlib inline
plt.rcParams['pdf.fonttype'] = 42
mergeS_raw.groupby('ref base').median()
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) <ipython-input-30-d619e344b43c> in <module>() ----> 1 mergeS_raw.groupby('ref base').median() ~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in groupby(self, by, axis, level, as_index, sort, group_keys, squeeze, observed, **kwargs) 6657 return groupby(self, by=by, axis=axis, level=level, as_index=as_index, 6658 sort=sort, group_keys=group_keys, squeeze=squeeze, -> 6659 observed=observed, **kwargs) 6660 6661 def asfreq(self, freq, method=None, how=None, normalize=False, ~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in groupby(obj, by, **kwds) 2150 raise TypeError('invalid type: %s' % type(obj)) 2151 -> 2152 return klass(obj, by, **kwds) 2153 2154 ~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in __init__(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, squeeze, observed, **kwargs) 597 sort=sort, 598 observed=observed, --> 599 mutated=self.mutated) 600 601 self.obj = obj ~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _get_grouper(obj, key, axis, level, sort, observed, mutated, validate) 3289 in_axis, name, level, gpr = False, None, gpr, None 3290 else: -> 3291 raise KeyError(gpr) 3292 elif isinstance(gpr, Grouper) and gpr.key is not None: 3293 # Add key to exclusions KeyError: 'ref base'
fig,ax=plt.subplots(figsize=(7,3))
xorder=['Reference alleles','Known germline \nmutations','Known somatic \nmutations']
ax=sns.boxplot(data=mergedS.reset_index(),
x='ref base',y=varToPlot,ax=ax,order=xorder)
# tmpDf2.loc[m_inRef,'ref base']='Reference alleles'
# tmpDf2.loc[m_somatic,'ref base']='Known somatic \nmutations'
# tmpDf2.loc[m_germline,'ref base']='Known germline \nmutations'
#ax.set_xticks()
#at least 2 reads
#each dot is a variant
#ax.set_yscale('log')
#ax.set_ylim([0,10**6])
#ax.ticklabel_format(axis='y',style='sci',scilimits=(1,4))
ax.set_ylabel(' # of sequencing runs with variant detected\n (log10 scale)' )
#ax.set_xlabel()
ax.set_xlabel('')
fig.savefig('./Figures/variant_suppport_for_each_sequencing_run.pdf')
fig.savefig('./Figures/variant_suppport_for_each_sequencing_run.png',dpi=300)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-31-7ad981b857e3> in <module>() 2 xorder=['Reference alleles','Known germline \nmutations','Known somatic \nmutations'] 3 ax=sns.boxplot(data=mergedS.reset_index(), ----> 4 x='ref base',y=varToPlot,ax=ax,order=xorder) 5 # tmpDf2.loc[m_inRef,'ref base']='Reference alleles' 6 # tmpDf2.loc[m_somatic,'ref base']='Known somatic \nmutations' ~/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py in boxplot(x, y, hue, data, order, hue_order, orient, color, palette, saturation, width, dodge, fliersize, linewidth, whis, notch, ax, **kwargs) 2209 plotter = _BoxPlotter(x, y, hue, data, order, hue_order, 2210 orient, color, palette, saturation, -> 2211 width, dodge, fliersize, linewidth) 2212 2213 if ax is None: ~/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py in __init__(self, x, y, hue, data, order, hue_order, orient, color, palette, saturation, width, dodge, fliersize, linewidth) 439 width, dodge, fliersize, linewidth): 440 --> 441 self.establish_variables(x, y, hue, data, orient, order, hue_order) 442 self.establish_colors(color, palette, saturation) 443 ~/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py in establish_variables(self, x, y, hue, data, orient, order, hue_order, units) 149 if isinstance(input, string_types): 150 err = "Could not interpret input '{}'".format(input) --> 151 raise ValueError(err) 152 153 # Figure out the plotting orientation ValueError: Could not interpret input 'ref base'
!echo $PWD/./Figures/variant_suppport_for_each_sequencing_run.pdf
/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/Analysis/./Figures/variant_suppport_for_each_sequencing_run.pdf
#ax=mergedS[False].hist()
### make sure the data read in will have all the results
inFname=inFnameS.iloc[1]
tmpDf=pd.read_pickle(tmp_dir+inFname)
tmpDf2=tmpDf.reset_index().drop_duplicates()
#tmpDf2.head().dtypes
myI=tmpDf2.set_index(['Chr','Pos','base']).index
m_inRef=myI.isin(refI)
tmpDf2.loc[m_inRef,'ref base']=True
tmpDf2['ref base'].fillna(False,inplace=True)
tmpDf2['pass rd']=tmpDf2['ReadDepth']>=2
tmpS=tmpDf2.groupby(['ref base','base','Run_db','Run_digits'])['pass rd'].sum()
#tmpS
import matplotlib.pyplot as plt
fig,ax=plt.subplots()
g=tmpDf2[tmpDf2['ref base']].groupby(['base','Run_db','Run_digits'])['pass rd']
countS1=np.log10(g.sum())
countS1.hist(normed=True,ax=ax,label='Reference allele')
g=tmpDf2[~tmpDf2['ref base']].groupby(['Run_db','Run_digits'])['pass rd']
countS2=np.log10(g.sum())
countS2.hist(normed=True,ax=ax,alpha=0.4,label='Alternative allele')
#'log10 ( number of sites with at least 2 reads)' , '% of SRA sequencing runs'
ax.set_ylabel('% of SRA sequencing runs')
ax.legend()
ax.set_xlabel('log10 ( number of sites with at least 2 reads)')
ax.grid(False)
/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been "
from scipy import stats
stats.wilcoxon(countS1,countS2)
type(myI),
m_inRef.mean()
0.7398492072866819
tmpDf=pd.read_pickle(tmp_dir+inFnameS.iloc[0])
tmpDf
features | ReadDepth | AverageBaseQuality | ||||
---|---|---|---|---|---|---|
Run_db | Run_digits | Chr | Pos | base | ||
ERR | 18506 | 1 | 14727 | A | 2 | 15 |
G | 2 | 27 | ||||
630825 | G | 2 | 13 | |||
T | 165 | 33 | ||||
630833 | C | 180 | 33 | |||
G | 1 | 7 | ||||
842133 | A | 1 | 7 | |||
G | 1 | 33 | ||||
843942 | A | 2 | 31 | |||
850609 | C | 2 | 33 | |||
970788 | G | 1 | 38 | |||
1014143 | C | 3 | 30 | |||
1014228 | G | 2 | 22 | |||
1014316 | C | 1 | 28 | |||
1014359 | G | 2 | 38 | |||
1022188 | A | 1 | 33 | |||
1022225 | G | 1 | 18 | |||
1022260 | C | 1 | 22 | |||
1041582 | G | 1 | 10 | |||
1041583 | A | 1 | 6 | |||
1043476 | G | 1 | 30 | |||
1044134 | G | 1 | 11 | |||
1044176 | G | 1 | 29 | |||
1044455 | G | 1 | 35 | |||
1045172 | G | 1 | 12 | |||
1045177 | G | 1 | 6 | |||
1045393 | C | 1 | 14 | |||
1045707 | G | 1 | 28 | |||
1045751 | A | 1 | 36 | |||
1045785 | G | 1 | 39 | |||
... | ... | ... | ... | ... | ... | ... |
SRR | 5981338 | MT | 15784 | T | 3786 | 37 |
15812 | A | 4 | 16 | |||
C | 1 | 2 | ||||
G | 3138 | 37 | ||||
T | 1 | 2 | ||||
15833 | A | 2 | 5 | |||
C | 2133 | 37 | ||||
15848 | A | 1446 | 37 | |||
G | 1 | 7 | ||||
T | 3 | 28 | ||||
15884 | G | 236 | 34 | |||
15890 | A | 93 | 34 | |||
C | 8 | 37 | ||||
15923 | A | 1 | 36 | |||
15927 | G | 1 | 36 | |||
15928 | G | 1 | 37 | |||
15932 | T | 1 | 37 | |||
15943 | T | 3 | 39 | |||
15950 | G | 3 | 40 | |||
T | 1 | 34 | ||||
15965 | A | 6 | 39 | |||
15967 | G | 6 | 39 | |||
15990 | C | 31 | 36 | |||
16188 | A | 1 | 2 | |||
C | 194 | 26 | ||||
16278 | C | 280 | 37 | |||
16390 | C | 1 | 2 | |||
G | 394 | 36 | ||||
16519 | C | 298 | 37 | |||
G | 1 | 2 |
9393594 rows × 2 columns