In [1]:

import pandas  as pd
import os
import numpy as np
%matplotlib inline

In [2]:

inVcfDir='/data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.f1_byte2_not_00.vcf.gz' 
vcfDf=pd.read_csv(inVcfDir,sep='\t',header=None)
vcfDf.columns=['Chr','Pos','RsId','RefBase','AltBase','','','Annot']
vcfDf['Chr']=vcfDf['Chr'].astype(str)

/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [3]:

vcfDf.drop_duplicates(['Chr','Pos']).shape

Out[3]:

(387950, 8)

In [4]:

#'\w{22}'

In [5]:

vcfDf['VP_Int']=vcfDf['Annot'].str.extract('VP=(0x\w+)',expand=False).apply(lambda Str:int(Str,16))

In [6]:

#from right to left
nth_bit=6
vcfDf['somatic']=(vcfDf['VP_Int']%(2**(nth_bit-1))).astype(bool).values

In [ ]:

In [7]:

vcfDf['specificSNP']=vcfDf['AltBase'].str.contains('^[ACGT]$')

In [8]:

somaticI=vcfDf[vcfDf['somatic']&vcfDf['specificSNP']].set_index(['Chr','Pos','AltBase']).index

In [9]:

germline_I=vcfDf[(~vcfDf['somatic'])&vcfDf['specificSNP']].set_index(['Chr','Pos','AltBase']).index

In [10]:

#somatic_I=vcfDf.set_index('AltBase')

In [11]:

myG=['Run_db','Run_digits',u'Chr', u'Pos',u'base']
vcfDf.head()

Out[11]:

	Chr	Pos	RsId	RefBase	AltBase			Annot	VP_Int	somatic	specificSNP
0	1	14727	rs1045587	G	A	.	.	RS=1045587;RSPOS=14727;RV;dbSNPBuildID=117;SSR...	1547614017554423799187767552	False	True
1	1	630825	rs9783068	T	C	.	.	RS=9783068;RSPOS=630825;dbSNPBuildID=119;SSR=1...	1547613980660935651768664384	False	True
2	1	630833	rs9701099	C	T	.	.	RS=9701099;RSPOS=630833;dbSNPBuildID=119;SSR=1...	1547613980660935651768664384	False	True
3	1	817186	rs3094315	G	A	.	.	RS=3094315;RSPOS=817186;RV;dbSNPBuildID=103;SS...	1548822906480573393185800449	True	True
4	1	833068	rs12562034	G	A	.	.	RS=12562034;RSPOS=833068;dbSNPBuildID=120;SSR=...	1548823017161040034332147968	False	True

In [12]:

refI=vcfDf.set_index(['Chr','Pos','RefBase']).index

In [13]:

tmp_dir='/nrnb/users/btsui/Data/all_seq/tmp/'

In [14]:

inFnameS=pd.Series(os.listdir(tmp_dir)).sample(n=30)

In [15]:

#!mkdir /tmp/btsui/jupyter/

In [16]:

varToPlot='pass rd'

In [51]:

myG=['ref base','Run_db','Run_digits']

In [52]:

#inFname=inFnameS.iloc[1]
def inF(inFname):
    try:
        tmpDf=pd.read_pickle(tmp_dir+inFname)
    except:
        print ('failed:',inFname)
        return None
    tmpDf2=tmpDf.reset_index().drop_duplicates()
    myI=tmpDf2.set_index(['Chr','Pos','base']).index
    
    m_inRef=myI.isin(refI)
    m_somatic=myI.isin(somaticI)
    m_germline=myI.isin(germline_I)
    tmpDf2.loc[m_inRef,'ref base']='Reference alleles'
    tmpDf2.loc[m_somatic,'ref base']='Known somatic \nmutations'
    tmpDf2.loc[m_germline,'ref base']='Known germline \nmutations'
    #tmpDf2['ref base'].fillna('Known germline \nmutations',inplace=True)

    tmpDf2['pass rd']=tmpDf2['ReadDepth']>=2
    #myG=['ref base','base','Run_db','Run_digits']
    
    tmpS=tmpDf2.groupby(myG)[varToPlot].sum()
    tmpS.to_pickle('/tmp/btsui/jupyter/'+inFname)
    return tmpS

In [53]:

from multiprocessing import Pool

In [54]:

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [55]:

#tmpSL=list(tqdm(map(inF,inFnameS.values),
#                   total=len(inFnameS)))

In [56]:

#take one hour and twenty mins
#ls_out.txt: screwed up the process
with Pool(48) as p:
    tmpSL=list(tqdm(p.imap(inF,inFnameS.values),total=len(inFnameS)))#,
                   #total=len(inFnameS))

100%|██████████| 30/30 [00:56<00:00,  1.89s/it]

In [57]:

#tmpSL[0]

In [58]:

len(tmpSL)

Out[58]:

In [59]:

mergeS_raw=pd.concat(tmpSL,axis=0)#.groupby(myG).sum()

In [89]:

ax=sns.boxplot(data=mergeS_raw.reset_index(),x='ref base',y='pass rd')
ax.set_ylabel('# of sites with at least two reads')
ax.set_xlabel('')

Out[89]:

Text(0.5,0,'')

In [84]:

(mergeS_raw>2000).groupby('ref base').mean()

Out[84]:

ref base
Known germline \nmutations    0.587196
Known somatic \nmutations     0.133583
Reference alleles             0.846616
Name: pass rd, dtype: float64

In [75]:

mergeS_raw.groupby('ref base').median()

Out[75]:

ref base
Known germline \nmutations     3608.0
Known somatic \nmutations       384.5
Reference alleles             23044.0
Name: pass rd, dtype: float64

In [76]:

#mergeS_raw[~(mergeS_raw.index.get_level_values('ref base')=='Reference alleles')]

In [66]:

minorAllelS=mergeS_raw[~(mergeS_raw.index.get_level_values('ref base')!='Reference alleles')]

In [69]:

#minorAllelS

In [73]:

(minorAllelS>1000).median()

Out[73]:

1.0

In [61]:

mergeS_raw.median()

Out[61]:

2471.5

In [35]:

mergeS_raw.hist()

Out[35]:

<matplotlib.axes._subplots.AxesSubplot at 0x2b36b02b7320>

In [ ]:

In [26]:

mergeS_raw=mergeS_raw.groupby(myG).sum()

In [27]:

mergedS=np.log10(mergeS_raw+1)

In [32]:

mergedS.head()

Out[32]:

Run_db  Run_digits
DRR     61036         5.055466
ERR     6319          3.775028
        6338          4.457019
        6358          3.744762
        6376          4.650094
        6422          4.491320
        15993         4.365113
        16147         4.581540
        16149         4.993908
        16177         4.762829
        16213         5.020034
        16217         4.541205
        16251         5.168886
        16257         4.703996
        16267         5.075006
        16307         4.379541
        18464         5.090322
        19901         4.592177
        22333         4.751164
        22355         4.689309
        22379         4.396356
        22444         4.426853
        22449         5.281465
        31876         5.367432
        31879         5.362268
        31920         5.475671
        31984         5.407880
        32040         5.262268
        42505         5.497302
        42532         5.491119
                        ...   
SRR     6040477       3.582972
        6040599       3.482445
        6051218       5.047049
        6054443       2.326336
        6054539       1.880814
        6066112       4.068483
        6075271       2.761176
        6078306       5.177444
        6081961       5.232078
        6082022       5.162409
        6161416       4.295567
        6161467       4.535243
        6161499       4.401711
        6161567       4.161937
        6161598       4.125676
        6197347       2.336460
        6205895       3.523876
        6212971       5.352844
        6213081       5.372166
        6213145       5.247553
        6213652       5.212446
        6214153       5.300163
        6231125       5.060286
        6237236       5.163313
        6301139       3.975294
        6301209       4.463505
        6307936       4.395519
        6308005       4.363913
        6310117       5.159077
        6315419       2.416641
Name: pass rd, Length: 2999, dtype: float64

In [29]:

#%matplotlib inline
plt.rcParams['pdf.fonttype'] = 42

In [30]:

mergeS_raw.groupby('ref base').median()

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-30-d619e344b43c> in <module>()
----> 1 mergeS_raw.groupby('ref base').median()

~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in groupby(self, by, axis, level, as_index, sort, group_keys, squeeze, observed, **kwargs)
   6657         return groupby(self, by=by, axis=axis, level=level, as_index=as_index,
   6658                        sort=sort, group_keys=group_keys, squeeze=squeeze,
-> 6659                        observed=observed, **kwargs)
   6660 
   6661     def asfreq(self, freq, method=None, how=None, normalize=False,

~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in groupby(obj, by, **kwds)
   2150         raise TypeError('invalid type: %s' % type(obj))
   2151 
-> 2152     return klass(obj, by, **kwds)
   2153 
   2154 

~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in __init__(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, squeeze, observed, **kwargs)
    597                                                     sort=sort,
    598                                                     observed=observed,
--> 599                                                     mutated=self.mutated)
    600 
    601         self.obj = obj

~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _get_grouper(obj, key, axis, level, sort, observed, mutated, validate)
   3289                 in_axis, name, level, gpr = False, None, gpr, None
   3290             else:
-> 3291                 raise KeyError(gpr)
   3292         elif isinstance(gpr, Grouper) and gpr.key is not None:
   3293             # Add key to exclusions

KeyError: 'ref base'

In [31]:

fig,ax=plt.subplots(figsize=(7,3))
xorder=['Reference alleles','Known germline \nmutations','Known somatic \nmutations']
ax=sns.boxplot(data=mergedS.reset_index(),
               x='ref base',y=varToPlot,ax=ax,order=xorder)
#    tmpDf2.loc[m_inRef,'ref base']='Reference alleles'
#    tmpDf2.loc[m_somatic,'ref base']='Known somatic \nmutations'
#    tmpDf2.loc[m_germline,'ref base']='Known germline \nmutations'
#ax.set_xticks()
#at least 2 reads
#each dot is a variant
#ax.set_yscale('log')
#ax.set_ylim([0,10**6])
#ax.ticklabel_format(axis='y',style='sci',scilimits=(1,4))

ax.set_ylabel(' # of sequencing runs with variant detected\n (log10 scale)' )
#ax.set_xlabel()
ax.set_xlabel('')
fig.savefig('./Figures/variant_suppport_for_each_sequencing_run.pdf')
fig.savefig('./Figures/variant_suppport_for_each_sequencing_run.png',dpi=300)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-31-7ad981b857e3> in <module>()
      2 xorder=['Reference alleles','Known germline \nmutations','Known somatic \nmutations']
      3 ax=sns.boxplot(data=mergedS.reset_index(),
----> 4                x='ref base',y=varToPlot,ax=ax,order=xorder)
      5 #    tmpDf2.loc[m_inRef,'ref base']='Reference alleles'
      6 #    tmpDf2.loc[m_somatic,'ref base']='Known somatic \nmutations'

~/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py in boxplot(x, y, hue, data, order, hue_order, orient, color, palette, saturation, width, dodge, fliersize, linewidth, whis, notch, ax, **kwargs)
   2209     plotter = _BoxPlotter(x, y, hue, data, order, hue_order,
   2210                           orient, color, palette, saturation,
-> 2211                           width, dodge, fliersize, linewidth)
   2212 
   2213     if ax is None:

~/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py in __init__(self, x, y, hue, data, order, hue_order, orient, color, palette, saturation, width, dodge, fliersize, linewidth)
    439                  width, dodge, fliersize, linewidth):
    440 
--> 441         self.establish_variables(x, y, hue, data, orient, order, hue_order)
    442         self.establish_colors(color, palette, saturation)
    443 

~/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py in establish_variables(self, x, y, hue, data, orient, order, hue_order, units)
    149                 if isinstance(input, string_types):
    150                     err = "Could not interpret input '{}'".format(input)
--> 151                     raise ValueError(err)
    152 
    153             # Figure out the plotting orientation

ValueError: Could not interpret input 'ref base'

In [66]:

!echo $PWD/./Figures/variant_suppport_for_each_sequencing_run.pdf

/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/Analysis/./Figures/variant_suppport_for_each_sequencing_run.pdf

In [258]:

#ax=mergedS[False].hist()

In [ ]:

In [60]:

### make sure the data read in will have all the results
inFname=inFnameS.iloc[1]

tmpDf=pd.read_pickle(tmp_dir+inFname)

tmpDf2=tmpDf.reset_index().drop_duplicates()

#tmpDf2.head().dtypes

myI=tmpDf2.set_index(['Chr','Pos','base']).index

m_inRef=myI.isin(refI)

tmpDf2.loc[m_inRef,'ref base']=True

tmpDf2['ref base'].fillna(False,inplace=True)

tmpDf2['pass rd']=tmpDf2['ReadDepth']>=2

tmpS=tmpDf2.groupby(['ref base','base','Run_db','Run_digits'])['pass rd'].sum()

In [1]:

#tmpS

In [161]:

import matplotlib.pyplot as plt

In [165]:

fig,ax=plt.subplots()
g=tmpDf2[tmpDf2['ref base']].groupby(['base','Run_db','Run_digits'])['pass rd']
countS1=np.log10(g.sum())
countS1.hist(normed=True,ax=ax,label='Reference allele')

g=tmpDf2[~tmpDf2['ref base']].groupby(['Run_db','Run_digits'])['pass rd']
countS2=np.log10(g.sum())
countS2.hist(normed=True,ax=ax,alpha=0.4,label='Alternative allele')
#'log10 ( number of sites with at least 2 reads)' , '% of SRA sequencing runs'
ax.set_ylabel('% of SRA sequencing runs')
ax.legend()
ax.set_xlabel('log10 ( number of sites with at least 2 reads)')
ax.grid(False)

/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "

In [163]:

from scipy import stats

In [ ]:

stats.wilcoxon(countS1,countS2)

In [ ]:

type(myI),

In [42]:

m_inRef.mean()

Out[42]:

0.7398492072866819

In [ ]:

tmpDf=pd.read_pickle(tmp_dir+inFnameS.iloc[0])

In [150]:

tmpDf

Out[150]:

				features	ReadDepth	AverageBaseQuality
Run_db	Run_digits	Chr	Pos	base
ERR	18506	1	14727	A	2	15
			14727	G	2	27
			630825	G	2	13
			630825	T	165	33
			630833	C	180	33
			630833	G	1	7
			842133	A	1	7
			842133	G	1	33
			843942	A	2	31
			850609	C	2	33
			970788	G	1	38
			1014143	C	3	30
			1014228	G	2	22
			1014316	C	1	28
			1014359	G	2	38
			1022188	A	1	33
			1022225	G	1	18
			1022260	C	1	22
			1041582	G	1	10
			1041583	A	1	6
			1043476	G	1	30
			1044134	G	1	11
			1044176	G	1	29
			1044455	G	1	35
			1045172	G	1	12
			1045177	G	1	6
			1045393	C	1	14
			1045707	G	1	28
			1045751	A	1	36
			1045785	G	1	39
...	...	...	...	...	...	...
SRR	5981338	MT	15784	T	3786	37
			15812	A	4	16
				C	1	2
				G	3138	37
				T	1	2
			15833	A	2	5
			15833	C	2133	37
			15848	A	1446	37
				G	1	7
				T	3	28
			15884	G	236	34
			15890	A	93	34
			15890	C	8	37
			15923	A	1	36
			15927	G	1	36
			15928	G	1	37
			15932	T	1	37
			15943	T	3	39
			15950	G	3	40
			15950	T	1	34
			15965	A	6	39
			15967	G	6	39
			15990	C	31	36
			16188	A	1	2
			16188	C	194	26
			16278	C	280	37
			16390	C	1	2
			16390	G	394	36
			16519	C	298	37
			16519	G	1	2

9393594 rows × 2 columns

In [ ]: