import pandas as pd
import textwrap
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
mySpecie='Homo_sapiens'
#change base dir to your data location
baseDir='/cellar/users/btsui/Data/SRA/snp/'
skymap_snp_dir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.pickle'
#skymap_snp_dir=baseDir+'{specie}_snp_pos/'.format(specie=mySpecie)
import os
skymapSnpDf=pd.read_pickle(skymap_snp_dir)
skymapSnpDf['AverageBaseQuality'].quantile(0.2)
32.0
readCutoff=2
m=skymapSnpDf['ReadDepth']>=readCutoff
#skymapSnpDf_subDf['AverageBaseQuality']
skymapSnpDf_subDf=skymapSnpDf[m]
posByStudyS=skymapSnpDf_subDf.groupby(['Chr','Pos','base']).size()
#show the read counts,
#show the number of bases covered
len(posByStudyS)
253472
#posByStudyS
#x-axis: alternative allele count
#posByStudyS
#!ls -lath /nrnb/users/btsui/Data/all_seq/tmp/
#!ls /cellar/users/btsui/Data/SRA/snp/Homo_sapiens_snp_pos/
#!ls -lath /nrnb/users/btsui/Data/all_seq/tmp/
#os.listdir(skymap_snp_dir)
skymap_snp_dir+os.listdir(skymap_snp_dir)[0]
--------------------------------------------------------------------------- NotADirectoryError Traceback (most recent call last) <ipython-input-12-81fb52ccfef8> in <module>() ----> 1 skymap_snp_dir+os.listdir(skymap_snp_dir)[0] NotADirectoryError: [Errno 20] Not a directory: '/cellar/users/btsui/all_seq_snp/Homo_sapiens_all_merged_snp.pickle'
hdf_s=pd.HDFStore(skymap_snp_dir+'Pos_block_29100000',mode='r')
tmpChunkDf=hdf_s['/chunk']
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-13-d9495aea488b> in <module>() ----> 1 tmpChunkDf=hdf_s['/chunk'] NameError: name 'hdf_s' is not defined
#.reset_index()
g=tmpChunkDf[tmpChunkDf['ReadDepth']>2].groupby([b'Chr','Pos',b'base'])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-14-d17b5620af73> in <module>() 1 #.reset_index() ----> 2 g=tmpChunkDf[tmpChunkDf['ReadDepth']>2].groupby([b'Chr','Pos',b'base']) NameError: name 'tmpChunkDf' is not defined
print ('# of samples:',tmpChunkDf['Run_digits'].nunique())
# of samples: 151568
#tmpChunkDf
rdMedianS=g['ReadDepth']#.sum()
tmpChunkDf
b'features' | Run_digits | Pos | ReadDepth | AverageBaseQuality | block | ||
---|---|---|---|---|---|---|---|
b'Chr' | b'base' | b'Run_db' | |||||
18 | G | SRR | 3094396 | 29102295 | 2 | 38 | 29100000 |
T | SRR | 568452 | 29102295 | 1 | 27 | 29100000 | |
SRR | 1201234 | 29102295 | 1 | 41 | 29100000 | ||
12 | A | SRR | 3094396 | 29105394 | 5 | 37 | 29100000 |
SRR | 5197171 | 29105394 | 1 | 2 | 29100000 | ||
G | SRR | 5197171 | 29105394 | 1 | 37 | 29100000 | |
A | SRR | 2125425 | 29105394 | 5 | 34 | 29100000 | |
SRR | 20518 | 29105394 | 1 | 31 | 29100000 | ||
SRR | 5198890 | 29105394 | 1 | 38 | 29100000 | ||
16 | T | SRR | 5197171 | 29109843 | 1 | 35 | 29100000 |
SRR | 2148761 | 29109843 | 1 | 35 | 29100000 | ||
SRR | 5198890 | 29109843 | 2 | 38 | 29100000 | ||
SRR | 402503 | 29109843 | 1 | 41 | 29100000 | ||
SRR | 2125425 | 29109843 | 1 | 33 | 29100000 | ||
SRR | 1201234 | 29109843 | 1 | 41 | 29100000 | ||
SRR | 1918523 | 29109843 | 1 | 2 | 29100000 | ||
17 | G | SRR | 830996 | 29111451 | 42 | 31 | 29100000 |
A | SRR | 402503 | 29111451 | 1 | 40 | 29100000 | |
G | SRR | 5197171 | 29111451 | 1 | 38 | 29100000 | |
A | SRR | 1585539 | 29111451 | 3 | 55 | 29100000 | |
SRR | 581928 | 29111451 | 3 | 33 | 29100000 | ||
SRR | 3096642 | 29111451 | 11 | 38 | 29100000 | ||
G | SRR | 3096642 | 29111451 | 16 | 34 | 29100000 | |
SRR | 5224344 | 29111451 | 2 | 35 | 29100000 | ||
A | SRR | 5224344 | 29111451 | 5 | 37 | 29100000 | |
G | SRR | 399205 | 29111451 | 1 | 41 | 29100000 | |
A | SRR | 399205 | 29111451 | 2 | 37 | 29100000 | |
SRR | 1850979 | 29111451 | 1 | 21 | 29100000 | ||
G | SRR | 3547450 | 29111451 | 1 | 35 | 29100000 | |
A | SRR | 3096831 | 29111451 | 22 | 32 | 29100000 | |
... | ... | ... | ... | ... | ... | ... | ... |
2 | C | SRR | 5322158 | 29197676 | 2 | 8 | 29100000 |
SRR | 393179 | 29197676 | 6 | 37 | 29100000 | ||
SRR | 1518150 | 29197676 | 4 | 40 | 29100000 | ||
SRR | 5882189 | 29197676 | 4 | 42 | 29100000 | ||
SRR | 3030702 | 29197676 | 3 | 31 | 29100000 | ||
SRR | 2971320 | 29197676 | 2 | 38 | 29100000 | ||
T | SRR | 1509423 | 29197676 | 1 | 37 | 29100000 | |
C | SRR | 590775 | 29197676 | 3 | 40 | 29100000 | |
SRR | 3170127 | 29197676 | 6 | 31 | 29100000 | ||
SRR | 596798 | 29197676 | 4 | 30 | 29100000 | ||
A | SRR | 1509423 | 29197676 | 1 | 37 | 29100000 | |
G | SRR | 1509423 | 29197676 | 1 | 33 | 29100000 | |
C | SRR | 5356251 | 29197676 | 3 | 39 | 29100000 | |
SRR | 1509423 | 29197676 | 63 | 37 | 29100000 | ||
SRR | 3938516 | 29197676 | 1 | 40 | 29100000 | ||
SRR | 2038315 | 29197676 | 5 | 36 | 29100000 | ||
7 | A | SRR | 4422435 | 29198378 | 1 | 37 | 29100000 |
ERR | 183395 | 29198378 | 3 | 33 | 29100000 | ||
SRR | 501919 | 29198378 | 2 | 39 | 29100000 | ||
SRR | 796754 | 29198378 | 1 | 29 | 29100000 | ||
SRR | 2971320 | 29198378 | 1 | 35 | 29100000 | ||
SRR | 2301053 | 29198378 | 2 | 35 | 29100000 | ||
G | SRR | 394091 | 29198378 | 1 | 30 | 29100000 | |
SRR | 3165034 | 29198378 | 1 | 36 | 29100000 | ||
A | SRR | 5322158 | 29198378 | 2 | 6 | 29100000 | |
SRR | 3170127 | 29198378 | 21 | 36 | 29100000 | ||
SRR | 4022259 | 29198378 | 1 | 30 | 29100000 | ||
SRR | 1282230 | 29198378 | 1 | 30 | 29100000 | ||
SRR | 2034920 | 29198378 | 1 | 34 | 29100000 | ||
SRR | 948653 | 29198378 | 2 | 35 | 29100000 |
3134531 rows × 5 columns
np.log2(rdMedianS+1).hist()
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-82-bda1e1b2cac0> in <module>() ----> 1 np.log2(rdMedianS+1).hist() TypeError: unsupported operand type(s) for +: 'SeriesGroupBy' and 'int'
#pd.read_pickle(skymap_snp_dir+os.listdir(skymap_snp_dir)[2])
!grep _snp_pos ./../../**/*.ipynb
#grep _snp_pos ./**/*.ipynb
./../../UpdatePipeline/UploadDataToSynapse.ipynb: "'tar -zcvf /data/cellardata/users/btsui/SRA/snp/Homo_sapiens_snp_pos.tar.gz /data/cellardata/users/btsui/SRA/snp/Homo_sapiens_snp_pos'"
#grep -r --include "*.ipynb" _snp_pos .
/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS
#find -name "parse"