import os

# Gzipped, SNP-masked Ensembl FASTA used as the base reference.
fname = 'Homo_sapiens.fa.gz'
inDir = '/cellar/users/btsui/Data/ensembl/snp_masked/{}'.format(fname)

# Sibling path for the reference with the microbial sequences added:
# same location, with '.microbe' inserted before the '.fa.gz' suffix.
withMicrobeDir = inDir.replace('.fa.gz', '.microbe' + '.fa.gz')
!ls -lah /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.fa.gz
-rw-r--r-- 1 btsui users 25 Dec 30 12:30 /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.fa.gz
# Species label; reused further down to name the bowtie2 index directory.
specieName='Homo_sapiens'
withMicrobeDir
'/cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.microbe.fa.gz'
RDP reference files (mothur wiki): https://www.mothur.org/wiki/RDP_reference_files
# Decompress the viral genomic FASTA to a plain .fa file; `gunzip -c` writes
# to stdout, so the original .fna.gz is left in place.
!gunzip -c ../Microbiome/viral.1.1_2.1.genomic.fna.gz > ../Microbiome/viral.1.1_2.1.genomic.fa
#!head ../Microbiome/viral.1.1_2.1.genomic.fa
# Print the absolute path of the .fai index file for that FASTA.
!echo $PWD/../Microbiome/viral.1.1_2.1.genomic.fa.fai
/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/../Microbiome/viral.1.1_2.1.genomic.fa.fai
import pandas as pd
# Preview the .fai index of the viral FASTA: tab-separated, no header row,
# so columns are addressed by integer position.
pd.read_csv(
    '../Microbiome/viral.1.1_2.1.genomic.fa.fai',
    header=None,
    sep='\t',
)
#!samtools faidx ../Microbiome/viral.1.1_2.1.genomic.fa
#!gunzip -c ../Microbiome/viral.1.1.genomic.fna.gz | head
#!echo $PWD/../Microbiome/
#!gzip ../Microbiome/trainset16_022016.rdp.fasta
#!gunzip -c ../Microbiome/viral.1.1_2.1.genomic.fna.gz | head -n 10
# Build the combined microbial reference from the RDP trainset FASTA and the
# viral genomes. Concatenated gzip files form a valid multi-member gzip
# stream, so plain `cat` works without decompressing and recompressing.
!cat ../Microbiome/trainset16_022016.rdp.fasta.gz ../Microbiome/viral.1.1_2.1.genomic.fna.gz > microbe.fa.gz
!ls -lah /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.fa.gz
-rw-r--r-- 1 btsui users 86M Dec 30 12:51 /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.fa.gz
# Write the microbial sequences followed by the host reference into
# withMicrobeDir. Both inputs are gzip files, so the concatenation is itself
# a readable gzip stream.
os.system('cat microbe.fa.gz {0} >{1}'.format(inDir, withMicrobeDir))
0
#!ls -alh /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.microbe.fa
!ls -lah /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.fa.gz
-rw-r--r-- 1 btsui users 86M Dec 30 12:51 /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.fa.gz
#withMicrobeDir
# Path of the decompressed FASTA (the combined reference minus its '.gz').
gunzipedFaDir = withMicrobeDir.replace('.fa.gz', '.fa')

# Delete any previous decompressed copy -- presumably so the gunzip step that
# follows does not run into an existing output file.
os.system('rm {}'.format(gunzipedFaDir))
0
# Decompress the combined reference in place (the .gz is replaced by the .fa).
os.system('gunzip {}'.format(withMicrobeDir))
0
# Index the decompressed FASTA; samtools writes the index to <fasta>.fai.
os.system('samtools faidx {}'.format(gunzipedFaDir))
0
#import pandas as pd
# Load the .fai index written by `samtools faidx` for the decompressed FASTA
# (tab-separated, no header row). gunzipedFaDir is used directly because
# faDir is only assigned further down the file, so referencing it here raises
# NameError on a top-to-bottom run.
tmpDf=pd.read_csv(gunzipedFaDir+'.fai',sep='\t',header=None)
# Disabled draft, kept as a string so it is not executed: it tags each index
# record by source based on its name ('|' -> rdp, '_' -> ncbi_virus,
# neither -> human). Raw string so the regex escape '\|' is not treated as an
# invalid string escape sequence; the string's value is unchanged.
r"""m1=tmpDf[0].str.contains('\|')
m2=tmpDf[0].str.contains('_')
tmpDf.loc[m1,'state']='rdp'
tmpDf.loc[m2,'state']='ncbi_virus'
tmpDf.loc[~(m1|m2),'state']='human'
tmpDf"""
"m1=tmpDf[0].str.contains('\\|')\nm2=tmpDf[0].str.contains('_')\ntmpDf.loc[m1,'state']='rdp'\ntmpDf.loc[m2,'state']='ncbi_virus'\ntmpDf.loc[~(m1|m2),'state']='human'\ntmpDf"
# FASTA to index, and the per-species location passed to bowtie2-build.
faDir = gunzipedFaDir
myDir = '/cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/{}/'.format(specieName)
#tmpDir='/tmp/btsui/'+fname
#os.system('cp '+faDir+' '+tmpDir)
#os.system('gunzip '+tmpDir)

# Start from an empty output directory: remove whatever is there, then
# create it again.
os.system('rm -r {}'.format(myDir))
os.system('mkdir {}'.format(myDir))

# The bowtie2-build command line is only assembled and displayed in this
# section; nothing here executes it.
cmd = ' '.join(['bowtie2-build', '--threads', '64', faDir, myDir])
cmd
#os.system('mkdir '+myDir)
# Per-FASTA-record statistics table (tab-separated, no header row), ordered
# ascending by the column at position 2.
tmpDf3 = pd.read_csv(
    '/cellar/users/btsui/per_fa_record_stat.txt',
    header=None,
    sep='\t',
)
tmpDf3 = tmpDf3.sort_values(by=2)

# Show the row(s) whose record name (column 0) contains 'NC_001357'.
tmpDf3.loc[tmpDf3[0].str.contains('NC_001357')]
0 | 1 | 2 | 3 | |
---|---|---|---|---|
15804 | NC_001357.1 | 7857 | 0 | 0 |
# Of the rows with column 2 above 5, show positions -50 up to (not including)
# -20 in the table's current order.
tmpDf3.loc[tmpDf3[2] > 5].iloc[-50:-20]
0 | 1 | 2 | 3 | |
---|---|---|---|---|
14162 | NC_001493.2 | 134226 | 33 | 0 |
14812 | NC_024382.1 | 137090 | 33 | 0 |
18228 | NC_009823.1 | 9711 | 35 | 0 |
17966 | NC_004102.1 | 9646 | 39 | 0 |
15950 | NC_006641.1 | 15959 | 42 | 0 |
17914 | NC_030200.1 | 137448 | 43 | 0 |
20658 | NC_021858.1 | 1908524 | 59 | 0 |
22235 | NC_009127.1 | 295146 | 59 | 0 |
13939 | NC_020231.1 | 233501 | 61 | 0 |
22138 | NC_008912.1 | 3141 | 61 | 0 |
14731 | NC_020474.2 | 180421 | 61 | 0 |
13843 | NC_019491.1 | 291144 | 66 | 0 |
20983 | NC_005261.2 | 137821 | 68 | 0 |
14853 | NC_024709.1 | 33452 | 69 | 0 |
15780 | NC_001499.1 | 5894 | 77 | 0 |
14004 | NC_021312.1 | 459984 | 78 | 0 |
15691 | NC_002794.1 | 195859 | 102 | 0 |
17235 | NC_028834.1 | 48216 | 115 | 0 |
20659 | NC_022098.1 | 2473870 | 126 | 0 |
14829 | NC_024697.1 | 370920 | 358 | 0 |
14211 | NC_022518.1 | 9472 | 483 | 0 |
15776 | NC_001506.1 | 3811 | 506 | 0 |
21974 | NC_008168.1 | 104710 | 1535 | 0 |
14240 | NC_018464.1 | 927 | 3123 | 0 |
18783 | NC_032111.1 | 163005 | 8688 | 0 |
22813 | Y | 57227415 | 16392 | 0 |
22807 | 18 | 80373285 | 72541 | 0 |
22810 | 21 | 46709983 | 79744 | 0 |
22802 | 13 | 114364328 | 101329 | 0 |
22811 | 22 | 50818468 | 152059 | 0 |