import os
baseGenomesDir='/cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/'
###baseGenomesDir='/cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/'
####out_dir='
specie='Homo_sapiens'
adaptor='AGATCGGAAGAGC'
myTmpDir='/nrnb/users/btsui/Data/tcga_raw/2b0048e0-a062-40d2-a1e1-4bb763ea0ead/'
bam_read_count_dir='/cellar/users/btsui/Program/bam_read_count/bam-readcount-master/bin/bam-readcount'
genomeDir=baseGenomesDir+specie+'/'
os.chdir(myTmpDir)
tcgaBamName='C494.TCGA-S9-A6U1-01A-21D-A33T-08.1_gdc_realn.bam'
os.system('bamToFastq -i '+tcgaBamName + ' -fq /dev/stdout | head -n 4000 > test.fq')
trim_galore_Dir='/cellar/users/btsui/Program/TRIMAGLORE//trim_galore'
os.system(trim_galore_Dir+' '+'test.fq')
os.system('rm fastq_pipe')
os.system('mkfifo fastq_pipe ')
os.system('bamToFastq -i '+tcgaBamName+' -fq /dev/stdout | cutadapt -a AGATCGGAAGAGC - > fastq_pipe &')
cmd_1='bowtie2 --local -x {ref} -U fastq_pipe --no-unal --threads 64 | samtools view -bS - | samtools sort - > sorted.bam 2>bowtie_log.txt'.format(ref=genomeDir)
os.system(cmd_1)
snpBed='/cellar/users/btsui/Data/dbsnp/snp_beds/'+specie+'.bed'
fa_dir='/cellar/users/btsui/Data/ensembl/snp_masked/'+specie+'.microbe.fa'
###extract bam read count from TCGA files
cmd_5=bam_read_count_dir+' C494.TCGA-S9-A6U1-01A-21D-A33T-08.1_gdc_realn.bam |gzip > tcga.bam_read_count.txt.gz'
cmd_5
'/cellar/users/btsui/Program/bam_read_count/bam-readcount-master/bin/bam-readcount C494.TCGA-S9-A6U1-01A-21D-A33T-08.1_gdc_realn.bam |gzip > tcga.txt.gz'
specie='Homo_sapiens'
snpBed='/cellar/users/btsui/Data/dbsnp/snp_beds/'+specie+'.bed'
fa_dir='/cellar/users/btsui/Data/ensembl/snp_masked/'+specie+'.microbe.fa'
cmd_5=bam_read_count_dir+' -l '+snpBed+' -f '+fa_dir+' file_sorted |gzip > snp.txt.gz'
cmd_5
'/cellar/users/btsui/Program/bam_read_count/bam-readcount-master/bin/bam-readcount -l /cellar/users/btsui/Data/dbsnp/snp_beds/Homo_sapiens.bed -f /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.microbe.fa file_sorted |gzip > snp.txt.gz'
#!ls /cellar/users/btsui/Program/bam_read_count/bam-readcount-master/bin/bam-readcount
#'bamToFastq -i G35154.TCGA-DU-6407-01A-13D-1705-08.1.bam -fq test.fq '
'bamToFastq -i TCGA-FG-6690-01A-11D-1891_130919_SN590_0238_AC29TRACXX_s_7_rg.sorted.bam -fq /dev/stdout |head -n 2000000 > test.out '
#!ls /cellar/users/btsui/Data/dbsnp/snp_beds/Homo_sapiens.bed
### try processing one
myTmpDir='/nrnb/users/btsui/Data/tcga_raw/52ae2dd2-f573-41c6-ad1a-18b19c9eea35/'
cmd_1='bowtie2 -x {ref} -U test.out --threads 2 > out.sam 2>bowtie_log.txt'.format(ref=genomeDir)
cmd_1
'bowtie2 -x /cellar/users/btsui/Data/BOWTIE2_GENOME_INDEX/Homo_sapiens/ -U test.out --threads 2 > out.sam 2>bowtie_log.txt'
"""
milestone:
1. past fastq
still need:
2. complete the bam
3. Output entire bam
4. check for IDH1 mutation for the ATGC
"""
'\nmilestone:\n 1. past fastq\nstill need:\n 2. complete the bam\n 3. Output entire bam \n 4. check for IDH1 mutation for the ATGC\n'
'mkfifo fastq_pipe '
'bamToFastq -i 2.bam -fq fastq_pipe &'
'bamToFastq -i TCGA-FG-6690-01A-11D-1891_130919_SN590_0238_AC29TRACXX_s_7_rg.sorted.bam -fq fastq_pipe'
'bamToFastq -i G35154.TCGA-DU-6407-01A-13D-1705-08.1.bam -fq /dev/stdout | head -n 400000 > test.fq'
'bamToFastq -i '
cmd_1='bowtie2 --local -x {ref} -U small.name.sorted.fq --no-unal --threads 64 | samtools view -bS - | samtools sort - > sorted.bam 2>bowtie_log.txt'.format(ref=genomeDir)
cmd_1
'bowtie2 --local -x /cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX//Homo_sapiens/ -U small.name.sorted.fq --no-unal --threads 64 | samtools view -bS - | samtools sort - > sorted.bam 2>bowtie_log.txt'
#'samtools sort output.bam > output.sorted'
#test case vcf: 6f5b793c-9040-4fd7-8b32-2fe33bc8c7d2
fa_dir='/cellar/users/btsui/Data/ensembl/release/hg19/Homo_sapiens.GRCh37.75.dna_rm.toplevel.fa'
#fa_dir='/cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.GRCh38.dna_rm.toplevel.SNP_masked.fa'
!ls /cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/
Homo_sapiens
'samtools index sorted.bam'
'samtools index sorted.bam'
bam_read_count_dir='/cellar/users/btsui/Program/bam_read_count/bam-readcount-master/bin/bam-readcount'
'samtools index sorted.bam'
'samtools index sorted.bam'
#bam-readcount -f ref.fa some.bam
bam_read_count_dir+' -l loci.txt -f '+fa_dir+' sorted.bam'
'/cellar/users/btsui/Program/bam_read_count/bam-readcount-master/bin/bam-readcount -l loci.txt -f /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.GRCh38.dna_rm.toplevel.SNP_masked.fa sorted.bam'
#R132, R133
"""
it's the bam to fastq is almost at 0 utilization.
"""
token_dir='/cellar/users/btsui/../hcarter/gdc-user-token.2017-12-11T21_35_55.818Z.txt'
out_dir='/nrnb/users/btsui/Data/tcga_raw/'
file_uuid='6f5b793c-9040-4fd7-8b32-2fe33bc8c7d2'
gdc_cmd_fmt='gdc-client download -t {token_dir} -d {out_dir} {file_uuid}'
gdc_cmd_fmt.format(token_dir=token_dir,out_dir=out_dir,file_uuid=file_uuid)
'gdc-client download -t /cellar/users/btsui/../hcarter/gdc-user-token.2017-12-11T21_35_55.818Z.txt -d /nrnb/users/btsui/Data/tcga_raw/ 6f5b793c-9040-4fd7-8b32-2fe33bc8c7d2'
#rs121913500, 86
#209113110
#2 209113112 rs121913500 C T 61 PASS DB;DP=165;Gene=IDH1;MQ0=0;SOMATIC;SS=Somatic;VC=Missense_Mutation;VT=SNP;TID=uc002vcs.3;VLSC=255 GT:AD:DP:FA:MQ0:BQ:SS:SSC 0/0:79,0:79:0.000:0:.:2:. 0/1:63,23:86:0.267:0:31:2:.
#86 for primary tumor
#up the alignment rate for the unmasked genome first,
## test with chromosome 2 bam
'samtools view -b -f 0x02 small.bam | samtools sort -n - > small.name.sorted.bam'
'bamToFastq -i small.name.sorted.bam -fq small.name.sorted.fq'
'samtools view -h 2.name.sorted.bam | head -n 1000 > 2.name.sorted.little.sam'
'samtools view -h 2.name.sorted.bam | head -n 100000 | samtools view -bS - > 2.name.sorted.little.bam'
'samtools view -b -f 0x02 G35154.TCGA-DU-6407-01A-13D-1705-08.1.bam 2 > 2.bam'
import pandas as pd
sra_dump=pd.read_pickle('/cellar/users/btsui/Data/METAMAP/deploy_ready/syn11415787/meta_data/technical_meta_df.pickle')
sra_dump[sra_dump.ScientificName=='Homo_sapiens']
Sample | percent_reads_aligned | total_num_reads_aligned | ScientificName | Run | Experiment | Study | LibraryStrategy | proj_accession_BioProject | proj_accession_BioSample | proj_accession_Spots | proj_accession_Bases | proj_accession_Loaded | proj_accession_Visibility | proj_accession_Center | proj_accession_Type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Run |
sra_dump.ScientificName.value_counts()
Mus musculus 113073 Name: ScientificName, dtype: int64
!cp ../../Scratch/MaskingGenomeWithSnp.ipynb .
!ls -alt ./tmp.tcga.txt.gz
-rw-r--r-- 1 btsui users 4873 Jan 24 13:27 ./tmp.tcga.txt.gz
#!mv ./2b0048e0-a062-40d2-a1e1-4bb763ea0ead.tcga.txt.gz ../Analysis/.