import pandas as pd
import gzip
import re
%matplotlib inline
pattern='VP=0x\d{4}(\d{2})'
prog = re.compile(pattern)
inDbDir='/data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.vcf.gz'
outDbDir=inDbDir.replace('.vcf.gz','.f1_byte2_not_00.vcf.gz')
number of snps retained
!gunzip -c /data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.vcf.gz |wc -l
325174853
!gunzip -c /data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.vcf.gz| wc -l
325174853
###
TEST=False
with gzip.open(inDbDir, 'rb') as f:
with gzip.open(outDbDir,'wb') as wf:
for i,l in enumerate(f):
if l[0]!='#':
f1_byte2=prog.findall(l)[0]
if f1_byte2!='00':
wf.write(l)
if TEST and (i>100000):
break
if (i%(10**6))==0:
print i
#inDbDir='/data/cellardata/users/btsui/dbsnp/All_20170710.vcf.gz'
#outDbDir=inDbDir.replace('.vcf.gz','.f1_byte2_not_00.vcf.gz')
tmpDf=pd.read_csv(outDbDir,sep='\t',header=None)
tmpDf.columns=['Chr','Loc','rs','REF','ALT','','','Annot']
### take only the non-
#give it 1000
window_size=1000
tmpDf['Start']=tmpDf['Loc']-window_size
tmpDf.loc[(tmpDf['Start']<0),'Start']=0
tmpDf['End']=tmpDf['Loc']+window_size
tmpDf2=tmpDf[['Chr','Start','End']]
tmpDf2.to_csv('extracting_region.bed',sep='\t',header=None,index=None)
#!head /cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa.fai
myCleanFaDir='/cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa'
os.system(' samtools faidx '+ myCleanFaDir)
0
#bedtools complement -i <BED/GFF/VCF> -g <GENOME>
myFai=pd.read_csv(myCleanFaDir+'.fai',sep='\t',header=None)
myFai[[0,1]].to_csv('genome',sep='\t',header=None,index=None)
!bedtools complement -i extracting_region.bed -g genome > complement.txt
***** WARNING: MT:0-17519 exceeds the length of chromosome (MT)
complementDf=pd.read_csv('complement.txt',sep='\t',header=None)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-6-5410482d8961> in <module>() ----> 1 complementDf=pd.read_csv('complement.txt',sep='\t',header=None) NameError: name 'pd' is not defined
!bedtools --version
bedtools v2.26.0
#!rm -r /data/cellardata/users/btsui/dbsnp/snp_bed/
!mkdir /data/cellardata/users/btsui/dbsnp/snp_beds
mkdir: cannot create directory ‘/data/cellardata/users/btsui/dbsnp/snp_beds’: File exists
tmpDf[['Chr','Loc','Loc']].to_csv('/data/cellardata/users/btsui/dbsnp/snp_beds/Homo_sapiens.bed',
sep='\t',header=None,index=None)
#!wc -l /data/cellardata/users/btsui/dbsnp/snp_beds/Homo_sapiens.bed
#! maskFastaFromBed
#!rm -r fifo pipe
!rm pipe
!mkfifo pipe
import os
os.system('maskFastaFromBed -fi /cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa -bed complement.txt -fo pipe &')
0
specie='Homo_sapiens'
!ls -alh /cellar/users/btsui/Data/ensembl/snp_masked/
total 480M drwxr-xr-x 2 btsui users 5 Dec 30 12:57 . drwxr-xr-x 5 btsui users 5 Dec 26 22:58 .. -rw-r--r-- 1 btsui users 86M Dec 30 12:51 Homo_sapiens.fa.gz -rw-r--r-- 1 btsui users 3.0G Dec 30 12:51 Homo_sapiens.microbe.fa -rw-r--r-- 1 btsui users 1.7M Dec 30 12:58 Homo_sapiens.microbe.fa.fai
outDir='/cellar/users/btsui/Data/ensembl/snp_masked/'
os.system('gzip -c pipe > '+outDir+specie+'.fa.gz')
0
#!rm /cellar/users/btsui/Data/ensembl/snp_masked/*
#!ls -lah /cellar/users/btsui/Data/ensembl/snp_masked/
#run bowtie
myDir='/cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/Homo_sapiens/'
faDir='/cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.GRCh38.dna_rm.toplevel.SNP_masked.fa'
cmd= 'bowtie2-build --threads 48 '+faDir+' '+myDir
cmd
'bowtie2-build --threads 48 /cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.GRCh38.dna_rm.toplevel.SNP_masked.fa /cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/Homo_sapiens/'
#!cp Homo_sapiens.GRCh38.dna_rm.toplevel.SNP_masked.fa /cellar/users/btsui/Data/ensembl/snp_masked/.
#IDH1
"""
Chromosome:2
Start:208,236,227 bp from pterEnd:208,266,074 bp from pter
"""
"""
subDf=complementDf[(complementDf.Chr=='2')]
subDf['dist_idh']=(subDf['Start']-208236227).abs()
subDf.sort_values('dist_idh')"""
"\nsubDf=complementDf[(complementDf.Chr=='2')]\nsubDf['dist_idh']=(subDf['Start']-208236227).abs()\nsubDf.sort_values('dist_idh')"
spaceMasked=(complementDf.End-complementDf.Start)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-7-ae3fda4a1c43> in <module>() ----> 1 spaceMasked=(complementDf.End-complementDf.Start) NameError: name 'complementDf' is not defined
np.log10((spaceMasked.sum()))
10.705346589807313
import cPickle as pickle
import bz2
db = pickle.load(bz2.BZ2File('db_v20/mpa_v20_m200.pkl', 'r'))
50739547498
specie='Mus_musculus'
from Bio import SeqIO
#record = SeqIO.read("single.fasta", "fasta")
#!cp ./Homo_sapiens.GRCh38.dna_rm.toplevel.SNP_masked.fa.gz /cellar/users/btsui/Data/ensembl/snp_masked/
!ls -laht /cellar/users/btsui/Data/ensembl/snp_masked/| head
total 806M drwxr-xr-x 2 btsui users 7 Oct 28 16:20 . -rw-r--r-- 1 btsui users 62M Oct 28 16:20 Mus_musculus.fa.gz -rw-r--r-- 1 btsui users 119M Mar 2 2018 Mus_musculus.microbe.fa.gz drwxr-xr-x 6 btsui users 6 Jan 1 2018 .. -rw-r--r-- 1 btsui users 869K Dec 30 2017 Homo_sapiens.microbe.fa.fai -rw-r--r-- 1 btsui users 3.2G Dec 30 2017 Homo_sapiens.microbe.fa -rw-r--r-- 1 btsui users 86M Dec 30 2017 Homo_sapiens.fa.gz
Homo_sapiens.fa.gz
i=208248388
#0 3134601
str(record.seq)[i-1000:i+1000]
'GGCAGGAAGGAATGCTTTAGTAAGGCTGCTTTCAACTACCGAGTCAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGTTTTTTCTTTTATGCATGATGGGATCATGTTTAATACAATCTTTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGAATAAAAGGATAAAGGNNNNNNNNNNNNNNNNNNNNNNNNNNNTGTGTTGAGATGGACGCCTATTTGTAAGTTTATTTGTATTTGCCTTTAGCTAAATGTGTGTAAATATACAGTTATACATATATGCATTTCTCAATTTCATACCTTGCTTAATGGGTGTAGATACCAAAAGATAAGAATAAAACACATACAAGTTGGAAATTTCTGGGCCATGNNNNNNNNNNCATGCAAAATCACATTATTGCCAACATGACTTACTTGATCCCCATAAGCATGACGACCTATGATGATAGGTTTTACCCATCCACTCACAAGCCGGGGGATATTTTTGCAGATAATGGCTTCTCTGAAGACCGTGCCACCCAGAATATTTCGTATGGTGCCATTTGGTGATTTCCACATTTGTTTCAACTTGAACTCCTCAACCCTCTTCTCATCAGGAGTGATAGTGGCACATTTGACGCCAACATTATGCTTCTTTATAGCTTCTGCAGCATCCTTGGTGACTTGGTCGTTGGTGGCATCACGATTCTCTATGCCTAAATCATAGCTTGAAAGAGAAAAATTAGAAGCAAAGTTTTTCAGACAAATGGATAGTTATAACCTACAACTGCAGTGATGGCATATAGAGCTCATTATGCAGAAAGCGAAGGCTCTGAGTACACCAAAATCTGCCAAGTTTTAGCACTGGCACACCCTAAACACAGAAGATGGGTGCCAACTACAAAGAGAACTAAGAGAGGCTAGACTGCTGGGCTGCCCCTCCTCTTGTAATGGGAAGCTGTCTATCAGGAAAATGAATGGAAACCATCAAACTCTGCCCTTTGCCTCCTGTTTTTCACCAGATAGAGGCAATAATGGCCTGGTTTCTCTCCATATAATTAGTATAATAAGTGTAAGTCTAAGGACCACCTTTTTCCCTTAGTAAATATTGCTCTTTAAAAATAAAAGGGGGTGGGAGGGGTTAAAGGATTTTTTGTATTTTGACGTTGAAGTGGTGGGAGGTAAAGATGTGACAGTTGATGCCAAAAATGCGAATGACTACTTCATTAATGCCAGTTAGTCATAAAATGAGCCTTTCCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCTTCTGTAATTCTTGAGCAAGTGTTTGCT'
#!head /cellar/users/btsui/Data/ensembl/snp_masked/Mus_musculus.fa.gz
str(record.seq)[i-10:i+10]
'TAAGCATGACGACCTATGAT'
proteome='/cellar/users/btsui/Downloads/viral.2.protein.faa'
with open(proteome) as handle:
for record in SeqIO.parse(handle, "fasta") :
print (len(record))*3
break
3660
record
SeqRecord(seq=Seq('MAVNTSGKTRLPQPASEDYTQYARNTLKNLNNVYEKFAVRGPVLALVRPAQFSK...GAV', SingleLetterAlphabet()), id='YP_003620396.1', name='YP_003620396.1', description='YP_003620396.1 p130 [Providence virus]', dbxrefs=[])
!ls -lah /cellar/users/btsui/Downloads/viral.2.protein.faa
-rw-r--r-- 1 btsui users 19M Dec 30 13:25 /cellar/users/btsui/Downloads/viral.2.protein.faa
import pandas as pd
import numpy as np
tmpBedDf=pd.read_csv('/data/cellardata/users/btsui/dbsnp/snp_beds/Homo_sapiens.bed',header=None,sep='\t')
tmpBedDf[0].astype(np.str).unique()
array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y', 'MT'], dtype=object)
tmpBedDf['Chr']=tmpBedDf['Chr'].astype(np.str)
tmpBedDf['Pos']=tmpBedDf['Pos'].astype(np.str)
tmpBedDf.columns=['Chr','Pos','']
tmpBedDf['Chr_Pos']=tmpBedDf['Chr']+'-'+tmpBedDf['Pos']
tmpBedDf['Id']=tmpBedDf.index
tmpBedDf[['Chr_Pos','Id']].drop_duplicates(['Chr_Pos']).to_pickle('/data/cellardata/users/btsui/dbsnp/snp_beds/Homo_sapiens_chrom_pos__id.pickle')
index=tmpDf.Chr.value_counts()
index.astype(np.str)
2 35223 1 30833 6 28444 11 23637 5 21139 3 21012 7 20244 12 17929 10 17399 16 17252 9 14998 17 14816 X 14761 19 14662 4 14607 8 13743 13 13635 15 12346 14 10423 20 7769 18 6810 22 6742 17 6050 21 4910 Y 3346 MT 512 Name: Chr, dtype: object