#!/usr/bin/env python # coding: utf-8 # In[ ]: import pandas as pd import gzip import re get_ipython().run_line_magic('matplotlib', 'inline') # In[ ]: pattern='VP=0x\d{4}(\d{2})' prog = re.compile(pattern) inDbDir='/data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.vcf.gz' outDbDir=inDbDir.replace('.vcf.gz','.f1_byte2_not_00.vcf.gz') # number of snps retained # In[3]: get_ipython().system('gunzip -c /data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.vcf.gz |wc -l') # In[4]: get_ipython().system('gunzip -c /data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.vcf.gz| wc -l') # ### identify snps with reference # In[ ]: ### TEST=False with gzip.open(inDbDir, 'rb') as f: with gzip.open(outDbDir,'wb') as wf: for i,l in enumerate(f): if l[0]!='#': f1_byte2=prog.findall(l)[0] if f1_byte2!='00': wf.write(l) if TEST and (i>100000): break if (i%(10**6))==0: print i # In[ ]: #inDbDir='/data/cellardata/users/btsui/dbsnp/All_20170710.vcf.gz' #outDbDir=inDbDir.replace('.vcf.gz','.f1_byte2_not_00.vcf.gz') # ### identify snp window # In[27]: tmpDf=pd.read_csv(outDbDir,sep='\t',header=None) tmpDf.columns=['Chr','Loc','rs','REF','ALT','','','Annot'] # In[7]: ### take only the non- #give it 1000 window_size=1000 tmpDf['Start']=tmpDf['Loc']-window_size tmpDf.loc[(tmpDf['Start']<0),'Start']=0 tmpDf['End']=tmpDf['Loc']+window_size # In[8]: tmpDf2=tmpDf[['Chr','Start','End']] tmpDf2.to_csv('extracting_region.bed',sep='\t',header=None,index=None) # In[5]: #!head /cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa.fai # In[46]: myCleanFaDir='/cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa' os.system(' samtools faidx '+ myCleanFaDir) # In[49]: #bedtools complement -i -g myFai=pd.read_csv(myCleanFaDir+'.fai',sep='\t',header=None) myFai[[0,1]].to_csv('genome',sep='\t',header=None,index=None) # In[53]: get_ipython().system('bedtools complement -i extracting_region.bed -g genome > complement.txt') # ### mask out the rest of the genome # In[6]: complementDf=pd.read_csv('complement.txt',sep='\t',header=None) # In[5]: get_ipython().system('bedtools --version') # In[56]: #!rm -r /data/cellardata/users/btsui/dbsnp/snp_bed/ # In[57]: get_ipython().system('mkdir /data/cellardata/users/btsui/dbsnp/snp_beds') # In[58]: tmpDf[['Chr','Loc','Loc']].to_csv('/data/cellardata/users/btsui/dbsnp/snp_beds/Homo_sapiens.bed', sep='\t',header=None,index=None) # In[59]: #!wc -l /data/cellardata/users/btsui/dbsnp/snp_beds/Homo_sapiens.bed # In[60]: #! maskFastaFromBed # In[76]: #!rm -r fifo pipe # In[102]: get_ipython().system('rm pipe') get_ipython().system('mkfifo pipe') # In[103]: import os # In[104]: os.system('maskFastaFromBed -fi /cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa -bed complement.txt -fo pipe &') # In[106]: specie='Homo_sapiens' # In[116]: get_ipython().system('ls -alh /cellar/users/btsui/Data/ensembl/snp_masked/') # In[15]: outDir='/cellar/users/btsui/Data/ensembl/snp_masked/' # In[108]: os.system('gzip -c pipe > '+outDir+specie+'.fa.gz') # In[94]: #!rm /cellar/users/btsui/Data/ensembl/snp_masked/* # In[25]: #!ls -lah /cellar/users/btsui/Data/ensembl/snp_masked/ # In[67]: #run bowtie myDir='/cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/Homo_sapiens/' faDir='/cellar/users/btsui/Data/ensembl/snp_masked/Homo_sapiens.GRCh38.dna_rm.toplevel.SNP_masked.fa' cmd= 'bowtie2-build --threads 48 '+faDir+' '+myDir cmd # In[68]: #!cp Homo_sapiens.GRCh38.dna_rm.toplevel.SNP_masked.fa /cellar/users/btsui/Data/ensembl/snp_masked/. # In[282]: #IDH1 """ Chromosome:2 Start:208,236,227 bp from pterEnd:208,266,074 bp from pter """ """ subDf=complementDf[(complementDf.Chr=='2')] subDf['dist_idh']=(subDf['Start']-208236227).abs() subDf.sort_values('dist_idh')""" # In[7]: spaceMasked=(complementDf.End-complementDf.Start) # In[265]: np.log10((spaceMasked.sum())) # In[267]: import cPickle as pickle import bz2 db = pickle.load(bz2.BZ2File('db_v20/mpa_v20_m200.pkl', 'r')) # In[8]: specie='Mus_musculus' # In[10]: from Bio import SeqIO # In[12]: #record = SeqIO.read("single.fasta", "fasta") # In[13]: #!cp ./Homo_sapiens.GRCh38.dna_rm.toplevel.SNP_masked.fa.gz /cellar/users/btsui/Data/ensembl/snp_masked/ # In[22]: get_ipython().system('ls -laht /cellar/users/btsui/Data/ensembl/snp_masked/| head') # In[100]: # In[101]: i=208248388 #0 3134601 str(record.seq)[i-1000:i+1000] # In[23]: #!head /cellar/users/btsui/Data/ensembl/snp_masked/Mus_musculus.fa.gz # In[364]: str(record.seq)[i-10:i+10] # In[113]: proteome='/cellar/users/btsui/Downloads/viral.2.protein.faa' with open(proteome) as handle: for record in SeqIO.parse(handle, "fasta") : print (len(record))*3 break # In[114]: record # In[115]: get_ipython().system('ls -lah /cellar/users/btsui/Downloads/viral.2.protein.faa') # #### create ids for the bed # In[13]: import pandas as pd import numpy as np # In[36]: tmpBedDf=pd.read_csv('/data/cellardata/users/btsui/dbsnp/snp_beds/Homo_sapiens.bed',header=None,sep='\t') # In[38]: tmpBedDf[0].astype(np.str).unique() # In[15]: tmpBedDf['Chr']=tmpBedDf['Chr'].astype(np.str) tmpBedDf['Pos']=tmpBedDf['Pos'].astype(np.str) # In[18]: tmpBedDf.columns=['Chr','Pos',''] tmpBedDf['Chr_Pos']=tmpBedDf['Chr']+'-'+tmpBedDf['Pos'] # In[20]: tmpBedDf['Id']=tmpBedDf.index # In[26]: tmpBedDf[['Chr_Pos','Id']].drop_duplicates(['Chr_Pos']).to_pickle('/data/cellardata/users/btsui/dbsnp/snp_beds/Homo_sapiens_chrom_pos__id.pickle') # In[33]: index=tmpDf.Chr.value_counts() # In[35]: index.astype(np.str) # In[ ]: