Purpose of this script:
dbSNP use the reference strand to annnotate.
Map genomic corrdinates with known human variants to other species (Like mouse)
Input: Variant table
import pandas as pd
import os
import gzip
import re
from tqdm import tqdm
%matplotlib inline
pattern='VP=0x\d{4}(\d{2})'
prog = re.compile(pattern)
inDbDir='/data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.vcf.gz'
outDbDir=inDbDir.replace('.vcf.gz','.f1_byte2_not_00.vcf.gz')
humanDf=pd.read_csv(outDbDir,sep='\t',header=None)
/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
#!gunzip -c /data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.vcf.gz |head -n 20
humanDf.columns=['Chr','Loc','rs','REF','ALT','','','Annot']
## pos strand
#humanDf
!ls /cellar/users/btsui/Data/ucsc_chains/ | grep Dan
hg38ToDanRer10.over.chain.gz hg38ToDanRer11.over.chain.gz
inputDir='/cellar/users/btsui/Data/ucsc_chains/'
fileNameInFolderS=pd.Series(os.listdir(inputDir))
fileNameInFolderS[fileNameInFolderS.str.contains('mm10',case=False)]
143 hg38ToMm10.over.chain.gz dtype: object
chainFiles=fileNameInFolderS[fileNameInFolderS.str.contains('.chain.gz$')]
chainFiles=pd.concat([chainFiles[chainFiles.str.contains('mm10|DanRer',case=False)],
chainFiles[~chainFiles.str.contains('mm10|DanRer',case=False)]])
intervalLength=1
refOffset=1
humanDf['Chr_uscs']='chr'+humanDf['Chr'].astype(str)
humanDf['Loc_+1']=humanDf['Loc']+intervalLength-refOffset
humanDf['Loc_-1']=humanDf['Loc']-refOffset
humanDf['Chr_uscs']=humanDf['Chr_uscs'].str.replace('chrMT','chrM')
humanDf['Score']='.'
humanDf['Strand']='+'
humanDf[['Chr_uscs','Loc_-1','Loc_+1','rs','Score','Strand']
].to_csv('tmp.human.ucsc.bed',sep='\t',header=None,index=None)
#humanDf[['Chr','Loc_-1','Loc_+1','rs']].to_csv('tmp.human.ensembl.bed',sep='\t',header=None,index=None)
#!head tmp.human.ucsc.bed
for chainFile in tqdm(chainFiles):
outDir='/cellar/users/btsui/Data/ucsc_chains_human_homo_snps/{}'.format(chainFile+'.human_homo.bed')
unmappedOutDir='/cellar/users/btsui/Data/ucsc_chains_human_homo_snps/{}'.format(chainFile+'.human_homo.unmapped.bed')
liftOverCmd="liftOver {} {} {} {}".format('tmp.human.ucsc.bed',
inputDir+chainFile,
outDir,unmappedOutDir)
print (os.system(liftOverCmd))
1%| | 1/157 [00:01<03:20, 1.29s/it]
0
1%|▏ | 2/157 [00:03<04:30, 1.74s/it]
0
2%|▏ | 3/157 [08:08<6:58:07, 162.91s/it]
0
3%|▎ | 4/157 [08:10<5:12:26, 122.52s/it]
0
3%|▎ | 5/157 [08:19<4:13:16, 99.98s/it]
0
4%|▍ | 6/157 [08:21<3:30:23, 83.60s/it]
0
4%|▍ | 7/157 [08:34<3:03:46, 73.51s/it]
0
5%|▌ | 8/157 [09:09<2:50:31, 68.67s/it]
0
6%|▌ | 9/157 [16:18<4:28:06, 108.69s/it]
0
6%|▋ | 10/157 [16:30<4:02:46, 99.09s/it]
0
7%|▋ | 11/157 [29:17<6:28:48, 159.78s/it]
0
8%|▊ | 12/157 [46:33<9:22:34, 232.79s/it]
0
#!mkdir /cellar/users/btsui/Data/ucsc_chains_human_homo_snps/
asdasd
#os.system(liftOverCmd)
humanDf[['Chr','Loc_-1','Loc','rs']].to_csv('tmp.human.ensembl.bed',sep='\t',header=None,index=None)
!bedtools getfasta -fi /cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa -bed tmp.human.ensembl.bed -fo tmp.human.ucsc.bed.fasta
tmpS=pd.read_csv('tmp.human.ucsc.bed.fasta',header=None)[0]
extractedDf=pd.DataFrame({'Loc':tmpS[(tmpS.index%2)==0].values,
'Nucleotide':tmpS[(tmpS.index%2)==1].values})
extractedDf['Chr']=extractedDf.Loc.str.extract('(\w+):')
extractedDf['Site']=extractedDf.Loc.str.extract('\w+:\d+-(\d+)')
extractedDf['Chr-Site']=extractedDf['Chr']+'-'+extractedDf['Site'].astype(str)
chrToPosN=extractedDf.set_index(['Chr-Site'])['Nucleotide']
chrToPosN=chrToPosN.groupby(chrToPosN.index).first()
humanDf.head()
Chr | Loc | rs | REF | ALT | Annot | Chr_uscs | Loc_+1 | Loc_-1 | |||
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 14727 | rs1045587 | G | A | . | . | RS=1045587;RSPOS=14727;RV;dbSNPBuildID=117;SSR... | chr1 | 14727 | 14726 |
1 | 1 | 630825 | rs9783068 | T | C | . | . | RS=9783068;RSPOS=630825;dbSNPBuildID=119;SSR=1... | chr1 | 630825 | 630824 |
2 | 1 | 630833 | rs9701099 | C | T | . | . | RS=9701099;RSPOS=630833;dbSNPBuildID=119;SSR=1... | chr1 | 630833 | 630832 |
3 | 1 | 817186 | rs3094315 | G | A | . | . | RS=3094315;RSPOS=817186;RV;dbSNPBuildID=103;SS... | chr1 | 817186 | 817185 |
4 | 1 | 833068 | rs12562034 | G | A | . | . | RS=12562034;RSPOS=833068;dbSNPBuildID=120;SSR=... | chr1 | 833068 | 833067 |
humanDf['Chr-Site']=humanDf['Chr'].astype(str)+'-'+humanDf['Loc'].astype(str)#chrToPosN[extractedDf['Chr-Site']].values
#posStrandDf=humanDf[~humanDf.Annot.str.contains(';RV;')]
humanDf['Pos_N']=chrToPosN.loc[humanDf['Chr-Site']].values
humanDf.groupby(['Pos_N','REF']).size().sort_values()
Pos_N REF G GCCTCCAGCCCCAGCT 1 GTGTAAACTCAGAAA 1 GTGTACCCTT 1 GTGTCAT 1 GTGTGCCA 1 GTGTGGGGCGGAGACTT 1 GTGTGT 1 GTGTGTCAGC 1 GTGTTCCAAGGGGACAGTAGCCCC 1 GTTA 1 GTTAAA 1 GTTAAATCCA 1 GTTAAGGAA 1 GTTACAAT 1 GTTACAATTTACTGGCA 1 GTTACAGGAACT 1 GTTACC 1 GTTAGATCC 1 GTTCTCCTCATTGAAAAAGA 1 GTTCTC 1 GTTCTACAACAAGAGCGAGGAT 1 GTTCCT 1 GTTCCATA 1 GTTCAGCT 1 GTGTA 1 GTTCACC 1 GTTATTTTATT 1 GTTATTT 1 GTTATT 1 GTTATC 1 ... T TTC 103 TCA 112 A AAG 118 C CAA 131 CCT 154 CAT 160 A ACT 170 C CTG 258 CTT 263 CAG 308 CG 431 G GT 487 T TA 531 A AT 637 G GC 688 GA 730 A AC 817 C CA 822 T TC 833 TG 883 A AG 910 C CT 941 N A 14995 T 15146 G 18302 C 18431 T T 57432 A A 57962 G G 94487 C C 94699 Length: 5083, dtype: int64
#tmpS.str.extract('(\d+):\d+-(\d+)',expand=True)
#posStrandDf
tmpS
#!head tmp.mouse.bed
## remove 1000 when not testing
liftOverBedDf=pd.read_csv('tmp.mouse.bed',sep='\t',nrows=100000,header=None)
liftOverBedDf.columns=['Chr_uscs','Start','End','rs']
###slice out the region for mouse with name
liftOverBedDf['Chr_ensmbl']=liftOverBedDf['Chr_uscs'].str.replace('^chr','')
liftOverBedDf['Start']=liftOverBedDf['Start']
liftOverBedDf['End']=liftOverBedDf['End']
liftOverBedDf[['Chr_ensmbl','Start','End','rs']].to_csv('tmp.specie.ensembl.bed',sep='\t',header=None,index=None)
specieFastaDir='/cellar/users/btsui/Data/ensembl/release/fasta/Mus_musculus.GRCm38.dna_rm.toplevel.fa'
bedInterCmd="bedtools getfasta -name -fi {} -bed tmp.specie.ensembl.bed > liftOver.fasta".format(specieFastaDir)
#bedInterCmdHuman="bedtools getfasta -name -fi {} -bed tmp.human.ucsc.bed > liftOver.human.fasta".format(humanFastaDir)
#!head liftOver.human.fasta
#!bedtools getfasta -name -fi /cellar/users/btsui/Data/ensembl/release/fasta/Homo_sapiens.GRCh38.dna_rm.toplevel.fa -bed tmp.human.ucsc.bed > liftOver.human.fasta
os.system(bedInterCmdHuman)
!bedtools getfasta -name -fi /cellar/users/btsui/Data/ensembl/release/fasta/Mus_musculus.GRCm38.dna_rm.toplevel.fa -bed tmp.specie.ensembl.bed > liftOver.fasta
os.system(bedInterCmd)
#pd.read_csv('liftOver.human.fasta')
myFastaS=pd.read_csv( 'liftOver.fasta',header=None)[0]
rsToSeqDf=pd.DataFrame({'rs':myFastaS[(myFastaS.index%2)==0].str.replace('>','').values,
'Seq':myFastaS[(myFastaS.index%2)==1].values})
rsToSeqSpecie=rsToSeqDf.set_index('rs')['Seq']
revS=pd.Series({'A':'T','T':'A','C':'G','G':'C'})
rsToSeqSpecie_corrected=pd.Series(data=revS[rsToSeqSpecie].values,index=rsToSeqSpecie.index)
humanDf['ref_sepcie']=rsToSeqSpecie_corrected[humanDf['rs']].values
#withValidDf.Chr
withValidDf=humanDf[humanDf['ref_sepcie'].isin(['A','C','G','T'])&(humanDf.Chr==3)]
(withValidDf['ref_sepcie']==withValidDf['REF']).mean()
#withValidDf#.mean()
#(withValidDf['REF']==withValidDf['ref_sepcie'])[withValidDf['Loc']<100000000].astype(int).reset_index().plot(x='index',y=0,kind='scatter',alpha=0.5)
#withValidDf.Chr
#withValidDf[]
(withValidDf['REF']==withValidDf['ref_sepcie'])[withValidDf['Loc']<100000000].astype(int).reset_index().plot(x='Loc',y=0,kind='scatter',alpha=0.5)
#(withValidDf['REF']==withValidDf['specie_ref_seq']).mean()
withValidDf[(withValidDf['REF']==withValidDf['specie_ref_seq'])]
#withValidDf
revS
myDict={}
for i in range(0,50,1):
myDict[i]={'rev':(withValidDf['specie_ref_seq'].str[i]==revS[withValidDf['REF']].values).mean(),
'pos':(withValidDf['specie_ref_seq'].str[i]==withValidDf['REF'].values).mean()
}
withValidDf['rev_ref']=revS[withValidDf['REF']].values
#withValidDf[withValidDf.specie_ref_seq.str[0]!=revS[withValidDf['REF']].values]
#withValidDf
pd.DataFrame(myDict).T#.plot()
i=25
#pd.Series(myDict)
tmpDf=pd.DataFrame( {'specie':withValidDf['specie_ref_seq'].str[i].values,'ref':withValidDf['REF'].values})
tmpDf['rev_ref']=revS[tmpDf['ref']].values
(tmpDf['rev_ref']==tmpDf['specie']).mean()
pd.Series(myDict
).plot()
"""myDict={}
for i in range(0,100,1):
myDict[i]=(withValidDf['specie_ref_seq'].str[i]).value_counts()"""
#withValidDf['specie_ref_seq'].str.len()
pd.DataFrame(myDict).T.plot()
pd.Series(myDict)#.plot()
!head tmp.human.ensembl.bed
#!bedtools getfasta -fi /cellar/users/btsui/Data/ensembl/release/fasta/Mus_musculus.GRCm38.dna_rm.toplevel.fa -bed tmp.human.ensembl.bed > liftOver.bed
#!getFastaFromBed -fi
#getFastaFromBed [OPTIONS] -fi <input FASTA> -bed <BED/GFF/VCF>
## get the fasta
#liftOverBedDf
!wc -l tmp.mouse.bed
!wc -l unMapped
#!liftOver