#!/usr/bin/env python
# coding: utf-8
"""Tabulate how many human SNPs each UCSC chain file lifted over.

Every file in ``liftOverDir`` whose name does not contain ``unmapped`` is read
as a headerless, tab-separated table with the columns in ``colOrder`` and its
rows are counted.  The counts are divided by one shared total (lifted plus
unmapped records of a single reference file) and the resulting table is
written to ``outputCsv``, sorted by the number of lifted SNPs, largest first.

NOTE(review): a single shared total is only meaningful if every chain was run
on the same input SNP set -- TODO confirm.

The notebook this script was exported from ended with exploratory cells
(``.head()`` previews, a ``get_ipython().system('head ...')`` call and
``liftOverDf.columns=['']``).  They are intentionally not reproduced here:
``get_ipython`` does not exist outside IPython and assigning one label to a
four-column frame raises, so they aborted any non-interactive run.
"""

import os

import pandas as pd

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is cosmetic; fall back to plain iteration without it.
    def tqdm(iterable, *args, **kwargs):
        """Return *iterable* unchanged when tqdm is not installed."""
        return iterable


# Directory holding the lifted-over and ``unmapped`` files, one pair per chain.
liftOverDir = '/cellar/users/btsui/Data/ucsc_chains_human_homo_snps/'

# Column labels applied to both the lifted-over and the unmapped records.
colOrder = ['Chr', 'Start', 'End', 'Rs']

# Destination of the summary table.
outputCsv = './Data/Human_Snps_Lifted_Over_efficiency.csv'


def read_lifted(path):
    """Read one lifted-over file into a DataFrame labelled with ``colOrder``.

    An empty file is treated as "nothing lifted over" and yields an empty
    frame instead of raising ``pandas.errors.EmptyDataError``.
    """
    try:
        lifted = pd.read_csv(path, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        return pd.DataFrame(columns=colOrder)
    lifted.columns = colOrder
    return lifted


def read_unmapped(path):
    """Read the records of one ``unmapped`` file into a DataFrame.

    Only the lines at odd 0-based positions are kept and split on tabs; the
    even-positioned lines are skipped (presumably liftOver's ``#reason``
    annotation preceding each record -- verify against the files).  A file
    without any record yields an empty frame.
    """
    try:
        # Default comma separator on purpose: each whole line lands in
        # column 0 so that it can be split on tabs afterwards.
        lines = pd.read_csv(path, header=None)[0]
    except pd.errors.EmptyDataError:
        return pd.DataFrame(columns=colOrder)
    records = lines[(lines.index % 2) == 1].str.split('\t', expand=True)
    if records.empty:
        return pd.DataFrame(columns=colOrder)
    records.columns = colOrder
    return records


def build_efficiency_table(lift_dir=liftOverDir):
    """Return the per-chain lift-over summary for the files in *lift_dir*.

    Parameters:
        lift_dir: directory containing the lifted-over files together with
            their ``.human_homo.unmapped.bed`` companions.  A trailing path
            separator is optional.

    Returns:
        A DataFrame with the columns ``Ucsc Chain Filename``,
        ``Number of Human Snps Lifted Over`` and
        ``Percent of Human Snps Lifted Over``, sorted by the count in
        descending order.  Despite its name, the last column holds the plain
        ratio count / total, not a value scaled to 100.

    Raises:
        ValueError: if *lift_dir* contains no lifted-over file.
    """
    # Sorted so that the reference file below does not depend on the
    # arbitrary order returned by os.listdir.
    lifted_names = sorted(
        name for name in os.listdir(lift_dir) if 'unmapped' not in name
    )
    if not lifted_names:
        raise ValueError('no lifted-over files found in %r' % (lift_dir,))

    lifted_counts = {}
    reference_path = None
    reference_lifted = None
    for name in tqdm(lifted_names):
        reference_path = os.path.join(lift_dir, name)
        reference_lifted = read_lifted(reference_path)
        lifted_counts[name] = reference_lifted.shape[0]

    # The last file read is the reference for the shared denominator:
    # its lifted records plus the records of its unmapped companion.
    unmapped_path = reference_path.replace(
        '.human_homo.bed', '.human_homo.unmapped.bed'
    )
    unmapped = read_unmapped(unmapped_path)
    # count() ignores rows with a missing 'Chr', exactly like the
    # groupby('Chr').size().sum() it replaces.
    total = reference_lifted['Chr'].count() + unmapped['Chr'].count()

    table = pd.Series(lifted_counts).reset_index()
    table.columns = ['Ucsc Chain Filename', 'Number of Human Snps Lifted Over']
    table['Percent of Human Snps Lifted Over'] = (
        table['Number of Human Snps Lifted Over'] / total
    )
    return table.sort_values(
        'Number of Human Snps Lifted Over', ascending=False
    )


if __name__ == '__main__':
    ucscChainEfficiency = build_efficiency_table(liftOverDir)
    ucscChainEfficiency.to_csv(outputCsv, index=False)