#!/usr/bin/env python
# coding: utf-8

# In[2]:


import pandas as pd


# In[3]:


import os


# In[4]:


from tqdm import tqdm


# In[71]:


#!cat /cellar/users/btsui/Data/ucsc_chains_human_homo_snps/hg38ToMm10.over.chain.gz.human_homo.bed | head


# In[ ]:


# Directory that holds the lifted-over BED files and their "unmapped"
# companions; every cell below reads from it.
liftOverDir='/cellar/users/btsui/Data/ucsc_chains_human_homo_snps/'


# In[11]:


# os.listdir() returns entries in arbitrary order. Sorting makes the run
# reproducible, because later cells reuse the variables left behind by the
# *last* file the loading loop visits. dtype=object keeps the .str accessor
# available even when the directory is empty.
fnames=pd.Series(sorted(os.listdir(liftOverDir)),dtype=object)
# Keep every entry whose name does not contain the literal text 'unmapped'.
inFnameS=fnames[~fnames.str.contains('unmapped',regex=False)]


# In[12]:


# Column labels applied to every lifted-over file. Defined once, outside the
# loop, because it is loop-invariant and the unmapped-file cell further down
# reuses it.
colOrder=['Chr','Start','End','Rs']

# Maps each lifted-over file name to the number of rows read from it.
myDict={}
for fname in tqdm(inFnameS):
    liftOverFDir=os.path.join(liftOverDir,fname)
    # Companion file name, derived by swapping the file-name suffix.
    liftUnmappedDir=liftOverFDir.replace('.human_homo.bed','.human_homo.unmapped.bed')

    liftOverDf=pd.read_csv(liftOverFDir,sep='\t',header=None)
    # Assigning the labels (instead of passing names=) keeps pandas' length
    # check: a file without exactly four columns fails loudly here.
    liftOverDf.columns=colOrder

    myDict[fname]=len(liftOverDf)

# NOTE: liftOverDf and liftUnmappedDir deliberately stay bound to the last
# file processed; the cells that follow read them.


# In[28]:


# Read the unmapped companion file one whole line per row: with the default
# ',' separator the tab-separated lines are not split.
# NOTE(review): assumes no line in the file contains a comma - confirm.
unmappedS=pd.read_csv(liftUnmappedDir,header=None,)[0]

# Pick the record lines by content rather than by position. The earlier
# odd-index filter only worked while '#' lines and record lines strictly
# alternated; one extra or missing '#' line shifted the parity and selected
# the wrong rows. na=False keeps the mask boolean if a line parsed as NaN.
# NOTE(review): presumably the non-record lines are '#'-prefixed reason
# lines written by liftOver - confirm against a sample file.
isCommentLine=unmappedS.str.startswith('#',na=False)
unmappedDf=unmappedS[~isCommentLine].str.split('\t',expand=True)

unmappedDf.columns=colOrder
# Rows of the last lifted-over file, counted through per-'Chr' group sizes.
mapped=liftOverDf.groupby('Chr').size().sum()


# In[29]:


# Combined row count of liftOverDf and unmappedDf, each counted through its
# per-'Chr' group sizes. Used below as the denominator for every file.
total=sum(
    frame.groupby('Chr').size().sum()
    for frame in (liftOverDf,unmappedDf)
)


# In[38]:


#liftOverDf


# In[45]:


# One row per lifted-over file: its name and the row count stored in myDict.
tmpDf=pd.DataFrame(
    list(myDict.items()),
    columns=['Ucsc Chain Filename','Number of Human Snps Lifted Over'],
)


# In[47]:


# Despite the column name, the value is stored as a fraction of `total`
# (it is not multiplied by 100).
tmpDf['Percent of Human Snps Lifted Over']=tmpDf['Number of Human Snps Lifted Over'].div(total)


# In[52]:


# Rank the files from the largest to the smallest lifted-over count.
ucscChainEfficiency=tmpDf.sort_values(by='Number of Human Snps Lifted Over',ascending=False)


# In[55]:


#!ls


# In[56]:


# Persist the ranked table as CSV, leaving out the DataFrame index column.
ucscChainEfficiency.to_csv(
    path_or_buf='./Data/Human_Snps_Lifted_Over_efficiency.csv',
    index=False,
)


# In[59]:


#!head ./Data/Human_Snps_Lifted_Over_efficiency.csv


# In[ ]:


# In[19]:


# Peek at the parsed unmapped records (only rendered when run in a notebook).
unmappedDf.head()


# In[34]:


# `!head` is an IPython shell escape, so get_ipython() exists only under
# IPython/Jupyter. Skip the preview rather than die with NameError when this
# export is executed by a plain Python interpreter.
try:
    _ipythonShell=get_ipython()
except NameError:
    _ipythonShell=None
if _ipythonShell is not None:
    _ipythonShell.system('head $liftUnmappedDir')


# In[35]:


# NOTE: the scratch statement `liftOverDf.columns=['']` that used to sit in
# this cell was removed. liftOverDf carries the four labels assigned in the
# loading loop, so assigning a single label raised ValueError (length
# mismatch) and stopped every statement below from running.


# In[ ]:


# In[21]:


# Peek at the last lifted-over file (only rendered when run in a notebook).
liftOverDf.head()


# In[72]:


#!ls -laht /cellar/users/btsui/Data/ucsc_chains_human_homo_snps/


# In[10]:


#!ls -lah /cellar/users/btsui/Data/ucsc_chains_human_homo_snps/hg38ToMm10.over.chain.gz.human_homo.bed


# In[4]:


# Ad-hoc look at the hg38ToMm10 lifted-over file, parsed as a header-less,
# tab-separated table. The result is not assigned to anything.
pd.read_csv(
    '/cellar/users/btsui/Data/ucsc_chains_human_homo_snps/hg38ToMm10.over.chain.gz.human_homo.bed',
    header=None,
    delimiter='\t',
)


# In[ ]: