import pandas as pd
import os
from tqdm import tqdm
#!cat /cellar/users/btsui/Data/ucsc_chains_human_homo_snps/hg38ToMm10.over.chain.gz.human_homo.bed | head
# Directory of liftOver results. For each UCSC chain file it is expected to
# hold a '<chain>.human_homo.bed' file plus a sibling
# '<chain>.human_homo.unmapped.bed' file (the path of the latter is derived
# from the former inside the loop).
liftOverDir='/cellar/users/btsui/Data/ucsc_chains_human_homo_snps/'
fnames=pd.Series(os.listdir(liftOverDir))
# Iterate only over the lifted-over outputs; anything with 'unmapped' in its
# name is skipped here.
inFnameS=fnames[~fnames.str.contains('unmapped')]
# Labels for the four tab-separated columns. Loop-invariant, so built once.
colOrder=['Chr','Start','End','Rs']
# Maps chain filename -> number of rows in its lifted-over BED file.
myDict={}
for fname in tqdm(inFnameS):
    liftOverFDir=os.path.join(liftOverDir,fname)
    # Kept as a module-level name: later cells read the value left by the
    # final iteration.
    liftUnmappedDir=liftOverFDir.replace('.human_homo.bed','.human_homo.unmapped.bed')
    liftOverDf=pd.read_csv(liftOverFDir,sep='\t',header=None)
    # Assigning the labels raises ValueError if the file does not have
    # exactly four columns, which doubles as a format check.
    liftOverDf.columns=colOrder
    myDict[fname]=len(liftOverDf)
100%|██████████| 157/157 [01:07<00:00, 2.32it/s]
# Read the unmapped-records file as raw text: with the default comma
# separator each tab-delimited line stays unsplit in column 0.
unmappedS=pd.read_csv(liftUnmappedDir,header=None)[0]
# Keep rows 1, 3, 5, ... and split them on tabs.
# NOTE(review): this assumes the file strictly alternates a '#...' line with
# a record line, as in the `head` output recorded in this notebook - confirm
# for every chain.
unmappedDf=unmappedS.iloc[1::2].str.split('\t',expand=True)
unmappedDf.columns=colOrder
# Per-chromosome row counts summed back together, for whichever liftOverDf
# and unmapped file are current when this cell runs.
mapped=liftOverDf.groupby('Chr').size().sum()
total=mapped+unmappedDf.groupby('Chr').size().sum()
#liftOverDf
# Build the per-chain summary: one row per chain filename with its row count.
countCol='Number of Human Snps Lifted Over'
tmpDf=(
    pd.Series(myDict)
    .rename_axis('Ucsc Chain Filename')
    .reset_index(name=countCol)
)
# The value stored is count / total, i.e. a fraction, even though the column
# label says "Percent".
# NOTE(review): `total` is one scalar applied to every chain - presumably all
# chains were run on the same input SNP set; confirm.
tmpDf['Percent of Human Snps Lifted Over']=tmpDf[countCol].div(total)
# Chains with the most lifted-over rows first.
ucscChainEfficiency=tmpDf.sort_values(by=countCol,ascending=False)
ucscChainEfficiency.to_csv('./Data/Human_Snps_Lifted_Over_efficiency.csv',index=False)
#!head ./Data/Human_Snps_Lifted_Over_efficiency.csv
# Display the first rows of unmappedDf.
# NOTE(review): the recorded run of this cell raised NameError because
# unmappedDf was not defined in that session.
unmappedDf.head()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-19-8002f020a79e> in <module>() ----> 1 unmappedDf.head() NameError: name 'unmappedDf' is not defined
# IPython shell escape: run `head` on the path held in liftUnmappedDir to
# inspect the raw layout of that file.
!head $liftUnmappedDir
#Deleted in new chr1 817185 817186 rs3094315 #Deleted in new chr1 833067 833068 rs12562034 #Deleted in new chr1 842132 842133 rs6672353 #Deleted in new chr1 843941 843942 rs4040617 #Deleted in new chr1 850608 850609 rs2980300
# NOTE(review): scratch cell - this raises ValueError (length mismatch)
# because the frame has 4 columns and only one label is supplied; see the
# recorded traceback. Intent unclear; confirm before keeping.
liftOverDf.columns=['']
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-35-2642342195d6> in <module>() ----> 1 liftOverDf.columns=[''] ~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in __setattr__(self, name, value) 4387 try: 4388 object.__getattribute__(self, name) -> 4389 return object.__setattr__(self, name, value) 4390 except AttributeError: 4391 pass pandas/_libs/properties.pyx in pandas._libs.properties.AxisProperty.__set__() ~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in _set_axis(self, axis, labels) 644 645 def _set_axis(self, axis, labels): --> 646 self._data.set_axis(axis, labels) 647 self._clear_item_cache() 648 ~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in set_axis(self, axis, new_labels) 3321 raise ValueError( 3322 'Length mismatch: Expected axis has {old} elements, new ' -> 3323 'values have {new} elements'.format(old=old_len, new=new_len)) 3324 3325 self.axes[axis] = new_labels ValueError: Length mismatch: Expected axis has 4 elements, new values have 1 elements
# Display the first rows of liftOverDf as it currently stands in the session.
liftOverDf.head()
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | chr17 | 66119104 | 66119105 | rs1045587 |
1 | chrM | 5085 | 5086 | rs9783068 |
2 | chrM | 5094 | 5095 | rs9701099 |
3 | chr4 | 156244135 | 156244136 | rs267598747 |
4 | chr4 | 156239438 | 156239439 | rs267598748 |
#!ls -laht /cellar/users/btsui/Data/ucsc_chains_human_homo_snps/
#!ls -lah /cellar/users/btsui/Data/ucsc_chains_human_homo_snps/hg38ToMm10.over.chain.gz.human_homo.bed
# Read the hg38ToMm10 lifted-over BED file as headerless, tab-separated data.
# The result is not assigned, so this cell only displays the frame.
pd.read_csv('/cellar/users/btsui/Data/ucsc_chains_human_homo_snps/hg38ToMm10.over.chain.gz.human_homo.bed',sep='\t',header=None)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | chr17 | 66119104 | 66119105 | rs1045587 |
1 | chrM | 5085 | 5086 | rs9783068 |
2 | chrM | 5094 | 5095 | rs9701099 |
3 | chr4 | 156244135 | 156244136 | rs267598747 |
4 | chr4 | 156239438 | 156239439 | rs267598748 |
5 | chr4 | 156225156 | 156225157 | rs267598759 |
6 | chr4 | 156199912 | 156199913 | rs786201005 |
7 | chr4 | 156199827 | 156199828 | rs1921 |
8 | chr4 | 156199739 | 156199740 | rs672601345 |
9 | chr4 | 156199696 | 156199697 | rs672601312 |
10 | chr4 | 156197369 | 156197370 | rs115173026 |
11 | chr4 | 156197365 | 156197366 | rs1057523287 |
12 | chr4 | 156197347 | 156197348 | rs201073369 |
13 | chr4 | 156195516 | 156195517 | rs115704555 |
14 | chr4 | 156195480 | 156195481 | rs756623659 |
15 | chr4 | 156195445 | 156195446 | rs6657048 |
16 | chr4 | 156195392 | 156195393 | rs879253787 |
17 | chr4 | 156179593 | 156179594 | rs544749044 |
18 | chr4 | 156179320 | 156179321 | rs191270495 |
19 | chr4 | 156179311 | 156179312 | rs536085218 |
20 | chr4 | 156179276 | 156179277 | rs200607541 |
21 | chr4 | 156179245 | 156179246 | rs113789806 |
22 | chr4 | 156178902 | 156178903 | rs587777299 |
23 | chr4 | 156178901 | 156178902 | rs150359724 |
24 | chr4 | 156178836 | 156178837 | rs138031468 |
25 | chr4 | 156178640 | 156178641 | rs2799066 |
26 | chr4 | 156178454 | 156178455 | rs879253788 |
27 | chr4 | 156178405 | 156178406 | rs2710876 |
28 | chr4 | 156177431 | 156177432 | rs116586548 |
29 | chr4 | 156177391 | 156177392 | rs147346337 |
... | ... | ... | ... | ... |
280826 | chr8 | 22333653 | 22333654 | rs207482105 |
280827 | chr8 | 22333667 | 22333668 | rs207482106 |
280828 | chr8 | 22333742 | 22333743 | rs207482107 |
280829 | chr8 | 22333747 | 22333748 | rs207482108 |
280830 | chr8 | 22333798 | 22333799 | rs207482109 |
280831 | chr8 | 22334439 | 22334440 | rs207482111 |
280832 | chr8 | 22334458 | 22334459 | rs207482112 |
280833 | chr8 | 22334590 | 22334591 | rs207482113 |
280834 | chr8 | 22334699 | 22334700 | rs207482114 |
280835 | chr8 | 22334842 | 22334843 | rs207482115 |
280836 | chr8 | 22335006 | 22335007 | rs207482116 |
280837 | chr8 | 22336376 | 22336377 | rs207482117 |
280838 | chr8 | 22339437 | 22339438 | rs207482131 |
280839 | chr8 | 22339469 | 22339470 | rs207482132 |
280840 | chr8 | 22345073 | 22345074 | rs207482133 |
280841 | chr8 | 22347953 | 22347954 | rs207482136 |
280842 | chr8 | 22349866 | 22349867 | rs207482155 |
280843 | chr8 | 22349895 | 22349896 | rs207482156 |
280844 | chr8 | 22351725 | 22351726 | rs207482159 |
280845 | chr8 | 22360231 | 22360232 | rs207482168 |
280846 | chr8 | 22360258 | 22360259 | rs207482169 |
280847 | chr8 | 22360273 | 22360274 | rs207482170 |
280848 | chr8 | 22360399 | 22360400 | rs207482171 |
280849 | chr8 | 22360419 | 22360420 | rs207482172 |
280850 | chr8 | 22360479 | 22360480 | rs207482173 |
280851 | chr8 | 22360993 | 22360994 | rs207482175 |
280852 | chr8 | 22361088 | 22361089 | rs207482176 |
280853 | chr8 | 22361219 | 22361220 | rs207482177 |
280854 | chr8 | 22361240 | 22361241 | rs207482178 |
280855 | chr8 | 22370508 | 22370509 | rs207482182 |
280856 rows × 4 columns