#!/usr/bin/env python
# coding: utf-8

# The code in here is somewhat involved, mostly because it needs to parallelize the
# data reading process; otherwise it could be a lot simpler.
#
# parseSrr is probably the function you need to read the most.

# In[1]:


import pandas as pd
import numpy as np
import re
import os
import math
from multiprocessing import Pool

## init
mySpecie='Homo_sapiens'
outMergedDir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.h5'
## change this dir to point to the updated csv
full_meta_dir="/cellar/users/btsui/Project/METAMAP/notebook/Parsing/sra_dump.csv"
inSrrDir='/nrnb/users/btsui/Data/all_seq/snp/'
tmp_dir='/nrnb/users/btsui/Data/all_seq/tmp/'


# In[2]:


#!df -h /data/cellardata/


# In[3]:


get_ipython().run_cell_magic('time', '', "#%matplotlib inline\n##machine: 5-1\n\nfull_meta_df=pd.read_csv(full_meta_dir)\n\n#inSrrDir='/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/'\n#existingMergedDf=pd.read_pickle(outMergedDir)\n\nmySpecieDf=full_meta_df[full_meta_df['ScientificName']==mySpecie]\n\n#find the chromosomes\ntmpBedDf=pd.read_csv('/data/cellardata/users/btsui/dbsnp/snp_beds/'+mySpecie+'.bed',header=None,sep='\\t')\nunique_chroms=tmpBedDf[0].astype(np.str).unique()\n\n### start merging one by one\nif os.path.exists(tmp_dir):\n    os.system('rm -r '+tmp_dir)\nos.system('mkdir -p '+tmp_dir)\n\nos.chdir(tmp_dir)\n#identify non-empty files\nos.system('ls -la '+inSrrDir+' > ls_out.txt ')\nls_df=pd.read_csv('ls_out.txt',sep='\\s+',header=None,names=np.arange(9)).iloc[1:]\nsize_S=ls_df[4]\nm4=size_S.astype(np.int)>1000\nm5=ls_df[8].str.contains('.txt.snp.gz$')\nnon_empty_files=ls_df[m4&m5][8].str.split('/').str[-1].str.split('.').str[0].values\n")


# In[4]:


print ('# of files to merge:',len(non_empty_files))
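# In[ ]:


# Hedged sketch: an equivalent way to identify the non-empty *.txt.snp.gz files
# without parsing `ls -la` output, using os.path.getsize with the same >1000-byte
# cutoff as the cell above. Illustrative only, not the pipeline's canonical path;
# it assumes inSrrDir is a flat directory of per-run files.
non_empty_files_alt=np.array([f.split('.')[0]
                              for f in os.listdir(inSrrDir)
                              if f.endswith('.txt.snp.gz')
                              and os.path.getsize(inSrrDir+f)>1000])
print ('# of non-empty files (sketch):',len(non_empty_files_alt))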
# In[6]:


get_ipython().run_cell_magic('time', '', '"""\ngiven: srr id\nreturn: the merged file\n"""\ndef parseSrr(inSrr):\n    #print inSrr\n    fname=inSrrDir+inSrr+\'.txt.snp.gz\'\n    tmpDf_all=pd.read_csv(fname,sep=\'\\s+\',header=None,names=np.arange(50),index_col=None,error_bad_lines=False)\n    myCols=[\'Chr\',\'Pos\',\'Ref\',\'rd_all\',\'\',\'A\',\'C\',\'G\',\'T\',\'N\']\n    tmpDf=tmpDf_all.iloc[:,:len(myCols)]\n    tmpDf.columns=myCols\n    tmpDf2=tmpDf.set_index([\'Chr\',\'Pos\'])\n    myBases=[\'A\',\'C\',\'G\',\'T\']\n    myL=[]\n    for base in myBases:\n        splitL=tmpDf2[base].str.split(\':\',expand=True)\n        ### extract the read count and base quality\n        tmpDf5=splitL[[1,3]].astype(np.float)\n        tmpDf5.columns=[\'ReadDepth\',\'AverageBaseQuality\']\n        myL.append(tmpDf5)\n    tmpDf6=pd.concat(myL,keys=myBases,axis=0,names=[\'base\'])\n    tmpDf6.columns.name=\'features\'\n    mergedDf=tmpDf6.astype(np.uint16)\n    non_zero_df=mergedDf[mergedDf[\'ReadDepth\']>0]\n    tmpDf7=non_zero_df.reset_index()\n    Run_digits=re.search(\'[DES]RR(\\d+)\', inSrr)\n    Run_Db=re.search(\'([DES]RR)\\d+\', inSrr)\n    tmpDf7[\'Run_digits\']=Run_digits.group(1)\n    tmpDf7[\'Run_db\']=Run_Db.group(1)\n    ###convert the datatypes\n    tmpDf7[\'Pos\']=tmpDf7[\'Pos\'].astype(np.uint32)\n    tmpDf7[\'Run_digits\']=tmpDf7[\'Run_digits\'].astype(np.uint64)\n    tmpDf7[\'Chr\']=tmpDf7[\'Chr\'].astype(np.str).astype(\'category\',categories=unique_chroms,ordered=True)\n    tmpDf7[\'Run_db\']=tmpDf7[\'Run_db\'].astype(np.str).astype(\'category\',categories=[\'DRR\',\'ERR\',\'SRR\'],ordered=True)\n    tmpDf7[\'base\']=tmpDf7[\'base\'].astype(\'category\',categories=myBases,ordered=True)\n    srr_pickle_df=tmpDf7.set_index([\'Run_db\',\'Run_digits\',u\'Chr\', u\'Pos\',u\'base\']).sort_index()\n    return srr_pickle_df\n\n### identify files to be merged\nfnames=pd.Series(os.listdir(inSrrDir))\nsnpFnames=fnames[fnames.str.contains(\'.snp.gz$\')]\nsrrsWithData=snpFnames.str.split(\'.\').str[0]\n#mergedSrrs=existingMergedDf.index.get_level_values(\'Run\')\n#m1=~srrsWithData.isin(mergedSrrs)\nm2=srrsWithData.isin(mySpecieDf[\'Run\'])\nm3=srrsWithData.isin(non_empty_files)\ntoMergeSrrs=srrsWithData[m2&m3].values\n')


# In[8]:


TEST=True
if TEST:
    toRunSrrs=toMergeSrrs[:10]
    chunkSize=5
    nThread=1
else:
    toRunSrrs=toMergeSrrs
    chunkSize=1000
    nThread=64

#optional: free up the memory
if not TEST:
    del mySpecieDf, full_meta_df

def mergeSrrsL(i):
    tmpL=[]
    failedSrrsL=[]
    for srr in toRunSrrs[i:(i+chunkSize)]:
        try:
            tmpL.append(parseSrr(srr))
        except:
            print ('failed: '+srr)
            failedSrrsL.append(srr)
    tmpMergedDf=pd.concat(tmpL)
    #tmpMergedDf=pd.concat([parseSrr(srr) for srr in toRunSrrs[i:(i+chunkSize)]])
    reorderedDf=tmpMergedDf.sort_index()
    reorderedDf.to_pickle(tmp_dir+str(i)+'.pickle.gz',compression='gzip')
    return failedSrrsL

Chunks=np.arange(0, len(toRunSrrs),chunkSize)
if TEST:
    failed_srr_l=list(map(mergeSrrsL,Chunks.tolist()))
else:
    from multiprocessing import Pool
    p=Pool(nThread)
    ### sweep for uncompleted chunks
    failed_srr_l=p.map(mergeSrrsL,Chunks.tolist())
    p.close()


# In[9]:


#!ls /cellar/users/btsui/Data/dbsnp/


# In[10]:


snpBed='/cellar/users/btsui/Data/dbsnp/snp_beds/'+mySpecie+'.bed'
### find the data that overlap exactly
tmpDf_ref_all_snps_df=pd.read_csv(snpBed,sep='\s+',header=None,names=np.arange(50),index_col=None,error_bad_lines=False)
myCols=['Chr','Pos','Ref','rd_all','','A','C','G','T','N']
tmpDf=tmpDf_ref_all_snps_df.iloc[:,:len(myCols)]
tmpDf.columns=myCols
myReindexedDf=tmpDf.set_index(['Chr','Pos'])
refIndex=myReindexedDf.index
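# In[ ]:


# Hedged sketch: sanity-check how well one parsed run overlaps the dbSNP reference
# positions in refIndex. This mirrors the commented-out "% sites in targets" check
# further down; it assumes toMergeSrrs is non-empty and that Chr labels in the run
# and the bed file use the same convention.
if len(toMergeSrrs)>0:
    exampleDf=parseSrr(toMergeSrrs[0]).reset_index().set_index(['Chr','Pos'])
    m=exampleDf.index.isin(refIndex)
    print ('% sites in targets:',m.mean())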
# ### merging the files

# In[ ]:


"""
changes: need to do the split for different bins in here;
map the regions into the right bin
"""
#myInDir=''
#feature=''
from multiprocessing import Pool
import pandas as pd
import numpy as np
import os

tmp_dir='/nrnb/users/btsui/Data/all_seq/tmp/'
TEST=False
"""
change directory and identify intermediate pickle objects
"""
os.chdir(tmp_dir)
my_ls_S=pd.Series(os.listdir('./'))
ChunksFnames=my_ls_S[my_ls_S.str.contains('.pickle.gz$')].values

### create another directory for the position-hashed chunks
tmp_dir_2='/nrnb/users/btsui/Data/all_seq/tmp_2/'
os.system('rm -r '+tmp_dir_2)
os.system('mkdir '+tmp_dir_2)

"""
input: pickle name
output: split into chunks
"""
feature_for_hash='Pos'
#2483 blocks per file
def splitChunk(inChunkFname):
    tmpDf3=pd.read_pickle(inChunkFname).reset_index()
    my_base_shift=int(10**5)#int(10**(max_size_order-n))
    my_block_S=(tmpDf3[feature_for_hash]/my_base_shift).astype(np.int)*my_base_shift
    ##add one for the loop at the end
    tmpDf3['block']=my_block_S #6.5s
    tmpDf3=tmpDf3.sort_values(feature_for_hash) # might improve performance of the group by
    """
    export chunks
    """
    g=tmpDf3.groupby('block')
    for myBlock,tmpDf9 in g:
        myKeyName='Chunk_'+str(inChunkFname.split('.')[0])+'.'+feature_for_hash+'_block_'+str(myBlock)
        #print (tmp_dir_2+myKeyName)
        tmpDf9.to_pickle(tmp_dir_2+myKeyName)
#splitChunk('60700.pickle.gz')

"""
test on a small number of files first
"""
p=Pool(64)
if TEST:
    p.map(splitChunk,ChunksFnames[:10])
else:
    p.map(splitChunk,ChunksFnames)
p.close()

tmp_dir_3='/nrnb/users/btsui/Data/all_seq/tmp_3/'
os.system('rm -r '+tmp_dir_3)
os.system('mkdir '+tmp_dir_3)
my_chunked_pickles=pd.Series(os.listdir(tmp_dir_2))
tmpDf6=pd.DataFrame({"chunk":my_chunked_pickles.str.split('.').str[1],"fname":my_chunked_pickles})
g=tmpDf6.groupby('chunk')['fname']
myChunks=g.groups.keys()
def mergeChunks(inputChunk):
    myFiles=g.get_group(inputChunk)
    myL=[]
    for myFile in myFiles:
        myL.append(pd.read_pickle(tmp_dir_2+myFile))
    myMergedDf=pd.concat(myL,axis=0)
    myMergedDf.set_index(['Chr','base','Run_db']).to_hdf(tmp_dir_3+inputChunk,'chunk',mode='w',format='fixed')
p=Pool(64)
p.map(mergeChunks,myChunks)
p.close()
"""
for each chunk
"""


# In[ ]:


"""
for each chunk, merge into a hdf5 file:
merge all the tiny chunks into one file per block
"""


# In[23]:


myDfDf=pd.read_hdf('Pos_block_140700000',mode='r')


# In[32]:


#subDf=myDfDf.loc['7']
#subDf[subDf['Pos']==140753336].reset_index()#.index.get_level_values('base')


# In[15]:


"""
changes: need to do the split for different bins in here;
map the regions into the right bin
"""
#myInDir=''
#feature=''
import pandas as pd
from multiprocessing import Pool
import numpy as np
import os

"""
change directory and identify intermediate pickle objects
"""
os.chdir(tmp_dir)
my_ls_S=pd.Series(os.listdir('./'))
ChunksFnames=my_ls_S[my_ls_S.str.contains('.pickle.gz$')].values

### create another directory for the position-hashed chunks
tmp_dir_2='/nrnb/users/btsui/Data/all_seq/tmp_2/'
os.system('rm -r '+tmp_dir_2)
os.system('mkdir '+tmp_dir_2)

"""
input: pickle name
output: split into chunks
"""
feature_for_hash='Pos'
inChunkFname=ChunksFnames[0]
tmpDf3=pd.read_pickle(inChunkFname).reset_index()
my_max=tmpDf3[feature_for_hash].max()
print (my_max)# 6121752
#max_size_order=math.ceil(np.log10(my_max))
#print ('order of max',max_size_order)
#n=4
my_base_shift=int(10**5)#int(10**(max_size_order-n))
my_block_S=(tmpDf3[feature_for_hash]/my_base_shift).astype(np.int)*my_base_shift
nBlocks=my_block_S.max()
print ('# of blocks',nBlocks)
blocks_VC=my_block_S.value_counts()
##add one for the loop at the end
my_range=blocks_VC.index#np.arange(0,nBlocks+1,my_base_shift)
print ('range size: ',len(my_range))
outH5Name=outMergedDir.replace('.h5','.'+feature_for_hash+'.'+str(my_base_shift)+'.chunked.h5')
print ('output name:',outH5Name)
os.system('rm '+outH5Name)
get_ipython().run_line_magic('time', "tmpDf3['block']=my_block_S #6.5s")
get_ipython().run_line_magic('time', 'tmpDf3=tmpDf3.sort_values(feature_for_hash) # 30 mins')
"""
export chunks
"""
# note: this dict was commented out in the original scratch cell but is needed by the loop below
myOrderDict={'Pos':[u'Chr', u'Pos',u'base','Run_digits','Run_db'],
             'Run_digits':['Run_digits','Run_db',u'Chr', u'Pos',u'base']}
g=tmpDf3.groupby('block')
for myBlock,tmpDf9 in g:
    tmpDf9=tmpDf3[tmpDf3['block']==myBlock]# For run, 27s; for Pos, 2 min 30 s
    my_index_order=myOrderDict[feature_for_hash]
    #tmpDf10=tmpDf9.set_index(my_index_order)
    myKeyName=feature_for_hash+'_'+str(myBlock)+'.block_'+str(myBlock)
    print (tmp_dir_2+myKeyName)
    tmpDf9.to_pickle(tmp_dir_2+myKeyName)
    #print (myKeyName,tmpDf9.shape)
#myKeyName
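# In[ ]:


# Hedged sketch: how a position maps onto its 'Pos' block and how the merged
# per-block HDF5 chunk in tmp_dir_3 could be read back. The example coordinate
# (Chr '7', Pos 140753336) comes from the commented-out check above; the 10**5
# base shift matches splitChunk. Assumes the corresponding chunk file exists.
examplePos=140753336
exampleBlock=(examplePos//int(10**5))*int(10**5)
blockDf=pd.read_hdf(tmp_dir_3+'Pos_block_'+str(exampleBlock),'chunk',mode='r')
chrom7Df=blockDf.loc['7']
print (chrom7Df[chrom7Df['Pos']==examplePos].head())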
# In[ ]:


### merge the files
"""
my_max=tmpDf3[feature_for_hash].max()
print (my_max)# 6121752
max_size_order=math.ceil(np.log10(my_max))
print ('order of max',max_size_order)
n=4
my_base_shift=int(10**(max_size_order-n))
my_block_S=(tmpDf3[feature_for_hash]/my_base_shift).astype(np.int)*my_base_shift
##check the data type of my_block_S and Run_digits
#my_block_S=my_block_S ### make the data become int
nBlocks=my_block_S.max()
print ('# of blocks',nBlocks)
blocks_VC=my_block_S.value_counts()
##add one for the loop at the end
my_range=blocks_VC.index#np.arange(0,nBlocks+1,my_base_shift)
print ('range size: ',len(my_range))
outH5Name=outMergedDir.replace('.h5','.'+feature_for_hash+'.'+str(my_base_shift)+'.chunked.h5')
print ('output name:',outH5Name)
os.system('rm '+outH5Name)
%time tmpDf3['block']=my_block_S #6.5s
%time tmpDf3=tmpDf3.sort_values(feature_for_hash) # 30 mins
"""
"""
changes: need to do the split for different bins in here
"""
"""
myTmpL=[]
my_ls_S=pd.Series(os.listdir('./'))
ChunksFnames=my_ls_S[my_ls_S.str.contains('.pickle.gz$')].values
for chunkFname in ChunksFnames:
    tmpDf=pd.read_pickle(chunkFname)
    tmpDf2=tmpDf.reset_index()
    #tmpDf3=tmpDf2.set_index(['Chr','Pos'])
    #m=tmpDf3.index.isin(refIndex)
    subTmpDf2=tmpDf2#[m]
    print ('% sites in targets:',m.mean())
    print (subTmpDf2.shape)
    myTmpL.append(subTmpDf2)
mergedDf=pd.concat(myTmpL)
all_mergedDf=mergedDf.set_index(['Run_db','Run_digits']).sort_index()
#all_mergedDf.to_pickle(outMergedDir)
"""
from functools import reduce  # needed for the failed-run summary below
# note: all_mergedDf is built by the commented-out merge block above; re-enable it before running this export
all_mergedDf.to_hdf(outMergedDir,key='master',mode="w")
allFailedSrrs=pd.Series(reduce(lambda a,x:a+x,failed_srr_l,[]))
allFailedSrrs.to_csv('/nrnb/users/btsui/Data/all_seq/'+mySpecie+'.failed.srrs.txt')


# In[ ]:


### change coordinates to chromosome/position order for slicing
#mySpecie='Homo_sapiens'
#skymap_snp_dir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.h5'
sorted_snp_dir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.chrom_pos_sorted.h5'
tmpDf=pd.read_hdf(outMergedDir,key='master',mode='r')
tmpDf4=tmpDf.reset_index()
sortedDf=tmpDf4.set_index([u'Chr', u'Pos',u'base']).sort_index()
sortedDf.to_hdf(sorted_snp_dir,key='master',mode='w')


# ### scratch

# In[ ]:


#sorted_snp_dir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.chrom_pos_sorted.h5'
tmpDf=pd.read_hdf(outMergedDir,key='master',mode='r')
tmpDf3=tmpDf.reset_index()
tmpDf3['base']=tmpDf3['base'].astype('category')
tmpDf3['Chr']=tmpDf3['Chr'].astype('category')
tmpDf3['Run_db']=tmpDf3['Run_db'].astype('category')
tmpDf3['Pos']=tmpDf3['Pos'].astype(np.uint32)
tmpDf3['Run_digits']=tmpDf3['Run_digits'].astype(np.uint32)
#tmpDf4=tmpDf3.set_index(['Run_digits','Run_db',u'Chr', u'Pos',u'base'])
#tmpDf5=tmpDf4.sort_index()
outRunSortedOutDir=outMergedDir.replace('.h5','.pickle')
os.system('rm '+outRunSortedOutDir)
tmpDf3.to_pickle(outRunSortedOutDir)
#tmpDf3.to_hdf(outRunSortedOutDir,key='master',mode='w')


# In[ ]:




# ### test exporting to hdf5 data

# In[ ]:


'/cellar/users/btsui/'


# In[ ]:


import pandas as pd
import numpy as np
import re
import os
from multiprocessing import Pool

## init
mySpecie='Homo_sapiens'
outMergedDir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.h5'
get_ipython().run_line_magic('time', "tmpDf=pd.read_hdf(outMergedDir,key='master',mode='r')")
get_ipython().run_line_magic('time', 'tmpDf3=tmpDf.reset_index()')
"""
CPU times: user 8.21 s, sys: 19.9 s, total: 28.2 s
Wall time: 1min 18s
CPU times: user 1min 35s, sys: 57.8 s, total: 2min 33s
Wall time: 2min 33s
"""
get_ipython().run_line_magic('time', "tmpDf5=tmpDf3.set_index(['Run_digits','Run_db'])")
"""
CPU times: user 2min 7s, sys: 1min 54s, total: 4min 1s
Wall time: 4min 1s
"""
get_ipython().run_line_magic('time', "tmpDf5.to_hdf('./test_wo_chroms.hdf','master',mode='w',append=False,format='table')")
#Exception: cannot find the correct atom type -> [dtype->object,items->Index(['Chr', 'base'], dtype='object')]


# In[36]:


#math.ceil() requires an argument; left commented out as a scratch note
#math.ceil()


# In[ ]:


#myCol=np.array(['Run_digits','Run_db',u'Chr', u'Pos',u'base'])
### what are the datatypes of those columns?
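# In[ ]:


# Hedged sketch: answer the datatype question above by inspecting the dtypes and
# in-memory footprint of the reset-index frame; assumes tmpDf3 from the scratch
# cell above is still in memory.
print (tmpDf3.dtypes)
print (tmpDf3.memory_usage(deep=True))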
# In[ ]:


### let's just chunk the merged table by blocks
import pandas as pd
import numpy as np
import re
import os
import math
from multiprocessing import Pool

## init
mySpecie='Homo_sapiens'
outMergedDir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.h5'
get_ipython().run_line_magic('time', "tmpDf=pd.read_hdf(outMergedDir,key='master',mode='r')")
get_ipython().run_line_magic('time', 'tmpDf3=tmpDf.reset_index()')
#feature_for_hash='Run_digits'
feature_for_hash='Pos'
myOrderDict={'Pos':[u'Chr', u'Pos',u'base','Run_digits','Run_db'],
             'Run_digits':['Run_digits','Run_db',u'Chr', u'Pos',u'base']}
my_max=tmpDf3[feature_for_hash].max()
print (my_max)# 6121752
max_size_order=math.ceil(np.log10(my_max))
print ('order of max',max_size_order)
n=4
my_base_shift=int(10**(max_size_order-n))
my_block_S=(tmpDf3[feature_for_hash]/my_base_shift).astype(np.int)*my_base_shift
##check the data type of my_block_S and Run_digits
#my_block_S=my_block_S ### make the data become int
nBlocks=my_block_S.max()
print ('# of blocks',nBlocks)
blocks_VC=my_block_S.value_counts()
##add one for the loop at the end
my_range=blocks_VC.index#np.arange(0,nBlocks+1,my_base_shift)
print ('range size: ',len(my_range))
outH5Name=outMergedDir.replace('.h5','.'+feature_for_hash+'.'+str(my_base_shift)+'.chunked.h5')
print ('output name:',outH5Name)
os.system('rm '+outH5Name)
get_ipython().run_line_magic('time', "tmpDf3['block']=my_block_S #6.5s")
get_ipython().run_line_magic('time', 'tmpDf3=tmpDf3.sort_values(feature_for_hash) # 30 mins')
#%time g=tmpDf3.groupby('block') #1min 35s
#for myBlock,tmpDf9 in g:
# note: the loop header below was commented out in the original scratch cell; it is re-enabled so the body runs
for myBlock in my_range:
    """
    in Pos,
    """
    get_ipython().run_line_magic('time', "tmpDf9=tmpDf3[tmpDf3['block']==myBlock]# For run, 27s; for Pos, 2 min 30 s")
    my_index_order=myOrderDict[feature_for_hash]
    #tmpDf10=tmpDf9.set_index(my_index_order)
    myKeyName=feature_for_hash+'_'+str(myBlock)
    print (myKeyName,tmpDf9.shape)
    tmpDf9.to_hdf(outH5Name,myKeyName,mode='a',format='fixed')
    #if myBlock >3000:
    #    break


# In[4]:


#('range size: ', 1201)


# In[3]:


### make sure the results are chunked as expected
#!ls /cellar/users/btsui/all_seq_snp/
#2 seconds per data push


# In[ ]:


#7 digits,
# 18gb to 0.1
#create an hdf5 object
tmpDf.to_hdf('./test.h5','master',mode='w',append=False,format='table')


# In[ ]:


get_ipython().run_line_magic('time', "tmpDf.to_hdf('./test.h5','master',mode='w',append=False,format='table')")
testDf=tmpDf.head()
testDf.to_hdf('./test.h5','master',mode='w',format='table')
#export succeeded, but the slicing operation takes forever; the path only leads from /master/ to the PyTables table.
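# In[ ]:


# Hedged sketch: confirm the chunked export by listing a few of the per-block keys
# in the output store; assumes outH5Name from the chunking cell above has already
# been written.
with pd.HDFStore(outH5Name,mode='r') as store:
    print ('# of block keys:',len(store.keys()))
    print (store.keys()[:5])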
# In[1]:


import pandas as pd
import numpy as np


# In[2]:


#v6
#14.9s
get_ipython().run_line_magic('time', "testDf20=pd.read_hdf('/cellar/users/btsui/test.pos.chunked.h5','Run_digits_790000',mode='r')")


# In[9]:


np.log10(testDf20.shape[0])


# In[6]:




# In[ ]:


#dataset_name   = '/Configure:0000/Run:0000/CalibCycle:0000/Camera::FrameV1/XppSb4Pim.1:Tm6740.1/image'


# In[10]:


import h5py
import numpy as np
import matplotlib.pyplot as plt


# In[11]:


hdf5_file_name='/cellar/users/btsui/test.h5'


# In[12]:


f = h5py.File(hdf5_file_name, 'r')


# In[23]:


table=f['/master/table']
#obj.iteritems()


# In[ ]:


table[1000:2000]


# In[33]:


table.dtype


# In[26]:


table.shape


# In[19]:


# the original used an undefined `obj.iteritems()`; iterating the '/master' group is assumed to be the intent
for i,myObj in enumerate(f['/master'].items()):
    print (myObj)
    if i>10:
        break


# In[35]:


get_ipython().run_line_magic('time', "posDf=pd.read_hdf('/cellar/users/btsui/test.h5','master',where='Run_digits=15999')")


# In[2]:


"""
the multiindex doesn't work no matter what;
just use pickle for now
"""


# In[ ]:


"""
[u'Chr', u'Pos',u'base','Run']
-rw-r--r-- 1 btsui users 1.8M Jan  2 16:42 0.pickle.gz
-rw-r--r-- 1 btsui users 2.0M Jan  2 16:42 10.pickle.gz
['Run', u'Chr', u'Pos',u'base']
-rw-r--r-- 1 btsui users 1.9M Jan  2 16:43 0.pickle.gz
-rw-r--r-- 1 btsui users 2.2M Jan  2 16:44 10.pickle.gz
"""


# In[ ]:


"""
### where did 1000.pickle.gz go from the first iteration?
### out of 82 chunks, only 60 were processed -- what about the rest?
"""


# In[ ]:


#!rm /tmp/btsui/snp_merged/0.pickle


# In[15]:


#18G
myDf=pd.read_hdf('/cellar/users/btsui/all_seq_snp/Homo_sapiens_all_merged_snp.h5',key='master')


# In[17]:


#myDf


# In[ ]:


### keep everything in the same place?
##


# In[ ]:


### read in all the human runs.
### take only those in human and merge them all together; be only additive -- add only runs that have not been added yet.
#srr_pickle_df.iloc[:0].to_pickle('/data/cellardata/users/btsui/dbsnp/empty_base_snp.pickle')


# In[ ]:


## merge all the data in bulk
### generate an empty pickle for each species?
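# In[ ]:


# Hedged sketch of the "empty pickle per species" idea above: take the schema of one
# parsed run (as returned by parseSrr) and save a zero-row template that later merges
# could start from. Assumes parseSrr and toMergeSrrs from the merging cells are still
# defined; the output path is the one in the commented-out line above.
empty_template_df=parseSrr(toMergeSrrs[0]).iloc[:0]
print (empty_template_df.dtypes)
empty_template_df.to_pickle('/data/cellardata/users/btsui/dbsnp/empty_base_snp.pickle')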
# In[ ]:


#original pickle, 2.7M
#!ls -lah ./tmp.pickle


# In[ ]:


#!ls -lah tmp2.pickle.gz


# In[ ]:


#!rm ./tmp2.pickle


# In[ ]:


### merge all the files, in steps of 1000


# In[ ]:


srr_pickle_df.to_pickle('./tmp2.pickle')
#srr_pickle_df.to_pickle('./tmp.pickle')


# In[ ]:




# In[ ]:


get_ipython().system('ls -lah ./tmp.pickle ./tmp2.pickle')


# In[ ]:


get_ipython().system('rm ./tmp2.pickle.gz')
get_ipython().system('rm ./tmp.pickle.gz')


# In[ ]:


get_ipython().system('gzip ./tmp2.pickle')


# In[ ]:


get_ipython().system('gzip ./tmp.pickle')


# In[ ]:


get_ipython().system('ls -lah tmp2.pickle.gz')


# In[ ]:


get_ipython().system('ls -lah tmp.pickle.gz')


# In[ ]:


### merge the pickles


# In[ ]:


#0.4 mb
np.log10(0.4*(4*(10**6)))


# In[ ]:


### the multiindex doesn't increase the size


# In[ ]:


#no filtering on variants
countDf=tmpDf6[tmpDf6.rd>0]#.rd.value_counts()
##at a position, tell what the value is


# In[ ]:


subDf.head()


# In[ ]:


from scipy import sparse  # missing import in the original scratch cell
# note: index_name and tmp_shape are defined in the cell two below
my_data_L=[]
my_key_L=[]
g=countDf.stack().groupby(['base','features'])
for myTuple, subS in g:
    sparse_Ids=subS.index.get_level_values(index_name)
    tmp_row_col=(sparse_Ids,np.zeros(len(sparse_Ids)))
    tmp_array=sparse.csc_matrix((subS ,tmp_row_col),shape=tmp_shape)
    my_data_L.append(tmp_array)
    my_key_L.append(myTuple)


# In[ ]:


sparseM=sparse.hstack(my_data_L)


# In[ ]:




# In[ ]:


index_name='Chr_Pos_Id'
baseDf.groupby(index_name)
#csc_matrix((data, (row, col)), shape=(3, 3)).toarray()
attrib='rd'
tmp_shape=(Chr_Pos_to_ID_S.max(),1)
for my_base, subDf in countDf.groupby(level='base'):
    sparse_Ids=subDf.index.get_level_values(index_name)
    tmp_row_col=(sparse_Ids,np.zeros(len(sparse_Ids)))
    tmpS=sparse.csc_matrix((subDf[attrib] ,tmp_row_col),shape=tmp_shape)


# In[ ]:


countDf.shape


# In[ ]:


#countDf


# In[289]:


#full_meta_df['Run']


# In[282]:


import pandas as pd
import numpy as np
import re
import os
from multiprocessing import Pool

## init
mySpecie='Homo_sapiens'
outMergedDir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.h5'
full_meta_dir="/cellar/users/btsui/Project/METAMAP/notebook/Parsing/sra_dump.csv"
full_meta_df=pd.read_csv(full_meta_dir)
#inSrrDir='/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/'
inSrrDir='/nrnb/users/btsui/Data/all_seq/snp/'
existingMergedDf=pd.read_pickle(outMergedDir)


# In[292]:


existingMergedDf.groupby(['Run_db','Run_digits']).size()


# In[3]:


import pandas as pd
import numpy as np


# In[4]:


s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])


# In[7]:


s.flags


# In[6]:


s.index.flags


# In[11]:


pd.__path__


# In[12]:


store = pd.HDFStore('test.h5','w')


# In[ ]:


### check the compression ratio with at least ten reads


# In[ ]:


### select by variants, for select by
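# In[ ]:


# Hedged sketch of the "select by variants" idea above: with the chrom/pos-sorted
# table written earlier (sorted_snp_dir, indexed by ['Chr','Pos','base']), a single
# variant can be sliced with .loc. The example coordinate is the one used in the
# commented-out check near the chunk-merging cells; it assumes that Chr/Pos pair is
# present in the merged data.
variantDf=pd.read_hdf(sorted_snp_dir,key='master',mode='r')
print (variantDf.loc[('7',140753336)].head())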