#!/usr/bin/env python
# coding: utf-8

# The code in here is somewhat involved, mostly because it needs to parallelize the
# data reading process; otherwise it could be a lot simpler.
#
# parseSrr is probably the function you need to read the most.

# In[1]:


import pandas as pd
import numpy as np
import re
import os
import math
from multiprocessing import Pool

## init
mySpecie='Homo_sapiens'
outMergedDir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.h5'
## change this dir to point to the updated csv
full_meta_dir="/cellar/users/btsui/Project/METAMAP/notebook/Parsing/sra_dump.csv"
inSrrDir='/nrnb/users/btsui/Data/all_seq/snp/'
tmp_dir='/nrnb/users/btsui/Data/all_seq/tmp/'


# In[2]:


#!df -h /data/cellardata/


# In[3]:


get_ipython().run_cell_magic('time', '', "#%matplotlib inline\n##machine: 5-1\n\nfull_meta_df=pd.read_csv(full_meta_dir)\n\n#inSrrDir='/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/'\n#existingMergedDf=pd.read_pickle(outMergedDir)\n\nmySpecieDf=full_meta_df[full_meta_df['ScientificName']==mySpecie]\n\n#find the chromosomes\ntmpBedDf=pd.read_csv('/data/cellardata/users/btsui/dbsnp/snp_beds/'+mySpecie+'.bed',header=None,sep='\\t')\nunique_chroms=tmpBedDf[0].astype(np.str).unique()\n\n### start merging one by one\nif os.path.exists(tmp_dir):\n    os.system('rm -r '+tmp_dir)\nos.system('mkdir -p '+tmp_dir)\n\nos.chdir(tmp_dir)\n#identify non-empty files\nos.system('ls -la '+inSrrDir+' > ls_out.txt ')\nls_df=pd.read_csv('ls_out.txt',sep='\\s+',header=None,names=np.arange(9)).iloc[1:]\nsize_S=ls_df[4]\nm4=size_S.astype(np.int)>1000\nm5=ls_df[8].str.contains('.txt.snp.gz$')\nnon_empty_files=ls_df[m4&m5][8].str.split('/').str[-1].str.split('.').str[0].values\n")


# In[4]:


print ('# of files to merge:',len(non_empty_files))
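# In[ ]:


# Hedged sketch: an equivalent way to identify the non-empty *.txt.snp.gz files
# without parsing `ls -la` output, using os.path.getsize with the same >1000-byte
# cutoff as the cell above. Illustrative only, not the pipeline's canonical path;
# it assumes inSrrDir is a flat directory of per-run files.
non_empty_files_alt=np.array([f.split('.')[0]
                              for f in os.listdir(inSrrDir)
                              if f.endswith('.txt.snp.gz')
                              and os.path.getsize(inSrrDir+f)>1000])
print ('# of non-empty files (sketch):',len(non_empty_files_alt))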
# In[6]:


get_ipython().run_cell_magic('time', '', '"""\ngiven: srr id\nreturn: the merged file\n"""\ndef parseSrr(inSrr):\n    #print inSrr\n    fname=inSrrDir+inSrr+\'.txt.snp.gz\'\n    tmpDf_all=pd.read_csv(fname,sep=\'\\s+\',header=None,names=np.arange(50),index_col=None,error_bad_lines=False)\n    myCols=[\'Chr\',\'Pos\',\'Ref\',\'rd_all\',\'\',\'A\',\'C\',\'G\',\'T\',\'N\']\n    tmpDf=tmpDf_all.iloc[:,:len(myCols)]\n    tmpDf.columns=myCols\n    tmpDf2=tmpDf.set_index([\'Chr\',\'Pos\'])\n    myBases=[\'A\',\'C\',\'G\',\'T\']\n    myL=[]\n    for base in myBases:\n        splitL=tmpDf2[base].str.split(\':\',expand=True)\n        ### extract the read count and base quality\n        tmpDf5=splitL[[1,3]].astype(np.float)\n        tmpDf5.columns=[\'ReadDepth\',\'AverageBaseQuality\']\n        myL.append(tmpDf5)\n    tmpDf6=pd.concat(myL,keys=myBases,axis=0,names=[\'base\'])\n    tmpDf6.columns.name=\'features\'\n    mergedDf=tmpDf6.astype(np.uint16)\n    non_zero_df=mergedDf[mergedDf[\'ReadDepth\']>0]\n    tmpDf7=non_zero_df.reset_index()\n    Run_digits=re.search(\'[DES]RR(\\d+)\', inSrr)\n    Run_Db=re.search(\'([DES]RR)\\d+\', inSrr)\n    tmpDf7[\'Run_digits\']=Run_digits.group(1)\n    tmpDf7[\'Run_db\']=Run_Db.group(1)\n    ###convert the datatypes\n    tmpDf7[\'Pos\']=tmpDf7[\'Pos\'].astype(np.uint32)\n    tmpDf7[\'Run_digits\']=tmpDf7[\'Run_digits\'].astype(np.uint64)\n    tmpDf7[\'Chr\']=tmpDf7[\'Chr\'].astype(np.str).astype(\'category\',categories=unique_chroms,ordered=True)\n    tmpDf7[\'Run_db\']=tmpDf7[\'Run_db\'].astype(np.str).astype(\'category\',categories=[\'DRR\',\'ERR\',\'SRR\'],ordered=True)\n    tmpDf7[\'base\']=tmpDf7[\'base\'].astype(\'category\',categories=myBases,ordered=True)\n    srr_pickle_df=tmpDf7.set_index([\'Run_db\',\'Run_digits\',u\'Chr\', u\'Pos\',u\'base\']).sort_index()\n    return srr_pickle_df\n\n### identify files to be merged\nfnames=pd.Series(os.listdir(inSrrDir))\nsnpFnames=fnames[fnames.str.contains(\'.snp.gz$\')]\nsrrsWithData=snpFnames.str.split(\'.\').str[0]\n#mergedSrrs=existingMergedDf.index.get_level_values(\'Run\')\n#m1=~srrsWithData.isin(mergedSrrs)\nm2=srrsWithData.isin(mySpecieDf[\'Run\'])\nm3=srrsWithData.isin(non_empty_files)\ntoMergeSrrs=srrsWithData[m2&m3].values\n')


# In[8]:


TEST=True
if TEST:
    toRunSrrs=toMergeSrrs[:10]
    chunkSize=5
    nThread=1
else:
    toRunSrrs=toMergeSrrs
    chunkSize=1000
    nThread=64

#optional: free up the memory
if not TEST:
    del mySpecieDf, full_meta_df

def mergeSrrsL(i):
    tmpL=[]
    failedSrrsL=[]
    for srr in toRunSrrs[i:(i+chunkSize)]:
        try:
            tmpL.append(parseSrr(srr))
        except:
            print ('failed: '+srr)
            failedSrrsL.append(srr)
    tmpMergedDf=pd.concat(tmpL)
    #tmpMergedDf=pd.concat([parseSrr(srr) for srr in toRunSrrs[i:(i+chunkSize)]])
    reorderedDf=tmpMergedDf.sort_index()
    reorderedDf.to_pickle(tmp_dir+str(i)+'.pickle.gz',compression='gzip')
    return failedSrrsL

Chunks=np.arange(0, len(toRunSrrs),chunkSize)
if TEST:
    failed_srr_l=list(map(mergeSrrsL,Chunks.tolist()))
else:
    from multiprocessing import Pool
    p=Pool(nThread)
    ### sweep for uncompleted chunks
    failed_srr_l=p.map(mergeSrrsL,Chunks.tolist())
    p.close()


# In[9]:


#!ls /cellar/users/btsui/Data/dbsnp/


# In[10]:


snpBed='/cellar/users/btsui/Data/dbsnp/snp_beds/'+mySpecie+'.bed'
### find the data that overlap exactly
tmpDf_ref_all_snps_df=pd.read_csv(snpBed,sep='\s+',header=None,names=np.arange(50),index_col=None,error_bad_lines=False)
myCols=['Chr','Pos','Ref','rd_all','','A','C','G','T','N']
tmpDf=tmpDf_ref_all_snps_df.iloc[:,:len(myCols)]
tmpDf.columns=myCols
myReindexedDf=tmpDf.set_index(['Chr','Pos'])
refIndex=myReindexedDf.index
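# In[ ]:


# Hedged sketch: sanity-check how well one parsed run overlaps the dbSNP reference
# positions in refIndex. This mirrors the commented-out "% sites in targets" check
# further down; it assumes toMergeSrrs is non-empty and that Chr labels in the run
# and the bed file use the same convention.
if len(toMergeSrrs)>0:
    exampleDf=parseSrr(toMergeSrrs[0]).reset_index().set_index(['Chr','Pos'])
    m=exampleDf.index.isin(refIndex)
    print ('% sites in targets:',m.mean())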
# ### merging the files

# In[ ]:


"""
changes: need to do the split for different bins in here;
map the regions into the right bin
"""
#myInDir=''
#feature=''
from multiprocessing import Pool
import pandas as pd
import numpy as np
import os

tmp_dir='/nrnb/users/btsui/Data/all_seq/tmp/'
TEST=False
"""
change directory and identify intermediate pickle objects
"""
os.chdir(tmp_dir)
my_ls_S=pd.Series(os.listdir('./'))
ChunksFnames=my_ls_S[my_ls_S.str.contains('.pickle.gz$')].values

### create another directory for the position-hashed chunks
tmp_dir_2='/nrnb/users/btsui/Data/all_seq/tmp_2/'
os.system('rm -r '+tmp_dir_2)
os.system('mkdir '+tmp_dir_2)

"""
input: pickle name
output: split into chunks
"""
feature_for_hash='Pos'
#2483 blocks per file
def splitChunk(inChunkFname):
    tmpDf3=pd.read_pickle(inChunkFname).reset_index()
    my_base_shift=int(10**5)#int(10**(max_size_order-n))
    my_block_S=(tmpDf3[feature_for_hash]/my_base_shift).astype(np.int)*my_base_shift
    ##add one for the loop at the end
    tmpDf3['block']=my_block_S #6.5s
    tmpDf3=tmpDf3.sort_values(feature_for_hash) # might improve performance of the group by
    """
    export chunks
    """
    g=tmpDf3.groupby('block')
    for myBlock,tmpDf9 in g:
        myKeyName='Chunk_'+str(inChunkFname.split('.')[0])+'.'+feature_for_hash+'_block_'+str(myBlock)
        #print (tmp_dir_2+myKeyName)
        tmpDf9.to_pickle(tmp_dir_2+myKeyName)
#splitChunk('60700.pickle.gz')

"""
test on a small number of files first
"""
p=Pool(64)
if TEST:
    p.map(splitChunk,ChunksFnames[:10])
else:
    p.map(splitChunk,ChunksFnames)
p.close()

tmp_dir_3='/nrnb/users/btsui/Data/all_seq/tmp_3/'
os.system('rm -r '+tmp_dir_3)
os.system('mkdir '+tmp_dir_3)
my_chunked_pickles=pd.Series(os.listdir(tmp_dir_2))
tmpDf6=pd.DataFrame({"chunk":my_chunked_pickles.str.split('.').str[1],"fname":my_chunked_pickles})
g=tmpDf6.groupby('chunk')['fname']
myChunks=g.groups.keys()
def mergeChunks(inputChunk):
    myFiles=g.get_group(inputChunk)
    myL=[]
    for myFile in myFiles:
        myL.append(pd.read_pickle(tmp_dir_2+myFile))
    myMergedDf=pd.concat(myL,axis=0)
    myMergedDf.set_index(['Chr','base','Run_db']).to_hdf(tmp_dir_3+inputChunk,'chunk',mode='w',format='fixed')
p=Pool(64)
p.map(mergeChunks,myChunks)
p.close()
"""
for each chunk
"""


# In[ ]:


"""
for each chunk, merge into a hdf5 file:
merge all the tiny chunks into one file per block
"""


# In[23]:


myDfDf=pd.read_hdf('Pos_block_140700000',mode='r')


# In[32]:


#subDf=myDfDf.loc['7']
#subDf[subDf['Pos']==140753336].reset_index()#.index.get_level_values('base')


# In[15]:


"""
changes: need to do the split for different bins in here;
map the regions into the right bin
"""
#myInDir=''
#feature=''
import pandas as pd
from multiprocessing import Pool
import numpy as np
import os

"""
change directory and identify intermediate pickle objects
"""
os.chdir(tmp_dir)
my_ls_S=pd.Series(os.listdir('./'))
ChunksFnames=my_ls_S[my_ls_S.str.contains('.pickle.gz$')].values

### create another directory for the position-hashed chunks
tmp_dir_2='/nrnb/users/btsui/Data/all_seq/tmp_2/'
os.system('rm -r '+tmp_dir_2)
os.system('mkdir '+tmp_dir_2)

"""
input: pickle name
output: split into chunks
"""
feature_for_hash='Pos'
inChunkFname=ChunksFnames[0]
tmpDf3=pd.read_pickle(inChunkFname).reset_index()
my_max=tmpDf3[feature_for_hash].max()
print (my_max)# 6121752
#max_size_order=math.ceil(np.log10(my_max))
#print ('order of max',max_size_order)
#n=4
my_base_shift=int(10**5)#int(10**(max_size_order-n))
my_block_S=(tmpDf3[feature_for_hash]/my_base_shift).astype(np.int)*my_base_shift
nBlocks=my_block_S.max()
print ('# of blocks',nBlocks)
blocks_VC=my_block_S.value_counts()
##add one for the loop at the end
my_range=blocks_VC.index#np.arange(0,nBlocks+1,my_base_shift)
print ('range size: ',len(my_range))
outH5Name=outMergedDir.replace('.h5','.'+feature_for_hash+'.'+str(my_base_shift)+'.chunked.h5')
print ('output name:',outH5Name)
os.system('rm '+outH5Name)
get_ipython().run_line_magic('time', "tmpDf3['block']=my_block_S #6.5s")
get_ipython().run_line_magic('time', 'tmpDf3=tmpDf3.sort_values(feature_for_hash) # 30 mins')
"""
export chunks
"""
# note: this dict was commented out in the original scratch cell but is needed by the loop below
myOrderDict={'Pos':[u'Chr', u'Pos',u'base','Run_digits','Run_db'],
             'Run_digits':['Run_digits','Run_db',u'Chr', u'Pos',u'base']}
g=tmpDf3.groupby('block')
for myBlock,tmpDf9 in g:
    tmpDf9=tmpDf3[tmpDf3['block']==myBlock]# For run, 27s; for Pos, 2 min 30 s
    my_index_order=myOrderDict[feature_for_hash]
    #tmpDf10=tmpDf9.set_index(my_index_order)
    myKeyName=feature_for_hash+'_'+str(myBlock)+'.block_'+str(myBlock)
    print (tmp_dir_2+myKeyName)
    tmpDf9.to_pickle(tmp_dir_2+myKeyName)
    #print (myKeyName,tmpDf9.shape)
#myKeyName
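# In[ ]:


# Hedged sketch: how a position maps onto its 'Pos' block and how the merged
# per-block HDF5 chunk in tmp_dir_3 could be read back. The example coordinate
# (Chr '7', Pos 140753336) comes from the commented-out check above; the 10**5
# base shift matches splitChunk. Assumes the corresponding chunk file exists.
examplePos=140753336
exampleBlock=(examplePos//int(10**5))*int(10**5)
blockDf=pd.read_hdf(tmp_dir_3+'Pos_block_'+str(exampleBlock),'chunk',mode='r')
chrom7Df=blockDf.loc['7']
print (chrom7Df[chrom7Df['Pos']==examplePos].head())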
# In[ ]:


### merge the files
"""
my_max=tmpDf3[feature_for_hash].max()
print (my_max)# 6121752
max_size_order=math.ceil(np.log10(my_max))
print ('order of max',max_size_order)
n=4
my_base_shift=int(10**(max_size_order-n))
my_block_S=(tmpDf3[feature_for_hash]/my_base_shift).astype(np.int)*my_base_shift
##check the data type of my_block_S and Run_digits
#my_block_S=my_block_S ### make the data become int
nBlocks=my_block_S.max()
print ('# of blocks',nBlocks)
blocks_VC=my_block_S.value_counts()
##add one for the loop at the end
my_range=blocks_VC.index#np.arange(0,nBlocks+1,my_base_shift)
print ('range size: ',len(my_range))
outH5Name=outMergedDir.replace('.h5','.'+feature_for_hash+'.'+str(my_base_shift)+'.chunked.h5')
print ('output name:',outH5Name)
os.system('rm '+outH5Name)
%time tmpDf3['block']=my_block_S #6.5s
%time tmpDf3=tmpDf3.sort_values(feature_for_hash) # 30 mins
"""
"""
changes: need to do the split for different bins in here
"""
"""
myTmpL=[]
my_ls_S=pd.Series(os.listdir('./'))
ChunksFnames=my_ls_S[my_ls_S.str.contains('.pickle.gz$')].values
for chunkFname in ChunksFnames:
    tmpDf=pd.read_pickle(chunkFname)
    tmpDf2=tmpDf.reset_index()
    #tmpDf3=tmpDf2.set_index(['Chr','Pos'])
    #m=tmpDf3.index.isin(refIndex)
    subTmpDf2=tmpDf2#[m]
    print ('% sites in targets:',m.mean())
    print (subTmpDf2.shape)
    myTmpL.append(subTmpDf2)
mergedDf=pd.concat(myTmpL)
all_mergedDf=mergedDf.set_index(['Run_db','Run_digits']).sort_index()
#all_mergedDf.to_pickle(outMergedDir)
"""
from functools import reduce  # needed for the failed-run summary below
# note: all_mergedDf is built by the commented-out merge block above; re-enable it before running this export
all_mergedDf.to_hdf(outMergedDir,key='master',mode="w")
allFailedSrrs=pd.Series(reduce(lambda a,x:a+x,failed_srr_l,[]))
allFailedSrrs.to_csv('/nrnb/users/btsui/Data/all_seq/'+mySpecie+'.failed.srrs.txt')


# In[ ]:


### change coordinates to chromosome/position order for slicing
#mySpecie='Homo_sapiens'
#skymap_snp_dir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.h5'
sorted_snp_dir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.chrom_pos_sorted.h5'
tmpDf=pd.read_hdf(outMergedDir,key='master',mode='r')
tmpDf4=tmpDf.reset_index()
sortedDf=tmpDf4.set_index([u'Chr', u'Pos',u'base']).sort_index()
sortedDf.to_hdf(sorted_snp_dir,key='master',mode='w')


# ### scratch

# In[ ]:


#sorted_snp_dir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.chrom_pos_sorted.h5'
tmpDf=pd.read_hdf(outMergedDir,key='master',mode='r')
tmpDf3=tmpDf.reset_index()
tmpDf3['base']=tmpDf3['base'].astype('category')
tmpDf3['Chr']=tmpDf3['Chr'].astype('category')
tmpDf3['Run_db']=tmpDf3['Run_db'].astype('category')
tmpDf3['Pos']=tmpDf3['Pos'].astype(np.uint32)
tmpDf3['Run_digits']=tmpDf3['Run_digits'].astype(np.uint32)
#tmpDf4=tmpDf3.set_index(['Run_digits','Run_db',u'Chr', u'Pos',u'base'])
#tmpDf5=tmpDf4.sort_index()
outRunSortedOutDir=outMergedDir.replace('.h5','.pickle')
os.system('rm '+outRunSortedOutDir)
tmpDf3.to_pickle(outRunSortedOutDir)
#tmpDf3.to_hdf(outRunSortedOutDir,key='master',mode='w')


# In[ ]:




# ### test exporting to hdf5 data

# In[ ]:


'/cellar/users/btsui/'


# In[ ]:


import pandas as pd
import numpy as np
import re
import os
from multiprocessing import Pool

## init
mySpecie='Homo_sapiens'
outMergedDir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.h5'
get_ipython().run_line_magic('time', "tmpDf=pd.read_hdf(outMergedDir,key='master',mode='r')")
get_ipython().run_line_magic('time', 'tmpDf3=tmpDf.reset_index()')
"""
CPU times: user 8.21 s, sys: 19.9 s, total: 28.2 s
Wall time: 1min 18s
CPU times: user 1min 35s, sys: 57.8 s, total: 2min 33s
Wall time: 2min 33s
"""
get_ipython().run_line_magic('time', "tmpDf5=tmpDf3.set_index(['Run_digits','Run_db'])")
"""
CPU times: user 2min 7s, sys: 1min 54s, total: 4min 1s
Wall time: 4min 1s
"""
get_ipython().run_line_magic('time', "tmpDf5.to_hdf('./test_wo_chroms.hdf','master',mode='w',append=False,format='table')")
#Exception: cannot find the correct atom type -> [dtype->object,items->Index(['Chr', 'base'], dtype='object')]


# In[36]:


#math.ceil() requires an argument; left commented out as a scratch note
#math.ceil()


# In[ ]:


#myCol=np.array(['Run_digits','Run_db',u'Chr', u'Pos',u'base'])
### what are the datatypes of those columns?
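# In[ ]:


# Hedged sketch: answer the datatype question above by inspecting the dtypes and
# in-memory footprint of the reset-index frame; assumes tmpDf3 from the scratch
# cell above is still in memory.
print (tmpDf3.dtypes)
print (tmpDf3.memory_usage(deep=True))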
# In[ ]:


### let's just chunk the merged table by blocks
import pandas as pd
import numpy as np
import re
import os
import math
from multiprocessing import Pool

## init
mySpecie='Homo_sapiens'
outMergedDir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.h5'
get_ipython().run_line_magic('time', "tmpDf=pd.read_hdf(outMergedDir,key='master',mode='r')")
get_ipython().run_line_magic('time', 'tmpDf3=tmpDf.reset_index()')
#feature_for_hash='Run_digits'
feature_for_hash='Pos'
myOrderDict={'Pos':[u'Chr', u'Pos',u'base','Run_digits','Run_db'],
             'Run_digits':['Run_digits','Run_db',u'Chr', u'Pos',u'base']}
my_max=tmpDf3[feature_for_hash].max()
print (my_max)# 6121752
max_size_order=math.ceil(np.log10(my_max))
print ('order of max',max_size_order)
n=4
my_base_shift=int(10**(max_size_order-n))
my_block_S=(tmpDf3[feature_for_hash]/my_base_shift).astype(np.int)*my_base_shift
##check the data type of my_block_S and Run_digits
#my_block_S=my_block_S ### make the data become int
nBlocks=my_block_S.max()
print ('# of blocks',nBlocks)
blocks_VC=my_block_S.value_counts()
##add one for the loop at the end
my_range=blocks_VC.index#np.arange(0,nBlocks+1,my_base_shift)
print ('range size: ',len(my_range))
outH5Name=outMergedDir.replace('.h5','.'+feature_for_hash+'.'+str(my_base_shift)+'.chunked.h5')
print ('output name:',outH5Name)
os.system('rm '+outH5Name)
get_ipython().run_line_magic('time', "tmpDf3['block']=my_block_S #6.5s")
get_ipython().run_line_magic('time', 'tmpDf3=tmpDf3.sort_values(feature_for_hash) # 30 mins')
#%time g=tmpDf3.groupby('block') #1min 35s
#for myBlock,tmpDf9 in g:
# note: the loop header below was commented out in the original scratch cell; it is re-enabled so the body runs
for myBlock in my_range:
    """
    in Pos,
    """
    get_ipython().run_line_magic('time', "tmpDf9=tmpDf3[tmpDf3['block']==myBlock]# For run, 27s; for Pos, 2 min 30 s")
    my_index_order=myOrderDict[feature_for_hash]
    #tmpDf10=tmpDf9.set_index(my_index_order)
    myKeyName=feature_for_hash+'_'+str(myBlock)
    print (myKeyName,tmpDf9.shape)
    tmpDf9.to_hdf(outH5Name,myKeyName,mode='a',format='fixed')
    #if myBlock >3000:
    #    break


# In[4]:


#('range size: ', 1201)


# In[3]:


### make sure the results are chunked as expected
#!ls /cellar/users/btsui/all_seq_snp/
#2 seconds per data push


# In[ ]:


#7 digits,
# 18gb to 0.1
#create an hdf5 object
tmpDf.to_hdf('./test.h5','master',mode='w',append=False,format='table')


# In[ ]:


get_ipython().run_line_magic('time', "tmpDf.to_hdf('./test.h5','master',mode='w',append=False,format='table')")
testDf=tmpDf.head()
testDf.to_hdf('./test.h5','master',mode='w',format='table')
#export succeeded, but the slicing operation takes forever; the path only leads from /master/ to the PyTables table.
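# In[ ]:


# Hedged sketch: confirm the chunked export by listing a few of the per-block keys
# in the output store; assumes outH5Name from the chunking cell above has already
# been written.
with pd.HDFStore(outH5Name,mode='r') as store:
    print ('# of block keys:',len(store.keys()))
    print (store.keys()[:5])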
# In[1]:


import pandas as pd
import numpy as np


# In[2]:


#v6
#14.9s
get_ipython().run_line_magic('time', "testDf20=pd.read_hdf('/cellar/users/btsui/test.pos.chunked.h5','Run_digits_790000',mode='r')")


# In[9]:


np.log10(testDf20.shape[0])


# In[6]:




# In[ ]:


#dataset_name   = '/Configure:0000/Run:0000/CalibCycle:0000/Camera::FrameV1/XppSb4Pim.1:Tm6740.1/image'


# In[10]:


import h5py
import numpy as np
import matplotlib.pyplot as plt


# In[11]:


hdf5_file_name='/cellar/users/btsui/test.h5'


# In[12]:


f = h5py.File(hdf5_file_name, 'r')


# In[23]:


table=f['/master/table']
#obj.iteritems()


# In[ ]:


table[1000:2000]


# In[33]:


table.dtype


# In[26]:


table.shape


# In[19]:


# the original used an undefined `obj.iteritems()`; iterating the '/master' group is assumed to be the intent
for i,myObj in enumerate(f['/master'].items()):
    print (myObj)
    if i>10:
        break


# In[35]:


get_ipython().run_line_magic('time', "posDf=pd.read_hdf('/cellar/users/btsui/test.h5','master',where='Run_digits=15999')")


# In[2]:


"""
the multiindex doesn't work no matter what;
just use pickle for now
"""


# In[ ]:


"""
[u'Chr', u'Pos',u'base','Run']
-rw-r--r-- 1 btsui users 1.8M Jan  2 16:42 0.pickle.gz
-rw-r--r-- 1 btsui users 2.0M Jan  2 16:42 10.pickle.gz
['Run', u'Chr', u'Pos',u'base']
-rw-r--r-- 1 btsui users 1.9M Jan  2 16:43 0.pickle.gz
-rw-r--r-- 1 btsui users 2.2M Jan  2 16:44 10.pickle.gz
"""


# In[ ]:


"""
### where did 1000.pickle.gz go from the first iteration?
### out of 82 chunks, only 60 were processed -- what about the rest?
"""


# In[ ]:


#!rm /tmp/btsui/snp_merged/0.pickle


# In[15]:


#18G
myDf=pd.read_hdf('/cellar/users/btsui/all_seq_snp/Homo_sapiens_all_merged_snp.h5',key='master')


# In[17]:


#myDf


# In[ ]:


### keep everything in the same place?
##


# In[ ]:


### read in all the human runs.
### take only those in human and merge them all together; be only additive -- add only runs that have not been added yet.
#srr_pickle_df.iloc[:0].to_pickle('/data/cellardata/users/btsui/dbsnp/empty_base_snp.pickle')


# In[ ]:


## merge all the data in bulk
### generate an empty pickle for each species?
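# In[ ]:


# Hedged sketch of the "empty pickle per species" idea above: take the schema of one
# parsed run (as returned by parseSrr) and save a zero-row template that later merges
# could start from. Assumes parseSrr and toMergeSrrs from the merging cells are still
# defined; the output path is the one in the commented-out line above.
empty_template_df=parseSrr(toMergeSrrs[0]).iloc[:0]
print (empty_template_df.dtypes)
empty_template_df.to_pickle('/data/cellardata/users/btsui/dbsnp/empty_base_snp.pickle')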
# In[ ]:


#original pickle, 2.7M
#!ls -lah ./tmp.pickle


# In[ ]:


#!ls -lah tmp2.pickle.gz


# In[ ]:


#!rm ./tmp2.pickle


# In[ ]:


### merge all the files, in steps of 1000


# In[ ]:


srr_pickle_df.to_pickle('./tmp2.pickle')
#srr_pickle_df.to_pickle('./tmp.pickle')


# In[ ]:




# In[ ]:


get_ipython().system('ls -lah ./tmp.pickle ./tmp2.pickle')


# In[ ]:


get_ipython().system('rm ./tmp2.pickle.gz')
get_ipython().system('rm ./tmp.pickle.gz')


# In[ ]:


get_ipython().system('gzip ./tmp2.pickle')


# In[ ]:


get_ipython().system('gzip ./tmp.pickle')


# In[ ]:


get_ipython().system('ls -lah tmp2.pickle.gz')


# In[ ]:


get_ipython().system('ls -lah tmp.pickle.gz')


# In[ ]:


### merge the pickles


# In[ ]:


#0.4 mb
np.log10(0.4*(4*(10**6)))


# In[ ]:


### the multiindex doesn't increase the size


# In[ ]:


#no filtering on variants
countDf=tmpDf6[tmpDf6.rd>0]#.rd.value_counts()
##at a position, tell what the value is


# In[ ]:


subDf.head()


# In[ ]:


from scipy import sparse  # missing import in the original scratch cell
# note: index_name and tmp_shape are defined in the cell two below
my_data_L=[]
my_key_L=[]
g=countDf.stack().groupby(['base','features'])
for myTuple, subS in g:
    sparse_Ids=subS.index.get_level_values(index_name)
    tmp_row_col=(sparse_Ids,np.zeros(len(sparse_Ids)))
    tmp_array=sparse.csc_matrix((subS ,tmp_row_col),shape=tmp_shape)
    my_data_L.append(tmp_array)
    my_key_L.append(myTuple)


# In[ ]:


sparseM=sparse.hstack(my_data_L)


# In[ ]:




# In[ ]:


index_name='Chr_Pos_Id'
baseDf.groupby(index_name)
#csc_matrix((data, (row, col)), shape=(3, 3)).toarray()
attrib='rd'
tmp_shape=(Chr_Pos_to_ID_S.max(),1)
for my_base, subDf in countDf.groupby(level='base'):
    sparse_Ids=subDf.index.get_level_values(index_name)
    tmp_row_col=(sparse_Ids,np.zeros(len(sparse_Ids)))
    tmpS=sparse.csc_matrix((subDf[attrib] ,tmp_row_col),shape=tmp_shape)


# In[ ]:


countDf.shape


# In[ ]:


#countDf


# In[289]:


#full_meta_df['Run']


# In[282]:


import pandas as pd
import numpy as np
import re
import os
from multiprocessing import Pool

## init
mySpecie='Homo_sapiens'
outMergedDir='/cellar/users/btsui/all_seq_snp/'+mySpecie+'_all_merged_snp.h5'
full_meta_dir="/cellar/users/btsui/Project/METAMAP/notebook/Parsing/sra_dump.csv"
full_meta_df=pd.read_csv(full_meta_dir)
#inSrrDir='/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/'
inSrrDir='/nrnb/users/btsui/Data/all_seq/snp/'
existingMergedDf=pd.read_pickle(outMergedDir)


# In[292]:


existingMergedDf.groupby(['Run_db','Run_digits']).size()


# In[3]:


import pandas as pd
import numpy as np


# In[4]:


s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])


# In[7]:


s.flags


# In[6]:


s.index.flags


# In[11]:


pd.__path__


# In[12]:


store = pd.HDFStore('test.h5','w')


# In[ ]:


### check the compression ratio with at least ten reads


# In[ ]:


### select by variants, for select by
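# In[ ]:


# Hedged sketch of the "select by variants" idea above: with the chrom/pos-sorted
# table written earlier (sorted_snp_dir, indexed by ['Chr','Pos','base']), a single
# variant can be sliced with .loc. The example coordinate is the one used in the
# commented-out check near the chunk-merging cells; it assumes that Chr/Pos pair is
# present in the merged data.
variantDf=pd.read_hdf(sorted_snp_dir,key='master',mode='r')
print (variantDf.loc[('7',140753336)].head())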