#!/usr/bin/env python
# coding: utf-8
"""Stack every VCF matched under ``tcga_orig_vcf`` into one pickled DataFrame.

This file is a Jupyter-notebook export; the ``# In[n]:`` markers are the
original cell boundaries.  Each matched file is parsed as a
whitespace-separated table with ``#`` lines skipped, the per-file tables are
concatenated with the source path as the outer index level (``fdir``), and
the result is written to ``outDir``.  Everything after the explicit stop
near the end is exploratory scratch that is not meant to run.
"""

import glob
import gzip  # referenced only by the commented-out scratch cell below
import os

import pandas as pd
from tqdm import tqdm

# In[2]:

mySpecie = 'Homo_sapiens'

# In[1]:

#snp_out_dir='/nrnb/users/btsui/Data/tcga_extracted_lgg_snp/'
#/nrnb/users/btsui/Data/all_seq/snp/
preprocessed_vcf_dir = '/nrnb/users/btsui/Data/all_seq/snp/'
# Directory listing as a Series; not referenced again by the cells below.
inFnames = pd.Series(os.listdir(preprocessed_vcf_dir))

# In[3]:

# Despite the name, outDir is the path of the output pickle *file*.
outDir = '/cellar/users/btsui/all_seq_snp/' + mySpecie + '.pickle'

# In[6]:

outDir  # notebook display only; a no-op when run as a script

# In[54]:

#vcfDirs=glob.glob('/nrnb/users/btsui/Data/tcga_orig_vcf/**/*.vcf$')

# In[183]:

# NOTE: without recursive=True, '**' behaves like '*', so this matches files
# exactly one directory below tcga_orig_vcf.  The trailing '*' admits any
# suffix after '.vcf' (e.g. '.vcf.gz').
vcfDirs = glob.glob('/nrnb/users/btsui/Data/tcga_orig_vcf/**/*.vcf*')

# In[ ]:

#len(vcfDirs)

# In[ ]:

# One table per matched file, in the same order as vcfDirs so that pd.concat
# below can pair each table with its path through `keys`.  The separator is a
# raw string: '\s' in a normal literal is an invalid escape sequence.
myL = [
    pd.read_csv(vcfDir, comment='#', header=None, sep=r'\s+')
    for vcfDir in tqdm(vcfDirs)
]

# In[201]:

# Outer index level 'fdir' records which file each row came from.
meregdDf = pd.concat(myL, axis=0, keys=vcfDirs, names=['fdir'])

# In[202]:

len(myL)  # notebook display only; a no-op when run as a script

# In[204]:

meregdDf.to_pickle(outDir)

# In[ ]:

### analyze the allelic fraction against the primary tumors for somatic mutation calling?

# In[209]:

#!ls -lath ./TCGA_VCF.pickle

# In[ ]:

# The notebook had a bare undefined name (`asdasdasd`) here so that execution
# would stop with a NameError before the scratch cells.  Stop explicitly
# instead; a string argument to SystemExit is printed to stderr and keeps the
# non-zero exit status the NameError produced.
raise SystemExit(
    'Stopping after writing the merged VCF pickle; '
    'the remaining cells are exploratory scratch.'
)

# ##

# In[ ]:

#!gunzip -c /nrnb/users/btsui/Data/tcga_orig_vcf/b8d00849-003c-4675-b1ca-1382316a0c93/b8d00849-003c-4675-b1ca-1382316a0c93.vcf.gz | head -n 23

# # !rm head.txt
# #MuSE_call_1
# """
# with gzip.open('file.txt.gz', 'rb') as f:
#     file_content = f.read()
# """
# fL=[]
# for i,vcfDir in enumerate(tqdm(vcfDirs)):
#
#     with gzip.open(vcfDir, 'r') as f:
#         #.decode('ascii')
#         file_content = f.read().decode('ascii')
#
#     #str(file_content)
#     #print ( ('MuSE' in file_content))
#     if not ('MuSE' in file_content):
#         print (vcfDir)
#         os.system('echo "{}\n" >>head.txt'.format(vcfDir))
#         os.system('gunzip -c {} | head -n 20 >>head.txt'.format(vcfDir))
#
#         os.system('echo "\n" >> head.txt'.format(vcfDir))
#         fL.append(vcfDir)
#

# In[172]:

#lgg_rest_meta_df=pd.read_pickle('lgg_rest_meta.pickle')

# In[205]:

#lgg_rest_meta_df.groupby('var_pipeline').first()

# In[206]:

#529 files having all
#'varscan_dr', 'muse', 'mutect',
# 'somaticsniper'
#lgg_rest_meta_df.var_pipeline.value_counts().index

# In[207]:

#lgg_rest_meta_df

# In[127]:

def whatisthis(s):
    """Print which kind of string *s* is.

    Prints ``ordinary string`` when *s* is a ``str``, ``unicode string`` when
    it is an instance of the Python 2 ``unicode`` type, and ``not a string``
    for anything else.  Returns ``None``.

    The name ``unicode`` does not exist on Python 3, so looking it up for a
    non-``str`` argument (e.g. ``bytes``) used to raise ``NameError``; that
    case is now reported as ``not a string``.
    """
    if isinstance(s, str):
        print("ordinary string")
        return
    try:
        is_unicode = isinstance(s, unicode)  # noqa: F821 - Python 2 only
    except NameError:
        # Python 3: there is no separate unicode type.
        is_unicode = False
    if is_unicode:
        print("unicode string")
    else:
        print("not a string")


# Exploratory notebook cells.  They run only when the file is executed as a
# script, so importing the module for `whatisthis` has no side effects.
# NOTE(review): `lgg_rest_meta_df` and `file_content` are assigned only in
# commented-out cells of this file, and `get_ipython()` exists only under
# IPython, so these statements need a live notebook session to succeed.
if __name__ == "__main__":
    # In[167]:
    # submitter_id values that contain none of the listed names
    lgg_rest_meta_df['submitter_id'][
        ~lgg_rest_meta_df['submitter_id'].str.contains(
            'varscan_dr|somaticsniper|mutect|muse')
    ].values  #.str.split('_').str[-1]

    # In[153]:
    get_ipython().system('cat head.txt')

    # In[112]:
    tmpStr = str(file_content)

    # In[127]:
    whatisthis(file_content)

    # In[113]:
    'fileformat' in file_content

    # In[123]:
    file_content[:10]

    # In[114]:
    get_ipython().system('cat head.txt')

    # In[33]:
    get_ipython().system('wc -l head.txt')

    # In[12]:
    get_ipython().system('gunzip -c /nrnb/users/btsui/Data/tcga_orig_vcf/bcc7d112-2d5d-4ff1-a863-70f11d5b0f2c/bcc7d112-2d5d-4ff1-a863-70f11d5b0f2c.vcf.gz | head -n 100')

# In[152]:

# In[ ]:

#Analysis ID
#submitter_id might do,