#!/usr/bin/env python
# coding: utf-8
"""Download TCGA LGG aligned-read BAMs (WXS / RNA-Seq) from the GDC.

Cleaned up from a Jupyter-notebook export.  The script:

1. reads the GDC file manifest ``files.2018-07-11.json``,
2. keeps only ``data_type == 'Aligned Reads'`` entries whose
   ``experimental_strategy`` is WXS or RNA-Seq,
3. pickles the filtered table (sorted by ``file_size`` so the smallest
   files are processed first) to ``tcga_lgg_wgs_bams.df.wxs_rnaseq.pickle``,
4. builds a ``gdc-client download`` command per file UUID.  Actually
   running the download was disabled in the original and remains so.
"""

from tqdm import tqdm
import pandas as pd
import os
import subprocess  # kept from original; preferred over os.system when enabling downloads

# Working directory for manifest + pickle output.
CWD = '/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/'
# Where gdc-client drops the downloaded BAMs.
OUT_DIR = '/nrnb/users/btsui/Data/tcga_raw_lgg/'
# GDC access token (required for controlled-access data).
TOKEN_DIR = '/cellar/users/ramarty/tokens/gdc-user-token.2018-06-25T22_21_40.089Z.txt'
# Command template; one invocation per file UUID.
GDC_CMD_FMT = 'gdc-client download -t {token_dir} -d {out_dir} {file_uuid}'


def load_aligned_reads_manifest(manifest_json='files.2018-07-11.json'):
    """Return the GDC manifest filtered to WXS/RNA-Seq aligned-read BAMs.

    Parameters
    ----------
    manifest_json : str
        Path to the GDC manifest JSON (516 cases, BAMs only per the
        original notebook notes).

    Returns
    -------
    pandas.DataFrame
        Filtered rows sorted ascending by ``file_size`` so the smallest
        files can be downloaded/processed first.
    """
    gdc_meta_df = pd.read_json(manifest_json)
    m_dtype = gdc_meta_df['data_type'] == 'Aligned Reads'
    m_strategy = gdc_meta_df['experimental_strategy'].isin(['WXS', 'RNA-Seq'])
    return gdc_meta_df[m_dtype & m_strategy].sort_values('file_size')


def build_download_command(file_uuid, out_dir=OUT_DIR, token_dir=TOKEN_DIR):
    """Format the ``gdc-client`` command line for one GDC file UUID."""
    return GDC_CMD_FMT.format(out_dir=out_dir, file_uuid=file_uuid,
                              token_dir=token_dir)


def main():
    os.chdir(CWD)

    gdc_meta_df_sub = load_aligned_reads_manifest()
    print("size of bams in TB:",
          (gdc_meta_df_sub['file_size'] / 10**12).sum())
    # Persist the filtered, size-sorted table so downstream steps can
    # extract data quickly per file.
    gdc_meta_df_sub.to_pickle('./tcga_lgg_wgs_bams.df.wxs_rnaseq.pickle')

    # BUG FIX: the original loop iterated the UNfiltered `gdc_meta_df`
    # even though its comments said to process the filtered re-aligned
    # BAMs, smallest first.  Iterate the filtered/sorted frame instead.
    for _, row_s in tqdm(gdc_meta_df_sub.iterrows()):
        gdc_cmd = build_download_command(row_s.loc['id'])
        # Download intentionally disabled, as in the original notebook.
        # When enabling, prefer subprocess.run(gdc_cmd.split(), check=True)
        # over os.system (no shell, explicit error on failure).
        # result = os.system(gdc_cmd)

    # Notebook-only inspection commands from the original export
    # (IPython shell escapes; they have no effect on the pipeline):
    #   !echo $PWD/./tcga_lgg_wgs_bams.df.pickle
    #   !ls /nrnb/users/btsui/Data/tcga_raw_lgg/
    #   !gunzip -c /nrnb/users/btsui/Data/tcga_extracted_lgg_snp/3a0e5ae0-dc79-468d-b459-a6d43b612851.snp.txt.gz | head -n 2000 | tail -n 20
    #   !ls -lah /cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/Homo_sapiens/
    #   !ls /nrnb/users/btsui/Data/tcga_orig_vcf/0085c844-82bf-414a-bc05-5e7488a70c25/


if __name__ == '__main__':
    main()