#!/usr/bin/env python
# coding: utf-8
"""Download TCGA LGG aligned-read BAMs (WXS / RNA-Seq) from the GDC.

Cleaned up from a Jupyter-notebook export.  The script:

1. reads the GDC file manifest ``files.2018-07-11.json``,
2. keeps only ``data_type == 'Aligned Reads'`` entries whose
   ``experimental_strategy`` is WXS or RNA-Seq,
3. pickles the filtered table (sorted by ``file_size`` so the smallest
   files are processed first) to ``tcga_lgg_wgs_bams.df.wxs_rnaseq.pickle``,
4. builds a ``gdc-client download`` command per file UUID.  Actually
   running the download was disabled in the original and remains so.
"""

from tqdm import tqdm
import pandas as pd
import os
import subprocess  # kept from original; preferred over os.system when enabling downloads

# Working directory for manifest + pickle output.
CWD = '/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/'
# Where gdc-client drops the downloaded BAMs.
OUT_DIR = '/nrnb/users/btsui/Data/tcga_raw_lgg/'
# GDC access token (required for controlled-access data).
TOKEN_DIR = '/cellar/users/ramarty/tokens/gdc-user-token.2018-06-25T22_21_40.089Z.txt'
# Command template; one invocation per file UUID.
GDC_CMD_FMT = 'gdc-client download -t {token_dir} -d {out_dir} {file_uuid}'


def load_aligned_reads_manifest(manifest_json='files.2018-07-11.json'):
    """Return the GDC manifest filtered to WXS/RNA-Seq aligned-read BAMs.

    Parameters
    ----------
    manifest_json : str
        Path to the GDC manifest JSON (516 cases, BAMs only per the
        original notebook notes).

    Returns
    -------
    pandas.DataFrame
        Filtered rows sorted ascending by ``file_size`` so the smallest
        files can be downloaded/processed first.
    """
    gdc_meta_df = pd.read_json(manifest_json)
    m_dtype = gdc_meta_df['data_type'] == 'Aligned Reads'
    m_strategy = gdc_meta_df['experimental_strategy'].isin(['WXS', 'RNA-Seq'])
    return gdc_meta_df[m_dtype & m_strategy].sort_values('file_size')


def build_download_command(file_uuid, out_dir=OUT_DIR, token_dir=TOKEN_DIR):
    """Format the ``gdc-client`` command line for one GDC file UUID."""
    return GDC_CMD_FMT.format(out_dir=out_dir, file_uuid=file_uuid,
                              token_dir=token_dir)


def main():
    os.chdir(CWD)

    gdc_meta_df_sub = load_aligned_reads_manifest()
    print("size of bams in TB:",
          (gdc_meta_df_sub['file_size'] / 10**12).sum())
    # Persist the filtered, size-sorted table so downstream steps can
    # extract data quickly per file.
    gdc_meta_df_sub.to_pickle('./tcga_lgg_wgs_bams.df.wxs_rnaseq.pickle')

    # BUG FIX: the original loop iterated the UNfiltered `gdc_meta_df`
    # even though its comments said to process the filtered re-aligned
    # BAMs, smallest first.  Iterate the filtered/sorted frame instead.
    for _, row_s in tqdm(gdc_meta_df_sub.iterrows()):
        gdc_cmd = build_download_command(row_s.loc['id'])
        # Download intentionally disabled, as in the original notebook.
        # When enabling, prefer subprocess.run(gdc_cmd.split(), check=True)
        # over os.system (no shell, explicit error on failure).
        # result = os.system(gdc_cmd)

    # Notebook-only inspection commands from the original export
    # (IPython shell escapes; they have no effect on the pipeline):
    #   !echo $PWD/./tcga_lgg_wgs_bams.df.pickle
    #   !ls /nrnb/users/btsui/Data/tcga_raw_lgg/
    #   !gunzip -c /nrnb/users/btsui/Data/tcga_extracted_lgg_snp/3a0e5ae0-dc79-468d-b459-a6d43b612851.snp.txt.gz | head -n 2000 | tail -n 20
    #   !ls -lah /cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/Homo_sapiens/
    #   !ls /nrnb/users/btsui/Data/tcga_orig_vcf/0085c844-82bf-414a-bc05-5e7488a70c25/


if __name__ == '__main__':
    main()