from tqdm import tqdm
import pandas as pd
import os
import subprocess
# Load the GDC file-metadata export into a DataFrame.
gdc_meta_df = pd.read_json('files.2018-07-12.json')

# Boolean row masks, one per selection criterion.
# (Original note: 4240 rows remain when slicing on the VCF format alone.)
m_vcf = gdc_meta_df['data_format'].eq('VCF')
# Alternative mask that was considered but left disabled:
# m_vep = ~gdc_meta_df.file_name.str.contains('vep.vcf.gz$')
m_wxs = gdc_meta_df['experimental_strategy'].eq('WXS')
m_somatic = gdc_meta_df['data_type'].eq('Raw Simple Somatic Mutation')
# Computed for inspection only; it is not part of the filter below.
m_annot = gdc_meta_df['annotations'].notna()

# Keep the rows that satisfy the format, strategy and data-type masks together.
gdc_meta_df_sub = gdc_meta_df[m_vcf & m_wxs & m_somatic]

# Summed `file_size` of the selection scaled by 10**9
# (i.e. gigabytes, assuming file_size is stored in bytes -- TODO confirm).
gdc_meta_df_sub['file_size'].sum() / 10**9
0.331879807
# Download every selected file with the GDC command-line client.
# `out_dir` is passed to `gdc-client download -d`, `token_dir` to `-t`.
out_dir = '/nrnb/users/btsui/Data/tcga_orig_vcf/'
token_dir = '/cellar/users/ramarty/tokens/gdc-user-token.2018-06-25T22_21_40.089Z.txt'
# Shell-string form of the command; kept so later cells (or a manual
# copy-paste into a terminal) that rely on this name keep working.
gdc_cmd_fmt = 'gdc-client download -t {token_dir} -d {out_dir} {file_uuid}'

# UUIDs whose download command exited with a non-zero status.
failed_uuids = []

# Iterate the `file_id` column directly: only that field is needed, so there
# is no reason to materialise every row with iterrows().
for file_uuid in tqdm(gdc_meta_df_sub['file_id'], total=len(gdc_meta_df_sub)):
    # Pass the command as an argument list so no shell is involved: paths or
    # ids containing spaces/metacharacters cannot break or alter the command.
    gdc_cmd = ['gdc-client', 'download',
               '-t', token_dir,
               '-d', out_dir,
               file_uuid]
    # print(gdc_cmd)
    # Keep going on failure (best effort, as before) but remember which
    # files failed instead of silently discarding the exit status.
    if subprocess.call(gdc_cmd) != 0:
        failed_uuids.append(file_uuid)

if failed_uuids:
    print('{} of {} downloads failed: {}'.format(
        len(failed_uuids), len(gdc_meta_df_sub), failed_uuids))
61%|██████ | 1286/2120 [1:15:34<49:00, 3.53s/it]
## Most likely, each downloaded folder's name matches the UUID of the file it contains.
#def
# Peek at the `cases` entry of the row at position 2 of the selection.
gdc_meta_df_sub['cases'].iloc[2]
[{'project': {'project_id': 'TCGA-LGG'}, 'case_id': '6a0bcf0c-fa4c-4119-99d2-f722b781d20f'}]
#gdc_meta_df_sub
# IPython shell escape: long listing (including dotfiles, human-readable sizes)
# of the directory below, sorted newest-first by modification time and
# truncated by `head`.
# NOTE(review): this lists tcga_raw_lgg, whereas out_dir is set to
# tcga_orig_vcf -- confirm this is the directory that was meant to be checked.
!ls -lath /nrnb/users/btsui/Data/tcga_raw_lgg/ | head
total 21M drwxr-xr-x 3 btsui users 512 Jul 12 08:27 3a0e5ae0-dc79-468d-b459-a6d43b612851 drwxr-xr-x 84 btsui users 128K Jul 12 08:23 . drwxr-xr-x 3 btsui users 128K Jul 12 08:22 ceb1a38c-fc22-4d27-9ada-553c1765f1f6 drwxr-xr-x 13 btsui users 128K Jul 11 23:39 .. drwxr-xr-x 3 btsui users 512 Jul 10 06:58 fc837d52-5e38-4d1d-a953-48c321d60ce5 drwxr-xr-x 3 btsui users 128K Jul 10 06:51 30e7af75-7d8e-4aa5-b01e-1149dff334ac drwxr-xr-x 3 btsui users 128K Jul 10 05:05 1c15ff7e-3cbc-41b1-b814-1cf03f1f5a27 drwxr-xr-x 3 btsui users 128K Jul 10 04:59 97216c14-8db6-4a96-8df6-1e710f4a0ed3 drwxr-xr-x 3 btsui users 128K Jul 10 04:58 2b1e29cb-e83c-4e25-a643-1ded3bbcccb6