import os
import pandas as pd
import glob
import gzip
import requests
import json
from tqdm import tqdm
# Locations on the cluster file system: where extracted SNPs are written, and
# where the original input files are read from.
snp_out_dir = '/nrnb/users/btsui/Data/tcga_extracted_lgg_snp/'
preprocessed_vcf_dir = '/nrnb/users/btsui/Data/tcga_orig_vcf/'

# Every entry of the input directory, held as a Series so the vectorised
# string accessor can be used on the names.
inFnames = pd.Series(os.listdir(preprocessed_vcf_dir))

# The text before the first '.' in each file name is taken as that file's UUID.
inUuids = inFnames.str.split('.').str.get(0)
# Fetch the GDC metadata record of every input file through the REST API.
# Equivalent single request from the shell:
#   curl https://api.gdc.cancer.gov/files/386d69e7-b4a7-4981-beb8-98f088c689f7
file_endpt = 'https://api.gdc.cancer.gov/files/'

# One metadata dict per file UUID, in the same order as inUuids.
myL = []

# A single Session reuses the HTTPS connection instead of opening a new one
# for every UUID.
with requests.Session() as session:
    for file_uuid in tqdm(inUuids):
        # The timeout keeps one stalled request from hanging the whole run.
        response = session.get(file_endpt + file_uuid, timeout=60)
        # Stop on an HTTP error status here instead of failing on the
        # ['data'] lookup below with a less informative message.
        response.raise_for_status()
        myL.append(response.json()['data'])

# One row per fetched record; columns are the keys of the 'data' payloads.
mergedMetaDf = pd.DataFrame(myL)
mergedMetaDf
 | access | acl | created_datetime | data_category | data_format | data_type | experimental_strategy | file_id | file_name | file_size | file_state | md5sum | platform | state | submitter_id | type | updated_datetime |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | controlled | [phs000178] | 2016-05-02T22:01:32.655381-05:00 | Raw Sequencing Data | BAM | Aligned Reads | WXS | 9554192d-20f8-47ef-bcd1-ea5b5540fdcf | C494.TCGA-WY-A85A-01A-21D-A36O-08.1_gdc_realn.bam | 7650242164 | submitted | b2fb86f35c386d471da9df06baef37cb | Illumina | live | 5e4fbe2d-1a65-4e08-b23b-b737c57216ae | aligned_reads | 2017-03-04T16:37:26.081298-06:00 |
1 | controlled | [phs000178] | 2016-05-03T05:43:02.847753-05:00 | Raw Sequencing Data | BAM | Aligned Reads | WXS | 0e07d005-3332-4262-8220-9e4eccafdc3e | C494.TCGA-DU-6399-10A-01D-1705-08.5_gdc_realn.bam | 8589248703 | submitted | 13252e397d9297063f14d7cb2867c25e | Illumina | live | 14626059-15c1-414c-8d2c-f502d9efeb3a | aligned_reads | 2017-03-04T16:37:26.081298-06:00 |
2 | controlled | [phs000178] | 2016-05-04T23:03:38.572473-05:00 | Raw Sequencing Data | BAM | Aligned Reads | WXS | 14f22bdf-a8e0-40c4-835f-7c43327030fa | C494.TCGA-P5-A77X-01A-11D-A32B-08.1_gdc_realn.bam | 7558356053 | submitted | 596f9d710ff233338054c7f1aa65ef63 | Illumina | live | d2c4c976-7f28-4a80-b3fa-b44eac15e5af | aligned_reads | 2017-03-04T16:37:26.081298-06:00 |
3 | controlled | [phs000178] | 2016-05-08T16:38:10.284691-05:00 | Raw Sequencing Data | BAM | Aligned Reads | WXS | 55dcee55-a992-4c57-a899-659064659dee | C494.TCGA-HT-8110-01A-11D-2395-08.1_gdc_realn.bam | 8082621230 | submitted | 631de89d981b380b5dee0e127749250f | Illumina | live | 6e8f6419-515d-43ee-96d4-9874e5ce82fc | aligned_reads | 2017-03-04T16:37:26.081298-06:00 |
# Tag each record with the first of these tokens found in its submitter_id;
# rows whose submitter_id contains none of them get NaN.
pipeline_pattern = '(varscan_dr|somaticsniper|mutect|muse)'
submitter_ids = mergedMetaDf['submitter_id']
mergedMetaDf['var_pipeline'] = submitter_ids.str.extract(
    pipeline_pattern,
    expand=False,
)

# Persist the annotated metadata table in the current working directory.
mergedMetaDf.to_pickle('lgg_rest_meta.pickle')

# Notebook display of the raw submitter_id strings.
mergedMetaDf['submitter_id'].values
array(['TCGA-DU-5870-01A-11D-1705-08_TCGA-DU-5870-10A-01D-1705-08_varscan_dr_10.0', 'TCGA-DB-5279-01A-01D-1468-08_TCGA-DB-5279-10A-01D-1468-08_somaticsniper', 'TCGA-DU-A76K-01A-11D-A33T-08_TCGA-DU-A76K-10A-01D-A33W-08_mutect', ..., 'TCGA-HT-8110-01A-11D-2395-08_TCGA-HT-8110-10A-01D-2396-08_mutect', 'TCGA-RY-A845-01A-11D-A36O-08_TCGA-RY-A845-10A-01D-A367-08_varscan_dr_10.0', 'TCGA-DU-7018-01A-11D-2024-08_TCGA-DU-7018-10A-01D-2024-08_varscan_dr_10.0'], dtype=object)