from tqdm import tqdm
import pandas as pd
import os
import subprocess
# Project directory for this analysis; the process switches into it so that
# the relative paths used further down resolve against it.
CWD = '/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/'
os.chdir(CWD)
#gdc_meta_df=pd.read_json('files.2017-12-09T19_29_39.496570.json')
#gdc_meta_df=pd.read_csv('gdc_manifest.2018-07-11.txt',sep='\t')
#gdc_meta_df=pd.read_csv('./gdc_manifest.2017-12-27T02_43_35.959399.txt',sep='\t')
#/cellar/users/andreabc/GDC_barcodes/uuid_barcode_map.txt
#gdc_meta_df.str.contains('')
#516 cases
##bams only
# Load the file-metadata JSON export (path is relative to the current
# working directory) into a DataFrame.
gdc_meta_df = pd.read_json('files.2018-07-11.json')
# Count how many rows carry each distinct data_format value.
gdc_meta_df.loc[:, 'data_format'].value_counts()
BAM 2105 Name: data_format, dtype: int64
# Boolean mask: rows whose data_type is exactly 'Aligned Reads'.
m_dtype = gdc_meta_df['data_type'].eq('Aligned Reads')
# Boolean mask: rows whose experimental_strategy is WXS or RNA-Seq.
m_experimental_strategy = gdc_meta_df['experimental_strategy'].isin(
    ['WXS', 'RNA-Seq']
)
### process all the TCGA, realigned bams.
# Keep only the rows that satisfy both masks.
gdc_meta_df_sub = gdc_meta_df.loc[m_dtype & m_experimental_strategy]
#gdc_meta_df['experimental_strategy'].value_counts()
"""
with each file, the pipeline can extract the data quickly
"""
# Persist the filtered table, ordered by ascending file_size, as a pickle
# in the current working directory.
(
    gdc_meta_df_sub
    .sort_values(by='file_size')
    .to_pickle('./tcga_lgg_wgs_bams.df.wxs_rnaseq.pickle')
)
!echo $PWD/./tcga_lgg_wgs_bams.df.pickle
/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/./tcga_lgg_wgs_bams.df.pickle
### for each of the files, generate the allelic read count using the standard reference
# (rows, columns) of the filtered table after ordering it by file_size.
gdc_meta_df_sub.sort_values(by='file_size').shape
(1045, 11)
# Report the summed file_size of the filtered table, with each value scaled
# by 10**12 before summing (TB, assuming file_size is in bytes - TODO confirm).
print("size of bams in TB:", gdc_meta_df_sub['file_size'].div(10**12).sum())
size of bams in TB: 12.400216759716
# Directory substituted for gdc-client's -d option in the command template.
out_dir = '/nrnb/users/btsui/Data/tcga_raw_lgg/'
#gdc_meta_df.cases.iloc[0]
# Token file path substituted for gdc-client's -t option in the command template.
token_dir = '/cellar/users/ramarty/tokens/gdc-user-token.2018-06-25T22_21_40.089Z.txt'
# Shell command template; one command is rendered per metadata row.
gdc_cmd_fmt = 'gdc-client download -t {token_dir} -d {out_dir} {file_uuid}'
# NOTE(review): this walks the unfiltered gdc_meta_df, not gdc_meta_df_sub
# sorted by file_size - confirm which table is meant to drive the downloads.
# iterrows() is a generator without a length, so the row count is passed
# explicitly to let tqdm display overall progress.
for _, rowS in tqdm(gdc_meta_df.iterrows(), total=len(gdc_meta_df)):
    file_uuid = rowS.loc['id']
    gdc_cmd = gdc_cmd_fmt.format(
        out_dir=out_dir, file_uuid=file_uuid, token_dir=token_dir
    )
    # The call that would execute the command is commented out, so each
    # iteration only builds the gdc_cmd string.
    #result = os.system(gdc_cmd)
!ls /nrnb/users/btsui/Data/tcga_raw_lgg/
ls: cannot access '/nrnb/users/btsui/Data/tcga_raw_lgg/3a0e5ae0-dc79-468d-b459-a6d43b612851': No such file or directory
#!gunzip -c /nrnb/users/btsui/Data/tcga_extracted_lgg_snp/3a0e5ae0-dc79-468d-b459-a6d43b612851.snp.txt.gz | head -n 2000|tail -n 20
#download, and then extract, run the smallest first.
#need to use
#!ls -lah /cellar/users/btsui/Data/BOWTIE_GENOME_SNP_INDEX/Homo_sapiens/
!ls /nrnb/users/btsui/Data/tcga_orig_vcf/0085c844-82bf-414a-bc05-5e7488a70c25/
0085c844-82bf-414a-bc05-5e7488a70c25.vcf