from google.cloud import bigquery from google.colab import auth import numpy as np import pandas as pd import seaborn as sns import pandas_gbq auth.authenticate_user() my_project_id = "" # write your project id here bqclient = bigquery.Client( my_project_id ) prot = '''quant AS ( SELECT meta.sample_submitter_id, meta.sample_type, quant.case_id, quant.aliquot_id, quant.gene_symbol, CAST(quant.protein_abundance_log2ratio AS FLOAT64) AS protein_abundance_log2ratio FROM `isb-cgc-bq.CPTAC.quant_proteome_CPTAC_CCRCC_discovery_study_pdc_current` as quant JOIN `isb-cgc-bq.PDC_metadata.aliquot_to_case_mapping_current` as meta ON quant.case_id = meta.case_id AND quant.aliquot_id = meta.aliquot_id AND meta.sample_type IN ('Primary Tumor','Solid Tissue Normal') )''' gexp = '''gexp AS ( SELECT DISTINCT meta.sample_submitter_id, meta.sample_type, rnaseq.gene_name , LOG(rnaseq.HTSeq__FPKM + 1) as HTSeq__FPKM FROM `isb-cgc-bq.CPTAC.RNAseq_hg38_gdc_current` as rnaseq JOIN `isb-cgc-bq.PDC_metadata.aliquot_to_case_mapping_current` as meta ON meta.sample_submitter_id = rnaseq.sample_barcode )''' corr = '''correlation AS ( SELECT quant.gene_symbol, gexp.sample_type, COUNT(*) as n, CORR(protein_abundance_log2ratio,HTSeq__FPKM) as corr FROM quant JOIN gexp ON quant.sample_submitter_id = gexp.sample_submitter_id AND gexp.gene_name = quant.gene_symbol AND gexp.sample_type = quant.sample_type GROUP BY quant.gene_symbol, gexp.sample_type )''' pval = '''SELECT gene_symbol, sample_type, n, corr, `cgc-05-0042.functions.corr_pvalue`(corr, n) as p FROM correlation WHERE ABS(corr) <= 1.0''' mysql = '''DECLARE Nrows INT64; CREATE TEMP TABLE PearsonCorrelation AS WITH {0}, {1}, {2} {3} ; # Adjust pvalues for multiple tests SET Nrows = ( SELECT COUNT(*) FROM PearsonCorrelation ); CALL `cgc-05-0042.functions.BHmultipletests`( 'PearsonCorrelation', 'p', Nrows ) '''.format(prot, gexp, corr, pval) job_config = bigquery.QueryJobConfig() job_config.use_legacy_sql = False try: query_job = bqclient.query ( mysql, job_config=job_config ) except: print ( " FATAL ERROR: query execution failed " ) mydf = query_job.to_dataframe() mydf s_level = 0.01 mydf['significant'] = np.where( mydf['p_adj'] <= s_level, True, False) sns.displot(data=mydf, x="corr", hue="significant", multiple="stack", binwidth=0.1, col='sample_type')