from google.cloud import bigquery
from google.colab import auth
import numpy as np
import pandas as pd
import seaborn as sns
import pandas_gbq

auth.authenticate_user()
my_project_id = "" # write your project id here
bqclient = bigquery.Client( my_project_id )

prot = '''quant AS (
    SELECT meta.sample_submitter_id, meta.sample_type, quant.case_id, quant.aliquot_id, quant.gene_symbol, 
           CAST(quant.protein_abundance_log2ratio AS FLOAT64) AS protein_abundance_log2ratio 
    FROM `isb-cgc-bq.CPTAC.quant_proteome_CPTAC_CCRCC_discovery_study_pdc_current` as quant
    JOIN `isb-cgc-bq.PDC_metadata.aliquot_to_case_mapping_current` as meta
        ON quant.case_id = meta.case_id
        AND quant.aliquot_id = meta.aliquot_id
        AND meta.sample_type IN ('Primary Tumor','Solid Tissue Normal')
)'''

gexp = '''gexp AS (
    SELECT DISTINCT meta.sample_submitter_id, meta.sample_type, rnaseq.gene_name , LOG(rnaseq.HTSeq__FPKM + 1) as HTSeq__FPKM   
    FROM `isb-cgc-bq.CPTAC.RNAseq_hg38_gdc_current` as rnaseq
    JOIN `isb-cgc-bq.PDC_metadata.aliquot_to_case_mapping_current` as meta
        ON meta.sample_submitter_id = rnaseq.sample_barcode
)'''

corr = '''correlation AS (
    SELECT quant.gene_symbol, gexp.sample_type, COUNT(*) as n, CORR(protein_abundance_log2ratio,HTSeq__FPKM)  as corr 
    FROM quant JOIN gexp 
        ON quant.sample_submitter_id = gexp.sample_submitter_id
        AND gexp.gene_name = quant.gene_symbol
        AND gexp.sample_type = quant.sample_type
    GROUP BY quant.gene_symbol, gexp.sample_type
)'''

pval = '''SELECT  gene_symbol, sample_type, n, corr,
   `cgc-05-0042.functions.corr_pvalue`(corr, n) as p
FROM correlation
WHERE ABS(corr) <= 1.0'''

mysql = '''DECLARE Nrows INT64;
CREATE TEMP TABLE PearsonCorrelation AS
WITH {0}, 
{1}, 
{2} 
{3}
;
# Adjust pvalues for multiple tests
SET Nrows = ( SELECT COUNT(*) FROM PearsonCorrelation );
CALL `cgc-05-0042.functions.BHmultipletests`( 'PearsonCorrelation', 'p', Nrows )
'''.format(prot, gexp, corr, pval)

job_config = bigquery.QueryJobConfig()
job_config.use_legacy_sql = False
try:
    query_job = bqclient.query ( mysql, job_config=job_config )
except:
    print ( "  FATAL ERROR: query execution failed " )
mydf = query_job.to_dataframe()

mydf

s_level = 0.01 
mydf['significant'] = np.where( mydf['p_adj'] <= s_level, True, False)
sns.displot(data=mydf, x="corr", hue="significant", multiple="stack", binwidth=0.1, col='sample_type')