Authentication

The first step is to authorize access to BigQuery and the Google Cloud. For more information see 'Quick Start Guide to ISB-CGC' and alternative authentication methods can be found here.

In [73]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
from google.cloud import bigquery
import seaborn as sns
from scipy import stats
import re_module.bq_functions as regulome
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
In [74]:
## PARAMETERS

# information from Bigquery tables
Features = {     'Gene Expression' : { 'table'  : 'isb-cgc.TCGA_hg38_data_v0.RNAseq_Gene_Expression',
                                       'symbol' : 'gene_name',
                                       'study'  : 'project_short_name',
                                       'data'   : 'AVG( LOG10( HTSeq__Counts + 1 ) ) ',
                                       'rnkdata': 'data',
                                       'avgdat' : 'avgdata',  
                                   'patientcode': 'case_barcode',
                                    'samplecode': 'sample_barcode',
                                       'where'  : 'AND HTSeq__Counts IS NOT NULL',
                                       'dattype': 'numeric' },
             'MicroRNA Expression': {  'table'  : 'isb-cgc.TCGA_hg38_data_v0.miRNAseq_Expression',
                                       'symbol' : 'mirna_id',
                                        'study' : 'project_short_name',
                                       'data'   : 'AVG( reads_per_million_miRNA_mapped )',
                                       'rnkdata': 'data',
                                       'avgdat' : 'avgdata',
                                   'patientcode': 'case_barcode',
                                    'samplecode': 'sample_barcode',
                                       'where'  : 'AND reads_per_million_miRNA_mapped IS NOT NULL',
                                       'dattype': 'numeric'}
                
               }

# parameteres for Gene expression data
feat1 = Features['Gene Expression']
cohort1 = feat1['patientcode'] + " IN ( SELECT case_barcode FROM cohort ) "  #" IN UNNEST(@PATIENTLIST) "
labels1 = ' IS NOT NULL '     #' = \'TP53\' '

# parameteres for Gene expression data
feat2 = Features['MicroRNA Expression'] 
cohort2 = feat2['patientcode'] + " IN ( SELECT case_barcode FROM cohort ) " # " IN UNNEST(@PATIENTLIST) "
labels2 = ' IS NOT NULL '

# minimum correlation
min_corr = 0.5

# minimun number of samples
nsamples = 25

Read cohort

In [75]:
cohort = """
cohort AS(
SELECT case_barcode FROM `isb-cgc.TCGA_bioclin_v0.Clinical`
WHERE project_short_name = "TCGA-BRCA" AND age_at_diagnosis <=50 AND
    pathologic_stage IN ('Stage II', 'Stage IIA', 'Stage IIB') AND
    race IN ('WHITE', 'BLACK OR AFRICAN AMERICAN') AND ethnicity = "NOT HISPANIC OR LATINO"
)
"""

Queries to read tables

In [76]:
query_t1 = regulome.generic_numeric_bqtable ( 'table1' , feat1, cohort1, labels1 )
query_t2 = regulome.generic_numeric_bqtable ( 'table2' , feat2, cohort2, labels2 ) 

Queries to join tables and perform statistics

In [79]:
summ_query = regulome.get_summarized_table('Gene Expression',feat1,'MicroRNA Expression',feat2)

sql = ( 'WITH' + cohort + ',' + query_t1 + ',' + query_t2 + ',' + summ_query + """
SELECT symbol1, symbol2, n, correlation AS abs_correlation
FROM summ_table
WHERE 
    n > {0} AND correlation > {1}
LIMIT 100
""".format( str(nsamples) , str(min_corr) ) )

print( sql )  # THIS Query perform the computations
WITH
cohort AS(
SELECT case_barcode FROM `isb-cgc.TCGA_bioclin_v0.Clinical`
WHERE project_short_name = "TCGA-BRCA" AND age_at_diagnosis <=50 AND
    pathologic_stage IN ('Stage II', 'Stage IIA', 'Stage IIB') AND
    race IN ('WHITE', 'BLACK OR AFRICAN AMERICAN') AND ethnicity = "NOT HISPANIC OR LATINO"
)
,
table1 AS (
SELECT
   symbol,
   data AS rnkdata,
   ParticipantBarcode
FROM (
   SELECT
      gene_name AS symbol, 
      AVG( LOG10( HTSeq__Counts + 1 ) )  AS data,
      case_barcode AS ParticipantBarcode
   FROM `isb-cgc.TCGA_hg38_data_v0.RNAseq_Gene_Expression`
   WHERE case_barcode IN ( SELECT case_barcode FROM cohort )     # cohort 
         AND gene_name  IS NOT NULL   # labels 
         AND HTSeq__Counts IS NOT NULL  
   GROUP BY
      ParticipantBarcode, symbol
   )
)
,
table2 AS (
SELECT
   symbol,
   data AS rnkdata,
   ParticipantBarcode
FROM (
   SELECT
      mirna_id AS symbol, 
      AVG( reads_per_million_miRNA_mapped ) AS data,
      case_barcode AS ParticipantBarcode
   FROM `isb-cgc.TCGA_hg38_data_v0.miRNAseq_Expression`
   WHERE case_barcode IN ( SELECT case_barcode FROM cohort )     # cohort 
         AND mirna_id  IS NOT NULL   # labels 
         AND reads_per_million_miRNA_mapped IS NOT NULL  
   GROUP BY
      ParticipantBarcode, symbol
   )
)
,
summ_table AS (
SELECT 
   n1.symbol as symbol1,
   n2.symbol as symbol2,
   COUNT( n1.ParticipantBarcode ) as n,
   ABS(CORR(n1.rnkdata , n2.rnkdata)) as correlation
    
FROM
   table1 AS n1
INNER JOIN
   table2 AS n2
ON
   n1.ParticipantBarcode = n2.ParticipantBarcode
   
GROUP BY
   symbol1, symbol2
)
SELECT symbol1, symbol2, n, correlation AS abs_correlation
FROM summ_table
WHERE 
    n > 25 AND correlation > 0.5
LIMIT 100

Run the Query

In [80]:
bqclient = bigquery.Client()
df_results = regulome.runQuery ( bqclient, sql, [], [], PatientList, dryRun=False )
df_results
 in runQuery ... 
  FATAL ERROR: query execution failed 
In [ ]: