from google.colab import auth
try:
  auth.authenticate_user()
  print('You have been successfully authenticated!')
except:
  print('You have not been authenticated.')

from google.cloud import bigquery
try:
  project_id = 'isb-cgc-02-0001'
  bqclient = bigquery.Client(project=project_id)
  print('BigQuery client successfully initialized')
except:
  print('Failure to initialize BigQuery client')

import numpy as np
import pandas as pd
import seaborn as sns

def runQuery ( client, qString, dryRun=False ):
  
  print ( "\n in runQuery ... " )
  if ( dryRun ):
    print ( "    dry-run only " )
    
  ## set up QueryJobConfig object
  job_config = bigquery.QueryJobConfig()
  job_config.dry_run = dryRun
  job_config.use_query_cache = True
  job_config.use_legacy_sql = False
  
  ## run the query
  try:
    query_job = client.query ( qString, job_config=job_config )
    ## print ( "    query job state: ", query_job.state )
  except:
    print ( "  FATAL ERROR: query execution failed " )
    return ( None )
  
  ## return results as a dataframe (or an empty dataframe for a dry-run) 
  if ( not dryRun ):
    try:
      df = query_job.to_dataframe()
      if ( query_job.total_bytes_processed==0 ):
        print ( "    the results for this query were previously cached " )
      else:
        print ( "    this query processed {} bytes ".format(query_job.total_bytes_processed) )
      if ( len(df) < 1 ):
        print ( "  WARNING: this query returned NO results ")
      return ( df )
    except:
      print ( "  FATAL ERROR: query execution failed " )
      return ( None )
    
  else:
    print ( "    if not cached, this query will process {} bytes ".format(query_job.total_bytes_processed) )
    ## return an empty dataframe
    return ( pd.DataFrame() )
 

def checkQueryResults ( qr ):
  print ( "\n in checkQueryResults ... " )
  if ( not isinstance(qr, pd.DataFrame) ):
    print ( " query execution failed! " )
    return ( False )
  else:
    if ( len(qr) > 0 ): 
      print ( " # of rows in query results: {} ".format(len(qr)) )
      print ( "\n", qr.head() )
    else:
      print ( " query returned NO results ?!? " )  
    return ( True )

def build_cohort ( study ):
  qString = """
      WITH
        --
        -- we start with the clinical table
        --
        cohort AS (
        SELECT
          acronym as Study,
          bcr_patient_barcode as ParticipantBarcode
        FROM
          `isb-cgc-01-0008.Filtered.clinical_PANCAN_patient_with_followup_filtered` 
         WHERE
          acronym = '__study__'
          )
          """.replace('__study__',study)
  return(qString)
  

def select_genes ( dtype, n ):
  if dtype == 'expr':
    qString = """
      selected_genes AS (
      SELECT
        Symbol,
        STDDEV(normalized_count) AS sigmaExp
      FROM
        `isb-cgc-01-0008.Filtered.EBpp_AdjustPANCAN_RNASeqV2_filtered`
      WHERE
        Symbol IS NOT NULL
        AND ParticipantBarcode IN (
          SELECT
            ParticipantBarcode
          FROM
            cohort)
      GROUP BY
        1
      ORDER BY
        sigmaExp DESC
      LIMIT
        __n__ )
      """.replace('__n__', str(n))
    
  elif dtype == 'cnv':
    qString = """
      selected_genes AS (
      SELECT
        Gene_Symbols as Symbol,
        STDDEV(GISTIC_Calls) AS sigmaExp
      FROM
        `isb-cgc-01-0008.Filtered.all_CNVR_data_by_gene_filtered`
      WHERE
        Symbol IS NOT NULL
        AND ParticipantBarcode IN (
          SELECT
            ParticipantBarcode
          FROM
            cohort)
      GROUP BY
        1
      ORDER BY
        sigmaExp DESC
      LIMIT
        __n__ )
      """.replace('__n__', str(n))
  else:
    # 
    # Could be some other gene selection function here ##
    #
    qString = ''
  return(qString)

def get_expr_data():
  qString = """
      expr_data AS (
      SELECT
        Symbol,
        ParticipantBarcode,
        SampleBarcode,
        normalized_count as expr,
        DENSE_RANK() OVER (PARTITION BY SampleBarcode ORDER BY normalized_count ASC) AS rankExpr
      FROM
        `isb-cgc-01-0008.Filtered.EBpp_AdjustPANCAN_RNASeqV2_filtered`
      WHERE
        Symbol IS NOT NULL AND
        Symbol IN (
          SELECT
            Symbol
          FROM
            selected_genes)
        AND ParticipantBarcode IN (
          SELECT
            ParticipantBarcode
          FROM
            cohort) 
      )
  """
  return(qString)


def get_cnv_data():
  qString = """
      cnv_data AS (
      SELECT
        Gene_Symbol as Symbol,
        ParticipantBarcode,
        SampleBarcode,
        GISTIC_Calls as cnv,
        DENSE_RANK() OVER (PARTITION BY SampleBarcode ORDER BY GISTIC_Calls ASC) AS rankCNV
      FROM
        `isb-cgc-01-0008.Filtered.all_CNVR_data_by_gene_filtered`
      WHERE
  	   Gene_Symbol IS NOT NULL AND
       Gene_Symbol IN (
        SELECT
          Symbol AS Gene_Symbol
        FROM
          selected_genes)
       AND ParticipantBarcode IN (
        SELECT
       ParticipantBarcode
        FROM
       cohort) 
       )
  """
  return(qString)


  def join_data():
    qString = """
      j_data AS (
      SELECT
        expr_data.Symbol as Symbol,
        expr_data.SampleBarcode as SampleBarcode,
        expr,
        rankExpr,
        cnv,
        rankCNV
      FROM
        expr_data JOIN cnv_data
      ON
        expr_data.SampleBarcode = cnv_data.SampleBarcode
        AND expr_data.Symbol = cnv_data.Symbol 
      )
    """
    return(qString)

def comp_corr():
  qString = """
  corr_table AS (
  SELECT
    Symbol,
    CORR(rankExpr,rankCNV) AS corr
  FROM
    j_data
  GROUP BY
    1
  ORDER BY
    corr DESC
    )
  """
  return(qString)


def final( last_table ):
  qString = """
  SELECT * FROM __last_table__
  """.replace('__last_table__', last_table)
  return(qString)


def build_query( study, dtype, size ):
  sql = (
    build_cohort( study ) + ',\n' +
    select_genes ( dtype, size ) + ',' +
    get_expr_data() + ',\n' +
    get_cnv_data() + ',\n' +
    join_data() + ',\n' +
    comp_corr() + '\n' +
    final( 'corr_table' )
  )
  return(sql)

# select a tumor type

studyList = [ 'ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA', 
                'GBM', 'HNSC', 'KICH', 'KIRC', 'KIRP', 'LAML', 'LGG', 'LIHC', 
                'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 
                'SARC', 'SKCM', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM' ]

study = studyList[1]

# choosing which genes to look at
vartype = 'expr'  # 'expr' or 'cnv'; to use for ranking genes
n       = 100     # the n most variable genes

# building the query string
sql = build_query (study, vartype, size)

# print(sql)

# calling Google! #

res0 = runQuery ( bqclient, sql, dryRun=False )


res0.shape

res0['index'] = range(0,95)

res0[1:5]

import seaborn as sns
sns.set(style = 'darkgrid')

sns.lmplot(x='index', y='corr', data=res0)


from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

# @hidden_cell
widgets.Dropdown(
    options=['1', '2', '3'],
    value='2',
    description='Number:',
    disabled=False,
)

#@title Default title text

widgets.IntSlider(
    value=7,
    min=0,
    max=10,
    step=1,
    description='Test:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d'
)