from google.colab import auth try: auth.authenticate_user() print('You have been successfully authenticated!') except: print('You have not been authenticated.') from google.cloud import bigquery try: project_id = 'isb-cgc-02-0001' bqclient = bigquery.Client(project=project_id) print('BigQuery client successfully initialized') except: print('Failure to initialize BigQuery client') import numpy as np import pandas as pd import seaborn as sns def runQuery ( client, qString, dryRun=False ): print ( "\n in runQuery ... " ) if ( dryRun ): print ( " dry-run only " ) ## set up QueryJobConfig object job_config = bigquery.QueryJobConfig() job_config.dry_run = dryRun job_config.use_query_cache = True job_config.use_legacy_sql = False ## run the query try: query_job = client.query ( qString, job_config=job_config ) ## print ( " query job state: ", query_job.state ) except: print ( " FATAL ERROR: query execution failed " ) return ( None ) ## return results as a dataframe (or an empty dataframe for a dry-run) if ( not dryRun ): try: df = query_job.to_dataframe() if ( query_job.total_bytes_processed==0 ): print ( " the results for this query were previously cached " ) else: print ( " this query processed {} bytes ".format(query_job.total_bytes_processed) ) if ( len(df) < 1 ): print ( " WARNING: this query returned NO results ") return ( df ) except: print ( " FATAL ERROR: query execution failed " ) return ( None ) else: print ( " if not cached, this query will process {} bytes ".format(query_job.total_bytes_processed) ) ## return an empty dataframe return ( pd.DataFrame() ) def checkQueryResults ( qr ): print ( "\n in checkQueryResults ... " ) if ( not isinstance(qr, pd.DataFrame) ): print ( " query execution failed! " ) return ( False ) else: if ( len(qr) > 0 ): print ( " # of rows in query results: {} ".format(len(qr)) ) print ( "\n", qr.head() ) else: print ( " query returned NO results ?!? " ) return ( True ) def build_cohort ( study ): qString = """ WITH -- -- we start with the clinical table -- cohort AS ( SELECT acronym as Study, bcr_patient_barcode as ParticipantBarcode FROM `isb-cgc-01-0008.Filtered.clinical_PANCAN_patient_with_followup_filtered` WHERE acronym = '__study__' ) """.replace('__study__',study) return(qString) def select_genes ( dtype, n ): if dtype == 'expr': qString = """ selected_genes AS ( SELECT Symbol, STDDEV(normalized_count) AS sigmaExp FROM `isb-cgc-01-0008.Filtered.EBpp_AdjustPANCAN_RNASeqV2_filtered` WHERE Symbol IS NOT NULL AND ParticipantBarcode IN ( SELECT ParticipantBarcode FROM cohort) GROUP BY 1 ORDER BY sigmaExp DESC LIMIT __n__ ) """.replace('__n__', str(n)) elif dtype == 'cnv': qString = """ selected_genes AS ( SELECT Gene_Symbols as Symbol, STDDEV(GISTIC_Calls) AS sigmaExp FROM `isb-cgc-01-0008.Filtered.all_CNVR_data_by_gene_filtered` WHERE Symbol IS NOT NULL AND ParticipantBarcode IN ( SELECT ParticipantBarcode FROM cohort) GROUP BY 1 ORDER BY sigmaExp DESC LIMIT __n__ ) """.replace('__n__', str(n)) else: # # Could be some other gene selection function here ## # qString = '' return(qString) def get_expr_data(): qString = """ expr_data AS ( SELECT Symbol, ParticipantBarcode, SampleBarcode, normalized_count as expr, DENSE_RANK() OVER (PARTITION BY SampleBarcode ORDER BY normalized_count ASC) AS rankExpr FROM `isb-cgc-01-0008.Filtered.EBpp_AdjustPANCAN_RNASeqV2_filtered` WHERE Symbol IS NOT NULL AND Symbol IN ( SELECT Symbol FROM selected_genes) AND ParticipantBarcode IN ( SELECT ParticipantBarcode FROM cohort) ) """ return(qString) def get_cnv_data(): qString = """ cnv_data AS ( SELECT Gene_Symbol as Symbol, ParticipantBarcode, SampleBarcode, GISTIC_Calls as cnv, DENSE_RANK() OVER (PARTITION BY SampleBarcode ORDER BY GISTIC_Calls ASC) AS rankCNV FROM `isb-cgc-01-0008.Filtered.all_CNVR_data_by_gene_filtered` WHERE Gene_Symbol IS NOT NULL AND Gene_Symbol IN ( SELECT Symbol AS Gene_Symbol FROM selected_genes) AND ParticipantBarcode IN ( SELECT ParticipantBarcode FROM cohort) ) """ return(qString) def join_data(): qString = """ j_data AS ( SELECT expr_data.Symbol as Symbol, expr_data.SampleBarcode as SampleBarcode, expr, rankExpr, cnv, rankCNV FROM expr_data JOIN cnv_data ON expr_data.SampleBarcode = cnv_data.SampleBarcode AND expr_data.Symbol = cnv_data.Symbol ) """ return(qString) def comp_corr(): qString = """ corr_table AS ( SELECT Symbol, CORR(rankExpr,rankCNV) AS corr FROM j_data GROUP BY 1 ORDER BY corr DESC ) """ return(qString) def final( last_table ): qString = """ SELECT * FROM __last_table__ """.replace('__last_table__', last_table) return(qString) def build_query( study, dtype, size ): sql = ( build_cohort( study ) + ',\n' + select_genes ( dtype, size ) + ',' + get_expr_data() + ',\n' + get_cnv_data() + ',\n' + join_data() + ',\n' + comp_corr() + '\n' + final( 'corr_table' ) ) return(sql) # select a tumor type studyList = [ 'ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA', 'GBM', 'HNSC', 'KICH', 'KIRC', 'KIRP', 'LAML', 'LGG', 'LIHC', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM' ] study = studyList[1] # choosing which genes to look at vartype = 'expr' # 'expr' or 'cnv'; to use for ranking genes n = 100 # the n most variable genes # building the query string sql = build_query (study, vartype, size) # print(sql) # calling Google! # res0 = runQuery ( bqclient, sql, dryRun=False ) res0.shape res0['index'] = range(0,95) res0[1:5] import seaborn as sns sns.set(style = 'darkgrid') sns.lmplot(x='index', y='corr', data=res0) from __future__ import print_function from ipywidgets import interact, interactive, fixed, interact_manual import ipywidgets as widgets # @hidden_cell widgets.Dropdown( options=['1', '2', '3'], value='2', description='Number:', disabled=False, ) #@title Default title text widgets.IntSlider( value=7, min=0, max=10, step=1, description='Test:', disabled=False, continuous_update=False, orientation='horizontal', readout=True, readout_format='d' )