Check out more notebooks at our Community Notebooks Repository!
Title: How to visualize results from BigQuery
Author: David L Gibbs
Created: 2019-07-17
Purpose: Demonstrate how visualize the results from a query.
URL: https://github.com/isb-cgc/Community-Notebooks/blob/master/Notebooks/How_to_plot_BigQuery_results.ipynb
Notes:
In this example, we'll perform a few easy queries, and use the python Seaborn library to visualize the results.
Library documentation: https://seaborn.pydata.org/
There's many python tutorials on using the Seaborn library for plotting. For example, https://elitedatascience.com/python-seaborn-tutorial
# If you start your jupyter notebook session using the
# Community-Notebook/VM-Notebook-Launcher then you're already logged in!
# !gcloud auth application-default login
!pip3 install seaborn
!pip3 install google-cloud-bigquery
import seaborn as se
from google.cloud import bigquery
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (20.0, 10.0)
plt.rcParams['font.family'] = "serif"
def runQuery ( client, qString, dryRun=False ):
print ( "\n in runQuery ... " )
if ( dryRun ):
print ( " dry-run only " )
## set up QueryJobConfig object
job_config = bigquery.QueryJobConfig()
job_config.dry_run = dryRun
job_config.use_query_cache = True
job_config.use_legacy_sql = False
## run the query
try:
query_job = client.query ( qString, job_config=job_config )
## print ( " query job state: ", query_job.state )
except:
print ( " FATAL ERROR: query execution failed " )
return ( None )
## return results as a dataframe (or an empty dataframe for a dry-run)
if ( not dryRun ):
try:
df = query_job.to_dataframe()
if ( query_job.total_bytes_processed==0 ):
print ( " the results for this query were previously cached " )
else:
print ( " this query processed {} bytes ".format(query_job.total_bytes_processed) )
if ( len(df) < 1 ):
print ( " WARNING: this query returned NO results ")
return ( df )
except:
print ( " FATAL ERROR: query execution failed " )
return ( None )
else:
print ( " if not cached, this query will process {} bytes ".format(query_job.total_bytes_processed) )
## return an empty dataframe
return ( pd.DataFrame() )
def checkQueryResults ( qr ):
print ( "\n in checkQueryResults ... " )
if ( not isinstance(qr, pd.DataFrame) ):
print ( " query execution failed! " )
return ( False )
else:
if ( len(qr) > 0 ):
print ( " # of rows in query results: {} ".format(len(qr)) )
print ( "\n", qr.head() )
else:
print ( " query returned NO results ?!? " )
return ( True )
# User your own project ID
project_id = 'your_project_number'
if project_id == 'your_project_number':
print('Please update the project number with your Google Cloud Project')
else:
bqclient = bigquery.Client(project=project_id)
# We define queries as strings #
sql = '''
SELECT
icd_10,
COUNT(*) as Count
FROM
`isb-cgc.TCGA_bioclin_v0.Clinical`
GROUP BY
1 -- this is the same as 'group by icd_10'
ORDER BY
Count
'''
res0 = runQuery ( bqclient, sql, dryRun=False )
res0[0:5]
# first I'll subset the results,
# there's a lot of categories!
res1 = res0[res0['Count'] > 200]
# then we'll make a bar plot
p = se.barplot(data=res1, x = 'icd_10', y = 'Count')
sql = '''
SELECT
avg_percent_neutrophil_infiltration,
avg_percent_lymphocyte_infiltration
FROM
`isb-cgc.TCGA_bioclin_v0.Biospecimen`
GROUP BY
1,2
'''
res3 = runQuery ( bqclient, sql, dryRun=False )
res3[0:5]
se.scatterplot(data=res3, x='avg_percent_lymphocyte_infiltration', y='avg_percent_neutrophil_infiltration')