#!/usr/bin/env python # coding: utf-8 # In[11]: from __future__ import absolute_import get_ipython().run_line_magic('load_ext', 'autoreload') get_ipython().run_line_magic('autoreload', '2') get_ipython().run_line_magic('matplotlib', 'inline') import sys import pandas as pd import numpy as np import cohorts from cohorts.functions import missense_snv_count, snv_count from query_tcga import cohort # # Initializing the cohort # # In this example, we will demonstrate how to use [query_tcga](http://github.com/jburos/query_tcga) to initialize a cohort object. We will then do some simple mock-analyses using this object. # # This proceeds in two steps: # 1. `prep_patients()`: create a list of Patients (objects of class `cohort.Patient`) # 2. `prep_cohort()`: create a Cohort from the list of patients (object class `cohorts.Cohort`) # # For example, to query the `BLCA` cohort: # In[12]: blca_patients = cohort.prep_patients(project_name='TCGA-BLCA', project_data_dir='data', data_dir='data/gdc') blca_cohort = cohort.prep_cohort(patients=blca_patients, cache_dir='data-cache3') # We are now ready to proceed with our analysis. # ## Analysis of outcomes: Survival by Somatic Mutation Burden # In this cohort, we have VCF data so we can plot survival according to `snv_count`. # # `plot_survival` plots `OS` as the outcome, using the median value as the threshold. # In[13]: blca_cohort.plot_survival(snv_count) # There are a number of functions pre-defined in `cohorts` which provide common summaries of genetic data. # # Some examples: # # - `snv_count` summarizes the number of somatic SNVs per patient # - `missense_snv_count` summarizes the number of missense SNVs per patient # - `neoantigen_count` counts number of predicted neoantigens, after running topiary on VCFs # - `expressed_snv_count` same as `snv_count` but filtered to expressed variants # # Note that not every function will work for every cohort. If your cohort does not have mRNA data, for example, the `expressed_snv_count` function will produce an error. # ## Analysis of outcomes: Benefit by SNV count # Alternatively, we might want to summarize the association of SNV count with `benefit`, which is defined as OS > 1 year by default (see `?cohort.prep_patients`). # # To accomplish this, we can use the `plot_benefit` function for the cohort: # In[73]: blca_cohort.plot_benefit(cohorts.functions.snv_count) # ## Data exploration: Correlation of SNV count with Age # We might want to summarize correlation between `snv_count` and `age at diagnosis`. # # One peculiarity of the `plot_correlation` function is that it currently requires a function as input. Unlike `plot_survival` and `plot_benefit`, you cannot pass a variable name as a string. # # So, for now, we create a mini-function to return the desired variable: # In[20]: def age_at_dx(row): return row['age'] # In[19]: blca_cohort.plot_correlation(snv_count, age_at_dx) # As before, we can alternatively plot `snv_count` by benefit : # ## More flexibility: Prepare dataframe for analysis # Finally, we can retrieve the cohort data as a dataframe, including calculated functions like `snv_count`, etc. # # This is to enable opportunities for more detailed data exploration and/or analysis using external libraries. # # Here it is possible to provide a list of functions so that multiple summary statistics can be computed in the same dataframe. # In[14]: extra_cols, df = blca_cohort.as_dataframe(on=[cohorts.functions.snv_count]) print(extra_cols) # Also note that **the function now returns a tuple** when a list is passed to the parameter `on`. # (By comparison, when no `on` parameter is given, `as_dataframe()` simply returns a dataframe.) # ## Other options to `prep_cohort`: # The idea behind `cohorts` is to encourage the majority of analyses to use the "standard" options for this particular cohort. # # Thus, the user can set various *default* options at a cohort-level at the time of creation, but then override these on an analysis-by-analysis basis as needed. # # Examples of options that utilize this framework include: # - `normalized_per_mb`: whether to normalize metrics like SNV count to coverage # - `filter_fn`: QC filters to apply to *all variants* prior to summary stats being calculated # - `plot_benefit_name`: Descriptor to use for "benefit" plots # - etc. # # See `?cohorts.Cohort` for a more complete list of options. #