#!/usr/bin/env python
# coding: utf-8

# Table of Contents

#
# # Analysis of serum mutational antigenic profiling
#
# This Jupyter notebook analyzes mutational antigenic profiling of serum
# against virus carrying the A/Perth/2009 (H3N2) HA.
#
# ## Configure analysis
#
# ### Import Python modules / packages
# In particular we use:
#  - plotnine (https://plotnine.readthedocs.io) for ggplot2-like plotting
#  - dmslogo (https://jbloomlab.github.io/dmslogo/) for sequence logo plots
#  - dms_tools2 (https://jbloomlab.github.io/dms_tools2/) for much of the
#    analysis

import os
import math
import collections
import itertools
import warnings
import subprocess

import yaml
import pandas as pd
from IPython.display import display, HTML
import matplotlib.pyplot as plt
from plotnine import *

import dms_tools2
from dms_tools2.ipython_utils import showPDF
from dms_tools2.plot import COLOR_BLIND_PALETTE_GRAY as PALETTE
import dmslogo

plt.ion()  # turn on interactive plotting

# Print versions of Bloom lab software:
print(f"Using dms_tools2 version {dms_tools2.__version__}")
print(f"Using dmslogo version {dmslogo.__version__}")

# Set data frame display options:
pd.set_option('display.max_colwidth', 200)

# Ignore warnings that can clutter notebook:
warnings.simplefilter('ignore')

# ### Get config variables
# These specify input / output paths and key parameters for the analysis:
configfile = 'config.yaml'
with open(configfile) as f:
    config = yaml.safe_load(f)
print(f"Read the following configuration from {configfile}:")
display(HTML(pd.Series(config)
             .to_frame('value')
             .to_html()
             ))

# ### Get sera information
# Read information on the sera that are being mapped.
# For each serum sample we get: *serum* (abbreviation used in experiments),
# *serum_description*, *serum_group*, *serum_name* (a short informative
# name), *serum_species*, and *serum_vaccination* (vaccination status; some
# sera are paired pre- and post-vaccination).
with open(config['serum_info']) as f:
    sera = (pd.DataFrame(yaml.safe_load(f))
            .transpose()
            .add_prefix('serum_')
            .rename_axis('serum')
            .reset_index()
            )
assert len(sera) == len(sera['serum'].unique()), 'sera not unique'
print(f"Read the following sera information from {config['serum_info']}:")
display(HTML(sera.to_html(index=False)))

# ### Get list of samples
# Each deep-sequenced sample records its serum, virus library, sequencing
# date, serum dilution, percent infectivity, and (depending on the value of
# *seq_data_source* in the config file) either the SRA accession or the
# location of the R1 files on the Hutch server.
samples = pd.read_csv(config['sample_list'])

# don't need any R1 column if we are using SRA accession
if config['seq_data_source'] == 'SRA_accession':
    samples = samples.drop(columns='R1', errors='ignore')

assert len(samples) == len(samples['sample'].unique()), 'non-unique samples'
print(f"Read the following samples from {config['sample_list']}:")
display(HTML(samples.to_html(index=False)))

# Check that the serum for all samples are in our set of sera:
unknown_sera = set(samples['serum']) - set(sera['serum'])
if unknown_sera:
    raise ValueError(f"samples include unknown sera: {unknown_sera}")
else:
    print('We have information for all sera used for the samples.')

# ### Download deep sequencing data if needed
# The config file specifies whether we get the data from existing *R1* files
# on the Hutch server, or download from the Sequence Read Archive (SRA)
# using dms_tools2.sra.fastqFromSRA:
if config['seq_data_source'] == 'SRA_accession':

    # are we using Aspera for rapid downloads?
    if config['ascp'] and config['asperakey']:
        aspera = (config['ascp'], config['asperakey'])
    else:
        aspera = None

    # do the downloads; fastqFromSRA expects `name` / `run` columns, so
    # temporarily rename the sample columns around the call
    print(f"Downloading FASTQ files to {config['fastq_dir']} (takes a while)...")
    os.makedirs(config['fastq_dir'], exist_ok=True)
    samples = samples.rename(columns={'sample': 'name',
                                      'SRA_accession': 'run'})
    dms_tools2.sra.fastqFromSRA(
        samples=samples,
        fastq_dump=config['fastq_dump'],
        fastqdir=config['fastq_dir'],
        aspera=aspera,
        ncpus=config['ncpus'],
    )
    samples = samples.rename(columns={'name': 'sample',
                                      'run': 'SRA_accession'})
    print('Completed downloading files.')

elif config['seq_data_source'] != 'R1':
    raise ValueError('invalid value of `seq_data_source`')

# ## Align sequencing and count mutations
# The samples were sequenced with barcoded subamplicon sequencing for high
# accuracy, so we process the data to count each codon mutation per sample.
#
# First, create the directory used for the results of this part:
os.makedirs(config['countsdir'], exist_ok=True)
print(f"Results from counting mutations go to {config['countsdir']}")

# ### Run `dms2_batch_bcsubamp` to generate "codon counts" files giving the
# counts of each codon mutation for each sample.
# First, write the batch file used as input by dms2_batch_bcsubamp:
bcsubamp_batchfile = os.path.join(config['countsdir'], 'batch.csv')
(samples
 .rename(columns={'sample': 'name'})
 [['name', 'R1']]
 # fix: write without the data-frame index so the batch file contains only
 # the `name` / `R1` columns (consistent with the other batch-file writes
 # in this notebook, which all use index=False)
 .to_csv(bcsubamp_batchfile, index=False)
 )
print(f"Creating batch file {bcsubamp_batchfile}")

# Now run the program:
cmds = ['dms2_batch_bcsubamp',
        '--batchfile', bcsubamp_batchfile,
        '--refseq', config['refseq'],
        '--alignspecs'] + config['alignspecs'].split() + [
        '--R1trim', str(config['R1trim']),
        '--R2trim', str(config['R2trim']),
        '--outdir', config['countsdir'],
        '--fastqdir', config['fastq_dir'],
        '--summaryprefix', 'summary',
        '--ncpus', str(config['ncpus']),
        '--use_existing', config['use_existing'],
        ]
print(f"Running dms2_batch_bcsubamp with this command:\n{' '.join(cmds)}")
subprocess.check_output(cmds)
print('Completed running dms2_batch_bcsubamp.')

# Confirm that all the expected counts files exist:
assert all(os.path.isfile(f) for f in
           config['countsdir'] + '/' + samples['sample'] + '_codoncounts.csv'
           ), 'missing counts files'

# ### Plot sequencing and mutation count summaries created by
# dms2_batch_bcsubamp.
# The summary plots all share this prefix:
countsplotprefix = os.path.join(config['countsdir'], 'summary_')

# #### Reads and barcodes per sample
# Total sequencing reads per sample:
showPDF(countsplotprefix + 'readstats.pdf')

# Distribution of sequencing reads per barcode on subamplicons:
showPDF(countsplotprefix + 'readsperbc.pdf')

# Number of barcoded subamplicons that align and have sufficient reads:
showPDF(countsplotprefix + 'bcstats.pdf')

# #### Coverage across gene
# Depth of valid barcoded subamplicons covering each site in the gene:
showPDF(countsplotprefix + 'depth.pdf')

# #### Mutation frequencies
# Average mutation frequency per sample, by codon mutation type:
showPDF(countsplotprefix + 'codonmuttypes.pdf')

# Average mutation frequency per sample, by number of nucleotide changes
# per codon mutation:
showPDF(countsplotprefix + 'codonntchanges.pdf')

# Per-codon mutation frequencies across all sites in gene for each sample:
showPDF(countsplotprefix + 'mutfreq.pdf')

# #### Check for oxidative damage, which manifests as enrichment of G->T
# and C->A mutations among the single-nucleotide codon mutations:
showPDF(countsplotprefix + 'singlentchanges.pdf')

# ### Renumber to standard HA numbering
# The above alignments use sequential 1, 2, ... numbering of the codons.
# Sequential numbering is not the standard scheme for HA, so renumber to
# the standard HA numbering with dms_tools2.utils.renumberSites:
dms_tools2.utils.renumberSites(
    renumbfile=config['renumbering_scheme'],
    infiles=list(config['countsdir'] + '/' + samples['sample'] +
                 '_codoncounts.csv'),
    missing='drop',
    outdir=config['renumbcountsdir'])

assert all(os.path.isfile(f) for f in
           config['renumbcountsdir'] + '/' + samples['sample'] +
           '_codoncounts.csv'
           ), 'missing renumbered counts files'
print(f"Renumbered codon counts are in {config['renumbcountsdir']}")

# ## Compute immune selection on mutations
# Quantify immune selection as *differential selection (diffsel)*: roughly
# the log enrichment of each mutation relative to wildtype in the
# immune-selected versus mock sample.  See Doud et al (2017) for the paper
# introducing this metric.
#
# ### Samples to compare for each selection
# Each "selection" combines three samples:
#  - *sel*: the immune-selected sample
#  - *mock*: the mock-selected control for that date and library
#  - *err*: the wildtype plasmid control to estimate sequencing error rates
# For each selection we also record *serum_name_formatted* (serum_name
# prefixed with species when non-human) and *name* (library plus percent
# infectivity remaining; e.g. *lib1-5* = library 1, 5% infectivity).
selections = (
    # get all immune selected (sel) samples
    samples
    .query('(serum != "mock") & (serum != "plasmid")')
    .rename(columns={'sample': 'sel'})
    # add mock sample for that date and library
    .merge(samples
           .query('serum == "mock"')
           [['sample', 'library', 'date']]
           .rename(columns={'sample': 'mock'})
           )
    # add plasmid sample as error control (err) for that date and library
    .merge(samples
           .query('serum == "plasmid"')
           [['sample', 'library', 'date']]
           .rename(columns={'sample': 'err'})
           )
    # add information about sera
    .merge(sera, validate='many_to_one')
    # add informative names for serum and samples
    .assign(
        serum_name_formatted=lambda x: x['serum_species'].map(
            lambda s: '' if pd.isnull(s) or s == 'human' else s + '-') +
            x['serum_name'],
        name_formatted=lambda x: x['library'] + ', ' +
            x['percent_infectivity'].apply(dms_tools2.utils.sigFigStr,
                                           nsig=2) + '% infectivity',
        name=lambda x: x['library'] + '-' +
            x['percent_infectivity'].apply(dms_tools2.utils.sigFigStr,
                                           nsig=2)
    )
    # drop unneeded columns
    .drop(['R1', 'R2', 'SRA_accession'], axis='columns', errors='ignore')
    # re-order columns a bit so key ones are displayed first
    .set_index(['serum_name_formatted', 'name', 'sel', 'mock', 'err'])
    .reset_index()
)

# make sure no duplicated serum / names
assert len(selections) == len(selections.groupby(['serum_name_formatted',
                                                  'name']))
print(f"Tabulated information for {len(selections)} selections:")
display(HTML(selections.to_html(index=False)))

# ### Compute immune selection by running dms2_batch_diffsel.
# Run dms2_batch_diffsel, then add to `selections` the names of the files
# holding the computed site (*site*) and mutation (*mut*) level selection
# for each sample.
outdir = config['diffseldir']
os.makedirs(outdir, exist_ok=True)

# write batch file used by program
batchfile = os.path.join(outdir, 'batch.csv')
(selections
 .rename(columns={'serum_name_formatted': 'group'})
 .to_csv(batchfile, index=False)
 )

cmds = ['dms2_batch_diffsel',
        '--summaryprefix', 'summary',
        '--batchfile', batchfile,
        '--outdir', outdir,
        '--indir', config['renumbcountsdir'],
        '--use_existing', config['use_existing'],
        '--ncpus', str(config['ncpus'])
        ]
print(f"Computing diffsel using dms2_batch_diffsel with command:\n{' '.join(cmds)}")
subprocess.check_output(cmds)

selfilecols = []
for selfile in ['mutdiffsel', 'sitediffsel']:
    selfilecol = selfile + '_file'
    selfilecols.append(selfilecol)
    selections[selfilecol] = (outdir + '/' +
                              selections['serum_name_formatted'] + '-' +
                              selections['name'] + '_' + selfile + '.csv')
    missing_files = [f for f in selections[selfilecol]
                     if not os.path.isfile(f)]
    assert not missing_files, f"missing files: {missing_files}"
    print(f"Created {len(selections[selfilecol])} {selfile} files, adding to "
          f"`selections` data frame in column {selfilecol}")

# ### Get all selection information in one data frame
# Read the files listed in `selections` into a single tidy data frame with
# site- and mutation-level selection for all samples:
sel_df = (dms_tools2.diffsel.df_read_filecols(selections, selfilecols)
          .drop(columns=selfilecols)
          )
# `sel_df` is very large but holds everything we want to plot:
print(f"sel_df has {len(sel_df)} rows. Here are the first few:")
display(HTML(sel_df.head(n=5).to_html(index=False)))

# ## Analyze and plot immune selection
# ### Choose sample to retain for each serum
# Some sera have multiple samples (library replicates, and sometimes several
# serum concentrations per library giving different percent infectivity).
# We want one sample per library, at the serum concentration that gives the
# most reproducible results with maximal signal to noise.
#
# #### Plot site-level selection for all samples
# Use the `facet_plot` command of dmslogo to plot site-level selection for
# all samples of each serum, and compute the correlation between samples:
for serum_name, serum_sel_df in sel_df.groupby('serum_name_formatted'):

    print(f"\n\n******************* {serum_name} *******************")

    fig, ax = dmslogo.facet_plot(
        serum_sel_df,
        x_col='isite',
        gridcol_col='name_formatted',
        show_col=None,
        wspace=0.6,
        draw_line_kwargs=dict(
            xtick_col='site',
            height_col='positive_diffsel',
            ylabel='positive_diffsel',
        )
    )
    display(fig)
    plt.close(fig)

    corr_df = (serum_sel_df
               .rename(columns={'name_formatted': 'sample'})
               .pivot_table(values='positive_diffsel',
                            columns='sample',
                            index=['site'])
               .corr()
               .round(3)
               )
    display(HTML(corr_df.to_html()))

# #### Choose samples to retain based on infectivity remaining
# From the plots and correlations above, several things are apparent:
# 1.
# Different samples with the **same** virus library tend to be substantially
# more correlated than samples between different virus libraries.  This
# makes sense, as some of the noise probably comes from library composition
# differences in terms of variant abundance.  This fact implies that we want
# to keep just one (or the same number) of samples from each library, so
# that when we take means / medians across samples they are not
# disproportionately influenced by a single library.
# 2. The "best" concentration of serum appears to be one that leaves about
# 1% to 5% of the viral infectivity remaining.  This is judged by the fact
# that samples with this percent infectivity remaining tend to look
# "cleanest" in the line plots of site differential selection, and also tend
# to correlate best with samples from another library.
#
# Based on the above, we will filter our samples down to those that have
# closest to the target amount of viral infectivity remaining.  Because we
# qualitatively estimated above that the best results are obtained when
# between 1% to 5% of infectivity is remaining, we will target 2%
# infectivity.  Then for any library / serum with multiple samples, we will
# retain the one that is closest (in log space) to that target infectivity.
# We do this in log space because in linear space, 0% infectivity would be
# closer to 2% infectivity than would 6% infectivity--even though the latter
# is clearly preferable if our target is 2%.
# Target percent infectivity (best signal seen between 1% and 5%):
target_infectivity = 2
print(f"Choosing samples closest to {target_infectivity:.2f}% infectivity.")

# Mark the samples to retain by adding a *retained* column to both the
# `selections` and `sel_df` data frames, True only for the sample (per
# serum / library) with the smallest log-space distance to the target:
selections = (
    selections
    .assign(retained=lambda x: x.assign(
                dist=lambda y: (y.percent_infectivity / target_infectivity)
                               .apply(math.log).apply(abs))
            .groupby(['serum_name_formatted', 'library'])
            ['dist']
            .transform(lambda y: y <= y.min())
            )
)
print(f"Retaining {len(selections.query('retained'))} of {len(selections)}")

sel_df = sel_df.merge(selections[['serum_name_formatted', 'name',
                                  'retained']],
                      validate='many_to_one')

# Plot the samples and their percent infectivity: the dashed horizontal
# line is the target percent infectivity, and point colors indicate which
# sample we retained for each serum / library:
p = (ggplot(selections.assign(
                xlabel=lambda x: (x['serum_name_formatted'] + ', ' +
                                  x['library'])),
            aes('xlabel', 'percent_infectivity', color='retained')) +
     geom_point(size=3, alpha=0.7) +
     theme(
         axis_text_x=element_text(angle=90),
         figure_size=(0.25 * len(selections.groupby(
             ['serum_name_formatted', 'library'])), 2.5)
         ) +
     scale_y_log10(name='percent infectivity') +
     xlab('') +
     scale_color_manual(values=PALETTE) +
     geom_hline(yintercept=target_infectivity, linetype='dashed',
                alpha=0.7, color=PALETTE[2])
     )
_ = p.draw()

# #### Retained samples, percent infectivity, and serum/antibody
# concentration.  For sera the concentration is the serum dilution; for
# antibodies it is the antibody concentration in ug/ml.
# (For antibody+serum mixes, the value is the serum dilution followed by the
# antibody concentration in ug/ml.)
os.makedirs(config['selectiontabledir'], exist_ok=True)

for valtype in ['percent_infectivity', 'serum_dilution']:

    table = (selections
             .assign(serum_name=lambda x:
                     pd.Categorical(x['serum_name'],
                                    sera['serum_name'],
                                    ordered=True))
             .query('retained')
             .sort_values('serum_name')
             [['serum_group', 'serum_name_formatted', 'library', valtype]]
             .pivot_table(index=['serum_group', 'serum_name_formatted'],
                          columns='library',
                          values=valtype,
                          aggfunc=lambda x: ' '.join(str(v) for v in x))
             )

    print(f"\n************* {valtype} for each replicate *************")
    display(HTML(table.to_html()))

    table_file = os.path.join(config['selectiontabledir'],
                              f"{valtype}_table.csv")
    print(f"Writing to {table_file}")
    table.to_csv(table_file)

# ### Compute serum average from retained samples
# First, confirm that we retained exactly one sample per serum / library:
assert all(len(group) == 1 for _, group in
           (selections
            .query('retained')
            .groupby(['serum_name_formatted', 'library'])
            )
           )

# The "average" can be the mean or the median; which to use is specified in
# the config file:
avg_type = config['avg_type']
print(f"Computing across replicate averages as the {avg_type}")

# Sanity check: no library is named the same as the average type:
assert all(avg_type not in selections[col].values
           for col in ['library', 'name_formatted'])

# Averages are computed on the mutation-level data; the site data are then
# computed from those averaged mutation-level data.
# Loop over all sera, average the selection across each serum's retained
# samples, and collect everything (replicates plus the computed average) in
# a new data frame `avg_selections` similar to `selections`:
avgdir = config['avgdiffseldir']
os.makedirs(avgdir, exist_ok=True)

avg_selections = []
for serum_name_formatted, group in (
        selections
        .query('retained')
        [['serum_name_formatted', 'library', 'name_formatted'] +
         list(sera.columns) + selfilecols]
        .groupby('serum_name_formatted')):

    avg_selections.append(group)

    # build row of data frame with average
    avg_row = group.iloc[0].to_dict(into=collections.OrderedDict)
    avg_row['library'] = avg_type
    avg_row['name_formatted'] = avg_type

    avg_row['mutdiffsel_file'] = (f"{avgdir}/{serum_name_formatted}-"
                                  f"mutdiffsel-{avg_type}.csv")
    (dms_tools2.diffsel.avgMutDiffSel(group['mutdiffsel_file'], avg_type)
     .to_csv(avg_row['mutdiffsel_file'], index=False))

    # site-level averages are derived from the averaged mutation data
    avg_row['sitediffsel_file'] = (f"{avgdir}/{serum_name_formatted}-"
                                   f"sitediffsel-{avg_type}.csv")
    (dms_tools2.diffsel.mutToSiteDiffSel(
            pd.read_csv(avg_row['mutdiffsel_file']))
     .to_csv(avg_row['sitediffsel_file'], index=False))

    avg_row = pd.Series(avg_row).to_frame().transpose()
    assert all(avg_row.columns == group.columns)
    avg_selections.append(avg_row)

# put avg_selections in data frame, sort to match config['sera']
avg_selections = (pd.concat(avg_selections)
                  .assign(serum_name=lambda x:
                          pd.Categorical(x['serum_name'],
                                         sera['serum_name'],
                                         ordered=True))
                  .sort_values(['serum_name', 'library'])
                  .assign(serum_name_formatted=lambda x:
                          pd.Categorical(x['serum_name_formatted'],
                                         x['serum_name_formatted'].unique(),
                                         ordered=True))
                  .reset_index(drop=True)
                  )
# `avg_selections` now lists files for all the retained library replicates
# plus the average calculated from these replicates.
# `avg_sel_df` holds the actual site- and mutation-level averages for all
# sera, as well as the samples (one replicate per library) used to compute
# those averages:
avg_sel_df = (dms_tools2.diffsel.df_read_filecols(avg_selections,
                                                  selfilecols)
              .drop(columns=selfilecols)
              # preserve order of sera as in `avg_selections`
              .assign(serum_name_formatted=lambda x:
                      pd.Categorical(x['serum_name_formatted'],
                                     (avg_selections['serum_name_formatted']
                                      .unique()),
                                     ordered=True))
              )

# ### Identify sites of "significant" selection (so we can zoom in on them
# in logo plots).  dms_tools2.plot.findSigSel fits a gamma distribution to
# each site's selection values and flags "outliers" of high selection
# (a heuristic P-value).  It has two methods, *robust_hist* and *mle*; we
# use both and take any sites found by either method.

# #### Cutoff (FDR) for what constitutes significant:
fdr_cutoff = 0.05
# Pre-vaccination / pre-infection sera generally have weak signal (and in
# these human samples vaccination at most boosts existing specificities),
# so we ignore pre-vaccination samples when identifying significant sites.
# Plots of the fitted gamma distributions are saved for separate inspection.
os.makedirs(config['avgdiffsel_sigsites_dir'], exist_ok=True)
plotfile_template = os.path.join(config['avgdiffsel_sigsites_dir'],
                                 'sigsites_{serum}_{method}.pdf')
print(f"Identifying sites of significant selection at a FDR of {fdr_cutoff}.\n"
      f"Plots of distribution fitting saved as {plotfile_template}")

sigsites_df = []
sigsites_cols = ['serum_group', 'serum_name_formatted', 'isite', 'site',
                 'positive_diffsel']
for serum_name_formatted, group in (
        avg_sel_df
        .query('serum_vaccination != "pre"')
        .query('library == @avg_type')
        [sigsites_cols]
        .drop_duplicates()
        .groupby('serum_name_formatted', observed=True)):
    for method in ['robust_hist', 'mle']:
        plotfile = plotfile_template.format(serum=serum_name_formatted,
                                            method=method)
        df, _, _ = dms_tools2.plot.findSigSel(
            group,
            'positive_diffsel',
            plotfile,
            fdr=fdr_cutoff,
            title=serum_name_formatted,
            method=method
        )
        sigsites_df.append(df)

# a site is "significant" if flagged by either method
sigsites_df = (pd.concat(sigsites_df, ignore_index=True)
               .groupby(sigsites_cols)
               .aggregate({'sig': 'sum'})
               .reset_index()
               .assign(sig=lambda x: x['sig'].astype(bool))
               .query('sig')
               )
print('Here are the first few rows of sigsites_df:')
display(HTML(sigsites_df.head(n=4).to_html(index=False)))

# #### List significant sites for each serum (pre-vaccination samples were
# excluded above):
display(HTML(sigsites_df
             .sort_values('isite')
             .assign(nsites=1)
             .groupby('serum_name_formatted')
             .aggregate({'site': lambda x: ', '.join(list(x)),
                         'nsites': 'sum'})
             .rename(columns={'site': 'significant sites',
                              'nsites': 'number of sites'})
             .to_html()
             ))

# #### Get significant sites for each serum group
# For plots below we may want the sites significant within at least one
# serum sample of each serum group:
sigsites_by_serumgroup = (
    sigsites_df
    [['serum_group', 'isite', 'site']]
    .drop_duplicates()
    .sort_values('isite')
    .groupby('serum_group')
    .aggregate(lambda x: list(x))
    .assign(nsites=lambda x: x['site'].apply(len))
    .sort_values('nsites')
)
display(HTML(sigsites_by_serumgroup.to_html()))

# ### Line and logo plots of average for each serum
# #### Choose sites to zoom-in on
# To keep logo plots compact we zoom in only on the sites identified above
# as under "significant" selection for each serum group, optionally padded
# by neighboring sites for context.
# Amount of padding around each significant site (0 means no padding):
zoom_pad = 0

# Build dict `zoom_sites` keyed first by *serum_group*, then by 'isite' and
# 'site', with values the list of sites to zoom for that serum group:
zoom_sites = {}
for tup in sigsites_by_serumgroup.reset_index().itertuples(index=False):
    isites = set(tup.isite)
    for isite in list(isites):
        for pad in range(-zoom_pad, zoom_pad + 1):
            isites.add(isite + pad)
    # only keep valid sites
    isites = isites.intersection(set(avg_sel_df['isite']))
    isites = sorted(isites)  # get as sorted list
    # get sites corresponding to each isite
    sites = (avg_sel_df
             [['isite', 'site']]
             .drop_duplicates()
             .sort_values('isite')
             .query('isite in @isites')
             ['site']
             .tolist()
             )
    zoom_sites[tup.serum_group] = {'isite': isites,
                                   'site': sites,
                                   'nsites': len(sites)}

# Here are the sites we will zoom in on for each serum group:
display(HTML(pd.DataFrame.from_dict(zoom_sites, orient='index').to_html()))

# Add a column (`zoom_site`) to `avg_sel_df` indicating the zoom sites:
avg_sel_df = pd.concat(
    [df.assign(zoom_site=lambda x: x['isite'].isin(
        zoom_sites[serum_group]['isite']))
     for serum_group, df in avg_sel_df.groupby('serum_group')],
    ignore_index=True
)

# #### Write tidy data frame with selection data
# `avg_sel_df` now holds the selection for every mutation for each serum,
# averaged across its replicates.
# Write `avg_sel_df` to a CSV file, dropping some unneeded columns:
avg_sel_df_file = os.path.join(config['avgdiffseldir'], 'avg_sel_tidy.csv')
print(f"Writing average selection information to {avg_sel_df_file}")
(avg_sel_df
 .query('library == @avg_type')
 .drop(columns=['library', 'name_formatted', 'serum_description',
                'serum_name', 'serum_species'])
 .to_csv(avg_sel_df_file, index=False, float_format='%.5g')
 )

# #### Compact "zoom" plots
# For each serum group: line plots of site-level selection plus logo plots
# zoomed on significant-selection sites, via dmslogo's `facet_plot`.

# Label logo plots with site number AND wildtype residue:
avg_sel_df = avg_sel_df.assign(site_label=lambda x:
                               x['wildtype'] + x['site'].astype('str'))

# Share y-limits across rows for all plots EXCEPT the *antibody* serum
# groups:
share_ylim_across_rows = {serum_group: ('antibody' not in serum_group)
                          for serum_group in
                          avg_sel_df['serum_group'].unique()}

# Now we make the line and logo plots.
# Make the line and logo plots, saving a PDF version of each:
os.makedirs(config['avgdiffsel_zoom_dir'], exist_ok=True)

for serum_group, df in avg_sel_df.groupby('serum_group'):

    plotfile = os.path.join(config['avgdiffsel_zoom_dir'],
                            f"{serum_group}_avg.pdf")
    print(f"\n\n{'*' * 72}\nSerum group {serum_group}, saving to {plotfile}\n")

    fig, axes = dmslogo.facet_plot(
        data=df.query('library == @avg_type'),
        x_col='isite',
        show_col='zoom_site',
        gridrow_col='serum_name_formatted',
        share_xlabel=True,
        share_ylabel=True,
        share_ylim_across_rows=share_ylim_across_rows[serum_group],
        wspace=0.6,
        draw_line_kwargs=dict(
            height_col='positive_diffsel',
            xtick_col='site',
            ylabel='immune selection',
        ),
        draw_logo_kwargs=dict(
            letter_col='mutation',
            letter_height_col='mutdiffsel',
            xtick_col='site_label',
            xlabel='site',
            ylabel='immune selection',
            clip_negative_heights=True,
        ),
    )
    display(fig)
    fig.savefig(plotfile)
    plt.close(fig)

# #### Whole-gene logo plots
# Replicate-average selection for **all** sites via dms2_logoplot.  These
# are too large to usefully show in the notebook, but the next cell prints
# the name of each PDF so you can examine them individually.
# In[51]:


outdir = config['avgdiffsel_full_dir']  # whole-gene logo plots go here
os.makedirs(outdir, exist_ok=True)

for row in (avg_selections
            .query('library == @avg_type')
            .itertuples(index=False)
            ):

    name = row.serum_name_formatted
    plotfile = os.path.join(outdir, f"{name}_diffsel.pdf")

    # skip re-plotting when configured to re-use existing output
    if os.path.isfile(plotfile) and (config['use_existing'] == 'yes'):
        print(f"{plotfile} already exists.")
        continue

    datafile = row.mutdiffsel_file

    # the scale bar is drawn at half the largest positive site diffsel
    maxsitediffsel = pd.read_csv(row.sitediffsel_file)['positive_diffsel'].max()
    barheight = f"{0.5 * maxsitediffsel:.2g}"
    barlabel = f"differential selection = {barheight}"

    cmds = ['dms2_logoplot',
            '--outdir', outdir,
            '--diffsel', datafile,
            '--name', name,
            '--nperline', '95',
            '--overlay1', datafile, 'wildtype', 'wildtype',
            '--underlay', 'yes',
            '--restrictdiffsel', 'positive',
            '--scalebar', barheight, barlabel,
            '--use_existing', config['use_existing'],
            ]
    print(f"Plotting {name} to {plotfile}")
    subprocess.check_output(cmds)
    assert os.path.isfile(plotfile)


# ### Beta diversity for each group
# In order to determine how variable the selection is in each group, we compute the [beta diversity](https://esajournals.onlinelibrary.wiley.com/doi/10.1890/06-1736.1), using both the Shannon and Simpson indices.
# We look only at post-vaccination samples, excluding the duplicate 2009 age 53 sample and not examining the spike-in experiment.
# Here are the results:

# In[52]:


for div_index in ['shannon', 'simpson']:

    print(f"\n{div_index} beta diversity:")

    # restrict to replicate averages from post-vaccination samples,
    # dropping the duplicate sample and the spike-in experiment
    diversity = (avg_sel_df
                 .query('library == @avg_type')
                 .query('serum_vaccination != "pre"')
                 .query('serum_name_formatted != "2009-age-53-plus-2-months"')
                 .query('serum_group != "serum_mAb_spike"')
                 .groupby(['serum_group'])
                 .apply(dms_tools2.diffsel.beta_diversity,
                        samplecol='serum',
                        sitecol='site',
                        valcol='positive_diffsel',
                        index=div_index)
                 )
    print(diversity)


# Large beta diversity indicates more variability among the samples in a group.
# As expected, the ferrets have lower diversity than the human sera.
# This is especially pronounced when using the Simpson index, which emphasizes the effect of strong-selection sites.

# ### Plots of each replicate in averages
# In the above section, we plotted the average for each serum.
# Here we also show some data for the replicates that went into this average.

# #### Zoom plots showing each replicate
# Here we make line and zoomed logo plots for each replicate that goes into the average:

# In[53]:


os.makedirs(config['avgdiffsel_reps_dir'], exist_ok=True)

# keep the loop-variable name `serum_group`: a later cell (the customized
# figure plotting) reads whatever value it holds after this loop finishes
for serum_group, group_df in avg_sel_df.groupby('serum_group'):

    plotfile = os.path.join(config['avgdiffsel_reps_dir'],
                            f"{serum_group}_reps.pdf")
    base, _ = os.path.splitext(plotfile)
    svgplotfile = base + '.svg'
    print(f"\n\n{'*' * 72}\n{serum_group}, saving to {plotfile} and {svgplotfile}\n")

    # same panels as the averages, but faceted by library (replicate) column
    fig, axes = dmslogo.facet_plot(
            data=group_df.query('library != @avg_type'),
            x_col='isite',
            show_col='zoom_site',
            gridrow_col='serum_name_formatted',
            gridcol_col='library',
            share_xlabel=True,
            share_ylabel=True,
            share_ylim_across_rows=share_ylim_across_rows[serum_group],
            wspace=0.6,
            draw_line_kwargs=dict(height_col='positive_diffsel',
                                  xtick_col='site',
                                  ylabel='immune selection',
                                  ),
            draw_logo_kwargs=dict(letter_col='mutation',
                                  letter_height_col='mutdiffsel',
                                  xtick_col='site_label',
                                  xlabel='site',
                                  ylabel='immune selection',
                                  clip_negative_heights=True,
                                  ),
            )

    display(fig)
    for outfile in (plotfile, svgplotfile):
        fig.savefig(outfile)
    plt.close(fig)


# #### Plot replicate-replicate correlations
# Now we plot the correlation among the replicates that we retained for each serum.
# First, we make a tidy data frame with the correlations between all pairs of replicates for the same serum:

# In[54]:


corr_df = []
serum_names = avg_sel_df['serum_name_formatted'].unique()
# actual replicate libraries only (exclude the synthetic "average" library)
libraries = [lib for lib in avg_sel_df['library'].unique() if lib != avg_type]
for serum_name, serum_sel_df in avg_sel_df.groupby('serum_name_formatted'):
    corr_df.append(
        serum_sel_df
        .query('library in @libraries')
        # site-by-library matrix of positive diffsel, then pairwise Pearson
        .pivot_table(values='positive_diffsel',
                     columns='library',
                     index=['site'])
        .corr(method='pearson')
        .reset_index()
        .melt(id_vars='library',
              var_name='lib_B',
              value_name='correlation')
        .rename(columns={'library': 'lib_A'})
        # keep one triangle of the symmetric matrix (plus the diagonal)
        .query('lib_A <= lib_B')
        .assign(serum_name=serum_name)
        )
corr_df = (pd.concat(corr_df, ignore_index=True)
           # categoricals fix the facet / axis ordering in the plots below;
           # lib_B is reversed so the matrix reads top-left to bottom-right
           .assign(serum_name=lambda x: pd.Categorical(x['serum_name'],
                                                       serum_names,
                                                       ordered=True),
                   lib_A=lambda x: pd.Categorical(x['lib_A'],
                                                  libraries,
                                                  ordered=True),
                   lib_B=lambda x: pd.Categorical(x['lib_B'],
                                                  reversed(libraries),
                                                  ordered=True),
                   corr_str=lambda x: x['correlation'].apply('{:.2f}'.format)
                   )
           )
print('Here are the first few lines of the tidy correlation data frame:')
display(HTML(corr_df.head().to_html(index=False)))


# Now we use [plotnine](https://plotnine.readthedocs.io) to plot a correlation matrix for each serum, and then save to a file as well as showing it:

# In[55]:


ncol = 5
nsera = len(corr_df['serum_name'].unique())
corr_plot = (
    ggplot(corr_df, aes('lib_A', 'lib_B', fill='correlation',
                        label='corr_str')) +
    geom_tile(color='white', size=0.5) +
    geom_text() +
    facet_wrap('~ serum_name', ncol=ncol) +
    theme(figure_size=(2.55 * ncol, 2.55 * math.ceil(nsera / ncol)),
          panel_grid_major=element_blank()
          ) +
    scale_fill_continuous(limits=(0, 1)) +
    xlab('') +
    ylab('')
    )
_ = corr_plot.draw()
rep_corr_plot = os.path.join(config['avgdiffsel_reps_dir'],
                             'rep_corr_matrix.pdf')
print(f"Saving plot to {rep_corr_plot}")
corr_plot.save(rep_corr_plot)


# ## Customized figures for paper
#
# In the following section, we will generate additional customized paper figures not created above.
#
# These go in the following directory:

# In[56]:


print(f"Putting additional figures in {config['figsdir']}")
os.makedirs(config['figsdir'], exist_ok=True)


# We will save each of these figures with the following extensions:

# In[57]:


fig_extensions = ['.pdf', '.svg']


# ### Logo and line plot figures
# Now we are going to make versions of the zoomed logo plots that are slightly different than the ones above.
# We read in colors for specific mutations from a YAML file; these colors match those used to plot the neutralization curves.
# We also plot a slightly different set of sites: not just the "significant" ones, but also others of interest (these are sites of strong selection in ferrets).
#
# First, get data frames with sites to zoom on and how to color them, and how to color specific mutations:

# In[58]:


with open(config['figure_config']) as f:
    fig_config = yaml.safe_load(f)

zoom_sites = []
mutation_colors = []
for figure, fig_d in fig_config['figures'].items():

    # process zoom sites: any site without an explicit color in the figure
    # config falls back to the default logo color
    site_color_map = collections.defaultdict(
            lambda: fig_config['default_logo_color'])
    if 'site_colors' in fig_d:
        for r, c in fig_d['site_colors'].items():
            site_color_map[str(r)] = c
    zoom_sites.append(
        # the constant 'dummy' column emulates a cross join of the figure's
        # sera with every site in the selection data
        pd.DataFrame({'figure': figure, 'sera': fig_d['sera'], 'dummy': 0})
        .merge(pd.DataFrame({'site': avg_sel_df['site'].unique(),
                             'dummy': 0}))
        .drop(columns='dummy')
        .assign(zoom=lambda x: x['site'].isin([str(r) for r in
                                               fig_d['sites']]),
                color=lambda x: x['site'].map(site_color_map)
                )
        )

    # process colors for individual mutations
    for serum in fig_d['sera']:
        if ('ignore_serum_virus' in fig_d) and (serum in
                                                fig_d['ignore_serum_virus']):
            ignore_muts = fig_d['ignore_serum_virus'][serum]
        else:
            ignore_muts = []
        muts = [mut for mut in fig_d['colors'] if
                (mut not in {'wt', 'syn'}) and (mut not in ignore_muts)]
        # assumes mutation keys are strings like "F159G": first char is the
        # wildtype residue, middle chars the site, last char the mutant
        # -- TODO(review) confirm against the figure-config YAML
        mutation_colors.append(pd.DataFrame(
            {'figure': figure,
             'sera': serum,
             'site': [mut[1: -1] for mut in muts],
             'mutation': [mut[-1] for mut in muts],
             'color': [fig_d['colors'][mut][0] for mut in muts],
             }))

zoom_sites = pd.concat(zoom_sites)
mutation_colors = pd.concat(mutation_colors)


# Merge the zoom and mutation-color data frames with the data on all of the immune selection values:

# In[59]:


colored_zoom_df = (
    avg_sel_df
    .query('library == @avg_type')
    [['serum_name_formatted', 'isite', 'site', 'mutation',
      'positive_diffsel', 'mutdiffsel', 'site_label']]
    .rename(columns={'serum_name_formatted': 'sera'})
    .merge(zoom_sites, on=['sera', 'site'], how='left')
    .assign(zoom=lambda x: x['zoom'].fillna(False))
    .merge(mutation_colors, on=['figure', 'sera', 'site', 'mutation'],
           how='left')
    # mutation-specific colors (color_y) override the site color (color_x)
    .assign(color=lambda x: x['color_x'].where(x['color_y'].isna(),
                                               x['color_y']))
    .drop(columns=['color_x', 'color_y'])
    )


# Make the zoomed logo plots:

# In[60]:


for figure, df in colored_zoom_df.groupby('figure'):

    # facet labels: use explicit names from the config when given, else the
    # serum name with dashes replaced by spaces
    if 'sera_names' in fig_config['figures'][figure]:
        name_map = fig_config['figures'][figure]['sera_names']
    else:
        name_map = {s: s.replace('-', ' ') for s in df.sera.unique()}
    df = df.assign(sera_names=lambda x: pd.Categorical(
                                            x.sera.map(name_map),
                                            x.sera.map(name_map).unique(),
                                            ordered=True))

    fig, axes = dmslogo.facet_plot(
            data=df,
            x_col='isite',
            show_col='zoom',
            gridrow_col='sera_names',
            share_xlabel=True,
            share_ylabel=True,
            # NOTE(review): `serum_group` here is the stale loop variable
            # left over from the per-serum-group plotting cells above, NOT
            # something derived from `figure` -- every figure gets the ylim
            # setting of whichever serum group happened to be iterated last.
            # Looks unintentional; confirm and key this off the figure's sera.
            share_ylim_across_rows=share_ylim_across_rows[serum_group],
            wspace=0.6,
            rmargin=0.7,
            draw_line_kwargs=dict(height_col='positive_diffsel',
                                  xtick_col='site',
                                  ylabel='immune selection',
                                  show_color=PALETTE[-2],
                                  ),
            draw_logo_kwargs=dict(letter_col='mutation',
                                  letter_height_col='mutdiffsel',
                                  color_col='color',
                                  xtick_col='site_label',
                                  xlabel='site',
                                  ylabel='immune selection',
                                  clip_negative_heights=True,
                                  ),
            )

    display(fig)
    for ext in fig_extensions:
        plotfile = os.path.join(config['figsdir'], f"{figure}_logo{ext}")
        print(f"Saving figure to {plotfile}")
        fig.savefig(plotfile)
    plt.close(fig)


# ### Replicate-to-replicate correlations
# Plot replicate-replicate correlations for the sera in each figure:

# In[61]:


for figure, fig_d in fig_config['figures'].items():

    sera = fig_d['sera']
    # 5 sera fit on one row; otherwise wrap at 4 facets per row
    if len(sera) == 5:
        ncol = 5
    else:
        ncol = 4
    fig_corr_df = (corr_df
                   .query('serum_name in @sera')
                   .assign(serum_name=lambda x: x['serum_name'].str
                           .replace('-', ' '))
                   )
    fig_corr_plot = (
        ggplot(fig_corr_df, aes('lib_A', 'lib_B', fill='correlation',
                                label='corr_str')) +
        geom_tile(color='white', size=0.5) +
        geom_text() +
        facet_wrap('~ serum_name', ncol=ncol) +
        theme(figure_size=(2.55 * min(len(sera), ncol),
                           2.55 * math.ceil(len(sera) / ncol)),
              panel_grid_major=element_blank()
              ) +
        scale_fill_continuous(limits=(0, 1)) +
        xlab('') +
        ylab('')
        )
    print(fig_corr_plot)
    for ext in fig_extensions:
        plotfile = os.path.join(config['figsdir'], f"{figure}_rep_corr{ext}")
        print(f"Saving figure to {plotfile}")
        fig_corr_plot.save(plotfile)


# ### Percent infectivity for each replicate
# Plot the percent infectivity remaining for the library for each selection.

# In[62]:


def format_label(x: float) -> str:
    """Format a percent-infectivity value as a label like '2.1%' or '100%'."""
    if x == 100:
        lab = '100'
    else:
        lab = '{0:#.2g}'.format(x)
    # '#' formatting keeps a trailing decimal point (e.g. '25.'); strip it
    if lab[-1] == '.':
        lab = lab[: -1]
    return lab + '%'


for figure, fig_d in fig_config['figures'].items():

    sera = fig_d['sera']
    df = (selections
          .query('retained')
          .query('serum_name_formatted in @sera')
          [['serum_name_formatted', 'library', 'percent_infectivity']]
          .assign(serum=lambda x: x['serum_name_formatted'].str
                  .replace('-', ' '),
                  label=lambda x: x['percent_infectivity'].apply(format_label)
                  )
          )
    # y-axis spans at least 0.1% to 100%, extended to powers of 10 that
    # cover the observed values
    ymax = max(100, 10**math.ceil(math.log10(df['percent_infectivity']
                                             .max())))
    ymin = min(0.1, 10**math.floor(math.log10(df['percent_infectivity']
                                              .min())))
    yextent = math.log10(ymax / ymin)
    # place labels below the points when they sit near the top of the axis
    if df['percent_infectivity'].max() > 50:
        nudge_y = -0.1 * yextent
    else:
        nudge_y = 0.1 * yextent
    if len(sera) == 5:
        ncol = 5
    else:
        ncol = 4
    plot_percent_infectivity = (
        ggplot(df, aes('library', 'percent_infectivity', label='label',
                       color='library')) +
        geom_point(size=3) +
        geom_text(size=10, nudge_y=nudge_y) +
        facet_wrap('~ serum', ncol=ncol) +
        scale_y_log10(limits=(ymin, ymax), name='percent infectivity') +
        theme(figure_size=(2.55 * min(ncol, len(sera)),
                           2.55 * math.ceil(len(sera) / ncol))) +
        # NOTE(review): `guide=False` is deprecated in newer plotnine in
        # favor of `guide=None` -- presumably fine for the pinned version
        scale_color_manual(values=PALETTE[1: ], guide=False)
        )
    print(plot_percent_infectivity)
    for ext in fig_extensions:
        plotfile = os.path.join(config['figsdir'],
                                f"{figure}_percent_infectivity{ext}")
        print(f"Saving figure to {plotfile}")
        plot_percent_infectivity.save(plotfile)


# In[ ]: