#!/usr/bin/env python # coding: utf-8 # *** # ## Figure 6: # # # Histograms show distributions of overlap estimates $\hat{s}$, computed using Eq (11), for various values of s which are indicated by color-matched dotted lines. While all estimates are distributed around the true values of s, increasing the number of colonies c from 48 (top) to 96 (middle) and to 144 (bottom) substantially decreases the error of the estimates. For example the bottom plot shows that successfully sequencing c = 144 colonies from each parasite is guaranteed to produce estimates $\hat{s}$ that are off by at most 5 (8.3%) in either direction of the true s. # # Link to the published figure: https://doi.org/10.1371/journal.pcbi.1006898.g006 # In[1]: import numpy as np import pandas as pd import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') from scipy.stats import hypergeom from scipy.stats import binned_statistic as binsta from scipy.special import logsumexp from util import * import palettable as pal clrx = pal.cartocolors.qualitative.Prism_10.mpl_colors clr = tuple(x for n,x in enumerate(clrx) if n in [1,2,4,5,6]) clr2 = pal.cartocolors.sequential.agSunset_7.mpl_colors import matplotlib.gridspec as gridspec import matplotlib.patches as patches import plotly.graph_objects as go import plotly.tools as tls from plotly.offline import plot, iplot, init_notebook_mode from IPython.core.display import display, HTML init_notebook_mode(connected = True) config={'showLink': False, 'displayModeBar': False} # CCP, the Coupon Collector's Problem def ccp_sample(c,pool=60): return len(set(np.random.choice(pool,c))) # Draw overlap def nab_sample(s,na,nb,pool=60): sa = np.random.hypergeometric(s,pool-s,na) nab = np.random.hypergeometric(sa,pool-sa,nb) return nab # Overlap between two PCRs of depth c and overlap s def pcr_sample(c,s): na = ccp_sample(c) nb = ccp_sample(c) return nab_sample(s,na,nb),na,nb # Draw na and nb samples from two populations of size pool_a and pool_b, with true overlap s # and return empirical overlap between na and nb # note that this is basically the same as nab_sample, but with two different size pools! def nab_sample_unequal(s,na,nb,pool_a,pool_b): sa = np.random.hypergeometric(s,pool_a-s,na) nab = np.random.hypergeometric(sa,pool_b-sa,nb) return nab def p_ccp(c, pool=60): p = np.zeros([c+1,pool+1]) p[0,0] = 1; for row in range(1,c+1): for k in range(1,np.min([row+2,pool+1])): p[row,k] = p[row-1,k]*k/pool + p[row-1,k-1]*(1-(k-1)/pool) return p[-1,:] def p_overlap(na,nb,nab,pool=60): p_s = np.zeros(pool+1) # reference: hypergeom.pmf(outcome, Total, hits, Draws, loc=0) for s in np.arange(pool+1): # p_sa is the probability that we'd get sa from the overlap (s), just in na draws of a p_sa = hypergeom.pmf(np.arange(pool+1),pool,s,na) # p_nab_given_sa is the probability of getting that nab, given sa p_nab_given_sa = hypergeom.pmf(nab,pool,np.arange(pool+1),nb) p_s[s] = np.dot(p_sa,p_nab_given_sa) return p_s/np.sum(p_s) def e_overlap(na,nb,nab,pool=60): p_s = p_overlap(na,nb,nab,pool=pool) return np.dot(np.arange(pool+1),p_s) def credible_interval(na,nb,nab,pct=90,pool=60): p_s = p_overlap(na,nb,nab,pool=pool) cdf = np.cumsum(p_s) ccdf = np.flipud(np.cumsum(np.flipud(p_s))) # adjust for fractions vs percents; put everything as a fraction if pct > 1: pct = pct/100 cutoff = (1-pct)/2 # get the lower bound. # it's the first index at which cdf ≥ cutoff try: lower = np.where(cdf >= cutoff)[0][0] except IndexError: lower = 0 # get the upper bound # it's the first index at which ccdf ≥ 0.05 try: upper = np.where(ccdf >= cutoff)[0][-1] except IndexError: upper=pool expectation = np.dot(np.arange(pool+1),p_s) # Sanity and indexing check: uncomment this line to see true tail probability ≤ 0.05 # print([cdf[lower-1],(1-ccdf[upper+1])]) return lower,expectation,upper def p_nab_given_c(s,c,pool=60): pna = p_ccp(c) pnb = p_ccp(c) nas = np.arange(1,len(pna)) nbs = np.arange(1,len(pnb)) p_gen = np.zeros([pool+1,pool+1,pool+1]) for na in nas: p_sa = hypergeom.pmf(np.arange(pool+1),pool,s,na) for nb in nbs: pna_pnb = pna[na] * pnb[nb] for nab in range(0,np.minimum(na,nb)): p_nab_given_sa = hypergeom.pmf(nab,pool,np.arange(pool+1),nb) p_nab_given_s = np.dot(p_sa,p_nab_given_sa) p_gen[na,nb,nab] = p_nab_given_s * pna_pnb return p_gen def p_shat_given_sc(s,c,shat,pool=60): masses = p_nab_given_c(s,c,pool=pool) if np.sum(masses)<0.99: print('Swapping to Monte Carlo') return p_shat_given_sc_montecarlo(s,c,shat,pool=pool) hist = binsta(np.ravel(shat),np.ravel(masses),statistic='sum',bins=(np.arange(pool+2)-0.5)) return hist def p_shat_given_sc_montecarlo(s,c,shat,pool=60,n_mc=int(1e5)): masses = np.zeros([pool+1,pool+1,pool+1]) for ii in range(n_mc): nab,na,nb = pcr_sample(c,s) masses[na,nb,nab] += 1 hist = binsta(np.ravel(shat),np.ravel(masses/n_mc),statistic='sum',bins=(np.arange(pool+2)-0.5)) return hist def compute_all_estimates(pool=60): shat = np.zeros([pool+1,pool+1,pool+1]) for na in range(1,pool+1): for nb in range(1,pool+1): for nab in range(0,np.minimum(na+1,nb+1)): shat[na,nb,nab] = e_overlap(na,nb,nab,pool=pool) return shat def p_overlap_unequal(na,nb,nab,pool_a,pool_b): # all loops are in terms of pool_a, which is assumed to be ≤ pool_b. p_s = np.zeros(pool_a+1) # reference: hypergeom.pmf(outcome, Total, hits, Draws, loc=0) for s in np.arange(pool_a+1): # p_sa is the probability that we'd get sa from the overlap (s), just in na draws of a p_sa = hypergeom.pmf(np.arange(pool_a+1),pool_a,s,na) # p_nab_given_sa is the probability of getting that nab, given sa p_nab_given_sa = hypergeom.pmf(nab,pool_b,np.arange(pool_a+1),nb) p_s[s] = np.dot(p_sa,p_nab_given_sa) return p_s/np.sum(p_s) def e_overlap_unequal(na,nb,nab,pool_a,pool_b): # TODO. Code expects that pool_b > pool_a... p_s = p_overlap_unequal(na,nb,nab,pool_a,pool_b) return np.dot(np.arange(pool_a+1),p_s) # shat = compute_all_estimates(pool=60) # np.save('shat_60.npy',shat) shat = np.load('shat_60.npy') shat = np.load('shat_60.npy') hist_error = {} pool = 60 ss = [10,20,30,40,50] cc = [48,96,144] for c in cc: hist_error[c] = np.zeros([len(ss),pool+1]) for idx,s in enumerate(ss): hist_error[c][idx,:] = np.array(p_shat_given_sc(s,c,shat)[0]) fig5 = go.Figure() colors = ['rgb(126,169,195)', 'rgb(143,204,204)','rgb(183,214,161)', 'rgb(243,207,113)','rgb(237,181,114)'] c1 = [96,144] for idx, s in enumerate(ss): fig5.add_trace(go.Scatter(x = s*np.ones(15), y =np.linspace(0,0.29,15), mode = 'markers', marker=dict(color=colors[idx]),showlegend=False, name = s )) for idx, s in enumerate(ss): fig5.add_trace(go.Bar(x = np.arange(pool+1), y = hist_error[48][idx],marker_color = colors[idx],opacity=0.75, showlegend=False, marker_line_color = "Black", marker_line_width=1.5, name = s)) for c in c1: for idx, s in enumerate(ss): fig5.add_trace(go.Bar(x = np.arange(pool+1), y = hist_error[c][idx],marker_color = colors[idx],opacity=0.75, showlegend=False, visible = False,marker_line_color = "Black", marker_line_width=1.5, name = s)) fig5.update_layout(barmode='overlay', plot_bgcolor='rgb(255,255,255)', height = 350, xaxis_title = 'Estimated overlap, \u015D', yaxis_title = 'Probability of \u015D', updatemenus=[ dict( active=0, buttons=list([ dict(label="c = 48 colonies", method="update", args=[{"visible": [True,True,True,True, True,True,True,True,True, True, False, False, False, False,False,False, False, False, False,False]}, {"title": ""}]), dict(label="c = 96 colonies", method="update", args=[{"visible": [True,True,True,True, True, False, False, False, False,False, True,True,True,True, True,False, False, False, False,False]}, {"title": "" }]), dict(label="c = 144 colonies", method="update", args=[{"visible": [True,True,True,True, True, False, False, False, False,False, False, False, False, False,False,True,True,True,True, True]}, {"title": ""}]), ]), x=0.2, y = 1.27 ) ]) # PLot figure plot(fig5, filename = 'plotly_figures/fig5.html', config = config) display(HTML('plotly_figures/fig5.html'))