#!/usr/bin/env python
# coding: utf-8

# # Think Bayes
#
# This notebook presents example code and exercise solutions for Think Bayes.
#
# Copyright 2018 Allen B. Downey
#
# MIT License: https://opensource.org/licenses/MIT

# In[1]:


# Configure Jupyter so figures appear in the notebook
get_ipython().run_line_magic('matplotlib', 'inline')

# Configure Jupyter to display the assigned value after an assignment
get_ipython().run_line_magic('config', "InteractiveShell.ast_node_interactivity='last_expr_or_assign'")

# import classes from thinkbayes2
from thinkbayes2 import Pmf, Suite, Beta
import thinkplot

import numpy as np


# ## The social desirability problem
#
# Whenever you survey people about sensitive issues, you have to deal with [social desirability bias](https://en.wikipedia.org/wiki/Social_desirability_bias), which is the tendency of people to shade their answers in the direction they think shows them in the most positive light.
#
# One of the ways to improve the quality of the results is to collect responses in indirect ways.  For example, [here's a clever way one research group estimated the prevalence of atheists](https://fivethirtyeight.com/features/way-more-americans-may-be-atheists-than-we-thought/).
#
# Another way is [randomized response](https://en.wikipedia.org/wiki/Randomized_response), as described in [this presentation](http://www.soz.unibe.ch/ueber_uns/personen/jann/presentations_by_ben_jann/e131361/e131381/rrt_online07_kassel08_ger.pdf) or [this video](https://www.youtube.com/watch?v=nwJ0qY_rP0A).
#
# As an example, suppose you ask 100 people to flip a coin and:
#
# * If they get heads, they report YES.
#
# * If they get tails, they honestly answer the question "Do you believe in God?"
#
# And suppose you get 80 YESes and 20 NOs.
#
# 1. Estimate the prevalence of believers in the surveyed population (by which, as always, I mean compute a posterior distribution).
#
# 2. How efficient is this method?  That is, how does the width of the posterior distribution compare to the distribution you would get if 100 people answered the question honestly?

# In[2]:


# Solution

class Social(Suite):
    """Suite of hypotheses about the true proportion behind randomized responses."""

    def Likelihood(self, data, hypo):
        """Compute the likelihood of one randomized response.

        data: outcome of unreliable measurement, either 'YES' or 'NO'
        hypo: actual proportion of the thing we're measuring

        returns: probability of observing `data` if the proportion is `hypo`

        raises: ValueError if `data` is neither 'YES' nor 'NO'
        """
        p = hypo
        # Heads (probability 1/2) forces a YES; tails (probability 1/2)
        # gives an honest answer, which is YES with probability p.
        p_yes = 0.5 + p/2
        if data == 'YES':
            return p_yes
        if data == 'NO':
            return 1 - p_yes
        # Reject anything else rather than silently counting it as a 'NO'.
        raise ValueError("data must be 'YES' or 'NO', got {!r}".format(data))


# In[3]:


# Solution

prior = np.linspace(0, 1, 101)
suite = Social(prior)

thinkplot.Pdf(suite, label='Prior')
thinkplot.decorate(xlabel='Fraction of the population',
                   ylabel='PDF')


# In[4]:


# Solution

# Survey results from the problem statement.
n_yes = 80
n_no = 20
n_total = n_yes + n_no

for _ in range(n_yes):
    suite.Update('YES')

for _ in range(n_no):
    suite.Update('NO')


# In[5]:


# Solution

thinkplot.Pdf(suite, label='Posterior')
thinkplot.decorate(xlabel='Fraction of the population',
                   ylabel='PDF')


# In[6]:


# Solution

suite.Mean(), suite.MAP()


# In[7]:


# Solution

# For comparison, what would we think if we had been able
# to survey 100 people directly?

# Inverting p_yes = 0.5 + p/2 gives p = 2*p_yes - 1, so the observed
# share of YESes corresponds to 2*n_yes - n_total honest YESes
# (60 of 100 here) and the remainder honest NOs (40).
direct_yes = 2 * n_yes - n_total
direct_no = n_total - direct_yes

beta = Beta(1, 1)
beta.Update((direct_yes, direct_no))
thinkplot.Pdf(beta.MakePmf(), label='Direct', color='gray')

thinkplot.Pdf(suite, label='Randomized')
thinkplot.decorate(xlabel='Fraction of the population',
                   ylabel='PDF')


# In[8]:


# Solution

# To see how efficient this method is, we can divide the sample size for
# the direct method by a factor.  It looks like we lose a factor of $2 \sqrt{2}$.

factor = 2 * np.sqrt(2)
beta = Beta(1, 1)
beta.Update((direct_yes/factor, direct_no/factor))
thinkplot.Pdf(beta.MakePmf(), label='Direct', color='gray')

thinkplot.Pdf(suite, label='Randomized')
thinkplot.decorate(xlabel='Fraction of the population',
                   ylabel='PDF')


# In[9]:


# Solution

# So the effective sample size is about 35.

n_total / factor


# In[ ]: