#!/usr/bin/env python
# coding: utf-8

# # The Left Handed Sister Problem

# Think Bayes, Second Edition
#
# Copyright 2021 Allen B. Downey
#
# License: [Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/)

# Suppose you meet someone who looks like the brother of your friend Mary.
# You ask if he has a sister named Mary, and he says "Yes I do, but I don't think I know you."
#
# You remember that Mary has a sister who is left-handed, but you don't remember her name.
# So you ask your new friend if he has another sister who is left-handed.
#
# If he does, how much evidence does that provide that he is the brother of your friend, rather than a random person who coincidentally has a sister named Mary and another sister who is left-handed? In other words, what is the Bayes factor of the left-handed sister?
#
# Let's assume:
#
# * Out of 100 families with children, 20 have one child, 30 have two children, 40 have three children, and 10 have four children.
#
# * All children are either boys or girls with equal probability, one girl in 10 is left-handed, and one girl in 100 is named Mary.
#
# * Name, sex, and handedness are independent, so every child has the same probability of being a girl, left-handed, or named Mary.
#
# * If the person you met had more than one sister named Mary, he would have said so, but he could have more than one sister who is left handed.

# ## Constructing the prior
#
# I'll make a Pandas `Series` that enumerates possible families with 2, 3, or 4 children.
# (One-child families are left out: a person who has a sister comes from a family with at least two children, so those families would be eliminated by the updates below anyway.)

# In[1]:


import pandas as pd

# Each tuple is (number of boys, number of girls).
qs = [
    (2, 0), (1, 1), (0, 2),
    (3, 0), (2, 1), (1, 2), (0, 3),
    (4, 0), (3, 1), (2, 2), (1, 3), (0, 4),
]
index = pd.MultiIndex.from_tuples(qs, names=['Boys', 'Girls'])


# To compute the proportion of each type of family, I'll use Scipy to compute the binomial distribution.

# In[2]:


from scipy.stats import binom

boys = index.to_frame()['Boys']
girls = index.to_frame()['Girls']
# Probability of `girls` girls among `boys + girls` children when each
# child is a girl with probability 0.5.
ps = binom.pmf(girls, boys+girls, 0.5)


# And put the results into a Pandas `Series`.

# In[3]:


prior1 = pd.Series(ps, index, name='Prior')
pd.DataFrame(prior1)


# But we also know that families with 2, 3, and 4 children occur with frequencies proportional to 30%, 40%, and 10%, so we can multiply through.

# In[4]:


# Relative frequency of each family size (number of children -> weight).
# Deriving the per-row weights from the family size keeps them aligned
# with `qs` without maintaining a second hand-ordered list.
family_weights = {2: 30, 3: 40, 4: 10}
ps = [family_weights[n_boys + n_girls] for n_boys, n_girls in qs]
prior1 *= ps
pd.DataFrame(prior1)


# So that's the (unnormalized) prior.
#
# I'll use the following function to do Bayesian updates.

# In[5]:


def make_table(prior, likelihood):
    """Make a DataFrame representing a Bayesian update.

    prior: Series (or anything `pd.DataFrame` accepts as a single
        column) of prior probabilities; it need not be normalized.
    likelihood: values assigned as the 'Likelihood' column, one per
        row of `prior`.

    returns: DataFrame with columns 'Prior', 'Likelihood', 'Product'
        (prior times likelihood) and 'Posterior' (product divided by
        the sum of the products).
    """
    table = pd.DataFrame(prior)
    table.columns = ['Prior']
    table['Likelihood'] = likelihood
    table['Product'] = (table['Prior'] * table['Likelihood'])
    # The sum of the products is the total probability of the data.
    total = table['Product'].sum()
    table['Posterior'] = table['Product'] / total
    return table


# This function takes a prior and a likelihood and returns a `DataFrame`.

# ## The first update
#
# Due to [length-biased sampling](https://towardsdatascience.com/the-inspection-paradox-is-everywhere-2ef1c2e9d709), the person you met is more likely to come from a family with more boys.
# Specifically, the likelihood of meeting someone from a family with $n$ boys is proportional to $n$.

# In[6]:


likelihood1 = prior1.index.to_frame()['Boys']
table1 = make_table(prior1, likelihood1)
table1


# So that's what we should believe about the family after the first update.

# ## The second update
#
# The likelihood that a person has exactly one sister named Mary is given by the binomial distribution where `n` is the number of girls in the family and `p` is the probability that a girl is named Mary.

# In[7]:


ns = prior1.index.to_frame()['Girls']
p = 1 / 100
k = 1
likelihood2 = binom.pmf(k, ns, p)
likelihood2


# Here's the second update.

# In[8]:


prior2 = table1['Posterior']
table2 = make_table(prior2, likelihood2)
table2


# Based on the sister named Mary, we can rule out families with no girls, and families with more than one girl are more likely.

# ## Probability of a left-handed sister
#
# Finally, we can compute the probability that he has at least one left-handed sister.
# The likelihood comes from the binomial distribution again, where `n` is the number of *additional* sisters, and we use the survival function to compute the probability that one or more are left-handed.

# In[9]:


ns = prior1.index.to_frame()['Girls'] - 1
ns.name = 'Additional sisters'
# Families with no girls would come out as -1 additional sisters;
# clamp those to 0.
neg = (ns < 0)
ns[neg] = 0
pd.DataFrame(ns)


# In[10]:


p = 1 / 10
k = 1
# sf(k-1) is the probability of more than k-1, that is, k or more.
likelihood3 = binom.sf(k-1, ns, p)
likelihood3


# A convenient way to compute the total probability of an outcome is to do an update as if it happened, ignore the posterior probabilities, and compute the sum of the products.

# In[11]:


prior3 = table2['Posterior']
table3 = make_table(prior3, likelihood3)
table3


# At this point, there are only three family types left standing, (1,2), (2,2), and (1,3).
#
# Here's the total probability that your new friend has a left-handed sister.

# In[12]:


p = table3['Product'].sum()
p


# ## The Bayes factor
#
# If your interlocutor is the brother of your friend, the probability is 1 that he has a left-handed sister.
# If he is not the brother of your friend, the probability is `p`.
# So the Bayes factor is the ratio of these probabilities.

# In[13]:


1/p


# This might be the hardest Bayesian puzzle I've created.
# In fact, I got it wrong the first time, until [Aubrey Clayton convinced me](https://twitter.com/aubreyclayton/status/1420041376377475075) I needed to take into account the number of boys and girls in each family, not just the size.
# He solved the problem by enumerating the possible families in a giant spreadsheet!
# So the fact that we get the same answer gives me more confidence it is correct.
#
# Thanks to Aubrey and the other folks on Twitter who submitted answers, including
# [Corey Yanofsky](https://twitter.com/Corey_Yanofsky/status/1418627294256582664) and
# [Michal Haltuf](https://twitter.com/MichalHaltuf/status/1418685902717693952).
#
# If you like this puzzle, you might like the [new second edition of *Think Bayes*](https://thinkbayes.com).

# In[ ]: