In [1]:
from thinkbayes import Pmf
In [2]:
pmf = Pmf()
for x in range(1, 7):
    pmf.Set(x, 1)
In [3]:
pmf.Print()
1 1
2 1
3 1
4 1
5 1
6 1
In [4]:
pmf.Normalize()
Out[4]:
6
In [5]:
pmf.Print()
1 0.166666666667
2 0.166666666667
3 0.166666666667
4 0.166666666667
5 0.166666666667
6 0.166666666667

Prior

In [6]:
pmf = Pmf()
pmf.Set('Bowl 1', 0.5)
pmf.Set('Bowl 2', 0.5)

Update

P(Vanilla | Bowl 1) = 3/4

P(Vanilla | Bowl 1) = 2/4

In [7]:
pmf.Mult('Bowl 1', 0.75)
pmf.Mult('Bowl 2', 0.5)

Normalize

In [8]:
pmf.Normalize()
print pmf.Prob('Bowl 1')
0.6

The dice problem

We representa box with dice of [4, 6, 8, 12, 20] sides.

In [9]:
from thinkbayes import Suite

# Start with equal priors
suite = Suite([4, 6, 8, 12, 20])
suite.Print()
4 0.2
6 0.2
8 0.2
12 0.2
20 0.2

We're going to add a Likelihood method

In [18]:
from thinkbayes import Suite

class Dice(Suite):
    def Likelihood(self, hypo, data):
        """Computes the likelihood of the data under the hypothesis.

        hypo: integer number of sides on the die
        data: integer die roll
        """
        num_sides = hypo
        outcome = data

        if outcome > num_sides:
            return 0
        else:
            return 1.0/num_sides

Let's build a suite that represents our dice:

In [21]:
suite = Dice([4, 6, 8, 12, 20])

And now let's say that we roll once and get a 6. What does this tells us about the probabilities for the various dice?

In [22]:
suite.Update(6)
print 'After one 6'
suite.Print()
After one 6
4 0.0
6 0.392156862745
8 0.294117647059
12 0.196078431373
20 0.117647058824

And now let's roll a few more dice and figure out how that informs us about the probability of the die being of a specific size:

In [23]:
for roll in [6, 4, 8, 7, 7, 2]:
    suite.Update(roll)
    
suite.Print()
4 0.0
6 0.0
8 0.943248453672
12 0.0552061280613
20 0.0015454182665

The German Tank problem

In [27]:
%pylab inline
Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.kernel.zmq.pylab.backend_inline].
For more information, type 'help(pylab)'.
In [31]:
import thinkbayes
import myplot

class Train(thinkbayes.Suite):
    """Suite of hypotheses about the number of trains."""

    def Likelihood(self, hypo, data):
        """Computes the likelihood of the data under the hypothesis.

        hypo: integer hypothesis about the number of trains
        data: integer serial number of the observed train
        """
        num_trains = hypo
        train_seen = data
        if train_seen > num_trains:
            return 0
        else:
            return 1.0/num_trains
Given the data, our mean estimate is: 597.344008084
In [34]:
hypos = xrange(100, 1001)
suite = Train(hypos)

suite.Update(321)
print 'Given the data, our mean estimate is:',
print suite.Mean()

myplot.Pmf(suite)
Given the data, our mean estimate is: 597.344008084

Let's now assume we're spotting a few more trains, and let's see how our posterior updates as we measure this data:

In [35]:
for train in [782, 423]:
    suite.Update(train)

print 'Given the data, our mean estimate is:',
print suite.Mean()
myplot.Pmf(suite)
Given the data, our mean estimate is: 877.54322135

Euro problem

Estimation: based on the data (140h, 110t), what is the probability that the coin lands heads?

Hypothesis testing: what is the probability that the coin is fair?

In [40]:
class Euro(thinkbayes.Suite):

    def Likelihood(self, hypo, data):
        """Computes the likelihood of the data under the hypothesis.

        hypo: integer value of x, the probability of heads (0-100)
        data: string 'H' or 'T'
        """
        # Note: while we're writing the likelihood function, we
        # *assume* the hypothesis is true.  Don't forget this!
        
        # We convert the hypothesis to a probability in the 0-1 range:
        x = hypo/100.0
        
        # Now, since the hypothesis is about Heads, if the data is H
        # it means the hypothesis is True, so we can just return the hypothesis
        if data == 'H':
            return x
        else:
            return 1-x

suite = Euro(range(0, 101))
myplot.Pmf(suite)
In [41]:
suite.Update('H')
myplot.Pmf(suite)
In [42]:
suite.Update('H')
myplot.Pmf(suite)
In [43]:
suite.Update('T')
myplot.Pmf(suite)
In [44]:
for outocme in 'HHHHHHHHTTT':
    suite.Update(outocme)
myplot.Pmf(suite)
In [45]:
suite = Euro(range(0, 101))
for outocme in 'H'*140 + 'T'*110:
    suite.Update(outocme)
myplot.Pmf(suite)

What is the probability that the coin is fair? I.e., what is the prob. that x is 50%?

In [48]:
suite.Prob(50)
Out[48]:
0.02097652612954468

Hypothesis testing

In [49]:
%load euro2.py
In [50]:
class Euro(thinkbayes.Suite):

    def Likelihood(self, hypo, data):
        """Computes the likelihood of the data under the hypothesis.

        hypo: integer value of x, the probability of heads (0-100)
        data: string 'H' or 'T'
        """
        x = hypo / 100.0
        heads, tails = data
        like = x**heads * (1-x)**tails
        return like


def AverageLikelihood(suite, data):
    """Computes the average likelihood over all hypothesis in suite.

    Args:
      suite: Suite of hypotheses
      data: some representation of the observed data

    Returns:
      float
    """
    total = 0

    for hypo, prob in suite.Items():
        like = suite.Likelihood(hypo, data)
        total += prob * like

    return total
In [51]:
fair = Euro()
fair.Set(50, 1)

bias = Euro()
for x in range(0, 101):
    if x != 50:
        bias.Set(x, 1)
bias.Normalize()

data = 140, 110

like_bias = AverageLikelihood(bias, data)
print 'like_bias', like_bias

like_fair = AverageLikelihood(fair, data)
print 'like_fair', like_fair

ratio = like_bias / like_fair
print 'Bayes factor', ratio
like_bias 2.57964902292e-76
like_fair 5.52714787526e-76
Bayes factor 0.466723359161
In [58]:
bias = Euro()

for x in range(40, 70):
    if x != 50:
        bias.Set(x, 1)
bias.Normalize()

like_bias = AverageLikelihood(bias, data)
print 'like_bias', like_bias

ratio = like_bias / like_fair
print 'Bayes factor', ratio
like_bias 8.89531486664e-76
Bayes factor 1.60938608255

SAT scores

In [60]:
import csv
import math
import numpy
import sys

import matplotlib

import myplot


def ReadScale(filename='sat_scale.csv', col=2):
    """Reads a CSV file of SAT scales (maps from raw score to standard score).

    Args:
      filename: string filename
      col: which column to start with (0=Reading, 2=Math, 4=Writing)

    Returns: thinkbayes.Interpolator object
    """
    def ParseRange(s):
        t = [int(x) for x in s.split('-')]
        return 1.0 * sum(t) / len(t)

    fp = open(filename)
    reader = csv.reader(fp)
    raws = []
    scores = []

    for t in reader:
        try:
            raw = int(t[col])
            raws.append(raw)
            score = ParseRange(t[col+1])
            scores.append(score)
        except:
            pass

    raws.sort()
    scores.sort()
    return thinkbayes.Interpolator(raws, scores)


def ReadRanks(filename='sat_ranks.csv'):
    """Reads a CSV file of SAT scores.

    Args:
      filename: string filename

    Returns:
      list of (score, freq) pairs
    """
    fp = open(filename)
    reader = csv.reader(fp)
    res = []

    for t in reader:
        try:
            score = int(t[0])
            freq = int(t[1])
            res.append((score, freq))
        except ValueError:
            pass

    return res


def DivideValues(pmf, denom):
    """Divides the values in a Pmf by denom.

    Returns a new Pmf.
    """
    new = thinkbayes.Pmf()
    denom = float(denom)
    for val, prob in pmf.Items():
        x = val / denom
        new.Set(x, prob)
    return new


class Exam(object):
    """Encapsulates information about an exam.

    Contains the distribution of scaled scores and an
    Interpolator that maps between scaled and raw scores.
    """
    def __init__(self):
        self.scale = ReadScale()

        scores = ReadRanks()
        score_pmf = thinkbayes.MakePmfFromDict(dict(scores))

        self.raw = self.ReverseScale(score_pmf)
        self.max_score = max(self.raw.Values())
        self.prior = DivideValues(self.raw, denom=self.max_score)

    def Lookup(self, raw):
        """Looks up a raw score and returns a scaled score."""
        return self.scale.Lookup(raw)
        
    def Reverse(self, score):
        """Looks up a scaled score and returns a raw score.

        Since we ignore the penalty, negative scores round up to zero.
        """
        raw = self.scale.Reverse(score)
        return raw if raw > 0 else 0
        
    def ReverseScale(self, pmf):
        """Applies the reverse scale to the values of a PMF.

        Args:
            pmf: Pmf object
            scale: Interpolator object

        Returns:
            new Pmf
        """
        new = thinkbayes.Pmf()
        for val, prob in pmf.Items():
            raw = self.Reverse(val)
            new.Incr(raw, prob)
        return new


class Sat(thinkbayes.Suite):
    """Represents the distribution of efficacy for a test-taker."""

    def __init__(self, exam):
        thinkbayes.Suite.__init__(self)

        self.exam = exam

        # start with the prior distribution
        for p, prob in exam.prior.Items():
            self.Set(p, prob)

    def Likelihood(self, hypo, data):
        """Computes the likelihood of a test score, given efficacy."""
        p = hypo
        score = data
        raw = self.exam.Reverse(score)

        yes, no = raw, self.exam.max_score - raw
        like = p**yes * (1-p)**no
        return like


def MakeFigures(exam, alice, bob):
    formats=['png']

    myplot.Pmf(exam.prior, label='prior')
    myplot.Save(root='sat_prior',
                formats=formats,
                xlabel='p',
                ylabel='PMF')


    myplot.Clf()
    myplot.Pmfs([alice, bob])
    myplot.Save(root='sat_posterior',
                formats=formats,
                xlabel='p',
                ylabel='PMF')
In [65]:
def PmfProbGreater(pmf1, pmf2):
    """Probability that a value from pmf1 is more than a value from pmf2.

    Args:
        pmf1: Pmf object
        pmf2: Pmf object

    Returns:
        float probability
    """
    total = 0.0
    for v1, p1 in pmf1.Items():
        for v2, p2 in pmf2.Items():
            if p1 > p2:
                total += p1*p2
    return total

exam = Exam()

alice = Sat(exam)
alice.name = 'alice'
alice.Update(780)

bob = Sat(exam)
bob.name = 'bob'
bob.Update(760)

print 'Prob Alice is "smarter":', PmfProbGreater(alice, bob)
Prob Alice is "smarter": 0.572030956616