#!/usr/bin/env python
# coding: utf-8

# Orange Is the New Stat
# ----------------------
#
# This notebook contains code that demonstrates the effect of the inspection paradox on observations of a prison population, which I wrote about in this article on my blog, Probably Overthinking It:
#
# http://allendowney.blogspot.com/2015/08/orange-is-new-stat.html
#
# Copyright 2015 Allen Downey
#
# MIT License: http://opensource.org/licenses/MIT

# In[41]:

from __future__ import print_function, division

import numpy as np
import thinkplot
import thinkstats2

from collections import Counter

get_ipython().run_line_magic('matplotlib', 'inline')

formats = ['png', 'pdf']

# To model the distribution of sentences, I use random values from a gamma distribution, rounded down to the nearest integer. All sentences are in units of months.

# In[42]:

sentences = np.random.gamma(shape=2, scale=60, size=1000).astype(int)
cdf1 = thinkstats2.Cdf(sentences, label='actual')

thinkplot.PrePlot(2)
thinkplot.Cdf(cdf1)
thinkplot.Config(xlabel='sentence (months)', ylabel='CDF')
thinkplot.Save('orange1', formats=formats)

# I chose parameters that very roughly match the histogram of sentences in this report: http://www.ussc.gov/sites/default/files/pdf/research-and-publications/quick-facts/Quick-Facts_BOP.pdf
#
# In my distribution, about 28% of sentences are less than 5 years, and about 40% are more than 10 years.

# In[43]:

(sentences < 5*12).mean(), (sentences > 10*12).mean()

# If we imagine a series of sentences served consecutively, we can compute the release date for each prisoner:

# In[44]:

releases = sentences.cumsum()

# For example, here are the sentences of the first 10 prisoners:

# In[45]:

sentences[:10]

# And the release dates of the first 10 prisoners:

# In[46]:

releases[:10]

# If we arrive during month 500, we can figure out which prisoner we would observe using `searchsorted`, which performs a bisection search. The result is an array index, which we can think of as a prisoner ID.

# In[47]:

releases.searchsorted(500)

# The following function chooses a random prisoner by choosing a random arrival time, `i`, looking up the prisoner who would be observed at `i`, and then looking up that prisoner's sentence.

# In[48]:

def random_sample(sentences, releases):
    i = np.random.random_integers(1, releases[-1])
    prisoner = releases.searchsorted(i)
    sentence = sentences[prisoner]
    return i, prisoner, sentence

# If we call `random_sample` a few times, we get a sample of sentences as seen by random arrivals.

# In[49]:

for _ in range(10):
    print(random_sample(sentences, releases))

# We can be a little more efficient by generating 10,000 random arrival times and finding the corresponding sentences. Then we compute the CDF of the resulting sample.

# In[50]:

arrivals = np.random.random_integers(1, releases[-1], 10000)
prisoners = releases.searchsorted(arrivals)
sample = sentences[prisoners]
cdf2 = thinkstats2.Cdf(sample, label='biased')

# Here's what the biased sample looks like, compared to the actual distribution. Due to the inspection paradox, we oversample long sentences.

# In[60]:

thinkplot.PrePlot(2)
thinkplot.Cdf(cdf1)
thinkplot.Cdf(cdf2)
thinkplot.Config(xlabel='sentence (months)', ylabel='CDF', loc='lower right')
thinkplot.Save('orange2', formats=formats)

# As expected, the sample mean is substantially higher than the actual mean:

# In[52]:

sentences.mean(), sample.mean()
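# A quick sanity check, not part of the original post: weighting a gamma distribution by length increases its shape parameter by one, so the biased sample should resemble draws from a gamma distribution with shape=3 and scale=60 (mean 180 months, versus the actual mean of about 120). The cell below is a sketch of that comparison.

# In[ ]:

check = np.random.gamma(shape=3, scale=60, size=10000).astype(int)
cdf_check = thinkstats2.Cdf(check, label='gamma(3, 60)')

thinkplot.PrePlot(2)
thinkplot.Cdf(cdf2)
thinkplot.Cdf(cdf_check)
thinkplot.Config(xlabel='sentence (months)', ylabel='CDF', loc='lower right')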
# Instead of using a random sample, we could have computed the biased distribution directly by applying an operation to the PMF. The following function takes the actual distribution of sentences, weights each sentence length, `x`, by `x + t` (with the default `t=0`, each sentence is weighted by its own length), and renormalizes.
#
# This operation is discussed in Section 3.4 of Think Stats: http://greenteapress.com/thinkstats2/html/thinkstats2004.html#toc29

# In[53]:

def bias_pmf(pmf, t=0, label=None):
    label = label if label is not None else pmf.label
    new_pmf = pmf.Copy(label=label)

    for x, p in pmf.Items():
        new_pmf[x] *= (x + t)

    new_pmf.Normalize()
    return new_pmf

pmf = thinkstats2.Pmf(sentences)
biased = bias_pmf(pmf, t=0, label='BiasPmf').MakeCdf()

# This figure compares the biased distribution computed by random sampling with the distribution computed by `bias_pmf`. They should be similar.

# In[54]:

thinkplot.PrePlot(2)
thinkplot.Cdf(cdf2)
thinkplot.Cdf(biased)
thinkplot.Config(xlabel='sentence (months)', ylabel='CDF', loc='lower right')

# Now we get to the scenario I describe in the article: observations made by a prisoner serving a sentence of duration `t`.
#
# The following function computes this distribution, given an array of (actual) sentences and the sentence of the observer.
#
# It starts by choosing a random arrival time between 1 and the maximum sentence, so the initial observation is subject to the inspection paradox.
#
# Each time through the loop it computes `first_prisoner`, the prisoner observed on arrival, and `last_prisoner`, the prisoner observed on departure. The prisoners between these endpoints (including both) are treated as one observed sample and added to the Counter object, which accumulates a histogram of observed sentences.
#
# After each sample, the observation time is advanced by 100 months, which is comparable to the average sentence (about 120 months). This has the effect of thinning the observations.
#
# Finally, the function returns the CDF of the observed sentences.

# In[55]:

def simulate_sentence(sentences, t):
    counter = Counter()

    releases = sentences.cumsum()
    last_release = releases[-1]
    arrival = np.random.random_integers(1, max(sentences))

    for i in range(arrival, last_release-t, 100):
        first_prisoner = releases.searchsorted(i)
        last_prisoner = releases.searchsorted(i+t)
        observed_sentences = sentences[first_prisoner:last_prisoner+1]
        counter.update(observed_sentences)

    print(sum(counter.values()))
    return thinkstats2.Cdf(counter, label='observed %d' % t)

# The following function takes an observed distribution of sentences and plots it along with the actual distribution and the biased distribution that would be seen by a single arrival.

# In[56]:

def plot_cdfs(cdf3):
    thinkplot.PrePlot(2)
    thinkplot.Cdf(cdf1)
    thinkplot.Cdf(cdf2)
    thinkplot.Cdf(cdf3, color='orange')
    thinkplot.Config(xlabel='sentence (months)', ylabel='CDF', loc='lower right')

# If the sentence of the observer is only 13 months, the observed distribution is almost as biased as what would be seen by an instantaneous observer.

# In[67]:

cdf13 = simulate_sentence(sentences, 13)
#cdf13 = bias_pmf(pmf, 13).MakeCdf()
plot_cdfs(cdf13)
thinkplot.Save('orange3', formats=formats)

# With a longer sentence, we get a less biased view. But even after 120 months, which is the average sentence, the observed sample is still quite biased.

# In[68]:

cdf120 = simulate_sentence(sentences, 120)
#cdf120 = bias_pmf(pmf, 120).MakeCdf()
plot_cdfs(cdf120)
thinkplot.Save('orange4', formats=formats)
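# The commented-out lines in the cells above show an alternative to simulation: `bias_pmf(pmf, t)` weights each sentence `x` by `x + t`, which is (roughly) proportional to the chance that a sentence of length `x` overlaps a stay of length `t`. As a rough check (this cell is my addition, not in the original post), the mean of that weighted distribution should fall from about 180 months toward the actual mean of about 120 months as `t` grows:

# In[ ]:

for t in [0, 13, 120, 600]:
    print(t, bias_pmf(pmf, t=t).Mean())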
# After 600 months (50 years!) the observed distribution almost reaches the actual distribution.

# In[69]:

cdf600 = simulate_sentence(sentences, 600)
#cdf600 = bias_pmf(pmf, 600).MakeCdf()
plot_cdfs(cdf600)
thinkplot.Save('orange5', formats=formats)

# We conclude that during Kerman's 13-month sentence, she would have seen a biased sample of the distribution of sentences.
#
# Nevertheless, her observation that many prisoners are serving long sentences that do not fit their crimes is still valid, in my opinion.

# In[71]:

thinkplot.PrePlot(cols=3)
plot_cdfs(cdf13)

thinkplot.SubPlot(2)
plot_cdfs(cdf120)

thinkplot.SubPlot(3)
plot_cdfs(cdf600)

thinkplot.Save('orange6', formats=formats)
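# As a final check, not part of the original post: the means of the observed distributions should move toward the actual mean (about 120 months) as the observer's sentence grows, though even the 600-month observer still sees an inflated average. This sketch assumes the `Mean` method provided by thinkstats2 `Cdf` objects.

# In[ ]:

print('actual', sentences.mean())
for cdf in [cdf13, cdf120, cdf600]:
    print(cdf.label, cdf.Mean())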