#!/usr/bin/env python
# coding: utf-8
# # The Inspection Paradox is Everywhere
#
# Allen Downey 2019
#
# [MIT License](https://en.wikipedia.org/wiki/MIT_License)
# In[2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from empiricaldist import Pmf
from utils import decorate
# In[3]:
# set the random seed so we get the same results every time
np.random.seed(17)
# In[4]:
# make the directory for the figures
import os
if not os.path.exists('inspection'):
get_ipython().system('mkdir inspection')
# ## Class size
#
# Here's the data summarizing the distribution of undergraduate class sizes at Purdue University in 2013-14.
# In[38]:
# Class size data originally from
# https://www.purdue.edu/datadigest/2013-14/InstrStuLIfe/DistUGClasses.html
# now available from
# https://web.archive.org/web/20160415011613/https://www.purdue.edu/datadigest/2013-14/InstrStuLIfe/DistUGClasses.html
sizes = [(1, 1),
(2, 9),
(10, 19),
(20, 29),
(30, 39),
(40, 49),
(50, 99),
(100, 300)]
counts = [138, 635, 1788, 1979, 796, 354, 487, 333]
# I generate a sample from this distribution, assuming a uniform distribution in each range and an upper bound of 300.
# In[39]:
def generate_sample(sizes, counts):
"""Generate a sample from a distribution.
sizes: sequence of (low, high) pairs
counts: sequence of integers
returns: NumPy array
"""
t = []
for (low, high), count in zip(sizes, counts):
print(count, low, high)
sample = np.random.randint(low, high+1, count)
t.extend(sample)
return np.array(t)
# The "unbiased" sample is as seen by the college, with each class equally likely to be in the sample.
# In[40]:
unbiased = generate_sample(sizes, counts)
# To generate a biased sample, we use the values themselves as weights and resample with replacement.
# In[41]:
def resample_weighted(sample, weights):
"""Resample values from `sample` with the given weights.
sample: NumPy array
weights: NumPy array
returns: NumPy array
"""
n = len(sample)
p = weights / np.sum(weights)
return np.random.choice(sample, n, p=p)
# In[42]:
biased = resample_weighted(unbiased, unbiased)
# To plot the distribution, I use KDE to estimate the density function, then evaluate it over the given sequence of `xs`.
# In[43]:
from scipy.stats import gaussian_kde
def kdeplot(sample, xs, label=None, **options):
"""Use KDE to plot the density function.
sample: NumPy array
xs: NumPy array
label: string
"""
density = gaussian_kde(sample, **options).evaluate(xs)
plt.plot(xs, density, label=label)
decorate(ylabel='Relative likelihood')
# The following plot shows the distribution of class size as seen by the Dean, and as seen by a sample of students.
# In[44]:
xs = np.arange(1, 300)
kdeplot(unbiased, xs, 'Reported by the Dean')
kdeplot(biased, xs, 'Reported by students')
decorate(xlabel='Class size',
title='Distribution of class sizes')
plt.savefig('inspection/class_size.png', dpi=150)
# Here are the means of the unbiased and biased distributions.
# In[45]:
np.mean(unbiased)
# In[46]:
np.mean(biased)
# In[47]:
from empiricaldist import Cdf
def cdfplot(sample, xs, label=None, **options):
"""Plot the CDF of the sample.
sample: NumPy array
xs: NumPy array (ignored)
label: string
"""
cdf = Cdf.from_seq(sample, **options)
cdf.plot(label=label)
decorate(ylabel='CDF')
# In[48]:
xs = np.arange(1, 300)
cdfplot(unbiased, xs, 'Reported by the Dean')
cdfplot(biased, xs, 'Reported by students')
decorate(xlabel='Class size',
title='Distribution of class sizes')
plt.savefig('inspection/class_size.png', dpi=150)
# ## Red Line
#
# Here are times between trains in seconds.
# In[49]:
unbiased = [
428.0, 705.0, 407.0, 465.0, 433.0, 425.0, 204.0, 506.0, 143.0, 351.0,
450.0, 598.0, 464.0, 749.0, 341.0, 586.0, 754.0, 256.0, 378.0, 435.0,
176.0, 405.0, 360.0, 519.0, 648.0, 374.0, 483.0, 537.0, 578.0, 534.0,
577.0, 619.0, 538.0, 331.0, 186.0, 629.0, 193.0, 360.0, 660.0, 484.0,
512.0, 315.0, 457.0, 404.0, 740.0, 388.0, 357.0, 485.0, 567.0, 160.0,
428.0, 387.0, 901.0, 187.0, 622.0, 616.0, 585.0, 474.0, 442.0, 499.0,
437.0, 620.0, 351.0, 286.0, 373.0, 232.0, 393.0, 745.0, 636.0, 758.0,
]
# Here's the same data in minutes.
# In[50]:
unbiased = np.array(unbiased) / 60
# We can use the same function to generate a biased sample.
# In[51]:
biased = resample_weighted(unbiased, unbiased)
# And plot the results.
# In[52]:
xs = np.linspace(1, 16.5, 101)
kdeplot(unbiased, xs, 'Seen by MBTA')
kdeplot(biased, xs, 'Seen by passengers')
decorate(xlabel='Time between trains (min)',
title='Distribution of time between trains')
plt.savefig('inspection/red_line.png', dpi=150)
# In[53]:
xs = np.linspace(1, 16.5, 101)
cdfplot(unbiased, xs, 'Seen by MBTA')
cdfplot(biased, xs, 'Seen by passengers')
decorate(xlabel='Time between trains (min)',
title='Distribution of time between trains')
plt.savefig('inspection/red_line.png', dpi=150)
# Here are the means of the distributions and the percentage difference.
# In[22]:
np.mean(biased), np.mean(unbiased)
# In[23]:
(np.mean(biased) - np.mean(unbiased)) / np.mean(unbiased) * 100
# ## Social network
#
# The following function reads the Facebook data.
# In[55]:
import networkx as nx
def read_graph(filename):
"""Read a graph from a file.
filename: string
return: nx.Graph
"""
G = nx.Graph()
array = np.loadtxt(filename, dtype=int)
G.add_edges_from(array)
return G
# In[56]:
# https://snap.stanford.edu/data/facebook_combined.txt.gz
fb = read_graph('facebook_combined.txt.gz')
n = len(fb)
m = len(fb.edges())
n, m
# The unbiased sample is the number of friends for each user.
# In[57]:
unbiased = [fb.degree(node) for node in fb]
len(unbiased)
# In[58]:
np.max(unbiased)
# We can use the same function to generate a biased sample.
# In[59]:
biased = resample_weighted(unbiased, unbiased)
# And generate the plot.
# In[60]:
xs = np.linspace(0, 300, 101)
kdeplot(unbiased, xs, 'Random sample of people')
kdeplot(biased, xs, 'Random sample of friends')
decorate(xlabel='Number of friends in social network',
title='Distribution of social network size')
plt.savefig('inspection/social.png', dpi=150)
# In[65]:
xs = np.linspace(0, 300, 101)
cdfplot(unbiased, xs, 'Random sample of people')
cdfplot(biased, xs, 'Random sample of friends')
decorate(xlabel='Number of friends in social network',
title='Distribution of social network size',
xlim=[-10, 310])
plt.savefig('inspection/social.png', dpi=150)
# Here are the means of the distributions.
# In[66]:
np.mean(biased), np.mean(unbiased)
# And the probability that the friend of a user has more friends than the user.
# In[67]:
np.mean(biased > unbiased)
# ## Relay race
#
# The following function read the data from the 2010 James Joyce Ramble 10K, where I ran my personal record time.
# In[68]:
import relay
results = relay.ReadResults()
unbiased = relay.GetSpeeds(results)
# In this case, the weights are related to the difference between each element of the sample and the hypothetical speed of the observer.
# In[69]:
weights = np.abs(np.array(unbiased) - 7)
biased = resample_weighted(unbiased, weights)
# And here's the plot.
# In[70]:
xs = np.linspace(3, 11, 101)
kdeplot(unbiased, xs, 'Seen by spectator')
kdeplot(biased, xs, 'Seen by runner at 7 mph', bw_method=0.2)
decorate(xlabel='Running speed (mph)',
title='Distribution of running speed')
plt.savefig('inspection/relay.png', dpi=150)
# In[72]:
xs = np.linspace(3, 11, 101)
cdfplot(unbiased, xs, 'Seen by spectator')
cdfplot(biased, xs, 'Seen by runner at 7 mph')
decorate(xlabel='Running speed (mph)',
title='Distribution of running speed')
plt.savefig('inspection/relay.png', dpi=150)
# ## Prison sentences
#
# First we read the [data from the Bureau of Prisons web page](https://www.bop.gov/about/statistics/statistics_inmate_sentences.jsp).
# In[73]:
tables = pd.read_html('BOP Statistics_ Sentences Imposed.html')
df = tables[0]
df
# Here are the low and I sentences for each range. I assume that the minimum sentence is about a week, that sentences "less than life" are 40 years, and that a life sentence is between 40 and 60 years.
# In[74]:
sentences = [(0.02, 1),
(1, 3),
(3, 5),
(5, 10),
(10, 15),
(15, 20),
(20, 40),
(40, 60)]
# We can get the counts from the table.
# In[75]:
counts = df['# of Inmates']
# Here's a different version of `generate_sample` for a continuous quantity.
# In[76]:
def generate_sample(sizes, counts):
"""Generate a sample from a distribution.
sizes: sequence of (low, high) pairs
counts: sequence of integers
returns: NumPy array
"""
t = []
for (low, high), count in zip(sizes, counts):
print(count, low, high)
sample = np.random.uniform(low, high, count)
t.extend(sample)
return np.array(t)
# In this case, the data are biased.
# In[77]:
biased = generate_sample(sentences, counts)
# So we have to unbias them with weights inversely proportional to the values.
#
# Prisoners in federal prison typically serve 85% of their nominal sentence. We can take that into account in the weights.
# In[78]:
weights = 1 / (0.85 * np.array(biased))
# Here's the unbiased sample.
# In[79]:
unbiased = resample_weighted(biased, weights)
# And the plotted distributions.
# In[80]:
xs = np.linspace(0, 60, 101)
kdeplot(unbiased, xs, 'Seen by judge', bw_method=0.5)
kdeplot(biased, xs, 'Seen by prison visitor', bw_method=0.5)
decorate(xlabel='Prison sentence (years)',
title='Distribution of federal prison sentences')
plt.savefig('inspection/orange.png', dpi=150)
# In[82]:
xs = np.linspace(0, 60, 101)
cdfplot(unbiased, xs, 'Seen by judge')
cdfplot(biased, xs, 'Seen by prison visitor')
decorate(xlabel='Prison sentence (years)',
title='Distribution of federal prison sentences')
plt.savefig('inspection/orange.png', dpi=150)
# We can also compute the distribution of sentences as seen by someone at the prison for 13 months.
# In[84]:
x = 0.85 * unbiased
y = 13 / 12
weights = x + y
# Here's the sample.
# In[85]:
kerman = resample_weighted(unbiased, weights)
# And here's what it looks like.
# In[86]:
xs = np.linspace(0, 60, 101)
kdeplot(unbiased, xs, 'Seen by judge', bw_method=0.5)
kdeplot(kerman, xs, 'Seen by Kerman', bw_method=0.5)
kdeplot(biased, xs, 'Seen by visitor', bw_method=0.5)
decorate(xlabel='Prison sentence (years)',
title='Distribution of federal prison sentences')
plt.savefig('inspection/orange.png', dpi=150)
# In[87]:
xs = np.linspace(0, 60, 101)
cdfplot(unbiased, xs, 'Seen by judge')
cdfplot(kerman, xs, 'Seen by Kerman')
cdfplot(biased, xs, 'Seen by visitor')
decorate(xlabel='Prison sentence (years)',
title='Distribution of federal prison sentences')
plt.savefig('inspection/orange.png', dpi=150)
# In the unbiased distribution, almost half of prisoners serve less than one year.
# In[88]:
np.mean(unbiased<1)
# But if we sample the prison population, barely 3% are short timers.
# In[89]:
np.mean(biased<1)
# Here are the means of the distributions.
# In[90]:
np.mean(unbiased)
# In[91]:
np.mean(biased)
# In[92]:
np.mean(kerman)
# ## The dartboard problem
# In[ ]:
from matplotlib.patches import Circle
def draw_dartboard():
ax = plt.gca()
c1 = Circle((0, 0), 170, color='C3', alpha=0.3)
c2 = Circle((0, 0), 160, color='white')
c3 = Circle((0, 0), 107, color='C3', alpha=0.3)
c4 = Circle((0, 0), 97, color='white')
c5 = Circle((0, 0), 16, color='C3', alpha=0.3)
c6 = Circle((0, 0), 6, color='white')
for circle in [c1, c2, c3, c4, c5, c6]:
ax.add_patch(circle)
plt.axis('equal')
# In[279]:
draw_dartboard()
plt.text(0, 10, '25 ring')
plt.text(0, 110, 'triple ring')
plt.text(0, 170, 'double ring')
plt.savefig('inspection/darts0.png', dpi=150)
# In[280]:
sigma = 50
n = 100
error_x = np.random.normal(0, sigma, size=(n))
error_y = np.random.normal(0, sigma, size=(n))
# In[281]:
draw_dartboard()
plt.plot(error_x, error_y, '.')
plt.savefig('inspection/darts1.png', dpi=150)
# In[286]:
sigma = 50
n = 10000
error_x = np.random.normal(0, sigma, size=(n))
error_y = np.random.normal(0, sigma, size=(n))
# In[310]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as pl
ax = sns.kdeplot(error_x, error_y, shade=True, cmap="PuBu")
ax.collections[0].set_alpha(0)
plt.axis([-240, 240, -175, 175])
decorate(xlabel='x distance from center (mm)',
ylabel='y distance from center (mm)',
title='Estimated density')
plt.savefig('inspection/darts2.png', dpi=150)
# In[265]:
rs = np.hypot(error_x, error_y)
# In[308]:
np.random.seed(18)
sigma = 50
n = 10000
error_x = np.random.normal(0, sigma, size=(n))
error_y = np.random.normal(0, sigma, size=(n))
# In[309]:
xs = np.linspace(-200, 200, 101)
#ys = np.exp(-(xs/sigma)**2/2)
#pmf = Pmf(ys, index=xs)
#pmf.normalize()
#pmf.plot(color='gray')
unbiased = error_x
biased = resample_weighted(unbiased, np.abs(unbiased))
kdeplot(unbiased, xs, 'Density at a point')
kdeplot(biased, xs, 'Total density in a ring')
#kdeplot(rs, xs, 'Total density in a ring')
decorate(xlabel='Distance from center (mm)',
ylabel='Density',
xlim=[0, 210])
plt.savefig('inspection/darts3.png', dpi=150)
# In[267]:
xs = np.linspace(0, 200, 101)
unbiased = np.abs(error_x)
biased = resample_weighted(unbiased, unbiased)
cdfplot(unbiased, xs, 'Density at a point')
cdfplot(biased, xs, 'Total density in a ring')
decorate(xlabel='Distance from center (mm)',
ylabel='Density')
plt.savefig('inspection/darts4.png', dpi=150)
# In[260]:
triple = (biased > 97) & (biased < 107)
triple.mean() * 100
# In[261]:
ring50 = (biased > 6) & (biased < 16)
ring50.mean() * 100
# In[262]:
double = (biased > 160) & (biased < 170)
double.mean() * 100
# In[263]:
bull = (biased < 6)
bull.mean() * 100
# In[ ]: