The Inspection Paradox is Everywhere

Allen Downey 2019

MIT License

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from empiricaldist import Pmf

from utils import decorate
In [3]:
# set the random seed so we get the same results every time
np.random.seed(17)
In [4]:
# make the directory for the figures
import os

if not os.path.exists('inspection'):
    !mkdir inspection

Class size

Here's the data summarizing the distribution of undergraduate class sizes at Purdue University in 2013-14.

In [38]:
# Class size data originally from
# https://www.purdue.edu/datadigest/2013-14/InstrStuLIfe/DistUGClasses.html
# now available from
# https://web.archive.org/web/20160415011613/https://www.purdue.edu/datadigest/2013-14/InstrStuLIfe/DistUGClasses.html

sizes = [(1, 1), 
         (2, 9),
         (10, 19),
         (20, 29),
         (30, 39),
         (40, 49),
         (50, 99),
         (100, 300)]
         
counts = [138, 635, 1788, 1979, 796, 354, 487, 333]

I generate a sample from this distribution, assuming a uniform distribution in each range and an upper bound of 300.

In [39]:
def generate_sample(sizes, counts):
    """Generate a sample from a distribution.
    
    sizes: sequence of (low, high) pairs
    counts: sequence of integers
    
    returns: NumPy array
    """
    t = []
    for (low, high), count in zip(sizes, counts):
        print(count, low, high)
        sample = np.random.randint(low, high+1, count)
        t.extend(sample)
    return np.array(t)

The "unbiased" sample is as seen by the college, with each class equally likely to be in the sample.

In [40]:
unbiased = generate_sample(sizes, counts)
138 1 1
635 2 9
1788 10 19
1979 20 29
796 30 39
354 40 49
487 50 99
333 100 300

To generate a biased sample, we use the values themselves as weights and resample with replacement.

In [41]:
def resample_weighted(sample, weights):
    """Resample values from `sample` with the given weights. 
    
    sample: NumPy array
    weights: NumPy array
    
    returns: NumPy array
    """
    n = len(sample)
    p = weights / np.sum(weights)
    return np.random.choice(sample, n, p=p)
In [42]:
biased = resample_weighted(unbiased, unbiased)

To plot the distribution, I use KDE to estimate the density function, then evaluate it over the given sequence of xs.

In [43]:
from scipy.stats import gaussian_kde

def kdeplot(sample, xs, label=None, **options):
    """Use KDE to plot the density function.
    
    sample: NumPy array
    xs: NumPy array
    label: string
    """
    density = gaussian_kde(sample, **options).evaluate(xs)
    plt.plot(xs, density, label=label)
    decorate(ylabel='Relative likelihood')

The following plot shows the distribution of class size as seen by the Dean, and as seen by a sample of students.

In [44]:
xs = np.arange(1, 300)
kdeplot(unbiased, xs, 'Reported by the Dean')
kdeplot(biased, xs, 'Reported by students')

decorate(xlabel='Class size',
         title='Distribution of class sizes')

plt.savefig('inspection/class_size.png', dpi=150)

Here are the means of the unbiased and biased distributions.

In [45]:
np.mean(unbiased)
Out[45]:
34.423041474654376
In [46]:
np.mean(biased)
Out[46]:
89.19170506912442
In [47]:
from empiricaldist import Cdf

def cdfplot(sample, xs, label=None, **options):
    """Plot the CDF of the sample.
    
    sample: NumPy array
    xs: NumPy array (ignored)
    label: string
    """
    cdf = Cdf.from_seq(sample, **options)
    cdf.plot(label=label)
    decorate(ylabel='CDF')
In [48]:
xs = np.arange(1, 300)
cdfplot(unbiased, xs, 'Reported by the Dean')
cdfplot(biased, xs, 'Reported by students')

decorate(xlabel='Class size',
         title='Distribution of class sizes')

plt.savefig('inspection/class_size.png', dpi=150)

Red Line

Here are times between trains in seconds.

In [49]:
unbiased = [
    428.0, 705.0, 407.0, 465.0, 433.0, 425.0, 204.0, 506.0, 143.0, 351.0, 
    450.0, 598.0, 464.0, 749.0, 341.0, 586.0, 754.0, 256.0, 378.0, 435.0, 
    176.0, 405.0, 360.0, 519.0, 648.0, 374.0, 483.0, 537.0, 578.0, 534.0, 
    577.0, 619.0, 538.0, 331.0, 186.0, 629.0, 193.0, 360.0, 660.0, 484.0, 
    512.0, 315.0, 457.0, 404.0, 740.0, 388.0, 357.0, 485.0, 567.0, 160.0, 
    428.0, 387.0, 901.0, 187.0, 622.0, 616.0, 585.0, 474.0, 442.0, 499.0, 
    437.0, 620.0, 351.0, 286.0, 373.0, 232.0, 393.0, 745.0, 636.0, 758.0,
]

Here's the same data in minutes.

In [50]:
unbiased = np.array(unbiased) / 60

We can use the same function to generate a biased sample.

In [51]:
biased = resample_weighted(unbiased, unbiased)

And plot the results.

In [52]:
xs = np.linspace(1, 16.5, 101)
kdeplot(unbiased, xs, 'Seen by MBTA')
kdeplot(biased, xs, 'Seen by passengers')

decorate(xlabel='Time between trains (min)',
         title='Distribution of time between trains')

plt.savefig('inspection/red_line.png', dpi=150)
In [53]:
xs = np.linspace(1, 16.5, 101)
cdfplot(unbiased, xs, 'Seen by MBTA')
cdfplot(biased, xs, 'Seen by passengers')

decorate(xlabel='Time between trains (min)',
         title='Distribution of time between trains')

plt.savefig('inspection/red_line.png', dpi=150)

Here are the means of the distributions and the percentage difference.

In [22]:
np.mean(biased), np.mean(unbiased)
Out[22]:
(9.207857142857144, 7.7680952380952375)
In [23]:
(np.mean(biased) - np.mean(unbiased)) /  np.mean(unbiased) * 100
Out[23]:
18.53429779930119

Social network

The following function reads the Facebook data.

In [55]:
import networkx as nx

def read_graph(filename):
    """Read a graph from a file.
    
    filename: string
    
    return: nx.Graph
    """
    G = nx.Graph()
    array = np.loadtxt(filename, dtype=int)
    G.add_edges_from(array)
    return G
In [56]:
# https://snap.stanford.edu/data/facebook_combined.txt.gz

fb = read_graph('facebook_combined.txt.gz')
n = len(fb)
m = len(fb.edges())
n, m
Out[56]:
(4039, 88234)

The unbiased sample is the number of friends for each user.

In [57]:
unbiased = [fb.degree(node) for node in fb]
len(unbiased)
Out[57]:
4039
In [58]:
np.max(unbiased)
Out[58]:
1045

We can use the same function to generate a biased sample.

In [59]:
biased = resample_weighted(unbiased, unbiased)

And generate the plot.

In [60]:
xs = np.linspace(0, 300, 101)
kdeplot(unbiased, xs, 'Random sample of people')
kdeplot(biased, xs, 'Random sample of friends')

decorate(xlabel='Number of friends in social network',
         title='Distribution of social network size')

plt.savefig('inspection/social.png', dpi=150)
In [65]:
xs = np.linspace(0, 300, 101)
cdfplot(unbiased, xs, 'Random sample of people')
cdfplot(biased, xs, 'Random sample of friends')

decorate(xlabel='Number of friends in social network',
         title='Distribution of social network size',
         xlim=[-10, 310])

plt.savefig('inspection/social.png', dpi=150)

Here are the means of the distributions.

In [66]:
np.mean(biased), np.mean(unbiased)
Out[66]:
(108.5679623669225, 43.69101262688784)

And the probability that the friend of a user has more friends than the user.

In [67]:
np.mean(biased > unbiased)
Out[67]:
0.7655360237682595

Relay race

The following function read the data from the 2010 James Joyce Ramble 10K, where I ran my personal record time.

In [68]:
import relay
results = relay.ReadResults()
unbiased = relay.GetSpeeds(results)

In this case, the weights are related to the difference between each element of the sample and the hypothetical speed of the observer.

In [69]:
weights = np.abs(np.array(unbiased) - 7)
biased = resample_weighted(unbiased, weights)

And here's the plot.

In [70]:
xs = np.linspace(3, 11, 101)
kdeplot(unbiased, xs, 'Seen by spectator')
kdeplot(biased, xs, 'Seen by runner at 7 mph', bw_method=0.2)

decorate(xlabel='Running speed (mph)',
         title='Distribution of running speed')

plt.savefig('inspection/relay.png', dpi=150)
In [72]:
xs = np.linspace(3, 11, 101)
cdfplot(unbiased, xs, 'Seen by spectator')
cdfplot(biased, xs, 'Seen by runner at 7 mph')

decorate(xlabel='Running speed (mph)',
         title='Distribution of running speed')

plt.savefig('inspection/relay.png', dpi=150)

Prison sentences

First we read the data from the Bureau of Prisons web page.

In [73]:
tables = pd.read_html('BOP Statistics_ Sentences Imposed.html')
df = tables[0]
df
Out[73]:
Sentence # of Inmates % of Inmates
0 0 to 1 year* 5155 2.3 %
1 > 1 year to < 3 years** 18619 11.3%
2 3 years to < 5 years 17897 10.9%
3 5 years to < 10 years 41887 25.4%
4 10 years to < 15 years 34995 21.3%
5 15 years to < 20 years 18674 11.3%
6 20 years or more but < Life 22738 13.8%
7 Life 4600 2.8%

Here are the low and I sentences for each range. I assume that the minimum sentence is about a week, that sentences "less than life" are 40 years, and that a life sentence is between 40 and 60 years.

In [74]:
sentences = [(0.02, 1),
             (1, 3),
             (3, 5),
             (5, 10),
             (10, 15),
             (15, 20),
             (20, 40),
             (40, 60)]

We can get the counts from the table.

In [75]:
counts = df['# of Inmates']

Here's a different version of generate_sample for a continuous quantity.

In [76]:
def generate_sample(sizes, counts):
    """Generate a sample from a distribution.
    
    sizes: sequence of (low, high) pairs
    counts: sequence of integers
    
    returns: NumPy array
    """
    t = []
    for (low, high), count in zip(sizes, counts):
        print(count, low, high)
        sample = np.random.uniform(low, high, count)
        t.extend(sample)
    return np.array(t)

In this case, the data are biased.

In [77]:
biased = generate_sample(sentences, counts)
5155 0.02 1
18619 1 3
17897 3 5
41887 5 10
34995 10 15
18674 15 20
22738 20 40
4600 40 60

So we have to unbias them with weights inversely proportional to the values.

Prisoners in federal prison typically serve 85% of their nominal sentence. We can take that into account in the weights.

In [78]:
weights = 1 / (0.85 * np.array(biased))

Here's the unbiased sample.

In [79]:
unbiased = resample_weighted(biased, weights)

And the plotted distributions.

In [80]:
xs = np.linspace(0, 60, 101)
kdeplot(unbiased, xs, 'Seen by judge', bw_method=0.5)
kdeplot(biased, xs, 'Seen by prison visitor', bw_method=0.5)

decorate(xlabel='Prison sentence (years)',
         title='Distribution of federal prison sentences')

plt.savefig('inspection/orange.png', dpi=150)
In [82]:
xs = np.linspace(0, 60, 101)
cdfplot(unbiased, xs, 'Seen by judge')
cdfplot(biased, xs, 'Seen by prison visitor')

decorate(xlabel='Prison sentence (years)',
         title='Distribution of federal prison sentences')

plt.savefig('inspection/orange.png', dpi=150)

We can also compute the distribution of sentences as seen by someone at the prison for 13 months.

In [84]:
x = 0.85 * unbiased
y = 13 / 12

weights = x + y

Here's the sample.

In [85]:
kerman = resample_weighted(unbiased, weights)

And here's what it looks like.

In [86]:
xs = np.linspace(0, 60, 101)
kdeplot(unbiased, xs, 'Seen by judge', bw_method=0.5)
kdeplot(kerman, xs, 'Seen by Kerman', bw_method=0.5)
kdeplot(biased, xs, 'Seen by visitor', bw_method=0.5)

decorate(xlabel='Prison sentence (years)',
         title='Distribution of federal prison sentences')

plt.savefig('inspection/orange.png', dpi=150)
In [87]:
xs = np.linspace(0, 60, 101)
cdfplot(unbiased, xs, 'Seen by judge')
cdfplot(kerman, xs, 'Seen by Kerman')
cdfplot(biased, xs, 'Seen by visitor')

decorate(xlabel='Prison sentence (years)',
         title='Distribution of federal prison sentences')

plt.savefig('inspection/orange.png', dpi=150)

In the unbiased distribution, almost half of prisoners serve less than one year.

In [88]:
np.mean(unbiased<1)
Out[88]:
0.44996809771215024

But if we sample the prison population, barely 3% are short timers.

In [89]:
np.mean(biased<1)
Out[89]:
0.0313250083553611

Here are the means of the distributions.

In [90]:
np.mean(unbiased)
Out[90]:
3.5762182609236373
In [91]:
np.mean(biased)
Out[91]:
12.778002004195422
In [92]:
np.mean(kerman)
Out[92]:
10.455078798407829

The dartboard problem

In [ ]:
from matplotlib.patches import Circle

def draw_dartboard():
    ax = plt.gca()

    c1 = Circle((0, 0), 170, color='C3', alpha=0.3)
    c2 = Circle((0, 0), 160, color='white')
    c3 = Circle((0, 0), 107, color='C3', alpha=0.3)
    c4 = Circle((0, 0), 97, color='white')
    c5 = Circle((0, 0), 16, color='C3', alpha=0.3)
    c6 = Circle((0, 0), 6, color='white')
    for circle in [c1, c2, c3, c4, c5, c6]:
        ax.add_patch(circle)
    plt.axis('equal')
    
In [279]:
draw_dartboard()
plt.text(0, 10, '25 ring')
plt.text(0, 110, 'triple ring')
plt.text(0, 170, 'double ring')

plt.savefig('inspection/darts0.png', dpi=150)
In [280]:
sigma = 50
n = 100
error_x = np.random.normal(0, sigma, size=(n))
error_y = np.random.normal(0, sigma, size=(n))
In [281]:
draw_dartboard()
plt.plot(error_x, error_y, '.')
plt.savefig('inspection/darts1.png', dpi=150)
In [286]:
sigma = 50
n = 10000
error_x = np.random.normal(0, sigma, size=(n))
error_y = np.random.normal(0, sigma, size=(n))
In [310]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as pl

ax = sns.kdeplot(error_x, error_y, shade=True, cmap="PuBu")
ax.collections[0].set_alpha(0)
plt.axis([-240, 240, -175, 175])

decorate(xlabel='x distance from center (mm)',
         ylabel='y distance from center (mm)',
         title='Estimated density')


plt.savefig('inspection/darts2.png', dpi=150)
In [265]:
rs = np.hypot(error_x, error_y)
In [308]:
np.random.seed(18)
sigma = 50
n = 10000
error_x = np.random.normal(0, sigma, size=(n))
error_y = np.random.normal(0, sigma, size=(n))
In [309]:
xs = np.linspace(-200, 200, 101)

#ys = np.exp(-(xs/sigma)**2/2)
#pmf = Pmf(ys, index=xs)
#pmf.normalize()
#pmf.plot(color='gray')

unbiased = error_x
biased = resample_weighted(unbiased, np.abs(unbiased))

kdeplot(unbiased, xs, 'Density at a point')
kdeplot(biased, xs, 'Total density in a ring')
#kdeplot(rs, xs, 'Total density in a ring')

decorate(xlabel='Distance from center (mm)',
         ylabel='Density',
         xlim=[0, 210])

plt.savefig('inspection/darts3.png', dpi=150)
In [267]:
xs = np.linspace(0, 200, 101)
unbiased = np.abs(error_x)
biased = resample_weighted(unbiased, unbiased)

cdfplot(unbiased, xs, 'Density at a point')
cdfplot(biased, xs, 'Total density in a ring')

decorate(xlabel='Distance from center (mm)',
         ylabel='Density')

plt.savefig('inspection/darts4.png', dpi=150)
In [260]:
triple = (biased > 97) & (biased < 107)
triple.mean() * 100
Out[260]:
6.15
In [261]:
ring50 = (biased > 6) & (biased < 16)
ring50.mean() * 100
Out[261]:
4.6
In [262]:
double = (biased > 160) & (biased < 170)
double.mean() * 100
Out[262]:
0.22
In [263]:
bull = (biased < 6)
bull.mean() * 100
Out[263]:
0.76
In [ ]: