%matplotlib inline
import numpy as np
from scipy.stats import binom, geom, hypergeom, nbinom, poisson
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.offline as offline
init_notebook_mode(connected=True)
# --- Binomial(n, p) PMF with a slider over p (n fixed at 40) ---
n = 40
x_range = np.arange(0, n + 1)
p_range = np.arange(0.0, 1.1, 0.1)

# One bar trace per value of p; all start hidden and the slider toggles them.
data = [dict(
    visible = False,
    type = 'bar',
    # FIX: format p to avoid float artifacts like '0.30000000000000004'
    name = 'p = ' + format(p, '.2g'),
    x = x_range,
    y = binom.pmf(x_range, n, p)) for p in p_range]

# Build one slider step per trace: each step shows exactly one trace.
steps = []
for i in range(len(data)):
    step = dict(
        method = 'restyle',
        args = ['visible', [False] * len(p_range)],
        # FIX: formatted label instead of raw str() of a float
        label = format(p_range[i], '.2g')
    )
    step['args'][1][i] = True  # Toggle i'th trace to "visible"
    steps.append(step)

i = 5  # initially-visible trace (p = 0.5)
data[i]['visible'] = True

sliders = [ dict(
    active = i,
    currentvalue = {"prefix": "p = "},
    pad = {"t": 50},
    steps = steps
) ]

layout = dict(sliders=sliders,
    # FIX: this is the Binomial PMF, not Bernoulli
    title='PMF Binomial(n,p) as p varies',
    yaxis=dict(
        autorange=False,
        range=[0, 1]
    ))
fig = dict(data=data, layout=layout)
offline.iplot(fig, filename='Binomial_var_p')
# --- Binomial(n, p) PMF with a slider over n (three fixed p values) ---
n_range = np.arange(1, 41, 1)
p_range = np.array([0.25, 0.5, 0.75])

# Bar traces: one per (n, p) pair, grouped so the len(p_range) traces for a
# given n are contiguous (flat index = n_idx*len(p_range) + p_idx).
hist = [dict(
    visible = False,
    type = 'bar',
    name = 'p = ' + str(p),
    x = np.arange(0, n + 1),
    y = binom.pmf(np.arange(0, n + 1), n, p)) for n in n_range for p in p_range]

# Dashed vertical lines marking the mean µ = n*p for each (n, p) pair.
lines = [dict(
    visible = False,
    type = 'scatter',
    name = 'µ for p = ' + str(p),
    line = dict(dash = 'dash'),
    opacity = 0.3,
    x = [n * p, n * p],
    y = [-10, 10]) for n in n_range for p in p_range]

data = hist + lines

# One slider step per n: show the bar + mean-line traces for every p at that n.
n_hist = len(p_range) * len(n_range)  # number of bar traces (== number of line traces)
steps = []
for i in range(len(n_range)):
    step = dict(
        method = 'restyle',
        args = ['visible', [False] * len(data)],
        label = str(n_range[i])
    )
    # FIX: loop over p indices instead of hard-coding offsets 0, 1, 2,
    # so the code still works if p_range changes length.
    for j in range(len(p_range)):
        step['args'][1][i * len(p_range) + j] = True           # bar trace
        step['args'][1][n_hist + i * len(p_range) + j] = True  # mean line
    steps.append(step)

# Initially show the traces for n_range[20] (n = 21).
i = 20
for j in range(len(p_range)):
    data[i * len(p_range) + j]['visible'] = True
    data[len(p_range) * len(n_range) + i * len(p_range) + j]['visible'] = True

sliders = []
sliders.append( dict(
    active = 20,
    currentvalue = {"prefix": "n = "},
    pad = {"t": 50},
    steps = steps
))

layout = dict(sliders=sliders,
    # FIX: this is the Binomial PMF, not Bernoulli
    title='PMF Binomial(n,p) as n varies',
    yaxis=dict(
        autorange=False,
        range=[0, .8]
    ),
    xaxis=dict(
        tick0=0,
        dtick=1
    )
)
fig = dict(data=data, layout=layout)
offline.iplot(fig, filename='Binomial_var')
# --- Geometric(p) PMF with a slider over p, plus dashed markers at µ and σ ---
p_range = np.arange(0.05, 1., 0.05)

# Bar traces: PMF over the central 98% of the support for each p.
hist = [dict(
    visible = False,
    type = 'bar',
    name = 'f(y)',
    x = np.arange(geom.ppf(0.01, p), geom.ppf(0.99, p)),
    y = geom.pmf(np.arange(geom.ppf(0.01, p), geom.ppf(0.99, p)), p)) for p in p_range]

# Vertical dashed line at the mean µ = 1/p.
linesmu = [dict(
    visible = False,
    type = 'scatter',
    name = 'µ',
    line = dict(dash = 'dash'),
    opacity = 0.3,
    x = [1 / p, 1 / p],
    y = [-10, 10]) for p in p_range]

# Vertical dash-dot line at the standard deviation σ = sqrt((1-p)/p^2).
linessig = [dict(
    visible = False,
    type = 'scatter',
    name = 'σ',
    line = dict(dash = 'dashdot'),
    opacity = 0.3,
    x = [np.sqrt((1 - p) / p**2), np.sqrt((1 - p) / p**2)],
    y = [-10, 10]) for p in p_range]

data = hist + linesmu + linessig

# One slider step per p: reveal the matching bar, µ-line and σ-line traces.
steps = []
for i in range(len(p_range)):
    step = dict(
        method = 'restyle',
        args = ['visible', [False] * len(data)],
        # FIX: format the label to avoid float artifacts like '0.15000000000000002'
        label = format(p_range[i], '.2g')
    )
    step['args'][1][i] = True                     # bar trace
    step['args'][1][len(p_range) + i] = True      # µ line
    step['args'][1][2 * len(p_range) + i] = True  # σ line
    steps.append(step)

i = 5  # initially-visible p index
data[i]['visible'] = True
data[len(p_range) + i]['visible'] = True
data[2 * len(p_range) + i]['visible'] = True

sliders = [ dict(
    active = i,
    currentvalue = {"prefix": "p = "},
    pad = {"t": 50},
    steps = steps
) ]

layout = dict(sliders=sliders,
    title='PMF Geom(p) as p varies',
    yaxis=dict(
        autorange=False,
        range=[0, 1]
    ))
fig = dict(data=data, layout=layout)
offline.iplot(fig, filename='Geom_var')
# --- Poisson(λ) PMF with a slider over λ, plus dashed markers at µ and σ ---
p_range = np.arange(0.25, 20, 0.25)
x_range = np.arange(-1, 200)

hist = []
linesmu = []
linessig = []
for lam in p_range:
    # Bar trace for the PMF at this λ.
    hist.append(dict(
        visible = False,
        type = 'bar',
        name = 'f(y)',
        x = x_range,
        y = poisson.pmf(x_range, lam)))
    # Vertical dashed line at the mean µ = λ.
    linesmu.append(dict(
        visible = False,
        type = 'scatter',
        name = 'µ',
        line = dict(dash = 'dash'),
        opacity = 0.3,
        x = [lam, lam],
        y = [-10, 10]))
    # Vertical dash-dot line at the standard deviation σ = sqrt(λ).
    linessig.append(dict(
        visible = False,
        type = 'scatter',
        name = 'σ',
        line = dict(dash = 'dashdot'),
        opacity = 0.3,
        x = [np.sqrt(lam), np.sqrt(lam)],
        y = [-10, 10]))

data = hist + linesmu + linessig

# One slider step per λ: reveal the matching bar, µ-line and σ-line traces.
steps = []
for idx, lam in enumerate(p_range):
    mask = [False] * len(data)
    mask[idx] = True                      # bar trace
    mask[len(p_range) + idx] = True       # µ line
    mask[2 * len(p_range) + idx] = True   # σ line
    steps.append(dict(method = 'restyle', args = ['visible', mask], label = str(lam)))

i = 5  # initially-visible λ index
for offset in (0, len(p_range), 2 * len(p_range)):
    data[offset + i]['visible'] = True

sliders = [ dict(
    active = i,
    currentvalue = {"prefix": "λ = "},
    pad = {"t": 50},
    steps = steps
) ]

layout = dict(sliders=sliders,
    title='PMF Poisson(λ) as λ varies',
    yaxis=dict(
        autorange=False,
        range=[0, 1]
    ),
    xaxis=dict(
        range=[-1,40]
    ))
fig = dict(data=data, layout=layout)
offline.iplot(fig, filename='Poisson_var')
Recall that the pmf of $Y \sim hypergeometric(N,K,n)$, giving the probability of exactly $k$ successes in $n$ draws without replacement from a population of size $N$ that contains $K$ successes, is $$P(Y=k) = f(k) = \frac{{K \choose k}{N-K \choose n-k}}{{N \choose n}}\,,$$ for $k$ in the range $\max(0, n+K-N) \leq k \leq \min(K,n)$.
$E[Y] = n \frac{K}{N}$ and $Var[Y] = n \frac{K}{N} \frac{N-K}{N} \frac{N-n}{N-1}$.
Note that the probability of success changes on each draw as each draw decreases the population since we are sampling without replacement. Question: How does this compare to the binomial distribution; that is, for a binomial experiment, how are we drawing the samples?
Let $X \sim binomial(n, p)$ for $p = K/N$. If ($N$ and $K$ are large compared to $n$) AND ($p$ is not close to $0$ or $1$) then $Y$ and $X$ have similar distributions: $$P(Y \leq k) \approx P(X \leq k)\,,$$ since if $n$ is small compared to $N$ and $K$ the draws are "closer" to being iid $Bernoulli(p)$ trials.
# --- Hypergeom(N, K, n) PMF vs Binomial(n, K/N) approximation, slider over n ---
N = 1000
p_range = np.array([0.05, 0.25, 0.5])  # ratios K/N
n_range = np.arange(25, 475, 25)       # number of draws

# Hypergeometric PMF bar traces, one per (n, K/N) pair, grouped so the
# len(p_range) traces for a given n are contiguous.
# scipy signature: hypergeom.pmf(k, M, n, N) = pmf(k, pop. size, #successes, #draws)
hist = [dict(
    visible = False,
    type = 'bar',
    name = 'K/N = ' + str(p),
    x = np.arange(0, int(p * N) + 1),
    y = hypergeom.pmf(np.arange(0, int(p * N) + 1), N, int(p * N), n)) for n in n_range for p in p_range]

# Matching Binomial(n, p) PMF scatter traces for comparison.
blines = [dict(
    visible = False,
    type = 'scatter',
    name = 'p = ' + str(p),
    x = np.arange(0, n + 1),
    y = binom.pmf(np.arange(0, n + 1), n, p)) for n in n_range for p in p_range]

data = hist + blines

# One slider step per n: show every hypergeom bar and binomial line for that n.
n_hist = len(p_range) * len(n_range)  # number of hypergeom traces
steps = []
for i in range(len(n_range)):
    step = dict(
        method = 'restyle',
        args = ['visible', [False] * len(data)],
        label = str(n_range[i])
    )
    # FIX: loop over p indices instead of hard-coding offsets 0, 1, 2,
    # so the code still works if p_range changes length.
    for j in range(len(p_range)):
        step['args'][1][i * len(p_range) + j] = True           # hypergeom bars
        step['args'][1][n_hist + i * len(p_range) + j] = True  # binomial lines
    steps.append(step)

# Initially show the traces for the smallest n (n = 25).
i = 0
for j in range(len(p_range)):
    data[i * len(p_range) + j]['visible'] = True
    data[len(p_range) * len(n_range) + i * len(p_range) + j]['visible'] = True

sliders = []
sliders.append( dict(
    active = i,
    currentvalue = {"prefix": "n = "},
    pad = {"t": 50},
    steps = steps
))

layout = dict(sliders=sliders,
    title='PMF for hypergeom(N='+str(N)+', K, n) as n increases',
    yaxis=dict(
        autorange=False,
        range=[0, 0.4]
    ),
    xaxis=dict(
        range=[-1,275]
    )
)
fig = dict(data=data, layout=layout)
offline.iplot(fig, filename='hypergeom_var')
Above, I fix $N = 1000$ and the ratio $K/N = p$. As $n$ gets smaller, compared to $N$ and $K$, for values of $K/N = p$ away from $0$ and $1$, the distribution of $Y \sim hypergeom(N,K,n)$ becomes approximately $X \sim binomial(n,p)$.