%matplotlib inline
import numpy as np
from scipy.stats import binom, geom, hypergeom, nbinom, poisson
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.offline as offline
init_notebook_mode(connected=True)
# --- Binomial(n, p) PMF with a slider over p (n fixed at 40) ---
n = 40
x_range = np.arange(0, n + 1)
p_range = np.arange(0.0, 1.1, 0.1)

# One bar trace per value of p; all start hidden and the slider toggles them.
data = [dict(
    visible = False,
    type = 'bar',
    # FIX: format p to avoid float artifacts like '0.30000000000000004'
    name = 'p = ' + format(p, '.2g'),
    x = x_range,
    y = binom.pmf(x_range, n, p)) for p in p_range]

# Build one slider step per trace: each step shows exactly one trace.
steps = []
for i in range(len(data)):
    step = dict(
        method = 'restyle',
        args = ['visible', [False] * len(p_range)],
        # FIX: formatted label instead of raw str() of a float
        label = format(p_range[i], '.2g')
    )
    step['args'][1][i] = True  # Toggle i'th trace to "visible"
    steps.append(step)

i = 5  # initially-visible trace (p = 0.5)
data[i]['visible'] = True

sliders = [ dict(
    active = i,
    currentvalue = {"prefix": "p = "},
    pad = {"t": 50},
    steps = steps
) ]

layout = dict(sliders=sliders,
    # FIX: this is the Binomial PMF, not Bernoulli
    title='PMF Binomial(n,p) as p varies',
    yaxis=dict(
        autorange=False,
        range=[0, 1]
    ))
fig = dict(data=data, layout=layout)
offline.iplot(fig, filename='Binomial_var_p')
# --- Binomial(n, p) PMF with a slider over n (three fixed p values) ---
n_range = np.arange(1, 41, 1)
p_range = np.array([0.25, 0.5, 0.75])

# Bar traces: one per (n, p) pair, grouped so the len(p_range) traces for a
# given n are contiguous (flat index = n_idx*len(p_range) + p_idx).
hist = [dict(
    visible = False,
    type = 'bar',
    name = 'p = ' + str(p),
    x = np.arange(0, n + 1),
    y = binom.pmf(np.arange(0, n + 1), n, p)) for n in n_range for p in p_range]

# Dashed vertical lines marking the mean µ = n*p for each (n, p) pair.
lines = [dict(
    visible = False,
    type = 'scatter',
    name = 'µ for p = ' + str(p),
    line = dict(dash = 'dash'),
    opacity = 0.3,
    x = [n * p, n * p],
    y = [-10, 10]) for n in n_range for p in p_range]

data = hist + lines

# One slider step per n: show the bar + mean-line traces for every p at that n.
n_hist = len(p_range) * len(n_range)  # number of bar traces (== number of line traces)
steps = []
for i in range(len(n_range)):
    step = dict(
        method = 'restyle',
        args = ['visible', [False] * len(data)],
        label = str(n_range[i])
    )
    # FIX: loop over p indices instead of hard-coding offsets 0, 1, 2,
    # so the code still works if p_range changes length.
    for j in range(len(p_range)):
        step['args'][1][i * len(p_range) + j] = True           # bar trace
        step['args'][1][n_hist + i * len(p_range) + j] = True  # mean line
    steps.append(step)

# Initially show the traces for n_range[20] (n = 21).
i = 20
for j in range(len(p_range)):
    data[i * len(p_range) + j]['visible'] = True
    data[len(p_range) * len(n_range) + i * len(p_range) + j]['visible'] = True

sliders = []
sliders.append( dict(
    active = 20,
    currentvalue = {"prefix": "n = "},
    pad = {"t": 50},
    steps = steps
))

layout = dict(sliders=sliders,
    # FIX: this is the Binomial PMF, not Bernoulli
    title='PMF Binomial(n,p) as n varies',
    yaxis=dict(
        autorange=False,
        range=[0, .8]
    ),
    xaxis=dict(
        tick0=0,
        dtick=1
    )
)
fig = dict(data=data, layout=layout)
offline.iplot(fig, filename='Binomial_var')
# --- Geometric(p) PMF with a slider over p, plus dashed markers at µ and σ ---
p_range = np.arange(0.05, 1., 0.05)

# Bar traces: PMF over the central 98% of the support for each p.
hist = [dict(
    visible = False,
    type = 'bar',
    name = 'f(y)',
    x = np.arange(geom.ppf(0.01, p), geom.ppf(0.99, p)),
    y = geom.pmf(np.arange(geom.ppf(0.01, p), geom.ppf(0.99, p)), p)) for p in p_range]

# Vertical dashed line at the mean µ = 1/p.
linesmu = [dict(
    visible = False,
    type = 'scatter',
    name = 'µ',
    line = dict(dash = 'dash'),
    opacity = 0.3,
    x = [1 / p, 1 / p],
    y = [-10, 10]) for p in p_range]

# Vertical dash-dot line at the standard deviation σ = sqrt((1-p)/p^2).
linessig = [dict(
    visible = False,
    type = 'scatter',
    name = 'σ',
    line = dict(dash = 'dashdot'),
    opacity = 0.3,
    x = [np.sqrt((1 - p) / p**2), np.sqrt((1 - p) / p**2)],
    y = [-10, 10]) for p in p_range]

data = hist + linesmu + linessig

# One slider step per p: reveal the matching bar, µ-line and σ-line traces.
steps = []
for i in range(len(p_range)):
    step = dict(
        method = 'restyle',
        args = ['visible', [False] * len(data)],
        # FIX: format the label to avoid float artifacts like '0.15000000000000002'
        label = format(p_range[i], '.2g')
    )
    step['args'][1][i] = True                     # bar trace
    step['args'][1][len(p_range) + i] = True      # µ line
    step['args'][1][2 * len(p_range) + i] = True  # σ line
    steps.append(step)

i = 5  # initially-visible p index
data[i]['visible'] = True
data[len(p_range) + i]['visible'] = True
data[2 * len(p_range) + i]['visible'] = True

sliders = [ dict(
    active = i,
    currentvalue = {"prefix": "p = "},
    pad = {"t": 50},
    steps = steps
) ]

layout = dict(sliders=sliders,
    title='PMF Geom(p) as p varies',
    yaxis=dict(
        autorange=False,
        range=[0, 1]
    ))
fig = dict(data=data, layout=layout)
offline.iplot(fig, filename='Geom_var')
# --- Poisson(λ) PMF with a slider over λ, plus dashed markers at µ and σ ---
p_range = np.arange(0.25, 20, 0.25)
x_range = np.arange(-1, 200)

hist = []
linesmu = []
linessig = []
for lam in p_range:
    # Bar trace for the PMF at this λ.
    hist.append(dict(
        visible = False,
        type = 'bar',
        name = 'f(y)',
        x = x_range,
        y = poisson.pmf(x_range, lam)))
    # Vertical dashed line at the mean µ = λ.
    linesmu.append(dict(
        visible = False,
        type = 'scatter',
        name = 'µ',
        line = dict(dash = 'dash'),
        opacity = 0.3,
        x = [lam, lam],
        y = [-10, 10]))
    # Vertical dash-dot line at the standard deviation σ = sqrt(λ).
    linessig.append(dict(
        visible = False,
        type = 'scatter',
        name = 'σ',
        line = dict(dash = 'dashdot'),
        opacity = 0.3,
        x = [np.sqrt(lam), np.sqrt(lam)],
        y = [-10, 10]))

data = hist + linesmu + linessig

# One slider step per λ: reveal the matching bar, µ-line and σ-line traces.
steps = []
for idx, lam in enumerate(p_range):
    mask = [False] * len(data)
    mask[idx] = True                      # bar trace
    mask[len(p_range) + idx] = True       # µ line
    mask[2 * len(p_range) + idx] = True   # σ line
    steps.append(dict(method = 'restyle', args = ['visible', mask], label = str(lam)))

i = 5  # initially-visible λ index
for offset in (0, len(p_range), 2 * len(p_range)):
    data[offset + i]['visible'] = True

sliders = [ dict(
    active = i,
    currentvalue = {"prefix": "λ = "},
    pad = {"t": 50},
    steps = steps
) ]

layout = dict(sliders=sliders,
    title='PMF Poisson(λ) as λ varies',
    yaxis=dict(
        autorange=False,
        range=[0, 1]
    ),
    xaxis=dict(
        range=[-1,40]
    ))
fig = dict(data=data, layout=layout)
offline.iplot(fig, filename='Poisson_var')
Recall that the pmf of $Y \sim hypergeometric(N,K,n)$, giving the probability of exactly $k$ successes in $n$ draws without replacement from a population of size $N$ that contains $K$ successes, is $$P(Y=k) = f(k) = \frac{{K \choose k}{N-K \choose n-k}}{{N \choose n}}\,,$$ for $k$ in the range $\max(0, n+K-N) \leq k \leq \min(K,n)$.
$E[Y] = n \frac{K}{N}$ and $Var[Y] = n \frac{K}{N} \frac{N-K}{N} \frac{N-n}{N-1}$.
Note that the probability of success changes on each draw as each draw decreases the population since we are sampling without replacement. Question: How does this compare to the binomial distribution; that is, for a binomial experiment, how are we drawing the samples?
Let $X \sim binomial(n, p)$ for $p = K/N$. If ($N$ and $K$ are large compared to $n$) AND ($p$ is not close to $0$ or $1$) then $Y$ and $X$ have similar distributions: $$P(Y \leq k) \approx P(X \leq k)\,,$$ since if $n$ is small compared to $N$ and $K$ the draws are "closer" to being iid $Bernoulli(p)$ trials.
# --- Hypergeom(N, K, n) PMF vs Binomial(n, K/N) approximation, slider over n ---
N = 1000
p_range = np.array([0.05, 0.25, 0.5])  # ratios K/N
n_range = np.arange(25, 475, 25)       # number of draws

# Hypergeometric PMF bar traces, one per (n, K/N) pair, grouped so the
# len(p_range) traces for a given n are contiguous.
# scipy signature: hypergeom.pmf(k, M, n, N) = pmf(k, pop. size, #successes, #draws)
hist = [dict(
    visible = False,
    type = 'bar',
    name = 'K/N = ' + str(p),
    x = np.arange(0, int(p * N) + 1),
    y = hypergeom.pmf(np.arange(0, int(p * N) + 1), N, int(p * N), n)) for n in n_range for p in p_range]

# Matching Binomial(n, p) PMF scatter traces for comparison.
blines = [dict(
    visible = False,
    type = 'scatter',
    name = 'p = ' + str(p),
    x = np.arange(0, n + 1),
    y = binom.pmf(np.arange(0, n + 1), n, p)) for n in n_range for p in p_range]

data = hist + blines

# One slider step per n: show every hypergeom bar and binomial line for that n.
n_hist = len(p_range) * len(n_range)  # number of hypergeom traces
steps = []
for i in range(len(n_range)):
    step = dict(
        method = 'restyle',
        args = ['visible', [False] * len(data)],
        label = str(n_range[i])
    )
    # FIX: loop over p indices instead of hard-coding offsets 0, 1, 2,
    # so the code still works if p_range changes length.
    for j in range(len(p_range)):
        step['args'][1][i * len(p_range) + j] = True           # hypergeom bars
        step['args'][1][n_hist + i * len(p_range) + j] = True  # binomial lines
    steps.append(step)

# Initially show the traces for the smallest n (n = 25).
i = 0
for j in range(len(p_range)):
    data[i * len(p_range) + j]['visible'] = True
    data[len(p_range) * len(n_range) + i * len(p_range) + j]['visible'] = True

sliders = []
sliders.append( dict(
    active = i,
    currentvalue = {"prefix": "n = "},
    pad = {"t": 50},
    steps = steps
))

layout = dict(sliders=sliders,
    title='PMF for hypergeom(N='+str(N)+', K, n) as n increases',
    yaxis=dict(
        autorange=False,
        range=[0, 0.4]
    ),
    xaxis=dict(
        range=[-1,275]
    )
)
fig = dict(data=data, layout=layout)
offline.iplot(fig, filename='hypergeom_var')
Above, I fix $N = 1000$ and the ratio $K/N = p$. As $n$ gets smaller, compared to $N$ and $K$, for values of $K/N = p$ away from $0$ and $1$, the distribution of $Y \sim hypergeom(N,K,n)$ becomes approximately $X \sim binomial(n,p)$.