%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import binom, bernoulli, multinomial, poisson, norm, uniform
import numpy as np
import pandas as pd
Toss a coin $n$ times, with $k$ successes; the probability of heads is $\theta$, and $X \in \{0, ..., n\}$.
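For reference, the binomial pmf is:

$$Bin(k | n, \theta) = \binom{n}{k} \theta^k (1 - \theta)^{n - k}$$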
# setup experiment
n = 5
theta = 0.4
k = x = np.arange(binom.ppf(0.01, n, theta), binom.ppf(0.99, n, theta) + 1)  # +1 so the top point is included
y = binom.pmf(k, n, theta)
# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'bo', ms=8, label='binom pmf')
ax.vlines(x, ymin=0, ymax=y, colors='b', lw=5, alpha=0.5)
Toss a coin only once, so $X \in \{0, 1\}$.
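Its pmf is:

$$Ber(x | \theta) = \theta^{\mathbf{I}(x = 1)} (1 - \theta)^{\mathbf{I}(x = 0)}$$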
# setup experiment
theta = 0.3
k = x = np.arange(bernoulli.ppf(0.01, theta), bernoulli.ppf(0.99, theta) + 1)  # +1 so both outcomes 0 and 1 appear
y = bernoulli.pmf(k, theta)
# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'bo', ms=8, label='bernoulli pmf')
ax.vlines(x, 0, y, colors='b', lw=5, alpha=0.5)
Toss a $K$-sided die $n$ times; let $\textbf{x} = (x_1, ..., x_K) \in \{0, 1, ..., n\}^K$, where $x_j$ is the number of times side $j$ of the die occurs (so $\sum_j x_j = n$).
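The pmf is:

$$Mu(\textbf{x} | n, \boldsymbol{\theta}) = \binom{n}{x_1 \ldots x_K} \prod_{j=1}^K \theta_j^{x_j}$$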
# setup experiment
x = [3, 4]
n = 7
theta = [0.4, 0.6]
print(multinomial.pmf(x, n, theta))
print(binom.pmf(3, 7, 0.4))  # a 2-category multinomial reduces to a binomial
0.29030399999999973
0.2903040000000001
One-hot encoding $\textbf{x} \in \{0, 1\}^K$.
$$Mu(\textbf{x} | 1, \boldsymbol{\theta}) = \prod_{j=1}^K \theta_j^{\mathbf{I}(x_j = 1)}$$

The Poisson distribution counts rare events such as radioactive decays and traffic accidents. $X \in \{0, 1, ...\}$, $\lambda > 0$, and $e^{-\lambda}$ is the normalization constant:

$$Poi(x | \lambda) = e^{-\lambda} \frac{\lambda^x}{x!}$$
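A minimal sketch of the multinoulli: it is just a multinomial with $n = 1$ (the one-hot vector and die probabilities below are made-up values).

# multinoulli = multinomial with n = 1; the one-hot outcome picks out theta_j
x_onehot = [0, 1, 0]  # side 2 of a 3-sided die came up
theta3 = [0.2, 0.5, 0.3]  # made-up die probabilities
print(multinomial.pmf(x_onehot, 1, theta3))  # 0.5, i.e. theta_2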
# setup experiment
Lambda = 0.6  # 'lambda' is a reserved word in Python
k = x = np.arange(poisson.ppf(0.01, Lambda), poisson.ppf(0.99, Lambda) + 1)  # +1 to include the top point
y = poisson.pmf(x, Lambda)
# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'bo', ms=8, label='poisson pmf')
ax.vlines(x, 0, y, colors='b', lw=5, alpha=0.5)
from scipy.stats import norm, laplace, gamma, pareto, beta, expon, erlang, chi2
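The Gaussian (normal) distribution, with mean $\mu$ and standard deviation $\sigma$:

$$\mathcal{N}(x | \mu, \sigma^2) = \frac{1}{\sqrt{2 \pi \sigma^2}} e^{-\frac{(x - \mu)^2}{2 \sigma^2}}$$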
# setup experiment
mu = 0
std = 1
x = np.linspace(norm.ppf(0.01, mu, std), norm.ppf(0.99, mu, std), 100)
y = norm.pdf(x, mu, std)
# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='norm pdf')
Heavy-tailed; also known as the double-sided exponential distribution. $\mu$ is the location parameter and $b$ is the scale.
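Its pdf is:

$$Lap(x | \mu, b) = \frac{1}{2b} \exp\left(-\frac{|x - \mu|}{b}\right)$$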
# setup experiment
mu = 0
b = 1  # scale
x = np.linspace(laplace.ppf(0.01, mu, b), laplace.ppf(0.99, mu, b), 100)
y = laplace.pdf(x, mu, b)
# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='laplace pdf')
A flexible distribution for positive real-valued random variables.
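With shape $a$ and rate $b$, its pdf is:

$$Ga(x | a, b) = \frac{b^a}{\Gamma(a)} x^{a - 1} e^{-bx}$$

Note that scipy parameterizes the gamma by shape and scale $= 1/b$.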
# setup experiment
a = 1.99  # shape
b = 1  # rate; scipy takes scale = 1/b
x = np.linspace(gamma.ppf(0.01, a, scale=1./b), gamma.ppf(0.99, a, scale=1./b), 100)
y = gamma.pdf(x, a, scale=1./b)
# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='gamma pdf')
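The exponential distribution is the special case $Ga(x | 1, \lambda)$ and describes waiting times between events of a Poisson process:

$$Expon(x | \lambda) = \lambda e^{-\lambda x}$$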
# setup experiment exponential
a = 1  # rate; scipy's second positional argument is loc, so pass the rate via scale = 1/a
x = np.linspace(expon.ppf(0.01, scale=1./a), expon.ppf(0.99, scale=1./a), 100)
y = expon.pdf(x, scale=1./a)
# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='exponential pdf')
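The Erlang distribution is a gamma whose shape parameter $a$ is an integer, here $a = 2$.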
# setup experiment Erlang
a = 2  # integer shape
x = np.linspace(erlang.ppf(0.01, a), erlang.ppf(0.99, a), 100)
y = erlang.pdf(x, a)
# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='erlang pdf')
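The chi-squared distribution $\chi^2(x | \nu) = Ga(x | \frac{\nu}{2}, \frac{1}{2})$ is the distribution of a sum of squares of $\nu$ standard normal random variables.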
# setup experiment chi-square
nu = 55  # degrees of freedom
x = np.linspace(chi2.ppf(0.01, nu), chi2.ppf(0.99, nu), 100)
y = chi2.pdf(x, nu)
# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='Chi-square pdf')
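The beta distribution has support $[0, 1]$:

$$Beta(x | a, b) = \frac{1}{B(a, b)} x^{a - 1} (1 - x)^{b - 1}$$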
# setup experiment
a, b = 2.31, 0.627
x = np.linspace(beta.ppf(0.01, a, b), beta.ppf(0.99, a, b), 100)
y = beta.pdf(x, a, b)
# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='beta pdf')
Exhibits long tails, also called heavy tails; related to Zipf's law and power laws.
$$Pareto(x | k, m) = k m^k x^{-(k + 1)} \mathbf{I}(x \ge m)$$

$$\text{mean} = \frac{km}{k - 1} \text{ if } k > 1, \quad \text{mode} = m, \quad \text{var} = \frac{m^2 k}{(k - 1)^2 (k - 2)} \text{ if } k > 2$$

# setup experiment
b = 2.62
x = np.linspace(pareto.ppf(0.01, b), pareto.ppf(0.99, b), 100)
y = pareto.pdf(x, b)
# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='pareto pdf')
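A quick sanity check of the moment formulas above against scipy, assuming scipy's defaults $m = 1$ (scale) and $k = b$:

# compare the closed-form mean and variance with scipy's pareto.stats
k, m = b, 1.0
print("mean:", k * m / (k - 1), "vs", pareto.stats(b, moments='m'))
print("var: ", m**2 * k / ((k - 1)**2 * (k - 2)), "vs", pareto.stats(b, moments='v'))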
Given some evidence (100 samples of the integer values 0 to 10), compare how well a uniform model and a fitted normal model explain it.

# Evidence: 100 samples with the counts below
x = pd.Series([0]*2 + [1]*3 + [2]*5 + [3]*14 + [4]*16 + [5]*15 + [6]*12 + [7]*8 + [8]*10 + [9]*8 + [10]*7)
# empirical probability of each sample's value: np.histogram's default 10 bins
# pair each bin's left edge with its count; beware that the last bin [9, 10]
# merges the counts for 9 and 10 (so the probability of 9 is overstated),
# and the value 10 is patched in by hand
p_true = dict(zip(np.histogram(x)[1], np.histogram(x)[0]))
p_true[10.0] = 7
p_true = np.array([p_true[val] * 0.01 for val in x])  # counts -> probabilities (n = 100)
sns.distplot(x, bins=10, label="Evidence")
sns.distplot(x, bins=10, label="Evidence")
# Uniform model
p_uniform = uniform.pdf(x, loc=0, scale=11)
plt.plot(x, p_uniform, 'k-', lw=2, label="Uniform")
# Normal model
mu, std = norm.fit(x)
p_norm = norm.pdf(x, mu, std)
plt.plot(x, p_norm, 'r-', lw=2, label="Normal")
plt.legend(loc="best")
print "Log likelihood (Uniform):", np.log(p_uniform).sum()
print "Log likelihood (Normal):", np.log(p_norm).sum()
Log likelihood (Uniform): -239.7895272798371 Log likelihood (Normal): -234.1304210201753
print "KL (Uniform):", (p_true * np.log(p_true / p_uniform)).sum()
print "KL (Normal):", (p_true * np.log(p_true / p_norm)).sum()
KL (Uniform): 3.9965854232729168 KL (Normal): 1.889817124546627
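For reference, the KL divergence between discrete distributions $p$ and $q$ is

$$KL(p \| q) = \sum_x p(x) \log \frac{p(x)}{q(x)}$$

Note that the sums above run over all 100 samples rather than over the 11 distinct values, so each value's contribution is weighted by its empirical count.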
# given a 'good' training distribution (x, y)
mu = 1
std = 1
x_good = np.linspace(norm.ppf(0.01, mu, std), norm.ppf(0.99, mu, std), 100)
y_good = norm.pdf(x_good, mu, std)
# given a 'bad' training distribution (x, y)
mu = -1
std = 2
x_bad = np.linspace(norm.ppf(0.01, mu, std), norm.ppf(0.99, mu, std), 50)
y_bad = norm.pdf(x_bad, mu, std)
# plot the two distributions
fig, ax = plt.subplots(1, 1)
ax.plot(x_good, y_good, 'r-', lw=5, alpha=0.6, label='good')
ax.plot(x_bad, y_bad, 'g.', lw=5, alpha=0.6, label='bad')
ax.legend(loc="best")
# bin into quantiles
qtiles = [0, .25, .5, .75, .9, 1.]  # five (unequal) quantile bins
bin_good = pd.qcut(x_good, qtiles)
bin_bad = pd.qcut(x_bad, qtiles)
# calculate pmf of each bin
pmf_good = bin_good.describe()
pmf_bad = bin_bad.describe()
print "PMF good"
display(pmf_good)
print "PMF bad"
display(pmf_bad)
# given test data, determine the probability of each point under both models
test_data = np.array([-10, -1, 1, 2, 3])

def map_to_prob(data, pmf):
    res = {}
    for i in range(pmf.shape[0]):
        idx = pmf.index[i]  # an Interval, e.g. (-1.327, -0.163]
        for j in data:
            if j in idx:
                res[i] = pmf.loc[idx].freqs  # keyed by bin index
    return res

test_prob_good = map_to_prob(test_data, pmf_good)
test_prob_bad = map_to_prob(test_data, pmf_bad)
print("Good:", test_prob_good)
print("Bad:", test_prob_bad)
# note: points outside all bins (e.g. -10) are dropped, so the two dicts can
# have different keys and this positional difference may compare unlike bins
print("Diff:", np.array(list(test_prob_good.values())) - np.array(list(test_prob_bad.values())))
PMF good

| categories | counts | freqs |
|---|---|---|
| (-1.327, -0.163] | 25 | 0.25 |
| (-0.163, 1.0] | 25 | 0.25 |
| (1.0, 2.163] | 25 | 0.25 |
| (2.163, 2.861] | 15 | 0.15 |
| (2.861, 3.326] | 10 | 0.10 |
PMF bad

| categories | counts | freqs |
|---|---|---|
| (-5.654, -3.326] | 13 | 0.26 |
| (-3.326, -1.0] | 12 | 0.24 |
| (-1.0, 1.326] | 12 | 0.24 |
| (1.326, 2.722] | 8 | 0.16 |
| (2.722, 3.653] | 5 | 0.10 |
Good: {0: 0.25, 1: 0.25, 2: 0.25, 4: 0.1}
Bad: {1: 0.24, 2: 0.24, 3: 0.16, 4: 0.1}
Diff: [0.01 0.01 0.09 0.  ]
# apply the N(1, 1) model to the quantile bins: each bin's probability mass
# is the difference of the model cdf at the bin edges
mu = 1
std = 1
cat = []
freqs = []
for b in bin_good.categories:
    cat.append(b)
    freqs.append(norm.cdf(b.right, mu, std) - norm.cdf(b.left, mu, std))
norm_model = pd.DataFrame({"categories": cat, "freqs": freqs})
norm_model = norm_model.set_index("categories")
norm_model
| categories | freqs |
|---|---|
| (-1.327, -0.163] | 0.112432 |
| (-0.163, 1.0] | 0.377585 |
| (1.0, 2.163] | 0.377585 |
| (2.163, 2.861] | 0.091043 |
| (2.861, 3.326] | 0.021363 |
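A rough follow-up sketch: the same KL idea can compare the empirical bin frequencies with the normal model's bin masses. Renormalizing the model masses over the five bins (they cover only about 98% of the model's mass) is an assumption made here:

# KL between empirical bin freqs and the (renormalized) model bin masses
p_emp = pmf_good["freqs"].values
p_mod = norm_model["freqs"].values
p_mod = p_mod / p_mod.sum()  # renormalize over the five bins
print("KL(empirical || model):", (p_emp * np.log(p_emp / p_mod)).sum())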