#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')


# In[2]:


import matplotlib.pyplot as plt
import numpy as np
import pymc3 as pm
import scipy.stats
from sampled import sampled
import seaborn as sns
import theano.tensor as tt


# In[3]:


# plt.rcParams["figure.dpi"] = 600  # for presentation figures


# # Linear Regression

# Define a Bayesian linear regression model:

# In[4]:


@sampled
def linear_regression(X=None, y=None):
    β = pm.Normal('β', mu=0, sd=10, shape=X.shape[1])
    σ = pm.HalfNormal('σ', sd=1)
    X = pm.Flat('X', observed=X)
    y = pm.Normal('y', mu=X.dot(β), sd=σ, observed=y)


# Generate synthetic data, sample from the model, and plot the posteriors:

# In[5]:


X = np.random.normal(size=(100, 4))
β = np.random.normal(size=X.shape[1])
σ = np.random.normal(size=X.shape[0])
y = X.dot(β) + σ

with linear_regression(X=X, y=y):
    posterior_trace = pm.sample()


# In[6]:


ax = sns.distplot(posterior_trace.get_values('σ'))
# Compare to the empirical noise scale (the true value is 1)
ax.vlines(np.sqrt((σ ** 2).mean()), *ax.get_ylim());


# In[7]:


fig, axes = plt.subplots(nrows=len(β), sharex=True, sharey=True)
for ax, weight, true in zip(axes, posterior_trace.get_values('β').T, β):
    ax = sns.kdeplot(weight, ax=ax)
    ax.vlines(true, *ax.get_ylim(), alpha=0.5)
    ax.set_ylim(bottom=0)


# # Inverting the CDF

# To sample from a distribution with an invertible CDF, draw $z \sim \operatorname{Uniform}(0, 1)$
# and return $F^{-1}(z)$. Here $F(x) = 1 - e^{-x}$ is the exponential CDF, so
# $F^{-1}(z) = -\log(1 - z)$:

# In[8]:


x = np.linspace(0, 4)
y = 1 - np.exp(-x)
plt.plot(x, y, 'k-', linewidth=6)

z = 0.75
z_inv = -np.log(1 - z)
plt.hlines(z, 0, z_inv, linestyles='dashed', colors='steelblue', linewidth=6)
plt.vlines(z_inv, 0, z, linestyles='dashed', colors='steelblue', linewidth=6);


# # Rejection Sampling

# Here is a pretty general implementation, and a vectorized one for plotting.

# In[9]:


def rejection_sampling(pdf, q, k, size):
    """Rejection sampling

    Args:
        pdf: A (possibly scaled) pdf. Eats vectors and returns floats
        q: A scipy random variable with methods `.rvs` to sample and
           `.pdf` to get the pdf of a point
        k: A scaling factor to make sure `k * q.pdf(z) >= pdf(z)` for all `z`
        size: number of points to sample

    Returns:
        points: list of sampled points
        draws: number of draws it took to get this many points
    """
    draws = 0
    points = []
    while len(points) < size:
        proposal = q.rvs(size=1)
        draws += 1
        accept_prob = pdf(proposal) / (k * q.pdf(proposal))
        if accept_prob > 1:
            raise RuntimeError('k isn\'t big enough at {}'.format(proposal))
        if np.random.random() < accept_prob:
            points.append(proposal)
    return points, draws


# In[10]:


def two_gaussians_pdf():
    pdf_one = scipy.stats.norm(2).pdf
    pdf_two = scipy.stats.norm(-2).pdf

    def pdf(z):
        return 0.8 * pdf_one(z) + 0.2 * pdf_two(z)
    return pdf


def rejection_sample_plot(N):
    """Hard codes rejection sampling for the above pdf using a hand-tuned
    normal distribution.
    """
    one_d_unknown = two_gaussians_pdf()
    one_d_wide = scipy.stats.norm(0, 4)
    k = 1.02 * one_d_unknown(2) / one_d_wide.pdf(2)

    new_x = one_d_wide.rvs(N)
    new_y = np.random.uniform(0, k * one_d_wide.pdf(new_x))

    if N > 0:
        extent = max(abs(new_x.max()), abs(new_x.min()))
        x = np.linspace(min(-8, -1.1 * extent), max(1.1 * extent, 8), 1000)
    else:
        x = np.linspace(-8, 8, 1000)

    fig, ax = plt.subplots()
    ax.plot(x, [one_d_unknown(j) for j in x])
    ax.plot(x, [k * one_d_wide.pdf(j) for j in x])
    ax.fill_between(x, 0, [one_d_unknown(j) for j in x], alpha=0.4)
    ax.fill_between(x, 0, [k * one_d_wide.pdf(j) for j in x], alpha=0.1)

    accept = new_y <= one_d_unknown(new_x)
    ax.plot(new_x[accept], new_y[accept], marker='o', color='green', linestyle='None')
    ax.plot(new_x[~accept], new_y[~accept], marker='o', color='red', linestyle='None')


# In[11]:


rejection_sample_plot(50)
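# The general `rejection_sampling` function is defined above but never called, so here is a
# small usage sketch (added here, not part of the original notebook): draw points from the
# two-Gaussian mixture using the same hand-tuned `N(0, 4)` envelope and constant `k` as in
# `rejection_sample_plot`, and report the empirical acceptance rate.

# In[ ]:


# Added usage sketch for `rejection_sampling`, reusing the hand-tuned envelope above.
mixture_pdf = two_gaussians_pdf()
envelope = scipy.stats.norm(0, 4)
k = 1.02 * mixture_pdf(2) / envelope.pdf(2)
points, draws = rejection_sampling(mixture_pdf, envelope, k, size=200)
print('Accepted {} of {} draws ({:.1f}%)'.format(len(points), draws, 100 * len(points) / draws))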
# # Concentration of measure

# In[12]:


for N in (2, 4, 8, 16, 32):
    x = scipy.stats.multivariate_normal(cov=np.eye(N) / N).rvs(size=10000)
    _ = sns.distplot(np.linalg.norm(x, axis=1))


# # Sampling from a donut

# In[13]:


def jointplot(ary):
    """Helper to plot everything consistently"""
    sns.jointplot(*ary.T, alpha=0.1, stat_func=None,
                  xlim=(-1.2, 1.2), ylim=(-1.2, 1.2))


# In[14]:


def metropolis_accept(old, new):
    """In log space -- the proposal should be symmetric!"""
    return np.log(np.random.uniform()) < new - old


def normal_proposal(scale):
    """Make a proposal for Metropolis-Hastings from the normal distribution."""
    def proposal(x):
        return np.random.normal(x, scale)
    return proposal


def metropolis(logp, n=1000, step_size=0.15, init=np.array([1, 0])):
    """Implementation of Metropolis-Hastings.

    The proposal distribution is hardcoded to be a normal distribution, but it
    will sample in whatever dimension `init` has.  Also prints the acceptance
    rate at the end.  Optimal is ~0.234.
    """
    proposal = normal_proposal(step_size)
    accept = 0
    old = proposal(init)
    samples = []
    for _ in range(n):
        new = proposal(old)
        if metropolis_accept(logp(old), logp(new)):
            old = new
            accept += 1
        samples.append(old)
    print('Acceptance rate: {:.2f}%'.format(100 * accept / n))
    return np.array(samples)


# In[15]:


def donut_pdf(scale):
    """Example (unnormalized) log-pdf for visualizations.

    Normally distributed around the unit circle; `scale` controls the width
    of the ring.
    """
    def logp(x):
        return -((1 - np.linalg.norm(x)) / scale) ** 2
    return logp


# Now we sample from the donut distribution using the handwritten Metropolis sampler above:

# In[16]:


samples = metropolis(donut_pdf(0.1), n=1000, step_size=0.37)
jointplot(samples)


# Now see how badly the sampler does with a smaller scale on the donut:

# In[17]:


samples = metropolis(donut_pdf(0.01), n=1000, step_size=0.03)
jointplot(samples)


# We can do the same thing with PyMC3, but we will want the pdf rewritten with `theano`.

# In[18]:


def tt_donut_pdf(scale):
    """Compare to `donut_pdf`"""
    def logp(x):
        return -tt.square((1 - x.norm(2)) / scale)
    return logp


# In[19]:


@sampled
def donut(scale=0.1, **observed):
    """Gets samples from the donut pdf, and allows adjusting the scale of the
    donut at sample time."""
    pm.DensityDist('donut', logp=tt_donut_pdf(scale), shape=2, testval=[0, 1])


# In[20]:


with donut(scale=0.1):
    metropolis_sample = pm.sample(draws=1000, step=pm.Metropolis())

jointplot(metropolis_sample.get_values('donut'))


# PyMC3 also does poorly on a smaller-scaled donut (because the problem is the Metropolis
# sampler, not my implementation!)

# In[21]:


with donut(scale=0.01):
    metropolis_sample = pm.sample(draws=1000, step=pm.Metropolis())

jointplot(metropolis_sample.get_values('donut'))
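# As a rough mixing diagnostic (added, not in the original notebook), convert the Metropolis
# samples on the narrow donut to angles around the ring and plot them against the draw index.
# A well-mixed chain would sweep through the whole $(-\pi, \pi]$ range; the Metropolis chain
# typically stays close to its starting angle.

# In[ ]:


# Added diagnostic sketch: angle of each Metropolis sample around the donut.
xy = metropolis_sample.get_values('donut')
angles = np.arctan2(xy[:, 1], xy[:, 0])
plt.plot(angles, marker='.', linestyle='None', alpha=0.3)
plt.xlabel('draw')
plt.ylabel('angle (radians)');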
# Sample the same distributions using the Hamiltonian sampler in PyMC3. It has no problem
# with either:

# In[22]:


with donut(scale=0.1):
    nuts_sample = pm.sample(draws=1000, init=None, step=pm.NUTS())

jointplot(nuts_sample.get_values('donut'))


# In[23]:


with donut(scale=0.01):
    nuts_sample = pm.sample(draws=1000, init=None, step=pm.NUTS())

jointplot(nuts_sample.get_values('donut'))


# Also note that PyMC3 will make good choices if you just call `sample`:

# In[24]:


with donut(scale=0.1):
    nuts_sample = pm.sample()

jointplot(nuts_sample.get_values('donut'))


# $$H(q, p) = -\log \pi(p | q) - \log \pi(q)$$
#
# $$\frac{dq}{dt} = \frac{\partial H}{\partial p}$$
#
# $$\frac{dp}{dt} = -\frac{\partial H}{\partial q}$$

# Example:
#
# Let $\pi(q) = \mathcal{N}(0, 1)$ and $\pi(p | q) = \mathcal{N}(0, 1)$, so
# $$ H(q, p) = C + \tfrac{1}{2} p^2 + \tfrac{1}{2} q^2.$$
#
# We can manually compute the gradients for these distributions in order to plot the HMC
# paths in 2 dimensions: 1 for position, 1 for momentum.

# In[25]:


def gen_data(position, momentum, n=10000):
    """Generate a background density plot for the position and momentum
    distributions.  Not used for sampling."""
    q = position.rvs(n)
    p = momentum(q).rvs()
    return np.column_stack([q, p])


# In[26]:


def leapfrog(q, p, dHdq, dHdp, step_size, n_steps):
    """Perform the leapfrog integration.

    Similar to the implementations in PyMC3, but returns an array of all the
    points in the path for plotting.  It is a pretty general implementation,
    in that it does not hardcode the potential or kinetic energies.

    Args:
        q: current position
        p: current momentum
        dHdq: see Hamilton's equations above
        dHdp: see Hamilton's equations above
        step_size: How big of a step to take
        n_steps: How many steps to take

    Returns:
        ((`n_steps` + 1) x 2) numpy array showing the path through
        (position, momentum) space that the Hamiltonian path took.
    """
    data = [[q, p]]
    p += 0.5 * step_size * -dHdq(q, p)
    q += step_size * dHdp(q, p)
    data.append([q, p])
    for _ in range(n_steps - 1):
        p += step_size * -dHdq(q, p)
        q += step_size * dHdp(q, p)
        data.append([q, p])
    p += 0.5 * step_size * -dHdq(q, p)
    return np.array(data)


# In[27]:


def leapfrog_paths(position, momentum, dHdq, dHdp, n=10):
    """Get a number `n` of paths from the HMC sampler.

    This is not quite as general -- I hardcode a step size of 0.01 and 100
    steps, since a path length of 1 looked pretty good (and *not* because it
    is the best choice).

    Returns an iterator of plot data.
    """
    q = position.rvs()
    p = momentum(q).rvs()
    for _ in range(n):
        path = leapfrog(q, p, dHdq, dHdp, 0.01, 100)
        yield path
        q, _ = path[-1]
        p = momentum(q).rvs()


# Here we write down the distributions above, and manually compute the gradients needed to
# use the above two functions.

# In[28]:


position = scipy.stats.norm(0, 1)
momentum = lambda q: scipy.stats.norm(0, np.ones(q.shape))

# Gradients of H(q, p) = C + p**2 / 2 + q**2 / 2
dHdp = lambda q, p: p
dHdq = lambda q, p: q


# In[29]:


# First plot a KDE of the joint pdf
X = gen_data(position, momentum)
g = sns.jointplot(*X.T, kind='kde', stat_func=None,
                  xlim=(-3, 3), ylim=(-3, 3), alpha=0.5)

# Now plot the leapfrog paths on top of that
for path in leapfrog_paths(position, momentum, dHdq, dHdp, 15):
    g.ax_joint.plot(*path.T, linewidth=3)


# # Volume of an N-dimensional unit sphere
#
# It is pretty small:
# $$
# V_n(r) = \frac{\pi^{\frac{n}{2}}}{\Gamma \left(\frac{n}{2} + 1 \right)} r^n
# $$

# In[30]:


from scipy.special import gamma

n = np.linspace(0, 50, 1000)
V = np.power(np.pi, 0.5 * n) / gamma(0.5 * n + 1)
fig = plt.plot(n, V)
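# To put numbers on "pretty small" (added, not in the original notebook): evaluate the same
# formula at a few dimensions. The volume peaks around $n = 5$ and then collapses toward zero.

# In[ ]:


# Added: unit-ball volumes at a few dimensions, using the formula above.
for dim in (2, 5, 10, 20, 50):
    print('n = {:2d}: V = {:.3g}'.format(dim, np.pi ** (0.5 * dim) / gamma(0.5 * dim + 1)))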
# # In 20 dimensions, how many samples from an enclosing cube fall inside the unit sphere?
#
# This actually samples from one "hyper quadrant", but by symmetry, the probability will be
# the same.

# In[31]:


x = np.random.random(size=(20, 10000000))
((x * x).sum(axis=0) < 1).sum()
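# As a sanity check (added, not in the original notebook): the unit ball occupies a fraction
# $V_{20}(1) / 2^{20}$ of the enclosing cube $[-1, 1]^{20}$, which by symmetry is also its
# fraction of the hyper quadrant $[0, 1]^{20}$ sampled above, so even ten million draws are
# expected to produce less than one hit.

# In[ ]:


# Added check: expected number of hits out of 10 million uniform draws.
expected_fraction = np.pi ** 10 / (gamma(11) * 2 ** 20)
print('Expected hits out of 1e7 draws: {:.3f}'.format(1e7 * expected_fraction))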