#!/usr/bin/env python
# coding: utf-8

# In[40]:

import os
os.environ['THEANO_FLAGS'] = "device=gpu2"
get_ipython().run_line_magic('matplotlib', 'inline')

from matplotlib import pylab as plt
import theano
from theano import tensor as T
import numpy as np


# In[249]:

#create a regular grid in weight space for visualisation
wmin = -5
wmax = 5
wrange = np.linspace(wmin, wmax, 300)
w = np.repeat(wrange[:, None], 300, axis=1)
w = np.concatenate([[w.flatten()], [w.T.flatten()]])


# In[297]:

prior_variance = 2
logprior = -(w**2).sum(axis=0)/2/prior_variance

plt.contourf(wrange, wrange, logprior.reshape(300, 300), cmap='gray');
plt.axis('square');
plt.xlim([wmin, wmax])
plt.ylim([wmin, wmax]);


# In[298]:

#generating a toy dataset with three manually selected observations
from scipy.stats import logistic
sigmoid = logistic.cdf
logsigmoid = logistic.logcdf

def likelihood(w, x, b=0, y=1):
    return logsigmoid(y*(np.dot(w.T, x) + b)).flatten()

x1 = np.array([[1.5], [1]])
x2 = np.array([[-1.5], [1]])
x3 = np.array([[0.5], [-1]])
y1 = 1
y2 = 1
y3 = -1

llh1 = likelihood(w, x1, y=y1)
llh2 = likelihood(w, x2, y=y2)
llh3 = likelihood(w, x3, y=y3)


# In[299]:

#calculating the unnormalised log posterior
#this is only for illustration
logpost = llh1 + llh2 + llh3 + logprior


# In[300]:

#plotting the true (unnormalised) posterior
#the red dots show the three datapoints, the short line segments show the direction
#in which the corresponding label shifts the posterior: positively labelled datapoints
#pull the posterior away from zero towards the datapoint's direction, negatively labelled
#datapoints push it in the opposite direction.
plt.contourf(wrange, wrange, np.exp(logpost.reshape(300, 300).T), cmap='gray');
plt.axis('square');
plt.xlim([wmin, wmax])
plt.ylim([wmin, wmax])
plt.plot(x1[0], x1[1], '.r')
plt.plot([x1[0], x1[0]*(1+0.2*y1)], [x1[1], x1[1]*(1+0.2*y1)], 'r-')
plt.plot(x2[0], x2[1], '.r')
plt.plot([x2[0], x2[0]*(1+0.2*y2)], [x2[1], x2[1]*(1+0.2*y2)], 'r-')
plt.plot(x3[0], x3[1], '.r')
plt.plot([x3[0], x3[0]*(1+0.2*y3)], [x3[1], x3[1]*(1+0.2*y3)], 'r-');


# ## Fitting an approximate posterior
# 
# This part is the actual GAN-style machinery. Here we define the generator and the denoiser networks in Lasagne, and code up the two loss functions in theano.
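# 
# To recap what the code below implements: we fit an implicit variational distribution $q_\phi(w)$, defined by pushing Gaussian noise $z$ through a generator network $G_\phi$, by ascending a Monte Carlo estimate of the variational bound
# 
# $$\mathcal{L}(\phi) = \mathbb{E}_{q_\phi}\big[\log p(y|X,w) + \log p(w)\big] + \mathbb{H}[q_\phi].$$
# 
# The likelihood and prior terms are straightforward; the entropy term is the awkward one because $q_\phi$ has no tractable density. The trick is to train a denoiser $D_\sigma$ to reconstruct samples $w \sim q_\phi$ from noise-corrupted versions $w + \epsilon$, $\epsilon \sim \mathcal{N}(0, \sigma^2 I)$. For the optimal denoiser, $D_\sigma(w) \approx w + \sigma^2 \nabla_w \log q_\sigma(w)$ (the Tweedie/Miyasawa identity, where $q_\sigma$ is $q_\phi$ smoothed by the noise), so $(w - D_\sigma(w))/\sigma^2$ is, roughly and up to the smoothing by $\sigma$, the per-sample gradient $-\nabla_w \log q_\phi(w)$ needed for the entropy term. This is the quantity computed as `dHdG` below.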
# In[214]:

from lasagne.utils import floatX
from lasagne.layers import (InputLayer, DenseLayer, NonlinearityLayer)
from lasagne.nonlinearities import sigmoid

#defines a 'generator' network
def build_G(input_var=None, num_z=3):
    network = InputLayer(input_var=input_var, shape=(None, num_z))
    network = DenseLayer(incoming=network, num_units=10)
    network = DenseLayer(incoming=network, num_units=20)
    network = DenseLayer(incoming=network, num_units=2, nonlinearity=None)
    return network

#defines the 'denoiser' network
def build_denoiser(input_var=None):
    network = InputLayer(input_var=input_var, shape=(None, 2))
    network = DenseLayer(incoming=network, num_units=20)
    network = DenseLayer(incoming=network, num_units=10)
    network = DenseLayer(incoming=network, num_units=20)
    network = DenseLayer(incoming=network, num_units=2, nonlinearity=None)
    return network


# In[509]:

from lasagne.layers import get_output, get_all_params
from theano.printing import debugprint
from lasagne.updates import adam
from theano.tensor.shared_randomstreams import RandomStreams
from lasagne.objectives import squared_error

#variables for input (design matrix), output labels, GAN noise variable, weights
x_var = T.matrix('design matrix')
y_var = T.vector('labels')
z_var = T.matrix('GAN noise')
w_var = T.matrix('weights')

#theano variables for things like batchsize, learning rate, etc.
batchsize_var = T.scalar('batchsize', dtype='int32')
prior_variance_var = T.scalar('prior variance')
noise_variance_var = T.scalar('noise variance')
learningrate_var = T.scalar('learning rate')

#random numbers for sampling from the variational distribution
srng = RandomStreams(seed=1337)
z_rnd = srng.normal((batchsize_var, 3))
epsilon_rnd = T.sqrt(noise_variance_var)*srng.normal((batchsize_var, 2))

#instantiating the G and denoiser networks
generator = build_G(z_var)
denoiser = build_denoiser()

#random samples from the variational distribution, and noise-corrupted versions of them
samples_from_generator = get_output(generator, z_rnd)
noisy_samples_from_generator = samples_from_generator + epsilon_rnd

#denoiser output for synthetic samples and noisy synthetic samples
denoised_noisy_samples_from_generator = get_output(denoiser, inputs=noisy_samples_from_generator)
denoised_samples_from_generator = get_output(denoiser, inputs=samples_from_generator)

#loss of the denoiser - mean squared error between the clean samples and the
#reconstructions of their noise-corrupted versions
loss_denoiser = squared_error(samples_from_generator, denoised_noisy_samples_from_generator).mean()

#log likelihood for each synthetic w sampled from the generator
log_likelihood = T.log(
    T.nnet.sigmoid(
        (y_var.dimshuffle(0, 'x', 'x')*(x_var.dimshuffle(0, 1, 'x') * samples_from_generator.dimshuffle('x', 1, 0))).sum(1)
    )
).sum(0).mean()

#log prior for synthetic w sampled from the generator
log_prior = -((samples_from_generator**2).sum(1)/2/prior_variance_var).mean()

params_G = get_all_params(generator, trainable=True)

#calculating the derivative of the entropy with respect to parameters of G, using theano's Lop
dHdG = (samples_from_generator - denoised_samples_from_generator)/noise_variance_var

dHdPhi = T.Lop(
    f=samples_from_generator.flatten()/batchsize_var,
    wrt=params_G,
    eval_points=dHdG.flatten())

#calculating gradients of other terms in the bound and summing it all up
dLikelihooddPhi = T.grad(log_likelihood, wrt=params_G)
dPriordPhi = T.grad(log_prior, wrt=params_G)
dLdPhi = [-a-b-c for a, b, c in zip(dHdPhi, dPriordPhi, dLikelihooddPhi)]

updates_G = adam(
    dLdPhi,
    params_G,
    learning_rate=learningrate_var,
)
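# A note on the entropy gradient above: T.Lop(f, wrt, eval_points) computes the product of
# eval_points with the Jacobian df/dwrt (a vector-Jacobian product). With f being the
# (batch-averaged, flattened) generator samples and eval_points = dHdG, this backpropagates
# the per-sample entropy gradient (w - D(w))/sigma^2 through the generator, giving the
# gradient of the approximate entropy term with respect to the generator parameters phi
# without ever forming the full Jacobian dw/dphi.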
#compiling theano functions:
evaluate_generator = theano.function(
    [z_var],
    get_output(generator),
    allow_input_downcast=True
)

sample_generator = theano.function(
    [batchsize_var],
    samples_from_generator,
    allow_input_downcast=True,
)

params_denoiser = get_all_params(denoiser, trainable=True)

updates_denoiser = adam(
    loss_denoiser,
    params_denoiser,
    learning_rate=learningrate_var
)

train_denoiser = theano.function(
    [batchsize_var, noise_variance_var, learningrate_var],
    loss_denoiser,
    updates=updates_denoiser,
    allow_input_downcast=True
)

train_G = theano.function(
    [x_var, y_var, prior_variance_var, noise_variance_var, batchsize_var, learningrate_var],
    [],
    updates=updates_G,
    allow_input_downcast=True
)

#gradient components evaluated at an arbitrary set of w; divide the entropy component by the
#number of w vectors so that it is on the same per-sample scale as the T.grad of the means below
grad_comp_1 = theano.clone(dHdG, replace={samples_from_generator: w_var})/w_var.shape[0]
grad_comp_2 = T.grad(theano.clone(log_prior, replace={samples_from_generator: w_var}), wrt=w_var)
grad_comp_3 = T.grad(theano.clone(log_likelihood, replace={samples_from_generator: w_var}), wrt=w_var)

evaluate_gradients_G = theano.function(
    [w_var, x_var, y_var, prior_variance_var, noise_variance_var],
    [grad_comp_1 + grad_comp_2 + grad_comp_3, grad_comp_1, grad_comp_2, grad_comp_3],
    allow_input_downcast=True
)

evaluate_denoiser = theano.function(
    [w_var],
    get_output(denoiser, w_var),
    allow_input_downcast=True
)

#this is to evaluate the per-datapoint likelihood of an arbitrary set of w
#(the log is taken on the numpy side)
llh_for_w = T.nnet.sigmoid((y_var.dimshuffle(0, 'x', 'x')*(x_var.dimshuffle(0, 1, 'x') * w_var.dimshuffle('x', 1, 0))).sum(1))

evaluate_loglikelihood = theano.function(
    [x_var, y_var, w_var],
    llh_for_w,
    allow_input_downcast=True
)


# In[510]:

#checking that theano and numpy give the same likelihoods
import seaborn as sns
sns.set_context('poster')

X_train = np.concatenate([x1, x2, x3], axis=1).T
y_train = np.array([y1, y2, y3])

llh_theano = evaluate_loglikelihood(X_train, y_train, w.T)

plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plt.contourf(wrange, wrange, np.log(llh_theano).sum(0).reshape(300, 300).T, cmap='gray');
plt.axis('square');
plt.xlim([wmin, wmax])
plt.ylim([wmin, wmax])
plt.title('theano loglikelihood')

plt.subplot(1, 2, 2)
plt.contourf(wrange, wrange, (llh1+llh2+llh3).reshape(300, 300).T, cmap='gray');
plt.axis('square');
plt.xlim([wmin, wmax])
plt.ylim([wmin, wmax])
plt.title('numpy loglikelihood')

assert np.allclose(llh1+llh2+llh3, np.log(llh_theano).sum(0))


# In[511]:

#q and the denoiser before training:
wrange_spaced = np.linspace(wmin, wmax, 30)
w_spaced = np.repeat(wrange_spaced[:, None], 30, axis=1)
w_spaced = np.concatenate([[w_spaced.flatten()], [w_spaced.T.flatten()]])

arrows = evaluate_denoiser(w_spaced.T) - w_spaced.T

plt.quiver(w_spaced[0, :], w_spaced[1, :], arrows[:, 0], arrows[:, 1])

W = sample_generator(200)
plt.plot(W[:, 0], W[:, 1], 'g.')
plt.axis('square')
plt.xlim([wmin, wmax])
plt.ylim([wmin, wmax]);


# In[512]:

batchsize = 100
KARPATHY_CONSTANT = 3e-4
learning_rate = KARPATHY_CONSTANT*10
prior_variance = 1

#pre-training on large noise to avoid spurious modes
noise_variance = 0.7
# train the denoiser for a while before starting the alternating optimisation
for i in range(5):
    get_ipython().run_line_magic('timeit', '-n 200 train_denoiser(batchsize, noise_variance, learning_rate)')
    print(train_denoiser(batchsize*10, noise_variance, learning_rate))

noise_variance = 0.1
# train the denoiser for a while longer at the smaller noise level
for i in range(10):
    get_ipython().run_line_magic('timeit', '-n 200 train_denoiser(batchsize, noise_variance, learning_rate)')
    print(train_denoiser(batchsize*10, noise_variance, learning_rate))
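# The cell above (and the main loop below) uses the %timeit magic simply as a compact way of
# running the denoiser update many times while also reporting how long it takes: '%timeit -n N stmt'
# executes the statement N times per timing repeat, so each call performs at least N denoiser
# updates. A plain-Python sketch of the same alternating scheme, one G step per block of denoiser
# steps, looks roughly like this (the function name and loop counts are illustrative):

def fit_approximate_posterior(n_outer=200, n_inner=300):
    #inner loop: keep the denoiser close to optimal for the current q
    #outer loop: one gradient step on the generator per inner loop
    for _ in range(n_outer):
        for _ in range(n_inner):
            train_denoiser(batchsize, noise_variance, learning_rate)
        train_G(X_train, y_train, prior_variance, noise_variance, batchsize, learning_rate)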
# In[513]:

#q and denoiser after training the denoiser:
arrows = evaluate_denoiser(w_spaced.T) - w_spaced.T
plt.quiver(w_spaced[0, :], w_spaced[1, :], arrows[:, 0], arrows[:, 1])
W = sample_generator(1000)
plt.plot(W[:, 0], W[:, 1], 'g.')
plt.axis('square')
plt.xlim([wmin, wmax])
plt.ylim([wmin, wmax]);


# In[514]:

# this is the main training loop. For each gradient step of G I do 300 gradient steps of the denoiser
print(train_denoiser(batchsize, noise_variance, 0))
for i in range(200):
    get_ipython().run_line_magic('timeit', '-n 300 train_denoiser(batchsize, noise_variance, learning_rate)')
    print(train_denoiser(batchsize, noise_variance, 0))
    train_G(X_train, y_train, prior_variance, noise_variance, batchsize, learning_rate)


# In[515]:

#gradient field of each term in the objective, evaluated on a regular grid, with samples from q overlaid:
sns.set_style('whitegrid')
plt.subplots(figsize=(16, 16))

arrows_sum, arrows_1, arrows_2, arrows_3 = evaluate_gradients_G(w_spaced.T, X_train, y_train, prior_variance, noise_variance)

W = sample_generator(200)

plt.subplot(2, 2, 1)
plt.quiver(w_spaced[0, :], w_spaced[1, :], 0.1*arrows_sum[:, 0], 0.1*arrows_sum[:, 1])
plt.plot(W[:, 0], W[:, 1], 'g.')
plt.axis('square')
plt.xlim([wmin, wmax])
plt.ylim([wmin, wmax]);
plt.xticks([])
plt.yticks([])
plt.title('overall gradients')

plt.subplot(2, 2, 2)
plt.quiver(w_spaced[0, :], w_spaced[1, :], arrows_1[:, 0], arrows_1[:, 1])
plt.plot(W[:, 0], W[:, 1], 'g.')
plt.axis('square')
plt.xlim([wmin, wmax])
plt.ylim([wmin, wmax]);
plt.xticks([])
plt.yticks([])
plt.title('entropy (from denoiser)')

plt.subplot(2, 2, 3)
plt.quiver(w_spaced[0, :], w_spaced[1, :], arrows_2[:, 0], arrows_2[:, 1])
plt.plot(W[:, 0], W[:, 1], 'g.')
plt.axis('square')
plt.xlim([wmin, wmax])
plt.ylim([wmin, wmax]);
plt.xticks([])
plt.yticks([]);
plt.title('prior');

plt.subplot(2, 2, 4)
plt.quiver(w_spaced[0, :], w_spaced[1, :], arrows_3[:, 0], arrows_3[:, 1])
plt.plot(W[:, 0], W[:, 1], 'g.')
plt.axis('square')
plt.xlim([wmin, wmax])
plt.ylim([wmin, wmax]);
plt.xticks([])
plt.yticks([]);
plt.title('likelihood');


# In[516]:

#the same gradient components, evaluated at samples from q:
W = sample_generator(100)
arrows_sum, arrows_1, arrows_2, arrows_3 = evaluate_gradients_G(W, X_train, y_train, prior_variance, noise_variance)

sns.set_style('whitegrid')
plt.subplots(figsize=(16, 16))
arrow_scale = 3e-1

plt.subplot(2, 2, 1)
plt.plot(W[:, 0], W[:, 1], 'g.', alpha=0.3)
plt.quiver(W[:, 0], W[:, 1], arrows_sum[:, 0], arrows_sum[:, 1], scale=arrow_scale, scale_units='width', width=0.005)
plt.axis('square')
plt.xlim([wmin, wmax])
plt.ylim([wmin, wmax]);
plt.xticks([])
plt.yticks([])
plt.title('overall gradients')

plt.subplot(2, 2, 2)
plt.plot(W[:, 0], W[:, 1], 'g.', alpha=0.3)
plt.quiver(W[:, 0], W[:, 1], arrows_1[:, 0], arrows_1[:, 1], scale=arrow_scale, scale_units='width', width=0.005)
plt.axis('square')
plt.xlim([wmin, wmax])
plt.ylim([wmin, wmax]);
plt.xticks([])
plt.yticks([])
plt.title('entropy (from denoiser)')

plt.subplot(2, 2, 3)
plt.plot(W[:, 0], W[:, 1], 'g.', alpha=0.3)
plt.quiver(W[:, 0], W[:, 1], arrows_2[:, 0], arrows_2[:, 1], scale=arrow_scale, scale_units='width', width=0.005)
plt.axis('square')
plt.xlim([wmin, wmax])
plt.ylim([wmin, wmax]);
plt.xticks([])
plt.yticks([]);
plt.title('prior');

plt.subplot(2, 2, 4)
plt.plot(W[:, 0], W[:, 1], 'g.', alpha=0.3)
plt.quiver(W[:, 0], W[:, 1], arrows_3[:, 0], arrows_3[:, 1], scale=arrow_scale, scale_units='width', width=0.005)
plt.axis('square')
plt.xlim([wmin, wmax])
plt.ylim([wmin, wmax]);
plt.xticks([])
plt.yticks([]);
plt.title('likelihood');


# In[517]:

plt.contourf(wrange, wrange, np.exp(logpost.reshape(300, 300).T), cmap='gray');
plt.axis('square');
W = sample_generator(1000)
plt.plot(W[:, 0], W[:, 1], '.g')
plt.xlim([wmin, wmax])
plt.ylim([wmin, wmax]);
plt.title('true posterior');
# In[518]:

sns.set_style('whitegrid')

plt.subplot(1, 2, 2)
W = sample_generator(5000)
plot = sns.kdeplot(W[:, 0], W[:, 1])
plt.axis('square')
plot.set(xlim=(wmin, wmax))
plot.set(ylim=(wmin, wmax))
plt.title('kde of approximate posterior')

plt.subplot(1, 2, 1)
plt.contourf(wrange, wrange, np.exp(logpost.reshape(300, 300).T), cmap='gray');
plt.axis('square');
W = sample_generator(100)
plt.plot(W[:, 0], W[:, 1], '.g')
plt.xlim([wmin, wmax])
plt.ylim([wmin, wmax]);
plt.title('true posterior');


# In[520]:

#looking at average gradient magnitudes
#at convergence, the entropy gradient should roughly cancel the combined prior and likelihood
#gradients, so their average (squared) magnitudes should be comparable.
W = sample_generator(100)
arrows_sum, arrows_1, arrows_2, arrows_3 = evaluate_gradients_G(W, X_train, y_train, prior_variance, noise_variance)

print('entropy:', (arrows_1**2).mean())
print('log-prior:', (arrows_2**2).mean())
print('log-likelihood:', (arrows_3**2).mean())
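# In[ ]:

#what the approximate posterior buys us: Monte Carlo predictions. This is a minimal sketch,
#with a made-up test point, of how the samples could be used: average the likelihood over
#posterior samples to estimate the predictive probability p(y=1 | x, data).
x_test = np.array([1.0, -0.5])   #hypothetical test input
W = sample_generator(5000)       #samples from the approximate posterior q(w)
p_positive = logistic.cdf(W.dot(x_test)).mean()
print('predictive p(y=1 | x_test):', p_positive)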