#!/usr/bin/env python
# coding: utf-8

# In[ ]:


get_ipython().run_line_magic('matplotlib', 'inline')

from matplotlib import pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import RandomizedPCA
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import CCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.svm import LinearSVC
from scipy.stats import pearsonr
from random import shuffle

import cPickle, gzip
import numpy as np
import time


def time_f(f, n):
    # average wall-clock time of n calls to f
    start = time.time()
    for i in xrange(n):
        f()
    return (time.time() - start) / n


# download http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
f = gzip.open('mnist.pkl.gz', 'rb')
(mnist, labels), _, _ = cPickle.load(f)
f.close()


# # Monte Carlo integration

# In[ ]:


def estimate_pi(n=1000):
    # the quarter of the unit circle inside [0,1]^2 has area pi/4, so
    # pi is roughly 4 * P[x^2 + y^2 <= 1] for (x, y) uniform on [0,1]^2
    return np.mean(np.sum(np.power(np.random.rand(int(n), 2), 2), 1) <= 1) * 4


# In[ ]:


np.random.seed(0)

n_arr = np.logspace(1, 6, 10)
p_arr = np.zeros(len(n_arr))
n_reps = 10

for rep in xrange(n_reps):
    for i in xrange(len(n_arr)):
        p_arr[i] += np.abs(estimate_pi(n_arr[i]) - np.pi) / n_reps


# In[ ]:


plt.plot(np.log(n_arr), np.log(p_arr));
plt.xlabel('log number of samples');
plt.ylabel('log absolute error');


# # Truncated SVD

# In[ ]:


def my_svd(A, k):
    # randomized truncated SVD in the style of Halko et al.: sketch the range
    # of A with a Gaussian test matrix, orthonormalize the sketch, and take an
    # exact SVD of the small projected matrix B = Q^T A
    Omega = np.random.randn(A.shape[1], 2 * k)
    Y = np.dot(A, Omega)
    Q, _ = np.linalg.qr(Y)
    B = np.dot(Q.T, A)
    S, Sigma, V = np.linalg.svd(B)
    U = np.dot(Q, S)
    U = U[:, 0:k]
    Sigma = np.diag(Sigma[0:k])
    V = V[0:k, :]
    return np.dot(np.dot(U, Sigma), V)


t_n = [125, 250, 500, 1000, 2000]

t_my = np.zeros(len(t_n))
t_random = np.zeros(len(t_n))
t_arpack = np.zeros(len(t_n))
t_random_e = np.zeros(len(t_n))
t_arpack_e = np.zeros(len(t_n))
t_my_e = np.zeros(len(t_n))

for n in xrange(len(t_n)):
    # low-rank test matrix of size t_n[n] x t_n[n] with rank 2*t_n[n]/10
    A = np.dot(np.random.randn(t_n[n], 2 * t_n[n] / 10),
               np.random.randn(2 * t_n[n] / 10, t_n[n]))

    def f():
        my_svd(A, t_n[n] / 10)
    t_my[n] = time_f(f, 10)
    t_my_e[n] = np.linalg.norm(my_svd(A, t_n[n] / 10) - A) / t_n[n]

    svd = TruncatedSVD(t_n[n] / 10, 'randomized')

    def f():
        svd.fit(A)
    t_random[n] = time_f(f, 10)
    t_random_e[n] = np.linalg.norm(svd.inverse_transform(svd.transform(A)) - A) / t_n[n]

    svd = TruncatedSVD(t_n[n] / 10, 'arpack')

    def f():
        svd.fit(A)
    t_arpack[n] = time_f(f, 10)
    t_arpack_e[n] = np.linalg.norm(svd.inverse_transform(svd.transform(A)) - A) / t_n[n]


# In[ ]:


plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(t_n, t_my, label='My beauty');
plt.plot(t_n, t_arpack, label='ARPACK');
plt.plot(t_n, t_random, label='Halko et al.');
plt.xlabel('m')
plt.ylabel('running time')
plt.legend();
plt.subplot(1, 2, 2)
plt.plot(t_n, t_my_e, label='My beauty');
plt.plot(t_n, t_arpack_e, label='ARPACK');
plt.plot(t_n, t_random_e, label='Halko et al.');
plt.xlabel('m')
plt.ylabel('reconstruction error')
plt.legend();


# # Dimensionality reduction

# In[ ]:


f = gzip.open('mnist.pkl.gz', 'rb')
(mnist, labels), _, _ = cPickle.load(f)
f.close()

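# For a random projection matrix with entries $A_{ij} \sim \mathcal{N}(0, 1/k)$, as
# constructed below, squared distances are preserved in expectation,
# \begin{equation*}
# \mathbb{E}\,\|x A - x' A\|^2 = \|x - x'\|^2,
# \end{equation*}
# which is the Johnson-Lindenstrauss intuition probed by the histograms of $d_z / d_x$ below.
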
# In[ ]:


i = 1
plt.figure(figsize=(10, 10))
for k in [10, 50, 100, 200]:
    plt.subplot(2, 2, i)
    x1 = mnist[np.random.permutation(mnist.shape[0])[0:1000]]
    x2 = mnist[np.random.permutation(mnist.shape[0])[0:1000]]
    dx = pairwise_distances(x2, x2).ravel()

    # random projection to k dimensions
    A = np.random.randn(x1.shape[1], k) * 1. / np.sqrt(k)
    Ai = np.linalg.pinv(A, 1e-3)
    z2 = np.dot(x2, A)
    xz = np.dot(z2, Ai)
    dz = pairwise_distances(z2, z2).ravel()

    # PCA to k dimensions, fitted on an independent sample
    pca = PCA(k).fit(x1)
    p2 = pca.transform(x2)
    xp = pca.inverse_transform(p2)
    dp = pairwise_distances(p2, p2).ravel()

    plt.hist(dz / (dx + 1e-6), bins=100, alpha=0.5, normed=True, label='random');
    #plt.hist(dp / (dx + 1e-6), bins=100, alpha=0.5, normed=True, label='pca');
    plt.title(str(k) + ' projections')
    plt.legend()
    i = i + 1
plt.tight_layout()


# ## Beware! There are many ways of being "far away" from something!

# In[ ]:


k = 10  # ten latent components

A = np.random.randn(x1.shape[1], k) * 1. / np.sqrt(k)
Ai = np.linalg.pinv(A, 1e-3)
z2 = np.dot(x2, A)
xz = np.dot(z2, Ai)
dz = pairwise_distances(z2, z2).ravel()

pca = PCA(k).fit(x1)
p2 = pca.transform(x2)
xp = pca.inverse_transform(p2)
dp = pairwise_distances(p2, p2).ravel()

# PCA reconstructions
plt.figure(figsize=(10, 10))
for i in xrange(100):
    plt.subplot(10, 10, i + 1)
    plt.imshow(xp[i].reshape(28, 28), cmap='Greys_r')
    plt.axis('off')
plt.show()

# random-projection reconstructions
plt.figure(figsize=(10, 10))
for i in xrange(100):
    plt.subplot(10, 10, i + 1)
    plt.imshow(xz[i].reshape(28, 28), cmap='Greys_r')
    plt.axis('off')
plt.show()


# # Kernel methods and random features

# In[ ]:


np.random.seed(1)


def random_features(d, k, gamma):
    # random Fourier features for the RBF kernel with parameter gamma
    w = np.random.randn(d, k) * np.sqrt(2. * gamma)
    b = 2. * np.pi * np.random.rand(k)

    def foo(x):
        return np.sqrt(2. / k) * np.cos(np.dot(x, w) + b)
    return foo


def estimate_gamma(x, n=100):
    # median heuristic for the RBF kernel parameter
    x_sub = x[np.random.permutation(x.shape[0])[0:n]]
    return 1. / np.median(np.power(pairwise_distances(x_sub, x_sub), 2).flatten())


x = mnist[np.random.permutation(mnist.shape[0])[0:100]]
g = estimate_gamma(x)

plt.figure(figsize=(6, 6))
plt.subplot(2, 2, 1)
K = rbf_kernel(x, x, g)
plt.imshow(K)
plt.title('exact kernel')
plt.axis('off');

p = 1
for k in [100, 1000, 10000]:
    plt.subplot(2, 2, p + 1)
    rf = random_features(x.shape[1], k, g)
    plt.imshow(np.dot(rf(x), rf(x).T))
    plt.title(k)
    plt.axis('off');
    p += 1
plt.tight_layout()

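# A rough numerical check of the approximation $k(x, x') \approx z(x)^\top z(x')$ visualized
# above, reusing `random_features` and `estimate_gamma`; the names `x_check`, `g_check` and
# `rf_check` are just local to this sketch.

# In[ ]:


x_check = mnist[np.random.permutation(mnist.shape[0])[0:100]]
g_check = estimate_gamma(x_check)
K_exact = rbf_kernel(x_check, x_check, g_check)
for k in [100, 1000, 10000]:
    rf_check = random_features(x_check.shape[1], k, g_check)
    Z = rf_check(x_check)
    # largest entrywise deviation from the exact kernel matrix
    print k, np.abs(np.dot(Z, Z.T) - K_exact).max()
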
# ## Does it blend on MNIST?

# In[ ]:


f = gzip.open('mnist.pkl.gz', 'rb')
(x_tr, y_tr), (x_va, y_va), (x_te, y_te) = cPickle.load(f)
f.close()

x_tr = np.vstack((x_tr, x_va))
y_tr = np.hstack((y_tr, y_va))

rf = random_features(x_tr.shape[1], 2048, estimate_gamma(x_tr))
z_tr = rf(x_tr)
z_va = rf(x_va)
z_te = rf(x_te)


# In[ ]:


svm = LinearSVC().fit(z_tr, y_tr)
print np.mean(svm.predict(z_te) == y_te)


# ## How do we deal with uncertainty?

# The predictive distribution of Bayesian linear regression is
# \begin{equation*}
# p(f | x, X, y) = \mathcal{N} \left( \sigma^{-2}_n x^\top A^{-1} X y,\; x^\top A^{-1} x \right),
# \end{equation*}
# where
# \begin{equation*}
# A = \sigma^{-2}_n X X^\top + \Sigma_p^{-1}
# \end{equation*}
# and the prior over the weights is
# \begin{equation*}
# w \sim \mathcal{N}(0, \Sigma_p).
# \end{equation*}

# In[ ]:


def bayesian_linear_regression(X, y, Sigma_p=None, sigma_n=None):
    if Sigma_p is None:
        Sigma_p = np.diag(np.repeat(1, X.shape[1]))
    if sigma_n is None:
        sigma_n = 1
    A = np.power(sigma_n, -2) * np.dot(X.T, X) + np.linalg.inv(Sigma_p)
    Ai = np.linalg.inv(A)
    C = np.dot(np.dot(Ai, X.T), y)

    def mean(x):
        return np.power(sigma_n, -2) * np.dot(x, C)

    def variance(x):
        # predictive variance x^T A^{-1} x from the equation above
        return np.diag(np.dot(np.dot(x, Ai), x.T))[:, np.newaxis]
    return (mean, variance)


def f(x):
    return np.sin(5 * x) + np.random.randn(x.shape[0], 1) * 0.1


def estimate_gamma(x, n=100):
    x_sub = x[np.random.permutation(x.shape[0])[0:n]]
    return 1. / np.median(np.power(pairwise_distances(x_sub, x_sub), 2).flatten())


np.random.seed(0)

N = 1000
K = 1000

x_tr = np.random.randn(N, 1)
y_tr = f(x_tr)

s_x = StandardScaler().fit(x_tr)
s_y = StandardScaler().fit(y_tr)
x_tr = s_x.transform(x_tr)
y_tr = s_y.transform(y_tr)

rf = random_features(1, K, estimate_gamma(x_tr))
z_tr = rf(x_tr)
s_z = StandardScaler().fit(z_tr)
z_tr = s_z.transform(z_tr)

(mean, variance) = bayesian_linear_regression(z_tr, y_tr, None, 1)

x_te = 0.75 * np.sort(np.random.randn(N, 1), 0)
#x_te = 1.5*np.sort(np.random.randn(N,1),0)
y_te = f(x_te)
x_te = s_x.transform(x_te)
y_te = s_y.transform(y_te)
z_te = rf(x_te)
z_te = s_z.transform(z_te)

plt.plot(x_te, y_te, '.', label='test data')
plt.plot(x_tr, y_tr, '.', color='red', label='training data')
plt.plot(x_te, mean(z_te), lw=3, label='prediction')
plt.fill(np.concatenate([x_te, x_te[::-1]]),
         np.concatenate([mean(z_te) - 1.9600 * np.sqrt(variance(z_te)),
                        (mean(z_te) + 1.9600 * np.sqrt(variance(z_te)))[::-1]]),
         alpha=.25, fc='g', ec='None', label='95% confidence interval')
plt.xlim(np.min(x_te), np.max(x_te));
plt.legend(loc='best')
plt.title(np.power(y_te - mean(z_te), 2).mean())


# # Nonlinear component analysis

# In[ ]:


f = gzip.open('mnist.pkl.gz', 'rb')
(x_tr, y_tr), (x_va, y_va), (x_te, y_te) = cPickle.load(f)
f.close()

x_tr = np.vstack((x_tr, x_va))
y_tr = np.hstack((y_tr, y_va))

rf = random_features(x_tr.shape[1], 2048, estimate_gamma(x_tr))
z_tr = rf(x_tr)
z_va = rf(x_va)
z_te = rf(x_te)

xz_tr = z_tr
xz_te = z_te

# project the random features to 10 principal components ...
pca = RandomizedPCA(10).fit(z_tr)
p_tr = pca.transform(z_tr)
p_te = pca.transform(z_te)

# ... and reconstruct the original images from them with ridge regression
# on a second layer of random features
rf = random_features(p_tr.shape[1], 2048, estimate_gamma(p_tr))
pz_tr = rf(p_tr)
pz_te = rf(p_te)

ridge = Ridge().fit(pz_tr, x_tr)
rec_tr = ridge.predict(pz_tr)
rec_te = ridge.predict(pz_te)


# In[ ]:


plt.figure(figsize=(10, 10))
for i in xrange(100):
    plt.subplot(10, 10, i + 1)
    plt.imshow(rec_tr[i].reshape(28, 28), cmap='Greys_r')
    plt.axis('off')
plt.show()

plt.figure(figsize=(10, 10))
for i in xrange(100):
    plt.subplot(10, 10, i + 1)
    plt.imshow(rec_te[i].reshape(28, 28), cmap='Greys_r')
    plt.axis('off')
plt.show()


# # Dependence measurement
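# The statistic computed below is a randomized analogue of kernel canonical correlation:
# map $x$ and $y$ through independent random feature maps $z_x(\cdot)$ and $z_y(\cdot)$ and
# take the correlation of the first pair of canonical variates,
# \begin{equation*}
# \rho = \max_{\alpha, \beta}\ \mathrm{corr}\left( \alpha^\top z_x(x),\ \beta^\top z_y(y) \right),
# \end{equation*}
# which can pick up nonlinear dependence that the plain Pearson correlation of $x$ and $y$ misses.
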
# In[ ]:


def random_features(d, k, gamma):
    w = np.random.randn(d, k) * np.sqrt(2. * gamma)
    b = 2. * np.pi * np.random.rand(k)

    def foo(x):
        return np.sqrt(2. / k) * np.cos(np.dot(x, w) + b)
    return foo


def experiment_sin(n=1000, k=100, freq=3, noise=.1):
    x = np.random.randn(n, 1)
    y = np.sin(freq * x) + noise * np.random.randn(n, 1)
    #t = 2*np.pi*np.random.rand(n,1)
    #x = np.cos(freq*t)+noise*np.random.randn(n,1)
    #y = np.sin(freq*t)+noise*np.random.randn(n,1)

    rf_x = random_features(x.shape[1], k, estimate_gamma(x))
    z_x = rf_x(x)
    rf_y = random_features(y.shape[1], k, estimate_gamma(y))
    z_y = rf_y(y)

    f, g = CCA(n_components=1).fit_transform(z_x, z_y)
    return x, y, f, g


# In[ ]:


x, y, f, g = experiment_sin()

plt.figure(figsize=(8, 8))
plt.subplot(2, 2, 1)
plt.plot(x, y, '.')
plt.title('Correlation: ' + str(pearsonr(x, y)[0][0]))
plt.subplot(2, 2, 2)
plt.plot(f, g, '.')
plt.title('Correlation: ' + str(pearsonr(f, g)[0][0]))
plt.subplot(2, 2, 3)
plt.plot(x, f, '.')
plt.title('x-transformation')
plt.xlabel('x')
plt.ylabel('f')
plt.subplot(2, 2, 4)
plt.plot(y, g, '.')
plt.title('y-transformation')
plt.xlabel('y')
plt.ylabel('g')
plt.tight_layout()


# In[ ]:


freqs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
n_reps = 10
corr_freqs = np.zeros(len(freqs))

for i in xrange(n_reps):
    for fi in xrange(len(freqs)):
        x, y, f, g = experiment_sin(freq=freqs[fi])
        corr_freqs[fi] += pearsonr(f, g)[0][0] / n_reps

plt.plot(freqs, corr_freqs)
plt.title('Degradation with respect to frequency')


# # Low-dimensional kernel mean embeddings

# ## Two-sample testing
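# Given samples $X = \{x_i\}_{i=1}^n$ and $Y = \{y_j\}_{j=1}^m$, the (biased) maximum mean
# discrepancy estimate computed by `mmd` below is
# \begin{equation*}
# \widehat{\mathrm{MMD}}^2(X, Y) = \frac{1}{n^2} \sum_{i,j} k(x_i, x_j)
# - \frac{2}{n m} \sum_{i,j} k(x_i, y_j) + \frac{1}{m^2} \sum_{i,j} k(y_i, y_j),
# \end{equation*}
# and its random-feature approximation `rmmd` is simply the squared distance between the
# empirical feature means, $\|\bar{z}(X) - \bar{z}(Y)\|^2$.
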
# In[ ]:


f = gzip.open('mnist.pkl.gz', 'rb')
(x_tr, y_tr), (x_va, y_va), (x_te, y_te) = cPickle.load(f)
f.close()

x5_tr = x_tr[np.where(y_tr == 5)]
x8_tr = x_tr[np.where(y_tr == 8)]
x5_va = x_va[np.where(y_va == 5)]
x8_va = x_va[np.where(y_va == 8)]
x5_te = x_te[np.where(y_te == 5)]
x8_te = x_te[np.where(y_te == 8)]

gamma = estimate_gamma(x5_tr, 100)
rf = random_features(x5_tr.shape[1], 100, gamma)


def mmd(x, y, gamma):
    # exact (biased) MMD^2 estimate from the full kernel matrices
    return (rbf_kernel(x, gamma=gamma).mean()
            - 2 * rbf_kernel(x, y, gamma=gamma).mean()
            + rbf_kernel(y, gamma=gamma).mean())


def rmmd(x, y, rf):
    # random-feature approximation: squared distance between feature means
    return np.power(np.linalg.norm(rf(x).mean(0) - rf(y).mean(0)), 2)


# In[ ]:


print mmd(x5_tr, x8_tr, gamma)
print rmmd(x5_tr, x8_tr, rf)


# In[ ]:


print mmd(x5_tr, x5_va, gamma)
print rmmd(x5_tr, x5_va, rf)


# In[ ]:


def f():
    mmd(x5_tr, x5_va, gamma)
print time_f(f, 10)


def f():
    rmmd(x5_tr, x5_va, rf)
print time_f(f, 10)


# ## Distribution classification

# In[ ]:


from sklearn.ensemble import RandomForestClassifier as RFC
from scipy.interpolate import UnivariateSpline as sp
from sklearn.preprocessing import scale
from sklearn.mixture import GMM


def continuous_cause(n, k, p1, p2):
    # sample a cause variable from a random k-component Gaussian mixture
    g = GMM(k)
    g.means_ = p1 * np.random.randn(k, 1)
    g.covars_ = np.abs(p2 * np.random.randn(k, 1) + 1)
    g.weights_ = abs(np.random.rand(k, 1))
    g.weights_ = g.weights_ / sum(g.weights_)
    return scale(g.sample(n))


def noise(n, v):
    return v * np.random.randn(n, 1)


def mechanism(x, d):
    # random smooth mechanism: a spline through d random knots
    g = np.linspace(np.min(x) - np.std(x), np.max(x) + np.std(x), d)
    return sp(g, np.random.randn(d))(x.flatten())[:, np.newaxis]


def pair(n, k, p1, p2, v, d):
    x = continuous_cause(n, k, p1, p2)
    y = scale(scale(mechanism(x, d)) + noise(n, v))
    return (x, y)


def synset(N):
    # N cause->effect samples labelled 0 plus their flipped copies labelled 1
    res_x_fw = []
    res_x_bw = []
    for i in range(N):
        n = np.int(np.power(10, np.random.uniform(2, 4)))
        k = np.random.randint(1, 5)
        p1 = np.random.uniform(0, 5)
        p2 = np.random.uniform(0, 5)
        v = np.random.uniform(0, 5)
        d = np.random.randint(4, 5)
        (x, y) = pair(n, k, p1, p2, v, d)
        res_x_fw += [np.hstack((x, y))]
        res_x_bw += [np.hstack((y, x))]
    return (res_x_fw + res_x_bw, np.hstack((np.zeros(N), np.ones(N))))


def kdist(args):
    # random-feature embedding of a joint sample: means of cos features
    # of x, of y, and of x+y
    x, w = args
    fx = np.dot(x[:, 0][:, np.newaxis], w[0, :][np.newaxis, :])
    fy = np.dot(x[:, 1][:, np.newaxis], w[1, :][np.newaxis, :])
    return np.hstack((np.cos(fx).mean(0), np.cos(fy).mean(0), np.cos(fx + fy).mean(0)))


def fdist(x, w):
    return np.array([kdist((scale(xi), w)) for xi in x])


def gen_random_parameter(M=100, G=[0.1, 1, 10]):
    # random frequencies at several bandwidths, shared across all samples
    return np.vstack([np.sqrt(g) * np.random.randn(M, 2) for g in G]).T


def gen_dataset(w, N=100):
    x, y = synset(N)
    z = fdist(x, w)
    p = np.random.permutation(2 * N)
    x2 = []
    for i in xrange(len(p)):
        x2.append(x[p[i]])
    return x2, y[p], z[p]


# In[ ]:


w = gen_random_parameter()
x, y, z = gen_dataset(w)

plt.figure(figsize=(8, 8))
p = np.random.permutation(len(x))
for i in xrange(25):
    plt.subplot(5, 5, i + 1)
    plt.plot(x[p[i]][:, 0], x[p[i]][:, 1], '.')
    plt.title(y[p[i]])
    plt.axis('off')
plt.tight_layout()


# In[ ]:


_, y_tr, z_tr = gen_dataset(w, N=1000)
_, y_te, z_te = gen_dataset(w, N=1000)


# In[ ]:


from sklearn.ensemble import RandomForestClassifier
print np.mean(RandomForestClassifier(n_estimators=500).fit(z_tr, y_tr).predict(z_te) == y_te)