#!/usr/bin/env python
# coding: utf-8

# Stein Unbiased Risk Estimator
# =============================
#
# *Important:* Please read the [installation page](http://gpeyre.github.io/numerical-tours/installation_python/) for details about how to install the toolboxes.
# $\newcommand{\dotp}[2]{\langle #1, #2 \rangle}$
# $\newcommand{\enscond}[2]{\lbrace #1, #2 \rbrace}$
# $\newcommand{\pd}[2]{ \frac{ \partial #1}{\partial #2} }$
# $\newcommand{\umin}[1]{\underset{#1}{\min}\;}$
# $\newcommand{\umax}[1]{\underset{#1}{\max}\;}$
# $\newcommand{\uargmin}[1]{\underset{#1}{argmin}\;}$
# $\newcommand{\norm}[1]{\|#1\|}$
# $\newcommand{\abs}[1]{\left|#1\right|}$
# $\newcommand{\choice}[1]{ \left\{ \begin{array}{l} #1 \end{array} \right. }$
# $\newcommand{\pa}[1]{\left(#1\right)}$
# $\newcommand{\diag}[1]{{diag}\left( #1 \right)}$
# $\newcommand{\qandq}{\quad\text{and}\quad}$
# $\newcommand{\qwhereq}{\quad\text{where}\quad}$
# $\newcommand{\qifq}{ \quad \text{if} \quad }$
# $\newcommand{\qarrq}{ \quad \Longrightarrow \quad }$
# $\newcommand{\ZZ}{\mathbb{Z}}$
# $\newcommand{\CC}{\mathbb{C}}$
# $\newcommand{\RR}{\mathbb{R}}$
# $\newcommand{\EE}{\mathbb{E}}$
# $\newcommand{\Zz}{\mathcal{Z}}$
# $\newcommand{\Ww}{\mathcal{W}}$
# $\newcommand{\Vv}{\mathcal{V}}$
# $\newcommand{\Nn}{\mathcal{N}}$
# $\newcommand{\NN}{\mathcal{N}}$
# $\newcommand{\Hh}{\mathcal{H}}$
# $\newcommand{\Bb}{\mathcal{B}}$
# $\newcommand{\Ee}{\mathcal{E}}$
# $\newcommand{\Cc}{\mathcal{C}}$
# $\newcommand{\Gg}{\mathcal{G}}$
# $\newcommand{\Ss}{\mathcal{S}}$
# $\newcommand{\Pp}{\mathcal{P}}$
# $\newcommand{\Ff}{\mathcal{F}}$
# $\newcommand{\Xx}{\mathcal{X}}$
# $\newcommand{\Mm}{\mathcal{M}}$
# $\newcommand{\Ii}{\mathcal{I}}$
# $\newcommand{\Dd}{\mathcal{D}}$
# $\newcommand{\Ll}{\mathcal{L}}$
# $\newcommand{\Tt}{\mathcal{T}}$
# $\newcommand{\si}{\sigma}$
# $\newcommand{\al}{\alpha}$
# $\newcommand{\la}{\lambda}$
# $\newcommand{\ga}{\gamma}$
# $\newcommand{\Ga}{\Gamma}$
# $\newcommand{\La}{\Lambda}$
# $\newcommand{\Si}{\Sigma}$
# $\newcommand{\be}{\beta}$
# $\newcommand{\de}{\delta}$
# $\newcommand{\De}{\Delta}$
# $\newcommand{\phi}{\varphi}$
# $\newcommand{\th}{\theta}$
# $\newcommand{\om}{\omega}$
# $\newcommand{\Om}{\Omega}$

# This tour uses the Stein Unbiased Risk Estimator (SURE) to optimize the
# value of parameters in denoising algorithms.

# In[1]:

from __future__ import division
import numpy as np
import scipy as scp
import pylab as pyl
import matplotlib.pyplot as plt
from nt_toolbox.general import *
from nt_toolbox.signal import *
import warnings
warnings.filterwarnings('ignore')
get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# Denoising and SURE
# ------------------
# We consider a simple generative model of noisy images $F = f_0+W$
# where $f_0 \in \RR^N$ is a deterministic image of $N$ pixels, and
# $W$ is a Gaussian white noise distributed according to $\Nn(0,\si^2 \text{Id}_N)$,
# where $\si^2$ is the variance of the noise.
#
# The goal of denoising is to define an estimator $h(F)$ of $f_0$ that depends only on $F$,
# where $h : \RR^N \rightarrow \RR^N$ is a potentially non-linear mapping.
#
# Note that while $f_0$ is a deterministic image, both $F$ and $h(F)$
# are random variables (hence the capital letters).
#
# The goal of denoising is to reduce as much as possible the denoising
# error given some prior knowledge on the (unknown) image $f_0$.
# A mathematical way to measure this error is to bound the
# quadratic risk $\EE_W(\norm{h(F) - f_0}^2)$, where the expectation is computed
# with respect to the distribution of the noise $W$.
#
# For real-life applications, one does not have access to the underlying
# image $f_0$. In this tour, we however assume that $f_0$ is known, and
# $f = f_0 + w \in \RR^N$ is generated using a single realization of the noise $w$
# that is drawn from $W$. We define the estimated deterministic image as
# $h(f)$, which is a realization of the random vector $h(F)$.
#
# Number $N = n \times n$ of pixels.

# In[2]:

n = 128*2
N = n**2


# First we load an image $f_0 \in \RR^N$, where $N = n \times n$ is the number of pixels.

# In[3]:

f0 = load_image("nt_toolbox/data/hibiscus.bmp",n)


# Display it.

# In[4]:

plt.figure(figsize = (5,5))
imageplot(f0)


# Standard deviation $\si$ of the noise.

# In[5]:

sigma = .08


# Then we add Gaussian noise $w$ to obtain $f=f_0+w$.

# In[6]:

from numpy import random
f = f0 + sigma*random.standard_normal((n,n))


# Display the noisy image. Note the use of the clamp function to force the result to be in $[0,1]$, to avoid a loss of contrast in the display.

# In[7]:

plt.figure(figsize = (5,5))
imageplot(clamp(f), "Noisy, SNR = %.1f dB" %snr(f0, f))


# The Stein Unbiased Risk Estimator (SURE) associated with the mapping $h$
# is defined as
#
# $$ \text{SURE}(f) = -N\si^2 + \norm{h(f)-f}^2 + 2\si^2 \text{df}(f) $$
#
# where df stands for _degree of freedom_, and is defined as
#
# $$ \text{df}(f) = \text{div} h(f) = \sum_i \pd{h}{f_i}(f). $$
#
# It has been introduced in:
#
# Stein, Charles M. (November 1981).
# "Estimation of the Mean of a Multivariate Normal Distribution".
# The Annals of Statistics 9 (6): 1135-1151.
#
# And it has been applied to wavelet-based non-linear denoising in:
#
# Donoho, David L.; Iain M. Johnstone (December 1995).
# "Adapting to Unknown Smoothness via Wavelet Shrinkage".
# Journal of the American Statistical Association 90 (432): 1200-1224.
#
# If the mapping $f \mapsto h(f)$ is differentiable outside a set of
# zero measure (or, more generally, weakly differentiable),
# then SURE defines an unbiased estimate of the quadratic risk:
#
# $$ \EE_W(\text{SURE}(F)) = \EE_W( \norm{f_0-h(F)}^2 ). $$
#
# This is especially useful, since the evaluation of SURE does not
# require the knowledge of the clean signal $f_0$ (note, however,
# that it does require the knowledge of the noise level $\si$).
#
# In practice, one replaces $\text{SURE}(F)$ by its empirical
# evaluation $\text{SURE}(f)$ on a single realization $f$.
# One can then minimize $\text{SURE}(f)$ with respect to a parameter
# $\la$ that parameterizes the denoiser $h=h_\la$.

# Linear Denoising SURE
# ---------------------
# We consider a translation-invariant linear denoising operator, which is
# thus a convolution
#
# $$ h(f) = f \star g $$
#
# where $g \in \RR^N$ is a low-pass kernel and $\star$ denotes the
# periodic 2-D convolution.
#
# Since we use periodic boundary conditions, we compute the
# convolution as a multiplication over the Fourier domain:
#
# $$ \forall \om, \quad \hat h(f)(\om) = \hat f(\om) \hat g(\om) $$
#
# where $\hat g(\om)$ is the value at frequency $\om$ of the discrete 2-D
# Fourier transform of $g$ (computed using the function fft2 from the pylab package).
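# As a quick sanity check (a minimal sketch, not part of the original tour), one can
# verify on a small array that pointwise multiplication of 2-D FFTs implements the
# periodic (circular) convolution; the convol operator defined in the next cell
# relies on this identity.

# In[ ]:

# small test arrays (any size works, 8 x 8 keeps the brute-force loop cheap)
m = 8
a = np.random.rand(m, m)
b = np.random.rand(m, m)

# periodic convolution computed in the Fourier domain
c_fft = np.real(np.fft.ifft2(np.fft.fft2(a)*np.fft.fft2(b)))

# brute-force circular convolution: c[k,l] = sum_{i,j} a[i,j] * b[(k-i) % m, (l-j) % m]
c_direct = np.zeros((m, m))
for k in range(m):
    for l in range(m):
        for i in range(m):
            for j in range(m):
                c_direct[k, l] += a[i, j]*b[(k - i) % m, (l - j) % m]

print(np.allclose(c_fft, c_direct))  # expected: True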
# In[8]:

convol = lambda f,g: np.real(pyl.ifft2(pyl.fft2(f)*np.tile(pyl.fft2(g), (1,1))))


# We define a parametric kernel $g_\la$ parameterized by its bandwidth $\la>0$.
# We use here a Gaussian kernel
#
# $$ g_\la(a) = \frac{1}{Z_\la} e^{ -\frac{\norm{a}^2}{2 \la^2} } $$
# where $Z_\la$ ensures that $\sum_a g_\la(a) = 1$.

# In[9]:

normalize = lambda f: f/np.sum(f)
x = np.hstack((np.arange(0,n//2+1),np.arange(-n//2 + 1,0)))
[Y, X] = np.meshgrid(x, x)
g = lambda lambd: normalize(np.exp(-(X**2 + Y**2)/(2*lambd**2)))


# Define our denoising operator $h=h_\la$ (we make explicit the dependency on $\la$):
# $$ h_\la(f) = g_\la \star f. $$

# In[10]:

h = lambda f, lambd: convol(f, g(lambd))


# Example of denoising result.

# In[11]:

lambd = 1.5
plt.figure(figsize = (5,5))
imageplot(clamp(h(f, lambd)))


# For a linear operator, the degree of freedom is equal to the trace of the operator;
# in our case it is thus equal to the sum of the Fourier coefficients of $g_\la$
# $$ \text{df}_\la(f) = \text{tr}(h_\la) = \sum_{\om} \hat g_\la(\om) $$
# Note that we have made explicit the dependency of df with respect to $\la$.
# Note also that df$(f)$ does not actually depend on $f$ here.

# In[12]:

df = lambda lambd: np.real(np.sum(pyl.fft2(g(lambd))))


# We can now define the SURE = SURE$_\la$ operator, as a function of $f, h(f), \lambda$.

# In[13]:

from numpy import linalg

SURE = lambda f,hf,lambd: -N*sigma**2 + linalg.norm(hf-f,"fro")**2 + 2 * sigma**2 * df(lambd)


# __Exercise 1__
#
# For a given $\lambda$, display the histogram of the distribution of
# the quadratic error $\norm{f_0-h(y)}^2$ and of $\text{SURE}(y)$.
# Compute these distributions using a Monte-Carlo simulation (you need to
# generate many different realizations of the noise $W$).
# Display in particular the location of the mean of these quantities.

# In[14]:

run -i nt_solutions/denoisingadv_9_sure/exo1

# In[15]:

## Insert your code here.

# In practice, the SURE is used to select the value of $\la$ from a
# single realization $f=f_0+w$, by minimizing $\text{SURE}_\la(f)$.

# __Exercise 2__
#
# Compute, for a single realization $f=f_0+w$, the evolution of
#
# $$ E(\la) = \text{SURE}_\la(f) \qandq E_0(\lambda) = \norm{f_0-h_\la(f)}^2 $$
# as a function of $\lambda$.

# In[16]:

run -i nt_solutions/denoisingadv_9_sure/exo2

# In[17]:

## Insert your code here.

# __Exercise 3__
#
# Display the best denoising result $h_{\la^*}(f)$ where
# $$\la^* = \uargmin{\la} \text{SURE}_\la(f) $$

# In[18]:

run -i nt_solutions/denoisingadv_9_sure/exo3

# In[19]:

## Insert your code here.

# Soft Thresholding SURE
# ----------------------
# In order to enhance the denoising results for piecewise regular signals
# and images, it is possible to use non-linear thresholding in an orthogonal wavelet
# basis $ \Bb = \{ \psi_m \}_{m} $ where $\psi_m \in \RR^N$ is a wavelet element.
#
# Re-generate a noisy image.

# In[20]:

f = f0 + sigma*random.standard_normal((n,n))


# The soft-thresholding estimator thus reads
# $$ h_\la(f) = \sum_m s_\la( \dotp{f}{\psi_m} ) \psi_m
#     \qwhereq s_\la(\al) = \max\pa{0, 1-\frac{\la}{\abs{\al}}} \al. $$
# It can be conveniently written as
# $$ h_\la = \Ww^* \circ S_\la \circ \Ww $$
# where $\Ww$ and $\Ww^*$ are the forward and inverse wavelet transforms
# $$ \Ww(f) = ( \dotp{f}{\psi_m} )_m \qandq
#     \Ww^*(x) = \sum_m x_m \psi_m, $$
# and $ S_\la $ is the diagonal soft thresholding operator
# $$ S_\la(x) = ( s_\la(x_m) )_m. $$
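# Before implementing it over the wavelet coefficients, here is a minimal numpy
# check (not part of the original tour) that the multiplicative form of $s_\la$
# used below coincides with the usual soft-thresholding formula
# $s_\la(\al) = \text{sign}(\al)\max(\abs{\al}-\la, 0)$.

# In[ ]:

lambd_test = .2
alpha = np.linspace(-1, 1, 101)

# multiplicative form, with a small constant to avoid division by zero
s_mult = np.maximum(0, 1 - lambd_test/np.maximum(abs(alpha), 1e-9))*alpha
# classical soft-thresholding form
s_soft = np.sign(alpha)*np.maximum(abs(alpha) - lambd_test, 0)

print(np.allclose(s_mult, s_soft))  # expected: True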
# Define the wavelet transform and its inverse.

# In[21]:

from nt_toolbox.compute_wavelet_filter import *
h_daub = compute_wavelet_filter('Daubechies',4)
W = lambda f1: perform_wavortho_transf(f1,0,+1,h_daub)
Ws = lambda x: perform_wavortho_transf(x,0,-1,h_daub)


# Display the wavelet transform $\Ww(f_0)$ of the original image.

# In[22]:

plt.figure(figsize = (10,10))
plot_wavelet(W(f0), 1)
plt.show()


# Define the soft thresholding operator.

# In[23]:

S = lambda x,lambd: np.maximum(np.zeros(list(np.shape(x))), 1-lambd/np.maximum(1e-9*np.ones(list(np.shape(x))),abs(x)))*x


# Define the denoising operator.

# In[24]:

h = lambda f1,lambd: Ws(S(W(f1), lambd))


# Example of denoising result.

# In[25]:

lambd = 3*sigma/2
plt.figure(figsize = (5,5))
imageplot(clamp(h(f,lambd)))


# Since $\Ww$ is an orthogonal transform, one has
# $$ \text{df}(f) = \text{div}( S_\la )( \Ww(f) )
#     = \sum_m s_\la'( \dotp{f}{\psi_m} ) = \norm{\Ww(h(f))}_0 $$
# where $ s_\la' $ is the derivative of the 1-D function $s_\la$, and
# $\norm{\cdot}_0$ is the $\ell^0$ pseudo-norm
# $$ \norm{x}_0 = \abs{ \enscond{m}{x_m \neq 0} }. $$
#
# To summarize, the degree of freedom is equal to the number of non-zero
# wavelet coefficients of $h(f)$.

# In[26]:

df = lambda hf,lambd : np.sum(abs(W(hf)) > 1e-8)


# We can now define the SURE operator, as a function of $f, h(f), \lambda$.

# In[27]:

SURE = lambda f,hf,lambd: -N*sigma**2 + linalg.norm(hf-f, 'fro')**2 + 2*sigma**2*df(hf, lambd)


# __Exercise 4__
#
# For a given $\lambda$, display the histogram of the distribution of
# the quadratic error $\norm{f_0-h(y)}^2$ and of $\text{SURE}(y)$.
# Compute these distributions using a Monte-Carlo simulation (you need to
# generate many different realizations of the noise $W$).
# Display in particular the location of the mean of these quantities.
# _Hint:_ you can do the computation directly over the wavelet domain,
# i.e. consider that the noise is added to the wavelet transform.

# In[28]:

run -i nt_solutions/denoisingadv_9_sure/exo4

# In[29]:

## Insert your code here.

# __Exercise 5__
#
# Compute, for a single realization $f=f_0+w$, the evolution of
#
# $$ E(\la) = \text{SURE}_\la(f) \qandq E_0(\lambda) = \norm{f_0-h_\la(f)}^2 $$
# as a function of $\lambda$.

# In[30]:

run -i nt_solutions/denoisingadv_9_sure/exo5

# In[31]:

## Insert your code here.

# __Exercise 6__
#
# Display the best denoising result $h_{\la^*}(f)$ where
# $$\la^* = \uargmin{\la} \text{SURE}_\la(f) $$

# In[32]:

run -i nt_solutions/denoisingadv_9_sure/exo6

# In[33]:

## Insert your code here.

# Block-soft Thresholding SURE
# ----------------------------
# To improve the result of soft thresholding, it is possible to threshold
# blocks of coefficients.
#
# We define a partition $ \{1,\ldots,N\} = \cup_k b_k $ of the set of wavelet
# coefficient indexes. The block thresholding is defined as
#
# $$ h_\la(f) = \sum_k \sum_{m \in b_k} a_\la( e_k ) \dotp{f}{\psi_m} \psi_m
#     \qwhereq
#     e_k = \sum_{m \in b_k} \abs{\dotp{f}{\psi_m}}^2, $$
# where we use the James-Stein attenuation threshold
# $$ a_\la(e) = \max\pa{ 0, 1 - \frac{\la^2}{e} }. $$
#
# The block size $q$.

# In[34]:

q = 4


# A function to extract blocks.

# In[35]:

[X,Y,dX,dY] = np.meshgrid(np.arange(1,n-q+2,q),np.arange(1,n-q+2,q),np.arange(0,q),np.arange(0,q))
I = (X + dX - 1) + (Y + dY - 1)*n
for i in range(n//q):
    for j in range(n//q):
        I[i,j] = np.transpose(I[i,j])
blocks = lambda fw : np.ravel(fw)[I]
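# As a quick check (not in the original tour), the block extraction should only
# rearrange the wavelet coefficients: blocks(W(f)) holds one $q \times q$ block per
# entry of an $n/q \times n/q$ grid, and sorting its values should recover exactly
# the values of $\Ww(f)$ if the index array I visits each coefficient once.

# In[ ]:

fw = W(f)
B = blocks(fw)
print(B.shape)  # expected: (n//q, n//q, q, q)
print(np.allclose(np.sort(np.ravel(B)), np.sort(np.ravel(fw))))  # expected: True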
# A function to reconstruct an image from blocks.

# In[36]:

def assign(M,I,H):
    M_temp = np.copy(M)              # work on a copy so the input array is not modified
    np.ravel(M_temp)[I] = H          # write the block values at the indexes I
    return np.reshape(M_temp,(n,n))

unblock = lambda H : assign(np.zeros([n,n]), I, H)


# Compute the average energy of each block, and duplicate it over the block.

# In[37]:

def energy(H):
    H_tmp = np.copy(H)
    for i in range(n//q):
        for j in range(n//q):
            H_tmp[i][j] = np.mean(H_tmp[i][j]**2)*np.ones([q,q])
    return H_tmp


# Threshold the blocks. We use here a Stein block thresholding:
# all values within a block are attenuated by the same factor.

# In[38]:

S = lambda H,lambd : np.maximum(1-lambd**2/energy(H), np.zeros(np.shape(H)))*H


# Block thresholding estimator $h_\lambda(f)$.

# In[39]:

h = lambda f,lambd: Ws(unblock(S(blocks(W(f)), lambd)))


# Example of block denoising.

# In[40]:

lambd = 1.1*sigma
plt.figure(figsize = (5,5))
imageplot(clamp(h(f, lambd)))


# Since the block thresholding operates in a block-diagonal manner over the
# wavelet coefficients, its degree of freedom is the sum of the divergences of
# the per-block James-Stein operators
# $$ \text{df}(f) = \sum_{ e_k > \la^2 } \text{tr}( \partial \phi (a_k) ) $$
# where $ a_k = (\dotp{f}{\psi_m})_{m \in b_k} $ is the set of coefficients
# inside a block, which satisfies $\norm{a_k}^2=e_k$, and where
# $$ \phi(a) = \pa{ 1 - \frac{\la^2}{\norm{a}^2} } a. $$
# One can compute explicitly the derivative of $\phi$
# $$ \partial \phi(a) = \pa{ 1 - \frac{\la^2}{\norm{a}^2} } \text{Id} + 2 \frac{\la^2}{\norm{a}^2} \Pi_a $$
# where $\Pi_a$ is the orthogonal projector on $a$.
#
# This gives the following formula for the degree of freedom
# $$ \text{df}(f) = \norm{\Ww(h_\la(f))}_0
#     + \sum_{ e_k > \la^2 }
#     \frac{\la^2}{e_k} (2-\abs{b_k}). $$
# Note that this degree of freedom differs from that of soft thresholding
# (in particular, it is not in general an integer).
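# As an illustration (a minimal sketch, not part of the original tour), the formula
# above can be evaluated numerically. The sketch below assumes the helpers W, h,
# blocks, energy, q, sigma and N defined in the previous cells, interprets $e_k$ as
# the per-block energy used by the attenuation S, and approximates
# $\norm{\Ww(h_\la(f))}_0$ by $q^2$ times the number of active blocks (i.e. it
# assumes no coefficient inside an active block is exactly zero).

# In[ ]:

def df_block(f1, lambd):
    E = energy(blocks(W(f1)))   # per-block energy, duplicated inside each q x q block
    e = E[:, :, 0, 0]           # one energy value per block
    active = e > lambd**2       # blocks that are not set to zero by the attenuation
    # df = #nonzero coefficients + sum over active blocks of (lambda^2/e_k)*(2 - |b_k|)
    return np.sum(active)*q**2 + np.sum(lambd**2/e[active]*(2 - q**2))

SURE_block = lambda f1, hf, lambd: -N*sigma**2 + linalg.norm(hf - f1, 'fro')**2 \
    + 2*sigma**2*df_block(f1, lambd)

lambd = 1.1*sigma
hf = h(f, lambd)
print("SURE estimate       :", SURE_block(f, hf, lambd))
print("oracle squared error:", linalg.norm(hf - f0, 'fro')**2)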