In this notebook, we study the performance of some algorithms used to complete low-rank matrices where only a small fraction of the entries have been observed.
To visualize the performance of these algorithms we use real-world pictures as matrices. First we check that real-world pictures are approximately low-rank matrices. Then we use an SDP formulation of the nuclear norm minimization procedure, and finally we turn to a forward-backward splitting algorithm that uses the proximal operator of the nuclear norm.
Packages: 1. PIL if one wants to use one's own images, 2. CVXOPT and CVXPY for solving an SDP.
$\newcommand{\eps}{\varepsilon}$ $\newcommand{\R}{\mathbb{R}}$ $\newcommand{\bA}{\mathbb{A}}$ $\newcommand{\bS}{\mathbb{S}}$
$\newcommand{\cO}{\mathcal{O}}$ $\newcommand{\cA}{\mathcal{A}}$ $\newcommand{\cC}{\mathcal{C}}$ $\newcommand{\cS}{\mathcal{S}}$
$\newcommand{\1}{{\rm 1}\kern-0.24em{\rm I}}$ $\newcommand{\inr}[1]{\bigl< #1 \bigr>}$ $\newcommand{\norm}[1]{\|#1\|}$ $\newcommand{\abs}[1]{\left|#1\right|}$ $\DeclareMathOperator*{\argmin}{argmin}$
Default pictures are available in scipy; any personal picture will do as well.
from scipy import misc
import numpy as np
from PIL import Image
import time
import matplotlib.pyplot as plt
%matplotlib inline
ascent = misc.ascent()
face = misc.face()
plt.figure(figsize = (10,10))
plt.subplot(1, 2, 1)
plt.imshow(ascent, cmap=plt.cm.gray)
plt.subplot(1, 2, 2)
plt.imshow(face)
<matplotlib.image.AxesImage at 0x10884e860>
Personal pictures can be loaded via the PIL library. A loaded color image has a PIL type which is not a classical matrix: each pixel holds a tuple of channel values (here four, RGBA). So for our study we will only consider one slice of this array.
ensae = Image.open("ensae.png")
print(ensae.size, type(ensae))
(790, 634) <class 'PIL.PngImagePlugin.PngImageFile'>
plt.imshow(ensae)
<matplotlib.image.AxesImage at 0x10a76f2b0>
ensae_np = np.array(ensae)
print(ensae_np.shape)
print("colored images are not simple 2d arrays!")
(634, 790, 4)
colored images are not simple 2d arrays!
ensae = np.array(ensae)
plt.figure(figsize = (18,10))
plt.subplot(1,5,1)
plt.imshow(ensae[:,:,0])
plt.subplot(1,5,2)
plt.imshow(ensae[:,:,1])
plt.subplot(1,5,3)
plt.imshow(ensae[:,:,2])
plt.subplot(1,5,4)
plt.imshow(ensae[:,:,0] + ensae[:,:,1] + ensae[:,:,2] + ensae[:,:,3])#, cmap=plt.cm.gray)
plt.subplot(1,5,5)
plt.imshow(ensae_np)
<matplotlib.image.AxesImage at 0x10ae5a048>
ensae = ensae_np[:,:,0]
face = face[:,:,0]
plt.figure(figsize = (10,10))
plt.subplot(1,2,1)
plt.imshow(ensae, cmap=plt.cm.gray)#, interpolation="none")
plt.subplot(1,2,2)
plt.imshow(face, cmap=plt.cm.gray)#, interpolation="none")
<matplotlib.image.AxesImage at 0x10e5e3240>
Pictures loaded in Python can now be seen as np.arrays.
print(type(ensae), type(face))
print(ensae.shape, face.shape)
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
(634, 790) (768, 1024)
face[0:10, 0:10]
array([[121, 138, 153, 155, 155, 158, 159, 156, 147, 137],
       [ 89, 110, 130, 137, 141, 148, 152, 151, 164, 154],
       [ 73,  94, 115, 123, 127, 131, 132, 129, 139, 135],
       [ 81,  97, 113, 120, 125, 126, 120, 111, 101, 101],
       [103, 113, 123, 132, 142, 147, 140, 127, 109, 101],
       [124, 128, 131, 137, 150, 163, 163, 155, 136, 119],
       [109, 110, 108, 108, 122, 144, 159, 163, 155, 143],
       [ 68,  70,  67,  66,  81, 112, 142, 157, 170, 170],
       [102,  94,  85,  80,  74,  80, 106, 136, 166, 179],
       [123, 116, 108,  97,  83,  75,  82,  95, 135, 158]], dtype=uint8)
ensae[0:10, 0:10]
array([[255, 255, 255, 255, 255, 255, 255, 255, 255, 255],
       [255, 255, 255, 255, 255, 255, 255, 255, 255, 255],
       [255, 255, 255, 255, 255, 255, 255, 255, 255, 255],
       [255, 255, 255, 255, 255, 255, 255, 255, 255, 255],
       [255, 255, 255, 255, 255, 255, 255, 255, 255, 255],
       [255, 255, 255, 255, 255, 255, 255, 255, 255, 255],
       [255, 255, 255, 255, 255, 255, 255, 255, 255, 255],
       [255, 255, 255, 255, 255, 255, 255, 255, 255, 255],
       [255, 255, 255, 255, 255, 255, 255, 255, 255, 255],
       [255, 255, 255, 255, 255, 255, 255, 255, 255, 255]], dtype=uint8)
plt.imshow(face[123:130, 123:130], cmap=plt.cm.gray, interpolation="none")
<matplotlib.image.AxesImage at 0x10f644f60>
plt.imshow(ensae[123:130, 123:130], cmap=plt.cm.gray, interpolation="none")
<matplotlib.image.AxesImage at 0x10ec107f0>
We recall that any matrix $A\in\R^{u\times v}$ has a singular value decomposition (SVD): there exist two orthogonal matrices $U\in\cO(u)$ and $V\in\cO(v)$ such that $$ A=U S V $$ where $S= {\rm diag}(\sigma_A)\in\R^{u\times v}$ and $\sigma_A=(\sigma_1,\ldots,\sigma_{u\wedge v})$ is the vector of singular values of $A$. (We write $A = USV$ rather than $USV^\top$ because we follow numpy's convention, in which the returned $V$ is already transposed.) One can get this decomposition via numpy:
U, s, V = np.linalg.svd(A)
returns the SVD of $A$. Note that $s$ is the vector of all singular values sorted in decreasing order; to get the diagonal matrix $S$ one can use np.diag(s). In this case, np.dot(U, np.dot(S, V)) is close to $A$.
S = np.diag(s)
np.dot(U, np.dot(S, V))
Note: np.diag(s) returns a square matrix whereas we need a rectangular $u\times v$ matrix. So to keep the exposition simple, we will work only with square matrices (i.e. $u=v$).
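For a non-square matrix, one can embed np.diag(s) in a rectangular matrix of zeros; a minimal sketch (not needed below, where everything is square):
A = np.random.randn(5, 3)
U, s, V = np.linalg.svd(A)
S = np.zeros(A.shape)  # rectangular 5*3 "diagonal" matrix
k = min(A.shape)
S[:k, :k] = np.diag(s)
print(np.allclose(A, np.dot(U, np.dot(S, V))))  # True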
from numpy.linalg import svd
face = face[0:768, 0:768] # to get a square matrix
U, s, V = svd(face)
S = np.diag(s)
FACE = np.dot(U, np.dot(S, V))
print(np.allclose(face, FACE))
print(np.linalg.norm(FACE-face, 2))
True
2.89049891807e-10
plt.figure(figsize=(15,4))
plt.subplot(1,2,1)
plt.plot(s, lw=2)
plt.title("Singular values of face")
plt.xlim([-5, 512])
plt.subplot(1,2,2)
plt.plot(s[0:50], lw=2)
plt.title("50 first Singular values of face")
plt.xlim([-5, 55])
(-5, 55)
U, s, V = svd(ensae)
plt.figure(figsize=(15,4))
plt.subplot(1,2,1)
plt.plot(s, lw=2)
plt.title("Singular values of ensae")
plt.xlim([-5, 512])
plt.subplot(1,2,2)
plt.plot(s[0:50], lw=2)
plt.title("50 first Singular values of ensae")
plt.xlim([-5, 55])
(-5, 55)
Conclusion: images are amazingly "almost" low rank!
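To quantify this "almost low rank" claim, one can look at the share of the squared Frobenius norm (the energy) carried by the first singular values; a quick check using the square face image from above:
U, s, V = svd(face)
energy = np.cumsum(s ** 2) / np.sum(s ** 2)
print("top 50 singular values carry {:.1f} per cent of the energy".format(100 * energy[49]))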
Let us see whether, by truncating the SVD of an image (setting the small singular values to zero), one can still recover the picture.
def truncated_svd(image = face, rank = 50):
    "Return the SVD of a matrix with the singular values beyond `rank` zeroed out."
    # one possible implementation: full SVD, then zero out the tail of s
    U, s, V = svd(image)
    s[rank:] = 0
    return U, s, V
image, rank = face, 50
U, sr, V = truncated_svd(image, rank)
S_r = np.diag(sr)
face_r = np.dot(U, np.dot(S_r, V))
plt.figure(figsize = (10, 10))
plt.subplot(2,2,1)
plt.imshow(face_r, cmap=plt.cm.gray, interpolation="none")
titre = 'face truncated rank ={}'.format(rank)
plt.title(titre)
plt.subplot(2,2,2)
plt.plot(sr, lw = 2)
plt.xlim([-10, 512])
titre = 'singular values face truncated rank ={}'.format(rank)
plt.title(titre)
plt.subplot(2, 2, 3)
plt.imshow(face, cmap=plt.cm.gray)
plt.title("original face")
plt.subplot(2,2,4)
plt.plot(s, lw=2)
plt.xlim([-10, 512])
plt.title("Singular values of face")
<matplotlib.text.Text at 0x109751c88>
We remove randomly some entries from the matrix associated to an image using Bernoulli variables; this is what is called the mask. We can formalize it as a function $$ P_\Omega : \left\{ \begin{array}{ccc} \R^{u\times v} & \to & \R^{u\times v}\\ A & \mapsto & P_\Omega(A) \end{array} \right. \mbox{ where } (P_\Omega(A))_{ij} = \left\{ \begin{array}{cc} A_{ij} & \mbox{ if } (i,j)\in\Omega\\ 0 & \mbox{ otherwise} \end{array} \right. $$and $\Omega\subset\{1,\ldots,u\}\times \{1,\ldots,v\}$ is the set of indices of the observed entries. In the following, this set is constructed randomly using selectors (i.e. Bernoulli variables): every entry is selected with some probability called proportion below. We can look at $P_\Omega(A)$ as a dataset of the form {(user, item, grade)} where $(user, item)\in\Omega$ and grade is the value of the pixel at (user, item) of the image. This is the classical form of datasets in collaborative filtering and recommendation systems.
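Here is a toy illustration of this (user, item, grade) viewpoint (the variable names are ours, chosen only for this example):
A_toy = np.arange(16).reshape(4, 4)  # toy "image"
omega = np.random.binomial(1, 0.5, size=A_toy.shape)  # toy mask
rows, cols = np.nonzero(omega)
triplets = list(zip(rows, cols, A_toy[rows, cols]))  # (user, item, grade) records
print(triplets)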
def mask(u, v, proportion = 0.2):
    # one possible construction: a matrix of i.i.d. Bernoulli(proportion) selectors
    mat_mask = np.random.binomial(1, proportion, size = (u, v))
    print("We observe {} per cent of the entries of a {}*{} matrix".format(100 * mat_mask.mean(), u, v))
    return mat_mask
image = face
proportion = 0.2
u, v = image.shape
mat_mask = mask(u, v, proportion)
image_masked = np.multiply(mat_mask, image)
We observe 20.012071397569446 per cent of the entries of a 768*768 matrix
plt.figure(figsize = (10,10))
plt.subplot(1, 2, 1)
plt.imshow(mat_mask, cmap=plt.cm.gray); plt.title("mask (black = unobserved)")
plt.subplot(1, 2, 2)
plt.imshow(image_masked, cmap=plt.cm.gray); plt.title("Observed entries; black = unobserved")
<matplotlib.text.Text at 0x10d7ab860>
rank = 30
image = face
U, sr, V = truncated_svd(image, rank)
S_r = np.diag(sr)
image_r = np.dot(U, np.dot(S_r, V))
proportion = 0.2
u, v = image_r.shape
mat_mask = mask(u, v, proportion)
We observe 20.02529568142361 per cent of the entries of a 768*768 matrix
Conclusion: now that we have observed some entries of a low-rank matrix, one can try to complete it via some "matrix completion algorithms". The nuclear norm will play a central role in the following procedures.
A natural convex relaxation approach of the rank minimization procedure is the nuclear norm minimization procedure:
$$ \hat A \in \argmin_{A\in \R^{u\times v}}\big(\norm{A}_{S_1} : \cA(A) = y\big).\tag{NM} $$where $\cA(A) = (\inr{A, X_i})_{i=1}^m$, $y=\cA(A^*)$, $A^*$ is the low-rank matrix that we want to reconstruct and $\norm{\cdot}_{S_1}$ is the nuclear norm (that is, the sum of the singular values). In our matrix completion setting the $X_i$ are the canonical basis matrices $e_ie_j^\top$ for $(i,j)\in\Omega$, so that $\cA(A)$ simply lists the observed entries of $A$.
It is possible to recast (NM) as a SDP problem:
$$ \left( \begin{array}{c} \hat X\\ \hat Y\\ \hat Z \end{array} \right)\in\argmin_{X\in\R^{u\times v}, Y\in\cS^u, Z\in\cS^v} {\rm Tr}(Y) + {\rm Tr}(Z) \mbox{ such that } \begin{array}{l} \cA(X) = y\\ \left[ \begin{array}{cc} Y & X\\ X^\top & Z \end{array} \right]\succeq 0 \end{array}\tag{SDP} $$
Theorem
The two problems (NM) and (SDP) are equivalent:
if $\hat A$ is a solution to (NM) then $(\hat A, \hat Y, \hat Z)^\top$ is a solution to (SDP) for some $\hat Y$ and $\hat Z$;
if $(\hat X, \hat Y, \hat Z)^\top$ is a solution to (SDP) then $\hat X$ is a solution to (NM).
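The equivalence rests on a standard lemma (stated here without proof) linking the trace objective to the nuclear norm: $$ \left[ \begin{array}{cc} Y & X\\ X^\top & Z \end{array} \right]\succeq 0 \implies {\rm Tr}(Y) + {\rm Tr}(Z) \ge 2\norm{X}_{S_1}, $$with equality attained at $Y = U{\rm diag}(\sigma_X)U^\top$ and $Z = V^\top{\rm diag}(\sigma_X)V$ for an SVD $X = U{\rm diag}(\sigma_X)V$.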
CVXPY:
We use the library cvxpy to solve this problem. Some basic examples can be found in the cvxpy documentation.
from cvxpy import *
print(installed_solvers())
['SCS', 'GLPK', 'ECOS', 'CVXOPT', 'GLPK_MI', 'ECOS_BB', 'LS']
Note: only the solvers 'SCS' and 'CVXOPT' can solve SDP problems (cf. the 'Choosing a solver' chapter of the cvxpy documentation). For the time being, only SCS works on my machine for this problem: using 'CVXOPT' always leads to a dead kernel after a minute of computation (for a reason I have not identified).
def nuclear_norm_mini(mat_mask, image):
    u, v = image.shape
    A = Variable((u, v))
    # one possible formulation: minimize the nuclear norm of A ...
    obj = Minimize(norm(A, 'nuc'))
    # ... subject to A matching the image on the observed entries
    # (multiply is the element-wise product; older cvxpy versions call it mul_elemwise)
    constraints = [multiply(mat_mask, A) == np.multiply(mat_mask, image)]
    prob = Problem(obj, constraints)
    prob.solve(solver=SCS)
    if prob.status != 'optimal':
        print('CVXPY failed to reach optimal value')
    return A.value
rank = 30
size = 180
image = face[0:size, 0:size]
U, sr, V = truncated_svd(image, rank)
S_r = np.diag(sr)
image_r = np.dot(U, np.dot(S_r, V))
proportion = 0.5
u, v = image_r.shape
mat_mask = mask(u, v, proportion)
We observe 50.191358024691354 per cent of the entries of a 180*180 matrix
now_0 = time.time()
A = nuclear_norm_mini(mat_mask, image_r)
print("{} seconds".format(time.time() - now_0))
27.229541063308716 seconds
plt.figure(figsize = (15, 10))
plt.subplot(1,3,1)
plt.imshow(image_r, cmap=plt.cm.gray)
plt.title("original image (rank = {})".format(rank))
plt.subplot(1,3,2)
plt.imshow(np.multiply(mat_mask, image_r), cmap=plt.cm.gray)
plt.title("Image masked (proportion = {} per cent)".format(proportion))
plt.subplot(1,3,3)
plt.imshow(A, cmap=plt.cm.gray)
plt.title("Image reconstructed via nuclear norm minimization")
<matplotlib.text.Text at 0x109cf0978>
def recons_plot(image, size=100, rank=30, proportion=0.2):
    U, sr, V = truncated_svd(image[0:size, 0:size], rank)
    S_r = np.diag(sr)
    image_r = np.dot(U, np.dot(S_r, V))
    u, v = image_r.shape
    mat_mask = mask(u, v, proportion)
    now_0 = time.time()
    A = nuclear_norm_mini(mat_mask, image_r)
    print("{} seconds for nuclear norm minimization".format(time.time() - now_0))
    plt.figure(figsize = (15, 10))
    plt.subplot(1,3,1)
    plt.imshow(image_r, cmap=plt.cm.gray)
    plt.title("original image (rank = {})".format(rank))
    plt.subplot(1,3,2)
    plt.imshow(np.multiply(mat_mask, image_r), cmap=plt.cm.gray)
    plt.title("Image masked (proportion = {} per cent)".format(100 * proportion))
    plt.subplot(1,3,3)
    plt.imshow(A, cmap=plt.cm.gray)
    plt.title("Image reconstructed via nuclear norm minimization")
recons_plot(face, size=50, rank=10, proportion=0.4)
We observe 40.6 per cent of the entries of a 50*50 matrix
1.0997920036315918 seconds for nuclear norm minimization
images = [face, face, ascent, ensae]
for image in images:
recons_plot(image, size=100, rank=30, proportion=0.4)
We observe 40.339999999999996 per cent of the entries of a 100*100 matrix
3.615110158920288 seconds for nuclear norm minimization
We observe 40.1 per cent of the entries of a 100*100 matrix
3.7095510959625244 seconds for nuclear norm minimization
We observe 39.81 per cent of the entries of a 100*100 matrix
10.896809101104736 seconds for nuclear norm minimization
We observe 40.61 per cent of the entries of a 100*100 matrix
4.030638933181763 seconds for nuclear norm minimization
Conclusion:
Nuclear norm minimization via cvxpy works well up to matrices of size $200\times200$.
It is harder to reconstruct matrices that are both sparse and low rank. This observation is expected from the theory, since matrix completion works well only when the singular vectors (of the matrix to be reconstructed) are incoherent with the canonical basis; this is not the case for sparse + low-rank matrices.
Given the data $P_\Omega(A^*)$, one may try to recover $A^*$ via the least-squares regularization procedure:
$$ \hat A \in\argmin_{A\in\R^{u\times v}} \frac{1}{2}\norm{P_\Omega(A^*) - P_\Omega(A)}_{S_2}^2 + \lambda \norm{A}_{S_1} $$where $\lambda$ is a regularization parameter.
A way to implement this procedure is to perform a proximal gradient descent:
$$ A^{k+1} = {\rm prox}_{\gamma_k \lambda\norm{\cdot}_{S_1}}\big(A^k - \gamma_k \nabla F(A^k)\big) \mbox{ where } F(A) = \frac{1}{2}\norm{P_\Omega(A^*) - P_\Omega(A)}_{S_2}^2. $$One can see that $\nabla F(A) = (P_\Omega(A)-P_\Omega(A^*))$ and that the proximal operator of $\gamma_k \lambda\norm{\cdot}_{S_1}$ is the spectral soft thresholding operator $S_\mu$ -- for $\mu = \gamma_k\lambda$:
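For completeness, here is the one-line gradient computation behind this claim: since $P_\Omega$ is a linear orthogonal projection ($P_\Omega^* = P_\Omega$ and $P_\Omega^2 = P_\Omega$), $$ \nabla F(A) = -P_\Omega\big(P_\Omega(A^*) - P_\Omega(A)\big) = P_\Omega(A) - P_\Omega(A^*). $$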
$$ {\rm prox}_{\mu\norm{\cdot}_{S_1}}(A) = S_\mu(A) = US_{\mu} V $$where a SVD of $A$ is $A = USV$ and $S_\mu = {\rm diag}((\sigma_1-\mu)_+, (\sigma_2-\mu)_+, \cdots)$ is the $u\times v$ diagonal matrix of the soft threshold of the spectrum of $A$.
def spectral_soft_thresh(A, mu):
    # one possible implementation: soft-threshold the singular values of A at level mu
    U, s, V = svd(A)
    return np.dot(U, np.dot(np.diag(np.maximum(s - mu, 0)), V))
plt.figure(figsize=(10, 10))
plt.subplot(2, 3, 1)
image = spectral_soft_thresh(face, 100)
plt.imshow(image, cmap = plt.cm.gray)
plt.title("mu = 100")
plt.subplot(2, 3, 2)
image = spectral_soft_thresh(face, 1000)
plt.imshow(image, cmap = plt.cm.gray)
plt.title("mu = 1.000")
plt.subplot(2, 3, 3)
image = spectral_soft_thresh(face, 3000)
plt.imshow(image, cmap = plt.cm.gray)
plt.title("mu = 3.000")
plt.subplot(2, 3, 4)
image = spectral_soft_thresh(face, 5000)
plt.imshow(image, cmap = plt.cm.gray)
plt.title("mu = 5.000")
plt.subplot(2, 3, 5)
image = spectral_soft_thresh(face, 7500)
plt.imshow(image, cmap = plt.cm.gray)
plt.title("mu = 75.000")
plt.subplot(2, 3, 6)
image = spectral_soft_thresh(face, 10000)
plt.imshow(image, cmap = plt.cm.gray)
plt.title("mu = 10.000")
plt.tight_layout()
We define the proximal gradient descent for $\gamma_k=1$, that is (note that $A^k - \nabla F(A^k) = A^k - P_\Omega(A^k) + P_\Omega(A^*) = P_{\Omega^c}(A^k) + P_\Omega(A^*)$):
$$ A^{k+1} = S_\lambda(P_{\Omega^c}(A^k) + P_\Omega(A^*)) $$
def prox_grad_descent(image, mat_mask, mu, n_iter=100, verbose = False):
    image_masked = np.multiply(image, mat_mask)
    list_errors = []
    A = np.zeros(image_masked.shape) #initialization
    for i in range(n_iter + 1):
        # one possible implementation of the step above: keep the unobserved entries
        # of A, plug in the observed entries of the image, then soft-threshold
        A = spectral_soft_thresh(np.multiply(1 - mat_mask, A) + image_masked, mu)
        # relative estimation error against the full image
        err = np.linalg.norm(A - image, 'fro') / np.linalg.norm(image, 'fro')
        list_errors.append(err)
        if i % 20 == 0 and verbose:
            print("iteration = {}; error = {}".format(i, err))
            plt.figure()
            plt.imshow(A, cmap = plt.cm.gray)
            plt.show()
    return A, list_errors
mu = 200
size = 100
proportion = 0.2
image = face#[0:size, 0:size]
u, v = image.shape
mat_mask = mask(u, v, proportion)
plt.figure(figsize=(5, 5))
image_reconstructed, list_errors = prox_grad_descent(image, mat_mask, mu, verbose=True)
We observe 20.0710296631 per cent of the entries of a 512*512 matrix
iteration = 0; error = 0.802478574179
iteration = 20; error = 0.481445116582
iteration = 40; error = 0.256250518979
iteration = 60; error = 0.113802070894
iteration = 80; error = 0.0527773435742
iteration = 100; error = 0.0367006193083
plt.figure(figsize=(8, 4))
plt.plot(list_errors, lw=2)
plt.xlabel("iteration")
plt.ylabel("Estimation error")
plt.title("Evolution of the relative error along the projected descent")
<matplotlib.text.Text at 0x10f63e290>
rank = np.linalg.matrix_rank(image_reconstructed)
U, sr, V = truncated_svd(image, rank)
S_r = np.diag(sr)
image_r = np.dot(U, np.dot(S_r, V))
plt.figure(figsize = (10, 10))
#plt.rcParams['figure.figsize'] = 6., 4.
plt.subplot(1, 3, 1)
plt.imshow(image_reconstructed, cmap = plt.cm.gray)
plt.title("Reconstructed image")
plt.subplot(1, 3, 2)
plt.imshow(image_r, cmap = plt.cm.gray)
plt.title("Truncated image of rank = {}".format(rank))
plt.subplot(1, 3, 3)
plt.imshow(np.multiply(image, mat_mask), cmap = plt.cm.gray)
plt.title("Data")
<matplotlib.text.Text at 0x10b99b190>
Procedure $$\hat A \in\argmin \frac{1}{2}\norm{P_\Omega(A^*) - P_\Omega(A)}_{S_2}^2 + \lambda \norm{A}_{S_1}$$ depends on the regularization parameter $\lambda$. In the proximal gradient descent previously introduced to implement this procedure, the regularization parameter is exactly the threshold parameter of the spectral soft thresholding operator: $\lambda = \mu$. From a theoretical point of view we should take this parameter of the order of
$$ \sqrt{\frac{u+v}{|\Omega|}} $$where $|\Omega|$ is the number of observations. In the following we learn the value of this parameter via cross-validation.
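As a rough sanity check (an order of magnitude only: the theoretical choice holds up to a constant that depends on the scale of the entries, so it is not a value to plug in directly), using the image and proportion defined in the surrounding cells:
u, v = image.shape
n_obs = proportion * u * v  # expected number of observed entries |Omega|
print("theoretical order for lambda: {:.4f}".format(np.sqrt((u + v) / n_obs)))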
def mask_train_test(u, v, V, proportion = 0.2, ratio_train_test = 0.9):
    main_train = np.random.binomial(1, proportion, size = (u, v))
    list_train_test = []
    for i in range(V):
        # one possible construction: split the observed entries at random, keeping
        # a fraction ratio_train_test of them for training and the rest for testing
        split = np.random.binomial(1, ratio_train_test, size = (u, v))
        train = np.multiply(main_train, split)
        test = np.multiply(main_train, 1 - split)
        list_train_test.append([train, test])
    return list_train_test
def cv_mu(image, mu, V, proportion = 0.2, ratio_train_test = 0.9):
    u, v = image.shape
    error = 0
    list_train_test = mask_train_test(u, v, V, proportion, ratio_train_test)
    for ele in list_train_test:
        train, test = ele[0], ele[1]
        image_reconstructed, _ = prox_grad_descent(image, train, mu)
        image_reconstructed_masked_test = np.multiply(image_reconstructed, test)
        image_masked_test = np.multiply(image, test)
        error = error + np.linalg.norm(image_reconstructed_masked_test - image_masked_test, 'fro')
    return error/V
def cv_mu_list(list_mu, image, V = 3, proportion = 0.2, ratio_train_test = 0.9):
    list_errors = []
    for mu in list_mu:
        list_errors.append(cv_mu(image, mu, V, proportion, ratio_train_test))
    return list_errors
#list_mu = [200, 1000]
list_mu = np.logspace(2, 3, 30)
size = 300
proportion = 0.3
ratio_train_test = 0.9
image = face#[0:size, 0:size]
V = 5
list_errors = cv_mu_list(list_mu, image, V, proportion, ratio_train_test)
plt.figure(figsize=(8, 4))
plt.plot(list_mu, list_errors, lw=2)
plt.xlabel("regularization parameter")
plt.ylabel("V-fold error")
plt.title("Choice of regularization parameter via CV")
#plt.savefig("gaussian_100.png",bbox_inches='tight')
<matplotlib.text.Text at 0x1296ecdd0>
mu_best = list_mu[np.argmin(list_errors)]
print('the best regularization parameter is {}'.format(mu_best))
the best regularization parameter is 239.502661999
u, v = image.shape
mat_mask = mask(u, v, proportion)
image_reconstructed, _ = prox_grad_descent(image, mat_mask, mu_best)
plt.imshow(image_reconstructed, cmap = plt.cm.gray)
We observe 30.1 per cent of the entries of a 100*100 matrix
<matplotlib.image.AxesImage at 0x122727ed0>