from traitlets.config.manager import BaseJSONConfigManager
import jupyter_core
path = "/Users/i.oseledets/anaconda2/envs/teaching/etc/jupyter/nbconfig"
cm = BaseJSONConfigManager(config_dir=path)
cm.update("livereveal", {
"theme": "sky",
"transition": "zoom",
"start_slideshow_at": "selected",
"scroll": True
})
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rc("text", usetex=True)
import scipy as sp
import scipy.sparse
import scipy.sparse.linalg as spla
import scipy
from scipy.sparse import csc_matrix
n = 16
ex = np.ones(n);
A = sp.sparse.spdiags(np.vstack((-ex, 2*ex, -ex)), [-1, 0, 1], n, n, 'csr');
rhs = np.ones(n)
ev1, vec = spla.eigsh(A, k=2, which='LA')
ev2, vec = spla.eigsh(A, k=2, which='SA')
lam_max = np.max(ev1)
lam_min = np.min(ev2)
tau_opt = 2.0/(lam_max + lam_min)
niters = 100
x = np.zeros(n)
res_richardson = []
for i in range(niters):
    rr = A.dot(x) - rhs
    x = x - tau_opt * rr
    res_richardson.append(np.linalg.norm(rr))
# Convergence of the Richardson iteration with the optimal parameter
plt.semilogy(res_richardson)
plt.xlabel("Number of iterations, $k$", fontsize=20)
plt.ylabel("Residual norm, $\|Ax_k - b\|_2$", fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
niters = 64
roots = [np.cos((np.pi * (2 * i + 1)) / (2 * niters)) for i in range(niters)]
taus = [(lam_max + lam_min - (lam_min - lam_max) * r) / 2 for r in roots]
x = np.zeros(n)
r = A.dot(x) - rhs
res_cheb = [np.linalg.norm(r)]
# Implementation may be non-optimal if number of iterations is not power of two
def good_shuffle(idx):
    if len(idx) == 1:
        return idx
    else:
        new_len = int(np.ceil(len(idx) / 2))
        new_idx = good_shuffle(idx[:new_len])
        res_perm = []
        perm_count = 0
        for i in new_idx:
            res_perm.append(i)
            perm_count += 1
            if perm_count == len(idx):
                break
            res_perm.append(len(idx) + 1 - i)
            perm_count += 1
            if perm_count == len(idx):
                break
        return res_perm
good_perm = good_shuffle([i for i in range(1, niters+1)])
# good_perm = [i for i in range(niters, 0, -1)]
# good_perm = np.random.permutation([i for i in range(1, niters+1)])
for i in range(niters):
    x = x - 1.0/taus[good_perm[i] - 1] * r
    r = A.dot(x) - rhs
    res_cheb.append(np.linalg.norm(r))
plt.semilogy(res_richardson, label="Richardson")
plt.semilogy(res_cheb, label="Chebyshev")
plt.legend(fontsize=20)
plt.xlabel("Number of iterations, $k$", fontsize=20)
plt.ylabel("Residual norm, $\|Ax_k - b\|_2$", fontsize=20)
plt.xticks(fontsize=20)
_ = plt.yticks(fontsize=20)
Instead of solving a linear system, we can minimize the residual:
$$R(x) = \|Ax - f\|_2^2.$$
The condition $\nabla R(x) = 0$ gives
$$A^* A x = A^* f,$$
thus it has a squared condition number, so direct minimization of the residual by standard optimization methods is rarely used.
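To see why the normal equations are avoided, here is a minimal sketch (the tridiagonal test matrix is chosen only for illustration) comparing the condition numbers of $A$ and $A^* A$:
import numpy as np
import scipy.sparse as spsp
n = 64
ex = np.ones(n)
A = spsp.spdiags(np.vstack((-ex, 2*ex, -ex)), [-1, 0, 1], n, n).toarray()   # illustrative test matrix
print("cond(A)     = {:.2e}".format(np.linalg.cond(A)))
print("cond(A^T A) = {:.2e}".format(np.linalg.cond(A.T @ A)))   # roughly cond(A)**2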
For the symmetric positive definite case there is a much simpler functional.
Let $A = A^* > 0$; then the functional
$$\Phi(x) = (Ax, x) - 2(f, x)$$
is called the energy functional.
Since it is strictly convex, it has a unique local minimum, which is also the global one.
Its global minimum $x_*$ satisfies $A x_* = f$.
Indeed,
$$\nabla \Phi = 2(Ax - f),$$
and the first-order optimality condition $\nabla \Phi(x_*) = 0$ yields
$$A x_* = f.$$
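As a quick sanity check of this gradient formula, here is a minimal sketch (random SPD matrix, all names purely illustrative) comparing $2(Ax - f)$ with a finite-difference approximation of $\nabla \Phi$:
import numpy as np
np.random.seed(0)
n = 5
B = np.random.randn(n, n)
A = B @ B.T + n * np.eye(n)            # a random SPD matrix for the check
f = np.random.randn(n)
x = np.random.randn(n)
Phi = lambda y: A.dot(y).dot(y) - 2 * f.dot(y)   # energy functional
grad_exact = 2 * (A.dot(x) - f)                  # analytic gradient
eps = 1e-6
# central finite differences along the coordinate directions
grad_fd = np.array([(Phi(x + eps * e) - Phi(x - eps * e)) / (2 * eps) for e in np.eye(n)])
print(np.linalg.norm(grad_exact - grad_fd))      # should be close to zero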
Given an $M$-dimensional subspace with basis $\{y_1, \ldots, y_M\}$, we want to find an approximate solution in this basis, i.e.
$$Ax \approx f, \quad x = x_0 + \sum_{k=1}^{M} c_k y_k,$$
where $c$ is the vector of coefficients.
In the symmetric positive definite case we need to minimize
$$(Ax, x) - 2(f, x) \quad \text{subject to} \quad x = x_0 + Yc,$$
where $Y = [y_1, \ldots, y_M]$ is $n \times M$ and the vector $c$ has length $M$.
Using this representation of $x$, we get the following minimization problem for $c$:
$$\hat{\Phi}(c) = (AYc, Yc) + 2(Y^* A x_0, c) - 2(f, Yc) = (Y^* A Y c, c) - 2(Y^*(f - A x_0), c).$$
Note that this is the same type of functional, but for the Galerkin projection of $A$:
$$Y^* A Y c = Y^*(f - A x_0) = Y^* r_0,$$
which is an $M \times M$ linear system with a symmetric positive definite matrix, provided $Y$ has full column rank.
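Here is a minimal sketch of this projection step (random SPD matrix and random basis $Y$, purely illustrative): we form the small projected system, solve it, and check the Galerkin condition that the new residual is orthogonal to the subspace.
import numpy as np
np.random.seed(0)
n, M = 50, 5
B = np.random.randn(n, n)
A = B @ B.T + n * np.eye(n)        # a random SPD matrix (illustrative)
f = np.random.randn(n)
x0 = np.zeros(n)
Y = np.random.randn(n, M)          # an arbitrary full-rank basis of an M-dimensional subspace
r0 = f - A.dot(x0)
c = np.linalg.solve(Y.T @ A @ Y, Y.T @ r0)   # the small M x M SPD system
x = x0 + Y.dot(c)
print(np.linalg.norm(Y.T @ (f - A.dot(x))))  # Galerkin condition: should be close to zero
print(np.linalg.norm(A.dot(x) - f), np.linalg.norm(r0))  # residual norms after and before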
But how to choose Y?
In the Krylov subspace approach we generate the whole subspace from a single vector $r_0 = f - A x_0$:
$$y_0 \equiv k_0 = r_0, \quad y_1 \equiv k_1 = A r_0, \quad y_2 \equiv k_2 = A^2 r_0, \quad \ldots, \quad y_{M-1} \equiv k_{M-1} = A^{M-1} r_0.$$
This gives the Krylov subspace of order $M$,
$$K_M(A, r_0) = \mathrm{Span}(r_0, A r_0, \ldots, A^{M-1} r_0).$$
The natural basis of the Krylov subspace is very ill-conditioned, since
$$k_i = A^i r_0 \to \lambda_{\max}^i v,$$
where $v$ is the eigenvector corresponding to the maximal eigenvalue of $A$,
i.e. the vectors $k_i$ become more and more collinear for large $i$.
import numpy as np
import matplotlib.pyplot as plt
import scipy.sparse as spsp
%matplotlib inline
n = 100
ex = np.ones(n);
A = spsp.spdiags(np.vstack((-ex, 2*ex, -ex)), [-1, 0, 1], n, n, 'csr');
f = np.ones(n)
x0 = np.random.randn(n)
subspace_order = 10
krylov_vectors = np.zeros((n, subspace_order))
krylov_vectors[:, 0] = f - A.dot(x0)
for i in range(1, subspace_order):
    krylov_vectors[:, i] = A.dot(krylov_vectors[:, i-1])
s = np.linalg.svd(krylov_vectors, compute_uv=False)
print("Condition number = {}".format(s.max() / s.min()))
Condition number = 595184527.1939951
Solution: compute an orthogonal basis of the Krylov subspace.
To ensure stability, we first orthogonalize the vectors of the Krylov subspace using the Gram-Schmidt orthogonalization process (i.e., a QR factorization):
$$K_j = \begin{bmatrix} r_0 & A r_0 & A^2 r_0 & \ldots & A^{j-1} r_0 \end{bmatrix} = Q_j R_j,$$
and the solution will be approximated as $x \approx x_0 + Q_j c$.
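A minimal sketch of this fix, reusing the same tridiagonal test matrix as in the demo above: orthogonalize the Krylov vectors with a QR factorization and compare the condition numbers of the raw and orthogonalized bases.
import numpy as np
import scipy.sparse as spsp
n = 100
ex = np.ones(n)
A = spsp.spdiags(np.vstack((-ex, 2*ex, -ex)), [-1, 0, 1], n, n, 'csr')
f = np.ones(n)
x0 = np.random.randn(n)
subspace_order = 10
K = np.zeros((n, subspace_order))
K[:, 0] = f - A.dot(x0)
for i in range(1, subspace_order):
    K[:, i] = A.dot(K[:, i-1])
Q, R = np.linalg.qr(K)   # orthogonalize the Krylov basis
s_K = np.linalg.svd(K, compute_uv=False)
s_Q = np.linalg.svd(Q, compute_uv=False)
print("cond(K) = {:.2e}, cond(Q) = {:.2e}".format(s_K.max() / s_K.min(), s_Q.max() / s_Q.min()))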
Statement. The Krylov matrix $K_j$ satisfies an important recurrence relation (called the Arnoldi relation)
$$A Q_j = Q_j H_j + h_{j,j-1} q_j e_{j-1}^\top,$$
where $H_j$ is upper Hessenberg, and $Q_{j+1} = [q_0, \ldots, q_j]$ has orthogonal columns that span the column space of $K_{j+1}$.
Let us prove it (consider $j = 3$ for simplicity):
$$A \begin{bmatrix} k_0 & k_1 & k_2 \end{bmatrix} = \begin{bmatrix} k_1 & k_2 & k_3 \end{bmatrix} = \begin{bmatrix} k_0 & k_1 & k_2 \end{bmatrix} \begin{bmatrix} 0 & 0 & \alpha_0 \\ 1 & 0 & \alpha_1 \\ 0 & 1 & \alpha_2 \end{bmatrix} + \begin{bmatrix} 0 & 0 & k_3 - \alpha_0 k_0 - \alpha_1 k_1 - \alpha_2 k_2 \end{bmatrix},$$
where the $\alpha_s$ will be selected later. Denote $\hat{k}_3 = k_3 - \alpha_0 k_0 - \alpha_1 k_1 - \alpha_2 k_2$.
In matrix form,
$$A K_3 = K_3 Z + \hat{k}_3 e_2^\top,$$
where $Z$ is the lower shift matrix with last column $(\alpha_0, \alpha_1, \alpha_2)^\top$, and $e_2$ is the last column of the identity matrix.
Let $K_3 = Q_3 R_3$. Then
$$A Q_3 R_3 = Q_3 R_3 Z + \hat{k}_3 e_2^\top,$$
$$A Q_3 = Q_3 R_3 Z R_3^{-1} + \hat{k}_3 e_2^\top R_3^{-1}.$$
Note that
$$e_2^\top R_3^{-1} = \begin{bmatrix} 0 & 0 & 1 \end{bmatrix} \begin{bmatrix} * & * & * \\ 0 & * & * \\ 0 & 0 & * \end{bmatrix} = \gamma e_2^\top,$$
and
$$R_3 Z R_3^{-1} = \begin{bmatrix} * & * & * \\ * & * & * \\ 0 & * & * \end{bmatrix};$$
in the general case it will be an upper Hessenberg matrix $H$, i.e. a matrix such that
$$H_{ij} = 0, \quad \text{if } i > j + 1.$$
Let $Q_j$ be the orthogonal basis of the Krylov subspace; then we have almost the Arnoldi relation
$$A Q_j = Q_j H_j + \gamma \hat{k}_j e_{j-1}^\top,$$
where $H_j$ is an upper Hessenberg matrix, and
$$\hat{k}_j = k_j - \sum_{s=0}^{j-1} \alpha_s k_s.$$
We select the $\alpha_s$ in such a way that
$$Q_j^* \hat{k}_j = 0.$$
Then $\hat{k}_j = h_{j,j-1} q_j$, where $q_j$ is the last column of $Q_{j+1}$.
We have
$$A Q_j = Q_j H_j + h_{j,j-1} q_j e_{j-1}^\top.$$
This is the crucial formula for the efficient generation of such subspaces.
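Here is a minimal sketch of the Arnoldi process (modified Gram-Schmidt variant; the helper name arnoldi and the test matrix are illustrative) that generates $Q_{j+1}$ and the extended Hessenberg matrix and verifies the relation above numerically.
import numpy as np
import scipy.sparse as spsp
def arnoldi(A, r0, j):
    # Arnoldi process with modified Gram-Schmidt:
    # returns Q with j+1 orthonormal columns and the (j+1) x j Hessenberg matrix H
    n = r0.shape[0]
    Q = np.zeros((n, j + 1))
    H = np.zeros((j + 1, j))
    Q[:, 0] = r0 / np.linalg.norm(r0)
    for k in range(j):
        w = A.dot(Q[:, k])
        for i in range(k + 1):
            H[i, k] = np.dot(Q[:, i], w)
            w = w - H[i, k] * Q[:, i]
        H[k + 1, k] = np.linalg.norm(w)
        Q[:, k + 1] = w / H[k + 1, k]
    return Q, H
n = 100
ex = np.ones(n)
A = spsp.spdiags(np.vstack((-ex, 2*ex, -ex)), [-1, 0, 1], n, n, 'csr')
r0 = np.random.randn(n)
j = 10
Q, H = arnoldi(A, r0, j)
print(np.linalg.norm(A.dot(Q[:, :j]) - Q.dot(H)))          # A Q_j = Q_{j+1} \tilde{H}_j
print(np.linalg.norm(Q.T.dot(Q) - np.eye(j + 1)))          # orthogonality of the columns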
For the non-symmetric case, this is just modified Gram-Schmidt.
For the symmetric case, we have a much simpler form (the Lanczos process).
If $A = A^*$, then
$$Q_j^* A Q_j = H_j,$$
thus $H_j$ is Hermitian, and therefore tridiagonal: $H_j = T_j$.
This gives a short-term recurrence relation to generate the Arnoldi vectors $q_j$ without full orthogonalization.
In order to get $q_j$, we only need to compute the last column of
$$t_{j,j-1} q_j = (A Q_j - Q_j T_j) e_{j-1} = A q_{j-1} - t_{j-1,j-1} q_{j-1} - t_{j-2,j-1} q_{j-2}.$$
The coefficients $\alpha_j = t_{j-1,j-1}$ and $\beta_j = t_{j-2,j-1}$ can be recovered from the orthogonality constraints
$$(q_j, q_{j-1}) = 0, \quad (q_j, q_{j-2}) = 0.$$
All the other constraints will be satisfied automatically!
And we only need to store two vectors to get the new one.
We can now get from the Lanczos recurrence to the famous conjugate gradient method.
For $A = A^* > 0$ we have
$$A Q_j = Q_j T_j + t_{j,j-1} q_j e_{j-1}^\top.$$
Recall that when we minimize the energy functional over the basis $Y$ we get the system $Y^* A Y c = Y^* r_0$. Here $Y = Q_j$, so the approximate solution of $Ax \approx f$ with $x_j = x_0 + Q_j c_j$ can be found by solving the small system
$$Q_j^* A Q_j c_j = T_j c_j = Q_j^* r_0.$$
Since $q_0 = r_0 / \|r_0\|_2$ (recall what the first column of $Q_j$ is),
$$Q_j^* r_0 = \|r_0\|_2 e_0 = \gamma e_0.$$
We have a tridiagonal system of equations for $c_j$:
$$T_j c_j = \gamma e_0, \quad x_j = x_0 + Q_j c_j.$$
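A minimal sketch of this construction (purely illustrative, with $x_0 = 0$ so that $r_0 = f$, and no reorthogonalization): run the Lanczos three-term recurrence to build $Q_j$ and $T_j$, then solve the small tridiagonal system $T_j c_j = \gamma e_0$.
import numpy as np
import scipy.sparse as spsp
n = 128
ex = np.ones(n)
A = spsp.spdiags(np.vstack((-ex, 2*ex, -ex)), [-1, 0, 1], n, n, 'csr')
f = np.ones(n)
j = 30                       # dimension of the Krylov subspace
Q = np.zeros((n, j))
alpha = np.zeros(j)          # diagonal of T_j
beta = np.zeros(j)           # off-diagonal of T_j
gamma = np.linalg.norm(f)
Q[:, 0] = f / gamma          # x_0 = 0, so r_0 = f
for k in range(j):
    w = A.dot(Q[:, k])
    alpha[k] = np.dot(w, Q[:, k])
    w = w - alpha[k] * Q[:, k]
    if k > 0:
        w = w - beta[k - 1] * Q[:, k - 1]
    if k + 1 < j:
        beta[k] = np.linalg.norm(w)
        Q[:, k + 1] = w / beta[k]
# assemble the tridiagonal T_j and solve T_j c = gamma * e_0
T = np.diag(alpha) + np.diag(beta[:j - 1], 1) + np.diag(beta[:j - 1], -1)
e0 = np.zeros(j)
e0[0] = gamma
c = np.linalg.solve(T, e0)
x = Q.dot(c)
print(np.linalg.norm(A.dot(x) - f) / np.linalg.norm(f))   # relative residual of the approximation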
We could stop at this point, but we want short recurrence formulas instead of solving a linear system with the matrix $T_j$ at each step.
Derivation of the following update formulas is not required on the oral exam!
Since $A$ is positive definite, $T_j$ is also positive definite and admits an LU decomposition
$$T_j = L_j U_j,$$
where $L_j$ is a lower bidiagonal matrix with ones on the diagonal and $U_j$ is an upper bidiagonal matrix.
We need to define one subdiagonal of $L_j$ (with elements $c_1, \ldots, c_{j-1}$), the main diagonal of $U_j$ (with elements $d_0, \ldots, d_{j-1}$), and the superdiagonal of $U_j$ (with elements $b_1, \ldots, b_{j-1}$).
They have convenient recurrences, and
$$z_j = \begin{bmatrix} z_{j-1} \\ \xi_j \end{bmatrix},$$
while the vectors $q_j$ are found from the Lanczos relation (see the slides above).
We have the direct Lanczos method, where we store
$$p_{j-1}, \quad q_j, \quad x_{j-1}$$
to get the new estimate $x_j$.
The main problem is with $q_j$: we have the three-term recurrence, but in floating-point arithmetic the orthogonality can be lost, leading to numerical errors.
Let us run a small demo.
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import scipy as sp
import scipy.sparse as spsp
from scipy.sparse import csc_matrix
n = 128
ex = np.ones(n);
A = spsp.spdiags(np.vstack((ex, -2*ex, ex)), [-1, 0, 1], n, n, 'csr');
rhs = np.ones(n)
nit = 64
q1 = rhs/np.linalg.norm(rhs)
q2 = A.dot(q1)
q2 = q2 - np.dot(q2, q1)*q1
q2 = q2/np.linalg.norm(q2)
qall = [q1, q2]
for i in range(nit):
    qnew = A.dot(qall[-1])
    qnew = qnew - np.dot(qnew, qall[-1])*qall[-1]
    qnew = qnew/np.linalg.norm(qnew)
    qnew = qnew - np.dot(qnew, qall[-2])*qall[-2]
    qnew = qnew/np.linalg.norm(qnew)
    qall.append(qnew)
qall_mat = np.vstack(qall).T
print(np.linalg.norm(qall_mat.T.dot(qall_mat) - np.eye(qall_mat.shape[1])))
1.6432256263275
Instead of $q_j$ (the last vector in the modified Gram-Schmidt process), it is more convenient to work with the residual
$$r_j = f - A x_j.$$
The resulting recurrence has the form
$$x_j = x_{j-1} + \alpha_{j-1} p_{j-1},$$
$$r_j = r_{j-1} - \alpha_{j-1} A p_{j-1},$$
$$p_j = r_j + \beta_j p_{j-1}.$$
Hence the name conjugate gradient: to the gradient $r_j$ we add a conjugate direction $p_j$.
We have orthogonality of the residuals (check!):
$$(r_i, r_j) = 0, \quad i \ne j,$$
and $A$-orthogonality of the conjugate directions (check!):
$$(A p_i, p_j) = 0, \quad i \ne j,$$
which can be checked from the definitions.
The equations for $\alpha_j$ and $\beta_j$ can now be derived explicitly from these two properties.
We have $(r_j, r_{j-1}) = 0 = (r_{j-1} - \alpha_{j-1} A p_{j-1}, r_{j-1})$,
thus
$$\alpha_{j-1} = \frac{(r_{j-1}, r_{j-1})}{(A p_{j-1}, r_{j-1})} = \frac{(r_{j-1}, r_{j-1})}{(A p_{j-1}, p_{j-1})},$$
where the last equality uses $r_{j-1} = p_{j-1} - \beta_{j-1} p_{j-2}$ and $(A p_{j-1}, p_{j-2}) = 0$.
In a similar way, we have
$$\beta_j = \frac{(r_j, r_j)}{(r_{j-1}, r_{j-1})}.$$
Recall that
$$x_j = x_{j-1} + \alpha_{j-1} p_{j-1},$$
$$r_j = r_{j-1} - \alpha_{j-1} A p_{j-1},$$
$$p_j = r_j + \beta_j p_{j-1}.$$
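These recurrences translate directly into code. Here is a minimal sketch (no preconditioning; the stopping test and test matrix are illustrative):
import numpy as np
import scipy.sparse as spsp
def cg(A, f, x0, maxiter=100, tol=1e-10):
    # conjugate gradient method following the recurrences above
    x = x0.copy()
    r = f - A.dot(x)
    p = r.copy()
    rr = np.dot(r, r)
    history = [np.sqrt(rr)]
    for _ in range(maxiter):
        Ap = A.dot(p)                      # the only matrix-by-vector product per iteration
        alpha = rr / np.dot(Ap, p)         # alpha_{j-1} = (r, r) / (Ap, p)
        x = x + alpha * p
        r = r - alpha * Ap
        rr_new = np.dot(r, r)
        history.append(np.sqrt(rr_new))
        if history[-1] < tol:
            break
        beta = rr_new / rr                 # beta_j = (r_j, r_j) / (r_{j-1}, r_{j-1})
        p = r + beta * p
        rr = rr_new
    return x, history
n = 128
ex = np.ones(n)
A = spsp.spdiags(np.vstack((-ex, 2*ex, -ex)), [-1, 0, 1], n, n, 'csr')
f = np.ones(n)
x, history = cg(A, f, np.zeros(n))
print(len(history), history[-1])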
Only one matrix-by-vector product per iteration.
More details here: https://www.siam.org/meetings/la09/talks/oleary.pdf
When Hestenes worked on conjugate bases in 1936, he was advised by a Harvard professor that it was too obvious for publication.
We need to store 3 vectors.
Since it generates an $A$-orthogonal sequence $p_1, \ldots, p_N$, after $N$ steps it should stop (i.e., $p_{N+1} = 0$).
In practice it does not have this property in finite precision arithmetic; thus, after its invention in 1952 by Hestenes and Stiefel, it was labeled unstable.
In fact, it is a brilliant iterative method.
The energy functional can be written as
$$(Ax, x) - 2(f, x) = (A(x - x_*), (x - x_*)) - (A x_*, x_*),$$
where $A x_* = f$. Thus, up to an additive constant,
$$(A(x - x_*), (x - x_*)) = \|x - x_*\|_A^2$$
is the squared $A$-norm of the error.
The CG method computes $x_k$ that minimizes the energy functional over the Krylov subspace, i.e. $x_k = p(A) f$ (for $x_0 = 0$), where $p$ is a polynomial of degree at most $k-1$, so
$$\|x_k - x_*\|_A = \inf_p \|(p(A) - A^{-1}) f\|_A.$$
Using the eigendecomposition of $A$,
$$A = U \Lambda U^*, \quad g = U^* f,$$
we have
$$\|x_k - x_*\|_A^2 = \inf_p \|(p(\Lambda) - \Lambda^{-1}) g\|_\Lambda^2 = \inf_p \sum_{i=1}^n \frac{(\lambda_i p(\lambda_i) - 1)^2 g_i^2}{\lambda_i} = \inf_{q,\, q(0)=1} \sum_{i=1}^n \frac{q(\lambda_i)^2 g_i^2}{\lambda_i},$$
where $q(\lambda) = 1 - \lambda p(\lambda)$.
Selection of the optimal $q$ depends on the eigenvalue distribution.
We have
$$\|x_k - x_*\|_A^2 \le \sum_{i=1}^n \frac{g_i^2}{\lambda_i} \; \inf_{q,\, q(0)=1} \max_j q(\lambda_j)^2.$$
The first term is just
$$\sum_{i=1}^n \frac{g_i^2}{\lambda_i} = (A^{-1} f, f) = \|x_*\|_A^2.$$
And we have the relative error bound
$$\frac{\|x_k - x_*\|_A}{\|x_*\|_A} \le \inf_{q,\, q(0)=1} \max_j |q(\lambda_j)|,$$
so if the matrix has only 2 distinct eigenvalues, then there exists a polynomial $q$ of degree 2 with $q(0) = 1$ and $q(\lambda_1) = q(\lambda_2) = 0$; in this case CG converges in 2 iterations (see the numerical check below).
If the eigenvalues are clustered and there are $l$ outliers, then after the first $O(l)$ iterations CG will converge as if there were no outliers (hence the effective condition number is smaller).
Let us find another useful upper bound on the convergence. Since
$$\inf_{q,\, q(0)=1} \max_j |q(\lambda_j)| \le \inf_{q,\, q(0)=1} \max_{\lambda \in [\lambda_{\min}, \lambda_{\max}]} |q(\lambda)|,$$
and the last term is the same as for the Chebyshev acceleration, the same upper convergence bound holds:
$$\frac{\|x_k - x_*\|_A}{\|x_*\|_A} \le \gamma \left( \frac{\sqrt{\mathrm{cond}(A)} - 1}{\sqrt{\mathrm{cond}(A)} + 1} \right)^k.$$
As a result, CG gives better convergence than the Chebyshev acceleration, but at a slightly higher cost per iteration.
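As a quick numerical check of the two-eigenvalue claim above, here is a minimal sketch (diagonal test matrix with eigenvalues 1 and 10, purely illustrative) using scipy's CG:
import numpy as np
import scipy.sparse as spsp
import scipy.sparse.linalg as spla
n = 100
diag = np.ones(n)
diag[n // 2:] = 10.0               # only two distinct eigenvalues: 1 and 10
A = spsp.diags(diag)
f = np.random.randn(n)
res = []
def callback(xk):
    res.append(np.linalg.norm(A.dot(xk) - f))
x, info = spla.cg(A, f, x0=np.zeros(n), maxiter=10, callback=callback)
print(res)    # in exact arithmetic the residual is zero after 2 iterations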
CG is the method of choice for symmetric positive definite systems.
So far we have discussed symmetric positive definite systems. What happens if $A$ is non-symmetric?
We can still orthogonalize the Krylov subspace using the Arnoldi process and get
$$A Q_j = Q_j H_j + h_{j,j-1} q_j e_{j-1}^\top.$$
Let us rewrite the latter expression as
$$A Q_j = Q_j H_j + h_{j,j-1} q_j e_{j-1}^\top = Q_{j+1} \widetilde{H}_j, \quad
\widetilde{H}_j = \begin{bmatrix}
h_{0,0} & h_{0,1} & \ldots & h_{0,j-2} & h_{0,j-1} \\
h_{1,0} & h_{1,1} & \ldots & h_{1,j-2} & h_{1,j-1} \\
0 & h_{2,1} & \ldots & h_{2,j-2} & h_{2,j-1} \\
0 & 0 & \ddots & \vdots & \vdots \\
0 & 0 & \ldots & h_{j-1,j-2} & h_{j-1,j-1} \\
0 & 0 & \ldots & 0 & h_{j,j-1}
\end{bmatrix}.$$
Then, if we want to minimize the residual over the Krylov subspace, we take
$$x_j = x_0 + Q_j c_j,$$
and $c_j$ has to be selected from
$$\|A x_j - f\|_2 = \|A Q_j c_j - r_0\|_2 \to \min_{c_j}.$$
Using the Arnoldi recursion, we have
$$\|Q_{j+1} \widetilde{H}_j c_j - r_0\|_2 \to \min_{c_j}.$$
Using the invariance of the 2-norm under multiplication by a matrix with orthonormal columns (note that $r_0$ lies in the span of $Q_{j+1}$), we get
$$\|\widetilde{H}_j c_j - \gamma e_0\|_2 \to \min_{c_j},$$
where we have used that $Q_{j+1}^* r_0 = \gamma e_0$.
This is just a linear least squares problem with $(j+1)$ equations and $j$ unknowns.
The matrix is also upper Hessenberg, thus its QR factorization can be computed very cheaply (e.g., with Givens rotations).
This allows the computation of $c_j$. This method is called GMRES (generalized minimal residual).
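A minimal sketch of one GMRES solve built from these pieces (the arnoldi helper is the same illustrative routine as in the earlier sketch; a production implementation would update a QR factorization with Givens rotations instead of calling a dense least-squares solver):
import numpy as np
import scipy.sparse as spsp
def arnoldi(A, r0, j):
    # Arnoldi process (modified Gram-Schmidt), as in the sketch above
    n = r0.shape[0]
    Q = np.zeros((n, j + 1))
    H = np.zeros((j + 1, j))
    Q[:, 0] = r0 / np.linalg.norm(r0)
    for k in range(j):
        w = A.dot(Q[:, k])
        for i in range(k + 1):
            H[i, k] = np.dot(Q[:, i], w)
            w = w - H[i, k] * Q[:, i]
        H[k + 1, k] = np.linalg.norm(w)
        Q[:, k + 1] = w / H[k + 1, k]
    return Q, H
n = 128
ex = np.ones(n)
A = spsp.spdiags(np.vstack((ex, -2*ex, ex)), [-1, 0, 1], n, n, 'csr')
f = np.ones(n)
x0 = np.zeros(n)
r0 = f - A.dot(x0)
j = 40
Q, H = arnoldi(A, r0, j)
# solve the small (j+1) x j least-squares problem || H c - gamma e_0 ||_2 -> min
gamma = np.linalg.norm(r0)
e0 = np.zeros(j + 1)
e0[0] = gamma
c, *_ = np.linalg.lstsq(H, e0, rcond=None)
x = x0 + Q[:, :j].dot(c)
print(np.linalg.norm(A.dot(x) - f) / np.linalg.norm(f))   # relative residual after j steps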
import scipy.sparse.linalg as la
import scipy as sp
import scipy.sparse
from scipy.sparse import csc_matrix, csr_matrix
import numpy as np
import matplotlib.pyplot as plt
import time
%matplotlib inline
n = 150
ex = np.ones(n);
lp1 = sp.sparse.spdiags(np.vstack((ex, -2*ex, ex)), [-1, 0, 1], n, n, 'csr');
e = sp.sparse.eye(n)
A = sp.sparse.kron(lp1, e) + sp.sparse.kron(e, lp1)
A = csr_matrix(A)
rhs = np.ones(n * n)
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
for restart in [5, 40, 200]:
    hist = []
    def callback(rk):
        hist.append(np.linalg.norm(rk) / np.linalg.norm(rhs))
    st = time.time()
    sol = la.gmres(A, rhs, x0=np.zeros(n*n), maxiter=200, restart=restart, callback=callback, tol=1e-16)
    current_time = time.time() - st
    ax1.semilogy(np.array(hist), label='rst={}'.format(restart))
    ax2.semilogy([current_time * i / len(hist) for i in range(len(hist))], np.array(hist), label='rst={}'.format(restart))
ax1.legend(loc='best')
ax2.legend(loc='best')
ax1.set_xlabel("Number of outer iterations", fontsize=20)
ax2.set_xlabel("Time, sec", fontsize=20)
ax1.set_ylabel(r"$\frac{||r_k||_2}{||rhs||_2}$", fontsize=20)
ax2.set_ylabel(r"$\frac{||r_k||_2}{||rhs||_2}$", fontsize=20)
plt.sca(ax1)
plt.yticks(fontsize=20)
plt.sca(ax2)
plt.yticks(fontsize=20)
f.tight_layout()
import scipy.sparse.linalg as la
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Example from http://www.caam.rice.edu/~embree/39961.pdf
A = np.array([[1, 1, 1],
              [0, 1, 3],
              [0, 0, 1]])
rhs = np.array([2, -4, 1])
x0 = np.zeros(3)
for restart in [1, 2, 3]:
    hist = []
    def callback(rk):
        hist.append(np.linalg.norm(rk)/np.linalg.norm(rhs))
    _ = la.gmres(A, rhs, x0=x0, maxiter=20, restart=restart, callback=callback)
    plt.semilogy(np.array(hist), label='rst={}'.format(restart))
plt.legend()
plt.xlabel("Number of outer iterations", fontsize=20)
plt.ylabel(r"$\frac{||r_k||_2}{||rhs||_2}$", fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.tight_layout()
from IPython.core.display import HTML
def css_styling():
    styles = open("./styles/custom.css", "r").read()
    return HTML(styles)
css_styling()