import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
from scipy.sparse import csr_matrix
def save_sparse_csr(filename, array):
    """Save a sparse matrix to disk as the raw arrays of its CSR form.

    Writes a NumPy ``.npz`` archive with the keys ``data``, ``indices``,
    ``indptr`` and ``shape``.

    Args:
        filename: Target path. ``np.savez`` appends ``.npz`` when the path
            does not already end with it.
        array: Matrix to store. Anything ``csr_matrix`` accepts (another
            sparse format, a dense 2-D array) is converted to CSR first, so
            inputs lacking ``indices``/``indptr`` attributes no longer fail.
    """
    # Normalise to CSR so the three index arrays below are guaranteed to exist
    matrix = csr_matrix(array)
    np.savez(
        filename,
        data=matrix.data,
        indices=matrix.indices,
        indptr=matrix.indptr,
        shape=matrix.shape,
    )
def load_sparse_csr(filename):
    """Load a CSR matrix from a ``.npz`` archive holding its raw arrays.

    The archive must contain the keys ``data``, ``indices``, ``indptr`` and
    ``shape``.

    Args:
        filename: Archive path as a string, with or without the ``.npz``
            extension.

    Returns:
        A ``scipy.sparse.csr_matrix`` rebuilt from the stored arrays.
    """
    # np.savez adds ".npz" on its own, so callers usually pass the bare name;
    # accept the full name too instead of looking for "name.npz.npz".
    path = filename if filename.endswith('.npz') else filename + '.npz'
    # The archive stays open until closed; the with-block releases the handle
    # once the arrays have been read.
    with np.load(path) as loader:
        return csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=tuple(loader['shape']),
        )
# Base name of the saved word-count matrix. load_sparse_csr appends ".npz",
# so the file actually read is "word_counts.txt.npz".
wc_file = "../../data/documents/word_counts.txt"
X = load_sparse_csr(wc_file)
# Check which type came back from the loader
type(X)
scipy.sparse.csr.csr_matrix
Searching for how to run SVD in Python, I came across two implementations: `numpy.linalg.svd` and `scipy.linalg.svd`.
Why both numpy.linalg and scipy.linalg? What’s the difference?
scipy.linalg is a more complete wrapping of Fortran LAPACK using f2py.
# Read the shape straight from the sparse matrix; calling toarray() first
# would allocate the entire dense matrix only to discard it.
X.shape
(177140, 99946)
When we run SVD on the full matrix, the kernel dies: the matrix is too big to decompose as a dense array.
# import numpy as np
#
# U, s, V = np.linalg.svd(X.toarray(), full_matrices=True)
# import scipy.linalg
#
# scipy.linalg.svd(X.toarray())
# Slice the sparse matrix first and densify only the 10000 x 100 corner.
# X.toarray()[:10000, :100] would build the whole dense matrix just to
# throw most of it away.
x = X[:10000, :100].toarray()
import scipy.linalg
# NOTE: with the default full_matrices=True, U comes back square
# (10000 x 10000) even though x has only 100 columns.
U, s, Vh = scipy.linalg.svd(x)
U.shape, Vh.shape, s.shape
((10000, 10000), (100, 100), (100,))
# Plot the singular values in the order svd returned them, to see how many
# components carry most of the weight.
f, ax = plt.subplots(figsize=(15, 4))
# Draw on the axes created above explicitly rather than relying on the
# implicit "current axes".
sns.pointplot(x=list(range(1, len(s) + 1)), y=s, ax=ax)
# y holds the singular values themselves (not eigenvalues); x is their index.
ax.set(xlabel="singular value index", ylabel="singular value")
[<matplotlib.text.Text at 0x112824908>, <matplotlib.text.Text at 0x11280c0b8>]
Most of the information is in the first 8 singular values.
# Number of leading singular values to keep for the truncated decomposition
numSV = 8
# Show the values that will be kept
s[:numSV]
array([ 2. , 1.73205081, 1.73205081, 1. , 1. , 1. , 1. , 1. ])
# Keep only the first numSV components: leading columns of U, leading rows
# of Vh, and the matching singular values.
U, Vh, s = U[:, :numSV], Vh[:numSV, :], s[:numSV]
U.shape, Vh.shape, s.shape
((10000, 8), (8, 100), (8,))
# Multiply the truncated factors back together (U * diag(s) * Vh) to get the
# reconstruction of the matrix from the components kept above.
reconstructed = U.dot(np.diag(s)).dot(Vh)
reconstructed
array([[ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], ..., [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.]])
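TODO: figure out how to decompose the full matrix without densifying it (for example with a truncated sparse SVD such as `scipy.sparse.linalg.svds`).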