import numpy as np
from scipy.sparse import csr_matrix
def save_sparse_csr(filename, array):
    """Persist a scipy CSR matrix to ``filename``.npz.

    The three CSR component arrays plus the shape are stored in a single
    compressed-archive-compatible .npz file via np.savez (which appends
    the '.npz' extension automatically).
    """
    components = {
        "data": array.data,
        "indices": array.indices,
        "indptr": array.indptr,
        "shape": array.shape,
    }
    np.savez(filename, **components)
def load_sparse_csr(filename):
    """Reconstruct a CSR matrix written by ``save_sparse_csr``.

    np.load does not append an extension, so the '.npz' suffix is added
    here to mirror what np.savez produced on the save side.
    """
    archive = np.load(filename + '.npz')
    parts = (archive['data'], archive['indices'], archive['indptr'])
    return csr_matrix(parts, shape=archive['shape'])
# Path of the saved word-count matrix. Note load_sparse_csr appends
# '.npz' itself, so the file actually read is 'word_counts.txt.npz'.
wc_file = "../../data/documents/word_counts.txt"
# X: sparse document-term count matrix (CSR) loaded from disk.
X = load_sparse_csr(wc_file)
from sklearn.decomposition import TruncatedSVD
# Truncated SVD (LSA) reducing X to 100 latent components;
# random_state is fixed so the randomized solver is reproducible.
svd = TruncatedSVD(n_components=100, random_state=5)
# Smoke test: fit only, without transforming, to check that X is accepted.
svd.fit(X)
# Notebook output of the fit above:
# TruncatedSVD(algorithm='randomized', n_components=100, n_iter=5, random_state=5, tol=0.0)
# It didn't crash!
# Fresh estimator with the same settings as the smoke test above.
svd = TruncatedSVD(n_components=100, random_state=5)
# Fit LSI model to X and perform dimensionality reduction on X.
# X_new is the dense document representation in the 100-dim latent space.
X_new = svd.fit_transform(X)
print(X_new.shape)
# Bare expression: notebook-style display of the reduced matrix.
X_new
# Notebook output of the cell above:
# (177140, 100)
# array([[ 0.33195896, -0.14014558, -0.07146808, ..., 0.04157464, 0.22544143, -0.1791284 ], [ 0.27524292, -0.35492953, -0.26604158, ..., 0.02714783, -0.33951171, 0.14421771], [ 0.87030143, -1.05706501, 0.04199029, ..., 0.05837507, 0.18249266, 0.32320418], ..., [ 0.10661509, -0.13295076, -0.09141061, ..., -0.15282697, 0.02647602, 0.06981038], [ 0.15230048, -0.20636379, -0.16772075, ..., 0.0375541 , -0.01315718, 0.01197927], [ 0.33925433, -0.47067816, -0.41242699, ..., -0.09150298, 0.23343678, 0.08946104]])
# Now I want to get the individual U, s, and Vh matrices of the decomposition.
# Ref: http://stackoverflow.com/questions/31523575/get-u-sigma-v-matrix-from-truncated-svd-in-scikit-learn
from sklearn.utils.extmath import randomized_svd
# Recompute the decomposition with the same parameters as TruncatedSVD
# (n_components=100, n_iter=5, random_state=5) to obtain the factor
# matrices explicitly: X ~ U * diag(s) * Vh.
U, s, Vh = randomized_svd(X, n_components=100, n_iter=5, random_state=5)
# Bare expression: notebook-style display of the factor shapes.
U.shape, s.shape, Vh.shape
# Notebook output of the cell above:
# ((177140, 100), (100,), (100, 99946))