import numpy as np
from scipy.sparse import csr_matrix
def save_sparse_csr(filename, array):
    """Persist a scipy CSR matrix to ``filename``.npz.

    The three CSR component arrays plus the shape are stored in a single
    compressed-archive-compatible .npz file via np.savez (which appends
    the '.npz' extension automatically).
    """
    components = {
        "data": array.data,
        "indices": array.indices,
        "indptr": array.indptr,
        "shape": array.shape,
    }
    np.savez(filename, **components)
def load_sparse_csr(filename):
    """Reconstruct a CSR matrix written by ``save_sparse_csr``.

    np.load does not append an extension, so the '.npz' suffix is added
    here to mirror what np.savez produced on the save side.
    """
    archive = np.load(filename + '.npz')
    parts = (archive['data'], archive['indices'], archive['indptr'])
    return csr_matrix(parts, shape=archive['shape'])
# Path of the saved word-count matrix. Note load_sparse_csr appends
# '.npz' itself, so the file actually read is 'word_counts.txt.npz'.
wc_file = "../../data/documents/word_counts.txt"
# X: sparse document-term count matrix (CSR) loaded from disk.
X = load_sparse_csr(wc_file)
from sklearn.decomposition import TruncatedSVD
# Truncated SVD (LSA) reducing X to 100 latent components;
# random_state is fixed so the randomized solver is reproducible.
svd = TruncatedSVD(n_components=100, random_state=5)
# Smoke test: fit only, without transforming, to check that X is accepted.
svd.fit(X)
# Notebook output of the fit above:
# TruncatedSVD(algorithm='randomized', n_components=100, n_iter=5, random_state=5, tol=0.0)
# It didn't crash!
# Fresh estimator with the same settings as the smoke test above.
svd = TruncatedSVD(n_components=100, random_state=5)
# Fit LSI model to X and perform dimensionality reduction on X.
# X_new is the dense document representation in the 100-dim latent space.
X_new = svd.fit_transform(X)
print(X_new.shape)
# Bare expression: notebook-style display of the reduced matrix.
X_new
# Notebook output of the cell above:
# (177140, 100)
# array([[ 0.33195896, -0.14014558, -0.07146808, ..., 0.04157464, 0.22544143, -0.1791284 ], [ 0.27524292, -0.35492953, -0.26604158, ..., 0.02714783, -0.33951171, 0.14421771], [ 0.87030143, -1.05706501, 0.04199029, ..., 0.05837507, 0.18249266, 0.32320418], ..., [ 0.10661509, -0.13295076, -0.09141061, ..., -0.15282697, 0.02647602, 0.06981038], [ 0.15230048, -0.20636379, -0.16772075, ..., 0.0375541 , -0.01315718, 0.01197927], [ 0.33925433, -0.47067816, -0.41242699, ..., -0.09150298, 0.23343678, 0.08946104]])
# Now I want to get the individual U, s, and Vh matrices of the decomposition.
# Ref: http://stackoverflow.com/questions/31523575/get-u-sigma-v-matrix-from-truncated-svd-in-scikit-learn
from sklearn.utils.extmath import randomized_svd
# Recompute the decomposition with the same parameters as TruncatedSVD
# (n_components=100, n_iter=5, random_state=5) to obtain the factor
# matrices explicitly: X ~ U * diag(s) * Vh.
U, s, Vh = randomized_svd(X, n_components=100, n_iter=5, random_state=5)
# Bare expression: notebook-style display of the factor shapes.
U.shape, s.shape, Vh.shape
# Notebook output of the cell above:
# ((177140, 100), (100,), (100, 99946))