Published: November 6, 2020
Author: Xinyu Chen [GitHub homepage]
Download: This Jupyter notebook is at our GitHub repository. If you want to evaluate the code, please download the notebook from the transdim repository.
This notebook shows how to implement the Bayesian Probabilistic Matrix Factorization (BPMF), a fully Bayesian matrix factorization model, on some real-world data sets. For an in-depth discussion of BPMF, please see [1].
import numpy as np
from numpy.linalg import inv as inv
from numpy.random import normal as normrnd
from scipy.linalg import khatri_rao as kr_prod
from scipy.stats import wishart
from numpy.linalg import solve as solve
from scipy.linalg import cholesky as cholesky_upper
from scipy.linalg import solve_triangular as solve_ut
import matplotlib.pyplot as plt
%matplotlib inline
def mvnrnd_pre(mu, Lambda):
    """Draw one sample from N(mu, Lambda^{-1}), parameterized by the precision matrix.

    With U the upper Cholesky factor of Lambda (Lambda = U.T @ U) and
    z ~ N(0, I), the vector mu + U^{-1} z has covariance Lambda^{-1}.
    Note: Lambda may be overwritten in place (overwrite_a = True).
    """
    z = normrnd(size = (mu.shape[0],))
    upper = cholesky_upper(Lambda, overwrite_a = True, check_finite = False)
    return mu + solve_ut(upper, z, lower = False, check_finite = False, overwrite_b = True)
def cov_mat(mat, mat_bar):
    """Return the scatter matrix of the rows of `mat` around the row vector `mat_bar`."""
    centered = mat - mat_bar
    return centered.T @ centered
def sample_factor_w(tau_sparse_mat, tau_ind, W, X, tau, beta0 = 1, vargin = 0):
    """Sampling N-by-R factor matrix W and its hyperparameters (mu_w, Lambda_w).

    Parameters
    ----------
    tau_sparse_mat : tau * observed matrix (dim1 x dim2), zeros at missing entries.
    tau_ind : tau * indicator of observed entries (dim1 x dim2).
    W, X : current factor matrices (dim1 x rank, dim2 x rank); W is updated in place.
    tau : current noise precision (scalar, > 0).
    beta0 : prior strength of the Gaussian-Wishart hyperprior.
    vargin : 0 = vectorized update, 1 = per-row update (less memory).

    Returns
    -------
    W with every row resampled from its conditional Gaussian posterior.
    """
    dim1, rank = W.shape
    W_bar = np.mean(W, axis = 0)
    temp = dim1 / (dim1 + beta0)
    var_mu_hyper = temp * W_bar
    var_W_hyper = inv(np.eye(rank) + cov_mat(W, W_bar) + temp * beta0 * np.outer(W_bar, W_bar))
    # Draw (Lambda_w, mu_w) from the Gaussian-Wishart posterior.
    var_Lambda_hyper = wishart.rvs(df = dim1 + rank, scale = var_W_hyper)
    var_mu_hyper = mvnrnd_pre(var_mu_hyper, (dim1 + beta0) * var_Lambda_hyper)
    if dim1 * rank ** 2 > 1e+8:
        # The vectorized path materializes a rank*rank*dim1 array; fall back to
        # the per-row path for very large problems.
        vargin = 1
    if vargin == 0:
        var1 = X.T
        var2 = kr_prod(var1, var1)
        var3 = (var2 @ tau_ind.T).reshape([rank, rank, dim1]) + var_Lambda_hyper[:, :, np.newaxis]
        var4 = var1 @ tau_sparse_mat.T + (var_Lambda_hyper @ var_mu_hyper)[:, np.newaxis]
        for i in range(dim1):
            W[i, :] = mvnrnd_pre(solve(var3[:, :, i], var4[:, i]), var3[:, :, i])
    elif vargin == 1:
        for i in range(dim1):
            # Bug fix: the original referenced an undefined global `sparse_mat`.
            # Since tau > 0, tau_ind[i, :] != 0 marks the observed entries, and
            # Xt.T @ tau_sparse_mat[i, pos0] == tau * Xt.T @ sparse_mat[i, pos0].
            pos0 = np.where(tau_ind[i, :] != 0)
            Xt = X[pos0[0], :]
            var_mu = Xt.T @ tau_sparse_mat[i, pos0[0]] + var_Lambda_hyper @ var_mu_hyper
            var_Lambda = tau * Xt.T @ Xt + var_Lambda_hyper
            W[i, :] = mvnrnd_pre(solve(var_Lambda, var_mu), var_Lambda)
    return W
def sample_factor_x(tau_sparse_mat, tau_ind, W, X, beta0 = 1):
    """Sampling T-by-R factor matrix X and its hyperparameters (mu_x, Lambda_x).

    Draws (Lambda_x, mu_x) from their Gaussian-Wishart posterior, then resamples
    each row of X from its conditional Gaussian. X is updated in place and returned.
    """
    dim2, rank = X.shape
    X_bar = np.mean(X, axis = 0)
    shrink = dim2 / (dim2 + beta0)
    mu0 = shrink * X_bar
    scale_inv = np.eye(rank) + cov_mat(X, X_bar) + shrink * beta0 * np.outer(X_bar, X_bar)
    Lambda_x = wishart.rvs(df = dim2 + rank, scale = inv(scale_inv))
    mu_x = mvnrnd_pre(mu0, (dim2 + beta0) * Lambda_x)
    Wt = W.T
    kr_Wt = kr_prod(Wt, Wt)
    # Per-column posterior precisions (rank x rank x dim2) and linear terms.
    precisions = (kr_Wt @ tau_ind).reshape([rank, rank, dim2]) + Lambda_x[:, :, np.newaxis]
    rhs = Wt @ tau_sparse_mat + (Lambda_x @ mu_x)[:, np.newaxis]
    for t in range(dim2):
        X[t, :] = mvnrnd_pre(solve(precisions[:, :, t], rhs[:, t]), precisions[:, :, t])
    return X
def sample_precision_tau(sparse_mat, mat_hat, ind):
    """Draw the noise precision tau from its conjugate Gamma posterior.

    Uses a flat Gamma(1e-6, 1e-6) prior; only observed entries (ind) contribute
    to the residual sum of squares.
    """
    shape = 1e-6 + 0.5 * np.sum(ind)
    squared_resid = (sparse_mat - mat_hat) ** 2
    rate = 1e-6 + 0.5 * np.sum(squared_resid * ind)
    return np.random.gamma(shape, 1 / rate)
def compute_mape(var, var_hat):
    """Mean absolute percentage error of var_hat w.r.t. var (as a fraction).

    Assumes var is 1-D with no zero entries (callers filter out zeros).
    """
    relative_errors = np.abs(var - var_hat) / var
    return np.sum(relative_errors) / var.shape[0]
def compute_rmse(var, var_hat):
    """Root mean squared error of var_hat w.r.t. var (1-D arrays)."""
    squared_errors = (var - var_hat) ** 2
    return np.sqrt(np.sum(squared_errors) / var.shape[0])
def BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter):
    """Bayesian Probabilistic Matrix Factorization, BPMF.

    Parameters
    ----------
    dense_mat : (dim1, dim2) ground-truth matrix (0 = unobserved); only used to
        build the held-out test set for MAPE/RMSE reporting.
    sparse_mat : (dim1, dim2) partially observed matrix; missing entries are
        encoded as 0 or as NaN.
    init : dict with initial factors "W" (dim1 x rank) and "X" (dim2 x rank).
    rank : low-rank dimension of the factorization.
    burn_iter : number of burn-in Gibbs iterations.
    gibbs_iter : number of posterior samples averaged after burn-in.

    Returns
    -------
    mat_hat, W, X : posterior means of the reconstruction and both factors.
    """
    dim1, dim2 = sparse_mat.shape
    # Copy the initial factors so the caller's `init` arrays are not mutated
    # by the in-place row updates inside the samplers.
    W = init["W"].copy()
    X = init["X"].copy()
    if np.isnan(sparse_mat).any():
        # NaN encodes missing entries.
        pos_test = np.where((dense_mat != 0) & (np.isnan(sparse_mat)))
        ind = ~np.isnan(sparse_mat)
        # Work on a copy so the caller's matrix is not modified in place.
        sparse_mat = sparse_mat.copy()
        sparse_mat[np.isnan(sparse_mat)] = 0
    else:
        # Zero encodes missing entries.
        ind = sparse_mat != 0
        pos_test = np.where((dense_mat != 0) & (sparse_mat == 0))
    dense_test = dense_mat[pos_test]
    del dense_mat
    tau = 1  # initial noise precision
    W_plus = np.zeros((dim1, rank))
    X_plus = np.zeros((dim2, rank))
    temp_hat = np.zeros(sparse_mat.shape)
    show_iter = 200  # progress-report period during burn-in
    mat_hat_plus = np.zeros(sparse_mat.shape)
    for it in range(burn_iter + gibbs_iter):
        tau_ind = tau * ind
        tau_sparse_mat = tau * sparse_mat
        W = sample_factor_w(tau_sparse_mat, tau_ind, W, X, tau)
        X = sample_factor_x(tau_sparse_mat, tau_ind, W, X)
        mat_hat = W @ X.T
        tau = sample_precision_tau(sparse_mat, mat_hat, ind)
        temp_hat += mat_hat
        if (it + 1) % show_iter == 0 and it < burn_iter:
            # Report errors of the running average over the last show_iter draws.
            temp_hat = temp_hat / show_iter
            print('Iter: {}'.format(it + 1))
            print('MAPE: {:.6}'.format(compute_mape(dense_test, temp_hat[pos_test])))
            print('RMSE: {:.6}'.format(compute_rmse(dense_test, temp_hat[pos_test])))
            temp_hat = np.zeros(sparse_mat.shape)
            print()
        if it + 1 > burn_iter:
            # Accumulate posterior samples after burn-in.
            W_plus += W
            X_plus += X
            mat_hat_plus += mat_hat
    # Posterior means over the collected Gibbs samples.
    mat_hat = mat_hat_plus / gibbs_iter
    W = W_plus / gibbs_iter
    X = X_plus / gibbs_iter
    print('Imputation MAPE: {:.6}'.format(compute_mape(dense_test, mat_hat[pos_test])))
    print('Imputation RMSE: {:.6}'.format(compute_rmse(dense_test, mat_hat[pos_test])))
    print()
    return mat_hat, W, X
Scenario setting:
Model setting:
# Guangzhou data set, 40% non-random missing (NM).
import time
import scipy.io
import numpy as np
np.random.seed(1000)
dense_tensor = scipy.io.loadmat('../datasets/Guangzhou-data-set/tensor.mat')['tensor']
dim = dense_tensor.shape
missing_rate = 0.4 # Non-random missing (NM)
# round(u + 0.5 - rate) is a Bernoulli(1 - rate) mask on the first two axes,
# broadcast along the third axis, so whole fibers go missing together.
sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1])[:, :, np.newaxis] + 0.5 - missing_rate)
# Unfold the 3rd-order tensor into a dim[0] x (dim[1] * dim[2]) matrix.
dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])
sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])
del dense_tensor, sparse_tensor
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 10  # low-rank dimension R
init = {"W": 0.01 * np.random.randn(dim1, rank), "X": 0.01 * np.random.randn(dim2, rank)}
burn_iter = 1000  # burn-in Gibbs iterations
gibbs_iter = 200  # samples averaged after burn-in
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Scenario setting:
Model setting:
# Guangzhou data set, 40% random missing (RM).
import time
import scipy.io
import numpy as np
np.random.seed(1000)
dense_tensor = scipy.io.loadmat('../datasets/Guangzhou-data-set/tensor.mat')['tensor']
dim = dense_tensor.shape
missing_rate = 0.4 # Random missing (RM)
# Entry-wise Bernoulli(1 - missing_rate) mask over the full tensor.
sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1], dim[2]) + 0.5 - missing_rate)
# Unfold the 3rd-order tensor into a dim[0] x (dim[1] * dim[2]) matrix.
dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])
sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])
del dense_tensor, sparse_tensor
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 80  # low-rank dimension R
init = {"W": 0.01 * np.random.randn(dim1, rank), "X": 0.01 * np.random.randn(dim2, rank)}
burn_iter = 1000
gibbs_iter = 200
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Scenario setting:
Model setting:
# Guangzhou data set, 60% random missing (RM).
import time
import scipy.io
import numpy as np
np.random.seed(1000)
dense_tensor = scipy.io.loadmat('../datasets/Guangzhou-data-set/tensor.mat')['tensor']
dim = dense_tensor.shape
missing_rate = 0.6 # Random missing (RM)
# Entry-wise Bernoulli(1 - missing_rate) mask over the full tensor.
sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1], dim[2]) + 0.5 - missing_rate)
dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])
sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])
del dense_tensor, sparse_tensor
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 80
init = {"W": 0.01 * np.random.randn(dim1, rank), "X": 0.01 * np.random.randn(dim2, rank)}
burn_iter = 1000
gibbs_iter = 200
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Scenario setting:
Model setting:
# Birmingham data set, 40% non-random missing (NM).
import time
import scipy.io
import numpy as np
np.random.seed(1000)
dense_tensor = scipy.io.loadmat('../datasets/Birmingham-data-set/tensor.mat')['tensor']
dim = dense_tensor.shape
missing_rate = 0.4 # Non-random missing (NM)
# Mask drawn on the first two axes, broadcast along the third axis.
sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1])[:, :, np.newaxis] + 0.5 - missing_rate)
dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])
sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])
del dense_tensor, sparse_tensor
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 20
init = {"W": 0.01 * np.random.randn(dim1, rank), "X": 0.01 * np.random.randn(dim2, rank)}
burn_iter = 1000
gibbs_iter = 200
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Scenario setting:
Model setting:
# Birmingham data set, 40% random missing (RM).
import time
import scipy.io
import numpy as np
np.random.seed(1000)
dense_tensor = scipy.io.loadmat('../datasets/Birmingham-data-set/tensor.mat')['tensor']
dim = dense_tensor.shape
missing_rate = 0.4 # Random missing (RM)
sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1], dim[2]) + 0.5 - missing_rate)
dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])
sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])
del dense_tensor, sparse_tensor
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 20
# NOTE: init scale 0.1 here (vs 0.01 in some other cells).
init = {"W": 0.1 * np.random.randn(dim1, rank), "X": 0.1 * np.random.randn(dim2, rank)}
burn_iter = 1000
gibbs_iter = 200
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Scenario setting:
Model setting:
# Birmingham data set, 60% random missing (RM).
import time
import scipy.io
import numpy as np
np.random.seed(1000)
dense_tensor = scipy.io.loadmat('../datasets/Birmingham-data-set/tensor.mat')['tensor']
dim = dense_tensor.shape
missing_rate = 0.6 # Random missing (RM)
sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1], dim[2]) + 0.5 - missing_rate)
dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])
sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])
del dense_tensor, sparse_tensor
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 20
init = {"W": 0.1 * np.random.randn(dim1, rank), "X": 0.1 * np.random.randn(dim2, rank)}
burn_iter = 1000
gibbs_iter = 200
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Scenario setting:
Model setting:
# Hangzhou data set, 40% non-random missing (NM).
import time
import scipy.io
import numpy as np
np.random.seed(1000)
dense_tensor = scipy.io.loadmat('../datasets/Hangzhou-data-set/tensor.mat')['tensor']
dim = dense_tensor.shape
missing_rate = 0.4 # Non-random missing (NM)
# Mask drawn on the first two axes, broadcast along the third axis.
sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1])[:, :, np.newaxis] + 0.5 - missing_rate)
dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])
sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])
del dense_tensor, sparse_tensor
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 30
init = {"W": 0.01 * np.random.randn(dim1, rank), "X": 0.01 * np.random.randn(dim2, rank)}
burn_iter = 1000
gibbs_iter = 200
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Iter: 200 MAPE: 0.349724 RMSE: 50.3797 Iter: 400 MAPE: 0.353828 RMSE: 52.0155 Iter: 600 MAPE: 0.352678 RMSE: 49.2252 Iter: 800 MAPE: 0.353151 RMSE: 45.6113 Iter: 1000 MAPE: 0.352523 RMSE: 45.7271 Imputation MAPE: 0.350175 Imputation RMSE: 45.3692 Running time: 243 seconds
Scenario setting:
Model setting:
# Hangzhou data set, 40% random missing (RM).
import time
import scipy.io
import numpy as np
np.random.seed(1000)
dense_tensor = scipy.io.loadmat('../datasets/Hangzhou-data-set/tensor.mat')['tensor']
dim = dense_tensor.shape
missing_rate = 0.4 # Random missing (RM)
sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1], dim[2]) + 0.5 - missing_rate)
dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])
sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])
del dense_tensor, sparse_tensor
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 30
init = {"W": 0.1 * np.random.randn(dim1, rank), "X": 0.1 * np.random.randn(dim2, rank)}
burn_iter = 1000
gibbs_iter = 200
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Iter: 200 MAPE: 0.322692 RMSE: 41.591 Iter: 400 MAPE: 0.323997 RMSE: 41.9302 Iter: 600 MAPE: 0.324506 RMSE: 41.8721 Iter: 800 MAPE: 0.322947 RMSE: 41.7555 Iter: 1000 MAPE: 0.324867 RMSE: 41.9841 Imputation MAPE: 0.323612 Imputation RMSE: 41.8382 Running time: 234 seconds
Scenario setting:
Model setting:
# Hangzhou data set, 60% random missing (RM).
import time
import scipy.io
import numpy as np
np.random.seed(1000)
dense_tensor = scipy.io.loadmat('../datasets/Hangzhou-data-set/tensor.mat')['tensor']
dim = dense_tensor.shape
missing_rate = 0.6 # Random missing (RM)
sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1], dim[2]) + 0.5 - missing_rate)
dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])
sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])
del dense_tensor, sparse_tensor
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 30
init = {"W": 0.1 * np.random.randn(dim1, rank), "X": 0.1 * np.random.randn(dim2, rank)}
burn_iter = 1000
gibbs_iter = 200
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Iter: 200 MAPE: 0.393414 RMSE: 46.1231 Iter: 400 MAPE: 0.397607 RMSE: 46.8354 Iter: 600 MAPE: 0.400496 RMSE: 46.8306 Iter: 800 MAPE: 0.39948 RMSE: 46.6026 Iter: 1000 MAPE: 0.400208 RMSE: 46.7884 Imputation MAPE: 0.398716 Imputation RMSE: 46.6787 Running time: 234 seconds
Scenario setting:
Model setting:
# Seattle data set, 40% non-random missing (NM).
import time
import numpy as np
np.random.seed(1000)
# Bug fix: tensor.npz is a NumPy archive; scipy.io.loadmat reads only MATLAB
# .mat files and fails on .npz input -- load with np.load instead.
dense_tensor = np.load('../datasets/Seattle-data-set/tensor.npz')['arr_0']
dim = dense_tensor.shape
missing_rate = 0.4 # Non-random missing (NM)
# Mask drawn on the first two axes, broadcast along the third axis.
sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1])[:, :, np.newaxis] + 0.5 - missing_rate)
dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])
sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])
del dense_tensor, sparse_tensor
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 10
init = {"W": 0.01 * np.random.randn(dim1, rank), "X": 0.01 * np.random.randn(dim2, rank)}
burn_iter = 1000
gibbs_iter = 200
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Iter: 200 MAPE: 0.0962377 RMSE: 5.41434 Iter: 400 MAPE: 0.0918425 RMSE: 5.30282 Iter: 600 MAPE: 0.0918725 RMSE: 5.30348 Iter: 800 MAPE: 0.0918654 RMSE: 5.30324 Iter: 1000 MAPE: 0.0918592 RMSE: 5.30311 Imputation MAPE: 0.0918528 Imputation RMSE: 5.30302 Running time: 441 seconds
Scenario setting:
Model setting:
# Seattle data set, 40% random missing (RM).
import time
import numpy as np
np.random.seed(1000)
# Bug fix: tensor.npz is a NumPy archive; scipy.io.loadmat reads only MATLAB
# .mat files and fails on .npz input -- load with np.load instead.
dense_tensor = np.load('../datasets/Seattle-data-set/tensor.npz')['arr_0']
dim = dense_tensor.shape
missing_rate = 0.4 # Random missing (RM)
sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1], dim[2]) + 0.5 - missing_rate)
dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])
sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])
del dense_tensor, sparse_tensor
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 50
init = {"W": 0.1 * np.random.randn(dim1, rank), "X": 0.1 * np.random.randn(dim2, rank)}
burn_iter = 1000
gibbs_iter = 200
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Iter: 200 MAPE: 0.0677314 RMSE: 4.16611 Iter: 400 MAPE: 0.0678637 RMSE: 4.18104 Iter: 600 MAPE: 0.067846 RMSE: 4.18084 Iter: 800 MAPE: 0.0678551 RMSE: 4.1807 Iter: 1000 MAPE: 0.0678554 RMSE: 4.18122 Imputation MAPE: 0.0678574 Imputation RMSE: 4.1814 Running time: 2302 seconds
Scenario setting:
Model setting:
# Seattle data set, 60% random missing (RM).
import time
import numpy as np
np.random.seed(1000)
# Bug fix: tensor.npz is a NumPy archive; scipy.io.loadmat reads only MATLAB
# .mat files and fails on .npz input -- load with np.load instead.
dense_tensor = np.load('../datasets/Seattle-data-set/tensor.npz')['arr_0']
dim = dense_tensor.shape
missing_rate = 0.6 # Random missing (RM)
sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1], dim[2]) + 0.5 - missing_rate)
dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])
sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])
del dense_tensor, sparse_tensor
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 50
init = {"W": 0.1 * np.random.randn(dim1, rank), "X": 0.1 * np.random.randn(dim2, rank)}
burn_iter = 1000
gibbs_iter = 200
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Iter: 200 MAPE: 0.0735971 RMSE: 4.47002 Iter: 400 MAPE: 0.0737839 RMSE: 4.49133 Iter: 600 MAPE: 0.0738166 RMSE: 4.49302 Iter: 800 MAPE: 0.0738074 RMSE: 4.49357 Iter: 1000 MAPE: 0.0737871 RMSE: 4.49266 Imputation MAPE: 0.0738219 Imputation RMSE: 4.4948 Running time: 2280 seconds
The London movement speed data set is a city-wide hourly traffic speed data set collected in London.
Observation rate | $>90\%$ | $>80\%$ | $>70\%$ | $>60\%$ | $>50\%$ |
---|---|---|---|---|---|
Number of roads | 17,666 | 27,148 | 35,912 | 44,352 | 52,727 |
If you want to test on the full dataset, you could consider the following setting for masking observations as missing values.
# London data set (full): mask 20% of the observed entries as the held-out
# test set by zeroing them out.
import numpy as np
np.random.seed(1000)
mask_rate = 0.20
dense_mat = np.load('../datasets/London-data-set/hourly_speed_mat.npy')
pos_obs = np.where(dense_mat != 0)
num = len(pos_obs[0])
# Sample mask_rate * num observed positions without replacement.
sample_ind = np.random.choice(num, size = int(mask_rate * num), replace = False)
sparse_mat = dense_mat.copy()
sparse_mat[pos_obs[0][sample_ind], pos_obs[1][sample_ind]] = 0
Notably, you could also consider to evaluate the model on a subset of the data with the following setting.
# London data set (subset), 40% non-random missing (NM).
import numpy as np
np.random.seed(1000)
missing_rate = 0.4
dense_mat = np.load('../datasets/London-data-set/hourly_speed_mat.npy')
binary_mat = dense_mat.copy()
binary_mat[binary_mat != 0] = 1
# Keep only rows (roads) with more than 70% observed entries.
pos = np.where(np.sum(binary_mat, axis = 1) > 0.7 * binary_mat.shape[1])
dense_mat = dense_mat[pos[0], :]
## Non-random missing (NM)
binary_mat = np.zeros(dense_mat.shape)
random_mat = np.random.rand(dense_mat.shape[0], 30)
for i1 in range(dense_mat.shape[0]):
    for i2 in range(30):
        # One Bernoulli(1 - missing_rate) draw per (row, 24-column block):
        # a whole block of 24 consecutive columns is kept or dropped together.
        binary_mat[i1, i2 * 24 : (i2 + 1) * 24] = np.round(random_mat[i1, i2] + 0.5 - missing_rate)
sparse_mat = np.multiply(dense_mat, binary_mat)
Model setting:
# Run BPMF on the London NM scenario prepared above.
import time
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 20  # low-rank dimension R
init = {"W": 0.01 * np.random.randn(dim1, rank), "X": 0.01 * np.random.randn(dim2, rank)}
burn_iter = 1000  # burn-in Gibbs iterations
gibbs_iter = 200  # samples averaged after burn-in
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Iter: 200 MAPE: 0.0983183 RMSE: 2.40201 Iter: 400 MAPE: 0.094693 RMSE: 2.29785 Iter: 600 MAPE: 0.0946962 RMSE: 2.29789 Iter: 800 MAPE: 0.0946931 RMSE: 2.2979 Iter: 1000 MAPE: 0.0946954 RMSE: 2.29787 Imputation MAPE: 0.0946913 Imputation RMSE: 2.29782 Running time: 3566 seconds
# London data set (subset), 40% random missing (RM).
import numpy as np
np.random.seed(1000)
missing_rate = 0.4
dense_mat = np.load('../datasets/London-data-set/hourly_speed_mat.npy')
binary_mat = dense_mat.copy()
binary_mat[binary_mat != 0] = 1
# Keep only rows (roads) with more than 70% observed entries.
pos = np.where(np.sum(binary_mat, axis = 1) > 0.7 * binary_mat.shape[1])
dense_mat = dense_mat[pos[0], :]
## Random missing (RM)
random_mat = np.random.rand(dense_mat.shape[0], dense_mat.shape[1])
# Entry-wise Bernoulli(1 - missing_rate) mask.
binary_mat = np.round(random_mat + 0.5 - missing_rate)
sparse_mat = np.multiply(dense_mat, binary_mat)
Model setting:
# Run BPMF on the London RM (40%) scenario prepared above.
import time
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 20
init = {"W": 0.01 * np.random.randn(dim1, rank), "X": 0.01 * np.random.randn(dim2, rank)}
burn_iter = 1000
gibbs_iter = 200
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Iter: 200 MAPE: 0.0987723 RMSE: 2.41736 Iter: 400 MAPE: 0.0914476 RMSE: 2.21269 Iter: 600 MAPE: 0.0914455 RMSE: 2.21263 Iter: 800 MAPE: 0.0914505 RMSE: 2.21272 Iter: 1000 MAPE: 0.091447 RMSE: 2.21266 Imputation MAPE: 0.091447 Imputation RMSE: 2.21267 Running time: 3378 seconds
# London data set (subset), 60% random missing (RM).
import numpy as np
np.random.seed(1000)
missing_rate = 0.6
dense_mat = np.load('../datasets/London-data-set/hourly_speed_mat.npy')
binary_mat = dense_mat.copy()
binary_mat[binary_mat != 0] = 1
# Keep only rows (roads) with more than 70% observed entries.
pos = np.where(np.sum(binary_mat, axis = 1) > 0.7 * binary_mat.shape[1])
dense_mat = dense_mat[pos[0], :]
## Random missing (RM)
random_mat = np.random.rand(dense_mat.shape[0], dense_mat.shape[1])
binary_mat = np.round(random_mat + 0.5 - missing_rate)
sparse_mat = np.multiply(dense_mat, binary_mat)
Model setting:
# Run BPMF on the London RM (60%) scenario prepared above.
import time
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 20
init = {"W": 0.01 * np.random.randn(dim1, rank), "X": 0.01 * np.random.randn(dim2, rank)}
burn_iter = 1000
gibbs_iter = 200
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Iter: 200 MAPE: 0.11639 RMSE: 2.87399 Iter: 400 MAPE: 0.0928231 RMSE: 2.24379 Iter: 600 MAPE: 0.0928241 RMSE: 2.24376 Iter: 800 MAPE: 0.0928241 RMSE: 2.24374 Iter: 1000 MAPE: 0.0928254 RMSE: 2.2438 Imputation MAPE: 0.0928232 Imputation RMSE: 2.24375 Running time: 3489 seconds
Scenario setting:
# NYC data set, 40% random missing (RM).
import scipy.io
import numpy as np
np.random.seed(1000)
dense_tensor = scipy.io.loadmat('../datasets/NYC-data-set/tensor.mat')['tensor'].astype(np.float32)
dim = dense_tensor.shape
missing_rate = 0.4 # Random missing (RM)
# Entry-wise Bernoulli(1 - missing_rate) mask over the full tensor.
sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1], dim[2]) + 0.5 - missing_rate)
dim1, dim2, dim3 = dense_tensor.shape
dense_mat = np.zeros((dim1 * dim2, dim3))
sparse_mat = np.zeros((dim1 * dim2, dim3))
# Unfold the tensor into a (dim1 * dim2) x dim3 matrix, stacking mode-1 slices.
for i in range(dim1):
    dense_mat[i * dim2 : (i + 1) * dim2, :] = dense_tensor[i, :, :]
    sparse_mat[i * dim2 : (i + 1) * dim2, :] = sparse_tensor[i, :, :]
del dense_tensor, sparse_tensor
Model setting:
# Run BPMF on the NYC RM (40%) scenario prepared above.
import time
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 30
init = {"W": 0.1 * np.random.randn(dim1, rank), "X": 0.1 * np.random.randn(dim2, rank)}
burn_iter = 1000
gibbs_iter = 200
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Iter: 200 MAPE: 0.446785 RMSE: 4.63568 Iter: 400 MAPE: 0.449345 RMSE: 4.70971 Iter: 600 MAPE: 0.449055 RMSE: 4.70074 Iter: 800 MAPE: 0.449258 RMSE: 4.71094 Iter: 1000 MAPE: 0.449475 RMSE: 4.71905 Imputation MAPE: 0.449309 Imputation RMSE: 4.71647 Running time: 315 seconds
Scenario setting:
# NYC data set, 60% random missing (RM).
import scipy.io
import numpy as np
np.random.seed(1000)
dense_tensor = scipy.io.loadmat('../datasets/NYC-data-set/tensor.mat')['tensor'].astype(np.float32)
dim = dense_tensor.shape
missing_rate = 0.6 # Random missing (RM)
sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1], dim[2]) + 0.5 - missing_rate)
dim1, dim2, dim3 = dense_tensor.shape
dense_mat = np.zeros((dim1 * dim2, dim3))
sparse_mat = np.zeros((dim1 * dim2, dim3))
# Unfold the tensor into a (dim1 * dim2) x dim3 matrix, stacking mode-1 slices.
for i in range(dim1):
    dense_mat[i * dim2 : (i + 1) * dim2, :] = dense_tensor[i, :, :]
    sparse_mat[i * dim2 : (i + 1) * dim2, :] = sparse_tensor[i, :, :]
del dense_tensor, sparse_tensor
Model setting:
# Run BPMF on the NYC RM (60%) scenario prepared above.
import time
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 30
init = {"W": 0.1 * np.random.randn(dim1, rank), "X": 0.1 * np.random.randn(dim2, rank)}
burn_iter = 1000
gibbs_iter = 200
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Iter: 200 MAPE: 0.461471 RMSE: 4.846 Iter: 400 MAPE: 0.466909 RMSE: 4.96785 Iter: 600 MAPE: 0.467616 RMSE: 4.98044 Iter: 800 MAPE: 0.467748 RMSE: 4.98002 Iter: 1000 MAPE: 0.467811 RMSE: 4.98422 Imputation MAPE: 0.467925 Imputation RMSE: 4.989 Running time: 290 seconds
Scenario setting:
# NYC data set, 40% non-random missing (NM).
import scipy.io
import numpy as np
np.random.seed(1000)
dense_tensor = scipy.io.loadmat('../datasets/NYC-data-set/tensor.mat')['tensor']
dim = dense_tensor.shape
nm_tensor = np.random.rand(dim[0], dim[1], dim[2])
missing_rate = 0.4 # Non-random missing (NM)
binary_tensor = np.zeros(dense_tensor.shape)
# One Bernoulli(1 - missing_rate) draw per (i1, i2, 24-step block): blocks of
# 24 consecutive steps along the third axis go missing together (61 blocks).
for i1 in range(dim[0]):
    for i2 in range(dim[1]):
        for i3 in range(61):
            binary_tensor[i1, i2, i3 * 24 : (i3 + 1) * 24] = np.round(nm_tensor[i1, i2, i3] + 0.5 - missing_rate)
sparse_tensor = dense_tensor * binary_tensor
dim1, dim2, dim3 = dense_tensor.shape
dense_mat = np.zeros((dim1 * dim2, dim3))
sparse_mat = np.zeros((dim1 * dim2, dim3))
# Unfold the tensor into a (dim1 * dim2) x dim3 matrix, stacking mode-1 slices.
for i in range(dim1):
    dense_mat[i * dim2 : (i + 1) * dim2, :] = dense_tensor[i, :, :]
    sparse_mat[i * dim2 : (i + 1) * dim2, :] = sparse_tensor[i, :, :]
del dense_tensor, sparse_tensor
Model setting:
# Run BPMF on the NYC NM (40%) scenario prepared above.
import time
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 30
init = {"W": 0.1 * np.random.randn(dim1, rank), "X": 0.1 * np.random.randn(dim2, rank)}
burn_iter = 1000
gibbs_iter = 200
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Iter: 200 MAPE: 0.450534 RMSE: 4.70565 Iter: 400 MAPE: 0.453454 RMSE: 4.78572 Iter: 600 MAPE: 0.453593 RMSE: 4.78719 Iter: 800 MAPE: 0.453732 RMSE: 4.79153 Iter: 1000 MAPE: 0.453568 RMSE: 4.78587 Imputation MAPE: 0.453621 Imputation RMSE: 4.78941 Running time: 286 seconds
Scenario setting:
# Temperature data set, 40% random missing (RM); missing entries become NaN.
import numpy as np
np.random.seed(1000)
dense_tensor = np.load('../datasets/Temperature-data-set/tensor.npy').astype(np.float32)
# NOTE(review): zeroes out fibers where slice [:, 0, :] exceeds 50 --
# presumably outlier removal; confirm against the data set description.
pos = np.where(dense_tensor[:, 0, :] > 50)
dense_tensor[pos[0], :, pos[1]] = 0
random_tensor = np.random.rand(dense_tensor.shape[0], dense_tensor.shape[1], dense_tensor.shape[2])
missing_rate = 0.4
## Random missing (RM)
binary_tensor = np.round(random_tensor + 0.5 - missing_rate)
sparse_tensor = dense_tensor.copy()
# NaN marks both masked entries and originally-zero entries (BPMF's NaN branch).
sparse_tensor[binary_tensor == 0] = np.nan
sparse_tensor[sparse_tensor == 0] = np.nan
dim1, dim2, dim3 = dense_tensor.shape
dense_mat = np.zeros((dim1 * dim2, dim3))
sparse_mat = np.zeros((dim1 * dim2, dim3))
# Unfold the tensor into a (dim1 * dim2) x dim3 matrix, stacking mode-1 slices.
for i in range(dim1):
    dense_mat[i * dim2 : (i + 1) * dim2, :] = dense_tensor[i, :, :]
    sparse_mat[i * dim2 : (i + 1) * dim2, :] = sparse_tensor[i, :, :]
Model setting:
# Run BPMF on the Temperature RM (40%) scenario prepared above.
import time
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 30
init = {"W": 0.1 * np.random.randn(dim1, rank), "X": 0.1 * np.random.randn(dim2, rank)}
burn_iter = 1000
gibbs_iter = 200
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Iter: 200 MAPE: 0.0138166 RMSE: 0.460386 Iter: 400 MAPE: 0.0138145 RMSE: 0.459798 Iter: 600 MAPE: 0.0138147 RMSE: 0.459801 Iter: 800 MAPE: 0.0138142 RMSE: 0.459815 Iter: 1000 MAPE: 0.0138138 RMSE: 0.459805 Imputation MAPE: 0.0138141 Imputation RMSE: 0.459829 Running time: 440 seconds
Scenario setting:
# Temperature data set, 60% random missing (RM); missing entries become NaN.
import numpy as np
np.random.seed(1000)
dense_tensor = np.load('../datasets/Temperature-data-set/tensor.npy').astype(np.float32)
# NOTE(review): zeroes out fibers where slice [:, 0, :] exceeds 50 --
# presumably outlier removal; confirm against the data set description.
pos = np.where(dense_tensor[:, 0, :] > 50)
dense_tensor[pos[0], :, pos[1]] = 0
random_tensor = np.random.rand(dense_tensor.shape[0], dense_tensor.shape[1], dense_tensor.shape[2])
missing_rate = 0.6
## Random missing (RM)
binary_tensor = np.round(random_tensor + 0.5 - missing_rate)
sparse_tensor = dense_tensor.copy()
sparse_tensor[binary_tensor == 0] = np.nan
sparse_tensor[sparse_tensor == 0] = np.nan
dim1, dim2, dim3 = dense_tensor.shape
dense_mat = np.zeros((dim1 * dim2, dim3))
sparse_mat = np.zeros((dim1 * dim2, dim3))
for i in range(dim1):
    dense_mat[i * dim2 : (i + 1) * dim2, :] = dense_tensor[i, :, :]
    sparse_mat[i * dim2 : (i + 1) * dim2, :] = sparse_tensor[i, :, :]
Model setting:
# Run BPMF on the Temperature RM (60%) scenario prepared above.
import time
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 30
init = {"W": 0.1 * np.random.randn(dim1, rank), "X": 0.1 * np.random.randn(dim2, rank)}
burn_iter = 1000
gibbs_iter = 200
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Iter: 200 MAPE: 0.0145174 RMSE: 0.487207 Iter: 400 MAPE: 0.0144817 RMSE: 0.485571 Iter: 600 MAPE: 0.0144834 RMSE: 0.485601 Iter: 800 MAPE: 0.0144779 RMSE: 0.485488 Iter: 1000 MAPE: 0.0144777 RMSE: 0.485473 Imputation MAPE: 0.0144806 Imputation RMSE: 0.485545 Running time: 378 seconds
Scenario setting:
# Temperature data set, 40% non-random missing (NM); missing entries become NaN.
import numpy as np
np.random.seed(1000)
dense_tensor = np.load('../datasets/Temperature-data-set/tensor.npy').astype(np.float32)
# NOTE(review): zeroes out fibers where slice [:, 0, :] exceeds 50 --
# presumably outlier removal; confirm against the data set description.
pos = np.where(dense_tensor[:, 0, :] > 50)
dense_tensor[pos[0], :, pos[1]] = 0
random_tensor = np.random.rand(dense_tensor.shape[0], dense_tensor.shape[1], int(dense_tensor.shape[2] / 3))
missing_rate = 0.4
## Non-random missing (NM)
binary_tensor = np.zeros(dense_tensor.shape)
# One Bernoulli(1 - missing_rate) draw per 3-step block along the third axis:
# blocks of 3 consecutive steps go missing together.
for i1 in range(dense_tensor.shape[0]):
    for i2 in range(dense_tensor.shape[1]):
        for i3 in range(int(dense_tensor.shape[2] / 3)):
            binary_tensor[i1, i2, i3 * 3 : (i3 + 1) * 3] = np.round(random_tensor[i1, i2, i3] + 0.5 - missing_rate)
sparse_tensor = dense_tensor.copy()
sparse_tensor[binary_tensor == 0] = np.nan
sparse_tensor[sparse_tensor == 0] = np.nan
dim1, dim2, dim3 = dense_tensor.shape
dense_mat = np.zeros((dim1 * dim2, dim3))
sparse_mat = np.zeros((dim1 * dim2, dim3))
for i in range(dim1):
    dense_mat[i * dim2 : (i + 1) * dim2, :] = dense_tensor[i, :, :]
    sparse_mat[i * dim2 : (i + 1) * dim2, :] = sparse_tensor[i, :, :]
Model setting:
# Run BPMF on the Temperature NM (40%) scenario prepared above.
import time
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 30
init = {"W": 0.1 * np.random.randn(dim1, rank), "X": 0.1 * np.random.randn(dim2, rank)}
burn_iter = 1000
gibbs_iter = 200
mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)
end = time.time()
print('Running time: %d seconds'%(end - start))
Iter: 200 MAPE: 0.0143242 RMSE: 0.475141 Iter: 400 MAPE: 0.0144314 RMSE: 0.477967 Iter: 600 MAPE: 0.0144304 RMSE: 0.477905 Iter: 800 MAPE: 0.0144277 RMSE: 0.477888 Iter: 1000 MAPE: 0.0144317 RMSE: 0.47798 Imputation MAPE: 0.0144295 Imputation RMSE: 0.477908 Running time: 325 seconds